Skip to content

Commit

Permalink
Merge pull request #233 from dtcenter/bugfix_219_performance_warning
Browse files Browse the repository at this point in the history
Bugfix 219 performance warning
  • Loading branch information
hankenstein2 authored Sep 15, 2023
2 parents ca440f6 + 65c187f commit 2d9e8b6
Show file tree
Hide file tree
Showing 3 changed files with 90 additions and 60 deletions.
13 changes: 0 additions & 13 deletions METreformat/point_stat.xml

This file was deleted.

6 changes: 4 additions & 2 deletions METreformat/point_stat.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
output_dir: /path/to/output_dir
output_dir: /path/to/output/directory
output_filename: point_stat_reformatted.txt
xml_spec_file: /path/to/xml_spec_file/<xml filename>.xml
met_tool: point_stat
input_data_dir: /path/to/data/directory

131 changes: 86 additions & 45 deletions METreformat/write_stat_ascii.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
Abstract:
History Log: Initial version (supports CNT, CTC, CTS, and SL1L2 line types for point stat .stat file)
Usage: Write MET stat files (.stat) to an ASCII file with additional columns of information.
Parameters: Requires an xml specification file and yaml configuration file
Parameters: Requires a yaml configuration file
Input Files: transformed dataframe of MET lines
Output Files: A text file containing reformatted data
Copyright 2022 UCAR/NCAR/RAL, CSU/CIRES, Regents of the University of Colorado, NOAA/OAR/ESRL/GSD
Expand All @@ -19,6 +19,7 @@
import os
import logging
import time
import pathlib
from datetime import timedelta
from typing import List, Set
import numpy as np
Expand All @@ -36,23 +37,31 @@ class WriteStatAscii:
""" Class to write MET .stat files to an ASCII file
Returns:
None. Generates an output file
a Pandas dataframe and creates an ascii file with reformatted data.
"""

def write_stat_ascii(self, stat_data: pd.DataFrame, parms: dict):
""" write MET stat files (.stat) to an ASCII file with stat_name, stat_value, stat_bcl, stat_bcu,
stat_ncl, and stat_ncu columns, converting the original data file from wide form to long form.
def write_stat_ascii(self, stat_data: pd.DataFrame, parms: dict) -> pd.DataFrame:
""" write MET stat files (.stat) to an ASCII file with stat_name, stat_value,
stat_bcl, stat_bcu, stat_ncl, and stat_ncu columns, converting the
original data file from wide form to long form.
Args:
@param stat_data: pandas dataframe corresponding to the MET stat input file generated from the METdbLoad
file reader
@param parms: The yaml configuration object (dictionary) containing the settings for output dir, output file
Returns: None, write an output ASCII file associated with the original MET .stat file with statistics
information aggregated into these six columns: stat_name, stat_value, stat_ncl, stat_ncu,
stat_bcl, and stat_bcu (the stat_xyz are not available in all line types,
these will have values of NA)
@param stat_data: pandas dataframe corresponding to the MET stat
input file generated from the METdbLoad file reader
@param parms: The yaml configuration object (dictionary) containing
the settings for output dir, output file
Returns:
combined_df: pandas dataframe with original data reformatted into
'long' form.
Additionally, write an output ASCII file associated with the
original MET .stat file with statistics information aggregated
into these six columns: stat_name,
stat_value, stat_ncl, stat_ncu,
stat_bcl, and stat_bcu (the stat_xyz are not available in all
line types, these will have values of NA)
"""

Expand Down Expand Up @@ -114,6 +123,7 @@ def write_stat_ascii(self, stat_data: pd.DataFrame, parms: dict):
mode='a')



except (RuntimeError, TypeError, NameError, KeyError):
logging.error("*** %s in write_stat_ascii ***", sys.exc_info()[0])

Expand All @@ -124,6 +134,8 @@ def write_stat_ascii(self, stat_data: pd.DataFrame, parms: dict):

logging.debug("[--- End write_stat_data ---]")

return combined_dfs

def process_by_stat_linetype(self, linetype: str, stat_data: pd.DataFrame):
"""
For a given linetype, extract the relevant statistics information into the
Expand Down Expand Up @@ -198,16 +210,20 @@ def process_fho(self, stat_data: pd.DataFrame) -> pd.DataFrame:
# Create another index column to preserve the index values from the stat_data dataframe (ie the dataframe
# containing the original data from the MET output file).
idx = list(fho_df.index)
fho_df.insert(loc=0, column='Idx', value=idx)

# Work on a copy of the fho_df dataframe to avoid a possible PerformanceWarning
# message due to a fragmented dataframe.
fho_df_copy = fho_df.copy()
fho_df_copy.insert(loc=0, column='Idx', value=idx)

# Use pandas 'melt' to reshape the data frame from wide to long shape (i.e. collecting the f_rate, h_rate,
# and o_rate values and putting them under the column 'stat_value' corresponding to the 'stat_name' column
# containing the names F_RATE, H_RATE, and O_RATE).

# columns that we don't want to change (the last three columns are the stat columns of interest,
# we want to capture that information into the stat_name and stat_values columns)
columns_to_use: List[str] = fho_df.columns[0:-3].tolist()
fho_copy: pd.DataFrame = fho_df.copy(deep=True)
columns_to_use: List[str] = fho_df_copy.columns[0:-3].tolist()
fho_copy: pd.DataFrame = fho_df_copy.copy(deep=True)
linetype_data: pd.DataFrame = pd.melt(fho_copy, id_vars=columns_to_use, var_name='stat_name',
value_name='stat_value')

Expand Down Expand Up @@ -249,26 +265,33 @@ def process_cnt(self, stat_data: pd.DataFrame) -> pd.DataFrame:
# Create another index column to preserve the index values from the stat_data dataframe (ie the dataframe
# containing the original data from the MET output file).
idx = list(cnt_df.index)
cnt_df.insert(loc=0, column='Idx', value=idx)

# Work on a copy of the cnt_df dataframe to avoid a possible PerformanceWarning
# message due to a fragmented dataframe.
cnt_df_copy = cnt_df.copy()
cnt_df_copy.insert(loc=0, column='Idx', value=idx)


# Use the pd.wide_to_long() to collect the statistics and confidence level data into the appropriate columns.
# Rename the <stat_group>_BCL|BCU|NCL|NCU to BCL|BCU|NCL|NCU_<stat_group> in order to
# use pd.wide_to_long().

# Rename confidence level column header names so the BCL, BCU, NCL, and NCU are appended with the statistic name
# (i.e. from FBAR_BCU to BCU_FBAR to be able to use the pandas wide_to_long).
confidence_level_columns_renamed: List[str] = self.rename_confidence_level_columns(cnt_df.columns.tolist())
cnt_df.columns: List[str] = confidence_level_columns_renamed
confidence_level_columns_renamed: List[str] = (
self.rename_confidence_level_columns(cnt_df_copy.columns.tolist()))
cnt_df_copy.columns: List[str] = confidence_level_columns_renamed

# Rename the statistics columns (ie. FBAR, MAE, FSTDEV, etc. to STAT_FBAR, STAT_MAE, etc.)
stat_confidence_level_columns_renamed = self.rename_statistics_columns(cnt_df, cn.CNT_STATISTICS_HEADERS)
cnt_df.columns = stat_confidence_level_columns_renamed
stat_confidence_level_columns_renamed = self.rename_statistics_columns(
cnt_df_copy, cn.CNT_STATISTICS_HEADERS)
cnt_df_copy.columns = stat_confidence_level_columns_renamed

# Get the name of the columns to be used for indexing, this will also preserve the ordering of columns from the
# original data.
indexing_columns = ['Idx'] + cn.LC_COMMON_STAT_HEADER + ['total']

wide_to_long_df: pd.DataFrame = pd.wide_to_long(cnt_df,
wide_to_long_df: pd.DataFrame = pd.wide_to_long(cnt_df_copy,
stubnames=['STAT', 'NCL', 'NCU', 'BCL', 'BCU'],
i=indexing_columns,
j='stat_name',
Expand Down Expand Up @@ -318,13 +341,18 @@ def process_ctc(self, stat_data: pd.DataFrame) -> pd.DataFrame:
# Create another index column to preserve the index values from the stat_data dataframe (ie the dataframe
# containing the original data from the MET output file).
idx = list(ctc_df.index)
ctc_df.insert(loc=0, column='Idx', value=idx)

# Work on a copy of the ctc_df dataframe to avoid a possible PerformanceWarning
# message due to a fragmented dataframe.
ctc_df_copy = ctc_df.copy()
ctc_df_copy.insert(loc=0, column='Idx', value=idx)

# Now apply melt to get the stat_name and stat_values from the statistics

# Columns we don't want to stack (i.e. treat these columns as a multi index)
id_vars_list = ['Idx'] + cn.LC_COMMON_STAT_HEADER + ['total']
linetype_data = ctc_df.melt(id_vars=id_vars_list, value_vars=cn.CTC_STATISTICS_HEADERS,
linetype_data = ctc_df_copy.melt(id_vars=id_vars_list,
value_vars=cn.CTC_STATISTICS_HEADERS,
var_name='stat_name',
value_name='stat_value').sort_values('Idx')

Expand Down Expand Up @@ -366,26 +394,32 @@ def process_cts(self, stat_data: pd.DataFrame) -> pd.DataFrame:
# Create another index column to preserve the index values from the stat_data dataframe (ie the dataframe
# containing the original data from the MET output file).
idx = list(cts_df.index)
cts_df.insert(loc=0, column='Idx', value=idx)

# Work on a copy of the cts_df dataframe to avoid a possible PerformanceWarning
# message due to a fragmented dataframe.
cts_df_copy = cts_df.copy()
cts_df_copy.insert(loc=0, column='Idx', value=idx)

# Use the pd.wide_to_long() to collect the statistics and confidence level data into the appropriate columns.
# Rename the <stat_group>_BCL|BCU|NCL|NCU to BCL|BCU|NCL|NCU_<stat_group> in order to
# use pd.wide_to_long().

# Rename confidence level column header names so the BCL, BCU, NCL, and NCU are appended with the statistic name
# (i.e. from FBAR_BCU to BCU_FBAR to be able to use the pandas wide_to_long).
confidence_level_columns_renamed: List[str] = self.rename_confidence_level_columns(cts_df.columns.tolist())
cts_df.columns: List[str] = confidence_level_columns_renamed
confidence_level_columns_renamed: List[str] = (
self.rename_confidence_level_columns(cts_df_copy.columns.tolist()))
cts_df_copy.columns: List[str] = confidence_level_columns_renamed

# Rename the statistics columns (ie. FBAR, MAE, FSTDEV, etc. to STAT_FBAR, STAT_MAE, etc.)
stat_confidence_level_columns_renamed = self.rename_statistics_columns(cts_df, cn.CTS_STATS_ONLY_HEADERS)
cts_df.columns = stat_confidence_level_columns_renamed
stat_confidence_level_columns_renamed = self.rename_statistics_columns(
cts_df_copy, cn.CTS_STATS_ONLY_HEADERS)
cts_df_copy.columns = stat_confidence_level_columns_renamed

# Get the name of the columns to be used for indexing, this will also preserve the ordering of columns from the
# original data.
indexing_columns = ['Idx'] + cn.LC_COMMON_STAT_HEADER + ['total']

wide_to_long_df: pd.DataFrame = pd.wide_to_long(cts_df,
wide_to_long_df: pd.DataFrame = pd.wide_to_long(cts_df_copy,
stubnames=['STAT', 'NCL', 'NCU', 'BCL', 'BCU'],
i=indexing_columns,
j='stat_name',
Expand Down Expand Up @@ -434,13 +468,18 @@ def process_sl1l2(self, stat_data: pd.DataFrame) -> pd.DataFrame:
# Create another index column to preserve the index values from the stat_data dataframe (ie the dataframe
# containing the original data from the MET output file).
idx = list(sl1l2_df.index)
sl1l2_df.insert(loc=0, column='Idx', value=idx)

# Work on a copy of the sl1l2_df dataframe to avoid a possible PerformanceWarning
# message due to a fragmented dataframe.
sl1l2_df_copy = sl1l2_df.copy()
sl1l2_df_copy.insert(loc=0, column='Idx', value=idx)

# Now apply melt to get the stat_name and stat_values from the statistics

# Columns we don't want to stack (i.e. treat these columns as a multi index)
id_vars_list = ['Idx'] + cn.LC_COMMON_STAT_HEADER + ['total']
reshaped = sl1l2_df.melt(id_vars=id_vars_list, value_vars=cn.SL1L2_STATISTICS_HEADERS,
reshaped = sl1l2_df_copy.melt(id_vars=id_vars_list,
value_vars=cn.SL1L2_STATISTICS_HEADERS,
var_name='stat_name',
value_name='stat_value').sort_values('Idx')

Expand Down Expand Up @@ -513,8 +552,7 @@ def rename_statistics_columns(self, df: pd.DataFrame, statistics_columns: List[s
def main():
'''
Open the yaml config file specified at the command line to get output directory, output filename,
and location and name of the xml specification file. The xml specification file contains information
about what MET file types to reformat and the directory of where input MET output files (.stat) are located.
the location of the input files, and the MET tool used to create the input data.
Then invoke necessary methods to read and process data to reformat the MET .stat file from wide to long format to
collect statistics information into stat_name, stat_value, stat_bcl, stat_bcu, stat_ncl, and stat_ncu columns.
Expand All @@ -526,25 +564,28 @@ def main():
with open(config_file, 'r') as stream:
try:
parms: dict = yaml.load(stream, Loader=yaml.FullLoader)
path_to_output = '"' + parms['output_dir'] + '"'
pathlib.Path(parms['output_dir']).mkdir(parents=True, exist_ok=True)
except yaml.YAMLError as exc:
print(exc)

# Read in the XML load file. This contains information about which MET output files are to be loaded.
xml_file: str = parms['xml_spec_file']
xml_loadfile_obj: XmlLoadFile = XmlLoadFile(xml_file)
xml_loadfile_obj.read_xml()

# Read all of the data from the data files into a dataframe
# An XML specification file is no longer needed; instead, set up the XmlLoadFile and
# ReadDataFiles objects directly and pass their parameters to the reader.
rdf_obj: ReadDataFiles = ReadDataFiles()
xml_loadfile_obj: XmlLoadFile = XmlLoadFile(None)

# Retrieve all the filenames in the data_dir specified in the YAML config file
load_files = xml_loadfile_obj.filenames_from_template(parms['input_data_dir'],
{})

# read in the data files, with options specified by XML flags
rdf_obj.read_data(xml_loadfile_obj.flags,
xml_loadfile_obj.load_files,
xml_loadfile_obj.line_types)
flags = xml_loadfile_obj.flags
line_types = xml_loadfile_obj.line_types
rdf_obj.read_data(flags, load_files, line_types)
file_df = rdf_obj.stat_data

# Write stat file in ASCII format, one for each line type
stat_lines_obj: WriteStatAscii = WriteStatAscii()
stat_lines_obj.write_stat_ascii(rdf_obj.stat_data, parms)
stat_lines_obj.write_stat_ascii(file_df, parms)


if __name__ == "__main__":
Expand Down

0 comments on commit 2d9e8b6

Please sign in to comment.