From 54794ad9c1361e72bc8a00a9840361c6f641c8a9 Mon Sep 17 00:00:00 2001
From: Ciheim Brown
Date: Tue, 22 Oct 2024 17:36:30 -0400
Subject: [PATCH 1/8] 2.SpeedingUpTheSlowOption

---
 catalogbuilder/intakebuilder/gfdlcrawler.py | 28 ++++++++++++++++-----
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py
index b2e29c9..df95776 100644
--- a/catalogbuilder/intakebuilder/gfdlcrawler.py
+++ b/catalogbuilder/intakebuilder/gfdlcrawler.py
@@ -59,7 +59,10 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
         missingcols.remove("path") #because we get this anyway
     logger.debug("Missing cols from metadata sources:"+ (str)(missingcols))
-
+    #Creating a dictionary to track the unique datasets we come across when using slow mode
+    #The keys don't mean much but the values will be lists tracking var_id, realm, etc.
+    unique_datasets = {}
+
     #TODO INCLUDE filter in traversing through directories at the top
     for dirpath, dirs, files in os.walk(projectdir):
         searchpath = dirpath
@@ -114,12 +117,25 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
             # todo do the reverse if slow is on. Open file no matter what and populate dictionary values and if there is something missed out
             # we can scan filenames or config etc
             #here, we will see if there are missing header values and compare with file attributes if slow option is turned on
-            if (slow == True) & (bool(dictInfo) == True) :
-                print("Slow option turned on.. let's open some files using xarray and look up atts",filename)
-                #todo we could look at var attributes, but right now we stick to those that are necessary. scope to extend this easily to missingcols or if header info is not in config yaml
-                if "standard_name" in missingcols:
+            if (slow == True) & (bool(dictInfo) == True):
+                print("Slow option turned on.. let's open some files using xarray and look up atts")
+
+                #todo we could look at var attributes, but right now we stick to those that are necessary. scope to extend this easily to missingcols or if header info is not in config yaml
+                if "standard_name" in missingcols:
+
                     dictInfo["standard_name"] = "na"
-                    getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)
+
+                    #Check if we've come across a similar dataset
+                    qualities=[dictInfo["variable_id"],dictInfo["realm"]]
+                    for standard_name,quality_list in unique_datasets.items():
+                        if quality_list == qualities:
+                            dictInfo["standard_name"]=standard_name
+
+                    if dictInfo["standard_name"] == "na":
+                        print("Retrieving standard_name from ", filename)
+                        getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)
+                        unique_datasets.update({ dictInfo["standard_name"] : qualities})
+
             #replace frequency as needed
             if 'frequency' in dictInfo.keys():
                 package_dir = os.path.dirname(os.path.abspath(__file__))
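For context, the slow path this patch speeds up amounts to one xarray open per crawled file just to read a single variable attribute, even though every file sharing a (variable_id, realm) pair carries the same standard_name. A minimal sketch of that per-file cost, with a hypothetical helper name (the real lookup lives in getinfo.getInfoFromVarAtts):

    import xarray as xr

    # One xarray open per crawled file just to read one attribute; cheap on
    # local disk, expensive when files must first be recalled from tape.
    def read_standard_name(path, variable_id):
        with xr.open_dataset(path, decode_times=False) as ds:
            return ds[variable_id].attrs.get("standard_name", "na")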
From 4b5538b83d0819e99a5827576c8af34dc0516372 Mon Sep 17 00:00:00 2001
From: Ciheim Brown
Date: Thu, 24 Oct 2024 23:37:19 -0400
Subject: [PATCH 2/8] Fixing dictionary logic

---
 catalogbuilder/intakebuilder/gfdlcrawler.py | 23 +++++++++++----------
 1 file changed, 12 insertions(+), 11 deletions(-)

diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py
index df95776..e217995 100644
--- a/catalogbuilder/intakebuilder/gfdlcrawler.py
+++ b/catalogbuilder/intakebuilder/gfdlcrawler.py
@@ -33,7 +33,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
     set_ptemplate = set()
     set_ftemplate = set()

-    if( configyaml is not None):
+    if(configyaml is not None):
         if (configyaml.output_path_template is not None) & (configyaml.output_file_template is not None) :
             list_ptemplate = configyaml.output_path_template
             list_ftemplate = configyaml.output_file_template
@@ -60,8 +60,8 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
     logger.debug("Missing cols from metadata sources:"+ (str)(missingcols))

     #Creating a dictionary to track the unique datasets we come across when using slow mode
-    #The keys don't mean much but the values will be lists tracking var_id, realm, etc.
-    unique_datasets = {}
+    #The keys are (variable_id, realm) tuples and the values are the corresponding standard names
+    unique_datasets = {'':''}

     #TODO INCLUDE filter in traversing through directories at the top
     for dirpath, dirs, files in os.walk(projectdir):
@@ -118,23 +118,24 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
             # we can scan filenames or config etc
             #here, we will see if there are missing header values and compare with file attributes if slow option is turned on
             if (slow == True) & (bool(dictInfo) == True):
-                print("Slow option turned on.. let's open some files using xarray and look up atts")
+                #print("Slow option turned on.. let's open some files using xarray and look up atts")

                 #todo we could look at var attributes, but right now we stick to those that are necessary. scope to extend this easily to missingcols or if header info is not in config yaml
                 if "standard_name" in missingcols:
+                    # Set standard_name as na to avoid error from getInfoFromVarAtts
                     dictInfo["standard_name"] = "na"

-                    #Check if we've come across a similar dataset
-                    qualities=[dictInfo["variable_id"],dictInfo["realm"]]
-                    for standard_name,quality_list in unique_datasets.items():
-                        if quality_list == qualities:
-                            dictInfo["standard_name"]=standard_name
+                    # 'qualities', a (variable_id, realm) tuple, defines dataset uniqueness and decides when to open a file: each pair we encounter is stored in unique_datasets with its standard_name, so if the current pair is already present we reuse that standard_name instead of opening the file
+                    qualities=(dictInfo["variable_id"],dictInfo["realm"])
+                    if qualities in unique_datasets.keys():
+                        standard_name=unique_datasets[qualities]
+                        dictInfo["standard_name"]=standard_name

-                    if dictInfo["standard_name"] == "na":
+                    else:
                         print("Retrieving standard_name from ", filename)
                         getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)
-                        unique_datasets.update({ dictInfo["standard_name"] : qualities})
+                        unique_datasets.update({ qualities : dictInfo["standard_name"] })

             #replace frequency as needed
             if 'frequency' in dictInfo.keys():
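The corrected logic is ordinary dictionary memoization: tuples are hashable, so the (variable_id, realm) pair can key unique_datasets directly, and each lookup is O(1) instead of a linear scan over .items(). A distilled sketch of the pattern, reusing the hypothetical read_standard_name helper from the sketch above:

    # Tuples are hashable, so a (variable_id, realm) pair can key the dict
    # directly; lists (as in patch 1) cannot be used as dict keys.
    unique_datasets = {}

    def standard_name_for(variable_id, realm, path):
        key = (variable_id, realm)
        if key not in unique_datasets:
            # Cache miss: open the file once, then remember the answer.
            unique_datasets[key] = read_standard_name(path, variable_id)
        return unique_datasets[key]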
From 8953b5ebad39f71e8c17b225a5f88ebe22c1736e Mon Sep 17 00:00:00 2001
From: Ciheim Brown
Date: Fri, 25 Oct 2024 13:11:58 -0400
Subject: [PATCH 3/8] Updating to grab long_name when standard_name is not found

---
 catalogbuilder/intakebuilder/getinfo.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/catalogbuilder/intakebuilder/getinfo.py b/catalogbuilder/intakebuilder/getinfo.py
index f413d6b..13d947a 100644
--- a/catalogbuilder/intakebuilder/getinfo.py
+++ b/catalogbuilder/intakebuilder/getinfo.py
@@ -213,6 +213,11 @@ def getInfoFromVarAtts(fname,variable_id,dictInfo,att="standard_name",filexra=No
         cfname = filexr[variable_id].attrs["standard_name"]
     except KeyError:
         cfname = "NA"
+        try:
+            long_name = filexr[variable_id].attrs["long_name"]
+        except KeyError:
+            long_name = "NA"
+        cfname = long_name.replace(" ", "_")
     dictInfo["standard_name"] = cfname
     print("standard_name found",dictInfo["standard_name"])
     return dictInfo
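The fallback reads naturally as: prefer the CF standard_name attribute, and only when it is missing derive a stand-in from long_name with spaces replaced by underscores. A sketch of that attribute logic, with a hypothetical helper name (the real code mutates dictInfo inside getInfoFromVarAtts):

    import xarray as xr

    def cf_name(path, variable_id):
        # Prefer standard_name; otherwise fall back to long_name with
        # spaces replaced by underscores, or "NA" if neither exists.
        with xr.open_dataset(path, decode_times=False) as ds:
            attrs = ds[variable_id].attrs
            if "standard_name" in attrs:
                return attrs["standard_name"]
            return attrs.get("long_name", "NA").replace(" ", "_")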
From a03d07406e867fd28278a593014e1728df6ff3e0 Mon Sep 17 00:00:00 2001
From: Ciheim Brown
Date: Wed, 30 Oct 2024 09:47:57 -0400
Subject: [PATCH 4/8] Added TODO + changed print

---
 catalogbuilder/intakebuilder/gfdlcrawler.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py
index e217995..991f156 100644
--- a/catalogbuilder/intakebuilder/gfdlcrawler.py
+++ b/catalogbuilder/intakebuilder/gfdlcrawler.py
@@ -117,6 +117,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
             # todo do the reverse if slow is on. Open file no matter what and populate dictionary values and if there is something missed out
             # we can scan filenames or config etc
             #here, we will see if there are missing header values and compare with file attributes if slow option is turned on
+            # TODO: Possibly use slow option if lookup table can't find standard_name
             if (slow == True) & (bool(dictInfo) == True):
                 #print("Slow option turned on.. let's open some files using xarray and look up atts")

@@ -127,13 +128,14 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
                     dictInfo["standard_name"] = "na"

                     # 'qualities', a (variable_id, realm) tuple, defines dataset uniqueness and decides when to open a file: each pair we encounter is stored in unique_datasets with its standard_name, so if the current pair is already present we reuse that standard_name instead of opening the file
+                    # TODO: Extend 'qualities' so uniqueness is determined from more attributes
                     qualities=(dictInfo["variable_id"],dictInfo["realm"])
                     if qualities in unique_datasets.keys():
                         standard_name=unique_datasets[qualities]
                         dictInfo["standard_name"]=standard_name

                     else:
-                        print("Retrieving standard_name from ", filename)
+                        logger.info("Retrieving standard_name from ", filename)
                         getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)
                         unique_datasets.update({ qualities : dictInfo["standard_name"] })

From dce18c1cd9f80d1e58495c5caa30da176fba6465 Mon Sep 17 00:00:00 2001
From: Aparna Radhakrishnan
Date: Thu, 31 Oct 2024 12:49:13 -0400
Subject: [PATCH 5/8] Update catalogbuilder/intakebuilder/gfdlcrawler.py

TODO added
---
 catalogbuilder/intakebuilder/gfdlcrawler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py
index 991f156..a9410bb 100644
--- a/catalogbuilder/intakebuilder/gfdlcrawler.py
+++ b/catalogbuilder/intakebuilder/gfdlcrawler.py
@@ -129,6 +129,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
                     # 'qualities', a (variable_id, realm) tuple, defines dataset uniqueness and decides when to open a file: each pair we encounter is stored in unique_datasets with its standard_name, so if the current pair is already present we reuse that standard_name instead of opening the file
                     # TODO: Extend 'qualities' so uniqueness is determined from more attributes
+                    #TODO extend this to append other qualities later
                     qualities=(dictInfo["variable_id"],dictInfo["realm"])
                     if qualities in unique_datasets.keys():
                         standard_name=unique_datasets[qualities]

From 622804e4df5f7a000c2e37b4544801b1cc2b4758 Mon Sep 17 00:00:00 2001
From: aradhakrishnanGFDL
Date: Thu, 31 Oct 2024 12:58:29 -0400
Subject: [PATCH 6/8] logger bugfix

---
 catalogbuilder/intakebuilder/gfdlcrawler.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py
index a9410bb..149f23b 100644
--- a/catalogbuilder/intakebuilder/gfdlcrawler.py
+++ b/catalogbuilder/intakebuilder/gfdlcrawler.py
@@ -136,7 +136,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
                         dictInfo["standard_name"]=standard_name

                     else:
-                        logger.info("Retrieving standard_name from ", filename)
+                        logger.info("Retrieving standard_name from "+ (str)(filename))
                         getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)
                         unique_datasets.update({ qualities : dictInfo["standard_name"] })
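String concatenation fixes the runtime error here (logger.info, unlike print, does not accept extra positional message arguments that way). The stdlib logging idiom of lazy %-interpolation would also work:

    # Lazy %-style interpolation: the message is only formatted if the
    # record is actually emitted at the configured level.
    logger.info("Retrieving standard_name from %s", filename)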
From 6e2dd993d43379f0dca199a967d2ed815284e720 Mon Sep 17 00:00:00 2001
From: aradhakrishnanGFDL
Date: Thu, 31 Oct 2024 14:15:22 -0400
Subject: [PATCH 7/8] help message added to --slow

---
 catalogbuilder/scripts/gen_intake_gfdl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/catalogbuilder/scripts/gen_intake_gfdl.py b/catalogbuilder/scripts/gen_intake_gfdl.py
index aa57630..75c709e 100755
--- a/catalogbuilder/scripts/gen_intake_gfdl.py
+++ b/catalogbuilder/scripts/gen_intake_gfdl.py
@@ -146,7 +146,7 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=
 @click.option('--filter_chunk', nargs=1)
 @click.option('--overwrite', is_flag=True, default=False)
 @click.option('--append', is_flag=True, default=False)
-@click.option('--slow','-s', is_flag=True, default=False)
+@click.option('--slow','-s', is_flag=True, default=False, help='Look up the standard_name in each netCDF file to fill the standard_name column when it is present in the header specs. If standard_name is absent from the file, long_name (with spaces replaced by underscores) is used instead')
 @click.option('--verbose/--silent', default=False, is_flag=True)
 #default has silent option. Use --verbose for detailed logging
 def create_catalog_cli(**kwargs):

From 9db4d7f3aecc9440cdc1dd3bd6fdb075af752d21 Mon Sep 17 00:00:00 2001
From: aradhakrishnanGFDL
Date: Thu, 31 Oct 2024 14:17:12 -0400
Subject: [PATCH 8/8] todo comments added per my review

---
 catalogbuilder/intakebuilder/gfdlcrawler.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/catalogbuilder/intakebuilder/gfdlcrawler.py b/catalogbuilder/intakebuilder/gfdlcrawler.py
index 149f23b..ffe26eb 100644
--- a/catalogbuilder/intakebuilder/gfdlcrawler.py
+++ b/catalogbuilder/intakebuilder/gfdlcrawler.py
@@ -119,6 +119,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
             #here, we will see if there are missing header values and compare with file attributes if slow option is turned on
             # TODO: Possibly use slow option if lookup table can't find standard_name
             if (slow == True) & (bool(dictInfo) == True):
+                #TODO Possible improvement: get a list of all files to be opened, dmget the files at once or in logical batches before examining with xarray
                 #print("Slow option turned on.. let's open some files using xarray and look up atts")

                 #todo we could look at var attributes, but right now we stick to those that are necessary. scope to extend this easily to missingcols or if header info is not in config yaml
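On the dmget TODO: the idea is to stage every file the slow pass will open in one (or a few) bulk tape recalls before any xarray call touches them. A hedged sketch, assuming a dmget executable that accepts multiple paths, as on GFDL post-processing nodes; the helper name and batch size are hypothetical:

    import subprocess

    def stage_files(paths, batch_size=100):
        # Recall files from tape in bulk so later xarray opens hit staged copies.
        for i in range(0, len(paths), batch_size):
            subprocess.run(["dmget", *paths[i:i + batch_size]], check=True)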