2.SpeedingUpTheSlowOption #78

Open · wants to merge 8 commits into main
5 changes: 5 additions & 0 deletions catalogbuilder/intakebuilder/getinfo.py
@@ -213,6 +213,11 @@ def getInfoFromVarAtts(fname,variable_id,dictInfo,att="standard_name",filexra=None):
             cfname = filexr[variable_id].attrs["standard_name"]
         except KeyError:
             cfname = "NA"
+            try:
+                long_name = filexr[variable_id].attrs["long_name"]
+            except KeyError:
+                long_name = "NA"
+            cfname = long_name.replace(" ", "_")
         dictInfo["standard_name"] = cfname
         print("standard_name found",dictInfo["standard_name"])
     return dictInfo
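
For orientation, the change above can be read on its own: when a variable carries no standard_name attribute, the builder now falls back to long_name with spaces replaced by underscores, rather than leaving "NA" in the catalog. A minimal sketch of that fallback, assuming an xarray dataset (the variable name "tas" and its attributes are invented for the example):

import xarray as xr

def derive_standard_name(ds: xr.Dataset, variable_id: str) -> str:
    # Prefer the CF standard_name attribute; otherwise fall back to
    # long_name with spaces replaced by underscores, as in the diff above.
    attrs = ds[variable_id].attrs
    try:
        return attrs["standard_name"]
    except KeyError:
        long_name = attrs.get("long_name", "NA")
        return long_name.replace(" ", "_")

# A variable that only has long_name set:
ds = xr.Dataset({"tas": ("time", [280.0], {"long_name": "near surface air temperature"})})
print(derive_standard_name(ds, "tas"))  # -> near_surface_air_temperature

Note the fallback still yields "NA" when neither attribute is present, matching the existing behavior.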
35 changes: 28 additions & 7 deletions catalogbuilder/intakebuilder/gfdlcrawler.py
@@ -33,7 +33,7 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
     set_ptemplate = set()
     set_ftemplate = set()

-    if( configyaml is not None):
+    if(configyaml is not None):
         if (configyaml.output_path_template is not None) & (configyaml.output_file_template is not None) :
             list_ptemplate = configyaml.output_path_template
             list_ftemplate = configyaml.output_file_template
@@ -59,7 +59,10 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
         missingcols.remove("path") #because we get this anyway
     logger.debug("Missing cols from metadata sources:"+ (str)(missingcols))

+    #Creating a dictionary to track the unique datasets we come across when using slow mode
+    #The keys are (variable_id, realm) tuples and the values are the standard_names found for each pair
+    unique_datasets = {'':''}

     #TODO INCLUDE filter in traversing through directories at the top
     for dirpath, dirs, files in os.walk(projectdir):
         searchpath = dirpath
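
A note on the cache introduced above: despite the {'':''} seed entry, the lookups later in this diff key the dictionary by (variable_id, realm) tuples, with the resolved standard_name as the value. Illustratively (all names invented for the example):

# Possible contents of unique_datasets after a few files have been opened;
# the variable/realm/standard_name values are made up for illustration.
unique_datasets = {
    ("tas", "atmos_cmip"): "air_temperature",
    ("so", "ocean_monthly"): "sea_water_salinity",
}

Seeding with a plain empty dict would behave the same here, since the '': '' entry can never collide with a tuple key.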
@@ -114,12 +117,30 @@ def crawlLocal(projectdir, dictFilter,dictFilterIgnore,logger,configyaml,slow):
             # todo do the reverse if slow is on. Open file no matter what and populate dictionary values and if there is something missed out
             # we can scan filenames or config etc
             #here, we will see if there are missing header values and compare with file attributes if slow option is turned on
-            if (slow == True) & (bool(dictInfo) == True) :
-                print("Slow option turned on.. lets open some files using xarray and lookup atts",filename)
-                #todo we could look at var attributes, but right now we stick to those that are necessary. scope to extend this easily to missngcols or if header info is not in config yaml
-                if "standard_name" in missingcols:
+            # TODO: Possibly use the slow option if the lookup table can't find standard_name
+            if (slow == True) & (bool(dictInfo) == True):
+                #TODO Possible improvement: get a list of all files to be opened, dmget the files at once or in logical batches before examining with xarray
+                #print("Slow option turned on.. lets open some files using xarray and lookup atts")
+
+                #todo we could look at var attributes, but right now we stick to those that are necessary;
+                #scope to extend this easily to missingcols or if header info is not in config yaml
+                if "standard_name" in missingcols:
+
+                    # Set standard_name to "na" to avoid an error from getInfoFromVarAtts
+                    dictInfo["standard_name"] = "na"
+                    getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)
+
+                    # 'qualities' defines dataset uniqueness and tells us when a file must be opened.
+                    # Here uniqueness is a (variable_id, realm) combination: unique_datasets maps each
+                    # pair already seen to the standard_name found for it, so when the current pair is
+                    # already stored we reuse that standard_name instead of opening the file.
+                    # TODO: extend 'qualities' later to determine uniqueness from more attributes
+                    qualities = (dictInfo["variable_id"],dictInfo["realm"])
+                    if qualities in unique_datasets.keys():
+                        standard_name = unique_datasets[qualities]
+                        dictInfo["standard_name"] = standard_name
+
+                    else:
+                        logger.info("Retrieving standard_name from "+ (str)(filename))
+                        getinfo.getInfoFromVarAtts(dictInfo["path"],dictInfo["variable_id"],dictInfo)
+                        unique_datasets.update({ qualities : dictInfo["standard_name"] })

             #replace frequency as needed
             if 'frequency' in dictInfo.keys():
                 package_dir = os.path.dirname(os.path.abspath(__file__))
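
This hunk is the heart of the speed-up: getinfo.getInfoFromVarAtts opens the netCDF file with xarray, so the crawler now memoizes its results and pays the file-open cost only once per unique (variable_id, realm) pair. A condensed sketch of the pattern, with a hypothetical lookup_standard_name standing in for the file-opening call:

import logging

logger = logging.getLogger(__name__)

# Hypothetical stand-in for getinfo.getInfoFromVarAtts, which opens the
# netCDF file with xarray; this is the expensive step we avoid repeating.
def lookup_standard_name(path: str, variable_id: str) -> str:
    return "air_temperature"  # placeholder result

unique_datasets: dict[tuple[str, str], str] = {}

def resolve_standard_name(dictInfo: dict, filename: str) -> None:
    qualities = (dictInfo["variable_id"], dictInfo["realm"])
    if qualities in unique_datasets:
        # Cache hit: reuse the standard_name found earlier, no file I/O.
        dictInfo["standard_name"] = unique_datasets[qualities]
    else:
        # Cache miss: open the file once, then remember the answer.
        logger.info("Retrieving standard_name from %s", filename)
        dictInfo["standard_name"] = lookup_standard_name(dictInfo["path"], dictInfo["variable_id"])
        unique_datasets[qualities] = dictInfo["standard_name"]

The trade-off worth flagging in review: this assumes every file sharing a realm and variable_id carries the same standard_name, and it skips the xarray work entirely on cache hits.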
2 changes: 1 addition & 1 deletion catalogbuilder/scripts/gen_intake_gfdl.py
@@ -146,7 +146,7 @@ def create_catalog(input_path=None, output_path=None, config=None, filter_realm=
 @click.option('--filter_chunk', nargs=1)
 @click.option('--overwrite', is_flag=True, default=False)
 @click.option('--append', is_flag=True, default=False)
-@click.option('--slow','-s', is_flag=True, default=False)
+@click.option('--slow','-s', is_flag=True, default=False, help='This option looks up standard_name in the netCDF file to fill the standard_name column if it is present in the header specs. If standard_name is absent, long_name with spaces replaced by underscores is used.')
 @click.option('--verbose/--silent', default=False, is_flag=True) #default has silent option. Use --verbose for detailed logging

 def create_catalog_cli(**kwargs):
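
Finally, the expanded help text tells users when to reach for the flag. Based only on the options and signature visible in this diff (the slow= parameter is an assumption, and the paths are placeholders), enabling the slow lookup programmatically would look roughly like:

# A sketch, assuming create_catalog accepts slow= alongside the visible
# input_path/output_path parameters; paths below are placeholders.
from catalogbuilder.scripts.gen_intake_gfdl import create_catalog

create_catalog(
    input_path="/archive/some/experiment",
    output_path="/tmp/catalog",
    slow=True,  # open files with xarray only for unseen (variable_id, realm) pairs
)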