From 0f75f60c92309e163cbfdcacaaa06c7bb6ba40e3 Mon Sep 17 00:00:00 2001 From: Teagan King Date: Mon, 15 Apr 2024 10:07:22 -0600 Subject: [PATCH 01/19] ran black on py files --- cupid/build.py | 36 ++++----- cupid/clear.py | 34 +++++---- cupid/quickstart.py | 2 +- cupid/read.py | 23 +++--- cupid/run.py | 54 +++++++++----- cupid/timeseries.py | 6 +- cupid/util.py | 177 +++++++++++++++++++++++--------------------- docs/conf.py | 72 +++++++++--------- 8 files changed, 217 insertions(+), 187 deletions(-) diff --git a/cupid/build.py b/cupid/build.py index 7c67d09..f030e6f 100755 --- a/cupid/build.py +++ b/cupid/build.py @@ -5,37 +5,39 @@ import os import yaml + def build(): """ Build a Jupyter book based on the TOC in config.yml. Called by `cupid-build`. - + Args: none Returns: None """ - + config_path = str(sys.argv[1]) - + with open(config_path, "r") as fid: control = yaml.safe_load(fid) - + sname = control["data_sources"]["sname"] run_dir = control["data_sources"]["run_dir"] - subprocess.run(["jupyter-book", "clean" , f"{run_dir}/computed_notebooks/{sname}"]) - subprocess.run(["jupyter-book", "build" , f"{run_dir}/computed_notebooks/{sname}", "--all"]) + subprocess.run(["jupyter-book", "clean", f"{run_dir}/computed_notebooks/{sname}"]) + subprocess.run( + ["jupyter-book", "build", f"{run_dir}/computed_notebooks/{sname}", "--all"] + ) -### Originally used this code to copy jupyter book HTML to a location to host it online + ### Originally used this code to copy jupyter book HTML to a location to host it online -# if 'publish_location' in control: - -# user = os.environ.get('USER') -# remote_mach = control["publish_location"]["remote_mach"] -# remote_dir = control["publish_location"]["remote_dir"] -# this seems more complicated than expected...people have mentioned paramiko library? - # subprocess.run(["mkdir", "-p", remote_dir]) - # subprocess.run(["scp", "-r", f"{run_dir}/computed_notebooks/{sname}/_build/html/*", f"{user}@{remote_mach}:{remote_dir}"]) - - return None + # if 'publish_location' in control: + # user = os.environ.get('USER') + # remote_mach = control["publish_location"]["remote_mach"] + # remote_dir = control["publish_location"]["remote_dir"] + # this seems more complicated than expected...people have mentioned paramiko library? 
+ # subprocess.run(["mkdir", "-p", remote_dir]) + # subprocess.run(["scp", "-r", f"{run_dir}/computed_notebooks/{sname}/_build/html/*", f"{user}@{remote_mach}:{remote_dir}"]) + + return None diff --git a/cupid/clear.py b/cupid/clear.py index 869980c..b82887c 100755 --- a/cupid/clear.py +++ b/cupid/clear.py @@ -4,33 +4,35 @@ import cupid.util import shutil + def readConfigFile(config_path): - #Given the file path to config.yml, this function reads the config file content and - #returns the val of the run_dir string with '/computed_notebooks' appended to it - - #Obtain the contents of the config.yml file and extract the run_dir variable + # Given the file path to config.yml, this function reads the config file content and + # returns the val of the run_dir string with '/computed_notebooks' appended to it + + # Obtain the contents of the config.yml file and extract the run_dir variable control = cupid.util.get_control_dict(config_path) - run_dir = control['data_sources'].get('run_dir', None) - + run_dir = control["data_sources"].get("run_dir", None) + if run_dir: - #Append '/computed_notebooks' to the run_dir value if it is not empty - fullPath = os.path.join(run_dir, 'computed_notebooks') + # Append '/computed_notebooks' to the run_dir value if it is not empty + fullPath = os.path.join(run_dir, "computed_notebooks") return fullPath - - else: #run_dir is empty/wasn't found in config file so return error + + else: # run_dir is empty/wasn't found in config file so return error raise ValueError("'run_dir' was empty/not found in the config file.") + @click.command() -@click.argument('config_path') -#Entry point to this script +@click.argument("config_path") +# Entry point to this script def clear(config_path): """Clears the contents of the 'computed_notebooks' folder at the location specified by the 'run_dir' variable in the 'config.yml' file. - + Args: config_path - The path to the config.yml file. """ - + run_dir = readConfigFile(config_path) - #Delete the 'computed_notebooks' folder and all the contents inside of it + # Delete the 'computed_notebooks' folder and all the contents inside of it shutil.rmtree(run_dir) - print(f"All contents in {run_dir} have been cleared.") \ No newline at end of file + print(f"All contents in {run_dir} have been cleared.") diff --git a/cupid/quickstart.py b/cupid/quickstart.py index 39ee180..fbe77ca 100644 --- a/cupid/quickstart.py +++ b/cupid/quickstart.py @@ -1,2 +1,2 @@ ### To be created: a script (maybe called through a command line entry point) that sets up a directory with a config.yml file and -### basics necessary to set up a notebook collection \ No newline at end of file +### basics necessary to set up a notebook collection diff --git a/cupid/read.py b/cupid/read.py index 4164308..11b0818 100644 --- a/cupid/read.py +++ b/cupid/read.py @@ -1,9 +1,10 @@ import intake import yaml + def read_yaml(path_to_yaml): with open(path_to_yaml) as f: - data = yaml.load(f, Loader=yaml.FullLoader) + data = yaml.load(f, Loader=yaml.FullLoader) return data @@ -11,22 +12,22 @@ def get_collection(path_to_catalog, **kwargs): cat = intake.open_esm_datastore(path_to_catalog) ### note that the json file points to the csv, so the path that the ### yaml file contains doesn't actually get used. 
this can cause issues - + cat_subset = cat.search(**kwargs) - + if "variable" in kwargs.keys(): - + def preprocess(ds): ## the double brackets return a Dataset rather than a DataArray ## this is fragile and could cause issues, i'm not totally sure what subsetting on time_bound does - return ds[[kwargs["variable"], 'time_bound']] - + return ds[[kwargs["variable"], "time_bound"]] + ## not sure what the chunking kwarg is doing here either - dsets = cat_subset.to_dataset_dict(xarray_open_kwargs={'chunks': {'time': -1}}, preprocess=preprocess) - + dsets = cat_subset.to_dataset_dict( + xarray_open_kwargs={"chunks": {"time": -1}}, preprocess=preprocess + ) + else: dsets = cat_subset.to_dataset_dict() - - return dsets - + return dsets diff --git a/cupid/run.py b/cupid/run.py index 79798f3..72f9f70 100755 --- a/cupid/run.py +++ b/cupid/run.py @@ -34,7 +34,7 @@ def run(config_path, serial=False, time_series=False): control = cupid.util.get_control_dict(config_path) cupid.util.setup_book(config_path) - ##################################################################### + ##################################################################### # Managing global parameters global_params = dict() @@ -49,26 +49,40 @@ def run(config_path, serial=False, time_series=False): # general timeseries arguments for all components num_procs = timeseries_params["num_procs"] - - - for component in ['atm', 'ocn', 'lnd', 'ice', 'glc']: + for component in ["atm", "ocn", "lnd", "ice", "glc"]: cupid.timeseries.create_time_series( - component, - timeseries_params[component]["vars"], - timeseries_params[component]["derive_vars"], - [timeseries_params["case_name"]], # could also grab from compute_notebooks section of config file - timeseries_params[component]["hist_str"], - [global_params["CESM_output_dir"] + "/" + timeseries_params["case_name"] + f"/{component}/hist/"], # could also grab from compute_notebooks section of config file - [global_params["CESM_output_dir"]+'/'+timeseries_params['case_name']+f'/{component}/proc/tseries/'], - # Note that timeseries output will eventually go in /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/ - timeseries_params["ts_done"], - timeseries_params["overwrite_ts"], - timeseries_params[component]["start_years"], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.start_date - timeseries_params[component]["end_years"], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.end_date - timeseries_params[component]["level"], - num_procs, - serial, - ) + component, + timeseries_params[component]["vars"], + timeseries_params[component]["derive_vars"], + [ + timeseries_params["case_name"] + ], # could also grab from compute_notebooks section of config file + timeseries_params[component]["hist_str"], + [ + global_params["CESM_output_dir"] + + "/" + + timeseries_params["case_name"] + + f"/{component}/hist/" + ], # could also grab from compute_notebooks section of config file + [ + global_params["CESM_output_dir"] + + "/" + + timeseries_params["case_name"] + + f"/{component}/proc/tseries/" + ], + # Note that timeseries output will eventually go in /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/ + timeseries_params["ts_done"], + timeseries_params["overwrite_ts"], + timeseries_params[component][ + 
"start_years" + ], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.start_date + timeseries_params[component][ + "end_years" + ], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.end_date + timeseries_params[component]["level"], + num_procs, + serial, + ) # Grab paths diff --git a/cupid/timeseries.py b/cupid/timeseries.py index 86bec80..1f7e7ae 100644 --- a/cupid/timeseries.py +++ b/cupid/timeseries.py @@ -229,8 +229,10 @@ def create_time_series( diag_var_list = hist_file_var_list for var in diag_var_list: if var not in hist_file_var_list: - if component == 'ocn': - print('ocean vars seem to not be present in all files and thus cause errors') + if component == "ocn": + print( + "ocean vars seem to not be present in all files and thus cause errors" + ) continue if ( var in derive_vars.keys() diff --git a/cupid/util.py b/cupid/util.py index 340f2b8..c695102 100644 --- a/cupid/util.py +++ b/cupid/util.py @@ -81,28 +81,28 @@ def get_control_dict(config_path): except FileNotFoundError: print(f"ERROR: {config_path} not found") sys.exit(1) - + # theoretically ploomber should manage this kernel checking by itself, but this seems to add - # the default kernel to info where necessary. currently a bit messy with copy pasting in + # the default kernel to info where necessary. currently a bit messy with copy pasting in # script stuff. - + default_kernel_name = control["computation_config"].pop("default_kernel_name", None) if default_kernel_name is not None: - + for d in control["compute_notebooks"].values(): if "kernel_name" not in d: d["kernel_name"] = default_kernel_name - + if "compute_scripts" in control: for d in control["compute_scripts"].values(): if "kernel_name" not in d: d["kernel_name"] = default_kernel_name - + else: for nb, d in control["compute_notebooks"].items(): assert "kernel_name" in d, f"kernel information missing for {nb}.ipynb" - + for script, d in control["compute_scripts"].items(): assert "kernel_name" in d, f"kernel information missing for {script}.py" @@ -112,7 +112,7 @@ def get_control_dict(config_path): if "compute_scripts" in control: for script, d in control["compute_scripts"].items(): manage_conda_kernel(d["kernel_name"]).ensure_installed() - + return control @@ -122,20 +122,19 @@ def setup_book(config_path): control = get_control_dict(config_path) # ensure directory - run_dir = os.path.expanduser(control['data_sources']["run_dir"]) + run_dir = os.path.expanduser(control["data_sources"]["run_dir"]) output_root = run_dir + "/computed_notebooks" - + os.makedirs(output_root, exist_ok=True) - + output_dir = f'{output_root}/{control["data_sources"]["sname"]}' - + os.makedirs(output_dir, exist_ok=True) - + # create temp catalog directory temp_data_path = run_dir + "/temp_data" - + os.makedirs(temp_data_path, exist_ok=True) - # write table of contents file toc = control["book_toc"] @@ -143,9 +142,9 @@ def setup_book(config_path): yaml.dump(toc, fid, sort_keys=False) # read config defaults - + path_to_here = os.path.dirname(os.path.realpath(__file__)) - + with open(f"{path_to_here}/_jupyter-book-config-defaults.yml", "r") as fid: config = yaml.safe_load(fid) @@ -157,26 +156,27 @@ def setup_book(config_path): yaml.dump(config, fid, sort_keys=False) # get list of computational notebooks - - nb_path_root = 
os.path.expanduser(control['data_sources']['nb_path_root']) - - compute_notebooks = [f"{nb_path_root}/{f}.ipynb" for f in control["compute_notebooks"].keys()] + + nb_path_root = os.path.expanduser(control["data_sources"]["nb_path_root"]) + + compute_notebooks = [ + f"{nb_path_root}/{f}.ipynb" for f in control["compute_notebooks"].keys() + ] # get toc files; ignore glob expressions toc_files = get_toc_files(nb_path_root, toc, include_glob=False) copy_files = list(set(toc_files) - set(compute_notebooks)) - for src in copy_files: shutil.copyfile(src, f"{output_dir}/{src}") - - + + def get_toc_files(nb_path_root, toc_dict, include_glob=True): """return a list of files in the _toc.yml""" def _toc_files(toc_dict, file_list=[]): for key, value in toc_dict.items(): - + if key in ["root", "file", "glob"]: if not include_glob and key == "glob": continue @@ -205,10 +205,12 @@ def _toc_files(toc_dict, file_list=[]): return _toc_files(toc_dict) -def create_ploomber_nb_task(nb, info, cat_path, nb_path_root, output_dir, global_params, dag, dependency=None): +def create_ploomber_nb_task( + nb, info, cat_path, nb_path_root, output_dir, global_params, dag, dependency=None +): """ Creates a ploomber task for running a notebook, including necessary parameters. - + Args: nb: key from dict of notebooks info: various specifications for the notebook, originally from config.yml @@ -218,68 +220,76 @@ def create_ploomber_nb_task(nb, info, cat_path, nb_path_root, output_dir, global global_params: global parameters from config.yml dag: ploomber DAG to add the task to dependency: what the upstream task is - + Returns: task: ploomber task object """ - parameter_groups = info['parameter_groups'] + parameter_groups = info["parameter_groups"] ### passing in subset kwargs if they're provided - if 'subset' in info: - subset_kwargs = info['subset'] + if "subset" in info: + subset_kwargs = info["subset"] else: subset_kwargs = {} default_params = {} - if 'default_params' in info: - default_params = info['default_params'] + if "default_params" in info: + default_params = info["default_params"] for key, parms in parameter_groups.items(): - input_path = f'{nb_path_root}/{nb}.ipynb' - output_name = ( - f'{nb}-{key}' - if key != 'none' else f'{nb}' - ) + input_path = f"{nb_path_root}/{nb}.ipynb" + output_name = f"{nb}-{key}" if key != "none" else f"{nb}" + + output_path = f"{output_dir}/{output_name}" - output_path = f'{output_dir}/{output_name}' - ### all of these things should be optional parms_in = dict(**default_params) parms_in.update(**global_params) parms_in.update(dict(**parms)) - - parms_in['subset_kwargs'] = subset_kwargs - + + parms_in["subset_kwargs"] = subset_kwargs + if cat_path != None: - parms_in['path_to_cat'] = cat_path - - + parms_in["path_to_cat"] = cat_path + pm_params = { - 'engine_name': 'md_jinja', - 'jinja_data': parms, - 'cwd': nb_path_root} - + "engine_name": "md_jinja", + "jinja_data": parms, + "cwd": nb_path_root, + } + pm.engines.papermill_engines._engines["md_jinja"] = md_jinja_engine - - task = ploomber.tasks.NotebookRunner(Path(input_path), ploomber.products.File(output_path + '.ipynb'), dag, params=parms_in, papermill_params=pm_params, kernelspec_name=info['kernel_name'], name=output_name) - + + task = ploomber.tasks.NotebookRunner( + Path(input_path), + ploomber.products.File(output_path + ".ipynb"), + dag, + params=parms_in, + papermill_params=pm_params, + kernelspec_name=info["kernel_name"], + name=output_name, + ) + print(output_name) - + if dependency != None: raise NotImplementedError - # set 
DAG dependency here + # set DAG dependency here # something with task.set_upstream(other_task?) - + return task -def create_ploomber_script_task(script, info, cat_path, nb_path_root, global_params, dag, dependency=None): + +def create_ploomber_script_task( + script, info, cat_path, nb_path_root, global_params, dag, dependency=None +): """ Creates a ploomber task for running a script, including necessary parameters. - + UPDATE THIS DOCSTRING - + Args: script: key from dict of scripts info: various specifications for the notebook, originally from config.yml @@ -288,50 +298,51 @@ def create_ploomber_script_task(script, info, cat_path, nb_path_root, global_par global_params: global parameters from config.yml dag: ploomber DAG to add the task to dependency: what the upstream task is - + Returns: task: ploomber task object """ - parameter_groups = info['parameter_groups'] + parameter_groups = info["parameter_groups"] ### passing in subset kwargs if they're provided - if 'subset' in info: - subset_kwargs = info['subset'] + if "subset" in info: + subset_kwargs = info["subset"] else: subset_kwargs = {} default_params = {} - if 'default_params' in info: - default_params = info['default_params'] + if "default_params" in info: + default_params = info["default_params"] for key, parms in parameter_groups.items(): - input_path = f'{nb_path_root}/{script}.py' - output_name = ( - f'{script}-{key}' - if key != 'none' else f'{script}' - ) + input_path = f"{nb_path_root}/{script}.py" + output_name = f"{script}-{key}" if key != "none" else f"{script}" + + # output_path = f'{output_dir}/{output_name}' - #output_path = f'{output_dir}/{output_name}' - ### all of these things should be optional parms_in = dict(**default_params) parms_in.update(**global_params) parms_in.update(dict(**parms)) - - parms_in['subset_kwargs'] = subset_kwargs - + + parms_in["subset_kwargs"] = subset_kwargs + if cat_path != None: - parms_in['path_to_cat'] = cat_path - - - - task = ploomber.tasks.ScriptRunner(Path(input_path), ploomber.products.File(info['product']), dag, params=parms_in, name=output_name) - + parms_in["path_to_cat"] = cat_path + + task = ploomber.tasks.ScriptRunner( + Path(input_path), + ploomber.products.File(info["product"]), + dag, + params=parms_in, + name=output_name, + ) + if dependency != None: raise NotImplementedError - # set DAG dependency here + # set DAG dependency here # something with task.set_upstream(other_task?) - + return task diff --git a/docs/conf.py b/docs/conf.py index 9b028eb..39514e0 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,7 +14,7 @@ import datetime import re -sys.path.insert(0, os.path.abspath('../..')) +sys.path.insert(0, os.path.abspath("../..")) print("sys.path:", sys.path) @@ -22,30 +22,29 @@ # This block allows us to remove the header image from any md files # without affecting the original version, but still pull the source # into the docs build fresh each time. 
-for file in ['README.md', 'NCAR_tips.md']: - os.system(f'cp ../{file} ./') +for file in ["README.md", "NCAR_tips.md"]: + os.system(f"cp ../{file} ./") - # Remove any images from the first line of the file - with open(file, 'r') as f: - file1 = f.readline() - file1 = re.sub(' ', '', file1) - file_rest = f.read() + # Remove any images from the first line of the file + with open(file, "r") as f: + file1 = f.readline() + file1 = re.sub(" ", "", file1) + file_rest = f.read() - with open(file, 'w') as f: - f.write(file1+file_rest) + with open(file, "w") as f: + f.write(file1 + file_rest) # -- Project information ----------------------------------------------------- -project = 'CUPiD' +project = "CUPiD" current_year = datetime.datetime.now().year -copyright = u'{}, University Corporation for Atmospheric Research'.format( - current_year) +copyright = "{}, University Corporation for Atmospheric Research".format(current_year) -author = 'NSF NCAR' +author = "NSF NCAR" # The master toctree document. -master_doc = 'index' +master_doc = "index" # -- General configuration --------------------------------------------------- @@ -55,41 +54,41 @@ # ones. extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.napoleon', - 'sphinx.ext.autosummary', - 'sphinx.ext.intersphinx', - 'myst_nb', + "sphinx.ext.autodoc", + "sphinx.ext.napoleon", + "sphinx.ext.autosummary", + "sphinx.ext.intersphinx", + "myst_nb", "sphinx_design", "nbsphinx", ] intersphinx_mapping = { - 'dask': ('https://docs.dask.org/en/latest/', None), - 'python': ('https://docs.python.org/3/', None), - 'numpy': ("https://numpy.org/doc/stable", None), - 'scipy': ('https://docs.scipy.org/doc/scipy/reference/', None), - 'xarray': ('http://xarray.pydata.org/en/stable/', None), - 'pint': ('https://pint.readthedocs.io/en/stable/', None), - 'cftime': ('https://unidata.github.io/cftime/', None), + "dask": ("https://docs.dask.org/en/latest/", None), + "python": ("https://docs.python.org/3/", None), + "numpy": ("https://numpy.org/doc/stable", None), + "scipy": ("https://docs.scipy.org/doc/scipy/reference/", None), + "xarray": ("http://xarray.pydata.org/en/stable/", None), + "pint": ("https://pint.readthedocs.io/en/stable/", None), + "cftime": ("https://unidata.github.io/cftime/", None), } autosummary_generate = True # Add any paths that contain templates here, relative to this directory. -templates_path = ['_templates'] +templates_path = ["_templates"] # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ['**.ipynb_checkpoints'] +exclude_patterns = ["**.ipynb_checkpoints"] # The suffix(es) of source filenames. # You can specify multiple suffix as a list of string: # source_suffix = ['.rst', '.md'] source_suffix = { - '.rst': 'restructuredtext', - '.ipynb': 'myst-nb', + ".rst": "restructuredtext", + ".ipynb": "myst-nb", } @@ -116,16 +115,15 @@ use_repository_button=True, use_issues_button=True, home_page_in_toc=True, - extra_footer= - "The National Center for Atmospheric Research is sponsored by the National Science Foundation. Any opinions, findings and conclusions or recommendations expressed in this material do not necessarily reflect the views of the National Science Foundation.", + extra_footer="The National Center for Atmospheric Research is sponsored by the National Science Foundation. 
Any opinions, findings and conclusions or recommendations expressed in this material do not necessarily reflect the views of the National Science Foundation.", ) # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, # so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ['_static'] +html_static_path = ["_static"] -html_logo = '_static/images/logos/logo.png' -html_favicon = '_static/images/logos/logo.png' +html_logo = "_static/images/logos/logo.png" +html_favicon = "_static/images/logos/logo.png" -autoclass_content = 'both' +autoclass_content = "both" From 85331ee4caf69187899dd9caeed52c90386e7d16 Mon Sep 17 00:00:00 2001 From: Teagan King Date: Mon, 15 Apr 2024 11:05:09 -0600 Subject: [PATCH 02/19] addressing some pylint issues --- cupid/build.py | 4 ++-- cupid/clear.py | 20 +++++++++++--------- cupid/read.py | 8 +++++--- cupid/run.py | 18 ++++++------------ cupid/timeseries.py | 12 ++++-------- cupid/util.py | 25 ++++++++++++++----------- 6 files changed, 42 insertions(+), 45 deletions(-) diff --git a/cupid/build.py b/cupid/build.py index f030e6f..9ec4c6a 100755 --- a/cupid/build.py +++ b/cupid/build.py @@ -2,7 +2,6 @@ import subprocess import sys -import os import yaml @@ -38,6 +37,7 @@ def build(): # remote_dir = control["publish_location"]["remote_dir"] # this seems more complicated than expected...people have mentioned paramiko library? # subprocess.run(["mkdir", "-p", remote_dir]) - # subprocess.run(["scp", "-r", f"{run_dir}/computed_notebooks/{sname}/_build/html/*", f"{user}@{remote_mach}:{remote_dir}"]) + # subprocess.run(["scp", "-r", f"{run_dir}/computed_notebooks/{sname}/_build/html/*", + # f"{user}@{remote_mach}:{remote_dir}"]) return None diff --git a/cupid/clear.py b/cupid/clear.py index b82887c..c1eabb2 100755 --- a/cupid/clear.py +++ b/cupid/clear.py @@ -1,22 +1,23 @@ #!/usr/bin/env python import os +import shutil import click import cupid.util -import shutil - -def readConfigFile(config_path): - # Given the file path to config.yml, this function reads the config file content and - # returns the val of the run_dir string with '/computed_notebooks' appended to it +def read_config_file(config_path): + """ + Given the file path to config.yml, this function reads the config file content and + returns the val of the run_dir string with '/computed_notebooks' appended to it + """ # Obtain the contents of the config.yml file and extract the run_dir variable control = cupid.util.get_control_dict(config_path) run_dir = control["data_sources"].get("run_dir", None) if run_dir: # Append '/computed_notebooks' to the run_dir value if it is not empty - fullPath = os.path.join(run_dir, "computed_notebooks") - return fullPath + full_path = os.path.join(run_dir, "computed_notebooks") + return full_path else: # run_dir is empty/wasn't found in config file so return error raise ValueError("'run_dir' was empty/not found in the config file.") @@ -26,13 +27,14 @@ def readConfigFile(config_path): @click.argument("config_path") # Entry point to this script def clear(config_path): - """Clears the contents of the 'computed_notebooks' folder at the location specified by the 'run_dir' variable in the 'config.yml' file. + """Clears the contents of the 'computed_notebooks' folder at the location + specified by the 'run_dir' variable in the 'config.yml' file. Args: config_path - The path to the config.yml file. 
""" - run_dir = readConfigFile(config_path) + run_dir = read_config_file(config_path) # Delete the 'computed_notebooks' folder and all the contents inside of it shutil.rmtree(run_dir) print(f"All contents in {run_dir} have been cleared.") diff --git a/cupid/read.py b/cupid/read.py index 11b0818..279a437 100644 --- a/cupid/read.py +++ b/cupid/read.py @@ -3,12 +3,14 @@ def read_yaml(path_to_yaml): - with open(path_to_yaml) as f: - data = yaml.load(f, Loader=yaml.FullLoader) + """Read yaml file and return data from loaded yaml file""" + with open(path_to_yaml) as file: + data = yaml.load(file, Loader=yaml.FullLoader) return data def get_collection(path_to_catalog, **kwargs): + """Get collection of datasets from intake catalog""" cat = intake.open_esm_datastore(path_to_catalog) ### note that the json file points to the csv, so the path that the ### yaml file contains doesn't actually get used. this can cause issues @@ -19,7 +21,7 @@ def get_collection(path_to_catalog, **kwargs): def preprocess(ds): ## the double brackets return a Dataset rather than a DataArray - ## this is fragile and could cause issues, i'm not totally sure what subsetting on time_bound does + ## this is fragile and could cause issues, not sure what subsetting on time_bound does return ds[[kwargs["variable"], "time_bound"]] ## not sure what the chunking kwarg is doing here either diff --git a/cupid/run.py b/cupid/run.py index 72f9f70..e415923 100755 --- a/cupid/run.py +++ b/cupid/run.py @@ -1,17 +1,11 @@ #!/usr/bin/env python -import click import os -from glob import glob -import papermill as pm +import click +import ploomber import intake import cupid.util import cupid.timeseries -from dask.distributed import Client -import dask -import time -import ploomber -import yaml CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) @@ -70,15 +64,16 @@ def run(config_path, serial=False, time_series=False): + timeseries_params["case_name"] + f"/{component}/proc/tseries/" ], - # Note that timeseries output will eventually go in /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/ + # Note that timeseries output will eventually go in + # /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/ timeseries_params["ts_done"], timeseries_params["overwrite_ts"], timeseries_params[component][ "start_years" - ], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.start_date + ], timeseries_params[component][ "end_years" - ], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.end_date + ], timeseries_params[component]["level"], num_procs, serial, @@ -101,7 +96,6 @@ def run(config_path, serial=False, time_series=False): cat_path = None if "path_to_cat_json" in control["data_sources"]: - use_catalog = True full_cat_path = os.path.realpath( os.path.expanduser(control["data_sources"]["path_to_cat_json"]) ) diff --git a/cupid/timeseries.py b/cupid/timeseries.py index 1f7e7ae..d97aefb 100644 --- a/cupid/timeseries.py +++ b/cupid/timeseries.py @@ -6,16 +6,12 @@ # Import standard python modules # ++++++++++++++++++++++++++++++ -import sys import glob import multiprocessing as mp import os import subprocess -import xarray as xr - -import importlib - from pathlib import Path +import xarray as xr def call_ncrcat(cmd): @@ -50,7 +46,7 @@ def create_time_series( ---- - 
component: str name of component, eg 'cam' - # This could alternatively be made into a dictionary and encorporate values such as height_dim + # This could also be made into a dict and encorporate values such as height_dim - derive_vars: dict information on derivable variables eg, {'PRECT': ['PRECL','PRECC'], @@ -62,7 +58,7 @@ def create_time_series( - hist_locs: list, str location of CESM history files - ts_dir: list, str - location where time series files will be saved, or where pre-made time series files exist + location where time series files will be saved, or pre-made time series files exist - ts_done: list, boolean check if time series files already exist - overwrite_ts: list, boolean @@ -327,7 +323,7 @@ def create_time_series( # End variable loop - if vars_to_derive != []: + if vars_to_derive: if component == "atm": derive_cam_variables( vars_to_derive=vars_to_derive, ts_dir=ts_dir[case_idx] diff --git a/cupid/util.py b/cupid/util.py index c695102..6009504 100644 --- a/cupid/util.py +++ b/cupid/util.py @@ -5,17 +5,16 @@ import subprocess import json import sys +from pathlib import Path import yaml import jupyter_client import papermill as pm import ploomber from papermill.engines import NBClientEngine from jinja2 import Template -import dask -from pathlib import Path -class manage_conda_kernel(object): +class ManageCondaKernel(object): """ Manage conda kernels so they can be seen by `papermill` """ @@ -37,6 +36,7 @@ def getcwd(self): return None def isinstalled(self): + """Check if kernel is installed""" return self.kernel_name in jupyter_client.kernelspec.find_kernel_specs() def ensure_installed(self): @@ -61,9 +61,11 @@ def ensure_installed(self): assert self.isinstalled() -class md_jinja_engine(NBClientEngine): +class MdJinjaEngine(NBClientEngine): + """Class for using the Jinja Engine to run notebooks""" @classmethod def execute_managed_notebook(cls, nb_man, kernel_name, **kwargs): + """Execute notebooks with papermill execution engine""" jinja_data = {} if "jinja_data" not in kwargs else kwargs["jinja_data"] # call the papermill execution engine: @@ -75,6 +77,7 @@ def execute_managed_notebook(cls, nb_man, kernel_name, **kwargs): def get_control_dict(config_path): + """Get control dictionary from configuration file""" try: with open(config_path, "r") as fid: control = yaml.safe_load(fid) @@ -107,11 +110,11 @@ def get_control_dict(config_path): assert "kernel_name" in d, f"kernel information missing for {script}.py" for nb, d in control["compute_notebooks"].items(): - manage_conda_kernel(d["kernel_name"]).ensure_installed() + ManageCondaKernel(d["kernel_name"]).ensure_installed() if "compute_scripts" in control: for script, d in control["compute_scripts"].items(): - manage_conda_kernel(d["kernel_name"]).ensure_installed() + ManageCondaKernel(d["kernel_name"]).ensure_installed() return control @@ -251,7 +254,7 @@ def create_ploomber_nb_task( parms_in["subset_kwargs"] = subset_kwargs - if cat_path != None: + if cat_path is not None: parms_in["path_to_cat"] = cat_path pm_params = { @@ -260,7 +263,7 @@ def create_ploomber_nb_task( "cwd": nb_path_root, } - pm.engines.papermill_engines._engines["md_jinja"] = md_jinja_engine + pm.engines.papermill_engines._engines["md_jinja"] = MdJinjaEngine task = ploomber.tasks.NotebookRunner( Path(input_path), @@ -274,7 +277,7 @@ def create_ploomber_nb_task( print(output_name) - if dependency != None: + if dependency is not None: raise NotImplementedError # set DAG dependency here # something with task.set_upstream(other_task?) 
@@ -329,7 +332,7 @@ def create_ploomber_script_task( parms_in["subset_kwargs"] = subset_kwargs - if cat_path != None: + if cat_path is not None: parms_in["path_to_cat"] = cat_path task = ploomber.tasks.ScriptRunner( @@ -340,7 +343,7 @@ def create_ploomber_script_task( name=output_name, ) - if dependency != None: + if dependency is not None: raise NotImplementedError # set DAG dependency here # something with task.set_upstream(other_task?) From 8d2bc0371e00380f3c4a08aa60b157b657eeeba9 Mon Sep 17 00:00:00 2001 From: Teagan King Date: Mon, 15 Apr 2024 11:16:41 -0600 Subject: [PATCH 03/19] added module docstrings --- cupid/build.py | 9 +++++++++ cupid/clear.py | 9 +++++++++ cupid/quickstart.py | 3 ++- cupid/read.py | 8 ++++++++ cupid/util.py | 16 ++++++++++++++++ 5 files changed, 44 insertions(+), 1 deletion(-) diff --git a/cupid/build.py b/cupid/build.py index 9ec4c6a..3f37862 100755 --- a/cupid/build.py +++ b/cupid/build.py @@ -1,4 +1,13 @@ #!/usr/bin/env python +""" +This script provides functionality to build a Jupyter book based on +the configuration specified in a YAML file. + +The main function `build()` reads the configuration file provided as a command-line +argument, extracts the necessary information such as the name of the book and the +directory containing computed notebooks, and then proceeds to clean and build the +Jupyter book using the `jupyter-book` command-line tool. +""" import subprocess import sys diff --git a/cupid/clear.py b/cupid/clear.py index c1eabb2..74a827b 100755 --- a/cupid/clear.py +++ b/cupid/clear.py @@ -1,4 +1,13 @@ #!/usr/bin/env python +""" +This script provides functionality to clear the contents of the 'computed_notebooks' folder +at the location specified by the 'run_dir' variable in the 'config.yml' file. + +The main function `clear()` takes the path to the config.yml file as input, reads the config file +to obtain the 'run_dir' variable, and then deletes the contents of the 'computed_notebooks' folder +at that location. +""" + import os import shutil import click diff --git a/cupid/quickstart.py b/cupid/quickstart.py index fbe77ca..8c17864 100644 --- a/cupid/quickstart.py +++ b/cupid/quickstart.py @@ -1,2 +1,3 @@ -### To be created: a script (maybe called through a command line entry point) that sets up a directory with a config.yml file and +### To be created: a script, maybe called through a command line entry point, +### that sets up a directory with a config.yml file and ### basics necessary to set up a notebook collection diff --git a/cupid/read.py b/cupid/read.py index 279a437..95ac850 100644 --- a/cupid/read.py +++ b/cupid/read.py @@ -1,3 +1,11 @@ +""" +This module provides functions for reading YAML files and working with intake catalogs. + +Functions: + - read_yaml(path_to_yaml): Read a YAML file and return its content as a dictionary. + - get_collection(path_to_catalog, **kwargs): Get a collection of datasets from an intake catalog based on specified criteria. +""" + import intake import yaml diff --git a/cupid/util.py b/cupid/util.py index 6009504..ae270b3 100644 --- a/cupid/util.py +++ b/cupid/util.py @@ -1,3 +1,19 @@ +""" +This module provides functions and classes for managing conda kernels, +executing notebooks with custom engines, and creating tasks for Ploomber DAGs. + +Functions: + - get_control_dict(): Get the control dictionary from a configuration file. + - setup_book(): Setup run dir and output Jupyter book based on config.yaml + - get_toc_files(): Return a list of files in the '_toc.yml'. 
+ - create_ploomber_nb_task(): Create a Ploomber task for running a notebook. + - create_ploomber_script_task(): Create a Ploomber task for running a script. + +Classes: + - ManageCondaKernel: Class for managing conda kernels. + - MdJinjaEngine: Class for using the Jinja Engine to run notebooks. +""" + import os import shutil from glob import glob From cacb0db75b63453026e56313acd9279fa3c84fc3 Mon Sep 17 00:00:00 2001 From: Teagan King Date: Mon, 15 Apr 2024 11:23:35 -0600 Subject: [PATCH 04/19] this black fix should be ignored for readability... --- cupid/run.py | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/cupid/run.py b/cupid/run.py index e415923..6470fc4 100755 --- a/cupid/run.py +++ b/cupid/run.py @@ -48,32 +48,18 @@ def run(config_path, serial=False, time_series=False): component, timeseries_params[component]["vars"], timeseries_params[component]["derive_vars"], - [ - timeseries_params["case_name"] - ], # could also grab from compute_notebooks section of config file + [timeseries_params["case_name"]], + # could also grab from compute_notebooks section of config file timeseries_params[component]["hist_str"], - [ - global_params["CESM_output_dir"] - + "/" - + timeseries_params["case_name"] - + f"/{component}/hist/" - ], # could also grab from compute_notebooks section of config file - [ - global_params["CESM_output_dir"] - + "/" - + timeseries_params["case_name"] - + f"/{component}/proc/tseries/" - ], + [global_params["CESM_output_dir"]+"/"+timeseries_params["case_name"]+f"/{component}/hist/"], + # could also grab from compute_notebooks section of config file + [global_params["CESM_output_dir"]+"/"+timeseries_params["case_name"]+f"/{component}/proc/tseries/"], # Note that timeseries output will eventually go in # /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/ timeseries_params["ts_done"], timeseries_params["overwrite_ts"], - timeseries_params[component][ - "start_years" - ], - timeseries_params[component][ - "end_years" - ], + timeseries_params[component]["start_years"], + timeseries_params[component]["end_years"], timeseries_params[component]["level"], num_procs, serial, From 2433567962a6837c3c26671c3f30a0a31a149f17 Mon Sep 17 00:00:00 2001 From: Teagan King <98482480+TeaganKing@users.noreply.github.com> Date: Tue, 7 May 2024 09:47:39 -0600 Subject: [PATCH 05/19] Update util.py with a few additional pylint changes --- cupid/util.py | 56 +++++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/cupid/util.py b/cupid/util.py index b41dbf2..cb36faf 100644 --- a/cupid/util.py +++ b/cupid/util.py @@ -15,22 +15,15 @@ """ import os -import shutil -from glob import glob -import pathlib -import subprocess -import json import sys from pathlib import Path -import yaml +import warnings import jupyter_client import papermill as pm import ploomber from papermill.engines import NBClientEngine from jinja2 import Template -import dask -from pathlib import Path -import warnings +import yaml class MdJinjaEngine(NBClientEngine): @@ -60,14 +53,14 @@ def get_control_dict(config_path): default_kernel_name = control["computation_config"].pop("default_kernel_name", None) control["env_check"] = dict() - + if "compute_notebooks" in control: for nb_category in control["compute_notebooks"].values(): - for nb, info in nb_category.items(): + for n_b, info in nb_category.items(): info["kernel_name"] = info.get("kernel_name", default_kernel_name) if info["kernel_name"] is None: 
info["kernel_name"] = "cupid-analysis" - warnings.warn(f"No conda environment specified for {nb}.ipynb and no default kernel set, will use cupid-analysis environment.") + warnings.warn(f"No conda environment specified for {n_b}.ipynb and no default kernel set, will use cupid-analysis environment.") if info["kernel_name"] not in control["env_check"]: control["env_check"][info["kernel_name"]] = info["kernel_name"] in jupyter_client.kernelspec.find_kernel_specs() @@ -124,13 +117,13 @@ def setup_book(config_path): yaml.dump(config, fid, sort_keys=False) return None - + def create_ploomber_nb_task(nb, info, cat_path, nb_path_root, output_dir, global_params, dag, dependency=None): """ Creates a ploomber task for running a notebook, including necessary parameters. Args: - nb: key from dict of notebooks + n_b: key from dict of notebooks info: various specifications for the notebook, originally from config.yml use_catalog: bool specified earlier, specifying if whole collection uses a catalog or not nb_path_root: from config.yml, path to folder containing template notebooks @@ -157,8 +150,8 @@ def create_ploomber_nb_task(nb, info, cat_path, nb_path_root, output_dir, global for key, parms in parameter_groups.items(): - input_path = f"{nb_path_root}/{nb}.ipynb" - output_name = f"{nb}-{key}" if key != "none" else f"{nb}" + input_path = f"{nb_path_root}/{n_b}.ipynb" + output_name = f"{n_b}-{key}" if key != "none" else f"{n_b}" output_path = f"{output_dir}/{output_name}" @@ -175,11 +168,11 @@ def create_ploomber_nb_task(nb, info, cat_path, nb_path_root, output_dir, global pm_params = {"engine_name": "md_jinja", "jinja_data": parms, "cwd": nb_path_root} - - pm.engines.papermill_engines._engines["md_jinja"] = md_jinja_engine - + + pm.engines.papermill_engines._engines["md_jinja"] = MdJinjaEngine + task = ploomber.tasks.NotebookRunner(Path(input_path), ploomber.products.File(output_path + ".ipynb"), dag, params=parms_in, papermill_params=pm_params, kernelspec_name=info['kernel_name'], name=output_name) - + if dependency != None: raise NotImplementedError # set DAG dependency here @@ -192,21 +185,22 @@ def create_ploomber_script_task( script, info, cat_path, nb_path_root, global_params, dag, dependency=None ): """ - Creates a ploomber task for running a script, including necessary parameters. - - UPDATE THIS DOCSTRING + Creates a Ploomber task for running a script, including necessary parameters. Args: - script: key from dict of scripts - info: various specifications for the notebook, originally from config.yml - use_catalog: bool specified earlier, specifying if whole collection uses a catalog or not - nb_path_root: from config.yml, path to folder containing template notebooks - global_params: global parameters from config.yml - dag: ploomber DAG to add the task to - dependency: what the upstream task is + script (str): The key from the dictionary of scripts. + info (dict): Various specifications for the notebook, originally from config.yml. + cat_path (str or None): Path to the catalog file if using a catalog, otherwise None. + nb_path_root (str): Path to the folder containing template notebooks from config.yml. + global_params (dict): Global parameters from config.yml. + dag (ploomber.DAG): Ploomber DAG to add the task to. + dependency (ploomber.Task, optional): The upstream task. Defaults to None. Returns: - task: ploomber task object + ploomber.Task: The Ploomber task object. + + Raises: + NotImplementedError: Raised if dependency is not None (setting DAG dependency is not implemented yet). 
""" parameter_groups = info["parameter_groups"] From d71aeb70ababa93532831de561f46adfc05aa1a8 Mon Sep 17 00:00:00 2001 From: Teagan Date: Tue, 7 May 2024 09:52:40 -0600 Subject: [PATCH 06/19] black formatting --- cupid/util.py | 48 ++++++++++++++++++++++++++++++++++++------------ 1 file changed, 36 insertions(+), 12 deletions(-) diff --git a/cupid/util.py b/cupid/util.py index cb36faf..defefe1 100644 --- a/cupid/util.py +++ b/cupid/util.py @@ -28,6 +28,7 @@ class MdJinjaEngine(NBClientEngine): """Class for using the Jinja Engine to run notebooks""" + @classmethod def execute_managed_notebook(cls, nb_man, kernel_name, **kwargs): """Execute notebooks with papermill execution engine""" @@ -60,19 +61,29 @@ def get_control_dict(config_path): info["kernel_name"] = info.get("kernel_name", default_kernel_name) if info["kernel_name"] is None: info["kernel_name"] = "cupid-analysis" - warnings.warn(f"No conda environment specified for {n_b}.ipynb and no default kernel set, will use cupid-analysis environment.") + warnings.warn( + f"No conda environment specified for {n_b}.ipynb and no default kernel set, will use cupid-analysis environment." + ) if info["kernel_name"] not in control["env_check"]: - control["env_check"][info["kernel_name"]] = info["kernel_name"] in jupyter_client.kernelspec.find_kernel_specs() - + control["env_check"][info["kernel_name"]] = ( + info["kernel_name"] + in jupyter_client.kernelspec.find_kernel_specs() + ) + if "compute_scripts" in control: for script_category in control["compute_scripts"].values(): for script, info in script_category.items(): info["kernel_name"] = info.get("kernel_name", default_kernel_name) if info["kernel_name"] is None: info["kernel_name"] = "cupid-analysis" - warnings.warn(f"No environment specified for {script}.py and no default kernel set, will use cupid-analysis environment.") + warnings.warn( + f"No environment specified for {script}.py and no default kernel set, will use cupid-analysis environment." + ) if info["kernel_name"] not in control["env_check"]: - control["env_check"][info["kernel_name"]] = info["kernel_name"] in jupyter_client.kernelspec.find_kernel_specs() + control["env_check"][info["kernel_name"]] = ( + info["kernel_name"] + in jupyter_client.kernelspec.find_kernel_specs() + ) return control @@ -88,7 +99,7 @@ def setup_book(config_path): os.makedirs(output_root, exist_ok=True) - output_dir = f"{output_root}/{control["data_sources"]["sname"]}" + output_dir = f'{output_root}/{control["data_sources"]["sname"]}' os.makedirs(output_dir, exist_ok=True) @@ -118,7 +129,10 @@ def setup_book(config_path): return None -def create_ploomber_nb_task(nb, info, cat_path, nb_path_root, output_dir, global_params, dag, dependency=None): + +def create_ploomber_nb_task( + n_b, info, cat_path, nb_path_root, output_dir, global_params, dag, dependency=None +): """ Creates a ploomber task for running a notebook, including necessary parameters. 
@@ -165,15 +179,25 @@ def create_ploomber_nb_task(nb, info, cat_path, nb_path_root, output_dir, global if cat_path is not None: parms_in["path_to_cat"] = cat_path - pm_params = {"engine_name": "md_jinja", - "jinja_data": parms, - "cwd": nb_path_root} + pm_params = { + "engine_name": "md_jinja", + "jinja_data": parms, + "cwd": nb_path_root, + } pm.engines.papermill_engines._engines["md_jinja"] = MdJinjaEngine - task = ploomber.tasks.NotebookRunner(Path(input_path), ploomber.products.File(output_path + ".ipynb"), dag, params=parms_in, papermill_params=pm_params, kernelspec_name=info['kernel_name'], name=output_name) + task = ploomber.tasks.NotebookRunner( + Path(input_path), + ploomber.products.File(output_path + ".ipynb"), + dag, + params=parms_in, + papermill_params=pm_params, + kernelspec_name=info["kernel_name"], + name=output_name, + ) - if dependency != None: + if dependency: raise NotImplementedError # set DAG dependency here # something with task.set_upstream(other_task?) From 35fb232aeb0db2e8823d6a4acddf3369b2d8e76e Mon Sep 17 00:00:00 2001 From: Teagan Date: Tue, 7 May 2024 10:12:32 -0600 Subject: [PATCH 07/19] pylint and docstring for run.py --- cupid/run.py | 212 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 147 insertions(+), 65 deletions(-) diff --git a/cupid/run.py b/cupid/run.py index 24e49bc..4ed26cb 100755 --- a/cupid/run.py +++ b/cupid/run.py @@ -1,33 +1,74 @@ #!/usr/bin/env python +""" +Main script for running all notebooks and scripts specified in the configuration file. + +This script sets up and runs all the specified notebooks and scripts according to the configurations +provided in the specified YAML configuration file. + +Usage: + python main_script.py [OPTIONS] CONFIG_PATH + +Arguments: + CONFIG_PATH (str): Path to the YAML configuration file containing specifications for notebooks + and scripts. + +Options: + --serial, -s Do not use LocalCluster objects. + --time-series, -ts Run time series generation scripts prior to diagnostics. + + # Options to run only specified components; running all is the default + --atmosphere, -atm Run atmosphere component diagnostics. + --ocean, -ocn Run ocean component diagnostics. + --land, -lnd Run land component diagnostics. + --seaice, -ice Run sea ice component diagnostics. + --landice, -glc Run land ice component diagnostics. 
+""" + import os +import warnings import click -import ploomber import intake +import ploomber +from dask.distributed import Client import cupid.util import cupid.timeseries -from dask.distributed import Client -import dask -import time -import ploomber -import warnings CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) + @click.command(context_settings=CONTEXT_SETTINGS) @click.option("--serial", "-s", is_flag=True, help="Do not use LocalCluster objects") -@click.option("--time-series", "-ts", is_flag=True, - help="Run time series generation scripts prior to diagnostics") +@click.option( + "--time-series", + "-ts", + is_flag=True, + help="Run time series generation scripts prior to diagnostics", +) # Options to turn components on or off -@click.option("--atmosphere", "-atm", is_flag=True, help="Run atmosphere component diagnostics") +@click.option( + "--atmosphere", "-atm", is_flag=True, help="Run atmosphere component diagnostics" +) @click.option("--ocean", "-ocn", is_flag=True, help="Run ocean component diagnostics") @click.option("--land", "-lnd", is_flag=True, help="Run land component diagnostics") -@click.option("--seaice", "-ice", is_flag=True, help="Run sea ice component diagnostics") -@click.option("--landice", "-glc", is_flag=True, help="Run land ice component diagnostics") +@click.option( + "--seaice", "-ice", is_flag=True, help="Run sea ice component diagnostics" +) +@click.option( + "--landice", "-glc", is_flag=True, help="Run land ice component diagnostics" +) @click.argument("config_path") - -def run(config_path, serial=False, time_series=False, - all=False, atmosphere=False, ocean=False, land=False, seaice=False, landice=False): +def run( + config_path, + serial=False, + time_series=False, + all=False, + atmosphere=False, + ocean=False, + land=False, + seaice=False, + landice=False, +): """ Main engine to set up running all the notebooks. 
""" @@ -36,19 +77,21 @@ def run(config_path, serial=False, time_series=False, control = cupid.util.get_control_dict(config_path) cupid.util.setup_book(config_path) - component_options = {"atm": atmosphere, - "ocn": ocean, - "lnd": land, - "ice": seaice, - "glc": landice} + component_options = { + "atm": atmosphere, + "ocn": ocean, + "lnd": land, + "ice": seaice, + "glc": landice, + } # Automatically run all if no components specified - + if True not in [atmosphere, ocean, land, seaice, landice]: all = True for key in component_options.keys(): component_options[key] = True - + ##################################################################### # Managing global parameters @@ -56,11 +99,11 @@ def run(config_path, serial=False, time_series=False, if "global_params" in control: global_params = control["global_params"] - - global_params['serial'] = serial - + + global_params["serial"] = serial + #################################################################### - + if time_series: timeseries_params = control["timeseries"] @@ -73,19 +116,35 @@ def run(config_path, serial=False, time_series=False, component, timeseries_params[component]["vars"], timeseries_params[component]["derive_vars"], - [timeseries_params["case_name"]], # could also grab from compute_notebooks section of config file + [ + timeseries_params["case_name"] + ], # could also grab from compute_notebooks section of config file timeseries_params[component]["hist_str"], - [global_params["CESM_output_dir"] + "/" + timeseries_params["case_name"] + f"/{component}/hist/"], # could also grab from compute_notebooks section of config file - [global_params["CESM_output_dir"]+'/'+timeseries_params['case_name']+f'/{component}/proc/tseries/'], + [ + global_params["CESM_output_dir"] + + "/" + + timeseries_params["case_name"] + + f"/{component}/hist/" + ], # could also grab from compute_notebooks section of config file + [ + global_params["CESM_output_dir"] + + "/" + + timeseries_params["case_name"] + + f"/{component}/proc/tseries/" + ], # Note that timeseries output will eventually go in /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/ timeseries_params["ts_done"], timeseries_params["overwrite_ts"], - timeseries_params[component]["start_years"], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.start_date - timeseries_params[component]["end_years"], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.end_date + timeseries_params[component][ + "start_years" + ], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.start_date + timeseries_params[component][ + "end_years" + ], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.end_date timeseries_params[component]["level"], num_procs, serial, - ) + ) # Grab paths @@ -130,38 +189,50 @@ def run(config_path, serial=False, time_series=False, ##################################################################### # Organizing notebooks to run - - if 'compute_notebooks' in control: - + + if "compute_notebooks" in control: + all_nbs = dict() - - for nb, info in 
control['compute_notebooks']['infrastructure'].items(): - all_nbs[nb] = info - all_nbs[nb]['nb_path_root'] = nb_path_root + '/infrastructure' - all_nbs[nb]['output_dir'] = output_dir + '/infrastructure' - + + for n_b, info in control["compute_notebooks"]["infrastructure"].items(): + all_nbs[n_b] = info + all_nbs[n_b]["nb_path_root"] = nb_path_root + "/infrastructure" + all_nbs[n_b]["output_dir"] = output_dir + "/infrastructure" + for comp_name, comp_bool in component_options.items(): - if comp_name in control['compute_notebooks'] and comp_bool: - for nb, info in control['compute_notebooks'][comp_name].items(): - all_nbs[nb] = info - all_nbs[nb]['nb_path_root'] = nb_path_root + '/' + comp_name - all_nbs[nb]['output_dir'] = output_dir + '/' + comp_name + if comp_name in control["compute_notebooks"] and comp_bool: + for n_b, info in control["compute_notebooks"][comp_name].items(): + all_nbs[n_b] = info + all_nbs[n_b]["nb_path_root"] = nb_path_root + "/" + comp_name + all_nbs[n_b]["output_dir"] = output_dir + "/" + comp_name elif comp_bool and not all: - warnings.warn(f"No notebooks for {comp_name} component specified in config file.") - + warnings.warn( + f"No notebooks for {comp_name} component specified in config file." + ) + # Checking for existence of environments - - for nb, info in all_nbs.copy().items(): + + for n_b, info in all_nbs.copy().items(): if not control["env_check"][info["kernel_name"]]: bad_env = info["kernel_name"] - warnings.warn(f"Environment {bad_env} specified for {nb}.ipynb could not be found; {nb}.ipynb will not be run. See README.md for environment installation instructions.") - all_nbs.pop(nb) - + warnings.warn( + f"Environment {bad_env} specified for {n_b}.ipynb could not be found; {n_b}.ipynb will not be run. See README.md for environment installation instructions." + ) + all_nbs.pop(n_b) + # Setting up notebook tasks - - for nb, info in all_nbs.items(): - cupid.util.create_ploomber_nb_task(nb, info, cat_path, info["nb_path_root"], - info["output_dir"], global_params, dag, dependency=info.get("dependency")) + + for n_b, info in all_nbs.items(): + cupid.util.create_ploomber_nb_task( + n_b, + info, + cat_path, + info["nb_path_root"], + info["output_dir"], + global_params, + dag, + dependency=info.get("dependency"), + ) ##################################################################### # Organizing scripts @@ -171,26 +242,37 @@ def run(config_path, serial=False, time_series=False, all_scripts = dict() for comp_name, comp_bool in component_options.items(): - if comp_name in control['compute_scripts'] and comp_bool: - for script, info in control['compute_scripts'][comp_name].items(): + if comp_name in control["compute_scripts"] and comp_bool: + for script, info in control["compute_scripts"][comp_name].items(): all_scripts[script] = info - all_scripts[script]['nb_path_root'] = nb_path_root + '/' + comp_name + all_scripts[script]["nb_path_root"] = nb_path_root + "/" + comp_name elif comp_bool and not all: - warnings.warn(f"No scripts for {comp_name} component specified in config file.") + warnings.warn( + f"No scripts for {comp_name} component specified in config file." 
+ ) # Checking for existence of environments - + for script, info in all_scripts.copy().items(): if not control["env_check"][info["kernel_name"]]: bad_env = info["kernel_name"] - warnings.warn(f"Environment {bad_env} specified for {script}.py could not be found; {script}.py will not be run.") + warnings.warn( + f"Environment {bad_env} specified for {script}.py could not be found; {script}.py will not be run." + ) all_scripts.pop(script) - + # Setting up script tasks for script, info in all_scripts.items(): - cupid.util.create_ploomber_script_task(script, info, cat_path, info['nb_path_root'], - global_params, dag, dependency=info.get("dependency")) + cupid.util.create_ploomber_script_task( + script, + info, + cat_path, + info["nb_path_root"], + global_params, + dag, + dependency=info.get("dependency"), + ) # Run the full DAG From 7d6ab937aa7d11514741177ac7e03ef1d35084b4 Mon Sep 17 00:00:00 2001 From: Teagan Date: Tue, 7 May 2024 10:22:06 -0600 Subject: [PATCH 08/19] a few more pylint changes --- cupid/read.py | 4 ++-- cupid/run.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/cupid/read.py b/cupid/read.py index 95ac850..20ca27f 100644 --- a/cupid/read.py +++ b/cupid/read.py @@ -27,10 +27,10 @@ def get_collection(path_to_catalog, **kwargs): if "variable" in kwargs.keys(): - def preprocess(ds): + def preprocess(d_s): ## the double brackets return a Dataset rather than a DataArray ## this is fragile and could cause issues, not sure what subsetting on time_bound does - return ds[[kwargs["variable"], "time_bound"]] + return d_s[[kwargs["variable"], "time_bound"]] ## not sure what the chunking kwarg is doing here either dsets = cat_subset.to_dataset_dict( diff --git a/cupid/run.py b/cupid/run.py index 4ed26cb..7e9116a 100755 --- a/cupid/run.py +++ b/cupid/run.py @@ -30,7 +30,6 @@ import click import intake import ploomber -from dask.distributed import Client import cupid.util import cupid.timeseries @@ -132,15 +131,16 @@ def run( + timeseries_params["case_name"] + f"/{component}/proc/tseries/" ], - # Note that timeseries output will eventually go in /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/ + # Note that timeseries output will eventually go in + # /glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/ timeseries_params["ts_done"], timeseries_params["overwrite_ts"], timeseries_params[component][ "start_years" - ], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.start_date + ], timeseries_params[component][ "end_years" - ], # could get from yaml file in adf_quick_run.parameter_groups.none.config_fil_str, or for other notebooks config files, eg ocean_surface.parameter_gropus.none.mom6_tools_config.end_date + ], timeseries_params[component]["level"], num_procs, serial, From 9ddb379af1e21cb13560292c3fb9625c40252bc9 Mon Sep 17 00:00:00 2001 From: Teagan Date: Tue, 7 May 2024 12:40:17 -0600 Subject: [PATCH 09/19] updates from review --- cupid/read.py | 6 +++--- cupid/run.py | 51 ++++++++++++++++++++++++--------------------------- cupid/util.py | 13 +++++++------ 3 files changed, 34 insertions(+), 36 deletions(-) diff --git a/cupid/read.py b/cupid/read.py index 20ca27f..04973cd 100644 --- a/cupid/read.py +++ b/cupid/read.py @@ -26,11 +26,11 @@ def get_collection(path_to_catalog, **kwargs): cat_subset = cat.search(**kwargs) if "variable" in kwargs.keys(): - - def preprocess(d_s): + # pylint: 
disable=invalid-name + def preprocess(ds): ## the double brackets return a Dataset rather than a DataArray ## this is fragile and could cause issues, not sure what subsetting on time_bound does - return d_s[[kwargs["variable"], "time_bound"]] + return ds[[kwargs["variable"], "time_bound"]] ## not sure what the chunking kwarg is doing here either dsets = cat_subset.to_dataset_dict( diff --git a/cupid/run.py b/cupid/run.py index 7e9116a..47ebf4f 100755 --- a/cupid/run.py +++ b/cupid/run.py @@ -6,23 +6,19 @@ This script sets up and runs all the specified notebooks and scripts according to the configurations provided in the specified YAML configuration file. -Usage: - python main_script.py [OPTIONS] CONFIG_PATH +Usage: cupid-run [OPTIONS] CONFIG_PATH -Arguments: - CONFIG_PATH (str): Path to the YAML configuration file containing specifications for notebooks - and scripts. + Main engine to set up running all the notebooks. Options: - --serial, -s Do not use LocalCluster objects. - --time-series, -ts Run time series generation scripts prior to diagnostics. - - # Options to run only specified components; running all is the default - --atmosphere, -atm Run atmosphere component diagnostics. - --ocean, -ocn Run ocean component diagnostics. - --land, -lnd Run land component diagnostics. - --seaice, -ice Run sea ice component diagnostics. - --landice, -glc Run land ice component diagnostics. + -s, --serial Do not use LocalCluster objects + -ts, --time-series Run time series generation scripts prior to diagnostics + -atm, --atmosphere Run atmosphere component diagnostics + -ocn, --ocean Run ocean component diagnostics + -lnd, --land Run land component diagnostics + -ice, --seaice Run sea ice component diagnostics + -glc, --landice Run land ice component diagnostics + -h, --help Show this message and exit. """ import os @@ -194,17 +190,18 @@ def run( all_nbs = dict() - for n_b, info in control["compute_notebooks"]["infrastructure"].items(): - all_nbs[n_b] = info - all_nbs[n_b]["nb_path_root"] = nb_path_root + "/infrastructure" - all_nbs[n_b]["output_dir"] = output_dir + "/infrastructure" + # pylint: disable=invalid-name + for nb, info in control["compute_notebooks"]["infrastructure"].items(): + all_nbs[nb] = info + all_nbs[nb]["nb_path_root"] = nb_path_root + "/infrastructure" + all_nbs[nb]["output_dir"] = output_dir + "/infrastructure" for comp_name, comp_bool in component_options.items(): if comp_name in control["compute_notebooks"] and comp_bool: - for n_b, info in control["compute_notebooks"][comp_name].items(): - all_nbs[n_b] = info - all_nbs[n_b]["nb_path_root"] = nb_path_root + "/" + comp_name - all_nbs[n_b]["output_dir"] = output_dir + "/" + comp_name + for nb, info in control["compute_notebooks"][comp_name].items(): + all_nbs[nb] = info + all_nbs[nb]["nb_path_root"] = nb_path_root + "/" + comp_name + all_nbs[nb]["output_dir"] = output_dir + "/" + comp_name elif comp_bool and not all: warnings.warn( f"No notebooks for {comp_name} component specified in config file." @@ -212,19 +209,19 @@ def run( # Checking for existence of environments - for n_b, info in all_nbs.copy().items(): + for nb, info in all_nbs.copy().items(): if not control["env_check"][info["kernel_name"]]: bad_env = info["kernel_name"] warnings.warn( - f"Environment {bad_env} specified for {n_b}.ipynb could not be found; {n_b}.ipynb will not be run. See README.md for environment installation instructions." + f"Environment {bad_env} specified for {nb}.ipynb could not be found; {nb}.ipynb will not be run. 
See README.md for environment installation instructions." ) - all_nbs.pop(n_b) + all_nbs.pop(nb) # Setting up notebook tasks - for n_b, info in all_nbs.items(): + for nb, info in all_nbs.items(): cupid.util.create_ploomber_nb_task( - n_b, + nb, info, cat_path, info["nb_path_root"], diff --git a/cupid/util.py b/cupid/util.py index defefe1..838bd55 100644 --- a/cupid/util.py +++ b/cupid/util.py @@ -57,12 +57,13 @@ def get_control_dict(config_path): if "compute_notebooks" in control: for nb_category in control["compute_notebooks"].values(): - for n_b, info in nb_category.items(): + # pylint: disable=invalid-name + for nb, info in nb_category.items(): info["kernel_name"] = info.get("kernel_name", default_kernel_name) if info["kernel_name"] is None: info["kernel_name"] = "cupid-analysis" warnings.warn( - f"No conda environment specified for {n_b}.ipynb and no default kernel set, will use cupid-analysis environment." + f"No conda environment specified for {nb}.ipynb and no default kernel set, will use cupid-analysis environment." ) if info["kernel_name"] not in control["env_check"]: control["env_check"][info["kernel_name"]] = ( @@ -131,13 +132,13 @@ def setup_book(config_path): def create_ploomber_nb_task( - n_b, info, cat_path, nb_path_root, output_dir, global_params, dag, dependency=None + nb, info, cat_path, nb_path_root, output_dir, global_params, dag, dependency=None ): """ Creates a ploomber task for running a notebook, including necessary parameters. Args: - n_b: key from dict of notebooks + nb: key from dict of notebooks info: various specifications for the notebook, originally from config.yml use_catalog: bool specified earlier, specifying if whole collection uses a catalog or not nb_path_root: from config.yml, path to folder containing template notebooks @@ -164,8 +165,8 @@ def create_ploomber_nb_task( for key, parms in parameter_groups.items(): - input_path = f"{nb_path_root}/{n_b}.ipynb" - output_name = f"{n_b}-{key}" if key != "none" else f"{n_b}" + input_path = f"{nb_path_root}/{nb}.ipynb" + output_name = f"{nb}-{key}" if key != "none" else f"{nb}" output_path = f"{output_dir}/{output_name}" From c13db7d15541638c14707cfe9cb7812d14b83ca4 Mon Sep 17 00:00:00 2001 From: Teagan Date: Tue, 7 May 2024 12:52:16 -0600 Subject: [PATCH 10/19] turn off black formatting for timeseries bit --- cupid/run.py | 28 +++++++--------------------- 1 file changed, 7 insertions(+), 21 deletions(-) diff --git a/cupid/run.py b/cupid/run.py index 47ebf4f..7f01dcb 100755 --- a/cupid/run.py +++ b/cupid/run.py @@ -107,40 +107,26 @@ def run( for component, comp_bool in component_options.items(): if comp_bool: + # fmt: off cupid.timeseries.create_time_series( component, timeseries_params[component]["vars"], timeseries_params[component]["derive_vars"], - [ - timeseries_params["case_name"] - ], # could also grab from compute_notebooks section of config file + [timeseries_params["case_name"]], timeseries_params[component]["hist_str"], - [ - global_params["CESM_output_dir"] - + "/" - + timeseries_params["case_name"] - + f"/{component}/hist/" - ], # could also grab from compute_notebooks section of config file - [ - global_params["CESM_output_dir"] - + "/" - + timeseries_params["case_name"] - + f"/{component}/proc/tseries/" - ], + [global_params["CESM_output_dir"]+"/"+timeseries_params["case_name"]+f"/{component}/hist/"], + [global_params["CESM_output_dir"]+"/"+timeseries_params["case_name"]+f"/{component}/proc/tseries/"], # Note that timeseries output will eventually go in # 
/glade/derecho/scratch/${USER}/archive/${CASE}/${component}/proc/tseries/ timeseries_params["ts_done"], timeseries_params["overwrite_ts"], - timeseries_params[component][ - "start_years" - ], - timeseries_params[component][ - "end_years" - ], + timeseries_params[component]["start_years"], + timeseries_params[component]["end_years"], timeseries_params[component]["level"], num_procs, serial, ) + # fmt: on # Grab paths From 8c1e50b223056e5beec336366e81912442376f94 Mon Sep 17 00:00:00 2001 From: Teagan Date: Tue, 7 May 2024 13:09:14 -0600 Subject: [PATCH 11/19] a few more pylint/black updates --- cupid/build.py | 2 +- cupid/clear.py | 4 ++-- cupid/read.py | 3 ++- cupid/run.py | 34 +++++++++++++++------------------- cupid/timeseries.py | 19 ++++++++++--------- cupid/util.py | 2 -- 6 files changed, 30 insertions(+), 34 deletions(-) diff --git a/cupid/build.py b/cupid/build.py index 3f37862..77abab7 100755 --- a/cupid/build.py +++ b/cupid/build.py @@ -37,7 +37,7 @@ def build(): ["jupyter-book", "build", f"{run_dir}/computed_notebooks/{sname}", "--all"] ) - ### Originally used this code to copy jupyter book HTML to a location to host it online + # Originally used this code to copy jupyter book HTML to a location to host it online # if 'publish_location' in control: diff --git a/cupid/clear.py b/cupid/clear.py index 74a827b..dc70a96 100755 --- a/cupid/clear.py +++ b/cupid/clear.py @@ -28,8 +28,8 @@ def read_config_file(config_path): full_path = os.path.join(run_dir, "computed_notebooks") return full_path - else: # run_dir is empty/wasn't found in config file so return error - raise ValueError("'run_dir' was empty/not found in the config file.") + # else run_dir is empty/wasn't found in config file so return error + raise ValueError("'run_dir' was empty/not found in the config file.") @click.command() diff --git a/cupid/read.py b/cupid/read.py index 04973cd..03ec029 100644 --- a/cupid/read.py +++ b/cupid/read.py @@ -3,7 +3,8 @@ Functions: - read_yaml(path_to_yaml): Read a YAML file and return its content as a dictionary. - - get_collection(path_to_catalog, **kwargs): Get a collection of datasets from an intake catalog based on specified criteria. + - get_collection(path_to_catalog, **kwargs): Get a collection of datasets from an + intake catalog based on specified criteria. 
""" import intake diff --git a/cupid/run.py b/cupid/run.py index 7f01dcb..53aa2d0 100755 --- a/cupid/run.py +++ b/cupid/run.py @@ -31,27 +31,17 @@ CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"]) - +# fmt: off +# pylint: disable=line-too-long @click.command(context_settings=CONTEXT_SETTINGS) @click.option("--serial", "-s", is_flag=True, help="Do not use LocalCluster objects") -@click.option( - "--time-series", - "-ts", - is_flag=True, - help="Run time series generation scripts prior to diagnostics", -) +@click.option("--time-series", "-ts", is_flag=True, help="Run time series generation scripts prior to diagnostics") # Options to turn components on or off -@click.option( - "--atmosphere", "-atm", is_flag=True, help="Run atmosphere component diagnostics" -) +@click.option("--atmosphere", "-atm", is_flag=True, help="Run atmosphere component diagnostics") @click.option("--ocean", "-ocn", is_flag=True, help="Run ocean component diagnostics") @click.option("--land", "-lnd", is_flag=True, help="Run land component diagnostics") -@click.option( - "--seaice", "-ice", is_flag=True, help="Run sea ice component diagnostics" -) -@click.option( - "--landice", "-glc", is_flag=True, help="Run land ice component diagnostics" -) +@click.option("--seaice", "-ice", is_flag=True, help="Run sea ice component diagnostics") +@click.option("--landice", "-glc", is_flag=True, help="Run land ice component diagnostics") @click.argument("config_path") def run( config_path, @@ -67,7 +57,8 @@ def run( """ Main engine to set up running all the notebooks. """ - + # fmt: on + # pylint: enable=line-too-long # Get control structure control = cupid.util.get_control_dict(config_path) cupid.util.setup_book(config_path) @@ -108,6 +99,7 @@ def run( for component, comp_bool in component_options.items(): if comp_bool: # fmt: off + # pylint: disable=line-too-long cupid.timeseries.create_time_series( component, timeseries_params[component]["vars"], @@ -127,6 +119,7 @@ def run( serial, ) # fmt: on + # pylint: enable=line-too-long # Grab paths @@ -199,7 +192,9 @@ def run( if not control["env_check"][info["kernel_name"]]: bad_env = info["kernel_name"] warnings.warn( - f"Environment {bad_env} specified for {nb}.ipynb could not be found; {nb}.ipynb will not be run. See README.md for environment installation instructions." + f"Environment {bad_env} specified for {nb}.ipynb could not be found;"+ + f" {nb}.ipynb will not be run."+ + f"See README.md for environment installation instructions." ) all_nbs.pop(nb) @@ -240,7 +235,8 @@ def run( if not control["env_check"][info["kernel_name"]]: bad_env = info["kernel_name"] warnings.warn( - f"Environment {bad_env} specified for {script}.py could not be found; {script}.py will not be run." + f"Environment {bad_env} specified for {script}.py could not be found;"+ + f"{script}.py will not be run." ) all_scripts.pop(script) diff --git a/cupid/timeseries.py b/cupid/timeseries.py index d97aefb..6315c16 100644 --- a/cupid/timeseries.py +++ b/cupid/timeseries.py @@ -86,7 +86,7 @@ def create_time_series( # Check if particular case should be processed: if ts_done[case_idx]: emsg = ( - " Configuration file indicates time series files have been pre-computed" + "Configuration file indicates time series files have been pre-computed" ) emsg += f" for case '{case_name}'. Will rely on those files directly." print(emsg) @@ -169,7 +169,7 @@ def create_time_series( # Print a warning, and assume that no vertical # level information is needed. wmsg = "WARNING! 
Unable to determine the vertical coordinate" - wmsg += f" type from the {height_dim} long name, which is:\n'{lev_long_name}'." + wmsg += f" type from the {height_dim} long name, \n'{lev_long_name}'." wmsg += ( "\nNo additional vertical coordinate information will be" ) @@ -239,11 +239,10 @@ def create_time_series( diag_var_list.append(constit) vars_to_derive.append(var) continue - else: - msg = f"WARNING: {var} is not in the file {hist_files[0]}." - msg += " No time series will be generated." - print(msg) - continue + msg = f"WARNING: {var} is not in the file {hist_files[0]}." + msg += " No time series will be generated." + print(msg) + continue # Check if variable has a height_dim (eg, 'lev') dimension according to first file: has_lev = bool(height_dim in hist_file_ds[var].dims) @@ -374,7 +373,8 @@ def derive_cam_variables(vars_to_derive=None, ts_dir=None, overwrite=None): Path(prect_file).unlink() else: print( - f"[{__name__}] Warning: PRECT file was found and overwrite is False. Will use existing file." + f"[{__name__}] Warning: PRECT file was found and overwrite is False" + + "Will use existing file." ) continue # append PRECC to the file containing PRECL @@ -407,7 +407,8 @@ def derive_cam_variables(vars_to_derive=None, ts_dir=None, overwrite=None): Path(derived_file).unlink() else: print( - f"[{__name__}] Warning: RESTOM file was found and overwrite is False. Will use existing file." + f"[{__name__}] Warning: RESTOM file was found and overwrite is False." + + "Will use existing file." ) continue # append FSNT to the file containing FLNT diff --git a/cupid/util.py b/cupid/util.py index 838bd55..775ee14 100644 --- a/cupid/util.py +++ b/cupid/util.py @@ -164,7 +164,6 @@ def create_ploomber_nb_task( default_params = info["default_params"] for key, parms in parameter_groups.items(): - input_path = f"{nb_path_root}/{nb}.ipynb" output_name = f"{nb}-{key}" if key != "none" else f"{nb}" @@ -241,7 +240,6 @@ def create_ploomber_script_task( default_params = info["default_params"] for key, parms in parameter_groups.items(): - input_path = f"{nb_path_root}/{script}.py" output_name = f"{script}-{key}" if key != "none" else f"{script}" From 0210d1a0495c08a9015bd7c7b114cb2f28044ccc Mon Sep 17 00:00:00 2001 From: Teagan Date: Tue, 7 May 2024 13:15:48 -0600 Subject: [PATCH 12/19] default config file and change config_path to option not argument --- cupid/build.py | 5 ++++- cupid/run.py | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/cupid/build.py b/cupid/build.py index 77abab7..b515b05 100755 --- a/cupid/build.py +++ b/cupid/build.py @@ -24,7 +24,10 @@ def build(): None """ - config_path = str(sys.argv[1]) + if sys.argv[1]: + config_path = str(sys.argv[1]) + else: + config_path = "config.yml" with open(config_path, "r") as fid: control = yaml.safe_load(fid) diff --git a/cupid/run.py b/cupid/run.py index 53aa2d0..7a9e816 100755 --- a/cupid/run.py +++ b/cupid/run.py @@ -6,7 +6,7 @@ This script sets up and runs all the specified notebooks and scripts according to the configurations provided in the specified YAML configuration file. -Usage: cupid-run [OPTIONS] CONFIG_PATH +Usage: cupid-run [OPTIONS] Main engine to set up running all the notebooks. @@ -18,6 +18,7 @@ -lnd, --land Run land component diagnostics -ice, --seaice Run sea ice component diagnostics -glc, --landice Run land ice component diagnostics + -config_path Path to the YAML configuration file containing specifications for notebooks (default: config.yml) -h, --help Show this message and exit. 
""" @@ -42,7 +43,7 @@ @click.option("--land", "-lnd", is_flag=True, help="Run land component diagnostics") @click.option("--seaice", "-ice", is_flag=True, help="Run sea ice component diagnostics") @click.option("--landice", "-glc", is_flag=True, help="Run land ice component diagnostics") -@click.argument("config_path") +@click.option("--config_path", default="config.yml", help="Path to the YAML configuration file containing specifications for notebooks", show_default=True) def run( config_path, serial=False, From b80182d14c32fd9f5860c57957d149b5ebeff7db Mon Sep 17 00:00:00 2001 From: Teagan King Date: Wed, 8 May 2024 11:59:03 -0600 Subject: [PATCH 13/19] remove need for specifying config file --- README.md | 11 ++++++----- cupid/build.py | 16 ++++++---------- cupid/clear.py | 2 +- docs/addingnotebookstocollection.md | 2 +- 4 files changed, 14 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 963a340..945d681 100644 --- a/README.md +++ b/README.md @@ -54,8 +54,8 @@ To test the package out, try to run `examples/coupled-model`: $ conda activate cupid-dev $ cd examples/coupled_model $ # machine-dependent: request multiple compute cores -$ cupid-run config.yml -$ cupid-build config.yml # Will build HTML from Jupyter Book +$ cupid-run +$ cupid-build # Will build HTML from Jupyter Book ``` After the last step is finished, you can use Jupyter to view generated notebooks in `${CUPID_ROOT}/examples/coupled-model/computed_notebooks/quick-run` @@ -64,7 +64,7 @@ or you can view `${CUPID_ROOT}/examples/coupled-model/computed_notebooks/quick-r Furthermore, to clear the `computed_notebooks` folder which was generated by the `cupid-run` and `cupid-build` commands, you can run the following command: ``` bash -$ cupid-clear config.yml +$ cupid-clear ``` This will clear the `computed_notebooks` folder which is at the location pointed to by the `run_dir` variable in the `config.yml` file. @@ -87,6 +87,7 @@ Options: -lnd, --land Run land component diagnostics -ice, --seaice Run sea ice component diagnostics -glc, --landice Run land ice component diagnostics + --config_path Path to the YAML configuration file containing specifications for notebooks (default config.yml) -h, --help Show this message and exit. ``` @@ -107,8 +108,8 @@ client #### Specifying components -If no component flags are provided, all component diagnostics listed in `config.yml` will be executed by default. Multiple flags can be used together to select a group of components, for example: `cupid-run -ocn -ice config.yml`. +If no component flags are provided, all component diagnostics listed in `config.yml` will be executed by default. Multiple flags can be used together to select a group of components, for example: `cupid-run -ocn -ice`. ### Timeseries File Generation -CUPiD also has the capability to generate single variable timeseries files from history files for all components. To run timeseries, edit the `config.yml` file's timeseries section to fit your preferences, and then run `cupid-run config.yml -ts`. +CUPiD also has the capability to generate single variable timeseries files from history files for all components. To run timeseries, edit the `config.yml` file's timeseries section to fit your preferences, and then run `cupid-run -ts`. diff --git a/cupid/build.py b/cupid/build.py index b515b05..3fa975d 100755 --- a/cupid/build.py +++ b/cupid/build.py @@ -3,32 +3,28 @@ This script provides functionality to build a Jupyter book based on the configuration specified in a YAML file. 
-The main function `build()` reads the configuration file provided as a command-line -argument, extracts the necessary information such as the name of the book and the +The main function `build()` reads the configuration file (default config.yml), +extracts the necessary information such as the name of the book and the directory containing computed notebooks, and then proceeds to clean and build the Jupyter book using the `jupyter-book` command-line tool. """ +import click import subprocess import sys import yaml - -def build(): +@click.option("--config_path", default="config.yml", help="Path to the YAML configuration file containing specifications for notebooks", show_default=True) +def build(config_path): """ Build a Jupyter book based on the TOC in config.yml. Called by `cupid-build`. Args: - none + config_path: str, path to yml file (default config.yml) Returns: None """ - if sys.argv[1]: - config_path = str(sys.argv[1]) - else: - config_path = "config.yml" - with open(config_path, "r") as fid: control = yaml.safe_load(fid) diff --git a/cupid/clear.py b/cupid/clear.py index dc70a96..0692893 100755 --- a/cupid/clear.py +++ b/cupid/clear.py @@ -33,7 +33,7 @@ def read_config_file(config_path): @click.command() -@click.argument("config_path") +@click.option("--config_path", default="config.yml", help="Path to the YAML configuration file containing specifications for notebooks", show_default=True) # Entry point to this script def clear(config_path): """Clears the contents of the 'computed_notebooks' folder at the location diff --git a/docs/addingnotebookstocollection.md b/docs/addingnotebookstocollection.md index 0034efa..bf54ddd 100644 --- a/docs/addingnotebookstocollection.md +++ b/docs/addingnotebookstocollection.md @@ -40,4 +40,4 @@ Generally, a good fit for a diagnostic notebook is one that reads in CESM output 7. Update your parameters. Parameters that are specific to just this notebook should go under `parameter_groups` in the notebook's entry under `compute_notebooks`. Global parameters that you want passed in to every notebook in the collection should go under `global_params`. When `CUPiD` executes your notebook, all of these parameters will get put in a new cell below the cell tagged `parameters` that you added in step 3. This means they will supercede the values of the parameters that you put in the cell above---the names, notation, etc. should match to make sure your notebook is able to find the variables it needs. -8. All set! Your collection can now be run and built with `cupid-run config.yml` and `cupid-build config.yml` as usual. +8. All set! Your collection can now be run and built with `cupid-run` and `cupid-build` as usual. From 5492294cf051220f58279ca1db583509e22bb2ed Mon Sep 17 00:00:00 2001 From: Teagan King Date: Wed, 8 May 2024 12:15:32 -0600 Subject: [PATCH 14/19] remove markdown in class name --- cupid/util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cupid/util.py b/cupid/util.py index 775ee14..747b322 100644 --- a/cupid/util.py +++ b/cupid/util.py @@ -11,7 +11,7 @@ Classes: - ManageCondaKernel: Class for managing conda kernels. - - MdJinjaEngine: Class for using the Jinja Engine to run notebooks. + - JinjaEngine: Class for using the Jinja Engine to run notebooks. 
""" import os @@ -26,7 +26,7 @@ import yaml -class MdJinjaEngine(NBClientEngine): +class JinjaEngine(NBClientEngine): """Class for using the Jinja Engine to run notebooks""" @classmethod @@ -185,7 +185,7 @@ def create_ploomber_nb_task( "cwd": nb_path_root, } - pm.engines.papermill_engines._engines["md_jinja"] = MdJinjaEngine + pm.engines.papermill_engines._engines["md_jinja"] = JinjaEngine task = ploomber.tasks.NotebookRunner( Path(input_path), From 4664fed37a0530002a60a7ea292c8bc09f6b1273 Mon Sep 17 00:00:00 2001 From: Teagan King Date: Wed, 8 May 2024 12:16:26 -0600 Subject: [PATCH 15/19] black formatting --- cupid/clear.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cupid/clear.py b/cupid/clear.py index 0692893..630c4f4 100755 --- a/cupid/clear.py +++ b/cupid/clear.py @@ -33,7 +33,12 @@ def read_config_file(config_path): @click.command() -@click.option("--config_path", default="config.yml", help="Path to the YAML configuration file containing specifications for notebooks", show_default=True) +@click.option( + "--config_path", + default="config.yml", + help="Path to the YAML configuration file containing specifications for notebooks", + show_default=True, +) # Entry point to this script def clear(config_path): """Clears the contents of the 'computed_notebooks' folder at the location From 03acf19f8bc1f0c18f007a714d102abd65fb4f3f Mon Sep 17 00:00:00 2001 From: Teagan King Date: Wed, 8 May 2024 12:37:49 -0600 Subject: [PATCH 16/19] updated build.py to work without specifying config.yml --- cupid/build.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cupid/build.py b/cupid/build.py index 3fa975d..8b0edd6 100755 --- a/cupid/build.py +++ b/cupid/build.py @@ -14,7 +14,9 @@ import sys import yaml -@click.option("--config_path", default="config.yml", help="Path to the YAML configuration file containing specifications for notebooks", show_default=True) + +@click.command() +@click.argument("config_path", default="config.yml") def build(config_path): """ Build a Jupyter book based on the TOC in config.yml. Called by `cupid-build`. From 8a5916c8e1f793eb4f1fbff84ee657cdbaf813c3 Mon Sep 17 00:00:00 2001 From: Teagan King Date: Wed, 8 May 2024 12:42:52 -0600 Subject: [PATCH 17/19] clarify jinja engine type --- cupid/util.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cupid/util.py b/cupid/util.py index 747b322..c8d8991 100644 --- a/cupid/util.py +++ b/cupid/util.py @@ -11,7 +11,7 @@ Classes: - ManageCondaKernel: Class for managing conda kernels. - - JinjaEngine: Class for using the Jinja Engine to run notebooks. + - MarkdownJinjaEngine: Class for using the Jinja Engine to run notebooks. 
""" import os @@ -26,7 +26,7 @@ import yaml -class JinjaEngine(NBClientEngine): +class MarkdownJinjaEngine(NBClientEngine): """Class for using the Jinja Engine to run notebooks""" @classmethod @@ -185,7 +185,7 @@ def create_ploomber_nb_task( "cwd": nb_path_root, } - pm.engines.papermill_engines._engines["md_jinja"] = JinjaEngine + pm.engines.papermill_engines._engines["md_jinja"] = MarkdownJinjaEngine task = ploomber.tasks.NotebookRunner( Path(input_path), From ecd517ffed81ccd10d78666d62d24354b7176588 Mon Sep 17 00:00:00 2001 From: Teagan King Date: Wed, 8 May 2024 16:05:32 -0600 Subject: [PATCH 18/19] update to use arguments instead of options --- cupid/build.py | 9 ++++++++- cupid/clear.py | 14 ++++++++------ cupid/run.py | 9 ++++++++- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/cupid/build.py b/cupid/build.py index 8b0edd6..c909878 100755 --- a/cupid/build.py +++ b/cupid/build.py @@ -7,6 +7,12 @@ extracts the necessary information such as the name of the book and the directory containing computed notebooks, and then proceeds to clean and build the Jupyter book using the `jupyter-book` command-line tool. + +Args: + config_path: str, path to configuration file (default config.yml) + +Returns: + None """ import click @@ -22,7 +28,8 @@ def build(config_path): Build a Jupyter book based on the TOC in config.yml. Called by `cupid-build`. Args: - config_path: str, path to yml file (default config.yml) + config_path: str, path to configuration file (default config.yml) + Returns: None """ diff --git a/cupid/clear.py b/cupid/clear.py index 630c4f4..5660b34 100755 --- a/cupid/clear.py +++ b/cupid/clear.py @@ -6,6 +6,7 @@ The main function `clear()` takes the path to the config.yml file as input, reads the config file to obtain the 'run_dir' variable, and then deletes the contents of the 'computed_notebooks' folder at that location. + """ import os @@ -18,6 +19,12 @@ def read_config_file(config_path): """ Given the file path to config.yml, this function reads the config file content and returns the val of the run_dir string with '/computed_notebooks' appended to it + + Args: + config_path: str, path to configuration file (default config.yml) + + Returns: + None """ # Obtain the contents of the config.yml file and extract the run_dir variable control = cupid.util.get_control_dict(config_path) @@ -33,12 +40,7 @@ def read_config_file(config_path): @click.command() -@click.option( - "--config_path", - default="config.yml", - help="Path to the YAML configuration file containing specifications for notebooks", - show_default=True, -) +@click.argument("config_path", default="config.yml") # Entry point to this script def clear(config_path): """Clears the contents of the 'computed_notebooks' folder at the location diff --git a/cupid/run.py b/cupid/run.py index 7a9e816..d51f25e 100755 --- a/cupid/run.py +++ b/cupid/run.py @@ -43,7 +43,7 @@ @click.option("--land", "-lnd", is_flag=True, help="Run land component diagnostics") @click.option("--seaice", "-ice", is_flag=True, help="Run sea ice component diagnostics") @click.option("--landice", "-glc", is_flag=True, help="Run land ice component diagnostics") -@click.option("--config_path", default="config.yml", help="Path to the YAML configuration file containing specifications for notebooks", show_default=True) +@click.argument("config_path", default="config.yml") def run( config_path, serial=False, @@ -57,6 +57,13 @@ def run( ): """ Main engine to set up running all the notebooks. 
+ + Args: + config_path: str, path to configuration file (default config.yml) + + Returns: + None + """ # fmt: on # pylint: enable=line-too-long From 0eec99be28e2552e0ac907ee7d18ebc185d6a408 Mon Sep 17 00:00:00 2001 From: Teagan King Date: Wed, 8 May 2024 16:27:14 -0600 Subject: [PATCH 19/19] uppercase CONFIG_PATH and docstrings --- cupid/build.py | 6 +++--- cupid/clear.py | 14 +++++++------- cupid/run.py | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/cupid/build.py b/cupid/build.py index c909878..d5778b8 100755 --- a/cupid/build.py +++ b/cupid/build.py @@ -9,7 +9,7 @@ Jupyter book using the `jupyter-book` command-line tool. Args: - config_path: str, path to configuration file (default config.yml) + CONFIG_PATH: str, path to configuration file (default config.yml) Returns: None @@ -25,10 +25,10 @@ @click.argument("config_path", default="config.yml") def build(config_path): """ - Build a Jupyter book based on the TOC in config.yml. Called by `cupid-build`. + Build a Jupyter book based on the TOC in CONFIG_PATH. Called by `cupid-build`. Args: - config_path: str, path to configuration file (default config.yml) + CONFIG_PATH: str, path to configuration file (default config.yml) Returns: None diff --git a/cupid/clear.py b/cupid/clear.py index 5660b34..ea886fd 100755 --- a/cupid/clear.py +++ b/cupid/clear.py @@ -1,9 +1,9 @@ #!/usr/bin/env python """ This script provides functionality to clear the contents of the 'computed_notebooks' folder -at the location specified by the 'run_dir' variable in the 'config.yml' file. +at the location specified by the 'run_dir' variable in the CONFIG_PATH. -The main function `clear()` takes the path to the config.yml file as input, reads the config file +The main function `clear()` takes the path to the configuration file as input, reads the config file to obtain the 'run_dir' variable, and then deletes the contents of the 'computed_notebooks' folder at that location. @@ -17,16 +17,16 @@ def read_config_file(config_path): """ - Given the file path to config.yml, this function reads the config file content and + Given the file path to the configuration file, this function reads the config file content and returns the val of the run_dir string with '/computed_notebooks' appended to it Args: - config_path: str, path to configuration file (default config.yml) + CONFIG_PATH: str, path to configuration file (default config.yml) Returns: None """ - # Obtain the contents of the config.yml file and extract the run_dir variable + # Obtain the contents of the configuration file and extract the run_dir variable control = cupid.util.get_control_dict(config_path) run_dir = control["data_sources"].get("run_dir", None) @@ -44,9 +44,9 @@ def read_config_file(config_path): # Entry point to this script def clear(config_path): """Clears the contents of the 'computed_notebooks' folder at the location - specified by the 'run_dir' variable in the 'config.yml' file. + specified by the 'run_dir' variable in the CONFIG_PATH. - Args: config_path - The path to the config.yml file. + Args: CONFIG_PATH - The path to the configuration file. """ diff --git a/cupid/run.py b/cupid/run.py index d51f25e..536ccc5 100755 --- a/cupid/run.py +++ b/cupid/run.py @@ -59,7 +59,7 @@ def run( Main engine to set up running all the notebooks. Args: - config_path: str, path to configuration file (default config.yml) + CONFIG_PATH: str, path to configuration file (default config.yml) Returns: None
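
Taken together, PATCH 12 through PATCH 19 move the `config_path` input from a required positional argument, to a `--config_path` option, and finally back to a `click.argument` with `default="config.yml"`, so that `cupid-run`, `cupid-build`, and `cupid-clear` can be invoked with no arguments and fall back to a `config.yml` in the current directory. The sketch below is illustrative only: it is not part of any patch above, its body is a placeholder, and only the decorator layout is taken from the `cupid/run.py` changes.

```python
# Illustrative sketch of the entry-point pattern adopted by the end of this series:
# an optional CONFIG_PATH argument with a default, plus boolean flag options.
# Not CUPiD source code; the function body is a stand-in for the real engine.
import click

CONTEXT_SETTINGS = dict(help_option_names=["-h", "--help"])


@click.command(context_settings=CONTEXT_SETTINGS)
@click.option("--serial", "-s", is_flag=True, help="Do not use LocalCluster objects")
@click.option("--time-series", "-ts", is_flag=True,
              help="Run time series generation scripts prior to diagnostics")
@click.argument("config_path", default="config.yml")
def run(config_path, serial=False, time_series=False):
    """Main engine to set up running all the notebooks."""
    # Because the argument has a default, click treats it as optional:
    # both `cupid-run` and `cupid-run my_config.yml` are valid invocations.
    click.echo(f"config: {config_path}, serial: {serial}, time-series: {time_series}")


if __name__ == "__main__":
    run()
```

One detail worth noting when reading the intermediate patches: a bare `@click.option` or `@click.argument` decorator only attaches parameter metadata, and the function must also be wrapped in `@click.command()` before it parses command-line input, which is why the `cupid/build.py` change in PATCH 13 is completed by the `@click.command()` added in PATCH 16.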