From 3b5578bfa2b9b1b0865f99bce92901cd6d771573 Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 9 Aug 2024 09:03:20 -0400 Subject: [PATCH 01/31] matplotlib >=3.8 --- environment.yml | 2 +- mhkit/wave/contours.py | 8 +------- requirements.txt | 2 +- setup.py | 2 +- 4 files changed, 4 insertions(+), 10 deletions(-) diff --git a/environment.yml b/environment.yml index ac679176d..ee1b37bf0 100644 --- a/environment.yml +++ b/environment.yml @@ -8,7 +8,7 @@ dependencies: - pandas>=1.0.0 - numpy>=1.21.0, <2.0.0 - scipy<=1.13.1 - - matplotlib + - matplotlib>=3.8.0 - requests - lxml - scikit-learn diff --git a/mhkit/wave/contours.py b/mhkit/wave/contours.py index 905c560b8..b2a054c4c 100644 --- a/mhkit/wave/contours.py +++ b/mhkit/wave/contours.py @@ -8,11 +8,8 @@ import numpy as np import warnings from mhkit.utils import to_numeric_array - import matplotlib -mpl_version = tuple(map(int, matplotlib.__version__.split("."))) - # Contours def environmental_contours(x1, x2, sea_state_duration, return_period, method, **kwargs): @@ -1696,10 +1693,7 @@ def _bivariate_KDE(x1, x2, bw, fit, nb_steps, Ndata_bivariate_KDE, kwargs): x1_bivariate_KDE = [] x2_bivariate_KDE = [] - if mpl_version < (3, 8): # For versions before 3.8 - segments = vals.allsegs[0] - else: - segments = [path.vertices for path in vals.get_paths()] + segments = [path.vertices for path in vals.get_paths()] for seg in segments: x1_bivariate_KDE.append(seg[:, 1]) diff --git a/requirements.txt b/requirements.txt index 381f1068f..1f68d7614 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ pandas>=1.0.0 numpy>=1.21.0, <2.0.0 scipy<=1.13.1 -matplotlib +matplotlib>=3.8.0 requests pecos>=0.3.0 fatpack diff --git a/setup.py b/setup.py index c30ff2e9f..8d2825b3b 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ "pandas>=1.0.0", "numpy>=1.21.0, <2.0.0", "scipy<=1.13.1", - "matplotlib", + "matplotlib>=3.8.0", "requests", "pecos>=0.3.0", "fatpack", From 8b8d54bc381a728aa84dd5126568121e02317311 Mon Sep 17 00:00:00 2001 From: ssolson Date: Wed, 4 Sep 2024 10:08:21 -0400 Subject: [PATCH 02/31] lint utils --- .github/workflows/pylint.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml index 08458f95d..d5cca43e5 100644 --- a/.github/workflows/pylint.yml +++ b/.github/workflows/pylint.yml @@ -28,3 +28,7 @@ jobs: - name: Run Pylint on mhkit/power/ run: | pylint mhkit/power/ + + - name: Run Pylint on mhkit/utils/ + run: | + pylint mhkit/utils/ From e1196e1bc1f446c57a779743bc3065cac352790d Mon Sep 17 00:00:00 2001 From: ssolson Date: Wed, 4 Sep 2024 10:17:08 -0400 Subject: [PATCH 03/31] 10 lint coverage --- mhkit/utils/__init__.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mhkit/utils/__init__.py b/mhkit/utils/__init__.py index d20d4270b..b484862b0 100644 --- a/mhkit/utils/__init__.py +++ b/mhkit/utils/__init__.py @@ -1,3 +1,9 @@ +""" +This module initializes and imports the essential utility functions for data +conversion, statistical analysis, caching, and event detection for the +MHKiT library. 
+""" + from .time_utils import matlab_to_datetime, excel_to_datetime, index_to_datetime from .stat_utils import ( get_statistics, @@ -15,4 +21,5 @@ convert_nested_dict_and_pandas, ) +# pylint: disable=invalid-name _matlab = False # Private variable indicating if mhkit is run through matlab From f4ec0092f340707617e1ebeb55a8d07acc24e27d Mon Sep 17 00:00:00 2001 From: ssolson Date: Wed, 4 Sep 2024 11:48:29 -0400 Subject: [PATCH 04/31] reduce handle_caching to 5 inputs --- mhkit/river/io/usgs.py | 12 +- mhkit/tests/utils/test_cache.py | 26 ++- mhkit/tidal/io/noaa.py | 8 +- mhkit/utils/cache.py | 301 ++++++++++++++++++------- mhkit/wave/io/cdip.py | 28 ++- mhkit/wave/io/hindcast/hindcast.py | 24 +- mhkit/wave/io/hindcast/wind_toolkit.py | 13 +- mhkit/wave/io/ndbc.py | 31 ++- 8 files changed, 337 insertions(+), 106 deletions(-) diff --git a/mhkit/river/io/usgs.py b/mhkit/river/io/usgs.py index 9b104f826..35ca11ecf 100644 --- a/mhkit/river/io/usgs.py +++ b/mhkit/river/io/usgs.py @@ -120,7 +120,10 @@ def request_usgs_data( # Use handle_caching to manage cache cached_data, metadata, cache_filepath = handle_caching( - hash_params, cache_dir, write_json, clear_cache + hash_params, + cache_dir, + cache_content={"data": None, "metadata": None, "write_json": write_json}, + clear_cache_file=clear_cache, ) if cached_data is not None: @@ -165,7 +168,12 @@ def request_usgs_data( # After making the API request and processing the response, write the # response to a cache file - handle_caching(hash_params, cache_dir, data=data, clear_cache_file=clear_cache) + handle_caching( + hash_params, + cache_dir, + cache_content={"data": data, "metadata": None, "write_json": None}, + clear_cache_file=clear_cache, + ) if write_json: shutil.copy(cache_filepath, write_json) diff --git a/mhkit/tests/utils/test_cache.py b/mhkit/tests/utils/test_cache.py index 14aae0802..3cd5fff43 100644 --- a/mhkit/tests/utils/test_cache.py +++ b/mhkit/tests/utils/test_cache.py @@ -93,7 +93,11 @@ def test_handle_caching_creates_cache(self): Asserts: - The cache file is successfully created at the expected file path. """ - handle_caching(self.hash_params, self.cache_dir, data=self.data) + handle_caching( + self.hash_params, + self.cache_dir, + cache_content={"data": self.data, "metadata": None, "write_json": None}, + ) cache_filename = ( hashlib.md5(self.hash_params.encode("utf-8")).hexdigest() + ".json" @@ -114,8 +118,18 @@ def test_handle_caching_retrieves_data(self): Asserts: - The retrieved data matches the original sample DataFrame. """ - handle_caching(self.hash_params, self.cache_dir, data=self.data) - retrieved_data, _, _ = handle_caching(self.hash_params, self.cache_dir) + handle_caching( + self.hash_params, + self.cache_dir, + cache_content={"data": self.data, "metadata": None, "write_json": None}, + ) + + retrieved_data, _, _ = handle_caching( + self.hash_params, + self.cache_dir, + cache_content={"data": None, "metadata": None, "write_json": None}, + ) + pd.testing.assert_frame_equal(self.data, retrieved_data, check_freq=False) def test_handle_caching_cdip_file_extension(self): @@ -132,7 +146,11 @@ def test_handle_caching_cdip_file_extension(self): - The cache file with a ".pkl" extension is successfully created at the expected file path. 
""" cache_dir = os.path.join(self.cache_dir, "cdip") - handle_caching(self.hash_params, cache_dir, data=self.data) + handle_caching( + self.hash_params, + cache_dir, + cache_content={"data": self.data, "metadata": None, "write_json": None}, + ) cache_filename = ( hashlib.md5(self.hash_params.encode("utf-8")).hexdigest() + ".pkl" diff --git a/mhkit/tidal/io/noaa.py b/mhkit/tidal/io/noaa.py index d0aadc861..2ab8a1d2a 100644 --- a/mhkit/tidal/io/noaa.py +++ b/mhkit/tidal/io/noaa.py @@ -124,7 +124,10 @@ def request_noaa_data( # Use handle_caching to manage cache cached_data, cached_metadata, cache_filepath = handle_caching( - hash_params, cache_dir, write_json=write_json, clear_cache_file=clear_cache + hash_params, + cache_dir, + cache_content={"data": None, "metadata": None, "write_json": write_json}, + clear_cache_file=clear_cache, ) if cached_data is not None: @@ -205,8 +208,7 @@ def request_noaa_data( handle_caching( hash_params, cache_dir, - data=data, - metadata=metadata, + cache_content={"data": data, "metadata": metadata, "write_json": None}, clear_cache_file=clear_cache, ) diff --git a/mhkit/utils/cache.py b/mhkit/utils/cache.py index 423a12757..de30a4e7e 100644 --- a/mhkit/utils/cache.py +++ b/mhkit/utils/cache.py @@ -42,18 +42,163 @@ import hashlib import json import os -import re + +# import re import shutil import pickle import pandas as pd +# def old_handle_caching( +# hash_params, +# cache_dir, +# data=None, +# metadata=None, +# write_json=None, +# clear_cache_file=False, +# ): +# """ +# Handles caching of data to avoid redundant network requests or +# computations. + +# The function checks if a cache file exists for the given parameters. +# If it does, the function will load data from the cache file, unless +# the `clear_cache_file` parameter is set to `True`, in which case the +# cache file is cleared. If the cache file does not exist and the +# `data` parameter is not `None`, the function will store the +# provided data in a cache file. + +# Parameters +# ---------- +# hash_params : str +# The parameters to be hashed and used as the filename for the cache file. +# cache_dir : str +# The directory where the cache files are stored. +# data : pandas DataFrame or None +# The data to be stored in the cache file. If `None`, the function +# will attempt to load data from the cache file. +# metadata : dict or None +# Metadata associated with the data. This will be stored in the +# cache file along with the data. +# write_json : str or None +# If specified, the cache file will be copied to a file with this name. +# clear_cache_file : bool +# If `True`, the cache file for the given parameters will be cleared. + +# Returns +# ------- +# data : pandas DataFrame or None +# The data loaded from the cache file. If data was provided as a +# parameter, the same data will be returned. If the cache file +# does not exist and no data was provided, `None` will be returned. +# metadata : dict or None +# The metadata loaded from the cache file. If metadata was provided +# as a parameter, the same metadata will be returned. If the cache +# file does not exist and no metadata was provided, `None` will be +# returned. +# cache_filepath : str +# The path to the cache file. 
+# """ + +# # Check if 'cdip' is in cache_dir, then use .pkl instead of .json +# file_extension = ( +# ".pkl" +# if "cdip" in cache_dir or "hindcast" in cache_dir or "ndbc" in cache_dir +# else ".json" +# ) + +# # Make cache directory if it doesn't exist +# if not os.path.isdir(cache_dir): +# os.makedirs(cache_dir) + +# # Create a unique filename based on the function parameters +# cache_filename = ( +# hashlib.md5(hash_params.encode("utf-8")).hexdigest() + file_extension +# ) +# cache_filepath = os.path.join(cache_dir, cache_filename) + +# # If clear_cache_file is True, remove the cache file for this request +# if clear_cache_file and os.path.isfile(cache_filepath): +# os.remove(cache_filepath) +# print(f"Cleared cache for {cache_filepath}") + +# # If a cached file exists, load and return the data from the file +# if os.path.isfile(cache_filepath) and data is None: +# if file_extension == ".json": +# with open(cache_filepath, encoding="utf-8") as f: +# json_data = json.load(f) + +# # Extract metadata if it exists +# if "metadata" in json_data: +# metadata = json_data.pop("metadata", None) + +# # Check if index is datetime formatted +# if all( +# re.match(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", str(dt)) +# for dt in json_data["index"] +# ): +# data = pd.DataFrame( +# json_data["data"], +# index=pd.to_datetime(json_data["index"]), +# columns=json_data["columns"], +# ) +# else: +# data = pd.DataFrame( +# json_data["data"], +# index=json_data["index"], +# columns=json_data["columns"], +# ) + +# # Convert the rest to DataFrame +# data = pd.DataFrame( +# json_data["data"], +# index=pd.to_datetime(json_data["index"]), +# columns=json_data["columns"], +# ) + +# elif file_extension == ".pkl": +# with open(cache_filepath, "rb") as f: +# data, metadata = pickle.load(f) + +# if write_json: +# shutil.copy(cache_filepath, write_json) + +# return data, metadata, cache_filepath + +# # If a cached file does not exist and data is provided, +# # store the data in a cache file +# if data is not None: +# if file_extension == ".json": +# # Convert DataFrame to python dict +# py_data = data.to_dict(orient="split") +# # Add metadata to py_data +# py_data["metadata"] = metadata +# # Check if index is datetime indexed +# if isinstance(data.index, pd.DatetimeIndex): +# py_data["index"] = [ +# dt.strftime("%Y-%m-%d %H:%M:%S") for dt in py_data["index"] +# ] +# else: +# py_data["index"] = list(data.index) +# with open(cache_filepath, "w", encoding="utf-8") as f: +# json.dump(py_data, f) + +# elif file_extension == ".pkl": +# with open(cache_filepath, "wb") as f: +# pickle.dump((data, metadata), f) + +# if write_json: +# shutil.copy(cache_filepath, write_json) + +# return data, metadata, cache_filepath +# # If data is not provided and the cache file doesn't exist, return cache_filepath +# return None, None, cache_filepath + + def handle_caching( hash_params, cache_dir, - data=None, - metadata=None, - write_json=None, + cache_content=None, clear_cache_file=False, ): """ @@ -73,14 +218,10 @@ def handle_caching( The parameters to be hashed and used as the filename for the cache file. cache_dir : str The directory where the cache files are stored. - data : pandas DataFrame or None - The data to be stored in the cache file. If `None`, the function + cache_content : dict or None + Dictionary containing 'data' (pandas DataFrame or None), 'metadata' + (dict or None), and 'write_json' (str or None). If `None`, the function will attempt to load data from the cache file. 
- metadata : dict or None - Metadata associated with the data. This will be stored in the - cache file along with the data. - write_json : str or None - If specified, the cache file will be copied to a file with this name. clear_cache_file : bool If `True`, the cache file for the given parameters will be cleared. @@ -99,98 +240,92 @@ def handle_caching( The path to the cache file. """ - # Check if 'cdip' is in cache_dir, then use .pkl instead of .json - file_extension = ( - ".pkl" - if "cdip" in cache_dir or "hindcast" in cache_dir or "ndbc" in cache_dir - else ".json" - ) - - # Make cache directory if it doesn't exist - if not os.path.isdir(cache_dir): - os.makedirs(cache_dir) - - # Create a unique filename based on the function parameters - cache_filename = ( - hashlib.md5(hash_params.encode("utf-8")).hexdigest() + file_extension - ) - cache_filepath = os.path.join(cache_dir, cache_filename) - - # If clear_cache_file is True, remove the cache file for this request - if clear_cache_file and os.path.isfile(cache_filepath): - os.remove(cache_filepath) - print(f"Cleared cache for {cache_filepath}") - - # If a cached file exists, load and return the data from the file - if os.path.isfile(cache_filepath) and data is None: + # Initialize data and metadata to None to avoid pylint errors + data = None + metadata = None + + def _generate_cache_filepath(): + """Generates the cache file path based on the hashed parameters.""" + file_extension = ( + ".pkl" + if "cdip" in cache_dir or "hindcast" in cache_dir or "ndbc" in cache_dir + else ".json" + ) + cache_filename = ( + hashlib.md5(hash_params.encode("utf-8")).hexdigest() + file_extension + ) + return os.path.join(cache_dir, cache_filename), file_extension + + def _clear_cache(cache_filepath): + """Clear the cache file if requested.""" + if clear_cache_file and os.path.isfile(cache_filepath): + os.remove(cache_filepath) + print(f"Cleared cache for {cache_filepath}") + + def _load_cache(file_extension, cache_filepath): + """Load data from the cache file based on its extension.""" + nonlocal data, metadata # Specify that these are outer variables if file_extension == ".json": with open(cache_filepath, encoding="utf-8") as f: - jsonData = json.load(f) - - # Extract metadata if it exists - if "metadata" in jsonData: - metadata = jsonData.pop("metadata", None) - - # Check if index is datetime formatted - if all( - re.match(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", str(dt)) - for dt in jsonData["index"] - ): - data = pd.DataFrame( - jsonData["data"], - index=pd.to_datetime(jsonData["index"]), - columns=jsonData["columns"], - ) - else: - data = pd.DataFrame( - jsonData["data"], - index=jsonData["index"], - columns=jsonData["columns"], - ) + json_data = json.load(f) + + metadata = json_data.pop("metadata", None) - # Convert the rest to DataFrame data = pd.DataFrame( - jsonData["data"], - index=pd.to_datetime(jsonData["index"]), - columns=jsonData["columns"], + json_data["data"], + index=pd.to_datetime(json_data["index"]), + columns=json_data["columns"], ) - elif file_extension == ".pkl": with open(cache_filepath, "rb") as f: data, metadata = pickle.load(f) - if write_json: - shutil.copy(cache_filepath, write_json) - - return data, metadata, cache_filepath + return data, metadata - # If a cached file does not exist and data is provided, - # store the data in a cache file - elif data is not None: + def _write_cache(data, metadata, file_extension, cache_filepath): + """Store data in the cache file based on the extension.""" if file_extension == ".json": - # 
Convert DataFrame to python dict - pyData = data.to_dict(orient="split") - # Add metadata to pyData - pyData["metadata"] = metadata - # Check if index is datetime indexed + py_data = data.to_dict(orient="split") + py_data["metadata"] = metadata if isinstance(data.index, pd.DatetimeIndex): - pyData["index"] = [ - dt.strftime("%Y-%m-%d %H:%M:%S") for dt in pyData["index"] + py_data["index"] = [ + dt.strftime("%Y-%m-%d %H:%M:%S") for dt in py_data["index"] ] else: - pyData["index"] = list(data.index) + py_data["index"] = list(data.index) with open(cache_filepath, "w", encoding="utf-8") as f: - json.dump(pyData, f) - + json.dump(py_data, f) elif file_extension == ".pkl": with open(cache_filepath, "wb") as f: pickle.dump((data, metadata), f) - if write_json: - shutil.copy(cache_filepath, write_json) + # Create the cache directory if it doesn't exist + if not os.path.isdir(cache_dir): + os.makedirs(cache_dir) + + # Generate cache filepath and extension + cache_filepath, file_extension = _generate_cache_filepath() + + # Clear cache if requested + _clear_cache(cache_filepath) + + # Check if cache file exists and load if no data provided + if os.path.isfile(cache_filepath) and cache_content is None: + return _load_cache(file_extension, cache_filepath) + (cache_filepath,) + + # Store data in cache if provided + if cache_content and cache_content["data"] is not None: + _write_cache( + cache_content["data"], + cache_content["metadata"], + file_extension, + cache_filepath, + ) + if cache_content["write_json"]: + shutil.copy(cache_filepath, cache_content["write_json"]) + + return cache_content["data"], cache_content["metadata"], cache_filepath - return data, metadata, cache_filepath - # If data is not provided and the cache file doesn't exist, return cache_filepath return None, None, cache_filepath diff --git a/mhkit/wave/io/cdip.py b/mhkit/wave/io/cdip.py index 5fb6e34f3..92a1d47e6 100644 --- a/mhkit/wave/io/cdip.py +++ b/mhkit/wave/io/cdip.py @@ -324,7 +324,11 @@ def request_parse_workflow( if not multiyear: # Check the cache first hash_params = f"{station_number}-{parameters}-{start_date}-{end_date}" - data = handle_caching(hash_params, cache_dir) + data, _, _ = handle_caching( + hash_params, + cache_dir, + cache_content={"data": None, "metadata": None, "write_json": None}, + ) if data[:2] == (None, None): data = get_netcdf_variables( @@ -335,7 +339,11 @@ def request_parse_workflow( all_2D_variables=all_2D_variables, silent=silent, ) - handle_caching(hash_params, cache_dir, data=data) + handle_caching( + hash_params, + cache_dir, + cache_content={"data": data, "metadata": None, "write_json": None}, + ) else: data = data[0] @@ -348,7 +356,11 @@ def request_parse_workflow( # Check the cache for each individual year hash_params = f"{station_number}-{parameters}-{start_date}-{end_date}" - year_data = handle_caching(hash_params, cache_dir) + year_data, _, _ = handle_caching( + hash_params, + cache_dir, + cache_content={"data": None, "metadata": None, "write_json": None}, + ) if year_data[:2] == (None, None): year_data = get_netcdf_variables( nc, @@ -359,7 +371,15 @@ def request_parse_workflow( silent=silent, ) # Cache the individual year's data - handle_caching(hash_params, cache_dir, data=year_data) + handle_caching( + hash_params, + cache_dir, + cache_content={ + "data": year_data, + "metadata": None, + "write_json": None, + }, + ) else: year_data = year_data[0] multiyear_data[year] = year_data["data"] diff --git a/mhkit/wave/io/hindcast/hindcast.py b/mhkit/wave/io/hindcast/hindcast.py index 
4bcc4486e..c58e55c40 100644 --- a/mhkit/wave/io/hindcast/hindcast.py +++ b/mhkit/wave/io/hindcast/hindcast.py @@ -192,7 +192,11 @@ def request_wpto_point_data( # Construct a string representation of the function parameters hash_params = f"{data_type}_{parameter}_{lat_lon}_{years}_{tree}_{unscale}_{str_decode}_{hsds}_{path}_{to_pandas}" cache_dir = _get_cache_dir() - data, meta, _ = handle_caching(hash_params, cache_dir) + data, meta, _ = handle_caching( + hash_params, + cache_dir, + cache_content={"data": None, "metadata": None, "write_json": None}, + ) if data is not None: return data, meta @@ -277,7 +281,11 @@ def request_wpto_point_data( data = data.drop_vars("index") # save_to_cache(hash_params, data, meta) - handle_caching(hash_params, cache_dir, data, meta) + handle_caching( + hash_params, + cache_dir, + cache_content={"data": data, "metadata": meta, "write_json": None}, + ) return data, meta @@ -374,7 +382,11 @@ def request_wpto_directional_spectrum( # Attempt to load data from cache hash_params = f"{lat_lon}_{year}_{tree}_{unscale}_{str_decode}_{hsds}_{path}" cache_dir = _get_cache_dir() - data, meta, _ = handle_caching(hash_params, cache_dir) + data, meta, _ = handle_caching( + hash_params, + cache_dir, + cache_content={"data": None, "metadata": None, "write_json": None}, + ) if data is not None: return data, meta @@ -480,7 +492,11 @@ def request_wpto_directional_spectrum( }, ) - handle_caching(hash_params, cache_dir, data, meta) + handle_caching( + hash_params, + cache_dir, + cache_content={"data": data, "metadata": meta, "write_json": None}, + ) return data, meta diff --git a/mhkit/wave/io/hindcast/wind_toolkit.py b/mhkit/wave/io/hindcast/wind_toolkit.py index aad65c09d..2205e2be4 100644 --- a/mhkit/wave/io/hindcast/wind_toolkit.py +++ b/mhkit/wave/io/hindcast/wind_toolkit.py @@ -417,7 +417,12 @@ def request_wtk_point_data( hash_params = f"{time_interval}_{parameter}_{lat_lon}_{years}_{preferred_region}_{tree}_{unscale}_{str_decode}_{hsds}" # Use handle_caching to manage caching. - data, meta, _ = handle_caching(hash_params, cache_dir, clear_cache_file=clear_cache) + data, meta, _ = handle_caching( + hash_params, + cache_dir, + cache_content={"data": None, "metadata": None, "write_json": None}, + clear_cache_file=clear_cache, + ) if data is not None and meta is not None: if not to_pandas: @@ -478,7 +483,11 @@ def request_wtk_point_data( meta = meta.reset_index(drop=True) # Save the retrieved data and metadata to cache. 
- handle_caching(hash_params, cache_dir, data=data, metadata=meta) + handle_caching( + hash_params, + cache_dir, + cache_content={"data": data, "metadata": meta, "write_json": None}, + ) if not to_pandas: data = convert_to_dataset(data) diff --git a/mhkit/wave/io/ndbc.py b/mhkit/wave/io/ndbc.py index 12ad3e9a7..3356358cd 100644 --- a/mhkit/wave/io/ndbc.py +++ b/mhkit/wave/io/ndbc.py @@ -207,7 +207,12 @@ def available_data( cache_dir = os.path.join(os.path.expanduser("~"), ".cache", "mhkit", "ndbc") # Check the cache before making the request - data, _, _ = handle_caching(hash_params, cache_dir, clear_cache_file=clear_cache) + data, _, _ = handle_caching( + hash_params, + cache_dir, + cache_content={"data": None, "metadata": None, "write_json": None}, + clear_cache_file=clear_cache, + ) # no coverage bc in coverage runs we have already cached the data/ run this code if data is None: # pragma: no cover @@ -246,7 +251,16 @@ def available_data( data = available_data[available_data.id == buoy_number[i]] available_data = available_data.append(data) # Cache the result - handle_caching(hash_params, cache_dir, data=available_data) + handle_caching( + hash_params, + cache_dir, + cache_content={ + "data": available_data, + "metadata": None, + "write_json": None, + }, + ) + else: available_data = data @@ -371,7 +385,10 @@ def request_data(parameter, filenames, proxy=None, clear_cache=False, to_pandas= # Create a unique filename based on the function parameters for caching hash_params = f"{buoy_id}_{parameter}_{year}_{filename}" cached_data, _, _ = handle_caching( - hash_params, cache_dir, clear_cache_file=clear_cache + hash_params, + cache_dir, + cache_content={"data": None, "metadata": None, "write_json": None}, + clear_cache_file=clear_cache, ) if cached_data is not None: @@ -415,7 +432,13 @@ def request_data(parameter, filenames, proxy=None, clear_cache=False, to_pandas= # Cache the data after processing it if it exists if year in ndbc_data[buoy_id]: handle_caching( - hash_params, cache_dir, data=ndbc_data[buoy_id][year] + hash_params, + cache_dir, + cache_content={ + "data": ndbc_data[buoy_id][year], + "metadata": None, + "write_json": None, + }, ) if buoy_id and len(ndbc_data) == 1: From 88810cf957411239443c4650251276d5039592b0 Mon Sep 17 00:00:00 2001 From: ssolson Date: Thu, 5 Sep 2024 11:04:29 -0400 Subject: [PATCH 05/31] index is now "t" --- mhkit/tests/tidal/test_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mhkit/tests/tidal/test_io.py b/mhkit/tests/tidal/test_io.py index 280b847ce..5ce38e4fa 100644 --- a/mhkit/tests/tidal/test_io.py +++ b/mhkit/tests/tidal/test_io.py @@ -103,11 +103,11 @@ def test_request_noaa_data_basic_xarray(self): ) # Check if the variable sets are equal data_variables = list(data.variables) - required_variables = ["index", "s", "d", "b"] + required_variables = ["t", "s", "d", "b"] data_variables_set = set(data_variables) required_variables_set = set(required_variables) self.assertTrue(data_variables_set == required_variables_set) - self.assertEqual(len(data["index"]), 183) + self.assertEqual(len(data["t"]), 183) self.assertEqual(data.attrs["id"], "s08010") def test_request_noaa_data_write_json(self): From 71974b33a5e84927fa613a222ed22fbb06e17656 Mon Sep 17 00:00:00 2001 From: ssolson Date: Thu, 5 Sep 2024 11:08:08 -0400 Subject: [PATCH 06/31] 10/10 lint --- mhkit/utils/upcrossing.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mhkit/utils/upcrossing.py b/mhkit/utils/upcrossing.py index 5993d6544..18db3f142 
100644 --- a/mhkit/utils/upcrossing.py +++ b/mhkit/utils/upcrossing.py @@ -75,15 +75,15 @@ def upcrossing(t, data): raise ValueError("only 1D data supported, try calling squeeze()") # eliminate zeros - zeroMask = data == 0 - data[zeroMask] = 0.5 * np.min(np.abs(data)) + zero_mask = data == 0 + data[zero_mask] = 0.5 * np.min(np.abs(data)) # zero up-crossings diff = np.diff(np.sign(data)) - zeroUpCrossings_mask = (diff == 2) | (diff == 1) - zeroUpCrossings_index = np.where(zeroUpCrossings_mask)[0] + zero_upcrossings_mask = (diff == 2) | (diff == 1) + zero_upcrossings_index = np.where(zero_upcrossings_mask)[0] - return zeroUpCrossings_index + return zero_upcrossings_index def peaks(t, data, inds=None): From 8d0263f2b66aad88b24ffff699be5af2c0b489c1 Mon Sep 17 00:00:00 2001 From: ssolson Date: Thu, 5 Sep 2024 11:32:45 -0400 Subject: [PATCH 07/31] 10/10 lint --- mhkit/utils/stat_utils.py | 165 ++++++++++++++++++++++++-------------- 1 file changed, 104 insertions(+), 61 deletions(-) diff --git a/mhkit/utils/stat_utils.py b/mhkit/utils/stat_utils.py index f0a7e2994..681636680 100644 --- a/mhkit/utils/stat_utils.py +++ b/mhkit/utils/stat_utils.py @@ -1,9 +1,57 @@ -from mhkit import qc +""" +This module contains functions to perform various statistical calculations +on continuous data. It includes functions for calculating statistics such as +mean, max, min, and standard deviation over specific windows, as well as functions +for vector/directional statistics. The module also provides utility functions +to unwrap vectors, compute magnitudes and phases in 2D/3D, and calculate +the root mean squared values of vector components. + +Functions: +---------- +- get_statistics: Calculates statistics for continuous data. +- vector_statistics: Calculates vector mean and standard deviation. +- unwrap_vector: Unwraps vector data to fall within a 0-360 degree range. +- magnitude_phase: Computes magnitude and phase for 2D or 3D data. +- unorm: Computes root mean squared value of 3D vectors. +""" + import pandas as pd import numpy as np +from mhkit import qc + + +def _calculate_statistics(datachunk, vector_channels): + """ + Calculate the mean, max, min, and standard deviation for the given datachunk. + Also calculate vector statistics for vector_channels. + + Parameters + ---------- + datachunk : pandas DataFrame + A chunk of data on which to perform statistics. + vector_channels : list + List of vector channel names formatted in deg (0-360). + + Returns + ------- + stats : dict + A dictionary containing 'means', 'maxs', 'mins', and 'stdevs'. + """ + means = datachunk.mean() + maxs = datachunk.max() + mins = datachunk.min() + stdevs = datachunk.std() + + for v in vector_channels: + vector_avg, vector_std = vector_statistics(datachunk[v]) + # overwrite scalar average and std for channel + means[v] = vector_avg + stdevs[v] = vector_std + return {"means": means, "maxs": maxs, "mins": mins, "stdevs": stdevs} -def get_statistics(data, freq, period=600, vector_channels=[]): + +def get_statistics(data, freq, period=600, vector_channels=None): """ Calculate mean, max, min and stdev statistics of continuous data for a given statistical window. 
Default length of statistical window (period) is @@ -26,71 +74,63 @@ def get_statistics(data, freq, period=600, vector_channels=[]): means,maxs,mins,stdevs : pandas DataFrame Calculated statistical values from the data, indexed by the first timestamp """ - # Check data type + if vector_channels is None: + vector_channels = [] + + if isinstance(vector_channels, str): + vector_channels = [vector_channels] + if not isinstance(data, pd.DataFrame): raise TypeError(f"data must be of type pd.DataFrame. Got: {type(data)}") if not isinstance(freq, (float, int)): raise TypeError(f"freq must be of type int or float. Got: {type(freq)}") if not isinstance(period, (float, int)): raise TypeError(f"period must be of type int or float. Got: {type(period)}") - # catch if vector_channels is not an string array - if isinstance(vector_channels, str): - vector_channels = [vector_channels] if not isinstance(vector_channels, list): raise TypeError( f"vector_channels must be a list of strings. Got: {type(vector_channels)}" ) - # Check timestamp using qc module data.index = data.index.round("1ms") - dataQC = qc.check_timestamp(data, 1 / freq) - dataQC = dataQC["cleaned_data"] + data_qc = qc.check_timestamp(data, 1 / freq)["cleaned_data"] - # Check to see if data length contains enough data points for statistical window - if len(dataQC) % (period * freq) > 0: - remain = len(dataQC) % (period * freq) - dataQC = dataQC.iloc[0 : -int(remain)] + if len(data_qc) % (period * freq) > 0: + remain = len(data_qc) % (period * freq) + data_qc = data_qc.iloc[0 : -int(remain)] print( - "WARNING: there were not enough data points in the last statistical period. Last " - + str(remain) - + " points were removed." + f"WARNING: there were not enough data points in the last statistical period. \ + Last {remain} points were removed." 
) - # Pre-allocate lists time = [] means = [] maxs = [] mins = [] - stdev = [] + stdevs = [] - # Get data chunks to performs stats on step = period * freq - for i in range(int(len(dataQC) / (period * freq))): - datachunk = dataQC.iloc[i * step : (i + 1) * step] - # Check whether there are any NaNs in datachunk + for i in range(int(len(data_qc) / step)): + datachunk = data_qc.iloc[i * step : (i + 1) * step] if datachunk.isnull().any().any(): print("NaNs found in statistical window...check timestamps!") input("Press to continue") continue - else: - # Get stats - time.append(datachunk.index.values[0]) # time vector - maxs.append(datachunk.max()) # maxes - mins.append(datachunk.min()) # mins - means.append(datachunk.mean()) # means - stdev.append(datachunk.std()) # standard deviation - # calculate vector averages and std - for v in vector_channels: - vector_avg, vector_std = vector_statistics(datachunk[v]) - # overwrite scalar average for channel - means[i][v] = vector_avg - stdev[i][v] = vector_std # overwrite scalar std for channel - - # Convert to DataFrames and set index + + time.append(datachunk.index.values[0]) + + # Calculate statistics for this chunk + stats = _calculate_statistics(datachunk, vector_channels) + + means.append(stats["means"]) + maxs.append(stats["maxs"]) + mins.append(stats["mins"]) + stdevs.append(stats["stdevs"]) + + # Convert lists to DataFrames means = pd.DataFrame(means, index=time) maxs = pd.DataFrame(maxs, index=time) mins = pd.DataFrame(mins, index=time) - stdevs = pd.DataFrame(stdev, index=time) + stdevs = pd.DataFrame(stdevs, index=time) return means, maxs, mins, stdevs @@ -114,22 +154,23 @@ def vector_statistics(data): """ try: data = np.array(data) - except: - pass + except (TypeError, ValueError) as e: + raise TypeError(f"Error converting data to numpy array: {e}") from e + if not isinstance(data, np.ndarray): raise TypeError(f"data must be of type np.ndarray. Got: {type(data)}") # calculate mean - Ux = sum(np.sin(data * np.pi / 180)) / len(data) - Uy = sum(np.cos(data * np.pi / 180)) / len(data) - vector_avg = 90 - np.arctan2(Uy, Ux) * 180 / np.pi + u_x = sum(np.sin(data * np.pi / 180)) / len(data) + u_y = sum(np.cos(data * np.pi / 180)) / len(data) + vector_avg = 90 - np.arctan2(u_y, u_x) * 180 / np.pi if vector_avg < 0: vector_avg = vector_avg + 360 elif vector_avg > 360: vector_avg = vector_avg - 360 # calculate standard deviation # round to 8th decimal place to reduce roundoff error - magsum = round((Ux**2 + Uy**2) * 1e8) / 1e8 + magsum = round((u_x**2 + u_y**2) * 1e8) / 1e8 epsilon = (1 - magsum) ** 0.5 if not np.isreal(epsilon): # check if epsilon is imaginary (error) vector_std = 0 @@ -157,17 +198,19 @@ def unwrap_vector(data): # Check data types try: data = np.array(data) - except: - pass + except (TypeError, ValueError) as e: + raise TypeError(f"Error converting data to numpy array: {e}") from e + if not isinstance(data, np.ndarray): raise TypeError(f"data must be of type np.ndarray. 
Got: {type(data)}") # Loop through and unwrap points - for i in range(len(data)): - if data[i] < 0: - data[i] = data[i] + 360 - elif data[i] > 360: - data[i] = data[i] - 360 + for i, value in enumerate(data): + if value < 0: + data[i] = value + 360 + elif value > 360: + data[i] = value - 360 + if max(data) > 360 or min(data) < 0: data = unwrap_vector(data) return data @@ -199,10 +242,10 @@ def magnitude_phase(x, y, z=None): x = np.array(x) y = np.array(y) - threeD = False + three_d = False if not isinstance(z, type(None)): z = np.array(z) - threeD = True + three_d = True if not isinstance(x, (float, int, np.ndarray)): raise TypeError(f"x must be of type float, int, or np.ndarray. Got: {type(x)}") @@ -213,15 +256,15 @@ def magnitude_phase(x, y, z=None): f"If specified, z must be of type float, int, or np.ndarray. Got: {type(z)}" ) - if threeD: + if three_d: mag = np.sqrt(x**2 + y**2 + z**2) theta = np.arctan2(y, x) phi = np.arctan2(np.sqrt(x**2 + y**2), z) return mag, theta, phi - else: - mag = np.sqrt(x**2 + y**2) - theta = np.arctan2(y, x) - return mag, theta + + mag = np.sqrt(x**2 + y**2) + theta = np.arctan2(y, x) + return mag, theta def unorm(x, y, z): @@ -239,7 +282,7 @@ def unorm(x, y, z): Returns ------- - unorm : array + u_norm : array The root mean squared of x, y, and z. Example @@ -265,6 +308,6 @@ def unorm(x, y, z): raise ValueError("lengths of arrays must match") xyz = np.array([x, y, z]) - unorm = np.linalg.norm(xyz, axis=0) + u_norm = np.linalg.norm(xyz, axis=0) - return unorm + return u_norm From 769a26fb8ec61155fb2bbf1b15675cdfaecb2c86 Mon Sep 17 00:00:00 2001 From: ssolson Date: Thu, 5 Sep 2024 12:04:47 -0400 Subject: [PATCH 08/31] add test__calculate_statistics --- mhkit/tests/utils/test_utils.py | 35 +++++++++++++++++++++++++++++++-- mhkit/utils/__init__.py | 1 + 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/mhkit/tests/utils/test_utils.py b/mhkit/tests/utils/test_utils.py index ba2b9cb03..192c0b226 100644 --- a/mhkit/tests/utils/test_utils.py +++ b/mhkit/tests/utils/test_utils.py @@ -29,10 +29,10 @@ def test_get_statistics(self): # load in file df = self.data["loads"] df.Timestamp = pd.to_datetime(df.Timestamp) - df.set_index("Timestamp", inplace=True) + test_df = df.set_index("Timestamp") # run function means, maxs, mins, stdevs = utils.get_statistics( - df, + test_df, self.freq, period=self.period, vector_channels=["WD_Nacelle", "WD_NacelleMod"], @@ -57,6 +57,37 @@ def test_get_statistics(self): time = pd.to_datetime(string_time) self.assertTrue(means.index[0] == time) + def test__calculate_statistics(self): + # load in file + df = self.data["loads"] + df.Timestamp = pd.to_datetime(df.Timestamp) + test_df = df.set_index("Timestamp") + + # Select a specific data chunk (the first 10 rows) + datachunk = test_df.iloc[:10] + + # Run the calculate_statistics function + stats = utils._calculate_statistics( + datachunk, vector_channels=["WD_Nacelle", "WD_NacelleMod"] + ) + + means = stats["means"] + maxs = stats["maxs"] + mins = stats["mins"] + stdevs = stats["stdevs"] + + # check statistics for a specific column ('uWind_80m') + self.assertAlmostEqual(means["uWind_80m"], 3.226, 2) # mean + self.assertAlmostEqual(maxs["uWind_80m"], 3.234, 2) # max + self.assertAlmostEqual(mins["uWind_80m"], 3.221, 2) # min + self.assertAlmostEqual(stdevs["uWind_80m"], 0.005049, 2) # standard deviation + + # check vector statistics for 'WD_Nacelle' + self.assertAlmostEqual(means["WD_Nacelle"], 157.302, 2) # vector mean + self.assertAlmostEqual( + stdevs["WD_Nacelle"], 0.000, 2 
+ ) # vector standard deviation + def test_vector_statistics(self): # load in vector variable df = self.data["loads"] diff --git a/mhkit/utils/__init__.py b/mhkit/utils/__init__.py index b484862b0..328a33200 100644 --- a/mhkit/utils/__init__.py +++ b/mhkit/utils/__init__.py @@ -6,6 +6,7 @@ from .time_utils import matlab_to_datetime, excel_to_datetime, index_to_datetime from .stat_utils import ( + _calculate_statistics, get_statistics, vector_statistics, unwrap_vector, From 2f2655b891c16e09908b9cfe3ab18838905a1630 Mon Sep 17 00:00:00 2001 From: ssolson Date: Thu, 5 Sep 2024 12:10:33 -0400 Subject: [PATCH 09/31] 10/10 lint --- mhkit/utils/time_utils.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/mhkit/utils/time_utils.py b/mhkit/utils/time_utils.py index 1f83f8ff9..2348a9916 100644 --- a/mhkit/utils/time_utils.py +++ b/mhkit/utils/time_utils.py @@ -1,7 +1,16 @@ +""" +This module provides utility functions for converting datetime formats +from MATLAB and Excel to Python datetime formats. + +Functions: +---------- +- matlab_to_datetime: Converts MATLAB datenum format to Python datetime. +- excel_to_datetime: Converts Excel datenum format to Python datetime. +""" + import datetime as dt import pandas as pd import numpy as np -from pecos.utils import index_to_datetime def matlab_to_datetime(matlab_datenum): @@ -21,10 +30,10 @@ def matlab_to_datetime(matlab_datenum): # Check data types try: matlab_datenum = np.array(matlab_datenum, ndmin=1) - except: - pass + except (TypeError, ValueError) as e: + raise TypeError(f"Error converting to numpy array: {e}") from e if not isinstance(matlab_datenum, np.ndarray): - raise TypeError(f"data must be of type np.ndarray. Got: {type(data)}") + raise TypeError(f"data must be of type np.ndarray. Got: {type(matlab_datenum)}") # Pre-allocate time = [] @@ -56,8 +65,8 @@ def excel_to_datetime(excel_num): # Check data types try: excel_num = np.array(excel_num) - except: - pass + except (TypeError, ValueError) as e: + raise TypeError(f"Error converting to numpy array: {e}") from e if not isinstance(excel_num, np.ndarray): raise TypeError(f"excel_num must be of type np.ndarray. Got: {type(excel_num)}") From e6da2ed89452821e0954eb8eb77c0dbb1e396ad5 Mon Sep 17 00:00:00 2001 From: ssolson Date: Thu, 5 Sep 2024 12:23:49 -0400 Subject: [PATCH 10/31] 10/10 pylint --- mhkit/utils/time_utils.py | 3 ++ mhkit/utils/type_handling.py | 78 +++++++++++++++++++++++++----------- 2 files changed, 57 insertions(+), 24 deletions(-) diff --git a/mhkit/utils/time_utils.py b/mhkit/utils/time_utils.py index 2348a9916..8e3db5875 100644 --- a/mhkit/utils/time_utils.py +++ b/mhkit/utils/time_utils.py @@ -12,6 +12,9 @@ import pandas as pd import numpy as np +# pylint: disable=unused-import +from pecos.utils import index_to_datetime + def matlab_to_datetime(matlab_datenum): """ diff --git a/mhkit/utils/type_handling.py b/mhkit/utils/type_handling.py index 1b06c7d12..0f928fb8f 100644 --- a/mhkit/utils/type_handling.py +++ b/mhkit/utils/type_handling.py @@ -1,3 +1,18 @@ +""" +This module provides utility functions for converting various data types +to xarray structures such as xarray.DataArray and xarray.Dataset. It also +includes functions for handling nested dictionaries containing pandas +DataFrames by converting them to xarray Datasets. + +Functions: +---------- +- to_numeric_array: Converts input data to a numeric NumPy array. +- convert_to_dataset: Converts pandas or xarray data structures to xarray.Dataset. 
+- convert_to_dataarray: Converts various data types to xarray.DataArray. +- convert_nested_dict_and_pandas: Recursively converts pandas DataFrames + in nested dictionaries to xarray Datasets. +""" + import numpy as np import pandas as pd import xarray as xr @@ -27,8 +42,10 @@ def convert_to_dataset(data, name="data"): """ Converts the given data to an xarray.Dataset. - This function is designed to handle inputs that can be either a pandas DataFrame, a pandas Series, - an xarray DataArray, or an xarray Dataset. It ensures that the output is consistently an xarray.Dataset. + This function is designed to handle inputs that can be either a + pandas DataFrame, a pandas Series, an xarray DataArray, or an + xarray Dataset. It ensures that the output is consistently an + xarray.Dataset. Parameters ---------- @@ -36,14 +53,15 @@ def convert_to_dataset(data, name="data"): The data to be converted. name: str (Optional) - The name to assign to the data variable in case the input is an xarray DataArray without a name. + The name to assign to the data variable in case the input is an + xarray DataArray without a name. Default value is 'data'. Returns ------- xarray.Dataset - The input data converted to an xarray.Dataset. If the input is already an xarray.Dataset, - it is returned as is. + The input data converted to an xarray.Dataset. If the input is + already an xarray.Dataset, it is returned as is. Examples -------- @@ -75,7 +93,8 @@ def convert_to_dataset(data, name="data"): # Takes data that could be pd.DataFrame, pd.Series, xr.DataArray, or # xr.Dataset and converts it to xr.Dataset if isinstance(data, pd.DataFrame): - # xr.Dataset(data) is drastically faster (1e1 - 1e2x faster) than using pd.DataFrame.to_xarray() + # xr.Dataset(data) is drastically faster (1e1 - 1e2x faster) + # than using pd.DataFrame.to_xarray() data = xr.Dataset(data) if isinstance(data, pd.Series): @@ -86,7 +105,7 @@ def convert_to_dataset(data, name="data"): if isinstance(data, xr.DataArray): # xr.DataArray.to_dataset() breaks if the data variable is unnamed - if data.name == None: + if data.name is None: data.name = name data = data.to_dataset() @@ -97,18 +116,23 @@ def convert_to_dataarray(data, name="data"): """ Converts the given data to an xarray.DataArray. - This function takes in a numpy ndarray, pandas Series, pandas Dataframe, or xarray Dataset - and outputs an equivalent xarray DataArray. DataArrays can be passed through with no changes. + This function takes in a numpy ndarray, pandas Series, pandas + Dataframe, or xarray Dataset and outputs an equivalent xarray + DataArray. DataArrays can be passed through with no changes. - Xarray datasets can only be input when all variable have the same dimensions. + Xarray datasets can only be input when all variable have the same + dimensions. - Multivariate pandas Dataframes become 2D DataArrays, which is especially useful when IO - functions return Dataframes with an extremely large number of variable. Use the function - convert_to_dataset to change a multivariate Dataframe into a multivariate Dataset. + Multivariate pandas Dataframes become 2D DataArrays, which is + especially useful when IO functions return Dataframes with an + extremely large number of variable. Use the function + convert_to_dataset to change a multivariate Dataframe into a + multivariate Dataset. 
Parameters ---------- - data: numpy ndarray, pandas DataFrame, pandas Series, xarray DataArray, or xarray Dataset + data: numpy ndarray, pandas DataFrame, pandas Series, xarray + DataArray, or xarray Dataset The data to be converted. name: str (Optional) @@ -118,8 +142,8 @@ def convert_to_dataarray(data, name="data"): Returns ------- xarray.DataArray - The input data converted to an xarray.DataArray. If the input is already an xarray.DataArray, - it is returned as is. + The input data converted to an xarray.DataArray. If the input + is already an xarray.DataArray, it is returned as is. Examples -------- @@ -152,8 +176,10 @@ def convert_to_dataarray(data, name="data"): # Checks pd.DataFrame input and converts to pd.Series if possible if isinstance(data, pd.DataFrame): if data.shape[1] == 1: - # Convert the 1D, univariate case to a Series, which will be caught by the Series conversion below. - # This eliminates an unnecessary variable dimension and names the DataArray with the DataFrame variable name. + # Convert the 1D, univariate case to a Series, which will + # be caught by the Series conversion below. This eliminates + # an unnecessary variable dimension and names the DataArray + # with the DataFrame variable name. # # Use iloc instead of squeeze. For DataFrames/Series with only a # single value, squeeze returns a scalar which is unexpected. @@ -172,32 +198,36 @@ def convert_to_dataarray(data, name="data"): if isinstance(data, xr.Dataset): keys = list(data.keys()) if len(keys) == 1: - # if only one variable, remove the "variable" dimension and rename the DataArray to simplify + # if only one variable, remove the "variable" dimension and + # rename the DataArray to simplify data = data.to_array() data = data.sel(variable=keys[0]) data.name = keys[0] data.drop_vars("variable") else: # Allow multiple variables if they have the same dimensions - if all([data[keys[0]].dims == data[key].dims for key in keys]): + if all(data[keys[0]].dims == data[key].dims for key in keys): data = data.to_array() else: raise ValueError( - "Multivariate Datasets can only be input if all variables have the same dimensions." + "Multivariate Datasets can only be input if all \ + variables have the same dimensions." ) # Converts pd.Series to xr.DataArray if isinstance(data, pd.Series): data = data.to_xarray() - # Converts np.ndarray to xr.DataArray. Assigns a simple 0-based dimension named index to match how pandas converts to xarray + # Converts np.ndarray to xr.DataArray. 
Assigns a simple 0-based + # dimension named index to match how pandas converts to xarray if isinstance(data, np.ndarray): data = xr.DataArray( data=data, dims="index", coords={"index": np.arange(len(data))} ) - # If there's no data name, add one to prevent issues calling or converting to a Dataset later on - if data.name == None: + # If there's no data name, add one to prevent issues calling or + # converting to a Dataset later on + if data.name is None: data.name = name return data From 1c3602af11b960e8641e7b5f2972b93324f8c286 Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 6 Sep 2024 08:58:14 -0400 Subject: [PATCH 11/31] handle cache returns None now --- mhkit/wave/io/cdip.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mhkit/wave/io/cdip.py b/mhkit/wave/io/cdip.py index 92a1d47e6..020664aac 100644 --- a/mhkit/wave/io/cdip.py +++ b/mhkit/wave/io/cdip.py @@ -330,7 +330,7 @@ def request_parse_workflow( cache_content={"data": None, "metadata": None, "write_json": None}, ) - if data[:2] == (None, None): + if data is None: data = get_netcdf_variables( nc, start_date=start_date, @@ -361,7 +361,7 @@ def request_parse_workflow( cache_dir, cache_content={"data": None, "metadata": None, "write_json": None}, ) - if year_data[:2] == (None, None): + if year_data is None: year_data = get_netcdf_variables( nc, start_date=start_date, From 0e343d556d96c19868de95935b09b3fe5019dcc6 Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 6 Sep 2024 10:28:20 -0400 Subject: [PATCH 12/31] fix logic around None passed to handle_cache --- mhkit/utils/cache.py | 40 ++++------------------------------------ 1 file changed, 4 insertions(+), 36 deletions(-) diff --git a/mhkit/utils/cache.py b/mhkit/utils/cache.py index de30a4e7e..053f35c5e 100644 --- a/mhkit/utils/cache.py +++ b/mhkit/utils/cache.py @@ -204,40 +204,6 @@ def handle_caching( """ Handles caching of data to avoid redundant network requests or computations. - - The function checks if a cache file exists for the given parameters. - If it does, the function will load data from the cache file, unless - the `clear_cache_file` parameter is set to `True`, in which case the - cache file is cleared. If the cache file does not exist and the - `data` parameter is not `None`, the function will store the - provided data in a cache file. - - Parameters - ---------- - hash_params : str - The parameters to be hashed and used as the filename for the cache file. - cache_dir : str - The directory where the cache files are stored. - cache_content : dict or None - Dictionary containing 'data' (pandas DataFrame or None), 'metadata' - (dict or None), and 'write_json' (str or None). If `None`, the function - will attempt to load data from the cache file. - clear_cache_file : bool - If `True`, the cache file for the given parameters will be cleared. - - Returns - ------- - data : pandas DataFrame or None - The data loaded from the cache file. If data was provided as a - parameter, the same data will be returned. If the cache file - does not exist and no data was provided, `None` will be returned. - metadata : dict or None - The metadata loaded from the cache file. If metadata was provided - as a parameter, the same metadata will be returned. If the cache - file does not exist and no metadata was provided, `None` will be - returned. - cache_filepath : str - The path to the cache file. 
""" # Initialize data and metadata to None to avoid pylint errors @@ -309,8 +275,10 @@ def _write_cache(data, metadata, file_extension, cache_filepath): # Clear cache if requested _clear_cache(cache_filepath) - # Check if cache file exists and load if no data provided - if os.path.isfile(cache_filepath) and cache_content is None: + # If cache file exists and cache_content["data"] is None, load from cache + if os.path.isfile(cache_filepath) and ( + cache_content is None or cache_content["data"] is None + ): return _load_cache(file_extension, cache_filepath) + (cache_filepath,) # Store data in cache if provided From 486708dcc629d327dee4371584d223c1da295112 Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 6 Sep 2024 10:34:23 -0400 Subject: [PATCH 13/31] back to index --- mhkit/tests/tidal/test_io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mhkit/tests/tidal/test_io.py b/mhkit/tests/tidal/test_io.py index 5ce38e4fa..280b847ce 100644 --- a/mhkit/tests/tidal/test_io.py +++ b/mhkit/tests/tidal/test_io.py @@ -103,11 +103,11 @@ def test_request_noaa_data_basic_xarray(self): ) # Check if the variable sets are equal data_variables = list(data.variables) - required_variables = ["t", "s", "d", "b"] + required_variables = ["index", "s", "d", "b"] data_variables_set = set(data_variables) required_variables_set = set(required_variables) self.assertTrue(data_variables_set == required_variables_set) - self.assertEqual(len(data["t"]), 183) + self.assertEqual(len(data["index"]), 183) self.assertEqual(data.attrs["id"], "s08010") def test_request_noaa_data_write_json(self): From bdf74b333addb6da4249a53ca87b265e77e6860a Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 6 Sep 2024 11:11:36 -0400 Subject: [PATCH 14/31] data no longer returned as list --- mhkit/wave/io/cdip.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mhkit/wave/io/cdip.py b/mhkit/wave/io/cdip.py index 020664aac..52322787e 100644 --- a/mhkit/wave/io/cdip.py +++ b/mhkit/wave/io/cdip.py @@ -344,8 +344,6 @@ def request_parse_workflow( cache_dir, cache_content={"data": data, "metadata": None, "write_json": None}, ) - else: - data = data[0] else: data = {"data": {}, "metadata": {}} @@ -380,8 +378,6 @@ def request_parse_workflow( "write_json": None, }, ) - else: - year_data = year_data[0] multiyear_data[year] = year_data["data"] for data_key in year_data["data"].keys(): From dfad8dc4af7e973f3341b882c51672d45d8c493e Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 6 Sep 2024 11:32:43 -0400 Subject: [PATCH 15/31] remove old cache_utils function --- mhkit/utils/cache.py | 148 ------------------------------------------- 1 file changed, 148 deletions(-) diff --git a/mhkit/utils/cache.py b/mhkit/utils/cache.py index 053f35c5e..17d2a0568 100644 --- a/mhkit/utils/cache.py +++ b/mhkit/utils/cache.py @@ -42,159 +42,11 @@ import hashlib import json import os - -# import re import shutil import pickle import pandas as pd -# def old_handle_caching( -# hash_params, -# cache_dir, -# data=None, -# metadata=None, -# write_json=None, -# clear_cache_file=False, -# ): -# """ -# Handles caching of data to avoid redundant network requests or -# computations. - -# The function checks if a cache file exists for the given parameters. -# If it does, the function will load data from the cache file, unless -# the `clear_cache_file` parameter is set to `True`, in which case the -# cache file is cleared. If the cache file does not exist and the -# `data` parameter is not `None`, the function will store the -# provided data in a cache file. 
- -# Parameters -# ---------- -# hash_params : str -# The parameters to be hashed and used as the filename for the cache file. -# cache_dir : str -# The directory where the cache files are stored. -# data : pandas DataFrame or None -# The data to be stored in the cache file. If `None`, the function -# will attempt to load data from the cache file. -# metadata : dict or None -# Metadata associated with the data. This will be stored in the -# cache file along with the data. -# write_json : str or None -# If specified, the cache file will be copied to a file with this name. -# clear_cache_file : bool -# If `True`, the cache file for the given parameters will be cleared. - -# Returns -# ------- -# data : pandas DataFrame or None -# The data loaded from the cache file. If data was provided as a -# parameter, the same data will be returned. If the cache file -# does not exist and no data was provided, `None` will be returned. -# metadata : dict or None -# The metadata loaded from the cache file. If metadata was provided -# as a parameter, the same metadata will be returned. If the cache -# file does not exist and no metadata was provided, `None` will be -# returned. -# cache_filepath : str -# The path to the cache file. -# """ - -# # Check if 'cdip' is in cache_dir, then use .pkl instead of .json -# file_extension = ( -# ".pkl" -# if "cdip" in cache_dir or "hindcast" in cache_dir or "ndbc" in cache_dir -# else ".json" -# ) - -# # Make cache directory if it doesn't exist -# if not os.path.isdir(cache_dir): -# os.makedirs(cache_dir) - -# # Create a unique filename based on the function parameters -# cache_filename = ( -# hashlib.md5(hash_params.encode("utf-8")).hexdigest() + file_extension -# ) -# cache_filepath = os.path.join(cache_dir, cache_filename) - -# # If clear_cache_file is True, remove the cache file for this request -# if clear_cache_file and os.path.isfile(cache_filepath): -# os.remove(cache_filepath) -# print(f"Cleared cache for {cache_filepath}") - -# # If a cached file exists, load and return the data from the file -# if os.path.isfile(cache_filepath) and data is None: -# if file_extension == ".json": -# with open(cache_filepath, encoding="utf-8") as f: -# json_data = json.load(f) - -# # Extract metadata if it exists -# if "metadata" in json_data: -# metadata = json_data.pop("metadata", None) - -# # Check if index is datetime formatted -# if all( -# re.match(r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}", str(dt)) -# for dt in json_data["index"] -# ): -# data = pd.DataFrame( -# json_data["data"], -# index=pd.to_datetime(json_data["index"]), -# columns=json_data["columns"], -# ) -# else: -# data = pd.DataFrame( -# json_data["data"], -# index=json_data["index"], -# columns=json_data["columns"], -# ) - -# # Convert the rest to DataFrame -# data = pd.DataFrame( -# json_data["data"], -# index=pd.to_datetime(json_data["index"]), -# columns=json_data["columns"], -# ) - -# elif file_extension == ".pkl": -# with open(cache_filepath, "rb") as f: -# data, metadata = pickle.load(f) - -# if write_json: -# shutil.copy(cache_filepath, write_json) - -# return data, metadata, cache_filepath - -# # If a cached file does not exist and data is provided, -# # store the data in a cache file -# if data is not None: -# if file_extension == ".json": -# # Convert DataFrame to python dict -# py_data = data.to_dict(orient="split") -# # Add metadata to py_data -# py_data["metadata"] = metadata -# # Check if index is datetime indexed -# if isinstance(data.index, pd.DatetimeIndex): -# py_data["index"] = [ -# 
dt.strftime("%Y-%m-%d %H:%M:%S") for dt in py_data["index"] -# ] -# else: -# py_data["index"] = list(data.index) -# with open(cache_filepath, "w", encoding="utf-8") as f: -# json.dump(py_data, f) - -# elif file_extension == ".pkl": -# with open(cache_filepath, "wb") as f: -# pickle.dump((data, metadata), f) - -# if write_json: -# shutil.copy(cache_filepath, write_json) - -# return data, metadata, cache_filepath -# # If data is not provided and the cache file doesn't exist, return cache_filepath -# return None, None, cache_filepath - - def handle_caching( hash_params, cache_dir, From 8d551e4aec16b69c871adfac8f6ebaac4f2ff046 Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 6 Sep 2024 11:34:40 -0400 Subject: [PATCH 16/31] clean up --- mhkit/utils/cache.py | 1 - 1 file changed, 1 deletion(-) diff --git a/mhkit/utils/cache.py b/mhkit/utils/cache.py index 17d2a0568..706888d94 100644 --- a/mhkit/utils/cache.py +++ b/mhkit/utils/cache.py @@ -58,7 +58,6 @@ def handle_caching( computations. """ - # Initialize data and metadata to None to avoid pylint errors data = None metadata = None From cf54e12eb6149f16db6c2d71a4cfc85ae1995941 Mon Sep 17 00:00:00 2001 From: ssolson Date: Mon, 9 Sep 2024 10:02:45 -0400 Subject: [PATCH 17/31] type hints --- mhkit/utils/cache.py | 29 +++++++++++--- mhkit/utils/stat_utils.py | 33 +++++++++++++--- mhkit/utils/time_utils.py | 9 ++++- mhkit/utils/type_handling.py | 18 +++++++-- mhkit/utils/upcrossing.py | 76 +++++++++++++++++++++++++----------- 5 files changed, 124 insertions(+), 41 deletions(-) diff --git a/mhkit/utils/cache.py b/mhkit/utils/cache.py index 706888d94..3d6bc2949 100644 --- a/mhkit/utils/cache.py +++ b/mhkit/utils/cache.py @@ -39,6 +39,7 @@ Date: 2023-09-26 """ +from typing import Optional, Tuple, Dict, Any import hashlib import json import os @@ -48,14 +49,30 @@ def handle_caching( - hash_params, - cache_dir, - cache_content=None, - clear_cache_file=False, -): + hash_params: str, + cache_dir: str, + cache_content: Optional[Dict[str, Any]] = None, + clear_cache_file: bool = False, +) -> Tuple[Optional[pd.DataFrame], Optional[Dict[str, Any]], str]: """ Handles caching of data to avoid redundant network requests or computations. + + Parameters + ---------- + hash_params : str + Parameters to generate the cache file hash. + cache_dir : str + Directory where cache files are stored. + cache_content : Optional[Dict[str, Any]], optional + Content to be cached. Should contain 'data', 'metadata', and 'write_json'. + clear_cache_file : bool + Whether to clear the existing cache. + + Returns + ------- + Tuple[Optional[pd.DataFrame], Optional[Dict[str, Any]], str] + Cached data, metadata, and cache file path. """ data = None @@ -148,7 +165,7 @@ def _write_cache(data, metadata, file_extension, cache_filepath): return None, None, cache_filepath -def clear_cache(specific_dir=None): +def clear_cache(specific_dir: Optional[str] = None) -> None: """ Clears the cache. diff --git a/mhkit/utils/stat_utils.py b/mhkit/utils/stat_utils.py index 681636680..972a84f2a 100644 --- a/mhkit/utils/stat_utils.py +++ b/mhkit/utils/stat_utils.py @@ -15,12 +15,15 @@ - unorm: Computes root mean squared value of 3D vectors. 
""" +from typing import List, Dict, Optional, Tuple, Union import pandas as pd import numpy as np from mhkit import qc -def _calculate_statistics(datachunk, vector_channels): +def _calculate_statistics( + datachunk: pd.DataFrame, vector_channels: List[str] +) -> Dict[str, Union[pd.Series, float]]: """ Calculate the mean, max, min, and standard deviation for the given datachunk. Also calculate vector statistics for vector_channels. @@ -51,7 +54,12 @@ def _calculate_statistics(datachunk, vector_channels): return {"means": means, "maxs": maxs, "mins": mins, "stdevs": stdevs} -def get_statistics(data, freq, period=600, vector_channels=None): +def get_statistics( + data: pd.DataFrame, + freq: Union[float, int], + period: Union[float, int] = 600, + vector_channels: Optional[Union[str, List[str]]] = None, +) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]: """ Calculate mean, max, min and stdev statistics of continuous data for a given statistical window. Default length of statistical window (period) is @@ -135,7 +143,9 @@ def get_statistics(data, freq, period=600, vector_channels=None): return means, maxs, mins, stdevs -def vector_statistics(data): +def vector_statistics( + data: Union[pd.Series, np.ndarray, list] +) -> Tuple[np.ndarray, np.ndarray]: """ Function used to calculate statistics for vector/directional channels based on routine from Campbell data logger and Yamartino algorithm @@ -181,7 +191,7 @@ def vector_statistics(data): return vector_avg, vector_std -def unwrap_vector(data): +def unwrap_vector(data: Union[pd.Series, np.ndarray, list]) -> np.ndarray: """ Function used to unwrap vectors into 0-360 deg range @@ -216,7 +226,14 @@ def unwrap_vector(data): return data -def magnitude_phase(x, y, z=None): +def magnitude_phase( + x: Union[float, int, np.ndarray], + y: Union[float, int, np.ndarray], + z: Optional[Union[float, int, np.ndarray]] = None, +) -> Union[ + Tuple[Union[float, np.ndarray], Union[float, np.ndarray]], + Tuple[Union[float, np.ndarray], Union[float, np.ndarray], Union[float, np.ndarray]], +]: """ Retuns magnitude and phase in two or three dimensions. @@ -267,7 +284,11 @@ def magnitude_phase(x, y, z=None): return mag, theta -def unorm(x, y, z): +def unorm( + x: Union[np.ndarray, np.float64, pd.Series], + y: Union[np.ndarray, np.float64, pd.Series], + z: Union[np.ndarray, np.float64, pd.Series], +) -> Union[np.ndarray, np.float64]: """ Calculates the root mean squared value given three arrays. diff --git a/mhkit/utils/time_utils.py b/mhkit/utils/time_utils.py index 8e3db5875..3eb69f7e1 100644 --- a/mhkit/utils/time_utils.py +++ b/mhkit/utils/time_utils.py @@ -8,6 +8,7 @@ - excel_to_datetime: Converts Excel datenum format to Python datetime. 
""" +from typing import Union import datetime as dt import pandas as pd import numpy as np @@ -16,7 +17,9 @@ from pecos.utils import index_to_datetime -def matlab_to_datetime(matlab_datenum): +def matlab_to_datetime( + matlab_datenum: Union[np.ndarray, list, float, int] +) -> pd.DatetimeIndex: """ Convert MATLAB datenum format to Python datetime @@ -51,7 +54,9 @@ def matlab_to_datetime(matlab_datenum): return time -def excel_to_datetime(excel_num): +def excel_to_datetime( + excel_num: Union[np.ndarray, list, float, int] +) -> pd.DatetimeIndex: """ Convert Excel datenum format to Python datetime diff --git a/mhkit/utils/type_handling.py b/mhkit/utils/type_handling.py index 0f928fb8f..680c2c563 100644 --- a/mhkit/utils/type_handling.py +++ b/mhkit/utils/type_handling.py @@ -13,12 +13,15 @@ in nested dictionaries to xarray Datasets. """ +from typing import Union, Dict, Any import numpy as np import pandas as pd import xarray as xr -def to_numeric_array(data, name): +def to_numeric_array( + data: Union[list, np.ndarray, pd.Series, xr.DataArray], name: str +) -> np.ndarray: """ Convert input data to a numeric array, ensuring all elements are numeric. """ @@ -38,7 +41,9 @@ def to_numeric_array(data, name): return data -def convert_to_dataset(data, name="data"): +def convert_to_dataset( + data: Union[pd.DataFrame, pd.Series, xr.DataArray, xr.Dataset], name: str = "data" +) -> xr.Dataset: """ Converts the given data to an xarray.Dataset. @@ -112,7 +117,10 @@ def convert_to_dataset(data, name="data"): return data -def convert_to_dataarray(data, name="data"): +def convert_to_dataarray( + data: Union[np.ndarray, pd.DataFrame, pd.Series, xr.DataArray, xr.Dataset], + name: str = "data", +) -> xr.DataArray: """ Converts the given data to an xarray.DataArray. @@ -233,7 +241,9 @@ def convert_to_dataarray(data, name="data"): return data -def convert_nested_dict_and_pandas(data): +def convert_nested_dict_and_pandas( + data: Dict[str, Union[pd.DataFrame, Dict[str, Any]]] +) -> Dict[str, Union[xr.Dataset, Dict[str, Any]]]: """ Recursively searches inside nested dictionaries for pandas DataFrames to convert to xarray Datasets. Typically called by wave.io functions that read diff --git a/mhkit/utils/upcrossing.py b/mhkit/utils/upcrossing.py index 18db3f142..1c5eea03f 100644 --- a/mhkit/utils/upcrossing.py +++ b/mhkit/utils/upcrossing.py @@ -7,21 +7,12 @@ Key Functions: -------------- - `upcrossing`: Finds the zero upcrossing points. - - `peaks`: Finds the peaks between zero crossings. - - `troughs`: Finds the troughs between zero crossings. - - `heights`: Calculates the height between zero crossings. - - `periods`: Calculates the period between zero crossings. - - `custom`: Applies a custom, user-defined function between zero crossings. - -Dependencies: -------------- -- numpy: Data analysis - + Author: ------- mbruggs @@ -34,10 +25,36 @@ """ +from typing import Callable, Optional import numpy as np -def _apply(t, data, f, inds): +def _apply( + t: np.ndarray, + data: np.ndarray, + f: Callable[[int, int], float], + inds: Optional[np.ndarray] = None, +) -> np.ndarray: + """ + Apply a function `f` over intervals defined by `inds`. If `inds` is None, + compute the indices using the upcrossing function. + + Parameters + ---------- + t : np.ndarray + Time array. + data : np.ndarray + Data array. + f : Callable[[int, int], float] + A function to apply to pairs of indices (start, end). + inds : np.ndarray, optional + Indices that define the intervals. If None, `upcrossing` is used to generate them. 
+ + Returns + ------- + np.ndarray + Array of values resulting from applying `f` over the intervals. + """ if inds is None: inds = upcrossing(t, data) @@ -50,7 +67,7 @@ def _apply(t, data, f, inds): return vals -def upcrossing(t, data): +def upcrossing(t: np.ndarray, data: np.ndarray) -> np.ndarray: """ Finds the zero upcrossing points. @@ -86,7 +103,9 @@ def upcrossing(t, data): return zero_upcrossings_index -def peaks(t, data, inds=None): +def peaks( + t: np.ndarray, data: np.ndarray, inds: Optional[np.ndarray] = None +) -> np.ndarray: """ Finds the peaks between zero crossings. @@ -96,7 +115,7 @@ def peaks(t, data, inds=None): Time array. data: np.array Signal time-series. - inds: np.array + inds : np.ndarray, optional Optional indices for the upcrossing. Useful when using several of the upcrossing methods to avoid repeating the upcrossing analysis @@ -117,7 +136,9 @@ def peaks(t, data, inds=None): return _apply(t, data, lambda ind1, ind2: np.max(data[ind1:ind2]), inds) -def troughs(t, data, inds=None): +def troughs( + t: np.ndarray, data: np.ndarray, inds: Optional[np.ndarray] = None +) -> np.ndarray: """ Finds the troughs between zero crossings. @@ -127,7 +148,7 @@ def troughs(t, data, inds=None): Time array. data: np.array Signal time-series. - inds: np.array + inds: np.array, optional Optional indices for the upcrossing. Useful when using several of the upcrossing methods to avoid repeating the upcrossing analysis @@ -148,7 +169,9 @@ def troughs(t, data, inds=None): return _apply(t, data, lambda ind1, ind2: np.min(data[ind1:ind2]), inds) -def heights(t, data, inds=None): +def heights( + t: np.ndarray, data: np.ndarray, inds: Optional[np.ndarray] = None +) -> np.ndarray: """ Calculates the height between zero crossings. @@ -161,7 +184,7 @@ def heights(t, data, inds=None): Time array. data: np.array Signal time-series. - inds: np.array + inds: np.array, optional Optional indices for the upcrossing. Useful when using several of the upcrossing methods to avoid repeating the upcrossing analysis @@ -184,7 +207,9 @@ def func(ind1, ind2): return _apply(t, data, func, inds) -def periods(t, data, inds=None): +def periods( + t: np.ndarray, data: np.ndarray, inds: Optional[np.ndarray] = None +) -> np.ndarray: """ Calculates the period between zero crossings. @@ -194,7 +219,7 @@ def periods(t, data, inds=None): Time array. data: np.array Signal time-series. - inds: np.array + inds: np.array, optional Optional indices for the upcrossing. Useful when using several of the upcrossing methods to avoid repeating the upcrossing analysis @@ -214,7 +239,12 @@ def periods(t, data, inds=None): return _apply(t, data, lambda ind1, ind2: t[ind2] - t[ind1], inds) -def custom(t, data, func, inds=None): +def custom( + t: np.ndarray, + data: np.ndarray, + func: Callable[[int, int], np.ndarray], + inds: Optional[np.ndarray] = None, +) -> np.ndarray: """ Applies a custom function to the timeseries data between upcrossing points. @@ -224,11 +254,11 @@ def custom(t, data, func, inds=None): Time array. data: np.array Signal time-series. - func: f(ind1, ind2) -> np.array + func: Callable[[int, int], np.ndarray] Function to apply between the zero crossing periods given t[ind1], t[ind2], where ind1 < ind2, correspond to the start and end of an upcrossing section. - inds: np.array + inds: np.array, optional Optional indices for the upcrossing. 
Useful when using several of the upcrossing methods to avoid repeating the upcrossing analysis From 44416f5ba30901eeff7c740d091ddcf40be38b71 Mon Sep 17 00:00:00 2001 From: ssolson Date: Tue, 12 Nov 2024 09:10:47 -0500 Subject: [PATCH 18/31] fix pylint iussues --- mhkit/utils/type_handling.py | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mhkit/utils/type_handling.py b/mhkit/utils/type_handling.py index 046cd07d0..09ad5ccac 100644 --- a/mhkit/utils/type_handling.py +++ b/mhkit/utils/type_handling.py @@ -195,12 +195,11 @@ def convert_to_dataarray( data = data.iloc[:, 0] else: # With this conversion, dataframe columns always become "dim_1". - # Rename to "variable" to match how multiple Dataset variables get converted into a DataArray dimension + # Rename to "variable" to match how multiple Dataset variables get + # converted into a DataArray dimension data = xr.DataArray(data) if data.dims[1] == "dim_1": - # Slight chance there is already a name for the columns data = data.rename({"dim_1": "variable"}) - # Checks xr.Dataset input and converts to xr.DataArray if possible if isinstance(data, xr.Dataset): keys = list(data.keys()) @@ -209,10 +208,10 @@ def convert_to_dataarray( data = data[keys[0]] else: # Allow multiple variables if they have the same dimensions - if all([data[keys[0]].dims == data[key].dims for key in keys]): - data = ( - data.to_array().T - ) # transpose so that the new "variable dimension" is the last dimension (matches DataFrame to DataArray behavior) + # transpose so that the new "variable dimension" is the last + # dimension (matches DataFrame to DataArray behavior) + if all(data[keys[0]].dims == data[key].dims for key in keys): + data = data.to_array().T else: raise ValueError( "Multivariate Datasets can only be input if all \ @@ -232,8 +231,7 @@ def convert_to_dataarray( # If there's no data name, add one to prevent issues calling or # converting to a Dataset later on - if data.name is None: - data.name = name + data.name = data.name if data.name is not None else name return data From ef04cc2488c87d4d1010254df337e5a9da8be7f2 Mon Sep 17 00:00:00 2001 From: ssolson Date: Wed, 13 Nov 2024 07:48:56 -0500 Subject: [PATCH 19/31] clean up package installation --- .github/workflows/main.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 44d8ac03a..5bfc4bb95 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -156,14 +156,13 @@ jobs: - name: Setup Conda environment shell: bash -l {0} run: | - conda install numpy cython pip pytest hdf5 libnetcdf cftime netcdf4 coverage --strict-channel-priority + conda install numpy cython pip pytest hdf5 libnetcdf cftime netcdf4 coverage coveralls --strict-channel-priority pip install -e . --no-deps --force-reinstall - name: Install dependencies shell: bash -l {0} run: | python -m pip install --upgrade pip wheel - pip install coverage pytest coveralls . 
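For context on the convert_to_dataarray branches reorganised above, a brief sketch of the DataFrame path (the two-column frame and the variable name are made up for illustration):

import pandas as pd

from mhkit.utils.type_handling import convert_to_dataarray

# Made-up two-column DataFrame
df = pd.DataFrame({"Hs": [1.0, 1.5, 2.0], "Tp": [8.0, 9.0, 10.0]})

# Multi-column frames keep every column and expose them through a
# "variable" dimension, mirroring the multivariate Dataset behaviour.
da = convert_to_dataarray(df, name="sea_state")
print(da.dims)  # expected: ('dim_0', 'variable')
print(da.name)  # expected: 'sea_state'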
- name: Prepare Wind Hindcast data shell: bash -l {0} From 7064645e141316508003c448782f98c0ba3dfdbd Mon Sep 17 00:00:00 2001 From: ssolson Date: Wed, 13 Nov 2024 08:08:18 -0500 Subject: [PATCH 20/31] change env name to mhkit-env --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 741f80ffc..b8607ae17 100644 --- a/environment.yml +++ b/environment.yml @@ -1,4 +1,4 @@ -name: myenv +name: mhkit-env channels: - conda-forge - defaults From 7aaedea7b304b3b5cedc6582b061f6e64eb44e84 Mon Sep 17 00:00:00 2001 From: ssolson Date: Wed, 13 Nov 2024 08:08:38 -0500 Subject: [PATCH 21/31] clean up installation --- .github/workflows/main.yml | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 5bfc4bb95..29fd5762a 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -153,20 +153,28 @@ jobs: python-version: ${{ env.PYTHON_VER }} use-only-tar-bz2: true - - name: Setup Conda environment + - name: Create MHKiT Conda environment shell: bash -l {0} run: | - conda install numpy cython pip pytest hdf5 libnetcdf cftime netcdf4 coverage coveralls --strict-channel-priority - pip install -e . --no-deps --force-reinstall + conda env create -f environment.yml + conda activate mhkit-env - - name: Install dependencies + - name: Install testing dependencies shell: bash -l {0} run: | - python -m pip install --upgrade pip wheel + conda activate mhkit-env + conda install -y pytest coverage coveralls + + - name: Install mhkit + shell: bash -l {0} + run: | + conda activate mhkit-env + pip install -e . --no-deps - name: Prepare Wind Hindcast data shell: bash -l {0} run: | + conda activate mhkit-env pytest mhkit/tests/wave/io/hindcast/test_wind_toolkit.py - name: Upload Wind Hindcast data as artifact From 9e0d63dcfaecdbdbbaef9cab843b34ad5a6ff489 Mon Sep 17 00:00:00 2001 From: ssolson Date: Wed, 13 Nov 2024 08:25:37 -0500 Subject: [PATCH 22/31] add cf-staging label --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index b8607ae17..b360b14ee 100644 --- a/environment.yml +++ b/environment.yml @@ -1,5 +1,6 @@ name: mhkit-env channels: + - conda-forge/label/cf-staging - conda-forge - defaults dependencies: From 03c95528a2021af7f4b37c3bbde131c118afdfba Mon Sep 17 00:00:00 2001 From: ssolson Date: Wed, 13 Nov 2024 09:12:29 -0500 Subject: [PATCH 23/31] Use conda env file in all tests --- .github/workflows/main.yml | 105 +++++++++++++++++++++++++------------ 1 file changed, 71 insertions(+), 34 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 29fd5762a..ccf3cd30f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -67,21 +67,28 @@ jobs: activate-environment: TESTconda use-only-tar-bz2: true - - name: Setup Conda environment + - name: Create MHKiT Conda environment shell: bash -l {0} run: | - conda install numpy cython pip hdf5 libnetcdf cftime netcdf4 --strict-channel-priority - pip install -e . --force-reinstall + conda env create -f environment.yml + conda activate mhkit-env - - name: Install dependencies + - name: Install testing dependencies shell: bash -l {0} run: | - python -m pip install --upgrade pip wheel - pip install coverage pytest coveralls . + conda activate mhkit-env + conda install -y pytest coverage coveralls + + - name: Install mhkit + shell: bash -l {0} + run: | + conda activate mhkit-env + pip install -e . 
--no-deps - name: Prepare non-hindcast API data shell: bash -l {0} run: | + conda activate mhkit-env pytest mhkit/tests/river/test_io_usgs.py pytest mhkit/tests/tidal/test_io.py pytest mhkit/tests/wave/io/test_cdip.py @@ -111,21 +118,28 @@ jobs: python-version: ${{ env.PYTHON_VER }} use-only-tar-bz2: true - - name: Setup Conda environment + - name: Create MHKiT Conda environment shell: bash -l {0} run: | - conda install numpy cython pip pytest hdf5 libnetcdf cftime netcdf4 coverage --strict-channel-priority - pip install -e . --force-reinstall + conda env create -f environment.yml + conda activate mhkit-env - - name: Install dependencies + - name: Install testing dependencies shell: bash -l {0} run: | - python -m pip install --upgrade pip wheel - pip install coverage pytest coveralls . + conda activate mhkit-env + conda install -y pytest coverage coveralls + + - name: Install mhkit + shell: bash -l {0} + run: | + conda activate mhkit-env + pip install -e . --no-deps - name: Prepare Wave Hindcast data shell: bash -l {0} run: | + conda activate mhkit-env pytest mhkit/tests/wave/io/hindcast/test_hindcast.py - name: Upload Wave Hindcast data as artifact @@ -208,21 +222,28 @@ jobs: python-version: ${{ matrix.python-version }} use-only-tar-bz2: false - - name: Create and setup Conda environment + - name: Create MHKiT Conda environment shell: bash -l {0} run: | - conda install -c conda-forge pytest coverage=7.5.0 coveralls --strict-channel-priority - pip install -e . --force-reinstall + conda env create -f environment.yml + conda activate mhkit-env - - name: Download data from artifact - uses: actions/download-artifact@v4 - with: - name: data - path: ~/.cache/mhkit + - name: Install testing dependencies + shell: bash -l {0} + run: | + conda activate mhkit-env + conda install -y pytest coverage coveralls + + - name: Install mhkit + shell: bash -l {0} + run: | + conda activate mhkit-env + pip install -e . --no-deps - name: Run pytest & generate coverage report shell: bash -l {0} run: | + conda activate mhkit-env coverage run --rcfile=.github/workflows/.coveragerc --source=./mhkit/ -m pytest -c .github/workflows/pytest.ini coverage lcov @@ -317,11 +338,23 @@ jobs: python-version: ${{ matrix.python-version }} use-only-tar-bz2: false - - name: Setup Conda environment + - name: Create MHKiT Conda environment + shell: bash -l {0} + run: | + conda env create -f environment.yml + conda activate mhkit-env + + - name: Install testing dependencies shell: bash -l {0} run: | - conda install -c conda-forge pytest coverage=7.5.0 coveralls --strict-channel-priority - pip install -e . --force-reinstall + conda activate mhkit-env + conda install -y pytest coverage coveralls + + - name: Install mhkit + shell: bash -l {0} + run: | + conda activate mhkit-env + pip install -e . --no-deps - name: Download Wave Hindcast data from artifact uses: actions/download-artifact@v4 @@ -342,9 +375,10 @@ jobs: mv ~/.cache/mhkit/wind-hindcast/hindcast/* ~/.cache/mhkit/hindcast/ shell: bash - - name: Install MHKiT and run pytest + - name: Run hindcast pytest shell: bash -l {0} run: | + conda activate mhkit-env coverage run --rcfile=.github/workflows/.coveragehindcastrc -m pytest -c .github/workflows/pytest-hindcast.ini coverage lcov @@ -425,21 +459,23 @@ jobs: activate-environment: TESTconda use-only-tar-bz2: true - - name: Install dependencies + - name: Create MHKiT Conda environment shell: bash -l {0} run: | - conda install numpy cython pip hdf5 libnetcdf cftime netcdf4 --strict-channel-priority - pip install -e . 
--force-reinstall - python -m pip install --upgrade pip wheel - pip install nbval jupyter - pip install utm folium + conda env create -f environment.yml + conda activate mhkit-env - - name: Ensure Conda environment is activated + - name: Install notebook testing dependencies shell: bash -l {0} run: | - echo "source ~/miniconda3/etc/profile.d/conda.sh" >> ~/.bashrc - echo "conda activate TESTconda" >> ~/.bashrc - source ~/.bashrc + conda activate mhkit-env + conda install -y pytest coverage coveralls nbval jupyter utm folium + + - name: Install mhkit + shell: bash -l {0} + run: | + conda activate mhkit-env + pip install -e . --no-deps - name: Download non-hindcast data uses: actions/download-artifact@v4 @@ -477,6 +513,7 @@ jobs: - name: Run notebook shell: bash -l {0} run: | + conda activate mhkit-env if [[ "${{ matrix.notebook }}" == "examples/metocean_example.ipynb" || "${{ matrix.notebook }}" == "examples/WPTO_hindcast_example.ipynb" ]]; then if [[ "${{ needs.check-changes.outputs.should-run-hindcast }}" == 'true' ]]; then jupyter nbconvert --to notebook --execute --inplace --ExecutePreprocessor.timeout=${{ matrix.timeout }} "${{ matrix.notebook }}" From ddfd14fa6eccf4e55952194824a881fb4d6b5986 Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 15 Nov 2024 08:59:51 -0500 Subject: [PATCH 24/31] add configs and debug --- .github/workflows/main.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ccf3cd30f..2bf611a6f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -67,10 +67,16 @@ jobs: activate-environment: TESTconda use-only-tar-bz2: true + - name: Configure conda channels + run: | + conda config --add channels conda-forge/label/cf-staging + conda config --add channels conda-forge + conda config --add channels defaults + - name: Create MHKiT Conda environment shell: bash -l {0} run: | - conda env create -f environment.yml + conda env create -f environment.yml --debug conda activate mhkit-env - name: Install testing dependencies From 9f5e4276b3032eada5da969d2f1f4f11f492d8c7 Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 15 Nov 2024 09:14:42 -0500 Subject: [PATCH 25/31] use legacy solver --- .github/workflows/main.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 2bf611a6f..a144d6acf 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -67,6 +67,9 @@ jobs: activate-environment: TESTconda use-only-tar-bz2: true + - name: Use legacy solver + run: conda config --set solver classic + - name: Configure conda channels run: | conda config --add channels conda-forge/label/cf-staging From e5f1b5c5b0a955bba7c989b997b14f68fe7c721b Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 15 Nov 2024 09:30:00 -0500 Subject: [PATCH 26/31] Ensure compatibility with modern packages --- .github/workflows/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index a144d6acf..83d06789f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -65,7 +65,7 @@ jobs: auto-update-conda: true python-version: ${{ env.PYTHON_VER }} activate-environment: TESTconda - use-only-tar-bz2: true + use-only-tar-bz2: false - name: Use legacy solver run: conda config --set solver classic From fd9646b073c8131ec6e1c5c19c08231ad0bca49f Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 15 Nov 2024 09:35:18 -0500 Subject: [PATCH 27/31] Ensure 
compatibility with modern packages --- .github/workflows/main.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 83d06789f..4b6442cfd 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -125,7 +125,7 @@ jobs: auto-update-conda: true activate-environment: TEST python-version: ${{ env.PYTHON_VER }} - use-only-tar-bz2: true + use-only-tar-bz2: false - name: Create MHKiT Conda environment shell: bash -l {0} @@ -174,7 +174,7 @@ jobs: auto-update-conda: true activate-environment: TEST python-version: ${{ env.PYTHON_VER }} - use-only-tar-bz2: true + use-only-tar-bz2: false - name: Create MHKiT Conda environment shell: bash -l {0} @@ -466,7 +466,7 @@ jobs: auto-update-conda: true python-version: '3.11' activate-environment: TESTconda - use-only-tar-bz2: true + use-only-tar-bz2: false - name: Create MHKiT Conda environment shell: bash -l {0} From b721f03db952e829c94a8c85c9aa8ea301a27fc9 Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 15 Nov 2024 10:25:59 -0500 Subject: [PATCH 28/31] add pecos --- environment.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/environment.yml b/environment.yml index b360b14ee..a67d4ef02 100644 --- a/environment.yml +++ b/environment.yml @@ -19,6 +19,7 @@ dependencies: - numexpr>=2.10.0 - lxml - bottleneck + - pecos - pip: - netCDF4>=1.7.1.post1 - matplotlib>=3.9.1 From e7fc5843f99ddf66de102e9888cfea459cbd88c5 Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 15 Nov 2024 10:45:37 -0500 Subject: [PATCH 29/31] netcdf4 from pip to conda --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index a67d4ef02..27bd1cce2 100644 --- a/environment.yml +++ b/environment.yml @@ -20,8 +20,8 @@ dependencies: - lxml - bottleneck - pecos + - netCDF4>=1.7.2 - pip: - - netCDF4>=1.7.1.post1 - matplotlib>=3.9.1 - pecos>=0.3.0 - fatpack From 24fa8a577e0e9a820fd33a5cae80f6add9f1c1b5 Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 15 Nov 2024 11:45:08 -0500 Subject: [PATCH 30/31] py 3.11, relax hdf5& netCDF4 --- environment.yml | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/environment.yml b/environment.yml index 27bd1cce2..f849ff3ac 100644 --- a/environment.yml +++ b/environment.yml @@ -1,10 +1,9 @@ name: mhkit-env channels: - - conda-forge/label/cf-staging - conda-forge - defaults dependencies: - - python>=3.10 + - python=3.11 - pip - numpy>=2.0.0 - pandas>=2.2.2 @@ -13,17 +12,17 @@ dependencies: - scikit-learn>=1.5.1 - h5py>=3.11.0 - h5pyd>=0.18.0 + - netCDF4>=1.6.5 + - hdf5>=1.14.3,<1.14.5.0a0 - statsmodels>=0.14.2 - requests - beautifulsoup4 - numexpr>=2.10.0 - lxml - bottleneck - - pecos - - netCDF4>=1.7.2 + - pecos>=0.3.0 - pip: - matplotlib>=3.9.1 - - pecos>=0.3.0 - fatpack - NREL-rex>=0.2.63 - notebook From 259e7e58531f704558b7dabcfbcb27894b1163e3 Mon Sep 17 00:00:00 2001 From: ssolson Date: Fri, 15 Nov 2024 11:58:24 -0500 Subject: [PATCH 31/31] relax python constraints --- environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index f849ff3ac..81cdaf613 100644 --- a/environment.yml +++ b/environment.yml @@ -3,7 +3,7 @@ channels: - conda-forge - defaults dependencies: - - python=3.11 + - python>=3.10 - pip - numpy>=2.0.0 - pandas>=2.2.2
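As a closing sanity check, one way to confirm that an activated mhkit-env satisfies the floors pinned in environment.yml; the list below is intentionally abbreviated and simply mirrors a few of the constraints shown in the hunks above.

from importlib.metadata import version

# A few of the minimum versions pinned in environment.yml (abbreviated list)
floors = {
    "numpy": "2.0.0",
    "pandas": "2.2.2",
    "scikit-learn": "1.5.1",
    "matplotlib": "3.9.1",
}

for package, floor in floors.items():
    print(f"{package}: installed {version(package)}, environment.yml requires >= {floor}")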