diff --git a/docs-source/source/auto_examples/filter/index.rst b/docs-source/source/auto_examples/filter/index.rst
index 2bdc0c5d..7867c924 100644
--- a/docs-source/source/auto_examples/filter/index.rst
+++ b/docs-source/source/auto_examples/filter/index.rst
@@ -31,6 +31,23 @@ Examples of how to use the filter functions included in Cognite Charts.
+.. raw:: html
+
+    <div class="sphx-glr-thumbcontainer" tooltip="Example of trend extraction from non-linear, non-stationary signals using Empirical Mode Dec...">
+
+.. only:: html
+
+  .. image:: /auto_examples/filter/images/thumb/sphx_glr_plot_hilbert_huang_transform_thumb.png
+    :alt:
+
+  :ref:`sphx_glr_auto_examples_filter_plot_hilbert_huang_transform.py`
+
+.. raw:: html
+
+      <div class="sphx-glr-thumbnail-title">Trend extraction using the Hilbert-Huang Transform</div>
+    </div>
+
+
+
 .. raw:: html
@@ -40,4 +57,5 @@ Examples of how to use the filter functions included in Cognite Charts.
    :hidden:

    /auto_examples/filter/plot_wavelet_filter
+   /auto_examples/filter/plot_hilbert_huang_transform
diff --git a/docs-source/source/auto_examples/filter/sg_execution_times.rst b/docs-source/source/auto_examples/filter/sg_execution_times.rst
index 8d838830..aca756a4 100644
--- a/docs-source/source/auto_examples/filter/sg_execution_times.rst
+++ b/docs-source/source/auto_examples/filter/sg_execution_times.rst
@@ -3,12 +3,13 @@

 .. _sphx_glr_auto_examples_filter_sg_execution_times:

+
 Computation times
 =================
-**00:27.907** total execution time for **auto_examples_filter** files:
+**00:06.221** total execution time for **auto_examples_filter** files:

-+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_auto_examples_filter_plot_wavelet_filter.py` (``plot_wavelet_filter.py``) | 00:27.907 | 0.0 MB |
-+------------------------------------------------------------------------------------------+-----------+--------+
-| :ref:`sphx_glr_auto_examples_filter_plot_trend.py` (``plot_trend.py``) | 00:00.000 | 0.0 MB |
-+------------------------------------------------------------------------------------------+-----------+--------+
++------------------------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_auto_examples_filter_plot_hilbert_huang_transform.py` (``plot_hilbert_huang_transform.py``) | 00:06.221 | 0.0 MB |
++------------------------------------------------------------------------------------------------------------+-----------+--------+
+| :ref:`sphx_glr_auto_examples_filter_plot_wavelet_filter.py` (``plot_wavelet_filter.py``) | 00:00.000 | 0.0 MB |
++------------------------------------------------------------------------------------------------------------+-----------+--------+
diff --git a/examples/filter/plot_hilbert_huang_transform.py b/examples/filter/plot_hilbert_huang_transform.py
new file mode 100644
index 00000000..49b0b2de
--- /dev/null
+++ b/examples/filter/plot_hilbert_huang_transform.py
@@ -0,0 +1,129 @@
+"""
+=====================================================================================
+Trend extraction using the Hilbert-Huang Transform
+=====================================================================================
+Example of trend extraction from non-linear, non-stationary signals using Empirical Mode Decomposition (EMD) and the
+Hilbert-Huang Transform. We generate a synthetic signal composed of:
+    * Three oscillatory signals of different but significant amplitudes
+    * Two polynomial trends
+    * Data drift
+To make the case more realistic from an industrial perspective, the timestamps are perturbed to make them nonuniform
+and 35% of the data points are randomly removed. Finally, Gaussian noise with a signal-to-noise ratio of 10 dB is
+added.
+The figure below shows each component of the synthetic signal (except the Gaussian noise), the resulting synthetic
+signal, and the trend extracted with Empirical Mode Decomposition and the Hilbert-Huang method. The extracted trend
+follows the general behaviour of the signal.
+""" + +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from indsl.signals import insert_data_gaps, line, perturb_timestamp, sine_wave, white_noise +from matplotlib.dates import DateFormatter +from indsl.filter import hilbert_huang_transform + +start_date = pd.Timestamp("2023-07-14") +end_date = pd.Timestamp("2023-07-16") + +# Wave 1: Small amplitude, long wave period +wave01 = sine_wave( + start_date=start_date, + end_date=end_date, + sample_freq=pd.Timedelta("1m"), + wave_period=pd.Timedelta("6h"), + wave_mean=0, + wave_amplitude=6.5, + wave_phase=0, +) +wave01 = np.exp(wave01) + +# Wave 2: Large amplitude, short wave period +wave02 = sine_wave( + start_date=start_date, + end_date=end_date, + sample_freq=pd.Timedelta("1m"), + wave_period=pd.Timedelta("1h"), + wave_mean=0, + wave_amplitude=100, + wave_phase=0, +) + +# Wave 3: Large amplitude, short wave period +wave03 = sine_wave( + start_date=start_date, + end_date=end_date, + sample_freq=pd.Timedelta("1m"), + wave_period=pd.Timedelta("3h"), + wave_mean=5, + wave_amplitude=35, + wave_phase=np.pi, +) + +# Trends +trend_01 = ( + line(start_date=start_date, end_date=end_date, sample_freq=pd.Timedelta("1m"), slope=0.00008, intercept=1) ** 3 +) + +trend_02 = ( + line(start_date=start_date, end_date=end_date, sample_freq=pd.Timedelta("1m"), slope=-0.00005, intercept=5) ** 5 +) + +drift = line(start_date=start_date, end_date=end_date, sample_freq=pd.Timedelta("1m"), slope=0.00005, intercept=0) + +signal = ( + pd.Series(wave01) + + pd.Series(wave02) + + pd.Series(wave03) + + pd.Series(trend_01) + + pd.Series(trend_02) + - pd.Series(drift) +) +signal_w_noise = perturb_timestamp(white_noise(signal, snr_db=10)) +signal_to_detrend = insert_data_gaps(signal_w_noise, fraction=0.35) + +# Unpack the trend values from the tuple +trend_rho = hilbert_huang_transform(signal_to_detrend) + +fig, ax = plt.subplots(3, 1, figsize=[9, 7]) + +ax[0].plot(wave01, label="Wave 1") +ax[0].plot(wave02, label="Wave 2") +ax[0].plot(wave03, label="Wave 3") +ax[0].set_title("Oscillatory components") +ax[0].set_ylabel("Amplitude") +ax[0].legend() + +ax[1].plot(trend_01, label="Polynomial 1") +ax[1].plot(trend_02, label="Polynomial 2") +ax[1].set_title("Trends & Drift") +ax[1].set_ylabel("Magnitude") +ax[1].legend() + +color = "tab:red" +ax2 = ax[1].twinx() +ax2.plot(-drift, color=color) +ax2.set_ylabel("Drift", color=color) +ax2.tick_params(axis="y", labelcolor=color) + +ax[2].plot(signal, label="Signal without noise") +ax[2].set_title("Signal without noise") +ax[2].set_ylabel("Magnitude") +ax[2].set_xlabel("Date") + +# sphinx_gallery_thumbnail_number = 2 +fig2, axs = plt.subplots(figsize=[9, 7]) + +# original signal +axs.plot(signal_to_detrend.index, signal_to_detrend.values, label="Signal") + +# Trend extracted from the signal using rho +axs.plot(signal_to_detrend.index, trend_rho, label="Trend of the signal") + +axs.set_title("Trend found using the Hilbert-Huang Transform") + +# Formatting x axis +axs.xaxis.set_major_formatter(DateFormatter("%b %d, %H:%M")) +plt.setp(axs.get_xticklabels(), rotation=45) +axs.legend(loc="lower right") +plt.tight_layout() +plt.show() diff --git a/indsl/filter/__init__.py b/indsl/filter/__init__.py index 31a21d8f..ba67f5b0 100644 --- a/indsl/filter/__init__.py +++ b/indsl/filter/__init__.py @@ -1,10 +1,11 @@ # Copyright 2023 Cognite AS +from .hilbert_huang_transform import hilbert_huang_transform from .simple_filters import status_flag_filter from .wavelet_filter import wavelet_filter TOOLBOX_NAME = "Filter" -__all__ 
= ["wavelet_filter", "status_flag_filter"] +__all__ = ["hilbert_huang_transform", "wavelet_filter", "status_flag_filter"] __cognite__ = ["wavelet_filter", "status_flag_filter"] diff --git a/indsl/filter/hilbert_huang_transform.py b/indsl/filter/hilbert_huang_transform.py new file mode 100644 index 00000000..be87d229 --- /dev/null +++ b/indsl/filter/hilbert_huang_transform.py @@ -0,0 +1,202 @@ +from typing import Optional + +import numpy as np +import pandas as pd + +from scipy.signal import hilbert + +from indsl.ts_utils import get_timestamps + + +# DROP_SENTINAL is used to get the smallest (most negative) +# number that can be represented with a 32-bit signed integer. +DROP_SENTINAL = np.iinfo(np.int32).min + + +def _compute_hilbert_2d_spectrum(values_X, values_Z, x_boundaries): + """Compute 2D Hilbert spectrum. + + Calculate a 2D Hilbert-Huang power distribution in sparse format. + This utility function generates a sparse matrix that captures a two-dimensional + power distribution. + """ + from sparse import COO + + # Calculates bin indices for values_X using the np.digitize() function. + # This function returns an array of indices that can be used to bin the + # input values_X into the bins defined by x_boundaries. Any values in + # values_X that are out of the range defined by x_boundaries are assigned + # the DROP_SENTINAL value. + primary_indices = np.digitize(values_X, x_boundaries) - 1 + out_of_bounds = np.logical_or(values_X < x_boundaries[0], values_X >= x_boundaries[-1]) + primary_indices[out_of_bounds] = DROP_SENTINAL + + # Create meshgrids for time and IMF indices + time_indices, imf_indices = np.meshgrid(np.arange(values_X.shape[0]), np.arange(values_X.shape[1]), indexing="ij") + + # Flatten arrays for COO matrix creation + flattened_coordinates = np.vstack([primary_indices.ravel(), time_indices.ravel(), imf_indices.ravel()]) + exclusions = np.any(flattened_coordinates == DROP_SENTINAL, axis=0) + + # Apply the exclusions mask to values_Z + values_Z_masked = np.ma.masked_array(values_Z, mask=exclusions.reshape(values_Z.shape)).compressed() + + # Adjust coordinates for valid points + valid_coordinates = flattened_coordinates[:, ~exclusions] + + # Create the sparse COO matrix + spectrum_shape = (x_boundaries.shape[0] - 1, values_X.shape[0], values_X.shape[1]) + sparse_spectrum = COO(valid_coordinates, values_Z_masked, shape=spectrum_shape) + + return sparse_spectrum + + +def _calculate_histogram_bins_and_centers(dataset, margin=1e-8): + """Calculate histogram bins and centers. + + Determine bin edges for a histogram based on the provided dataset. + """ + # Determine the data range with margin + data_lower_bound = dataset.min() - margin + data_upper_bound = dataset.max() + margin + + # Calculate the number of bins + bin_count = min(int(np.sqrt(dataset.size)), 2048) # Assuming a bin limit of 2048 + + # Establish bin edges + bin_edges = np.linspace(data_lower_bound, data_upper_bound, bin_count + 1) + + return bin_edges + + +def _hilbert_spectrum( + instant_freq, + instant_amp, + use_sparse_format=True, +): + """Hilbert Spectrum. + + Calculate the Hilbert-Huang Transform (HHT) from instantaneous frequency and amplitude. + This transform depicts the energy of a time-series signal over both time and frequency domains. By default, the + output is aggregated over time and IMFs, giving only the frequency spectrum. + """ + # Preliminary checks and setups. Add a dimension if necessary. 
+    instant_freq = instant_freq[:, np.newaxis] if instant_freq.ndim == 1 else instant_freq
+    instant_amp = instant_amp[:, np.newaxis] if instant_amp.ndim == 1 else instant_amp
+
+    bin_edges = _calculate_histogram_bins_and_centers(instant_freq.ravel())
+
+    # Calculate the 2D Hilbert spectrum
+    spectral_data = _compute_hilbert_2d_spectrum(instant_freq, instant_amp, bin_edges)
+
+    # Square to obtain a power spectrum and return it in the requested format (sparse or dense)
+    power_spectrum = spectral_data**2
+    return power_spectrum if use_sparse_format else power_spectrum.todense()
+
+
+def hilbert_huang_transform(
+    signal: pd.Series,
+    sift_thresh: float = 1e-8,
+    max_num_imfs: Optional[int] = None,
+    error_tolerance: float = 0.05,
+    return_trend: bool = True,
+) -> pd.Series:
+    r"""Hilbert-Huang Transform.
+
+    The Hilbert-Huang Transform combines Empirical Mode Decomposition (EMD)
+    and the Hilbert Transform to analyze non-stationary and non-linear time series data:
+    the Hilbert transform is applied to each mode obtained from the EMD.
+    These modes are oscillatory components of the full signal with well-defined instantaneous frequencies, called
+    Intrinsic Mode Functions (IMFs). The Hilbert spectrum of the IMFs is used to determine the significant ones,
+    and the trend is extracted as their sum. Either the trend or the detrended signal can be selected as the
+    output of this function. More details on the Hilbert-Huang Transform and the sifting process can be found in
+    the references below.
+
+    Args:
+        signal (pd.Series): Input time series signal.
+        sift_thresh (float, optional): Threshold used for sifting convergence. Defaults to 1e-8.
+        max_num_imfs (int, optional): Maximum number of oscillatory components to extract. If None, all IMFs are extracted. Defaults to None.
+        error_tolerance (float, optional): Tolerance for determining significant IMFs. Defaults to 0.05.
+        return_trend (bool, optional): Whether to return the trend component of the signal. Defaults to True.
+
+    Returns:
+        pd.Series: The trend component of the signal if `return_trend` is True; otherwise, the detrended signal.
+
+    Raises:
+        ValueError: If `sift_thresh` is not positive.
+        ValueError: If `max_num_imfs` is not a positive integer.
+        ValueError: If `error_tolerance` is not positive.
+        Exceptions raised during Empirical Mode Decomposition (EMD) propagate unchanged.
+
+
+    References:
+        - Huang, Norden E., et al. "The empirical mode decomposition and the Hilbert spectrum for nonlinear and non-stationary time series analysis."
+          Proceedings of the Royal Society of London. Series A: Mathematical, Physical and Engineering Sciences 454.1971 (1998): 903-995.
+        - Yang, Zhijing & Bingham, Chris & Ling, Bingo & Gallimore, Michael & Stewart, Paul & Zhang, Yu. (2012).
+          "Trend extraction based on Hilbert-Huang transform." 1-5. 10.1109/CSNDSP.2012.6292713.
+    """
+    from PyEMD import EMD
+
+    if sift_thresh <= 0:
+        raise ValueError("The sifting threshold must be positive")
+    if max_num_imfs is not None and max_num_imfs <= 0:
+        raise ValueError("The maximum number of IMFs must be positive")
+    if error_tolerance <= 0:
+        raise ValueError("Error tolerance must be positive")
+
+    signal_array = signal.to_numpy()
+
+    # Empirical Mode Decomposition
+    emd = EMD()
+    # If max_num_imfs is None, Python's slicing automatically includes all IMFs.
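+    # emd(signal_array) runs the sifting and returns an array of shape
+    # (n_imfs, n_samples), ordered from the fastest oscillation down to the
+    # slowest component (the residual trend).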
+    imfs = emd(signal_array)[:max_num_imfs]
+
+    # Compute the analytic signal of each IMF using the Hilbert transform
+    analytic_signals = [hilbert(imf) for imf in imfs]
+
+    # Index of the last IMF (imfs has shape (n_imfs, n_samples))
+    index_of_the_last_imf = imfs.shape[0] - 1
+    significant_imf_index_rho = len(imfs) - 1
+
+    # Compute the instantaneous frequency and amplitude of each IMF. The phase
+    # is differentiated along the time axis and scaled by the mean sampling period.
+    phase = np.unwrap(np.angle(analytic_signals))
+    dt = np.mean(np.diff(get_timestamps(signal, "s").to_numpy()))
+    frequency = np.gradient(phase, axis=-1) / (2 * np.pi * dt)
+    amplitude = np.abs(analytic_signals)
+
+    # Transpose to get the right shape for the array
+    frequency_array = np.array(frequency).T
+
+    # Average the frequencies along the last axis
+    avg_frequency_array = np.mean(frequency_array, axis=-1)
+
+    amplitude_array = np.array(amplitude)
+
+    # Ensure that the arrays are 1D by flattening them
+    # to avoid errors if the arrays have more than 2 dimensions
+    flat_amplitude = amplitude_array.flatten()
+    flat_avg_frequency = avg_frequency_array.flatten()
+
+    # Compute the Hilbert spectrum of the IMFs
+    hht = _hilbert_spectrum(flat_avg_frequency, flat_amplitude)
+
+    # Working backwards from the last IMF, compute rho, a correlation-like
+    # measure between the spectra of consecutive IMFs (Yang et al., 2012).
+    # The first pair whose rho falls below the error tolerance marks the
+    # boundary of the significant IMFs that form the trend.
+    for i in range(index_of_the_last_imf - 1, -1, -1):
+        rho = np.sum(hht[:, i] * hht[:, i + 1]) / (np.sum(hht[:, i]) * np.sum(hht[:, i + 1]))  # type: ignore
+
+        if rho < error_tolerance:
+            break
+        else:
+            significant_imf_index_rho -= 1
+
+    # The trend is the sum of the significant IMFs
+    trend_rho = np.sum(imfs[significant_imf_index_rho:], axis=0, dtype=np.float64)
+    trend_series_rho = pd.Series(trend_rho, index=signal.index)
+
+    # Return the trend if return_trend is True; otherwise, return the detrended signal
+    result_rho = trend_series_rho if return_trend else signal - trend_series_rho
+
+    return result_rho
diff --git a/poetry.lock b/poetry.lock
index fbfad050..397c16a5 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.5.1 and should not be changed by hand.
[[package]] name = "alabaster" @@ -753,6 +753,20 @@ files = [ {file = "defusedxml-0.7.1.tar.gz", hash = "sha256:1bb3032db185915b62d7c6209c5a8792be6a32ab2fedacc84e01b52c51aa3e69"}, ] +[[package]] +name = "dill" +version = "0.3.7" +description = "serialize all of Python" +optional = true +python-versions = ">=3.7" +files = [ + {file = "dill-0.3.7-py3-none-any.whl", hash = "sha256:76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e"}, + {file = "dill-0.3.7.tar.gz", hash = "sha256:cc1c8b182eb3013e24bd475ff2e9295af86c1a38eb1aff128dac8962a9ce3c03"}, +] + +[package.extras] +graph = ["objgraph (>=1.7.2)"] + [[package]] name = "distlib" version = "0.3.7" @@ -797,6 +811,29 @@ files = [ {file = "docutils-0.18.1.tar.gz", hash = "sha256:679987caf361a7539d76e584cbeddc311e3aee937877c87346f31debc63e9d06"}, ] +[[package]] +name = "emd-signal" +version = "1.5.2" +description = "Implementation of the Empirical Mode Decomposition (EMD) and its variations" +optional = true +python-versions = "<4,>=3.6" +files = [ + {file = "EMD-signal-1.5.2.tar.gz", hash = "sha256:8e4f1f6d4ff7751b3a308789b575b10b2596f60ca52acbb14efc32b8fba98f38"}, + {file = "EMD_signal-1.5.2-py3-none-any.whl", hash = "sha256:a5a55233f7d94f9e92fe43515fb54cc470a1c374ae035a0fe2f848153f777dc8"}, +] + +[package.dependencies] +numpy = ">=1.12" +pathos = ">=0.2.1" +scipy = ">=0.19" +tqdm = "==4.64.*" + +[package.extras] +dev = ["black (==23.3.*)", "click (==8.0.4)", "isort (==5.10.*)", "pycodestyle (==2.8.*)"] +doc = ["numpydoc", "sphinx", "sphinx-rtd-theme"] +jit = ["numba (==0.56.*)"] +test = ["codecov", "pytest"] + [[package]] name = "exceptiongroup" version = "1.2.0" @@ -1067,13 +1104,13 @@ files = [ [[package]] name = "ipykernel" -version = "6.26.0" +version = "6.27.0" description = "IPython Kernel for Jupyter" optional = false python-versions = ">=3.8" files = [ - {file = "ipykernel-6.26.0-py3-none-any.whl", hash = "sha256:3ba3dc97424b87b31bb46586b5167b3161b32d7820b9201a9e698c71e271602c"}, - {file = "ipykernel-6.26.0.tar.gz", hash = "sha256:553856658eb8430bbe9653ea041a41bff63e9606fc4628873fc92a6cf3abd404"}, + {file = "ipykernel-6.27.0-py3-none-any.whl", hash = "sha256:4388caa3c2cba0a381e20d289545e88a8aef1fe57a884d4c018718ec8c23c121"}, + {file = "ipykernel-6.27.0.tar.gz", hash = "sha256:7f4986f606581be73bfb32dc7a1ac9fa0e804c9be50ddf1c7a119413e982693f"}, ] [package.dependencies] @@ -1437,13 +1474,13 @@ test = ["jupyter-contrib-core[testing-utils]", "mock", "nose", "requests", "sele [[package]] name = "jupyter-server" -version = "2.10.1" +version = "2.11.0" description = "The backend—i.e. core services, APIs, and REST endpoints—to Jupyter web applications." 
optional = false python-versions = ">=3.8" files = [ - {file = "jupyter_server-2.10.1-py3-none-any.whl", hash = "sha256:20519e355d951fc5e1b6ac5952854fe7620d0cfb56588fa4efe362a758977ed3"}, - {file = "jupyter_server-2.10.1.tar.gz", hash = "sha256:e6da2657a954a7879eed28cc08e0817b01ffd81d7eab8634660397b55f926472"}, + {file = "jupyter_server-2.11.0-py3-none-any.whl", hash = "sha256:c9bd6e6d71dc5a2a25df167dc323422997f14682b008bfecb5d7920a55020ea7"}, + {file = "jupyter_server-2.11.0.tar.gz", hash = "sha256:78c97ec8049f9062f0151725bc8a1364dfed716646a66819095e0e8a24793eba"}, ] [package.dependencies] @@ -1710,7 +1747,7 @@ test = ["pytest (>=7.4)", "pytest-cov (>=4.1)"] name = "llvmlite" version = "0.41.1" description = "lightweight wrapper around basic LLVM functionality" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "llvmlite-0.41.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c1e1029d47ee66d3a0c4d6088641882f75b93db82bd0e6178f7bd744ebce42b9"}, @@ -1897,16 +1934,6 @@ files = [ {file = "MarkupSafe-2.1.3-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win32.whl", hash = "sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb"}, {file = "MarkupSafe-2.1.3-cp311-cp311-win_amd64.whl", hash = "sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win32.whl", hash = "sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007"}, - {file = "MarkupSafe-2.1.3-cp312-cp312-win_amd64.whl", hash = "sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b"}, {file = "MarkupSafe-2.1.3-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707"}, @@ -2063,6 +2090,34 @@ files = [ {file = 
"mistune-3.0.2.tar.gz", hash = "sha256:fc7f93ded930c92394ef2cb6f04a8aabab4117a91449e72dcc8dfa646a508be8"}, ] +[[package]] +name = "multiprocess" +version = "0.70.15" +description = "better multiprocessing and multithreading in Python" +optional = true +python-versions = ">=3.7" +files = [ + {file = "multiprocess-0.70.15-pp310-pypy310_pp73-macosx_10_9_x86_64.whl", hash = "sha256:aa36c7ed16f508091438687fe9baa393a7a8e206731d321e443745e743a0d4e5"}, + {file = "multiprocess-0.70.15-pp37-pypy37_pp73-macosx_10_9_x86_64.whl", hash = "sha256:20e024018c46d0d1602024c613007ac948f9754659e3853b0aa705e83f6931d8"}, + {file = "multiprocess-0.70.15-pp37-pypy37_pp73-manylinux_2_24_i686.whl", hash = "sha256:e576062981c91f0fe8a463c3d52506e598dfc51320a8dd8d78b987dfca91c5db"}, + {file = "multiprocess-0.70.15-pp37-pypy37_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:e73f497e6696a0f5433ada2b3d599ae733b87a6e8b008e387c62ac9127add177"}, + {file = "multiprocess-0.70.15-pp38-pypy38_pp73-macosx_10_9_x86_64.whl", hash = "sha256:73db2e7b32dcc7f9b0f075c2ffa45c90b6729d3f1805f27e88534c8d321a1be5"}, + {file = "multiprocess-0.70.15-pp38-pypy38_pp73-manylinux_2_24_i686.whl", hash = "sha256:4271647bd8a49c28ecd6eb56a7fdbd3c212c45529ad5303b40b3c65fc6928e5f"}, + {file = "multiprocess-0.70.15-pp38-pypy38_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:cf981fb998d6ec3208cb14f0cf2e9e80216e834f5d51fd09ebc937c32b960902"}, + {file = "multiprocess-0.70.15-pp39-pypy39_pp73-macosx_10_9_x86_64.whl", hash = "sha256:18f9f2c7063346d1617bd1684fdcae8d33380ae96b99427260f562e1a1228b67"}, + {file = "multiprocess-0.70.15-pp39-pypy39_pp73-manylinux_2_24_i686.whl", hash = "sha256:0eac53214d664c49a34695e5824872db4006b1a465edd7459a251809c3773370"}, + {file = "multiprocess-0.70.15-pp39-pypy39_pp73-manylinux_2_24_x86_64.whl", hash = "sha256:1a51dd34096db47fb21fa2b839e615b051d51b97af9a67afbcdaa67186b44883"}, + {file = "multiprocess-0.70.15-py310-none-any.whl", hash = "sha256:7dd58e33235e83cf09d625e55cffd7b0f0eede7ee9223cdd666a87624f60c21a"}, + {file = "multiprocess-0.70.15-py311-none-any.whl", hash = "sha256:134f89053d82c9ed3b73edd3a2531eb791e602d4f4156fc92a79259590bd9670"}, + {file = "multiprocess-0.70.15-py37-none-any.whl", hash = "sha256:f7d4a1629bccb433114c3b4885f69eccc200994323c80f6feee73b0edc9199c5"}, + {file = "multiprocess-0.70.15-py38-none-any.whl", hash = "sha256:bee9afba476c91f9ebee7beeee0601face9eff67d822e893f9a893725fbd6316"}, + {file = "multiprocess-0.70.15-py39-none-any.whl", hash = "sha256:3e0953f5d52b4c76f1c973eaf8214554d146f2be5decb48e928e55c7a2d19338"}, + {file = "multiprocess-0.70.15.tar.gz", hash = "sha256:f20eed3036c0ef477b07a4177cf7c1ba520d9a2677870a4f47fe026f0cd6787e"}, +] + +[package.dependencies] +dill = ">=0.3.7" + [[package]] name = "mypy-extensions" version = "1.0.0" @@ -2287,7 +2342,7 @@ test = ["pytest", "pytest-console-scripts", "pytest-jupyter", "pytest-tornasync" name = "numba" version = "0.58.1" description = "compiling Python code using LLVM" -optional = true +optional = false python-versions = ">=3.8" files = [ {file = "numba-0.58.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:07f2fa7e7144aa6f275f27260e73ce0d808d3c62b30cff8906ad1dec12d87bbe"}, @@ -2420,8 +2475,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.21.0", markers = "python_version >= \"3.10\" and python_version < \"3.11\""}, {version = ">=1.20.3", markers = "python_version < \"3.10\""}, + {version = ">=1.21.0", markers = "python_version >= \"3.10\""}, {version = ">=1.23.2", markers = "python_version >= \"3.11\""}, ] python-dateutil = 
">=2.8.2" @@ -2477,6 +2532,23 @@ files = [ qa = ["flake8 (==3.8.3)", "mypy (==0.782)"] testing = ["docopt", "pytest (<6.0.0)"] +[[package]] +name = "pathos" +version = "0.3.1" +description = "parallel graph management and execution in heterogeneous computing" +optional = true +python-versions = ">=3.7" +files = [ + {file = "pathos-0.3.1-py3-none-any.whl", hash = "sha256:b1c7145e2adcc19c7e9cac48f110ea5a63e300c1cc10c2947d4857dc97a47b46"}, + {file = "pathos-0.3.1.tar.gz", hash = "sha256:c9a088021493c5cb627d4459bba6c0533c684199e271a5dc297d62be23d74019"}, +] + +[package.dependencies] +dill = ">=0.3.7" +multiprocess = ">=0.70.15" +pox = ">=0.3.3" +ppft = ">=1.7.6.7" + [[package]] name = "pathspec" version = "0.11.2" @@ -2617,6 +2689,31 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "pox" +version = "0.3.3" +description = "utilities for filesystem exploration and automated builds" +optional = true +python-versions = ">=3.7" +files = [ + {file = "pox-0.3.3-py3-none-any.whl", hash = "sha256:e95febf7401918478a3c1441a3630656d9a2049803889b4f589821372889d0ce"}, + {file = "pox-0.3.3.tar.gz", hash = "sha256:e1ced66f2a0c92a58cf3646bc7ccb8b4773d40884b76f85eeda0670474871667"}, +] + +[[package]] +name = "ppft" +version = "1.7.6.7" +description = "distributed and parallel Python" +optional = true +python-versions = ">=3.7" +files = [ + {file = "ppft-1.7.6.7-py3-none-any.whl", hash = "sha256:fedb1b1253729d62483f2e1f36547fd50a5fc873ffbf9b78b48cfdc727d4180c"}, + {file = "ppft-1.7.6.7.tar.gz", hash = "sha256:ab34436814e2f18238f35688fd869b2641b2d2d8dca22b8d246f6701dfc954c8"}, +] + +[package.extras] +dill = ["dill (>=0.3.7)"] + [[package]] name = "pre-commit" version = "3.5.0" @@ -3495,6 +3592,28 @@ files = [ {file = "soupsieve-2.5.tar.gz", hash = "sha256:5663d5a7b3bfaeee0bc4372e7fc48f9cff4940b3eec54a6451cc5299f1097690"}, ] +[[package]] +name = "sparse" +version = "0.14.0" +description = "Sparse n-dimensional arrays" +optional = false +python-versions = ">=3.8, <4" +files = [ + {file = "sparse-0.14.0-py2.py3-none-any.whl", hash = "sha256:77e3c32ace6184de29074db94901c63e113c7729b28c8b3f1885aaf5e1934323"}, + {file = "sparse-0.14.0.tar.gz", hash = "sha256:5f5827a37f6cd6f6730a541f994c95c60a3ae2329e01f4ba21ced5339aea0098"}, +] + +[package.dependencies] +numba = ">=0.49" +numpy = ">=1.17" +scipy = ">=0.19" + +[package.extras] +all = ["dask[array]", "pytest (>=3.5)", "pytest-black", "pytest-cov", "sphinx", "sphinx-rtd-theme", "tox"] +docs = ["sphinx", "sphinx-rtd-theme"] +tests = ["dask[array]", "pytest (>=3.5)", "pytest-black", "pytest-cov"] +tox = ["dask[array]", "pytest (>=3.5)", "pytest-black", "pytest-cov", "tox"] + [[package]] name = "sphinx" version = "7.2.6" @@ -3727,20 +3846,12 @@ files = [ {file = "statsmodels-0.14.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5a6a0a1a06ff79be8aa89c8494b33903442859add133f0dda1daf37c3c71682e"}, {file = "statsmodels-0.14.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:77b3cd3a5268ef966a0a08582c591bd29c09c88b4566c892a7c087935234f285"}, {file = "statsmodels-0.14.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9c64ebe9cf376cba0c31aed138e15ed179a1d128612dd241cdf299d159e5e882"}, - {file = "statsmodels-0.14.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:229b2f676b4a45cb62d132a105c9c06ca8a09ffba060abe34935391eb5d9ba87"}, {file = "statsmodels-0.14.0-cp310-cp310-win_amd64.whl", hash = 
"sha256:fb471f757fc45102a87e5d86e87dc2c8c78b34ad4f203679a46520f1d863b9da"}, {file = "statsmodels-0.14.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:582f9e41092e342aaa04920d17cc3f97240e3ee198672f194719b5a3d08657d6"}, {file = "statsmodels-0.14.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:7ebe885ccaa64b4bc5ad49ac781c246e7a594b491f08ab4cfd5aa456c363a6f6"}, {file = "statsmodels-0.14.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b587ee5d23369a0e881da6e37f78371dce4238cf7638a455db4b633a1a1c62d6"}, {file = "statsmodels-0.14.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ef7fa4813c7a73b0d8a0c830250f021c102c71c95e9fe0d6877bcfb56d38b8c"}, - {file = "statsmodels-0.14.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:afe80544ef46730ea1b11cc655da27038bbaa7159dc5af4bc35bbc32982262f2"}, {file = "statsmodels-0.14.0-cp311-cp311-win_amd64.whl", hash = "sha256:a6ad7b8aadccd4e4dd7f315a07bef1bca41d194eeaf4ec600d20dea02d242fce"}, - {file = "statsmodels-0.14.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:0eea4a0b761aebf0c355b726ac5616b9a8b618bd6e81a96b9f998a61f4fd7484"}, - {file = "statsmodels-0.14.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4c815ce7a699047727c65a7c179bff4031cff9ae90c78ca730cfd5200eb025dd"}, - {file = "statsmodels-0.14.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:575f61337c8e406ae5fa074d34bc6eb77b5a57c544b2d4ee9bc3da6a0a084cf1"}, - {file = "statsmodels-0.14.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8be53cdeb82f49c4cb0fda6d7eeeb2d67dbd50179b3e1033510e061863720d93"}, - {file = "statsmodels-0.14.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:6f7d762df4e04d1dde8127d07e91aff230eae643aa7078543e60e83e7d5b40db"}, - {file = "statsmodels-0.14.0-cp312-cp312-win_amd64.whl", hash = "sha256:fc2c7931008a911e3060c77ea8933f63f7367c0f3af04f82db3a04808ad2cd2c"}, {file = "statsmodels-0.14.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:3757542c95247e4ab025291a740efa5da91dc11a05990c033d40fce31c450dc9"}, {file = "statsmodels-0.14.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:de489e3ed315bdba55c9d1554a2e89faa65d212e365ab81bc323fa52681fc60e"}, {file = "statsmodels-0.14.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:76e290f4718177bffa8823a780f3b882d56dd64ad1c18cfb4bc8b5558f3f5757"}, @@ -3756,8 +3867,8 @@ files = [ [package.dependencies] numpy = [ - {version = ">=1.22.3", markers = "python_version == \"3.10\" and platform_system == \"Windows\" and platform_python_implementation != \"PyPy\""}, {version = ">=1.18", markers = "python_version != \"3.10\" or platform_system != \"Windows\" or platform_python_implementation == \"PyPy\""}, + {version = ">=1.22.3", markers = "python_version == \"3.10\" and platform_system == \"Windows\" and platform_python_implementation != \"PyPy\""}, ] packaging = ">=21.3" pandas = ">=1.0" @@ -3903,6 +4014,26 @@ files = [ {file = "tornado-6.3.3.tar.gz", hash = "sha256:e7d8db41c0181c80d76c982aacc442c0783a2c54d6400fe028954201a2e032fe"}, ] +[[package]] +name = "tqdm" +version = "4.64.1" +description = "Fast, Extensible Progress Meter" +optional = true +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,>=2.7" +files = [ + {file = "tqdm-4.64.1-py2.py3-none-any.whl", hash = "sha256:6fee160d6ffcd1b1c68c65f14c829c22832bc401726335ce92c52d395944a6a1"}, + {file = "tqdm-4.64.1.tar.gz", hash = "sha256:5f4f682a004951c1b450bc753c710e9280c5746ce6ffedee253ddbcbf54cf1e4"}, +] + 
+[package.dependencies] +colorama = {version = "*", markers = "platform_system == \"Windows\""} + +[package.extras] +dev = ["py-make (>=0.1.0)", "twine", "wheel"] +notebook = ["ipywidgets (>=6)"] +slack = ["slack-sdk"] +telegram = ["requests"] + [[package]] name = "traitlets" version = "5.13.0" @@ -4089,15 +4220,17 @@ docs = ["furo", "jaraco.packaging (>=9.3)", "jaraco.tidelift (>=1.4)", "rst.link testing = ["big-O", "jaraco.functools", "jaraco.itertools", "more-itertools", "pytest (>=6)", "pytest-black (>=0.3.7)", "pytest-checkdocs (>=2.4)", "pytest-cov", "pytest-enabler (>=2.2)", "pytest-ignore-flaky", "pytest-mypy (>=0.9.1)", "pytest-ruff"] [extras] -all = ["csaps", "fluids", "kneed", "matplotlib", "numba", "scikit-image", "scikit-learn", "statsmodels"] +all = ["csaps", "emd-signal", "fluids", "kneed", "matplotlib", "numba", "scikit-image", "scikit-learn", "sparse", "statsmodels"] +emd-signal = ["emd-signal"] fluids = ["fluids"] modeling = ["csaps", "kneed"] numba = ["numba"] plot = ["matplotlib"] scikit = ["scikit-image", "scikit-learn"] +sparse = ["sparse"] stats = ["statsmodels"] [metadata] lock-version = "2.0" python-versions = ">=3.9,<3.13" -content-hash = "6184cf7820e9796b0cf117ecb120f3a994e26eed4d30c559aa540aa6f9afe222" +content-hash = "96d0e4a494cc9d121332582fab6c6c8285afcfd69cab9e736dc95d9f989fbb74" diff --git a/pyproject.toml b/pyproject.toml index fbbc3f5e..304a6093 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,8 @@ csaps = { version = "^1.1.0", optional = true } statsmodels = { version = "^0.14.0", optional = true } scikit-image = { version = "^0.21.0", optional = true } scikit-learn = { version = "^1.3.0", optional = true } +sparse = "^0.14.0" +emd-signal = { version = "^1.5.2", optional = true } [tool.poetry.group.dev.dependencies] pre-commit = { version = "^3.0", python = "^3.6.1" } @@ -75,7 +77,9 @@ modeling = ["csaps", "kneed"] stats = ["statsmodels"] scikit = ["scikit-image", "scikit-learn"] fluids = ["fluids"] -all = ["fluids", "matplotlib", "numba", "csaps", "kneed", "statsmodels", "scikit-image", "scikit-learn"] +emd-signal = ["emd-signal"] +sparse = ["sparse"] +all = ["emd-signal", "fluids", "matplotlib", "numba", "csaps", "kneed", "statsmodels", "scikit-image", "scikit-learn", "sparse"] [tool.black] line-length = 120 diff --git a/tests/filters/test_hilbert_huang_filter.py b/tests/filters/test_hilbert_huang_filter.py new file mode 100644 index 00000000..380048fe --- /dev/null +++ b/tests/filters/test_hilbert_huang_filter.py @@ -0,0 +1,114 @@ +import logging + +import numpy as np +import pandas as pd +import pytest + +from scipy.stats import anderson, shapiro + +from indsl.filter import hilbert_huang_transform +from indsl.resample.auto_align import auto_align +from indsl.signals import insert_data_gaps, line, perturb_timestamp, sine_wave, white_noise +from indsl.ts_utils.operators import sub +from sparse import COO + + +def generate_synthetic_signal(): + wave = sine_wave( + start_date=pd.Timestamp("1975-05-09"), + end_date=pd.Timestamp("1975-05-10"), + wave_amplitude=2, + wave_mean=10, + wave_period=pd.Timedelta("0.25 D"), + wave_phase=np.pi / 4, + ) + line_function = line( + start_date=pd.Timestamp("1975-05-09"), + end_date=pd.Timestamp("1975-05-10"), + intercept=10, + slope=0.0001, + sample_freq=pd.Timedelta("1 s"), + ) + + return wave + line_function + + +def test_trend_of_signal(): + signal = generate_synthetic_signal() + signal_with_noise = white_noise(signal, snr_db=10, seed=42) + + trend = hilbert_huang_transform(signal_with_noise) + + 
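+    # Subtracting the extracted trend should leave approximately the injected
+    # zero-mean noise; the assertions below check this.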
+    detrended_signal = signal_with_noise - trend
+
+    expected_detrended_signal = signal_with_noise - signal
+
+    # The detrended signal should have approximately zero mean
+    assert abs(np.mean(detrended_signal)) <= 0.001
+    assert np.allclose(trend, signal, atol=0.04, rtol=0.04)
+    assert np.allclose(detrended_signal, expected_detrended_signal, atol=0.05, rtol=0.05)
+
+    # Normality test for the detrended signal
+    _, p = shapiro(detrended_signal)
+    assert p > 0.05
+
+    try:
+        assert np.allclose(
+            trend["1975-05-09 03:00:00":"1975-05-09 14:00:00"],
+            signal["1975-05-09 03:00:00":"1975-05-09 14:00:00"],
+            atol=0.4,
+            rtol=0.4,
+        )
+    except AssertionError:
+        logging.error("trend and signal are not close. Max difference: %s", np.max(np.abs(trend - signal)))
+        raise
+
+
+def test_trend_of_data_with_gaps():
+    signal = generate_synthetic_signal()
+    signal_with_gaps = insert_data_gaps(signal, num_gaps=4, method="Multiple")
+    signal_with_noise = white_noise(signal_with_gaps, snr_db=30, seed=42)
+
+    trend = hilbert_huang_transform(signal_with_noise)
+
+    detrended_signal = signal_with_noise - trend
+    expected_detrended_signal = signal_with_noise - signal_with_gaps
+
+    assert abs(np.mean(detrended_signal)) <= 0.001
+
+    # Compare a region of the data that contains no gaps
+    assert np.allclose(
+        trend["1975-05-09 03:00:00":"1975-05-09 14:00:00"],
+        signal["1975-05-09 03:00:00":"1975-05-09 14:00:00"],
+        atol=0.04,
+        rtol=0.04,
+    )
+    assert np.allclose(
+        detrended_signal["1975-05-09 03:00:00":"1975-05-09 14:00:00"],
+        expected_detrended_signal["1975-05-09 03:00:00":"1975-05-09 14:00:00"],
+        atol=0.05,
+        rtol=0.05,
+    )
+
+
+def test_trend_data_perturb_timestamp():
+    # Synthetic signal for testing
+    signal = generate_synthetic_signal()
+    signal_with_noise = white_noise(signal, snr_db=30, seed=42)
+    signal_with_perturbation = perturb_timestamp(signal_with_noise)
+
+    trend = hilbert_huang_transform(signal_with_perturbation)
+    detrended_signal = sub(signal_with_perturbation, trend, True)
+
+    expected_detrended_signal = sub(signal_with_perturbation, signal, True)
+
+    assert abs(np.mean(expected_detrended_signal)) <= 0.001
+
+    # Normality test for the detrended signal
+    _, p = shapiro(detrended_signal)
+    assert p > 0.05
+
+    assert np.allclose(trend, signal, atol=0.04, rtol=0.04)
+
+    detrended_signal, expected_detrended_signal = auto_align([detrended_signal, expected_detrended_signal])
+    assert np.allclose(detrended_signal, expected_detrended_signal, atol=0.05, rtol=0.05)