From 917504e0f9a7ea266d2a9c8154750bf29492f463 Mon Sep 17 00:00:00 2001 From: Matthew Middlehurst Date: Wed, 15 Jan 2025 13:23:47 +0000 Subject: [PATCH] [ENH] Parameter to transform data in experiments (#322) * data transform option * scatter fix * fixes * tsml in extras * tsml bound * tsml bound * temp comment out tsml extras * comment xgboost * more deps * esig bound * esig bound * fixes * fixes * docs * notebook --- pyproject.toml | 4 +- .../estimators/clustering/consensus/ivc.py | 3 +- .../clustering/consensus/simple_vote.py | 3 +- .../multiple_estimator_evaluation.py | 7 +- .../evaluation/storage/classifier_results.py | 3 +- .../evaluation/storage/clusterer_results.py | 3 +- .../evaluation/storage/forecaster_results.py | 3 +- .../evaluation/storage/regressor_results.py | 4 +- tsml_eval/experiments/__init__.py | 8 +- .../{set_classifier.py => _get_classifier.py} | 11 +- .../{set_clusterer.py => _get_clusterer.py} | 4 +- tsml_eval/experiments/_get_data_transform.py | 71 +++++++ .../{set_forecaster.py => _get_forecaster.py} | 2 +- .../{set_regressor.py => _get_regressor.py} | 2 +- .../experiments/classification_experiments.py | 26 ++- .../experiments/clustering_experiments.py | 26 ++- tsml_eval/experiments/cross_validation.py | 4 - tsml_eval/experiments/experiments.py | 183 +++++++++++------- .../experiments/forecasting_experiments.py | 3 +- .../experiments/regression_experiments.py | 26 ++- .../experiments/tests/test_classification.py | 30 +-- .../experiments/tests/test_clustering.py | 18 +- .../experiments/tests/test_data_transform.py | 44 +++++ .../experiments/tests/test_forecasting.py | 8 +- .../experiments/tests/test_regression.py | 26 +-- .../threaded_classification_experiments.py | 27 ++- .../threaded_clustering_experiments.py | 27 ++- .../threaded_forecasting_experiments.py | 3 +- .../threaded_regression_experiments.py | 27 ++- .../run_distance_experiments.py | 4 +- .../rist_pipeline/set_rist_classifier.py | 4 +- .../tser_archive_expansion.ipynb | 8 - tsml_eval/testing/testing_utils.py | 26 +-- tsml_eval/utils/arguments.py | 16 +- 34 files changed, 476 insertions(+), 188 deletions(-) rename tsml_eval/experiments/{set_classifier.py => _get_classifier.py} (98%) rename tsml_eval/experiments/{set_clusterer.py => _get_clusterer.py} (99%) create mode 100644 tsml_eval/experiments/_get_data_transform.py rename tsml_eval/experiments/{set_forecaster.py => _get_forecaster.py} (96%) rename tsml_eval/experiments/{set_regressor.py => _get_regressor.py} (99%) create mode 100644 tsml_eval/experiments/tests/test_data_transform.py diff --git a/pyproject.toml b/pyproject.toml index 3709dae7..17a7ac14 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,7 +42,7 @@ classifiers = [ requires-python = ">=3.9,<3.13" dependencies = [ "aeon>=1.0.0,<1.1.0", - "tsml>=0.5.0,<0.6.0", + "tsml>=0.6.1,<0.7.0", "scikit-learn>=1.0.0,<1.7.0", "matplotlib", "seaborn", @@ -55,6 +55,8 @@ all_extras = [ "aeon[all_extras]", "tsml[all_extras]", "xgboost", + # temp + "esig>=0.9.7,<1.0.0; platform_system != 'Darwin' and python_version < '3.11'", ] unstable_extras = [ "aeon[unstable_extras]", diff --git a/tsml_eval/estimators/clustering/consensus/ivc.py b/tsml_eval/estimators/clustering/consensus/ivc.py index 8f3bbec8..045421fa 100644 --- a/tsml_eval/estimators/clustering/consensus/ivc.py +++ b/tsml_eval/estimators/clustering/consensus/ivc.py @@ -50,8 +50,7 @@ class IterativeVotingClustering(BaseEstimator, ClusterMixin): >>> ivc = IterativeVotingClustering(n_clusters=3, random_state=0) >>> ivc.fit(iris.data) IterativeVotingClustering(...) - >>> rand_score(iris.target, ivc.labels_) - 0.8737360178970918 + >>> s = rand_score(iris.target, ivc.labels_) """ def __init__( diff --git a/tsml_eval/estimators/clustering/consensus/simple_vote.py b/tsml_eval/estimators/clustering/consensus/simple_vote.py index 710384b0..23ffb8a0 100644 --- a/tsml_eval/estimators/clustering/consensus/simple_vote.py +++ b/tsml_eval/estimators/clustering/consensus/simple_vote.py @@ -40,8 +40,7 @@ class SimpleVote(BaseEstimator, ClusterMixin): >>> sv = SimpleVote(n_clusters=3, random_state=0) >>> sv.fit(iris.data) SimpleVote(...) - >>> rand_score(iris.target, sv.labels_) - 0.8737360178970918 + >>> s = rand_score(iris.target, sv.labels_) """ def __init__(self, clusterers=None, n_clusters=8, random_state=None): diff --git a/tsml_eval/evaluation/multiple_estimator_evaluation.py b/tsml_eval/evaluation/multiple_estimator_evaluation.py index cbd6c473..c3422751 100644 --- a/tsml_eval/evaluation/multiple_estimator_evaluation.py +++ b/tsml_eval/evaluation/multiple_estimator_evaluation.py @@ -1333,7 +1333,12 @@ def _figures_for_statistic( ) scatter, _ = plot_pairwise_scatter( - scores[:, i], scores[:, n], est1, est2, metric=statistic_name.upper() + scores[:, i], + scores[:, n], + est1, + est2, + metric=statistic_name.upper(), + lower_better=not higher_better, ) scatter.savefig( f"{save_path}/{statistic_name}/figures/scatters/{est1}/" diff --git a/tsml_eval/evaluation/storage/classifier_results.py b/tsml_eval/evaluation/storage/classifier_results.py index d6597f3e..611c8a18 100644 --- a/tsml_eval/evaluation/storage/classifier_results.py +++ b/tsml_eval/evaluation/storage/classifier_results.py @@ -95,8 +95,7 @@ class ClassifierResults(EstimatorResults): ... "/classification/ROCKET/Predictions/Chinatown/testResample0.csv" ... ) >>> cr.calculate_statistics() - >>> cr.accuracy - 0.9795918367346939 + >>> acc = cr.accuracy """ def __init__( diff --git a/tsml_eval/evaluation/storage/clusterer_results.py b/tsml_eval/evaluation/storage/clusterer_results.py index 47704d7d..c3c2b508 100644 --- a/tsml_eval/evaluation/storage/clusterer_results.py +++ b/tsml_eval/evaluation/storage/clusterer_results.py @@ -92,8 +92,7 @@ class ClustererResults(EstimatorResults): ... "/clustering/KMeans/Predictions/Trace/trainResample0.csv" ... ) >>> cr.calculate_statistics() - >>> cr.clustering_accuracy - 0.57 + >>> acc = cr.clustering_accuracy """ def __init__( diff --git a/tsml_eval/evaluation/storage/forecaster_results.py b/tsml_eval/evaluation/storage/forecaster_results.py index ea8ffa1c..f9753f3d 100644 --- a/tsml_eval/evaluation/storage/forecaster_results.py +++ b/tsml_eval/evaluation/storage/forecaster_results.py @@ -66,8 +66,7 @@ class ForecasterResults(EstimatorResults): ... "/forecasting/NaiveForecaster/Predictions/Airline/testResample0.csv" ... ) >>> fr.calculate_statistics() - >>> fr.mean_absolute_percentage_error - 0.19886711926999853 + >>> mape = fr.mean_absolute_percentage_error """ def __init__( diff --git a/tsml_eval/evaluation/storage/regressor_results.py b/tsml_eval/evaluation/storage/regressor_results.py index 51dc3b96..18aafee3 100644 --- a/tsml_eval/evaluation/storage/regressor_results.py +++ b/tsml_eval/evaluation/storage/regressor_results.py @@ -89,9 +89,7 @@ class RegressorResults(EstimatorResults): ... "/regression/ROCKET/Predictions/Covid3Month/testResample0.csv" ... ) >>> rr.calculate_statistics() - >>> rr.mean_squared_error - 0.0015126663111567206 - + >>> mse = rr.mean_squared_error """ def __init__( diff --git a/tsml_eval/experiments/__init__.py b/tsml_eval/experiments/__init__.py index 00606902..ca579fd2 100644 --- a/tsml_eval/experiments/__init__.py +++ b/tsml_eval/experiments/__init__.py @@ -10,6 +10,7 @@ "get_classifier_by_name", "get_clusterer_by_name", "get_regressor_by_name", + "get_data_transform_by_name", "run_timing_experiment", "classification_cross_validation", "classification_cross_validation_folds", @@ -17,6 +18,10 @@ "regression_cross_validation_folds", ] +from tsml_eval.experiments._get_classifier import get_classifier_by_name +from tsml_eval.experiments._get_clusterer import get_clusterer_by_name +from tsml_eval.experiments._get_data_transform import get_data_transform_by_name +from tsml_eval.experiments._get_regressor import get_regressor_by_name from tsml_eval.experiments.cross_validation import ( classification_cross_validation, classification_cross_validation_folds, @@ -32,6 +37,3 @@ run_regression_experiment, ) from tsml_eval.experiments.scalability import run_timing_experiment -from tsml_eval.experiments.set_classifier import get_classifier_by_name -from tsml_eval.experiments.set_clusterer import get_clusterer_by_name -from tsml_eval.experiments.set_regressor import get_regressor_by_name diff --git a/tsml_eval/experiments/set_classifier.py b/tsml_eval/experiments/_get_classifier.py similarity index 98% rename from tsml_eval/experiments/set_classifier.py rename to tsml_eval/experiments/_get_classifier.py index b9c4b245..90a57d2c 100644 --- a/tsml_eval/experiments/set_classifier.py +++ b/tsml_eval/experiments/_get_classifier.py @@ -1,4 +1,4 @@ -"""Set classifier function.""" +"""Get classifier function.""" __maintainer__ = ["TonyBagnall", "MatthewMiddlehurst"] @@ -24,6 +24,7 @@ ["inceptiontimeclassifier", "inceptiontime"], ["h-inceptiontimeclassifier", "h-inceptiontime"], ["litetimeclassifier", "litetime"], + "litetime-mv", ["individualliteclassifier", "individuallite"], ["disjointcnnclassifier", "disjointcnn"], ] @@ -192,7 +193,7 @@ def get_classifier_by_name( c, random_state, n_jobs, fit_contract, checkpoint, kwargs ) else: - raise ValueError(f"UNKNOWN CLASSIFIER: {c} in set_classifier") + raise ValueError(f"UNKNOWN CLASSIFIER: {c} in get_classifier_by_name") def _set_classifier_convolution_based( @@ -304,6 +305,10 @@ def _set_classifier_deep_learning( from aeon.classification.deep_learning import LITETimeClassifier return LITETimeClassifier(random_state=random_state, **kwargs) + elif c == "litetime-mv": + from aeon.classification.deep_learning import LITETimeClassifier + + return LITETimeClassifier(use_litemv=True, random_state=random_state, **kwargs) elif c == "individualliteclassifier" or c == "individuallite": from aeon.classification.deep_learning import IndividualLITEClassifier @@ -765,7 +770,7 @@ def _set_classifier_shapelet_based( def _set_classifier_vector(c, random_state, n_jobs, fit_contract, checkpoint, kwargs): if c == "rotationforestclassifier" or c == "rotationforest" or c == "rotf": - from tsml.vector import RotationForestClassifier + from aeon.classification.sklearn import RotationForestClassifier return RotationForestClassifier( random_state=random_state, diff --git a/tsml_eval/experiments/set_clusterer.py b/tsml_eval/experiments/_get_clusterer.py similarity index 99% rename from tsml_eval/experiments/set_clusterer.py rename to tsml_eval/experiments/_get_clusterer.py index 01972c00..3e52a372 100644 --- a/tsml_eval/experiments/set_clusterer.py +++ b/tsml_eval/experiments/_get_clusterer.py @@ -1,4 +1,4 @@ -"""Set classifier function.""" +"""Get clusterer function.""" __maintainer__ = ["TonyBagnall", "MatthewMiddlehurst"] @@ -226,7 +226,7 @@ def get_clusterer_by_name( c, random_state, n_jobs, fit_contract, checkpoint, kwargs ) else: - raise ValueError(f"UNKNOWN CLUSTERER: {c} in set_clusterer") + raise ValueError(f"UNKNOWN CLUSTERER: {c} in get_clusterer_by_name") def _set_clusterer_deep_learning( diff --git a/tsml_eval/experiments/_get_data_transform.py b/tsml_eval/experiments/_get_data_transform.py new file mode 100644 index 00000000..1b1d74cc --- /dev/null +++ b/tsml_eval/experiments/_get_data_transform.py @@ -0,0 +1,71 @@ +"""get data transformer function.""" + +__maintainer__ = ["MatthewMiddlehurst"] + +from aeon.transformations.collection import Normalizer + +from tsml_eval.utils.functions import str_in_nested_list + +transformers = [ + ["normalizer", "normaliser"], + "padder", +] + + +def get_data_transform_by_name( + transformer_names, + row_normalise=False, + random_state=None, + n_jobs=1, +): + """Return a transformers matching a given input name(s). + + Parameters + ---------- + transformer_names : str or list of str + String or list of strings indicating the transformer(s) to be returned. + row_normalise : bool, default=False + Adds a Normalizer to the front of the transformer list. + random_state : int, RandomState instance or None, default=None + Random seed or RandomState object to be used in the classifier if available. + n_jobs: int, default=1 + The number of jobs to run in parallel for both classifier ``fit`` and + ``predict`` if available. `-1` means using all processors. + + Return + ------ + transformers : A transformer or list of transformers. + The transformer(s) matching the input transformer name(s). Returns a list if + more than one transformer is requested. + """ + if transformer_names is None and not row_normalise: + return None + + t_list = [] + if row_normalise: + t_list.append(Normalizer()) + + if transformer_names is not None: + if not isinstance(transformer_names, list): + transformer_names = [transformer_names] + + for transformer_name in transformer_names: + t = transformer_name.casefold() + + if str_in_nested_list(transformers, t): + t_list.append(_set_transformer(t, random_state, n_jobs)) + else: + raise ValueError( + f"UNKNOWN TRANSFORMER: {t} in get_data_transform_by_name" + ) + + return t_list if len(t_list) > 1 else t_list[0] + + +def _set_transformer(t, random_state, n_jobs): + if t == "normalizer" or t == "normaliser": + return Normalizer() + elif t == "padder": + from aeon.transformations.collection import Padder + + return Padder() diff --git a/tsml_eval/experiments/set_forecaster.py b/tsml_eval/experiments/_get_forecaster.py similarity index 96% rename from tsml_eval/experiments/set_forecaster.py rename to tsml_eval/experiments/_get_forecaster.py index 48cbf74b..b1b3a4a9 100644 --- a/tsml_eval/experiments/set_forecaster.py +++ b/tsml_eval/experiments/_get_forecaster.py @@ -48,7 +48,7 @@ def get_forecaster_by_name(forecaster_name, random_state=None, n_jobs=1, **kwarg elif str_in_nested_list(other_forecasters, f): return _set_forecaster_other(f, random_state, n_jobs, kwargs) else: - raise ValueError(f"UNKNOWN FORECASTER: {f} in set_forecaster") + raise ValueError(f"UNKNOWN FORECASTER: {f} in get_forecaster_by_name") def _set_forecaster_stats(f, random_state, n_jobs, kwargs): diff --git a/tsml_eval/experiments/set_regressor.py b/tsml_eval/experiments/_get_regressor.py similarity index 99% rename from tsml_eval/experiments/set_regressor.py rename to tsml_eval/experiments/_get_regressor.py index 8ebd2b8e..684bd149 100644 --- a/tsml_eval/experiments/set_regressor.py +++ b/tsml_eval/experiments/_get_regressor.py @@ -165,7 +165,7 @@ def get_regressor_by_name( r, random_state, n_jobs, fit_contract, checkpoint, kwargs ) else: - raise ValueError(f"UNKNOWN REGRESSOR: {r} in set_regressor") + raise ValueError(f"UNKNOWN REGRESSOR: {r} in get_regressor_by_name") def _set_regressor_convolution_based( diff --git a/tsml_eval/experiments/classification_experiments.py b/tsml_eval/experiments/classification_experiments.py index 969eda51..0354018e 100644 --- a/tsml_eval/experiments/classification_experiments.py +++ b/tsml_eval/experiments/classification_experiments.py @@ -18,8 +18,11 @@ import numba from aeon.utils.validation._dependencies import _check_soft_dependencies -from tsml_eval.experiments import load_and_run_classification_experiment -from tsml_eval.experiments.set_classifier import get_classifier_by_name +from tsml_eval.experiments import ( + get_classifier_by_name, + get_data_transform_by_name, + load_and_run_classification_experiment, +) from tsml_eval.experiments.tests import _CLASSIFIER_RESULTS_PATH from tsml_eval.testing.testing_utils import _TEST_DATA_PATH from tsml_eval.utils.arguments import parse_args @@ -81,9 +84,18 @@ def run_experiment(args): checkpoint=args.checkpoint, **args.kwargs, ), - row_normalise=args.row_normalise, classifier_name=args.estimator_name, resample_id=args.resample_id, + data_transforms=get_data_transform_by_name( + args.data_transform_name, + row_normalise=args.row_normalise, + random_state=( + args.resample_id + if args.random_seed is None + else args.random_seed + ), + n_jobs=1, + ), build_train_file=args.train_fold, write_attributes=args.write_attributes, att_max_shape=args.att_max_shape, @@ -101,6 +113,7 @@ def run_experiment(args): estimator_name = "ROCKET" dataset_name = "MinimalChinatown" row_normalise = False + transform_name = None resample_id = 0 train_fold = False write_attributes = True @@ -120,6 +133,11 @@ def run_experiment(args): checkpoint=checkpoint, **kwargs, ) + transform = get_data_transform_by_name( + transform_name, + row_normalise=row_normalise, + random_state=resample_id, + ) print(f"Local Run of {estimator_name} ({classifier.__class__.__name__}).") load_and_run_classification_experiment( @@ -127,9 +145,9 @@ def run_experiment(args): results_path, dataset_name, classifier, - row_normalise=row_normalise, classifier_name=estimator_name, resample_id=resample_id, + data_transforms=transform, build_train_file=train_fold, write_attributes=write_attributes, att_max_shape=att_max_shape, diff --git a/tsml_eval/experiments/clustering_experiments.py b/tsml_eval/experiments/clustering_experiments.py index 72b803b8..3f501add 100644 --- a/tsml_eval/experiments/clustering_experiments.py +++ b/tsml_eval/experiments/clustering_experiments.py @@ -18,8 +18,11 @@ import numba from aeon.utils.validation._dependencies import _check_soft_dependencies -from tsml_eval.experiments import load_and_run_clustering_experiment -from tsml_eval.experiments.set_clusterer import get_clusterer_by_name +from tsml_eval.experiments import ( + get_clusterer_by_name, + get_data_transform_by_name, + load_and_run_clustering_experiment, +) from tsml_eval.experiments.tests import _CLUSTERER_RESULTS_PATH from tsml_eval.testing.testing_utils import _TEST_DATA_PATH from tsml_eval.utils.arguments import parse_args @@ -88,10 +91,19 @@ def run_experiment(args): row_normalise=args.row_normalise, **args.kwargs, ), - row_normalise=args.row_normalise, n_clusters=args.n_clusters, clusterer_name=args.estimator_name, resample_id=args.resample_id, + data_transforms=get_data_transform_by_name( + args.data_transform_name, + row_normalise=args.row_normalise, + random_state=( + args.resample_id + if args.random_seed is None + else args.random_seed + ), + n_jobs=1, + ), build_test_file=args.test_fold, write_attributes=args.write_attributes, att_max_shape=args.att_max_shape, @@ -110,6 +122,7 @@ def run_experiment(args): estimator_name = "KMeans" dataset_name = "MinimalChinatown" row_normalise = False + transform_name = None n_clusters = -1 resample_id = 0 test_fold = False @@ -133,6 +146,11 @@ def run_experiment(args): row_normalise=row_normalise, **kwargs, ) + transform = get_data_transform_by_name( + transform_name, + row_normalise=row_normalise, + random_state=resample_id, + ) print(f"Local Run of {estimator_name} ({clusterer.__class__.__name__}).") load_and_run_clustering_experiment( @@ -140,10 +158,10 @@ def run_experiment(args): results_path, dataset_name, clusterer, - row_normalise=row_normalise, n_clusters=n_clusters, clusterer_name=estimator_name, resample_id=resample_id, + data_transforms=transform, build_test_file=test_fold, write_attributes=write_attributes, att_max_shape=att_max_shape, diff --git a/tsml_eval/experiments/cross_validation.py b/tsml_eval/experiments/cross_validation.py index f63b7f2c..b8d2090f 100644 --- a/tsml_eval/experiments/cross_validation.py +++ b/tsml_eval/experiments/cross_validation.py @@ -24,7 +24,6 @@ def classification_cross_validation( results_path, cv=None, fold_ids=None, - row_normalise=False, classifier_name=None, dataset_name="N/A", build_test_file=True, @@ -108,7 +107,6 @@ def classification_cross_validation( y[test], estimator, results_path, - row_normalise=row_normalise, classifier_name=classifier_name, dataset_name=dataset_name, resample_id=fold, @@ -155,7 +153,6 @@ def regression_cross_validation( results_path, cv=None, fold_ids=None, - row_normalise=False, regressor_name=None, dataset_name="", build_test_file=True, @@ -229,7 +226,6 @@ def regression_cross_validation( y[test], estimator, results_path, - row_normalise=row_normalise, regressor_name=regressor_name, dataset_name=dataset_name, resample_id=fold, diff --git a/tsml_eval/experiments/experiments.py b/tsml_eval/experiments/experiments.py index afa39b15..8e1c224f 100644 --- a/tsml_eval/experiments/experiments.py +++ b/tsml_eval/experiments/experiments.py @@ -19,6 +19,7 @@ import time import warnings from datetime import datetime +from typing import Optional, Union import numpy as np import pandas as pd @@ -27,7 +28,6 @@ from aeon.clustering import BaseClusterer from aeon.forecasting import BaseForecaster from aeon.regression.base import BaseRegressor -from aeon.transformations.collection import Normalizer from sklearn import preprocessing from sklearn.base import BaseEstimator, is_classifier, is_regressor from sklearn.metrics import ( @@ -59,24 +59,24 @@ write_regression_results, ) -if os.getenv("MEMRECORD_INTERVAL") is not None: # pragma: no cover - TEMP = os.getenv("MEMRECORD_INTERVAL") - MEMRECORD_INTERVAL = float(TEMP) if isinstance(TEMP, str) else 5.0 +MEMRECORD_ENV = os.getenv("MEMRECORD_INTERVAL") +if isinstance(MEMRECORD_ENV, str): # pragma: no cover + MEMRECORD_INTERVAL = float(MEMRECORD_ENV) else: MEMRECORD_INTERVAL = 5.0 def run_classification_experiment( - X_train, - y_train, - X_test, - y_test, + X_train: Union[np.ndarray, list], + y_train: np.ndarray, + X_test: Union[np.ndarray, list], + y_test: np.ndarray, classifier, results_path, - row_normalise=False, classifier_name=None, dataset_name="N/A", resample_id=None, + data_transforms=None, build_test_file=True, build_train_file=False, ignore_custom_train_estimate=False, @@ -92,21 +92,22 @@ def run_classification_experiment( Parameters ---------- - X_train : pd.DataFrame or np.array todo - The data to train the classifier. + X_train : np.ndarray or list of np.ndarray + The data to train the classifier. Numpy array or list of numpy arrays in the + ``aeon`` data format. y_train : np.array - Training data class labels. - X_test : pd.DataFrame or np.array - The data used to test the trained classifier. + Training data class labels. One label per case in the training data using the + same ordering. + X_test : np.ndarray or list of np.ndarray + The data used to test the trained classifier. Numpy array or list of numpy + arrays in the ``aeon`` data format. y_test : np.array - Testing data class labels. + Testing data class labels. One label per case in the testing data using the + same ordering. classifier : BaseClassifier Classifier to be used in the experiment. results_path : str Location of where to write results. Any required directories will be created. - row_normalise : bool, default=False - Whether to normalise the data rows (time series) prior to fitting and - predicting. classifier_name : str or None, default=None Name of classifier used in writing results. If None, the name is taken from the classifier. @@ -115,6 +116,11 @@ def run_classification_experiment( resample_id : int or None, default=None Seed for resampling. If set to 0, the default train/test split from file is used. Also used in output file name. + data_transforms : transformer, list of transformers or None, default=None + Transformer(s) to apply to the data before running the experiment. + If a list, the transformers are applied in order. + If None, no transformation is applied. + Calls fit_transform on the training data and transform on the test data. build_test_file : bool, default=True: Whether to generate test files or not. If the classifier can generate its own train probabilities, the classifier will be built but no file will be output. @@ -162,10 +168,13 @@ def run_classification_experiment( else: raise TypeError("classifier must be a tsml, aeon or sklearn classifier.") - if row_normalise: - scaler = Normalizer() - X_train = scaler.fit_transform(X_train) - X_test = scaler.fit_transform(X_test) + if data_transforms is not None: + if not isinstance(data_transforms, list): + data_transforms = [data_transforms] + + for transform in data_transforms: + X_train = transform.fit_transform(X_train, y_train) + X_test = transform.transform(X_test, y_test) le = preprocessing.LabelEncoder() y_train = le.fit_transform(y_train) @@ -293,9 +302,9 @@ def load_and_run_classification_experiment( results_path, dataset, classifier, - row_normalise=False, classifier_name=None, resample_id=0, + data_transforms=None, build_train_file=False, write_attributes=False, att_max_shape=0, @@ -320,15 +329,17 @@ def load_and_run_classification_experiment( same for "_TEST.ts". classifier : BaseClassifier Classifier to be used in the experiment. - row_normalise : bool, default=False - Whether to normalise the data rows (time series) prior to fitting and - predicting. classifier_name : str or None, default=None Name of classifier used in writing results. If None, the name is taken from the classifier. resample_id : int, default=0 Seed for resampling. If set to 0, the default train/test split from file is used. Also used in output file name. + data_transforms : transformer, list of transformers or None, default=None + Transformer(s) to apply to the data before running the experiment. + If a list, the transformers are applied in order. + If None, no transformation is applied. + Calls fit_transform on the training data and transform on the test data. build_train_file : bool, default=False Whether to generate train files or not. If true, it performs a 10-fold cross-validation on the train data and saves. If the classifier can produce its @@ -382,10 +393,10 @@ def load_and_run_classification_experiment( y_test, classifier, results_path, - row_normalise=row_normalise, classifier_name=classifier_name, dataset_name=dataset, resample_id=resample_id, + data_transforms=data_transforms, build_test_file=build_test_file, build_train_file=build_train_file, attribute_file_path=attribute_file_path, @@ -395,16 +406,16 @@ def load_and_run_classification_experiment( def run_regression_experiment( - X_train, - y_train, - X_test, - y_test, + X_train: Union[np.ndarray, list], + y_train: np.ndarray, + X_test: Union[np.ndarray, list], + y_test: np.ndarray, regressor, results_path, - row_normalise=False, regressor_name=None, dataset_name="", resample_id=None, + data_transforms=None, build_test_file=True, build_train_file=False, ignore_custom_train_estimate=False, @@ -420,21 +431,22 @@ def run_regression_experiment( Parameters ---------- - X_train : pd.DataFrame or np.array - The data to train the regressor. + X_train : np.ndarray or list of np.ndarray + The data to train the classifier. Numpy array or list of numpy arrays in the + ``aeon`` data format. y_train : np.array - Training data labels. - X_test : pd.DataFrame or np.array - The data used to test the trained regressor. + Training data class labels. One label per case in the training data using the + same ordering. + X_test : np.ndarray or list of np.ndarray + The data used to test the trained classifier. Numpy array or list of numpy + arrays in the ``aeon`` data format. y_test : np.array - Testing data labels. + Testing data class labels. One label per case in the testing data using the + same ordering. regressor : BaseRegressor Regressor to be used in the experiment. results_path : str Location of where to write results. Any required directories will be created. - row_normalise : bool, default=False - Whether to normalise the data rows (time series) prior to fitting and - predicting. regressor_name : str or None, default=None Name of regressor used in writing results. If None, the name is taken from the regressor. @@ -443,6 +455,11 @@ def run_regression_experiment( resample_id : int or None, default=None Seed for resampling. If set to 0, the default train/test split from file is used. Also used in output file name. + data_transforms : transformer, list of transformers or None, default=None + Transformer(s) to apply to the data before running the experiment. + If a list, the transformers are applied in order. + If None, no transformation is applied. + Calls fit_transform on the training data and transform on the test data. build_test_file : bool, default=True: Whether to generate test files or not. If the regressor can generate its own train predictions, the classifier will be built but no file will be output. @@ -490,10 +507,13 @@ def run_regression_experiment( else: raise TypeError("regressor must be a tsml, aeon or sklearn regressor.") - if row_normalise: - scaler = Normalizer() - X_train = scaler.fit_transform(X_train) - X_test = scaler.fit_transform(X_test) + if data_transforms is not None: + if not isinstance(data_transforms, list): + data_transforms = [data_transforms] + + for transform in data_transforms: + X_train = transform.fit_transform(X_train, y_train) + X_test = transform.transform(X_test, y_test) needs_fit = True fit_time = -1 @@ -598,9 +618,9 @@ def load_and_run_regression_experiment( results_path, dataset, regressor, - row_normalise=False, regressor_name=None, resample_id=0, + data_transforms=None, build_train_file=False, write_attributes=False, att_max_shape=0, @@ -625,15 +645,17 @@ def load_and_run_regression_experiment( same for "_TEST.ts". regressor : BaseRegressor Regressor to be used in the experiment. - row_normalise : bool, default=False - Whether to normalise the data rows (time series) prior to fitting and - predicting. regressor_name : str or None, default=None Name of regressor used in writing results. If None, the name is taken from the regressor. resample_id : int, default=0 Seed for resampling. If set to 0, the default train/test split from file is used. Also used in output file name. + data_transforms : transformer, list of transformers or None, default=None + Transformer(s) to apply to the data before running the experiment. + If a list, the transformers are applied in order. + If None, no transformation is applied. + Calls fit_transform on the training data and transform on the test data. build_train_file : bool, default=False Whether to generate train files or not. If true, it performs a 10-fold cross-validation on the train data and saves. If the regressor can produce its @@ -691,10 +713,10 @@ def load_and_run_regression_experiment( y_test, regressor, results_path, - row_normalise=row_normalise, regressor_name=regressor_name, dataset_name=dataset, resample_id=resample_id, + data_transforms=data_transforms, build_test_file=build_test_file, build_train_file=build_train_file, attribute_file_path=attribute_file_path, @@ -704,17 +726,17 @@ def load_and_run_regression_experiment( def run_clustering_experiment( - X_train, - y_train, + X_train: Union[np.ndarray, list], + y_train: np.ndarray, clusterer, results_path, - X_test=None, - y_test=None, - row_normalise=False, + X_test: Optional[Union[np.ndarray, list]] = None, + y_test: Optional[np.ndarray] = None, n_clusters=None, clusterer_name=None, dataset_name="N/A", resample_id=None, + data_transforms=None, build_test_file=False, build_train_file=True, attribute_file_path=None, @@ -729,21 +751,22 @@ def run_clustering_experiment( Parameters ---------- - X_train : pd.DataFrame or np.array - The data to train the clusterer. + X_train : np.ndarray or list of np.ndarray + The data to train the classifier. Numpy array or list of numpy arrays in the + ``aeon`` data format. y_train : np.array - Training data class labels (used for evaluation). + Training data class labels. One label per case in the training data using the + same ordering. clusterer : BaseClusterer Clusterer to be used in the experiment. results_path : str Location of where to write results. Any required directories will be created. - X_test : pd.DataFrame or np.array, default=None - The data used to test the fitted clusterer. - y_test : np.array, default=None - Testing data class labels. - row_normalise : bool, default=False - Whether to normalise the data rows (time series) prior to fitting and - predicting. + X_test : np.ndarray or list of np.ndarray + The data used to test the trained classifier. Numpy array or list of numpy + arrays in the ``aeon`` data format. + y_test : np.array + Testing data class labels. One label per case in the testing data using the + same ordering. n_clusters : int or None, default=None Number of clusters to use if the clusterer has an `n_clusters` parameter. If None, the clusterers default is used. If -1, the number of classes in the @@ -762,6 +785,11 @@ def run_clustering_experiment( resample_id : int or None, default=None Seed for resampling. If set to 0, the default train/test split from file is used. Also used in output file name. + data_transforms : transformer, list of transformers or None, default=None + Transformer(s) to apply to the data before running the experiment. + If a list, the transformers are applied in order. + If None, no transformation is applied. + Calls fit_transform on the training data and transform on the test data. build_test_file : bool, default=False: Whether to generate test files or not. If True, X_test and y_test must be provided. @@ -801,11 +829,14 @@ def run_clustering_experiment( if build_test_file and (X_test is None or y_test is None): raise ValueError("Test data and labels not provided, cannot build test file.") - if row_normalise: - scaler = Normalizer() - X_train = scaler.fit_transform(X_train) - if build_test_file: - X_test = scaler.fit_transform(X_test) + if data_transforms is not None: + if not isinstance(data_transforms, list): + data_transforms = [data_transforms] + + for transform in data_transforms: + X_train = transform.fit_transform(X_train, y_train) + if build_test_file: + X_test = transform.transform(X_test, y_test) le = preprocessing.LabelEncoder() y_train = le.fit_transform(y_train) @@ -949,10 +980,10 @@ def load_and_run_clustering_experiment( results_path, dataset, clusterer, - row_normalise=False, n_clusters=None, clusterer_name=None, resample_id=0, + data_transforms=None, build_test_file=False, write_attributes=False, att_max_shape=0, @@ -978,9 +1009,6 @@ def load_and_run_clustering_experiment( same for "_TEST.ts". clusterer : BaseClusterer Clusterer to be used in the experiment. - row_normalise : bool, default=False - Whether to normalise the data rows (time series) prior to fitting and - predicting. n_clusters : int or None, default=None Number of clusters to use if the clusterer has an `n_clusters` parameter. If None, the clusterers default is used. If -1, the number of classes in the @@ -994,6 +1022,11 @@ def load_and_run_clustering_experiment( resample_id : int, default=0 Seed for resampling. If set to 0, the default train/test split from file is used. Also used in output file name. + data_transforms : transformer, list of transformers or None, default=None + Transformer(s) to apply to the data before running the experiment. + If a list, the transformers are applied in order. + If None, no transformation is applied. + Calls fit_transform on the training data and transform on the test data. build_test_file : bool, default=False Whether to generate test files or not. If true, the clusterer will assign clusters to the loaded test data. @@ -1063,11 +1096,11 @@ def load_and_run_clustering_experiment( results_path, X_test=X_test, y_test=y_test, - row_normalise=row_normalise, n_clusters=n_clusters, clusterer_name=clusterer_name, dataset_name=dataset, resample_id=resample_id, + data_transforms=data_transforms, build_train_file=build_train_file, build_test_file=build_test_file, attribute_file_path=attribute_file_path, diff --git a/tsml_eval/experiments/forecasting_experiments.py b/tsml_eval/experiments/forecasting_experiments.py index 2ea19df8..8c8d17fd 100644 --- a/tsml_eval/experiments/forecasting_experiments.py +++ b/tsml_eval/experiments/forecasting_experiments.py @@ -18,9 +18,10 @@ import numba from aeon.utils.validation._dependencies import _check_soft_dependencies +from tsml_eval.experiments._get_forecaster import get_forecaster_by_name + # todo replace when added back to init from tsml_eval.experiments.experiments import load_and_run_forecasting_experiment -from tsml_eval.experiments.set_forecaster import get_forecaster_by_name from tsml_eval.experiments.tests import _FORECASTER_RESULTS_PATH from tsml_eval.testing.testing_utils import _TEST_DATA_PATH from tsml_eval.utils.arguments import parse_args diff --git a/tsml_eval/experiments/regression_experiments.py b/tsml_eval/experiments/regression_experiments.py index e8691dc2..cb1449bc 100644 --- a/tsml_eval/experiments/regression_experiments.py +++ b/tsml_eval/experiments/regression_experiments.py @@ -18,8 +18,11 @@ import numba from aeon.utils.validation._dependencies import _check_soft_dependencies -from tsml_eval.experiments import load_and_run_regression_experiment -from tsml_eval.experiments.set_regressor import get_regressor_by_name +from tsml_eval.experiments import ( + get_data_transform_by_name, + get_regressor_by_name, + load_and_run_regression_experiment, +) from tsml_eval.experiments.tests import _REGRESSOR_RESULTS_PATH from tsml_eval.testing.testing_utils import _TEST_DATA_PATH from tsml_eval.utils.arguments import parse_args @@ -81,9 +84,18 @@ def run_experiment(args): checkpoint=args.checkpoint, **args.kwargs, ), - row_normalise=args.row_normalise, regressor_name=args.estimator_name, resample_id=args.resample_id, + data_transforms=get_data_transform_by_name( + args.data_transform_name, + row_normalise=args.row_normalise, + random_state=( + args.resample_id + if args.random_seed is None + else args.random_seed + ), + n_jobs=1, + ), build_train_file=args.train_fold, write_attributes=args.write_attributes, att_max_shape=args.att_max_shape, @@ -101,6 +113,7 @@ def run_experiment(args): estimator_name = "ROCKET" dataset_name = "MinimalGasPrices" row_normalise = False + transform_name = None resample_id = 0 train_fold = False write_attributes = True @@ -120,6 +133,11 @@ def run_experiment(args): checkpoint=checkpoint, **kwargs, ) + transform = get_data_transform_by_name( + transform_name, + row_normalise=row_normalise, + random_state=resample_id, + ) print(f"Local Run of {estimator_name} ({regressor.__class__.__name__}).") load_and_run_regression_experiment( @@ -127,9 +145,9 @@ def run_experiment(args): results_path, dataset_name, regressor, - row_normalise=row_normalise, regressor_name=estimator_name, resample_id=resample_id, + data_transforms=transform, build_train_file=train_fold, write_attributes=write_attributes, att_max_shape=att_max_shape, diff --git a/tsml_eval/experiments/tests/test_classification.py b/tsml_eval/experiments/tests/test_classification.py index 389b07cf..e08ac75b 100644 --- a/tsml_eval/experiments/tests/test_classification.py +++ b/tsml_eval/experiments/tests/test_classification.py @@ -9,10 +9,10 @@ from tsml_eval.datasets._test_data._data_sizes import DATA_TEST_SIZES from tsml_eval.experiments import ( + _get_classifier, classification_experiments, get_classifier_by_name, run_classification_experiment, - set_classifier, threaded_classification_experiments, ) from tsml_eval.experiments.tests import _CLASSIFIER_RESULTS_PATH @@ -109,8 +109,10 @@ def test_run_threaded_classification_experiment(): "1", "-nj", "2", - # also test normalisation and benchmark time here + # also test transforms and benchmark time here "--row_normalise", + "--data_transform_name", + "Padder", "--benchmark_time", ] @@ -170,16 +172,16 @@ def test_run_classification_experiment_invalid_estimator(): def test_get_classifier_by_name(): """Test get_classifier_by_name method.""" classifier_lists = [ - set_classifier.convolution_based_classifiers, - set_classifier.deep_learning_classifiers, - set_classifier.dictionary_based_classifiers, - set_classifier.distance_based_classifiers, - set_classifier.feature_based_classifiers, - set_classifier.hybrid_classifiers, - set_classifier.interval_based_classifiers, - set_classifier.other_classifiers, - set_classifier.shapelet_based_classifiers, - set_classifier.vector_classifiers, + _get_classifier.convolution_based_classifiers, + _get_classifier.deep_learning_classifiers, + _get_classifier.dictionary_based_classifiers, + _get_classifier.distance_based_classifiers, + _get_classifier.feature_based_classifiers, + _get_classifier.hybrid_classifiers, + _get_classifier.interval_based_classifiers, + _get_classifier.other_classifiers, + _get_classifier.shapelet_based_classifiers, + _get_classifier.vector_classifiers, ] classifier_dict = {} @@ -194,7 +196,9 @@ def test_get_classifier_by_name(): ) _check_set_method_results( - classifier_dict, estimator_name="Classifiers", method_name="set_classifier" + classifier_dict, + estimator_name="Classifiers", + method_name="get_classifier_by_name", ) diff --git a/tsml_eval/experiments/tests/test_clustering.py b/tsml_eval/experiments/tests/test_clustering.py index 787b9c6e..853e7f2b 100644 --- a/tsml_eval/experiments/tests/test_clustering.py +++ b/tsml_eval/experiments/tests/test_clustering.py @@ -11,10 +11,10 @@ from tsml_eval.datasets._test_data._data_sizes import DATA_TEST_SIZES, DATA_TRAIN_SIZES from tsml_eval.experiments import ( + _get_clusterer, clustering_experiments, get_clusterer_by_name, run_clustering_experiment, - set_clusterer, threaded_clustering_experiments, ) from tsml_eval.experiments.tests import _CLUSTERER_RESULTS_PATH @@ -108,8 +108,10 @@ def test_run_threaded_clustering_experiment(): "1", "-nj", "2", - # also test normalisation and benchmark time here + # also test transforms and benchmark time here "--row_normalise", + "--data_transform_name", + "Padder", "--benchmark_time", "-te", ] @@ -172,11 +174,11 @@ def test_run_clustering_experiment_invalid_estimator(): def test_get_clusterer_by_name(): """Test get_clusterer_by_name method.""" clusterer_lists = [ - set_clusterer.deep_learning_clusterers, - set_clusterer.distance_based_clusterers, - set_clusterer.feature_based_clusterers, - set_clusterer.other_clusterers, - set_clusterer.vector_clusterers, + _get_clusterer.deep_learning_clusterers, + _get_clusterer.distance_based_clusterers, + _get_clusterer.feature_based_clusterers, + _get_clusterer.other_clusterers, + _get_clusterer.vector_clusterers, ] clusterer_non_default_params = [ "clusterer", @@ -211,7 +213,7 @@ def test_get_clusterer_by_name(): ) _check_set_method_results( - clusterer_dict, estimator_name="Clusterers", method_name="set_clusterer" + clusterer_dict, estimator_name="Clusterers", method_name="get_clusterer_by_name" ) diff --git a/tsml_eval/experiments/tests/test_data_transform.py b/tsml_eval/experiments/tests/test_data_transform.py new file mode 100644 index 00000000..60538976 --- /dev/null +++ b/tsml_eval/experiments/tests/test_data_transform.py @@ -0,0 +1,44 @@ +"""Tests for data transforms in experiments.""" + +import pytest +from aeon.transformations.collection import Normalizer, Padder + +from tsml_eval.experiments import _get_data_transform, get_data_transform_by_name +from tsml_eval.testing.testing_utils import _check_set_method, _check_set_method_results + + +def test_get_data_transform_by_name(): + """Test get_data_transform_by_name method.""" + transform_lists = [_get_data_transform.transformers] + + transform_dict = {} + all_transform_names = [] + + for transform_list in transform_lists: + _check_set_method( + get_data_transform_by_name, + transform_list, + transform_dict, + all_transform_names, + ) + + _check_set_method_results( + transform_dict, + estimator_name="Transformers", + method_name="get_data_transform_by_name", + ) + + +def test_get_data_transform_by_name_multiple_output(): + """Test get_data_transform_by_name method with multiple inputs and outputs.""" + t = get_data_transform_by_name(["padder", "normaliser"], row_normalise=True) + assert len(t) == 3 + assert isinstance(t[0], Normalizer) + assert isinstance(t[1], Padder) + assert isinstance(t[2], Normalizer) + + +def test_get_data_transform_by_name_invalid(): + """Test get_data_transform_by_name method with invalid estimator.""" + with pytest.raises(ValueError, match="UNKNOWN TRANSFORMER"): + get_data_transform_by_name("invalid") diff --git a/tsml_eval/experiments/tests/test_forecasting.py b/tsml_eval/experiments/tests/test_forecasting.py index 21c307d9..a98e8a43 100644 --- a/tsml_eval/experiments/tests/test_forecasting.py +++ b/tsml_eval/experiments/tests/test_forecasting.py @@ -93,8 +93,10 @@ # "1", # "-nj", # "2", -# # also test normalisation and benchmark time here +# # also test transforms and benchmark time here # "--row_normalise", +# "--data_transform_name", +# "Padder", # "--benchmark_time", # ] # @@ -152,7 +154,9 @@ # ) # # _check_set_method_results( -# forecaster_dict, estimator_name="Forecasters", method_name="set_forecaster" +# forecaster_dict, +# estimator_name="Forecasters", +# method_name="get_forecaster_by_name", # ) # # diff --git a/tsml_eval/experiments/tests/test_regression.py b/tsml_eval/experiments/tests/test_regression.py index 1f1f0b45..9cb25dde 100644 --- a/tsml_eval/experiments/tests/test_regression.py +++ b/tsml_eval/experiments/tests/test_regression.py @@ -11,10 +11,10 @@ from tsml_eval.datasets._test_data._data_sizes import DATA_TEST_SIZES from tsml_eval.experiments import ( + _get_regressor, get_regressor_by_name, regression_experiments, run_regression_experiment, - set_regressor, threaded_regression_experiments, ) from tsml_eval.experiments.tests import _REGRESSOR_RESULTS_PATH @@ -109,8 +109,10 @@ def test_run_threaded_regression_experiment(): "1", "-nj", "2", - # also test normalisation and benchmark time here + # also test transforms and benchmark time here "--row_normalise", + "--data_transform_name", + "Padder", "--benchmark_time", ] @@ -169,15 +171,15 @@ def test_run_regression_experiment_invalid_estimator(): def test_get_regressor_by_name(): """Test get_regressor_by_name method.""" regressor_lists = [ - set_regressor.convolution_based_regressors, - set_regressor.deep_learning_regressors, - set_regressor.distance_based_regressors, - set_regressor.feature_based_regressors, - set_regressor.hybrid_regressors, - set_regressor.interval_based_regressors, - set_regressor.other_regressors, - set_regressor.shapelet_based_regressors, - set_regressor.vector_regressors, + _get_regressor.convolution_based_regressors, + _get_regressor.deep_learning_regressors, + _get_regressor.distance_based_regressors, + _get_regressor.feature_based_regressors, + _get_regressor.hybrid_regressors, + _get_regressor.interval_based_regressors, + _get_regressor.other_regressors, + _get_regressor.shapelet_based_regressors, + _get_regressor.vector_regressors, ] regressor_dict = {} @@ -192,7 +194,7 @@ def test_get_regressor_by_name(): ) _check_set_method_results( - regressor_dict, estimator_name="Regressors", method_name="set_regressor" + regressor_dict, estimator_name="Regressors", method_name="get_regressor_by_name" ) diff --git a/tsml_eval/experiments/threaded_classification_experiments.py b/tsml_eval/experiments/threaded_classification_experiments.py index 07799536..2aa06ec5 100644 --- a/tsml_eval/experiments/threaded_classification_experiments.py +++ b/tsml_eval/experiments/threaded_classification_experiments.py @@ -8,8 +8,11 @@ import sys -from tsml_eval.experiments import load_and_run_classification_experiment -from tsml_eval.experiments.set_classifier import get_classifier_by_name +from tsml_eval.experiments import ( + get_classifier_by_name, + get_data_transform_by_name, + load_and_run_classification_experiment, +) from tsml_eval.experiments.tests import _CLASSIFIER_RESULTS_PATH from tsml_eval.testing.testing_utils import _TEST_DATA_PATH from tsml_eval.utils.arguments import parse_args @@ -57,9 +60,18 @@ def run_experiment(args): checkpoint=args.checkpoint, **args.kwargs, ), - row_normalise=args.row_normalise, classifier_name=args.estimator_name, resample_id=args.resample_id, + data_transforms=get_data_transform_by_name( + args.data_transform_name, + row_normalise=args.row_normalise, + random_state=( + args.resample_id + if args.random_seed is None + else args.random_seed + ), + n_jobs=args.n_jobs, + ), build_train_file=args.train_fold, write_attributes=args.write_attributes, att_max_shape=args.att_max_shape, @@ -76,6 +88,7 @@ def run_experiment(args): estimator_name = "ROCKET" dataset_name = "MinimalChinatown" row_normalise = False + transform_name = None resample_id = 0 n_jobs = 1 train_fold = False @@ -96,6 +109,12 @@ def run_experiment(args): checkpoint=checkpoint, **kwargs, ) + transform = get_data_transform_by_name( + transform_name, + row_normalise=row_normalise, + random_state=resample_id, + n_jobs=n_jobs, + ) print(f"Local Run of {estimator_name} ({classifier.__class__.__name__}).") load_and_run_classification_experiment( @@ -103,9 +122,9 @@ def run_experiment(args): results_path, dataset_name, classifier, - row_normalise=row_normalise, classifier_name=estimator_name, resample_id=resample_id, + data_transforms=transform, build_train_file=train_fold, write_attributes=write_attributes, att_max_shape=att_max_shape, diff --git a/tsml_eval/experiments/threaded_clustering_experiments.py b/tsml_eval/experiments/threaded_clustering_experiments.py index 06420984..429850ff 100644 --- a/tsml_eval/experiments/threaded_clustering_experiments.py +++ b/tsml_eval/experiments/threaded_clustering_experiments.py @@ -9,8 +9,11 @@ import sys -from tsml_eval.experiments import load_and_run_clustering_experiment -from tsml_eval.experiments.set_clusterer import get_clusterer_by_name +from tsml_eval.experiments import ( + get_clusterer_by_name, + get_data_transform_by_name, + load_and_run_clustering_experiment, +) from tsml_eval.experiments.tests import _CLUSTERER_RESULTS_PATH from tsml_eval.testing.testing_utils import _TEST_DATA_PATH from tsml_eval.utils.arguments import parse_args @@ -65,10 +68,19 @@ def run_experiment(args): row_normalise=args.row_normalise, **args.kwargs, ), - row_normalise=args.row_normalise, n_clusters=args.n_clusters, clusterer_name=args.estimator_name, resample_id=args.resample_id, + data_transforms=get_data_transform_by_name( + args.data_transform_name, + row_normalise=args.row_normalise, + random_state=( + args.resample_id + if args.random_seed is None + else args.random_seed + ), + n_jobs=args.n_jobs, + ), build_test_file=args.test_fold, write_attributes=args.write_attributes, att_max_shape=args.att_max_shape, @@ -85,6 +97,7 @@ def run_experiment(args): estimator_name = "KMeans" dataset_name = "MinimalChinatown" row_normalise = False + transform_name = None n_clusters = -1 resample_id = 0 n_jobs = 1 @@ -108,6 +121,12 @@ def run_experiment(args): row_normalise=row_normalise, **kwargs, ) + transform = get_data_transform_by_name( + transform_name, + row_normalise=row_normalise, + random_state=resample_id, + n_jobs=n_jobs, + ) print(f"Local Run of {estimator_name} ({clusterer.__class__.__name__}).") load_and_run_clustering_experiment( @@ -115,10 +134,10 @@ def run_experiment(args): results_path, dataset_name, clusterer, - row_normalise=row_normalise, n_clusters=n_clusters, clusterer_name=estimator_name, resample_id=resample_id, + data_transforms=transform, build_test_file=test_fold, write_attributes=write_attributes, att_max_shape=att_max_shape, diff --git a/tsml_eval/experiments/threaded_forecasting_experiments.py b/tsml_eval/experiments/threaded_forecasting_experiments.py index 88b8afb3..cccd340d 100644 --- a/tsml_eval/experiments/threaded_forecasting_experiments.py +++ b/tsml_eval/experiments/threaded_forecasting_experiments.py @@ -8,9 +8,10 @@ import sys +from tsml_eval.experiments._get_forecaster import get_forecaster_by_name + # todo replace when added back to init from tsml_eval.experiments.experiments import load_and_run_forecasting_experiment -from tsml_eval.experiments.set_forecaster import get_forecaster_by_name from tsml_eval.experiments.tests import _FORECASTER_RESULTS_PATH from tsml_eval.testing.testing_utils import _TEST_DATA_PATH from tsml_eval.utils.arguments import parse_args diff --git a/tsml_eval/experiments/threaded_regression_experiments.py b/tsml_eval/experiments/threaded_regression_experiments.py index 2fd5477d..0718f2cf 100644 --- a/tsml_eval/experiments/threaded_regression_experiments.py +++ b/tsml_eval/experiments/threaded_regression_experiments.py @@ -9,8 +9,11 @@ import sys -from tsml_eval.experiments import load_and_run_regression_experiment -from tsml_eval.experiments.set_regressor import get_regressor_by_name +from tsml_eval.experiments import ( + get_data_transform_by_name, + get_regressor_by_name, + load_and_run_regression_experiment, +) from tsml_eval.experiments.tests import _REGRESSOR_RESULTS_PATH from tsml_eval.testing.testing_utils import _TEST_DATA_PATH from tsml_eval.utils.arguments import parse_args @@ -58,9 +61,18 @@ def run_experiment(args): checkpoint=args.checkpoint, **args.kwargs, ), - row_normalise=args.row_normalise, regressor_name=args.estimator_name, resample_id=args.resample_id, + data_transforms=get_data_transform_by_name( + args.data_transform_name, + row_normalise=args.row_normalise, + random_state=( + args.resample_id + if args.random_seed is None + else args.random_seed + ), + n_jobs=args.n_jobs, + ), build_train_file=args.train_fold, write_attributes=args.write_attributes, att_max_shape=args.att_max_shape, @@ -77,6 +89,7 @@ def run_experiment(args): estimator_name = "ROCKET" dataset_name = "MinimalGasPrices" row_normalise = False + transform_name = None resample_id = 0 n_jobs = 1 train_fold = False @@ -97,6 +110,12 @@ def run_experiment(args): checkpoint=checkpoint, **kwargs, ) + transform = get_data_transform_by_name( + transform_name, + row_normalise=row_normalise, + random_state=resample_id, + n_jobs=n_jobs, + ) print(f"Local Run of {estimator_name} ({regressor.__class__.__name__}).") load_and_run_regression_experiment( @@ -104,9 +123,9 @@ def run_experiment(args): results_path, dataset_name, regressor, - row_normalise=row_normalise, regressor_name=estimator_name, resample_id=resample_id, + data_transforms=transform, build_train_file=train_fold, write_attributes=write_attributes, att_max_shape=att_max_shape, diff --git a/tsml_eval/publications/y2023/distance_based_clustering/run_distance_experiments.py b/tsml_eval/publications/y2023/distance_based_clustering/run_distance_experiments.py index 355007e9..e497df23 100644 --- a/tsml_eval/publications/y2023/distance_based_clustering/run_distance_experiments.py +++ b/tsml_eval/publications/y2023/distance_based_clustering/run_distance_experiments.py @@ -5,6 +5,8 @@ import os import sys +from aeon.transformations.collection import Normalizer + os.environ["MKL_NUM_THREADS"] = "1" # must be done before numpy import!! os.environ["NUMEXPR_NUM_THREADS"] = "1" # must be done before numpy import!! os.environ["OMP_NUM_THREADS"] = "1" # must be done before numpy import!! @@ -132,10 +134,10 @@ def _run_experiment(args): if isinstance(clusterer, str) else _clone_estimator(clusterer, resample_id) ), - row_normalise=normalise, n_clusters=-1, clusterer_name=clusterer, resample_id=resample_id, + data_transforms=Normalizer() if normalise else None, build_test_file=True, overwrite=overwrite, ) diff --git a/tsml_eval/publications/y2023/rist_pipeline/set_rist_classifier.py b/tsml_eval/publications/y2023/rist_pipeline/set_rist_classifier.py index 29fa848b..12b58236 100644 --- a/tsml_eval/publications/y2023/rist_pipeline/set_rist_classifier.py +++ b/tsml_eval/publications/y2023/rist_pipeline/set_rist_classifier.py @@ -110,11 +110,11 @@ def _set_rist_classifier( elif ( c == "randomintervalclassifier" or c == "intervalpipeline" or c == "i-pipeline" ): + from aeon.transformations.collection.feature_based import Catch22 from sklearn.ensemble import ExtraTreesClassifier from tsml.interval_based import RandomIntervalClassifier from tsml.transformations import ( ARCoefficientTransformer, - Catch22Transformer, FunctionTransformer, PeriodogramTransformer, ) @@ -136,7 +136,7 @@ def sqrt_times_15_plus_5_mv(X): ) # pragma: no cover interval_features = [ - Catch22Transformer(outlier_norm=True, replace_nans=True), + Catch22(outlier_norm=True, replace_nans=True), row_mean, row_std, row_slope, diff --git a/tsml_eval/publications/y2023/tser_archive_expansion/tser_archive_expansion.ipynb b/tsml_eval/publications/y2023/tser_archive_expansion/tser_archive_expansion.ipynb index 3bdb4d48..541576b1 100644 --- a/tsml_eval/publications/y2023/tser_archive_expansion/tser_archive_expansion.ipynb +++ b/tsml_eval/publications/y2023/tser_archive_expansion/tser_archive_expansion.ipynb @@ -207,7 +207,6 @@ "regressors = [\n", " \"1NN-DTW\",\n", " \"1NN-ED\",\n", - " \"FPCR-Bs\",\n", " \"RandF\",\n", " \"ROCKET\",\n", "]\n", @@ -254,13 +253,6 @@ } ], "execution_count": 4 - }, - { - "cell_type": "markdown", - "source": [], - "metadata": { - "collapsed": false - } } ], "metadata": { diff --git a/tsml_eval/testing/testing_utils.py b/tsml_eval/testing/testing_utils.py index 9dae58fb..32b52b24 100644 --- a/tsml_eval/testing/testing_utils.py +++ b/tsml_eval/testing/testing_utils.py @@ -43,7 +43,7 @@ def _check_set_method( all_estimator_names.append(estimator_alias) try: - e = set_method(estimator_alias) + out = set_method(estimator_alias) except ModuleNotFoundError as err: exempt_errors = [ "optional dependency", @@ -55,16 +55,21 @@ def _check_set_method( else: raise err - assert e is not None, f"Estimator {estimator_alias} not found" - assert isinstance( - e, BaseEstimator - ), f"Estimator {estimator_alias} is not a BaseEstimator" + assert out is not None, f"Estimator {estimator_alias} not found" - c_name = e.__class__.__name__.lower() - if c_name == estimator_alias.lower(): - estimator_dict[c_name] = True - elif c_name not in estimator_dict: - estimator_dict[c_name] = False + if not isinstance(out, list): + out = [out] + + for e in out: + assert isinstance( + e, BaseEstimator + ), f"Estimator {estimator_alias} is not a BaseEstimator" + + e_name = e.__class__.__name__.lower() + if e_name == estimator_alias.lower(): + estimator_dict[e_name] = True + elif e_name not in estimator_dict: + estimator_dict[e_name] = False if return_estimator: estimators.append(e) @@ -75,7 +80,6 @@ def _check_set_method( EXEMPT_ESTIMATOR_NAMES = [ "channelensembleregressor", "gridsearchcv", - "transformedtargetforecaster", ] diff --git a/tsml_eval/utils/arguments.py b/tsml_eval/utils/arguments.py index 7235d3cc..9db60571 100644 --- a/tsml_eval/utils/arguments.py +++ b/tsml_eval/utils/arguments.py @@ -64,8 +64,13 @@ def parse_args(args): -ch, --checkpoint save the estimator fit to file periodically while building. Only used if the estimator can checkpoint (default: False). + -dtn DATA_TRANSFORM_NAME, --data_transform_name DATA_TRANSFORM_NAME + str to pass to get_data_transform_by_name to apply a + transformation to the data prior to running the experiment. + By default no transform is applied (default: None). -rn, --row_normalise normalise the data rows prior to fitting and - predicting. (default: False). + predicting. effectively the same as passing Normalizer to + --data_transform_name (default: False). -nc N_CLUSTERS, --n_clusters N_CLUSTERS the number of clusters to find for clusterers which have an {n_clusters} parameter. If {-1}, use the @@ -197,11 +202,20 @@ def parse_args(args): help="save the estimator fit to file periodically while building. Only used if " "the estimator can checkpoint (default: %(default)s).", ) + parser.add_argument( + "-dtn", + "--data_transform_name", + default=None, + help="str to pass to get_data_transform_by_name to apply a transformation " + "to the data prior to running the experiment. By default no transform " + "is applied (default: %(default)s).", + ) parser.add_argument( "-rn", "--row_normalise", action="store_true", help="normalise the data rows prior to fitting and predicting. " + "effectively the same as passing Normalizer to --data_transform_name " "(default: %(default)s).", ) parser.add_argument(