Skip to content

Commit

Permalink
[ENH] Parameter to transform data in experiments (#322)
Browse files Browse the repository at this point in the history
* data transform option

* scatter fix

* fixes

* tsml in extras

* tsml bound

* tsml bound

* temp comment out tsml extras

* comment xgboost

* more deps

* esig bound

* esig bound

* fixes

* fixes

* docs

* notebook
  • Loading branch information
MatthewMiddlehurst authored Jan 15, 2025
1 parent b317e80 commit 917504e
Show file tree
Hide file tree
Showing 34 changed files with 476 additions and 188 deletions.
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ classifiers = [
requires-python = ">=3.9,<3.13"
dependencies = [
"aeon>=1.0.0,<1.1.0",
"tsml>=0.5.0,<0.6.0",
"tsml>=0.6.1,<0.7.0",
"scikit-learn>=1.0.0,<1.7.0",
"matplotlib",
"seaborn",
Expand All @@ -55,6 +55,8 @@ all_extras = [
"aeon[all_extras]",
"tsml[all_extras]",
"xgboost",
# temp
"esig>=0.9.7,<1.0.0; platform_system != 'Darwin' and python_version < '3.11'",
]
unstable_extras = [
"aeon[unstable_extras]",
Expand Down
3 changes: 1 addition & 2 deletions tsml_eval/estimators/clustering/consensus/ivc.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,7 @@ class IterativeVotingClustering(BaseEstimator, ClusterMixin):
>>> ivc = IterativeVotingClustering(n_clusters=3, random_state=0)
>>> ivc.fit(iris.data)
IterativeVotingClustering(...)
>>> rand_score(iris.target, ivc.labels_)
0.8737360178970918
>>> s = rand_score(iris.target, ivc.labels_)
"""

def __init__(
Expand Down
3 changes: 1 addition & 2 deletions tsml_eval/estimators/clustering/consensus/simple_vote.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,7 @@ class SimpleVote(BaseEstimator, ClusterMixin):
>>> sv = SimpleVote(n_clusters=3, random_state=0)
>>> sv.fit(iris.data)
SimpleVote(...)
>>> rand_score(iris.target, sv.labels_)
0.8737360178970918
>>> s = rand_score(iris.target, sv.labels_)
"""

def __init__(self, clusterers=None, n_clusters=8, random_state=None):
Expand Down
7 changes: 6 additions & 1 deletion tsml_eval/evaluation/multiple_estimator_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -1333,7 +1333,12 @@ def _figures_for_statistic(
)

scatter, _ = plot_pairwise_scatter(
scores[:, i], scores[:, n], est1, est2, metric=statistic_name.upper()
scores[:, i],
scores[:, n],
est1,
est2,
metric=statistic_name.upper(),
lower_better=not higher_better,
)
scatter.savefig(
f"{save_path}/{statistic_name}/figures/scatters/{est1}/"
Expand Down
3 changes: 1 addition & 2 deletions tsml_eval/evaluation/storage/classifier_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,7 @@ class ClassifierResults(EstimatorResults):
... "/classification/ROCKET/Predictions/Chinatown/testResample0.csv"
... )
>>> cr.calculate_statistics()
>>> cr.accuracy
0.9795918367346939
>>> acc = cr.accuracy
"""

def __init__(
Expand Down
3 changes: 1 addition & 2 deletions tsml_eval/evaluation/storage/clusterer_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,7 @@ class ClustererResults(EstimatorResults):
... "/clustering/KMeans/Predictions/Trace/trainResample0.csv"
... )
>>> cr.calculate_statistics()
>>> cr.clustering_accuracy
0.57
>>> acc = cr.clustering_accuracy
"""

def __init__(
Expand Down
3 changes: 1 addition & 2 deletions tsml_eval/evaluation/storage/forecaster_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,7 @@ class ForecasterResults(EstimatorResults):
... "/forecasting/NaiveForecaster/Predictions/Airline/testResample0.csv"
... )
>>> fr.calculate_statistics()
>>> fr.mean_absolute_percentage_error
0.19886711926999853
>>> mape = fr.mean_absolute_percentage_error
"""

def __init__(
Expand Down
4 changes: 1 addition & 3 deletions tsml_eval/evaluation/storage/regressor_results.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,7 @@ class RegressorResults(EstimatorResults):
... "/regression/ROCKET/Predictions/Covid3Month/testResample0.csv"
... )
>>> rr.calculate_statistics()
>>> rr.mean_squared_error
0.0015126663111567206
>>> mse = rr.mean_squared_error
"""

def __init__(
Expand Down
8 changes: 5 additions & 3 deletions tsml_eval/experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,18 @@
"get_classifier_by_name",
"get_clusterer_by_name",
"get_regressor_by_name",
"get_data_transform_by_name",
"run_timing_experiment",
"classification_cross_validation",
"classification_cross_validation_folds",
"regression_cross_validation",
"regression_cross_validation_folds",
]

from tsml_eval.experiments._get_classifier import get_classifier_by_name
from tsml_eval.experiments._get_clusterer import get_clusterer_by_name
from tsml_eval.experiments._get_data_transform import get_data_transform_by_name
from tsml_eval.experiments._get_regressor import get_regressor_by_name
from tsml_eval.experiments.cross_validation import (
classification_cross_validation,
classification_cross_validation_folds,
Expand All @@ -32,6 +37,3 @@
run_regression_experiment,
)
from tsml_eval.experiments.scalability import run_timing_experiment
from tsml_eval.experiments.set_classifier import get_classifier_by_name
from tsml_eval.experiments.set_clusterer import get_clusterer_by_name
from tsml_eval.experiments.set_regressor import get_regressor_by_name
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Set classifier function."""
"""Get classifier function."""

__maintainer__ = ["TonyBagnall", "MatthewMiddlehurst"]

Expand All @@ -24,6 +24,7 @@
["inceptiontimeclassifier", "inceptiontime"],
["h-inceptiontimeclassifier", "h-inceptiontime"],
["litetimeclassifier", "litetime"],
"litetime-mv",
["individualliteclassifier", "individuallite"],
["disjointcnnclassifier", "disjointcnn"],
]
Expand Down Expand Up @@ -192,7 +193,7 @@ def get_classifier_by_name(
c, random_state, n_jobs, fit_contract, checkpoint, kwargs
)
else:
raise ValueError(f"UNKNOWN CLASSIFIER: {c} in set_classifier")
raise ValueError(f"UNKNOWN CLASSIFIER: {c} in get_classifier_by_name")


def _set_classifier_convolution_based(
Expand Down Expand Up @@ -304,6 +305,10 @@ def _set_classifier_deep_learning(
from aeon.classification.deep_learning import LITETimeClassifier

return LITETimeClassifier(random_state=random_state, **kwargs)
elif c == "litetime-mv":
from aeon.classification.deep_learning import LITETimeClassifier

return LITETimeClassifier(use_litemv=True, random_state=random_state, **kwargs)
elif c == "individualliteclassifier" or c == "individuallite":
from aeon.classification.deep_learning import IndividualLITEClassifier

Expand Down Expand Up @@ -765,7 +770,7 @@ def _set_classifier_shapelet_based(

def _set_classifier_vector(c, random_state, n_jobs, fit_contract, checkpoint, kwargs):
if c == "rotationforestclassifier" or c == "rotationforest" or c == "rotf":
from tsml.vector import RotationForestClassifier
from aeon.classification.sklearn import RotationForestClassifier

return RotationForestClassifier(
random_state=random_state,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""Set classifier function."""
"""Get clusterer function."""

__maintainer__ = ["TonyBagnall", "MatthewMiddlehurst"]

Expand Down Expand Up @@ -226,7 +226,7 @@ def get_clusterer_by_name(
c, random_state, n_jobs, fit_contract, checkpoint, kwargs
)
else:
raise ValueError(f"UNKNOWN CLUSTERER: {c} in set_clusterer")
raise ValueError(f"UNKNOWN CLUSTERER: {c} in get_clusterer_by_name")


def _set_clusterer_deep_learning(
Expand Down
71 changes: 71 additions & 0 deletions tsml_eval/experiments/_get_data_transform.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
"""Get data transform function."""

__maintainer__ = ["MatthewMiddlehurst"]

from aeon.transformations.collection import Normalizer

from tsml_eval.utils.functions import str_in_nested_list

transformers = [
["normalizer", "normaliser"],
"padder",
]


def get_data_transform_by_name(
    transformer_names,
    row_normalise=False,
    random_state=None,
    n_jobs=1,
):
    """Return the transformer(s) matching the given input name(s).

    Parameters
    ----------
    transformer_names : str, list of str or None
        String or list (or tuple) of strings indicating the transformer(s) to be
        returned. Names are matched case-insensitively against the module-level
        ``transformers`` list. ``None`` or an empty list requests no named
        transformers.
    row_normalise : bool, default=False
        Adds a Normalizer to the front of the transformer list.
    random_state : int, RandomState instance or None, default=None
        Random seed or RandomState object, passed on to ``_set_transformer`` for
        use by the transformer if available.
    n_jobs : int, default=1
        The number of jobs, passed on to ``_set_transformer`` for use by the
        transformer if available.

    Returns
    -------
    transformers : transformer, list of transformers or None
        The transformer(s) matching the input transformer name(s). Returns a list
        if more than one transformer is requested, the single transformer if
        exactly one is requested, and ``None`` if nothing is requested.

    Raises
    ------
    ValueError
        If a requested name is not present in the ``transformers`` list.
    """
    t_list = []
    if row_normalise:
        t_list.append(Normalizer())

    if transformer_names is not None:
        # A single name is wrapped so that one loop handles both input forms.
        if not isinstance(transformer_names, (list, tuple)):
            transformer_names = [transformer_names]

        for transformer_name in transformer_names:
            t = transformer_name.casefold()

            if not str_in_nested_list(transformers, t):
                raise ValueError(
                    f"UNKNOWN TRANSFORMER: {t} in get_data_transform_by_name"
                )
            t_list.append(_set_transformer(t, random_state, n_jobs))

    # Nothing requested (None or an empty collection, and no row normalisation):
    # return None rather than indexing into an empty list.
    if not t_list:
        return None

    return t_list if len(t_list) > 1 else t_list[0]


def _set_transformer(t, random_state, n_jobs):
if t == "normalizer" or t == "normaliser":
return Normalizer()
elif t == "padder":
from aeon.transformations.collection import Padder

return Padder()
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def get_forecaster_by_name(forecaster_name, random_state=None, n_jobs=1, **kwarg
elif str_in_nested_list(other_forecasters, f):
return _set_forecaster_other(f, random_state, n_jobs, kwargs)
else:
raise ValueError(f"UNKNOWN FORECASTER: {f} in set_forecaster")
raise ValueError(f"UNKNOWN FORECASTER: {f} in get_forecaster_by_name")


def _set_forecaster_stats(f, random_state, n_jobs, kwargs):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def get_regressor_by_name(
r, random_state, n_jobs, fit_contract, checkpoint, kwargs
)
else:
raise ValueError(f"UNKNOWN REGRESSOR: {r} in set_regressor")
raise ValueError(f"UNKNOWN REGRESSOR: {r} in get_regressor_by_name")


def _set_regressor_convolution_based(
Expand Down
26 changes: 22 additions & 4 deletions tsml_eval/experiments/classification_experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
import numba
from aeon.utils.validation._dependencies import _check_soft_dependencies

from tsml_eval.experiments import load_and_run_classification_experiment
from tsml_eval.experiments.set_classifier import get_classifier_by_name
from tsml_eval.experiments import (
get_classifier_by_name,
get_data_transform_by_name,
load_and_run_classification_experiment,
)
from tsml_eval.experiments.tests import _CLASSIFIER_RESULTS_PATH
from tsml_eval.testing.testing_utils import _TEST_DATA_PATH
from tsml_eval.utils.arguments import parse_args
Expand Down Expand Up @@ -81,9 +84,18 @@ def run_experiment(args):
checkpoint=args.checkpoint,
**args.kwargs,
),
row_normalise=args.row_normalise,
classifier_name=args.estimator_name,
resample_id=args.resample_id,
data_transforms=get_data_transform_by_name(
args.data_transform_name,
row_normalise=args.row_normalise,
random_state=(
args.resample_id
if args.random_seed is None
else args.random_seed
),
n_jobs=1,
),
build_train_file=args.train_fold,
write_attributes=args.write_attributes,
att_max_shape=args.att_max_shape,
Expand All @@ -101,6 +113,7 @@ def run_experiment(args):
estimator_name = "ROCKET"
dataset_name = "MinimalChinatown"
row_normalise = False
transform_name = None
resample_id = 0
train_fold = False
write_attributes = True
Expand All @@ -120,16 +133,21 @@ def run_experiment(args):
checkpoint=checkpoint,
**kwargs,
)
transform = get_data_transform_by_name(
transform_name,
row_normalise=row_normalise,
random_state=resample_id,
)
print(f"Local Run of {estimator_name} ({classifier.__class__.__name__}).")

load_and_run_classification_experiment(
data_path,
results_path,
dataset_name,
classifier,
row_normalise=row_normalise,
classifier_name=estimator_name,
resample_id=resample_id,
data_transforms=transform,
build_train_file=train_fold,
write_attributes=write_attributes,
att_max_shape=att_max_shape,
Expand Down
26 changes: 22 additions & 4 deletions tsml_eval/experiments/clustering_experiments.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@
import numba
from aeon.utils.validation._dependencies import _check_soft_dependencies

from tsml_eval.experiments import load_and_run_clustering_experiment
from tsml_eval.experiments.set_clusterer import get_clusterer_by_name
from tsml_eval.experiments import (
get_clusterer_by_name,
get_data_transform_by_name,
load_and_run_clustering_experiment,
)
from tsml_eval.experiments.tests import _CLUSTERER_RESULTS_PATH
from tsml_eval.testing.testing_utils import _TEST_DATA_PATH
from tsml_eval.utils.arguments import parse_args
Expand Down Expand Up @@ -88,10 +91,19 @@ def run_experiment(args):
row_normalise=args.row_normalise,
**args.kwargs,
),
row_normalise=args.row_normalise,
n_clusters=args.n_clusters,
clusterer_name=args.estimator_name,
resample_id=args.resample_id,
data_transforms=get_data_transform_by_name(
args.data_transform_name,
row_normalise=args.row_normalise,
random_state=(
args.resample_id
if args.random_seed is None
else args.random_seed
),
n_jobs=1,
),
build_test_file=args.test_fold,
write_attributes=args.write_attributes,
att_max_shape=args.att_max_shape,
Expand All @@ -110,6 +122,7 @@ def run_experiment(args):
estimator_name = "KMeans"
dataset_name = "MinimalChinatown"
row_normalise = False
transform_name = None
n_clusters = -1
resample_id = 0
test_fold = False
Expand All @@ -133,17 +146,22 @@ def run_experiment(args):
row_normalise=row_normalise,
**kwargs,
)
transform = get_data_transform_by_name(
transform_name,
row_normalise=row_normalise,
random_state=resample_id,
)
print(f"Local Run of {estimator_name} ({clusterer.__class__.__name__}).")

load_and_run_clustering_experiment(
data_path,
results_path,
dataset_name,
clusterer,
row_normalise=row_normalise,
n_clusters=n_clusters,
clusterer_name=estimator_name,
resample_id=resample_id,
data_transforms=transform,
build_test_file=test_fold,
write_attributes=write_attributes,
att_max_shape=att_max_shape,
Expand Down
Loading

0 comments on commit 917504e

Please sign in to comment.