From 950f676c24e3f3b67917991f63200fa3482b9d84 Mon Sep 17 00:00:00 2001 From: Mikhail Bolev <92105261+kenshi777@users.noreply.github.com> Date: Thu, 17 Oct 2024 14:44:16 +0300 Subject: [PATCH 1/4] Fix `TSDataset.drop_features` allows to drop target (#491) --- CHANGELOG.md | 2 +- etna/datasets/tsdataset.py | 5 ++- tests/test_datasets/test_dataset.py | 9 +++- .../test_filter_transform.py | 43 ++++++++++++------- 4 files changed, 41 insertions(+), 18 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2635d90ab..2bb451112 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -38,7 +38,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fix working with `embedding_sizes` in `202-NN_examples` notebook ([#489](https://github.com/etna-team/etna/pull/489)) -- +- Disallow dropping target in `TSDataset.drop_features` ([#491](https://github.com/etna-team/etna/pull/491)) - - - diff --git a/etna/datasets/tsdataset.py b/etna/datasets/tsdataset.py index 5a767eeef..3488f9ea2 100644 --- a/etna/datasets/tsdataset.py +++ b/etna/datasets/tsdataset.py @@ -1296,7 +1296,7 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False): Raises ------ ValueError: - If ``features`` list contains target components + If ``features`` list contains target or target components """ features_set = set(features) @@ -1312,6 +1312,9 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False): "Prediction intervals can't be dropped from the dataset using this method! Use `drop_prediction_intervals` method!" ) + if "target" in features_set: + raise ValueError(f"Target can't be dropped from the dataset!") + dfs = [("df", self.df)] if drop_from_exog: dfs.append(("df_exog", self.df_exog)) diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index ce3ada570..5a43a4e21 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -1592,7 +1592,7 @@ def test_drop_features_raise_warning_on_unknown_columns( @pytest.mark.parametrize( "features, expected_regressors", ( - (["target", "regressor_2"], ["regressor_1"]), + (["regressor_2"], ["regressor_1"]), (["out_of_dataset_column"], ["regressor_1", "regressor_2"]), ), ) @@ -1603,6 +1603,13 @@ def test_drop_features_update_regressors(df_and_regressors, features, expected_r assert sorted(ts.regressors) == sorted(expected_regressors) +def test_drop_features_throw_error_on_target(df_and_regressors): + df, df_exog, known_future = df_and_regressors + ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future) + with pytest.raises(ValueError, match="Target can't be dropped from the dataset!"): + ts.drop_features(features=["target"], drop_from_exog=False) + + def test_drop_features_throw_error_on_target_components(ts_with_target_components): with pytest.raises( ValueError, diff --git a/tests/test_transforms/test_feature_selection/test_filter_transform.py b/tests/test_transforms/test_feature_selection/test_filter_transform.py index 42ae4e211..6a2fc3e26 100644 --- a/tests/test_transforms/test_feature_selection/test_filter_transform.py +++ b/tests/test_transforms/test_feature_selection/test_filter_transform.py @@ -43,7 +43,7 @@ def test_set_none(): _ = FilterFeaturesTransform() -@pytest.mark.parametrize("include", [[], ["target"], ["exog_1"], ["exog_1", "exog_2", "target"]]) +@pytest.mark.parametrize("include", [["target"], ["target", "exog_1"], ["exog_1", "exog_2", "target"]]) def test_include_filter(ts_with_features, include): """Test that 
transform remains only features in include.""" original_df = ts_with_features.to_pandas() @@ -60,9 +60,15 @@ def test_include_filter(ts_with_features, include): "exclude, expected_columns", [ ([], ["target", "exog_1", "exog_2"]), - (["target"], ["exog_1", "exog_2"]), + (["exog_1"], ["target", "exog_2"]), (["exog_1", "exog_2"], ["target"]), - (["target", "exog_1", "exog_2"], []), + ( + ["exog_2"], + [ + "target", + "exog_1", + ], + ), ], ) def test_exclude_filter(ts_with_features, exclude, expected_columns): @@ -95,9 +101,9 @@ def test_exclude_filter_wrong_column(ts_with_features): "columns, saved_columns", [ ([], []), - (["target"], ["target"]), + (["exog_1"], ["exog_1"]), (["exog_1", "exog_2"], ["exog_1", "exog_2"]), - (["target", "exog_1", "exog_2"], ["target", "exog_1", "exog_2"]), + (["exog_2"], ["exog_2"]), ], ) def test_transform_exclude_save_columns(ts_with_features, columns, saved_columns, return_features): @@ -120,9 +126,9 @@ def test_transform_exclude_save_columns(ts_with_features, columns, saved_columns @pytest.mark.parametrize( "columns, saved_columns", [ - ([], ["target", "exog_1", "exog_2"]), + (["target", "exog_1"], ["exog_2"]), (["target"], ["exog_1", "exog_2"]), - (["exog_1", "exog_2"], ["target"]), + (["target", "exog_2"], ["exog_1"]), (["target", "exog_1", "exog_2"], []), ], ) @@ -147,12 +153,19 @@ def test_transform_include_save_columns(ts_with_features, columns, saved_columns [ ([], True, ["exog_1", "target", "exog_2"]), ([], False, ["target", "exog_1", "exog_2"]), - (["target"], True, ["exog_1", "target", "exog_2"]), - (["target"], False, ["exog_2", "exog_1"]), + (["exog_1"], True, ["target", "exog_2", "exog_1"]), + (["exog_1"], False, ["exog_2", "target"]), (["exog_1", "exog_2"], True, ["exog_1", "target", "exog_2"]), (["exog_1", "exog_2"], False, ["target"]), - (["target", "exog_1", "exog_2"], True, ["exog_1", "target", "exog_2"]), - (["target", "exog_1", "exog_2"], False, []), + (["exog_2"], True, ["exog_1", "target", "exog_2"]), + ( + ["exog_2"], + False, + [ + "target", + "exog_1", + ], + ), ], ) def test_inverse_transform_back_excluded_columns(ts_with_features, columns, return_features, expected_columns): @@ -169,12 +182,12 @@ def test_inverse_transform_back_excluded_columns(ts_with_features, columns, retu @pytest.mark.parametrize( "columns, return_features, expected_columns", [ - ([], True, ["exog_1", "target", "exog_2"]), - ([], False, []), + (["target", "exog_1"], True, ["exog_1", "target", "exog_2"]), + (["target", "exog_1"], False, ["exog_1", "target"]), (["target"], True, ["exog_1", "target", "exog_2"]), (["target"], False, ["target"]), - (["exog_1", "exog_2"], True, ["exog_1", "target", "exog_2"]), - (["exog_1", "exog_2"], False, ["exog_1", "exog_2"]), + (["target", "exog_2"], True, ["exog_1", "target", "exog_2"]), + (["target", "exog_2"], False, ["exog_2", "target"]), (["target", "exog_1", "exog_2"], True, ["exog_1", "target", "exog_2"]), (["target", "exog_1", "exog_2"], False, ["exog_1", "target", "exog_2"]), ], From 327da676e265d80a694803b21dbe614f2eea1b71 Mon Sep 17 00:00:00 2001 From: Egor Baturin <82458209+egoriyaa@users.noreply.github.com> Date: Tue, 22 Oct 2024 17:23:16 +0300 Subject: [PATCH 2/4] Optimize memory usage in `TFTNativeModel` by eliminating copying during making samples (#494) * remove unnecessary columns * update changelog * add tests for models except deepstate and deepar * fix PR name --------- Co-authored-by: Egor Baturin --- CHANGELOG.md | 2 +- etna/models/nn/tft_native/tft.py | 59 +++++++++++-------- 
.../test_nn/nbeats/test_nbeats_nets.py | 1 + tests/test_models/test_nn/test_mlp.py | 5 ++ tests/test_models/test_nn/test_patchts.py | 1 + tests/test_models/test_nn/test_rnn.py | 5 ++ .../test_nn/tft_native/test_tft_native.py | 6 ++ 7 files changed, 52 insertions(+), 27 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2bb451112..0b77f3a6a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -39,7 +39,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Fixed - Fix working with `embedding_sizes` in `202-NN_examples` notebook ([#489](https://github.com/etna-team/etna/pull/489)) - Disallow dropping target in `TSDataset.drop_features` ([#491](https://github.com/etna-team/etna/pull/491)) -- +- Optimize memory usage in `TFTNativeModel` by eliminating copying during making samples ([#494](https://github.com/etna-team/etna/pull/494)) - - - diff --git a/etna/models/nn/tft_native/tft.py b/etna/models/nn/tft_native/tft.py index 7cf08d7dc..b8d902a17 100644 --- a/etna/models/nn/tft_native/tft.py +++ b/etna/models/nn/tft_native/tft.py @@ -381,6 +381,17 @@ def make_samples(self, df: pd.DataFrame, encoder_length: int, decoder_length: in segment = df["segment"].values[0] for feature in self.num_embeddings: df[feature] = df[feature].astype(float).fillna(self.num_embeddings[feature]) + + reals_columns = list(set(self.static_reals + self.time_varying_reals_encoder + self.time_varying_reals_decoder)) + categ_columns = list( + set( + self.static_categoricals + + self.time_varying_categoricals_encoder + + self.time_varying_categoricals_decoder + ) + ) + + df = df[reals_columns + categ_columns] column_to_index = {column: index for index, column in enumerate(df.columns)} values = df.values.T @@ -410,48 +421,44 @@ def _make( return None sample["segment"] = segment - sample["decoder_target"] = ( - values[column_to_index["target"]][start_idx + encoder_length : start_idx + total_sample_length] - .reshape(-1, 1) - .astype(float) + sample["decoder_target"] = values[column_to_index["target"]][ + start_idx + encoder_length : start_idx + total_sample_length + ].reshape( + -1, 1 ) # (decoder_length, 1) for feature in self.static_reals: - sample["static_reals"][feature] = ( - values[column_to_index[feature]][:1].reshape(-1, 1).astype(float) - ) # (1, 1) + sample["static_reals"][feature] = values[column_to_index[feature]][:1].reshape(-1, 1) # (1, 1) for feature in self.static_categoricals: - sample["static_categoricals"][feature] = ( - values[column_to_index[feature]][:1].reshape(-1, 1).astype(float) - ) # (1, 1) + sample["static_categoricals"][feature] = values[column_to_index[feature]][:1].reshape(-1, 1) # (1, 1) for feature in self.time_varying_categoricals_encoder: - sample["time_varying_categoricals_encoder"][feature] = ( - values[column_to_index[feature]][start_idx : start_idx + encoder_length] - .reshape(-1, 1) - .astype(float) + sample["time_varying_categoricals_encoder"][feature] = values[column_to_index[feature]][ + start_idx : start_idx + encoder_length + ].reshape( + -1, 1 ) # (encoder_length, 1) for feature in self.time_varying_categoricals_decoder: - sample["time_varying_categoricals_decoder"][feature] = ( - values[column_to_index[feature]][start_idx + encoder_length : start_idx + total_sample_length] - .reshape(-1, 1) - .astype(float) + sample["time_varying_categoricals_decoder"][feature] = values[column_to_index[feature]][ + start_idx + encoder_length : start_idx + total_sample_length + ].reshape( + -1, 1 ) # (decoder_length, 1) for feature in 
self.time_varying_reals_encoder: - sample["time_varying_reals_encoder"][feature] = ( - values[column_to_index[feature]][start_idx : start_idx + encoder_length] - .reshape(-1, 1) - .astype(float) + sample["time_varying_reals_encoder"][feature] = values[column_to_index[feature]][ + start_idx : start_idx + encoder_length + ].reshape( + -1, 1 ) # (encoder_length, 1) for feature in self.time_varying_reals_decoder: - sample["time_varying_reals_decoder"][feature] = ( - values[column_to_index[feature]][start_idx + encoder_length : start_idx + total_sample_length] - .reshape(-1, 1) - .astype(float) + sample["time_varying_reals_decoder"][feature] = values[column_to_index[feature]][ + start_idx + encoder_length : start_idx + total_sample_length + ].reshape( + -1, 1 ) # (decoder_length, 1) return sample diff --git a/tests/test_models/test_nn/nbeats/test_nbeats_nets.py b/tests/test_models/test_nn/nbeats/test_nbeats_nets.py index e95583403..eda8dfb7b 100644 --- a/tests/test_models/test_nn/nbeats/test_nbeats_nets.py +++ b/tests/test_models/test_nn/nbeats/test_nbeats_nets.py @@ -53,6 +53,7 @@ def test_make_samples(df_name, request): assert first_sample["target_mask"] is None assert first_sample["segment"] == "segment_1" np.testing.assert_equal(first_sample["history"], expected_first_sample["history"]) + assert first_sample["history"].base is not None @pytest.mark.parametrize( diff --git a/tests/test_models/test_nn/test_mlp.py b/tests/test_models/test_nn/test_mlp.py index 002f12923..fe8bceb07 100644 --- a/tests/test_models/test_nn/test_mlp.py +++ b/tests/test_models/test_nn/test_mlp.py @@ -111,6 +111,11 @@ def test_mlp_make_samples(df_name, cat_columns, request): assert ts_samples[i]["segment"] == "segment_1" for key in expected_sample: np.testing.assert_equal(ts_samples[i][key], expected_sample[key]) + if "categorical" in key: + for column in ts_samples[i][key]: + assert ts_samples[i][key][column].base is not None + else: + assert ts_samples[i][key].base is not None def test_mlp_forward_fail_nans(): diff --git a/tests/test_models/test_nn/test_patchts.py b/tests/test_models/test_nn/test_patchts.py index 22c7aed10..6cef3292e 100644 --- a/tests/test_models/test_nn/test_patchts.py +++ b/tests/test_models/test_nn/test_patchts.py @@ -79,6 +79,7 @@ def test_patchts_make_samples(df_name, request): assert ts_samples[i]["segment"] == "segment_1" for key in expected_sample: np.testing.assert_equal(ts_samples[i][key], expected_sample[key]) + assert ts_samples[i][key].base is not None def test_save_load(example_tsds): diff --git a/tests/test_models/test_nn/test_rnn.py b/tests/test_models/test_nn/test_rnn.py index 4e1974a55..2acc47fef 100644 --- a/tests/test_models/test_nn/test_rnn.py +++ b/tests/test_models/test_nn/test_rnn.py @@ -121,6 +121,11 @@ def test_rnn_make_samples(df_name, cat_columns, request): assert ts_samples[i]["segment"] == "segment_1" for key in expected_sample: np.testing.assert_equal(ts_samples[i][key], expected_sample[key]) + if "categorical" in key: + for column in ts_samples[i][key]: + assert ts_samples[i][key][column].base is not None + else: + assert ts_samples[i][key].base is not None @pytest.mark.parametrize("encoder_length", [1, 2, 10]) diff --git a/tests/test_models/test_nn/tft_native/test_tft_native.py b/tests/test_models/test_nn/tft_native/test_tft_native.py index 4a2a671d4..29db902cc 100644 --- a/tests/test_models/test_nn/tft_native/test_tft_native.py +++ b/tests/test_models/test_nn/tft_native/test_tft_native.py @@ -188,31 +188,37 @@ def test_tft_make_samples( df[[feature]].iloc[:1], 
first_sample["static_reals"][feature], ) + assert first_sample["static_reals"][feature].base is not None for feature in static_categoricals: np.testing.assert_almost_equal( df[[feature]].iloc[:1], first_sample["static_categoricals"][feature], ) + assert first_sample["static_categoricals"][feature].base is not None for feature in time_varying_categoricals_encoder: np.testing.assert_almost_equal( df[[feature]].iloc[:encoder_length], first_sample["time_varying_categoricals_encoder"][feature], ) + assert first_sample["time_varying_categoricals_encoder"][feature].base is not None for feature in time_varying_categoricals_decoder: np.testing.assert_almost_equal( df[[feature]].iloc[encoder_length : encoder_length + decoder_length], first_sample["time_varying_categoricals_decoder"][feature], ) + assert first_sample["time_varying_categoricals_decoder"][feature].base is not None for feature in time_varying_reals_encoder: np.testing.assert_almost_equal( df[[feature]].iloc[:encoder_length], first_sample["time_varying_reals_encoder"][feature], ) + assert first_sample["time_varying_reals_encoder"][feature].base is not None for feature in time_varying_reals_decoder: np.testing.assert_almost_equal( df[[feature]].iloc[encoder_length : encoder_length + decoder_length], first_sample["time_varying_reals_decoder"][feature], ) + assert first_sample["time_varying_reals_decoder"][feature].base is not None @pytest.mark.parametrize("encoder_length, decoder_length", [(2, 1), (1, 2), (10, 5)]) From 6cd66ae639fc6beb6370033b20d76e930e6b7196 Mon Sep 17 00:00:00 2001 From: Egor Baturin <82458209+egoriyaa@users.noreply.github.com> Date: Tue, 29 Oct 2024 11:10:38 +0300 Subject: [PATCH 3/4] Optimize memory usage in `DeepStateModel` and `DeepARNativeModel` by eliminating copying during making samples (#499) * eliminate samples copying in DeepARNative and DeepState * update changelog * fix * fix changelog * add checks on copying arrays --------- Co-authored-by: Egor Baturin --- CHANGELOG.md | 2 +- etna/models/nn/deepar_native/deepar.py | 20 ++++++++------ etna/models/nn/deepstate/deepstate.py | 26 +++++++++---------- .../deepar_native/test_deepar_native.py | 11 +++++--- tests/test_models/test_nn/test_deepstate.py | 5 ++++ 5 files changed, 38 insertions(+), 26 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0b77f3a6a..44402f9ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -40,7 +40,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Fix working with `embedding_sizes` in `202-NN_examples` notebook ([#489](https://github.com/etna-team/etna/pull/489)) - Disallow dropping target in `TSDataset.drop_features` ([#491](https://github.com/etna-team/etna/pull/491)) - Optimize memory usage in `TFTNativeModel` by eliminating copying during making samples ([#494](https://github.com/etna-team/etna/pull/494)) -- +- Optimize memory usage in `DeepStateModel` and `DeepARNativeModel` by eliminating copying during making samples ([#499](https://github.com/etna-team/etna/pull/499)) - - - diff --git a/etna/models/nn/deepar_native/deepar.py b/etna/models/nn/deepar_native/deepar.py index ece67ce34..42475c37f 100644 --- a/etna/models/nn/deepar_native/deepar.py +++ b/etna/models/nn/deepar_native/deepar.py @@ -132,6 +132,11 @@ def forward(self, x: DeepARNativeBatch, *args, **kwargs): # type: ignore decoder_categorical = x["decoder_categorical"] # each (batch_size, decoder_length, 1) decoder_target = x["decoder_target"].float() # (batch_size, decoder_length, 1) decoder_length = decoder_real.shape[1] + 
weights = x["weight"] + + # scale target values at index 0 + encoder_real[:, :, 0] = encoder_real[:, :, 0] / weights.unsqueeze(1) + decoder_real[:, :, 0] = decoder_real[:, :, 0] / weights.unsqueeze(1) encoder_embeddings = self.embedding(encoder_categorical) if self.embedding is not None else torch.Tensor() decoder_embeddings = self.embedding(decoder_categorical) if self.embedding is not None else torch.Tensor() @@ -191,7 +196,11 @@ def step(self, batch: DeepARNativeBatch, *args, **kwargs): # type: ignore decoder_categorical = batch["decoder_categorical"] # each (batch_size, decoder_length, 1) encoder_target = batch["encoder_target"].float() # (batch_size, encoder_length-1, 1) decoder_target = batch["decoder_target"].float() # (batch_size, decoder_length, 1) - weights = batch["weight"] + weights = batch["weight"] # (batch_size) + + # scale target values at index 0 + encoder_real[:, :, 0] = encoder_real[:, :, 0] / weights.unsqueeze(1) + decoder_real[:, :, 0] = decoder_real[:, :, 0] / weights.unsqueeze(1) encoder_embeddings = self.embedding(encoder_categorical) if self.embedding is not None else torch.Tensor() decoder_embeddings = self.embedding(decoder_categorical) if self.embedding is not None else torch.Tensor() @@ -255,11 +264,10 @@ def _make( return None # Get shifted target and concatenate it with real values features - sample["decoder_real"] = values_real[start_idx + encoder_length : start_idx + total_sample_length].copy() + sample["decoder_real"] = values_real[start_idx + encoder_length : start_idx + total_sample_length] # Get shifted target and concatenate it with real values features - sample["encoder_real"] = values_real[start_idx : start_idx + encoder_length].copy() - sample["encoder_real"] = sample["encoder_real"][1:] + sample["encoder_real"] = values_real[start_idx + 1 : start_idx + encoder_length] for index, feature in enumerate(self.embedding_sizes.keys()): sample["encoder_categorical"][feature] = values_categorical[index][ @@ -276,10 +284,6 @@ def _make( sample["segment"] = segment sample["weight"] = 1 + sample["encoder_target"].mean() if self.scale else 1 - sample["encoder_real"][:, 0] = values_real[start_idx + 1 : start_idx + encoder_length, 0] / sample["weight"] - sample["decoder_real"][:, 0] = ( - values_real[start_idx + encoder_length : start_idx + total_sample_length, 0] / sample["weight"] - ) return sample diff --git a/etna/models/nn/deepstate/deepstate.py b/etna/models/nn/deepstate/deepstate.py index d2f72d02a..7250bde03 100644 --- a/etna/models/nn/deepstate/deepstate.py +++ b/etna/models/nn/deepstate/deepstate.py @@ -4,6 +4,7 @@ from typing import Optional from typing import Tuple +import numpy as np import pandas as pd from typing_extensions import TypedDict @@ -117,9 +118,9 @@ def step(self, batch: DeepStateBatch, *args, **kwargs): # type: ignore : loss, true_target, prediction_target """ - encoder_real = batch["encoder_real"] # (batch_size, seq_length, input_size) + encoder_real = batch["encoder_real"].float() # (batch_size, seq_length, input_size) encoder_categorical = batch["encoder_categorical"] # each (batch_size, seq_length, 1) - targets = batch["encoder_target"] # (batch_size, seq_length, 1) + targets = batch["encoder_target"].float() # (batch_size, seq_length, 1) seq_length = targets.shape[1] datetime_index = batch["datetime_index"].permute(1, 0, 2)[ :, :, :seq_length @@ -159,11 +160,11 @@ def forward(self, x: DeepStateBatch, *args, **kwargs): # type: ignore : forecast with shape (batch_size, decoder_length, 1) """ - encoder_real = x["encoder_real"] # 
(batch_size, seq_length, input_size) + encoder_real = x["encoder_real"].float() # (batch_size, seq_length, input_size) encoder_categorical = x["encoder_categorical"] # each (batch_size, seq_length, 1) seq_length = encoder_real.shape[1] - targets = x["encoder_target"][:, :seq_length] # (batch_size, seq_length, 1) - decoder_real = x["decoder_real"] # (batch_size, horizon, input_size) + targets = x["encoder_target"][:, :seq_length].float() # (batch_size, seq_length, 1) + decoder_real = x["decoder_real"].float() # (batch_size, horizon, input_size) decoder_categorical = x["decoder_categorical"] # each (batch_size, horizon, 1) datetime_index_train = x["datetime_index"].permute(1, 0, 2)[ :, :, :seq_length @@ -213,26 +214,23 @@ def forward(self, x: DeepStateBatch, *args, **kwargs): # type: ignore def make_samples(self, df: pd.DataFrame, encoder_length: int, decoder_length: int) -> Iterator[dict]: """Make samples from segment DataFrame.""" values_real = df.drop(columns=["target", "segment", "timestamp"] + list(self.embedding_sizes.keys())).values - values_real = torch.from_numpy(values_real).float() # Categories that were not seen during `fit` will be filled with new category for feature in self.embedding_sizes: df[feature] = df[feature].astype(float).fillna(self.embedding_sizes[feature][0]) # Columns in `values_categorical` are in the same order as in `embedding_sizes` - values_categorical = torch.from_numpy(df[self.embedding_sizes.keys()].values.T) + values_categorical = df[self.embedding_sizes.keys()].values.T - values_datetime = torch.from_numpy(self.ssm.generate_datetime_index(df["timestamp"])) - values_datetime = values_datetime.to(torch.int64) + values_datetime = self.ssm.generate_datetime_index(df["timestamp"]).astype(int) values_target = df["target"].values - values_target = torch.from_numpy(values_target).float() segment = df["segment"].values[0] def _make( - values_target: torch.Tensor, - values_real: torch.Tensor, - values_categorical: torch.Tensor, - values_datetime: torch.Tensor, + values_target: np.ndarray, + values_real: np.ndarray, + values_categorical: np.ndarray, + values_datetime: np.ndarray, segment: str, start_idx: int, encoder_length: int, diff --git a/tests/test_models/test_nn/deepar_native/test_deepar_native.py b/tests/test_models/test_nn/deepar_native/test_deepar_native.py index 16e009dee..3cb298b8a 100644 --- a/tests/test_models/test_nn/deepar_native/test_deepar_native.py +++ b/tests/test_models/test_nn/deepar_native/test_deepar_native.py @@ -105,12 +105,11 @@ def test_deepar_make_samples(df_name, scale, weights, cat_columns, request): num_samples_check = 2 df["target_shifted"] = df["target"].shift(1) for i in range(num_samples_check): - df[f"target_shifted_scaled_{i}"] = df["target_shifted"] / weights[i] expected_sample = { - "encoder_real": df[[f"target_shifted_scaled_{i}", "regressor_float", "regressor_int"]] + "encoder_real": df[["target_shifted", "regressor_float", "regressor_int"]] .iloc[1 + i : encoder_length + i] .values, - "decoder_real": df[[f"target_shifted_scaled_{i}", "regressor_float", "regressor_int"]] + "decoder_real": df[["target_shifted", "regressor_float", "regressor_int"]] .iloc[encoder_length + i : encoder_length + decoder_length + i] .values, "encoder_categorical": { @@ -138,6 +137,12 @@ def test_deepar_make_samples(df_name, scale, weights, cat_columns, request): assert ts_samples[i]["segment"] == "segment_1" for key in expected_sample: np.testing.assert_equal(ts_samples[i][key], expected_sample[key]) + if "categorical" in key: + for column in 
ts_samples[i][key]: + assert ts_samples[i][key][column].base is not None + else: + if key != "weight": + assert ts_samples[i][key].base is not None @pytest.mark.parametrize("encoder_length", [1, 2, 10]) diff --git a/tests/test_models/test_nn/test_deepstate.py b/tests/test_models/test_nn/test_deepstate.py index a56b2cd4a..78efb4370 100644 --- a/tests/test_models/test_nn/test_deepstate.py +++ b/tests/test_models/test_nn/test_deepstate.py @@ -139,6 +139,11 @@ def test_deepstate_make_samples(df_name, cat_columns, request): assert ts_samples[i]["segment"] == "segment_1" for key in expected_sample: np.testing.assert_equal(ts_samples[i][key], expected_sample[key]) + if "categorical" in key: + for column in ts_samples[i][key]: + assert ts_samples[i][key][column].base is not None + else: + assert ts_samples[i][key].base is not None def test_save_load(example_tsds): From 4a6e975fcf9f416776d4aed9e14be5c337b8f365 Mon Sep 17 00:00:00 2001 From: Egor Baturin <82458209+egoriyaa@users.noreply.github.com> Date: Wed, 6 Nov 2024 16:47:57 +0300 Subject: [PATCH 4/4] `MeanEncoderTransform` generates wrong values (#492) * fix MeanEncoder * update changelog * fix * fix * fix * restore rnn file * fix numba method * add comments * add comment for fixtures * add blank lines for more readability * fix spelling * combine two tests in one * add test for 2 segments --------- Co-authored-by: Egor Baturin --- CHANGELOG.md | 2 +- etna/transforms/encoders/mean_encoder.py | 66 ++++++-- .../test_mean_encoder_transform.py | 147 +++++++++++++++++- 3 files changed, 199 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 44402f9ed..fa97ef948 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -41,7 +41,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Disallow dropping target in `TSDataset.drop_features` ([#491](https://github.com/etna-team/etna/pull/491)) - Optimize memory usage in `TFTNativeModel` by eliminating copying during making samples ([#494](https://github.com/etna-team/etna/pull/494)) - Optimize memory usage in `DeepStateModel` and `DeepARNativeModel` by eliminating copying during making samples ([#499](https://github.com/etna-team/etna/pull/499)) -- +- Fix working with NaN target in `MeanEncoderTransform` ([#492](https://github.com/etna-team/etna/pull/492)) - - - diff --git a/etna/transforms/encoders/mean_encoder.py b/etna/transforms/encoders/mean_encoder.py index 207f0d734..f92a7cd0a 100644 --- a/etna/transforms/encoders/mean_encoder.py +++ b/etna/transforms/encoders/mean_encoder.py @@ -3,9 +3,11 @@ from typing import Dict from typing import List from typing import Optional +from typing import Tuple from typing import Union from typing import cast +import numba import numpy as np import pandas as pd from bottleneck import nanmean @@ -165,6 +167,39 @@ def _count_macro_running_mean(df, n_segments): expanding_mean = pd.Series(index=df.index, data=expanding_mean.values).shift(n_segments) return expanding_mean + @staticmethod + @numba.njit() + def _count_per_segment_cumstats(target: np.ndarray, categories: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + ans_cumsum = np.full_like(target, np.nan) + ans_cumcount = np.full_like(target, np.nan) + unique_categories = np.unique(categories) + for category in unique_categories: + idx = np.where(category == categories)[0] + t = target[idx] + + # Mask for valid (non-NaN) target values + valid = ~np.isnan(t) + + # Compute cumulative sums and counts for valid values + cumsum = np.cumsum(np.where(valid, t, 0)) + cumcount = 
np.cumsum(valid).astype(np.float32) + + # Shift statistics by 1 to get statistics not including current index + cumsum = np.roll(cumsum, 1) + cumcount = np.roll(cumcount, 1) + + cumsum[0] = np.NaN + cumcount[0] = np.NaN + + # Handle positions with no previous valid values + cumsum[cumcount == 0] = np.NaN + cumcount[cumcount == 0] = np.NaN + + # Assign the computed values back to the answer arrays + ans_cumsum[idx] = cumsum + ans_cumcount[idx] = cumcount + return ans_cumsum, ans_cumcount + def _transform(self, df: pd.DataFrame) -> pd.DataFrame: """ Get encoded values for the segment. @@ -211,20 +246,24 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame: for segment in segments: segment_df = TSDataset.to_flatten(intersected_df.loc[:, self.idx[segment, :]]) y = segment_df["target"] + categories = segment_df[self.in_column].values.astype(str) + + unique_categories = np.unique(categories) + cat_to_int = {cat: idx for idx, cat in enumerate(unique_categories)} + int_categories = np.array([cat_to_int[cat] for cat in categories], dtype=np.int64) + # first timestamp is NaN expanding_mean = y.expanding().mean().shift() - # cumcount not including current timestamp - cumcount = y.groupby(segment_df[self.in_column].astype(str)).agg("cumcount") - # cumsum not including current timestamp - cumsum = ( - y.groupby(segment_df[self.in_column].astype(str)) - .transform(lambda x: x.shift().cumsum()) - .fillna(0) - ) + + cumsum, cumcount = self._count_per_segment_cumstats(y.values, int_categories) + cumsum = pd.Series(cumsum) + cumcount = pd.Series(cumcount) + feature = (cumsum + expanding_mean * self.smoothing) / (cumcount + self.smoothing) if self.handle_missing is MissingMode.global_mean: nan_feature_index = segment_df[segment_df[self.in_column].isnull()].index feature.loc[nan_feature_index] = expanding_mean.loc[nan_feature_index] + intersected_df.loc[:, self.idx[segment, self.out_column]] = feature.values else: @@ -237,16 +276,19 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame: timestamps = intersected_df.index categories = pd.unique(df.loc[:, self.idx[:, self.in_column]].values.ravel()) - cumstats = pd.DataFrame(data={"sum": 0, "count": 0, self.in_column: categories}) + cumstats = pd.DataFrame(data={"sum": np.NaN, "count": np.NaN, self.in_column: categories}) cur_timestamp_idx = np.arange(0, len(timestamps) * n_segments, len(timestamps)) for _ in range(len(timestamps)): timestamp_df = flatten.loc[cur_timestamp_idx] + # statistics from previous timestamp cumsum_dict = dict(cumstats[[self.in_column, "sum"]].values) cumcount_dict = dict(cumstats[[self.in_column, "count"]].values) + # map categories for current timestamp to statistics temp.loc[cur_timestamp_idx, "cumsum"] = timestamp_df[self.in_column].map(cumsum_dict) temp.loc[cur_timestamp_idx, "cumcount"] = timestamp_df[self.in_column].map(cumcount_dict) + # count statistics for current timestamp stats = ( timestamp_df["target"] @@ -254,8 +296,14 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame: .agg(["count", "sum"]) .reset_index() ) + # statistics become zeros for categories with target=NaN + stats = stats.replace({"count": 0, "sum": 0}, np.NaN) + # sum current and previous statistics cumstats = pd.concat([cumstats, stats]).groupby(self.in_column, as_index=False, dropna=False).sum() + # zeros appear for categories that weren't updated in previous line and whose statistics were NaN + cumstats = cumstats.replace({"count": 0, "sum": 0}, np.NaN) + cur_timestamp_idx += 1 feature = (temp["cumsum"] + running_mean * self.smoothing) / 
(temp["cumcount"] + self.smoothing) diff --git a/tests/test_transforms/test_encoders/test_mean_encoder_transform.py b/tests/test_transforms/test_encoders/test_mean_encoder_transform.py index 6b2ad6279..973bcae05 100644 --- a/tests/test_transforms/test_encoders/test_mean_encoder_transform.py +++ b/tests/test_transforms/test_encoders/test_mean_encoder_transform.py @@ -31,7 +31,7 @@ def category_ts() -> TSDataset: def expected_micro_category_ts() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, 1, 1.5, 1.5, 2.75, 2.25] + [np.NaN, 6.0, 6.25, 7, 7.625, 8.0] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 1.5, 2.75, 2.25] + [np.NaN, np.NaN, 6.25, 7, 7.625, np.NaN] ts = TSDataset(df, freq="D") return ts @@ -41,7 +41,7 @@ def expected_micro_category_ts() -> TSDataset: def expected_micro_global_mean_ts() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, 1, 1.5, 1.5, 2.5, 2.25] + [np.NaN, 6.0, 6.25, 7, 7.625, 8.0] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, 1.5, 1.5, 2.5, 2.25] + [np.NaN, np.NaN, 6.25, 7, 7.625, 8.0] ts = TSDataset(df, freq="D") return ts @@ -61,7 +61,7 @@ def expected_micro_category_make_future_ts() -> TSDataset: def expected_macro_category_ts() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, 3.5, 4, 4.875, 4, 4.85] + [np.NaN, 3.5, 3.66, 4.875, 5.5, 4.275] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 4.875, 4, 4.851] + [np.NaN, np.NaN, 3.66, 4.875, 5.5, 4.27] ts = TSDataset(df, freq="D") return ts @@ -71,7 +71,7 @@ def expected_macro_category_ts() -> TSDataset: def expected_macro_global_mean_ts() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, 3.5, 4, 4.875, 5, 4.85] + [np.NaN, 3.5, 3.66, 4.875, 5.5, 5.55] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, 4, 4.875, 5, 4.85] + [np.NaN, np.NaN, 3.66, 4.875, 5.5, 5.55] ts = TSDataset(df, freq="D") return ts @@ -104,7 +104,7 @@ def ts_begin_nan() -> TSDataset: def expected_ts_begin_nan_smooth_1() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=1) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, np.NaN, 0.5, 1.16, 1.5, 2.5] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 1.75, 1.5, 2.5] ts = TSDataset(df, freq="D") return ts @@ -114,12 +114,97 @@ def expected_ts_begin_nan_smooth_1() -> TSDataset: def expected_ts_begin_nan_smooth_2() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=1) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, np.NaN, 2 / 3, 5 / 4, 5 / 3, 2.5] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 5 / 3, 5 / 3, 2.5] ts = TSDataset(df, freq="D") return ts +@pytest.fixture +def multiple_nan_target_category_ts() -> TSDataset: + """Fixture with segment having multiple NaN targets: + + * For `regressor="A"` set of NaN timestamp goes before first notna value + * For `regressor="B"` set of NaN timestamp goes after 
first notna value + """ + df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=8) + df["target"] = [np.nan, 1.5, np.nan, 3.0, 4.0, np.NaN, np.NaN, np.NaN] + + df_exog = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=9) + df_exog.rename(columns={"target": "regressor"}, inplace=True) + df_exog["regressor"] = ["A", "B", "A", "A", "B", "B", "B", "A", "A"] + + ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future="all") + + return ts + + +@pytest.fixture +def expected_multiple_nan_target_category_ts() -> TSDataset: + df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=8) + df.rename(columns={"target": "regressor_mean"}, inplace=True) + df["regressor_mean"] = [np.NaN, np.NaN, np.NaN, np.NaN, 1.5, 2.75, 2.75, 3.0] + + ts = TSDataset(df=df, freq="D") + + return ts + + +@pytest.fixture +def mean_segment_encoder_ts() -> TSDataset: + df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5) + df["target"] = [0, 1, np.NaN, 3, 4] + + df_exog = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=10) + df_exog.rename(columns={"target": "segment_feature"}, inplace=True) + df_exog["segment_feature"] = "segment_0" + + ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future="all") + + return ts + + +@pytest.fixture +def expected_mean_segment_encoder_ts() -> TSDataset: + df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5) + df.rename(columns={"target": "segment_mean"}, inplace=True) + df["segment_mean"] = [np.NaN, 0, 0.5, 0.5, 1.33] + + ts = TSDataset(df=df, freq="D") + + return ts + + +@pytest.fixture +def multiple_nan_target_two_segments_ts() -> TSDataset: + """Fixture with two segments having multiple NaN targets: + + * For `regressor="A"` set of NaN timestamp goes before first notna value + * For `regressor="B"` set of NaN timestamp goes after first notna value + """ + df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) + df["target"] = [np.NaN, 2, np.NaN, 4, np.NaN, 5] + [np.NaN, 7, np.NaN, np.NaN, 10, 11] + + df_exog = generate_ar_df(start_time="2001-01-01", periods=7, n_segments=2) + df_exog.rename(columns={"target": "regressor"}, inplace=True) + df_exog["regressor"] = ["A", "B", "A", "A", "B", "B", "A"] + ["A", "B", "A", "B", "A", "B", "A"] + + ts = TSDataset(df, df_exog=df_exog, freq="D", known_future="all") + + return ts + + +@pytest.fixture +def expected_multiple_nan_target_two_segments_ts() -> TSDataset: + df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) + df.rename(columns={"target": "regressor_mean"}, inplace=True) + df["regressor_mean"] = [np.NaN, np.NaN, np.NaN, np.NaN, 4.5, 4.5] + [np.NaN, np.NaN, np.NaN, 4.5, 4, 4.5] + + ts = TSDataset(df=df, freq="D") + + return ts + + @pytest.mark.smoke @pytest.mark.parametrize("mode", ["per-segment", "macro"]) @pytest.mark.parametrize("handle_missing", ["category", "global_mean"]) @@ -311,6 +396,56 @@ def test_ts_begin_nan_smooth_2(ts_begin_nan, expected_ts_begin_nan_smooth_2): ) +def test_mean_segment_encoder(mean_segment_encoder_ts, expected_mean_segment_encoder_ts): + mean_encoder = MeanEncoderTransform( + in_column="segment_feature", + mode="per-segment", + handle_missing="category", + smoothing=0, + out_column="segment_mean", + ) + mean_encoder.fit_transform(mean_segment_encoder_ts) + assert_frame_equal( + mean_segment_encoder_ts.df.loc[:, pd.IndexSlice[:, "segment_mean"]], + expected_mean_segment_encoder_ts.df, + atol=0.01, + ) + + +def test_multiple_nan_target_category_ts(multiple_nan_target_category_ts, 
expected_multiple_nan_target_category_ts): + mean_encoder = MeanEncoderTransform( + in_column="regressor", + mode="per-segment", + handle_missing="category", + smoothing=0, + out_column="regressor_mean", + ) + mean_encoder.fit_transform(multiple_nan_target_category_ts) + assert_frame_equal( + multiple_nan_target_category_ts.df.loc[:, pd.IndexSlice[:, "regressor_mean"]], + expected_multiple_nan_target_category_ts.df, + atol=0.01, + ) + + +def test_multiple_nan_target_two_segments_ts( + multiple_nan_target_two_segments_ts, expected_multiple_nan_target_two_segments_ts +): + mean_encoder = MeanEncoderTransform( + in_column="regressor", + mode="macro", + handle_missing="category", + smoothing=0, + out_column="regressor_mean", + ) + mean_encoder.fit_transform(multiple_nan_target_two_segments_ts) + assert_frame_equal( + multiple_nan_target_two_segments_ts.df.loc[:, pd.IndexSlice[:, "regressor_mean"]], + expected_multiple_nan_target_two_segments_ts.df, + atol=0.01, + ) + + def test_save_load(category_ts): mean_encoder = MeanEncoderTransform(in_column="regressor", out_column="mean_encoded_regressor") assert_transformation_equals_loaded_original(transform=mean_encoder, ts=category_ts)
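
Note on the view-vs-copy assertions added in patches 2-4: the new tests check `sample[...].base is not None` to confirm that `make_samples` now yields NumPy views into the per-segment `values` array rather than copies (slicing and reshaping keep a reference to the original buffer in `.base`, while `.copy()` and `.astype()` allocate fresh memory). A minimal standalone sketch of that check, with illustrative array and variable names that are not part of the patches:

    import numpy as np

    # hypothetical per-segment values, shape (n_features, n_timestamps)
    values = np.arange(20, dtype=float).reshape(2, 10)

    # basic slicing plus reshape returns a view; .base points back at the original data
    window = values[0][3:8].reshape(-1, 1)
    assert window.base is not None

    # an explicit copy owns its memory, so .base is None
    copied = values[0][3:8].copy()
    assert copied.base is None

    # astype allocates a new array even when the dtype is unchanged,
    # which is why the patches drop the .astype(float) calls from make_samples
    converted = values[0][3:8].astype(float)
    assert converted.base is None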