fix mean encoder

Egor Baturin committed Nov 1, 2024
2 parents 4bace07 + 6cd66ae commit c8cfd04
Showing 16 changed files with 295 additions and 103 deletions.
6 changes: 3 additions & 3 deletions CHANGELOG.md
@@ -38,9 +38,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed
- Fix working with `embedding_sizes` in `202-NN_examples` notebook ([#489](https://github.com/etna-team/etna/pull/489))
-
-
-
- Disallow dropping target in `TSDataset.drop_features` ([#491](https://github.com/etna-team/etna/pull/491))
- Optimize memory usage in `TFTNativeModel` by eliminating copying during making samples ([#494](https://github.com/etna-team/etna/pull/494))
- Optimize memory usage in `DeepStateModel` and `DeepARNativeModel` by eliminating copying during making samples ([#499](https://github.com/etna-team/etna/pull/499))
- Fix working with NaN target in `MeanEncoderTransform` ([#492](https://github.com/etna-team/etna/pull/492))
-
-
5 changes: 4 additions & 1 deletion etna/datasets/tsdataset.py
@@ -1296,7 +1296,7 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
Raises
------
ValueError:
-     If ``features`` list contains target components
+     If ``features`` list contains target or target components
"""
features_set = set(features)

@@ -1312,6 +1312,9 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
"Prediction intervals can't be dropped from the dataset using this method! Use `drop_prediction_intervals` method!"
)

+ if "target" in features_set:
+     raise ValueError(f"Target can't be dropped from the dataset!")

dfs = [("df", self.df)]
if drop_from_exog:
dfs.append(("df_exog", self.df_exog))
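Note: dropping `target` now fails fast instead of leaving the dataset in a broken state. A minimal usage sketch of the new behavior — the toy frame and the `TSDataset.to_dataset` long-to-wide conversion follow etna's usual quickstart pattern, not this diff:

```python
# Hypothetical minimal example of the new guard.
import pandas as pd
from etna.datasets import TSDataset

df = pd.DataFrame(
    {
        "timestamp": pd.date_range("2024-01-01", periods=10, freq="D"),
        "segment": "segment_1",
        "target": range(10),
    }
)
ts = TSDataset(df=TSDataset.to_dataset(df), freq="D")

# Raises ValueError: "Target can't be dropped from the dataset!"
ts.drop_features(features=["target"])
```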
20 changes: 12 additions & 8 deletions etna/models/nn/deepar_native/deepar.py
@@ -132,6 +132,11 @@ def forward(self, x: DeepARNativeBatch, *args, **kwargs):  # type: ignore
decoder_categorical = x["decoder_categorical"] # each (batch_size, decoder_length, 1)
decoder_target = x["decoder_target"].float() # (batch_size, decoder_length, 1)
decoder_length = decoder_real.shape[1]
+ weights = x["weight"]
+
+ # scale target values at index 0
+ encoder_real[:, :, 0] = encoder_real[:, :, 0] / weights.unsqueeze(1)
+ decoder_real[:, :, 0] = decoder_real[:, :, 0] / weights.unsqueeze(1)

encoder_embeddings = self.embedding(encoder_categorical) if self.embedding is not None else torch.Tensor()
decoder_embeddings = self.embedding(decoder_categorical) if self.embedding is not None else torch.Tensor()
@@ -191,7 +196,11 @@ def step(self, batch: DeepARNativeBatch, *args, **kwargs):  # type: ignore
decoder_categorical = batch["decoder_categorical"] # each (batch_size, decoder_length, 1)
encoder_target = batch["encoder_target"].float() # (batch_size, encoder_length-1, 1)
decoder_target = batch["decoder_target"].float() # (batch_size, decoder_length, 1)
- weights = batch["weight"]
+ weights = batch["weight"]  # (batch_size)
+
+ # scale target values at index 0
+ encoder_real[:, :, 0] = encoder_real[:, :, 0] / weights.unsqueeze(1)
+ decoder_real[:, :, 0] = decoder_real[:, :, 0] / weights.unsqueeze(1)

encoder_embeddings = self.embedding(encoder_categorical) if self.embedding is not None else torch.Tensor()
decoder_embeddings = self.embedding(decoder_categorical) if self.embedding is not None else torch.Tensor()
@@ -255,11 +264,10 @@ def _make(
return None

# Get shifted target and concatenate it with real values features
- sample["decoder_real"] = values_real[start_idx + encoder_length : start_idx + total_sample_length].copy()
+ sample["decoder_real"] = values_real[start_idx + encoder_length : start_idx + total_sample_length]

# Get shifted target and concatenate it with real values features
- sample["encoder_real"] = values_real[start_idx : start_idx + encoder_length].copy()
- sample["encoder_real"] = sample["encoder_real"][1:]
+ sample["encoder_real"] = values_real[start_idx + 1 : start_idx + encoder_length]

for index, feature in enumerate(self.embedding_sizes.keys()):
sample["encoder_categorical"][feature] = values_categorical[index][
@@ -276,10 +284,6 @@

sample["segment"] = segment
sample["weight"] = 1 + sample["encoder_target"].mean() if self.scale else 1
- sample["encoder_real"][:, 0] = values_real[start_idx + 1 : start_idx + encoder_length, 0] / sample["weight"]
- sample["decoder_real"][:, 0] = (
-     values_real[start_idx + encoder_length : start_idx + total_sample_length, 0] / sample["weight"]
- )

return sample

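Note: the `scale` logic moved out of `_make`, where dividing by the weight forced a `.copy()` of every sample, into `forward`/`step`, where it is applied once to the already-collated batch. A small PyTorch sketch of the batched rescaling; shapes follow the diff's comments, and computing `weights` inline here is illustrative (the model still reads it from `batch["weight"]`):

```python
# Sketch: per-sample target scaling applied on the batch, not in make_samples.
import torch

batch_size, encoder_length, input_size = 4, 8, 3
encoder_real = torch.randn(batch_size, encoder_length, input_size)

# weight per sample, as computed in _make: 1 + mean of the encoder target
weights = 1 + encoder_real[:, :, 0].mean(dim=1)  # (batch_size,)

# (batch_size, 1) broadcasts over the time axis; the collated batch owns its
# memory, so no original sample array is touched
encoder_real[:, :, 0] = encoder_real[:, :, 0] / weights.unsqueeze(1)
```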
26 changes: 12 additions & 14 deletions etna/models/nn/deepstate/deepstate.py
@@ -4,6 +4,7 @@
from typing import Optional
from typing import Tuple

+ import numpy as np
import pandas as pd
from typing_extensions import TypedDict

@@ -117,9 +118,9 @@ def step(self, batch: DeepStateBatch, *args, **kwargs):  # type: ignore
:
loss, true_target, prediction_target
"""
- encoder_real = batch["encoder_real"]  # (batch_size, seq_length, input_size)
+ encoder_real = batch["encoder_real"].float()  # (batch_size, seq_length, input_size)
encoder_categorical = batch["encoder_categorical"] # each (batch_size, seq_length, 1)
- targets = batch["encoder_target"]  # (batch_size, seq_length, 1)
+ targets = batch["encoder_target"].float()  # (batch_size, seq_length, 1)
seq_length = targets.shape[1]
datetime_index = batch["datetime_index"].permute(1, 0, 2)[
:, :, :seq_length
@@ -159,11 +160,11 @@ def forward(self, x: DeepStateBatch, *args, **kwargs):  # type: ignore
:
forecast with shape (batch_size, decoder_length, 1)
"""
- encoder_real = x["encoder_real"]  # (batch_size, seq_length, input_size)
+ encoder_real = x["encoder_real"].float()  # (batch_size, seq_length, input_size)
encoder_categorical = x["encoder_categorical"] # each (batch_size, seq_length, 1)
seq_length = encoder_real.shape[1]
- targets = x["encoder_target"][:, :seq_length]  # (batch_size, seq_length, 1)
- decoder_real = x["decoder_real"]  # (batch_size, horizon, input_size)
+ targets = x["encoder_target"][:, :seq_length].float()  # (batch_size, seq_length, 1)
+ decoder_real = x["decoder_real"].float()  # (batch_size, horizon, input_size)
decoder_categorical = x["decoder_categorical"] # each (batch_size, horizon, 1)
datetime_index_train = x["datetime_index"].permute(1, 0, 2)[
:, :, :seq_length
@@ -213,26 +214,23 @@ def forward(self, x: DeepStateBatch, *args, **kwargs):  # type: ignore
def make_samples(self, df: pd.DataFrame, encoder_length: int, decoder_length: int) -> Iterator[dict]:
"""Make samples from segment DataFrame."""
values_real = df.drop(columns=["target", "segment", "timestamp"] + list(self.embedding_sizes.keys())).values
- values_real = torch.from_numpy(values_real).float()

# Categories that were not seen during `fit` will be filled with new category
for feature in self.embedding_sizes:
df[feature] = df[feature].astype(float).fillna(self.embedding_sizes[feature][0])

# Columns in `values_categorical` are in the same order as in `embedding_sizes`
- values_categorical = torch.from_numpy(df[self.embedding_sizes.keys()].values.T)
+ values_categorical = df[self.embedding_sizes.keys()].values.T

- values_datetime = torch.from_numpy(self.ssm.generate_datetime_index(df["timestamp"]))
- values_datetime = values_datetime.to(torch.int64)
+ values_datetime = self.ssm.generate_datetime_index(df["timestamp"]).astype(int)
values_target = df["target"].values
- values_target = torch.from_numpy(values_target).float()
segment = df["segment"].values[0]

def _make(
-     values_target: torch.Tensor,
-     values_real: torch.Tensor,
-     values_categorical: torch.Tensor,
-     values_datetime: torch.Tensor,
+     values_target: np.ndarray,
+     values_real: np.ndarray,
+     values_categorical: np.ndarray,
+     values_datetime: np.ndarray,
segment: str,
start_idx: int,
encoder_length: int,
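Note: `make_samples` now hands out raw numpy slices and defers tensor conversion to `step`/`forward` (hence the new `.float()` calls there). The point is that basic numpy slicing yields views rather than copies, and `torch.from_numpy` is itself zero-copy; a short self-contained sketch:

```python
# Why returning numpy slices avoids per-sample copies.
import numpy as np
import torch

values = np.zeros((6, 2), dtype=np.float32)
values[:, 0] = np.arange(6)

window = values[1:5]          # basic slice -> a view, not a copy
assert window.base is values  # `.base` points at the owning array

tensor = torch.from_numpy(window)  # zero-copy: shares the same buffer
values[1, 0] = 100.0
assert tensor[0, 0].item() == 100.0  # mutation is visible through the tensor
```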
59 changes: 33 additions & 26 deletions etna/models/nn/tft_native/tft.py
@@ -381,6 +381,17 @@ def make_samples(self, df: pd.DataFrame, encoder_length: int, decoder_length: int) -> Iterator[dict]:
segment = df["segment"].values[0]
for feature in self.num_embeddings:
df[feature] = df[feature].astype(float).fillna(self.num_embeddings[feature])

+ reals_columns = list(set(self.static_reals + self.time_varying_reals_encoder + self.time_varying_reals_decoder))
+ categ_columns = list(
+     set(
+         self.static_categoricals
+         + self.time_varying_categoricals_encoder
+         + self.time_varying_categoricals_decoder
+     )
+ )
+
+ df = df[reals_columns + categ_columns]
column_to_index = {column: index for index, column in enumerate(df.columns)}
values = df.values.T

@@ -410,48 +421,44 @@ def _make(
return None

sample["segment"] = segment
- sample["decoder_target"] = (
-     values[column_to_index["target"]][start_idx + encoder_length : start_idx + total_sample_length]
-     .reshape(-1, 1)
-     .astype(float)
+ sample["decoder_target"] = values[column_to_index["target"]][
+     start_idx + encoder_length : start_idx + total_sample_length
+ ].reshape(
+     -1, 1
  )  # (decoder_length, 1)

for feature in self.static_reals:
-     sample["static_reals"][feature] = (
-         values[column_to_index[feature]][:1].reshape(-1, 1).astype(float)
-     )  # (1, 1)
+     sample["static_reals"][feature] = values[column_to_index[feature]][:1].reshape(-1, 1)  # (1, 1)

for feature in self.static_categoricals:
-     sample["static_categoricals"][feature] = (
-         values[column_to_index[feature]][:1].reshape(-1, 1).astype(float)
-     )  # (1, 1)
+     sample["static_categoricals"][feature] = values[column_to_index[feature]][:1].reshape(-1, 1)  # (1, 1)

for feature in self.time_varying_categoricals_encoder:
-     sample["time_varying_categoricals_encoder"][feature] = (
-         values[column_to_index[feature]][start_idx : start_idx + encoder_length]
-         .reshape(-1, 1)
-         .astype(float)
+     sample["time_varying_categoricals_encoder"][feature] = values[column_to_index[feature]][
+         start_idx : start_idx + encoder_length
+     ].reshape(
+         -1, 1
      )  # (encoder_length, 1)

for feature in self.time_varying_categoricals_decoder:
-     sample["time_varying_categoricals_decoder"][feature] = (
-         values[column_to_index[feature]][start_idx + encoder_length : start_idx + total_sample_length]
-         .reshape(-1, 1)
-         .astype(float)
+     sample["time_varying_categoricals_decoder"][feature] = values[column_to_index[feature]][
+         start_idx + encoder_length : start_idx + total_sample_length
+     ].reshape(
+         -1, 1
      )  # (decoder_length, 1)

for feature in self.time_varying_reals_encoder:
-     sample["time_varying_reals_encoder"][feature] = (
-         values[column_to_index[feature]][start_idx : start_idx + encoder_length]
-         .reshape(-1, 1)
-         .astype(float)
+     sample["time_varying_reals_encoder"][feature] = values[column_to_index[feature]][
+         start_idx : start_idx + encoder_length
+     ].reshape(
+         -1, 1
      )  # (encoder_length, 1)

for feature in self.time_varying_reals_decoder:
-     sample["time_varying_reals_decoder"][feature] = (
-         values[column_to_index[feature]][start_idx + encoder_length : start_idx + total_sample_length]
-         .reshape(-1, 1)
-         .astype(float)
+     sample["time_varying_reals_decoder"][feature] = values[column_to_index[feature]][
+         start_idx + encoder_length : start_idx + total_sample_length
+     ].reshape(
+         -1, 1
      )  # (decoder_length, 1)

return sample
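Note: besides reflowing the slicing, the TFT change pre-selects the used real and categorical columns, so `df.values` comes out as one homogeneous array and the per-feature `.astype(float)` copies become plain view slices. A sketch with made-up column names:

```python
# Sketch: pre-selecting columns makes later feature slices cheap views.
import pandas as pd

df = pd.DataFrame({"target": [1.0, 2.0, 3.0], "price": [9.0, 8.0, 7.0]})

df = df[["target", "price"]]  # only the columns the samples will use
column_to_index = {column: index for index, column in enumerate(df.columns)}

values = df.values.T                            # one float array, one row per feature
window = values[column_to_index["price"]][0:2]  # slice of a single feature
assert window.base is not None                  # a view into `values`, not a copy
```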
35 changes: 28 additions & 7 deletions etna/transforms/encoders/mean_encoder.py
@@ -3,6 +3,7 @@
from typing import Dict
from typing import List
from typing import Optional
+ from typing import Tuple
from typing import Union
from typing import cast

@@ -165,6 +166,28 @@ def _count_macro_running_mean(df, n_segments):
expanding_mean = pd.Series(index=df.index, data=expanding_mean.values).shift(n_segments)
return expanding_mean

+ @staticmethod
+ def _count_per_segment_cumstats(target: np.ndarray, categories: np.ndarray) -> Tuple[List[float], List[float]]:
+     cumsum = {}
+     cumcount = {}
+     for category in np.unique(categories):
+         cumsum[category] = np.NaN
+         cumcount[category] = np.NaN
+
+     ans_cumsum = []
+     ans_cumcount = []
+
+     for i in range(len(target)):
+         ans_cumsum.append(cumsum[categories[i]])
+         ans_cumcount.append(cumcount[categories[i]])
+         if not np.isnan(target[i]):
+             if np.isnan(cumsum[categories[i]]):
+                 cumsum[categories[i]] = 0
+                 cumcount[categories[i]] = 0
+             cumsum[categories[i]] += target[i]
+             cumcount[categories[i]] += 1
+     return ans_cumsum, ans_cumcount

def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Get encoded values for the segment.
@@ -211,15 +234,13 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
for segment in segments:
segment_df = TSDataset.to_flatten(intersected_df.loc[:, self.idx[segment, :]])
y = segment_df["target"]
+ categories = segment_df[self.in_column].values.astype(str)

# first timestamp is NaN
expanding_mean = y.expanding().mean().shift()
- # cumcount not including current timestamp
- cumcount = segment_df.loc[y.notna()].groupby(self.in_column, dropna=False).cumcount().reindex(y.index).replace(0, np.NaN)
- # cumsum not including current timestamp
- cumsum = segment_df['target'].groupby(segment_df[self.in_column].astype(str), dropna=False).transform(
-     lambda x: x.shift().fillna(0).cumsum()
- )
- cumsum = cumsum.where(cumcount.notna(), np.NaN)
+ cumsum, cumcount = self._count_per_segment_cumstats(y.values, categories)
+ cumsum = pd.Series(cumsum)
+ cumcount = pd.Series(cumcount)

feature = (cumsum + expanding_mean * self.smoothing) / (cumcount + self.smoothing)
if self.handle_missing is MissingMode.global_mean:
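Note: for each row, `_count_per_segment_cumstats` returns the running sum and count of the *previous* non-NaN targets in that row's category, staying NaN until the category has seen a real value; `_transform` then plugs them into `(cumsum + expanding_mean * smoothing) / (cumcount + smoothing)`. The fix is that NaN targets no longer disturb the statistics. A hand-worked, self-contained re-run of the same loop:

```python
# Standalone walk-through of the new cumulative statistics.
import numpy as np

target = np.array([1.0, np.nan, 2.0, 4.0, 3.0])
categories = np.array(["a", "a", "b", "a", "b"])

cumsum = {c: np.nan for c in np.unique(categories)}
cumcount = {c: np.nan for c in np.unique(categories)}
ans_cumsum, ans_cumcount = [], []
for t, c in zip(target, categories):
    ans_cumsum.append(cumsum[c])    # stats strictly before the current row
    ans_cumcount.append(cumcount[c])
    if not np.isnan(t):             # NaN targets update nothing -- the fix
        if np.isnan(cumsum[c]):
            cumsum[c], cumcount[c] = 0.0, 0.0
        cumsum[c] += t
        cumcount[c] += 1

print(ans_cumsum)    # [nan, 1.0, nan, 1.0, 2.0]
print(ans_cumcount)  # [nan, 1.0, nan, 1.0, 1.0]
```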
9 changes: 8 additions & 1 deletion tests/test_datasets/test_dataset.py
@@ -1592,7 +1592,7 @@ def test_drop_features_raise_warning_on_unknown_columns(
@pytest.mark.parametrize(
"features, expected_regressors",
(
-         (["target", "regressor_2"], ["regressor_1"]),
+         (["regressor_2"], ["regressor_1"]),
(["out_of_dataset_column"], ["regressor_1", "regressor_2"]),
),
)
@@ -1603,6 +1603,13 @@ def test_drop_features_update_regressors(df_and_regressors, features, expected_regressors):
assert sorted(ts.regressors) == sorted(expected_regressors)


+ def test_drop_features_throw_error_on_target(df_and_regressors):
+     df, df_exog, known_future = df_and_regressors
+     ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future=known_future)
+     with pytest.raises(ValueError, match="Target can't be dropped from the dataset!"):
+         ts.drop_features(features=["target"], drop_from_exog=False)


def test_drop_features_throw_error_on_target_components(ts_with_target_components):
with pytest.raises(
ValueError,
11 changes: 8 additions & 3 deletions tests/test_models/test_nn/deepar_native/test_deepar_native.py
@@ -105,12 +105,11 @@ def test_deepar_make_samples(df_name, scale, weights, cat_columns, request):
num_samples_check = 2
df["target_shifted"] = df["target"].shift(1)
for i in range(num_samples_check):
- df[f"target_shifted_scaled_{i}"] = df["target_shifted"] / weights[i]
  expected_sample = {
-     "encoder_real": df[[f"target_shifted_scaled_{i}", "regressor_float", "regressor_int"]]
+     "encoder_real": df[["target_shifted", "regressor_float", "regressor_int"]]
      .iloc[1 + i : encoder_length + i]
      .values,
-     "decoder_real": df[[f"target_shifted_scaled_{i}", "regressor_float", "regressor_int"]]
+     "decoder_real": df[["target_shifted", "regressor_float", "regressor_int"]]
.iloc[encoder_length + i : encoder_length + decoder_length + i]
.values,
"encoder_categorical": {
@@ -138,6 +137,12 @@
assert ts_samples[i]["segment"] == "segment_1"
for key in expected_sample:
np.testing.assert_equal(ts_samples[i][key], expected_sample[key])
+ if "categorical" in key:
+     for column in ts_samples[i][key]:
+         assert ts_samples[i][key][column].base is not None
+ else:
+     if key != "weight":
+         assert ts_samples[i][key].base is not None


@pytest.mark.parametrize("encoder_length", [1, 2, 10])
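Note: the new assertions turn the memory optimization into a regression test: `ndarray.base` is `None` for an array that owns its data and non-`None` for a view, so any copy reintroduced into `make_samples` fails the test (the scalar `weight` is exempt). For reference:

```python
import numpy as np

arr = np.arange(10)
assert arr.base is None              # owns its data
assert arr[2:5].base is arr          # a slice is a view into `arr`
assert arr[2:5].copy().base is None  # an explicit copy owns its data again
```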
1 change: 1 addition & 0 deletions tests/test_models/test_nn/nbeats/test_nbeats_nets.py
@@ -53,6 +53,7 @@ def test_make_samples(df_name, request):
assert first_sample["target_mask"] is None
assert first_sample["segment"] == "segment_1"
np.testing.assert_equal(first_sample["history"], expected_first_sample["history"])
+ assert first_sample["history"].base is not None


@pytest.mark.parametrize(
5 changes: 5 additions & 0 deletions tests/test_models/test_nn/test_deepstate.py
@@ -139,6 +139,11 @@ def test_deepstate_make_samples(df_name, cat_columns, request):
assert ts_samples[i]["segment"] == "segment_1"
for key in expected_sample:
np.testing.assert_equal(ts_samples[i][key], expected_sample[key])
+ if "categorical" in key:
+     for column in ts_samples[i][key]:
+         assert ts_samples[i][key][column].base is not None
+ else:
+     assert ts_samples[i][key].base is not None


def test_save_load(example_tsds):
5 changes: 5 additions & 0 deletions tests/test_models/test_nn/test_mlp.py
@@ -111,6 +111,11 @@ def test_mlp_make_samples(df_name, cat_columns, request):
assert ts_samples[i]["segment"] == "segment_1"
for key in expected_sample:
np.testing.assert_equal(ts_samples[i][key], expected_sample[key])
+ if "categorical" in key:
+     for column in ts_samples[i][key]:
+         assert ts_samples[i][key][column].base is not None
+ else:
+     assert ts_samples[i][key].base is not None


def test_mlp_forward_fail_nans():
1 change: 1 addition & 0 deletions tests/test_models/test_nn/test_patchts.py
@@ -79,6 +79,7 @@ def test_patchts_make_samples(df_name, request):
assert ts_samples[i]["segment"] == "segment_1"
for key in expected_sample:
np.testing.assert_equal(ts_samples[i][key], expected_sample[key])
+     assert ts_samples[i][key].base is not None


def test_save_load(example_tsds):
5 changes: 5 additions & 0 deletions tests/test_models/test_nn/test_rnn.py
@@ -121,6 +121,11 @@ def test_rnn_make_samples(df_name, cat_columns, request):
assert ts_samples[i]["segment"] == "segment_1"
for key in expected_sample:
np.testing.assert_equal(ts_samples[i][key], expected_sample[key])
+ if "categorical" in key:
+     for column in ts_samples[i][key]:
+         assert ts_samples[i][key][column].base is not None
+ else:
+     assert ts_samples[i][key].base is not None


@pytest.mark.parametrize("encoder_length", [1, 2, 10])