
Commit

fix MeanSegmentEncoder
Egor Baturin committed Nov 7, 2024
2 parents 132d07f + 4a6e975 commit f62f2d4
Showing 19 changed files with 357 additions and 210 deletions.
8 changes: 4 additions & 4 deletions CHANGELOG.md
@@ -38,10 +38,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed
- Fix working with `embedding_sizes` in `202-NN_examples` notebook ([#489](https://github.com/etna-team/etna/pull/489))
-
-
-
-
- Disallow dropping target in `TSDataset.drop_features` ([#491](https://github.com/etna-team/etna/pull/491))
- Optimize memory usage in `TFTNativeModel` by eliminating copying when making samples ([#494](https://github.com/etna-team/etna/pull/494))
- Optimize memory usage in `DeepStateModel` and `DeepARNativeModel` by eliminating copying when making samples ([#499](https://github.com/etna-team/etna/pull/499))
- Fix working with NaN target in `MeanEncoderTransform` ([#492](https://github.com/etna-team/etna/pull/492))
-
-
-
5 changes: 4 additions & 1 deletion etna/datasets/tsdataset.py
@@ -1296,7 +1296,7 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
Raises
------
ValueError:
- If ``features`` list contains target components
If ``features`` list contains target or target components
"""
features_set = set(features)

@@ -1312,6 +1312,9 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
"Prediction intervals can't be dropped from the dataset using this method! Use `drop_prediction_intervals` method!"
)

if "target" in features_set:
raise ValueError("Target can't be dropped from the dataset!")

dfs = [("df", self.df)]
if drop_from_exog:
dfs.append(("df_exog", self.df_exog))
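The guard is small, but its effect is easiest to see from the caller's side. Below is a standalone sketch (plain Python, not the etna method; `drop_features_sketch` and the column names are invented for illustration) of the behaviour this change enforces: ordinary features can still be dropped, while asking to drop `target` raises immediately.

from typing import List, Set


def drop_features_sketch(columns: Set[str], features: List[str]) -> Set[str]:
    # Mirrors the new check added to TSDataset.drop_features.
    features_set = set(features)
    if "target" in features_set:
        raise ValueError("Target can't be dropped from the dataset!")
    return columns - features_set


print(drop_features_sketch({"target", "feature_1"}, ["feature_1"]))  # {'target'}
try:
    drop_features_sketch({"target", "feature_1"}, ["target"])
except ValueError as error:
    print(error)  # Target can't be dropped from the dataset!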
20 changes: 12 additions & 8 deletions etna/models/nn/deepar_native/deepar.py
@@ -132,6 +132,11 @@ def forward(self, x: DeepARNativeBatch, *args, **kwargs): # type: ignore
decoder_categorical = x["decoder_categorical"] # each (batch_size, decoder_length, 1)
decoder_target = x["decoder_target"].float() # (batch_size, decoder_length, 1)
decoder_length = decoder_real.shape[1]
weights = x["weight"]

# scale target values at index 0
encoder_real[:, :, 0] = encoder_real[:, :, 0] / weights.unsqueeze(1)
decoder_real[:, :, 0] = decoder_real[:, :, 0] / weights.unsqueeze(1)

encoder_embeddings = self.embedding(encoder_categorical) if self.embedding is not None else torch.Tensor()
decoder_embeddings = self.embedding(decoder_categorical) if self.embedding is not None else torch.Tensor()
@@ -191,7 +196,11 @@ def step(self, batch: DeepARNativeBatch, *args, **kwargs): # type: ignore
decoder_categorical = batch["decoder_categorical"] # each (batch_size, decoder_length, 1)
encoder_target = batch["encoder_target"].float() # (batch_size, encoder_length-1, 1)
decoder_target = batch["decoder_target"].float() # (batch_size, decoder_length, 1)
- weights = batch["weight"]
weights = batch["weight"] # (batch_size)

# scale target values at index 0
encoder_real[:, :, 0] = encoder_real[:, :, 0] / weights.unsqueeze(1)
decoder_real[:, :, 0] = decoder_real[:, :, 0] / weights.unsqueeze(1)

encoder_embeddings = self.embedding(encoder_categorical) if self.embedding is not None else torch.Tensor()
decoder_embeddings = self.embedding(decoder_categorical) if self.embedding is not None else torch.Tensor()
@@ -255,11 +264,10 @@ def _make(
return None

# Get shifted target and concatenate it with real values features
sample["decoder_real"] = values_real[start_idx + encoder_length : start_idx + total_sample_length].copy()
sample["decoder_real"] = values_real[start_idx + encoder_length : start_idx + total_sample_length]

# Get shifted target and concatenate it with real values features
sample["encoder_real"] = values_real[start_idx : start_idx + encoder_length].copy()
sample["encoder_real"] = sample["encoder_real"][1:]
sample["encoder_real"] = values_real[start_idx + 1 : start_idx + encoder_length]

for index, feature in enumerate(self.embedding_sizes.keys()):
sample["encoder_categorical"][feature] = values_categorical[index][
@@ -276,10 +284,6 @@

sample["segment"] = segment
sample["weight"] = 1 + sample["encoder_target"].mean() if self.scale else 1
sample["encoder_real"][:, 0] = values_real[start_idx + 1 : start_idx + encoder_length, 0] / sample["weight"]
sample["decoder_real"][:, 0] = (
values_real[start_idx + encoder_length : start_idx + total_sample_length, 0] / sample["weight"]
)

return sample

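Taken together, these changes move the target scaling out of `_make` and into `forward`/`step`, which is presumably what lets the `.copy()` calls go: samples are no longer scaled in place, so they can hold plain views of the segment arrays. A minimal sketch of the pattern with invented shapes (not the real `DeepARNativeBatch` layout):

import torch

# Invented sizes for illustration only.
batch_size, encoder_length, decoder_length, n_features = 4, 10, 5, 3

encoder_real = torch.randn(batch_size, encoder_length, n_features)  # channel 0 holds the shifted target
decoder_real = torch.randn(batch_size, decoder_length, n_features)
encoder_target = torch.randn(batch_size, encoder_length, 1)

# Per-sample weight, as `_make` computes it when scale=True.
weights = 1 + encoder_target.mean(dim=(1, 2))  # (batch_size,)

# The division now happens inside forward()/step(), so make_samples can hand out
# uncopied views of the original arrays.
encoder_real[:, :, 0] = encoder_real[:, :, 0] / weights.unsqueeze(1)
decoder_real[:, :, 0] = decoder_real[:, :, 0] / weights.unsqueeze(1)

print(encoder_real.shape, decoder_real.shape, weights.shape)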
26 changes: 12 additions & 14 deletions etna/models/nn/deepstate/deepstate.py
@@ -4,6 +4,7 @@
from typing import Optional
from typing import Tuple

import numpy as np
import pandas as pd
from typing_extensions import TypedDict

@@ -117,9 +118,9 @@ def step(self, batch: DeepStateBatch, *args, **kwargs): # type: ignore
:
loss, true_target, prediction_target
"""
- encoder_real = batch["encoder_real"] # (batch_size, seq_length, input_size)
encoder_real = batch["encoder_real"].float() # (batch_size, seq_length, input_size)
encoder_categorical = batch["encoder_categorical"] # each (batch_size, seq_length, 1)
- targets = batch["encoder_target"] # (batch_size, seq_length, 1)
targets = batch["encoder_target"].float() # (batch_size, seq_length, 1)
seq_length = targets.shape[1]
datetime_index = batch["datetime_index"].permute(1, 0, 2)[
:, :, :seq_length
@@ -159,11 +160,11 @@ def forward(self, x: DeepStateBatch, *args, **kwargs): # type: ignore
:
forecast with shape (batch_size, decoder_length, 1)
"""
- encoder_real = x["encoder_real"] # (batch_size, seq_length, input_size)
encoder_real = x["encoder_real"].float() # (batch_size, seq_length, input_size)
encoder_categorical = x["encoder_categorical"] # each (batch_size, seq_length, 1)
seq_length = encoder_real.shape[1]
- targets = x["encoder_target"][:, :seq_length] # (batch_size, seq_length, 1)
- decoder_real = x["decoder_real"] # (batch_size, horizon, input_size)
targets = x["encoder_target"][:, :seq_length].float() # (batch_size, seq_length, 1)
decoder_real = x["decoder_real"].float() # (batch_size, horizon, input_size)
decoder_categorical = x["decoder_categorical"] # each (batch_size, horizon, 1)
datetime_index_train = x["datetime_index"].permute(1, 0, 2)[
:, :, :seq_length
@@ -213,26 +214,23 @@ def forward(self, x: DeepStateBatch, *args, **kwargs): # type: ignore
def make_samples(self, df: pd.DataFrame, encoder_length: int, decoder_length: int) -> Iterator[dict]:
"""Make samples from segment DataFrame."""
values_real = df.drop(columns=["target", "segment", "timestamp"] + list(self.embedding_sizes.keys())).values
- values_real = torch.from_numpy(values_real).float()

# Categories that were not seen during `fit` will be filled with new category
for feature in self.embedding_sizes:
df[feature] = df[feature].astype(float).fillna(self.embedding_sizes[feature][0])

# Columns in `values_categorical` are in the same order as in `embedding_sizes`
- values_categorical = torch.from_numpy(df[self.embedding_sizes.keys()].values.T)
values_categorical = df[self.embedding_sizes.keys()].values.T

- values_datetime = torch.from_numpy(self.ssm.generate_datetime_index(df["timestamp"]))
- values_datetime = values_datetime.to(torch.int64)
values_datetime = self.ssm.generate_datetime_index(df["timestamp"]).astype(int)
values_target = df["target"].values
- values_target = torch.from_numpy(values_target).float()
segment = df["segment"].values[0]

def _make(
- values_target: torch.Tensor,
- values_real: torch.Tensor,
- values_categorical: torch.Tensor,
- values_datetime: torch.Tensor,
values_target: np.ndarray,
values_real: np.ndarray,
values_categorical: np.ndarray,
values_datetime: np.ndarray,
segment: str,
start_idx: int,
encoder_length: int,
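The DeepState changes follow the same idea as the DeepAR ones: `make_samples` keeps NumPy arrays (views where possible) and the conversion to float tensors happens once per batch in `step`/`forward`. A toy sketch of that hand-off, with made-up shapes and a manual stand-in for the dataloader's default collate:

import numpy as np
import torch

values_real = np.random.rand(100, 4)  # toy feature matrix for one segment

sample = {
    "encoder_real": values_real[0:30],        # a view, nothing is copied here
    "encoder_target": np.random.rand(30, 1),
}

# torch's default collate stacks NumPy arrays into tensors; emulated here by hand.
batch = {key: torch.as_tensor(value).unsqueeze(0) for key, value in sample.items()}

encoder_real = batch["encoder_real"].float()    # cast to float32 once, inside step()
encoder_target = batch["encoder_target"].float()
print(encoder_real.dtype, encoder_real.shape)   # torch.float32 torch.Size([1, 30, 4])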
59 changes: 33 additions & 26 deletions etna/models/nn/tft_native/tft.py
@@ -381,6 +381,17 @@ def make_samples(self, df: pd.DataFrame, encoder_length: int, decoder_length: int) -> Iterator[dict]:
segment = df["segment"].values[0]
for feature in self.num_embeddings:
df[feature] = df[feature].astype(float).fillna(self.num_embeddings[feature])

reals_columns = list(set(self.static_reals + self.time_varying_reals_encoder + self.time_varying_reals_decoder))
categ_columns = list(
set(
self.static_categoricals
+ self.time_varying_categoricals_encoder
+ self.time_varying_categoricals_decoder
)
)

df = df[reals_columns + categ_columns]
column_to_index = {column: index for index, column in enumerate(df.columns)}
values = df.values.T

@@ -410,48 +421,44 @@ def _make(
return None

sample["segment"] = segment
sample["decoder_target"] = (
values[column_to_index["target"]][start_idx + encoder_length : start_idx + total_sample_length]
.reshape(-1, 1)
.astype(float)
sample["decoder_target"] = values[column_to_index["target"]][
start_idx + encoder_length : start_idx + total_sample_length
].reshape(
-1, 1
) # (decoder_length, 1)

for feature in self.static_reals:
sample["static_reals"][feature] = (
values[column_to_index[feature]][:1].reshape(-1, 1).astype(float)
) # (1, 1)
sample["static_reals"][feature] = values[column_to_index[feature]][:1].reshape(-1, 1) # (1, 1)

for feature in self.static_categoricals:
sample["static_categoricals"][feature] = (
values[column_to_index[feature]][:1].reshape(-1, 1).astype(float)
) # (1, 1)
sample["static_categoricals"][feature] = values[column_to_index[feature]][:1].reshape(-1, 1) # (1, 1)

for feature in self.time_varying_categoricals_encoder:
sample["time_varying_categoricals_encoder"][feature] = (
values[column_to_index[feature]][start_idx : start_idx + encoder_length]
.reshape(-1, 1)
.astype(float)
sample["time_varying_categoricals_encoder"][feature] = values[column_to_index[feature]][
start_idx : start_idx + encoder_length
].reshape(
-1, 1
) # (encoder_length, 1)

for feature in self.time_varying_categoricals_decoder:
sample["time_varying_categoricals_decoder"][feature] = (
values[column_to_index[feature]][start_idx + encoder_length : start_idx + total_sample_length]
.reshape(-1, 1)
.astype(float)
sample["time_varying_categoricals_decoder"][feature] = values[column_to_index[feature]][
start_idx + encoder_length : start_idx + total_sample_length
].reshape(
-1, 1
) # (decoder_length, 1)

for feature in self.time_varying_reals_encoder:
sample["time_varying_reals_encoder"][feature] = (
values[column_to_index[feature]][start_idx : start_idx + encoder_length]
.reshape(-1, 1)
.astype(float)
sample["time_varying_reals_encoder"][feature] = values[column_to_index[feature]][
start_idx : start_idx + encoder_length
].reshape(
-1, 1
) # (encoder_length, 1)

for feature in self.time_varying_reals_decoder:
sample["time_varying_reals_decoder"][feature] = (
values[column_to_index[feature]][start_idx + encoder_length : start_idx + total_sample_length]
.reshape(-1, 1)
.astype(float)
sample["time_varying_reals_decoder"][feature] = values[column_to_index[feature]][
start_idx + encoder_length : start_idx + total_sample_length
].reshape(
-1, 1
) # (decoder_length, 1)

return sample
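Restricting `df` to the columns listed in `reals_columns` and `categ_columns` is presumably what allows the per-feature `.astype(float)` calls to be dropped: once non-numeric columns such as `segment` and `timestamp` are excluded, `df.values` is already numeric. A toy illustration of the `column_to_index` lookup with invented column names:

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "target": np.arange(8, dtype=float),
        "price": np.random.rand(8),           # imaginary real-valued feature
        "holiday": [0, 1, 0, 0, 1, 0, 0, 1],  # imaginary categorical feature
    }
)

used_columns = ["target", "price", "holiday"]  # stands in for reals_columns + categ_columns
df = df[used_columns]
column_to_index = {column: index for index, column in enumerate(df.columns)}
values = df.values.T  # (n_columns, n_timestamps): transpose once, then slice rows by index

encoder_length = 5
encoder_target = values[column_to_index["target"]][:encoder_length].reshape(-1, 1)
print(encoder_target.shape)  # (5, 1)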
83 changes: 56 additions & 27 deletions etna/transforms/encoders/mean_encoder.py
@@ -3,9 +3,11 @@
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union
from typing import cast

import numba
import numpy as np
import pandas as pd
from bottleneck import nanmean
@@ -165,6 +167,39 @@ def _count_macro_running_mean(df, n_segments):
expanding_mean = pd.Series(index=df.index, data=expanding_mean.values).shift(n_segments)
return expanding_mean

@staticmethod
@numba.njit()
def _count_per_segment_cumstats(target: np.ndarray, categories: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
ans_cumsum = np.full_like(target, np.nan)
ans_cumcount = np.full_like(target, np.nan)
unique_categories = np.unique(categories)
for category in unique_categories:
idx = np.where(category == categories)[0]
t = target[idx]

# Mask for valid (non-NaN) target values
valid = ~np.isnan(t)

# Compute cumulative sums and counts for valid values
cumsum = np.cumsum(np.where(valid, t, 0))
cumcount = np.cumsum(valid).astype(np.float32)

# Shift statistics by 1 to get statistics not including current index
cumsum = np.roll(cumsum, 1)
cumcount = np.roll(cumcount, 1)

cumsum[0] = np.NaN
cumcount[0] = np.NaN

# Handle positions with no previous valid values
cumsum[cumcount == 0] = np.NaN
cumcount[cumcount == 0] = np.NaN

# Assign the computed values back to the answer arrays
ans_cumsum[idx] = cumsum
ans_cumcount[idx] = cumcount
return ans_cumsum, ans_cumcount

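To make the NaN handling concrete, here is a plain-NumPy restatement of the helper on a toy series (it mirrors the numba version above but is not the etna code): for each category the returned statistics cover only earlier non-NaN targets, and positions with no such history stay NaN.

import numpy as np


def per_segment_cumstats(target, categories):
    ans_cumsum = np.full_like(target, np.nan)
    ans_cumcount = np.full_like(target, np.nan)
    for category in np.unique(categories):
        idx = np.where(categories == category)[0]
        t = target[idx]
        valid = ~np.isnan(t)
        cumsum = np.roll(np.cumsum(np.where(valid, t, 0)), 1)   # shift by one position
        cumcount = np.roll(np.cumsum(valid).astype(float), 1)
        cumsum[0] = np.nan
        cumcount[0] = np.nan
        cumsum[cumcount == 0] = np.nan    # no earlier non-NaN values yet
        cumcount[cumcount == 0] = np.nan
        ans_cumsum[idx] = cumsum
        ans_cumcount[idx] = cumcount
    return ans_cumsum, ans_cumcount


target = np.array([1.0, np.nan, 3.0, 5.0])
categories = np.array([0, 0, 1, 0])
cumsum, cumcount = per_segment_cumstats(target, categories)
print(cumsum)    # [nan  1. nan  1.]
print(cumcount)  # [nan  1. nan  1.]

These arrays then feed the smoothed encoding `(cumsum + expanding_mean * smoothing) / (cumcount + smoothing)` computed in `_transform` below.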
def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Get encoded values for the segment.
Expand Down Expand Up @@ -211,33 +246,24 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
for segment in segments:
segment_df = TSDataset.to_flatten(intersected_df.loc[:, self.idx[segment, :]])
y = segment_df["target"]
categories = segment_df[self.in_column].values.astype(str)

unique_categories = np.unique(categories)
cat_to_int = {cat: idx for idx, cat in enumerate(unique_categories)}
int_categories = np.array([cat_to_int[cat] for cat in categories], dtype=np.int64)

# first timestamp is NaN
expanding_mean = y.expanding().mean().shift()
- # generate NaN mask
- first_notna_index = segment_df.loc[y.notna()].groupby(self.in_column).head(1).index
- first_notna_index = pd.Series(index=first_notna_index, data=True).reindex(y.index).fillna(False)
- first_appearance = segment_df.groupby(self.in_column).cumcount() == 0
- mask = ~(first_appearance | first_notna_index)
- # cumcount not including current timestamp
- cumcount_include_nan_index = y.groupby(segment_df[self.in_column].astype(str)).cumcount()
- cumcount = (
- y.dropna()
- .groupby(segment_df[self.in_column].astype(str))
- .cumcount()
- .reindex(y.index)
- .fillna(cumcount_include_nan_index)
- )
- cumcount = cumcount.where(mask, np.nan)
- # cumsum not including current timestamp
- cumsum = y.groupby(segment_df[self.in_column].astype(str)).transform(
- lambda x: x.shift().fillna(0).cumsum()
- )
- cumsum = cumsum.where(mask, np.nan)

cumsum, cumcount = self._count_per_segment_cumstats(y.values, int_categories)
cumsum = pd.Series(cumsum)
cumcount = pd.Series(cumcount)

feature = (cumsum + expanding_mean * self.smoothing) / (cumcount + self.smoothing)
if self.handle_missing is MissingMode.global_mean:
nan_feature_index = segment_df[segment_df[self.in_column].isnull()].index
feature.loc[nan_feature_index] = expanding_mean.loc[nan_feature_index]

intersected_df.loc[:, self.idx[segment, self.out_column]] = feature.values

else:
@@ -254,27 +280,30 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
cur_timestamp_idx = np.arange(0, len(timestamps) * n_segments, len(timestamps))
for _ in range(len(timestamps)):
timestamp_df = flatten.loc[cur_timestamp_idx]

# statistics from previous timestamp
cumsum_dict = dict(cumstats[[self.in_column, "sum"]].values)
cumcount_dict = dict(cumstats[[self.in_column, "count"]].values)

# map categories for current timestamp to statistics
temp.loc[cur_timestamp_idx, "cumsum"] = timestamp_df[self.in_column].map(cumsum_dict)
temp.loc[cur_timestamp_idx, "cumcount"] = timestamp_df[self.in_column].map(cumcount_dict)

# count statistics for current timestamp
stats = (
timestamp_df["target"]
.groupby(timestamp_df[self.in_column], dropna=False)
.agg(["count", "sum"])
.reset_index()
- .replace(0, np.NaN)
)
# statistics become zeros for categories with target=NaN
stats = stats.replace({"count": 0, "sum": 0}, np.NaN)

# sum current and previous statistics
- cumstats = (
- pd.concat([cumstats, stats])
- .groupby(self.in_column, as_index=False, dropna=False)
- .sum()
- .replace(0, np.NaN)
- )
cumstats = pd.concat([cumstats, stats]).groupby(self.in_column, as_index=False, dropna=False).sum()
# zeros appear for categories that weren't updated in previous line and whose statistics were NaN
cumstats = cumstats.replace({"count": 0, "sum": 0}, np.NaN)

cur_timestamp_idx += 1

feature = (temp["cumsum"] + running_mean * self.smoothing) / (temp["cumcount"] + self.smoothing)
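The subtle part of the macro branch is keeping all-NaN categories at NaN instead of letting pandas collapse them to zero. A toy single-step sketch of the update performed inside the loop above, with invented category and target values:

import numpy as np
import pandas as pd

in_column = "category"

# Cumulative statistics gathered from earlier timestamps: "b" has seen only NaN targets so far.
cumstats = pd.DataFrame({in_column: ["a", "b"], "sum": [3.0, np.nan], "count": [2.0, np.nan]})

# Rows of the current timestamp across segments; the target for "b" is NaN again.
timestamp_df = pd.DataFrame({in_column: ["a", "b"], "target": [5.0, np.nan]})

stats = (
    timestamp_df["target"]
    .groupby(timestamp_df[in_column], dropna=False)
    .agg(["count", "sum"])
    .reset_index()
)
# count/sum come out as 0 for groups whose targets were all NaN -> put NaN back.
stats = stats.replace({"count": 0, "sum": 0}, np.nan)

cumstats = pd.concat([cumstats, stats]).groupby(in_column, as_index=False, dropna=False).sum()
cumstats = cumstats.replace({"count": 0, "sum": 0}, np.nan)
print(cumstats)  # "a": sum 8.0, count 3.0; "b": both stay NaN rather than 0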