
Commit

fix MeanSegmentEncoder
Egor Baturin committed Nov 7, 2024
2 parents 132d07f + 4a6e975 commit f62f2d4
Showing 19 changed files with 357 additions and 210 deletions.
8 changes: 4 additions & 4 deletions CHANGELOG.md
@@ -38,10 +38,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

### Fixed
- Fix working with `embedding_sizes` in `202-NN_examples` notebook ([#489](https://github.com/etna-team/etna/pull/489))
-
-
-
-
- Disallow dropping target in `TSDataset.drop_features` ([#491](https://github.com/etna-team/etna/pull/491))
- Optimize memory usage in `TFTNativeModel` by eliminating copying when making samples ([#494](https://github.com/etna-team/etna/pull/494))
- Optimize memory usage in `DeepStateModel` and `DeepARNativeModel` by eliminating copying when making samples ([#499](https://github.com/etna-team/etna/pull/499))
- Fix working with NaN target in `MeanEncoderTransform` ([#492](https://github.com/etna-team/etna/pull/492))
-
-
-
5 changes: 4 additions & 1 deletion etna/datasets/tsdataset.py
@@ -1296,7 +1296,7 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
Raises
------
ValueError:
- If ``features`` list contains target components
If ``features`` list contains target or target components
"""
features_set = set(features)

@@ -1312,6 +1312,9 @@ def drop_features(self, features: List[str], drop_from_exog: bool = False):
"Prediction intervals can't be dropped from the dataset using this method! Use `drop_prediction_intervals` method!"
)

if "target" in features_set:
raise ValueError("Target can't be dropped from the dataset!")

dfs = [("df", self.df)]
if drop_from_exog:
dfs.append(("df_exog", self.df_exog))
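The guard is small, but its effect is easiest to see from the caller's side. Below is a standalone sketch (plain Python, not the etna method; `drop_features_sketch` and the column names are invented for illustration) of the behaviour this change enforces: ordinary features can still be dropped, while asking to drop `target` raises immediately.

from typing import List, Set


def drop_features_sketch(columns: Set[str], features: List[str]) -> Set[str]:
    # Mirrors the new check added to TSDataset.drop_features.
    features_set = set(features)
    if "target" in features_set:
        raise ValueError("Target can't be dropped from the dataset!")
    return columns - features_set


print(drop_features_sketch({"target", "feature_1"}, ["feature_1"]))  # {'target'}
try:
    drop_features_sketch({"target", "feature_1"}, ["target"])
except ValueError as error:
    print(error)  # Target can't be dropped from the dataset!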
20 changes: 12 additions & 8 deletions etna/models/nn/deepar_native/deepar.py
@@ -132,6 +132,11 @@ def forward(self, x: DeepARNativeBatch, *args, **kwargs): # type: ignore
decoder_categorical = x["decoder_categorical"] # each (batch_size, decoder_length, 1)
decoder_target = x["decoder_target"].float() # (batch_size, decoder_length, 1)
decoder_length = decoder_real.shape[1]
weights = x["weight"]

# scale target values at index 0
encoder_real[:, :, 0] = encoder_real[:, :, 0] / weights.unsqueeze(1)
decoder_real[:, :, 0] = decoder_real[:, :, 0] / weights.unsqueeze(1)

encoder_embeddings = self.embedding(encoder_categorical) if self.embedding is not None else torch.Tensor()
decoder_embeddings = self.embedding(decoder_categorical) if self.embedding is not None else torch.Tensor()
@@ -191,7 +196,11 @@ def step(self, batch: DeepARNativeBatch, *args, **kwargs): # type: ignore
decoder_categorical = batch["decoder_categorical"] # each (batch_size, decoder_length, 1)
encoder_target = batch["encoder_target"].float() # (batch_size, encoder_length-1, 1)
decoder_target = batch["decoder_target"].float() # (batch_size, decoder_length, 1)
- weights = batch["weight"]
weights = batch["weight"] # (batch_size)

# scale target values at index 0
encoder_real[:, :, 0] = encoder_real[:, :, 0] / weights.unsqueeze(1)
decoder_real[:, :, 0] = decoder_real[:, :, 0] / weights.unsqueeze(1)

encoder_embeddings = self.embedding(encoder_categorical) if self.embedding is not None else torch.Tensor()
decoder_embeddings = self.embedding(decoder_categorical) if self.embedding is not None else torch.Tensor()
@@ -255,11 +264,10 @@ def _make(
return None

# Get shifted target and concatenate it with real values features
sample["decoder_real"] = values_real[start_idx + encoder_length : start_idx + total_sample_length].copy()
sample["decoder_real"] = values_real[start_idx + encoder_length : start_idx + total_sample_length]

# Get shifted target and concatenate it with real values features
sample["encoder_real"] = values_real[start_idx : start_idx + encoder_length].copy()
sample["encoder_real"] = sample["encoder_real"][1:]
sample["encoder_real"] = values_real[start_idx + 1 : start_idx + encoder_length]

for index, feature in enumerate(self.embedding_sizes.keys()):
sample["encoder_categorical"][feature] = values_categorical[index][
@@ -276,10 +284,6 @@

sample["segment"] = segment
sample["weight"] = 1 + sample["encoder_target"].mean() if self.scale else 1
sample["encoder_real"][:, 0] = values_real[start_idx + 1 : start_idx + encoder_length, 0] / sample["weight"]
sample["decoder_real"][:, 0] = (
values_real[start_idx + encoder_length : start_idx + total_sample_length, 0] / sample["weight"]
)

return sample

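Taken together, these changes move the target scaling out of `_make` and into `forward`/`step`, which is presumably what lets the `.copy()` calls go: samples are no longer scaled in place, so they can hold plain views of the segment arrays. A minimal sketch of the pattern with invented shapes (not the real `DeepARNativeBatch` layout):

import torch

# Invented sizes for illustration only.
batch_size, encoder_length, decoder_length, n_features = 4, 10, 5, 3

encoder_real = torch.randn(batch_size, encoder_length, n_features)  # channel 0 holds the shifted target
decoder_real = torch.randn(batch_size, decoder_length, n_features)
encoder_target = torch.randn(batch_size, encoder_length, 1)

# Per-sample weight, as `_make` computes it when scale=True.
weights = 1 + encoder_target.mean(dim=(1, 2))  # (batch_size,)

# The division now happens inside forward()/step(), so make_samples can hand out
# uncopied views of the original arrays.
encoder_real[:, :, 0] = encoder_real[:, :, 0] / weights.unsqueeze(1)
decoder_real[:, :, 0] = decoder_real[:, :, 0] / weights.unsqueeze(1)

print(encoder_real.shape, decoder_real.shape, weights.shape)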
26 changes: 12 additions & 14 deletions etna/models/nn/deepstate/deepstate.py
@@ -4,6 +4,7 @@
from typing import Optional
from typing import Tuple

import numpy as np
import pandas as pd
from typing_extensions import TypedDict

@@ -117,9 +118,9 @@ def step(self, batch: DeepStateBatch, *args, **kwargs): # type: ignore
:
loss, true_target, prediction_target
"""
- encoder_real = batch["encoder_real"] # (batch_size, seq_length, input_size)
encoder_real = batch["encoder_real"].float() # (batch_size, seq_length, input_size)
encoder_categorical = batch["encoder_categorical"] # each (batch_size, seq_length, 1)
- targets = batch["encoder_target"] # (batch_size, seq_length, 1)
targets = batch["encoder_target"].float() # (batch_size, seq_length, 1)
seq_length = targets.shape[1]
datetime_index = batch["datetime_index"].permute(1, 0, 2)[
:, :, :seq_length
@@ -159,11 +160,11 @@ def forward(self, x: DeepStateBatch, *args, **kwargs): # type: ignore
:
forecast with shape (batch_size, decoder_length, 1)
"""
- encoder_real = x["encoder_real"] # (batch_size, seq_length, input_size)
encoder_real = x["encoder_real"].float() # (batch_size, seq_length, input_size)
encoder_categorical = x["encoder_categorical"] # each (batch_size, seq_length, 1)
seq_length = encoder_real.shape[1]
- targets = x["encoder_target"][:, :seq_length] # (batch_size, seq_length, 1)
- decoder_real = x["decoder_real"] # (batch_size, horizon, input_size)
targets = x["encoder_target"][:, :seq_length].float() # (batch_size, seq_length, 1)
decoder_real = x["decoder_real"].float() # (batch_size, horizon, input_size)
decoder_categorical = x["decoder_categorical"] # each (batch_size, horizon, 1)
datetime_index_train = x["datetime_index"].permute(1, 0, 2)[
:, :, :seq_length
@@ -213,26 +214,23 @@ def forward(self, x: DeepStateBatch, *args, **kwargs): # type: ignore
def make_samples(self, df: pd.DataFrame, encoder_length: int, decoder_length: int) -> Iterator[dict]:
"""Make samples from segment DataFrame."""
values_real = df.drop(columns=["target", "segment", "timestamp"] + list(self.embedding_sizes.keys())).values
- values_real = torch.from_numpy(values_real).float()

# Categories that were not seen during `fit` will be filled with new category
for feature in self.embedding_sizes:
df[feature] = df[feature].astype(float).fillna(self.embedding_sizes[feature][0])

# Columns in `values_categorical` are in the same order as in `embedding_sizes`
- values_categorical = torch.from_numpy(df[self.embedding_sizes.keys()].values.T)
values_categorical = df[self.embedding_sizes.keys()].values.T

- values_datetime = torch.from_numpy(self.ssm.generate_datetime_index(df["timestamp"]))
- values_datetime = values_datetime.to(torch.int64)
values_datetime = self.ssm.generate_datetime_index(df["timestamp"]).astype(int)
values_target = df["target"].values
- values_target = torch.from_numpy(values_target).float()
segment = df["segment"].values[0]

def _make(
- values_target: torch.Tensor,
- values_real: torch.Tensor,
- values_categorical: torch.Tensor,
- values_datetime: torch.Tensor,
values_target: np.ndarray,
values_real: np.ndarray,
values_categorical: np.ndarray,
values_datetime: np.ndarray,
segment: str,
start_idx: int,
encoder_length: int,
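The DeepState changes follow the same idea as the DeepAR ones: `make_samples` keeps NumPy arrays (views where possible) and the conversion to float tensors happens once per batch in `step`/`forward`. A toy sketch of that hand-off, with made-up shapes and a manual stand-in for the dataloader's default collate:

import numpy as np
import torch

values_real = np.random.rand(100, 4)  # toy feature matrix for one segment

sample = {
    "encoder_real": values_real[0:30],        # a view, nothing is copied here
    "encoder_target": np.random.rand(30, 1),
}

# torch's default collate stacks NumPy arrays into tensors; emulated here by hand.
batch = {key: torch.as_tensor(value).unsqueeze(0) for key, value in sample.items()}

encoder_real = batch["encoder_real"].float()    # cast to float32 once, inside step()
encoder_target = batch["encoder_target"].float()
print(encoder_real.dtype, encoder_real.shape)   # torch.float32 torch.Size([1, 30, 4])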
59 changes: 33 additions & 26 deletions etna/models/nn/tft_native/tft.py
@@ -381,6 +381,17 @@ def make_samples(self, df: pd.DataFrame, encoder_length: int, decoder_length: int) -> Iterator[dict]:
segment = df["segment"].values[0]
for feature in self.num_embeddings:
df[feature] = df[feature].astype(float).fillna(self.num_embeddings[feature])

reals_columns = list(set(self.static_reals + self.time_varying_reals_encoder + self.time_varying_reals_decoder))
categ_columns = list(
set(
self.static_categoricals
+ self.time_varying_categoricals_encoder
+ self.time_varying_categoricals_decoder
)
)

df = df[reals_columns + categ_columns]
column_to_index = {column: index for index, column in enumerate(df.columns)}
values = df.values.T

@@ -410,48 +421,44 @@ def _make(
return None

sample["segment"] = segment
sample["decoder_target"] = (
values[column_to_index["target"]][start_idx + encoder_length : start_idx + total_sample_length]
.reshape(-1, 1)
.astype(float)
sample["decoder_target"] = values[column_to_index["target"]][
start_idx + encoder_length : start_idx + total_sample_length
].reshape(
-1, 1
) # (decoder_length, 1)

for feature in self.static_reals:
sample["static_reals"][feature] = (
values[column_to_index[feature]][:1].reshape(-1, 1).astype(float)
) # (1, 1)
sample["static_reals"][feature] = values[column_to_index[feature]][:1].reshape(-1, 1) # (1, 1)

for feature in self.static_categoricals:
sample["static_categoricals"][feature] = (
values[column_to_index[feature]][:1].reshape(-1, 1).astype(float)
) # (1, 1)
sample["static_categoricals"][feature] = values[column_to_index[feature]][:1].reshape(-1, 1) # (1, 1)

for feature in self.time_varying_categoricals_encoder:
sample["time_varying_categoricals_encoder"][feature] = (
values[column_to_index[feature]][start_idx : start_idx + encoder_length]
.reshape(-1, 1)
.astype(float)
sample["time_varying_categoricals_encoder"][feature] = values[column_to_index[feature]][
start_idx : start_idx + encoder_length
].reshape(
-1, 1
) # (encoder_length, 1)

for feature in self.time_varying_categoricals_decoder:
sample["time_varying_categoricals_decoder"][feature] = (
values[column_to_index[feature]][start_idx + encoder_length : start_idx + total_sample_length]
.reshape(-1, 1)
.astype(float)
sample["time_varying_categoricals_decoder"][feature] = values[column_to_index[feature]][
start_idx + encoder_length : start_idx + total_sample_length
].reshape(
-1, 1
) # (decoder_length, 1)

for feature in self.time_varying_reals_encoder:
sample["time_varying_reals_encoder"][feature] = (
values[column_to_index[feature]][start_idx : start_idx + encoder_length]
.reshape(-1, 1)
.astype(float)
sample["time_varying_reals_encoder"][feature] = values[column_to_index[feature]][
start_idx : start_idx + encoder_length
].reshape(
-1, 1
) # (encoder_length, 1)

for feature in self.time_varying_reals_decoder:
sample["time_varying_reals_decoder"][feature] = (
values[column_to_index[feature]][start_idx + encoder_length : start_idx + total_sample_length]
.reshape(-1, 1)
.astype(float)
sample["time_varying_reals_decoder"][feature] = values[column_to_index[feature]][
start_idx + encoder_length : start_idx + total_sample_length
].reshape(
-1, 1
) # (decoder_length, 1)

return sample
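Restricting `df` to the columns listed in `reals_columns` and `categ_columns` is presumably what allows the per-feature `.astype(float)` calls to be dropped: once non-numeric columns such as `segment` and `timestamp` are excluded, `df.values` is already numeric. A toy illustration of the `column_to_index` lookup with invented column names:

import numpy as np
import pandas as pd

df = pd.DataFrame(
    {
        "target": np.arange(8, dtype=float),
        "price": np.random.rand(8),           # imaginary real-valued feature
        "holiday": [0, 1, 0, 0, 1, 0, 0, 1],  # imaginary categorical feature
    }
)

used_columns = ["target", "price", "holiday"]  # stands in for reals_columns + categ_columns
df = df[used_columns]
column_to_index = {column: index for index, column in enumerate(df.columns)}
values = df.values.T  # (n_columns, n_timestamps): transpose once, then slice rows by index

encoder_length = 5
encoder_target = values[column_to_index["target"]][:encoder_length].reshape(-1, 1)
print(encoder_target.shape)  # (5, 1)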
83 changes: 56 additions & 27 deletions etna/transforms/encoders/mean_encoder.py
@@ -3,9 +3,11 @@
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union
from typing import cast

import numba
import numpy as np
import pandas as pd
from bottleneck import nanmean
@@ -165,6 +167,39 @@ def _count_macro_running_mean(df, n_segments):
expanding_mean = pd.Series(index=df.index, data=expanding_mean.values).shift(n_segments)
return expanding_mean

@staticmethod
@numba.njit()
def _count_per_segment_cumstats(target: np.ndarray, categories: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
ans_cumsum = np.full_like(target, np.nan)
ans_cumcount = np.full_like(target, np.nan)
unique_categories = np.unique(categories)
for category in unique_categories:
idx = np.where(category == categories)[0]
t = target[idx]

# Mask for valid (non-NaN) target values
valid = ~np.isnan(t)

# Compute cumulative sums and counts for valid values
cumsum = np.cumsum(np.where(valid, t, 0))
cumcount = np.cumsum(valid).astype(np.float32)

# Shift statistics by 1 to get statistics not including current index
cumsum = np.roll(cumsum, 1)
cumcount = np.roll(cumcount, 1)

cumsum[0] = np.NaN
cumcount[0] = np.NaN

# Handle positions with no previous valid values
cumsum[cumcount == 0] = np.NaN
cumcount[cumcount == 0] = np.NaN

# Assign the computed values back to the answer arrays
ans_cumsum[idx] = cumsum
ans_cumcount[idx] = cumcount
return ans_cumsum, ans_cumcount

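To make the NaN handling concrete, here is a plain-NumPy restatement of the helper on a toy series (it mirrors the numba version above but is not the etna code): for each category the returned statistics cover only earlier non-NaN targets, and positions with no such history stay NaN.

import numpy as np


def per_segment_cumstats(target, categories):
    ans_cumsum = np.full_like(target, np.nan)
    ans_cumcount = np.full_like(target, np.nan)
    for category in np.unique(categories):
        idx = np.where(categories == category)[0]
        t = target[idx]
        valid = ~np.isnan(t)
        cumsum = np.roll(np.cumsum(np.where(valid, t, 0)), 1)   # shift by one position
        cumcount = np.roll(np.cumsum(valid).astype(float), 1)
        cumsum[0] = np.nan
        cumcount[0] = np.nan
        cumsum[cumcount == 0] = np.nan    # no earlier non-NaN values yet
        cumcount[cumcount == 0] = np.nan
        ans_cumsum[idx] = cumsum
        ans_cumcount[idx] = cumcount
    return ans_cumsum, ans_cumcount


target = np.array([1.0, np.nan, 3.0, 5.0])
categories = np.array([0, 0, 1, 0])
cumsum, cumcount = per_segment_cumstats(target, categories)
print(cumsum)    # [nan  1. nan  1.]
print(cumcount)  # [nan  1. nan  1.]

These arrays then feed the smoothed encoding `(cumsum + expanding_mean * smoothing) / (cumcount + smoothing)` computed in `_transform` below.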
def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Get encoded values for the segment.
Expand Down Expand Up @@ -211,33 +246,24 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
for segment in segments:
segment_df = TSDataset.to_flatten(intersected_df.loc[:, self.idx[segment, :]])
y = segment_df["target"]
categories = segment_df[self.in_column].values.astype(str)

unique_categories = np.unique(categories)
cat_to_int = {cat: idx for idx, cat in enumerate(unique_categories)}
int_categories = np.array([cat_to_int[cat] for cat in categories], dtype=np.int64)

# first timestamp is NaN
expanding_mean = y.expanding().mean().shift()
- # generate NaN mask
- first_notna_index = segment_df.loc[y.notna()].groupby(self.in_column).head(1).index
- first_notna_index = pd.Series(index=first_notna_index, data=True).reindex(y.index).fillna(False)
- first_appearance = segment_df.groupby(self.in_column).cumcount() == 0
- mask = ~(first_appearance | first_notna_index)
- # cumcount not including current timestamp
- cumcount_include_nan_index = y.groupby(segment_df[self.in_column].astype(str)).cumcount()
- cumcount = (
- y.dropna()
- .groupby(segment_df[self.in_column].astype(str))
- .cumcount()
- .reindex(y.index)
- .fillna(cumcount_include_nan_index)
- )
- cumcount = cumcount.where(mask, np.nan)
- # cumsum not including current timestamp
- cumsum = y.groupby(segment_df[self.in_column].astype(str)).transform(
- lambda x: x.shift().fillna(0).cumsum()
- )
- cumsum = cumsum.where(mask, np.nan)

cumsum, cumcount = self._count_per_segment_cumstats(y.values, int_categories)
cumsum = pd.Series(cumsum)
cumcount = pd.Series(cumcount)

feature = (cumsum + expanding_mean * self.smoothing) / (cumcount + self.smoothing)
if self.handle_missing is MissingMode.global_mean:
nan_feature_index = segment_df[segment_df[self.in_column].isnull()].index
feature.loc[nan_feature_index] = expanding_mean.loc[nan_feature_index]

intersected_df.loc[:, self.idx[segment, self.out_column]] = feature.values

else:
@@ -254,27 +280,30 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
cur_timestamp_idx = np.arange(0, len(timestamps) * n_segments, len(timestamps))
for _ in range(len(timestamps)):
timestamp_df = flatten.loc[cur_timestamp_idx]

# statistics from previous timestamp
cumsum_dict = dict(cumstats[[self.in_column, "sum"]].values)
cumcount_dict = dict(cumstats[[self.in_column, "count"]].values)

# map categories for current timestamp to statistics
temp.loc[cur_timestamp_idx, "cumsum"] = timestamp_df[self.in_column].map(cumsum_dict)
temp.loc[cur_timestamp_idx, "cumcount"] = timestamp_df[self.in_column].map(cumcount_dict)

# count statistics for current timestamp
stats = (
timestamp_df["target"]
.groupby(timestamp_df[self.in_column], dropna=False)
.agg(["count", "sum"])
.reset_index()
- .replace(0, np.NaN)
)
# statistics become zeros for categories with target=NaN
stats = stats.replace({"count": 0, "sum": 0}, np.NaN)

# sum current and previous statistics
- cumstats = (
- pd.concat([cumstats, stats])
- .groupby(self.in_column, as_index=False, dropna=False)
- .sum()
- .replace(0, np.NaN)
- )
cumstats = pd.concat([cumstats, stats]).groupby(self.in_column, as_index=False, dropna=False).sum()
# zeros appear for categories that weren't updated in previous line and whose statistics were NaN
cumstats = cumstats.replace({"count": 0, "sum": 0}, np.NaN)

cur_timestamp_idx += 1

feature = (temp["cumsum"] + running_mean * self.smoothing) / (temp["cumcount"] + self.smoothing)
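The subtle part of the macro branch is keeping all-NaN categories at NaN instead of letting pandas collapse them to zero. A toy single-step sketch of the update performed inside the loop above, with invented category and target values:

import numpy as np
import pandas as pd

in_column = "category"

# Cumulative statistics gathered from earlier timestamps: "b" has seen only NaN targets so far.
cumstats = pd.DataFrame({in_column: ["a", "b"], "sum": [3.0, np.nan], "count": [2.0, np.nan]})

# Rows of the current timestamp across segments; the target for "b" is NaN again.
timestamp_df = pd.DataFrame({in_column: ["a", "b"], "target": [5.0, np.nan]})

stats = (
    timestamp_df["target"]
    .groupby(timestamp_df[in_column], dropna=False)
    .agg(["count", "sum"])
    .reset_index()
)
# count/sum come out as 0 for groups whose targets were all NaN -> put NaN back.
stats = stats.replace({"count": 0, "sum": 0}, np.nan)

cumstats = pd.concat([cumstats, stats]).groupby(in_column, as_index=False, dropna=False).sum()
cumstats = cumstats.replace({"count": 0, "sum": 0}, np.nan)
print(cumstats)  # "a": sum 8.0, count 3.0; "b": both stay NaN rather than 0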