MeanEncoderTransform generates wrong values (#492)

* fix MeanEncoder * update changelog * fix * fix * fix * restore rnn file * fix numba method * add comments * add comment for fixtures * add blank lines for more readability * fix spelling * combine two tests in one * add test for 2 segments --------- Co-authored-by: Egor Baturin <[email protected]>
etna-team · Nov 6, 2024 · 4a6e975 · 4a6e975
1 parent 6cd66ae
commit 4a6e975
Show file tree

Hide file tree

Showing 3 changed files with 199 additions and 16 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -41,7 +41,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Disallow dropping target in `TSDataset.drop_features` ([#491](https://github.com/etna-team/etna/pull/491))
 - Optimize memory usage in `TFTNativeModel` by eliminating copying during making samples ([#494](https://github.com/etna-team/etna/pull/494))
 - Optimize memory usage in `DeepStateModel` and `DeepARNativeModel` by eliminating copying during making samples ([#499](https://github.com/etna-team/etna/pull/499))
-- 
+- Fix handling of NaN target values in `MeanEncoderTransform` ([#492](https://github.com/etna-team/etna/pull/492))
 - 
 - 
 - 

diff --git a/etna/transforms/encoders/mean_encoder.py b/etna/transforms/encoders/mean_encoder.py
@@ -3,9 +3,11 @@
 from typing import Dict
 from typing import List
 from typing import Optional
+from typing import Tuple
 from typing import Union
 from typing import cast
 
+import numba
 import numpy as np
 import pandas as pd
 from bottleneck import nanmean
@@ -165,6 +167,39 @@ def _count_macro_running_mean(df, n_segments):
         expanding_mean = pd.Series(index=df.index, data=expanding_mean.values).shift(n_segments)
         return expanding_mean
 
+    @staticmethod
+    @numba.njit()
+    def _count_per_segment_cumstats(target: np.ndarray, categories: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        """Count cumulative sum and count of target per category, not including the current position.
+
+        NaN target values are ignored. Positions that have no previous non-NaN target
+        within the same category get NaN in both outputs.
+
+        Parameters
+        ----------
+        target:
+            target values
+        categories:
+            category label for each position of ``target``
+
+        Returns
+        -------
+        :
+            pair ``(cumsum, cumcount)`` of arrays with the shape and dtype of ``target``
+        """
+        ans_cumsum = np.full_like(target, np.nan)
+        ans_cumcount = np.full_like(target, np.nan)
+        unique_categories = np.unique(categories)
+        for category in unique_categories:
+            idx = np.where(category == categories)[0]
+            t = target[idx]
+
+            # Mask for valid (non-NaN) target values
+            valid = ~np.isnan(t)
+
+            # Compute cumulative sums and counts for valid values;
+            # float64 keeps counts exact far beyond the 2**24 limit of float32
+            cumsum = np.cumsum(np.where(valid, t, 0))
+            cumcount = np.cumsum(valid).astype(np.float64)
+
+            # Shift statistics by 1 to get statistics not including current index
+            cumsum = np.roll(cumsum, 1)
+            cumcount = np.roll(cumcount, 1)
+
+            # np.roll wraps the last element to the front, the first position has no history
+            cumsum[0] = np.nan
+            cumcount[0] = np.nan
+
+            # Handle positions with no previous valid values
+            # (cumsum is masked first because the mask is derived from cumcount)
+            cumsum[cumcount == 0] = np.nan
+            cumcount[cumcount == 0] = np.nan
+
+            # Assign the computed values back to the answer arrays
+            ans_cumsum[idx] = cumsum
+            ans_cumcount[idx] = cumcount
+        return ans_cumsum, ans_cumcount
+
     def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
         """
         Get encoded values for the segment.
@@ -211,20 +246,24 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
                 for segment in segments:
                     segment_df = TSDataset.to_flatten(intersected_df.loc[:, self.idx[segment, :]])
                     y = segment_df["target"]
+                    categories = segment_df[self.in_column].values.astype(str)
+
+                    unique_categories = np.unique(categories)
+                    cat_to_int = {cat: idx for idx, cat in enumerate(unique_categories)}
+                    int_categories = np.array([cat_to_int[cat] for cat in categories], dtype=np.int64)
+
                     # first timestamp is NaN
                     expanding_mean = y.expanding().mean().shift()
-                    # cumcount not including current timestamp
-                    cumcount = y.groupby(segment_df[self.in_column].astype(str)).agg("cumcount")
-                    # cumsum not including current timestamp
-                    cumsum = (
-                        y.groupby(segment_df[self.in_column].astype(str))
-                        .transform(lambda x: x.shift().cumsum())
-                        .fillna(0)
-                    )
+
+                    cumsum, cumcount = self._count_per_segment_cumstats(y.values, int_categories)
+                    cumsum = pd.Series(cumsum)
+                    cumcount = pd.Series(cumcount)
+
                     feature = (cumsum + expanding_mean * self.smoothing) / (cumcount + self.smoothing)
                     if self.handle_missing is MissingMode.global_mean:
                         nan_feature_index = segment_df[segment_df[self.in_column].isnull()].index
                         feature.loc[nan_feature_index] = expanding_mean.loc[nan_feature_index]
+
                     intersected_df.loc[:, self.idx[segment, self.out_column]] = feature.values
 
             else:
@@ -237,25 +276,34 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
                 timestamps = intersected_df.index
                 categories = pd.unique(df.loc[:, self.idx[:, self.in_column]].values.ravel())
 
-                cumstats = pd.DataFrame(data={"sum": 0, "count": 0, self.in_column: categories})
+                cumstats = pd.DataFrame(data={"sum": np.NaN, "count": np.NaN, self.in_column: categories})
                 cur_timestamp_idx = np.arange(0, len(timestamps) * n_segments, len(timestamps))
                 for _ in range(len(timestamps)):
                     timestamp_df = flatten.loc[cur_timestamp_idx]
+
                     # statistics from previous timestamp
                     cumsum_dict = dict(cumstats[[self.in_column, "sum"]].values)
                     cumcount_dict = dict(cumstats[[self.in_column, "count"]].values)
+
                     # map categories for current timestamp to statistics
                     temp.loc[cur_timestamp_idx, "cumsum"] = timestamp_df[self.in_column].map(cumsum_dict)
                     temp.loc[cur_timestamp_idx, "cumcount"] = timestamp_df[self.in_column].map(cumcount_dict)
+
                     # count statistics for current timestamp
                     stats = (
                         timestamp_df["target"]
                         .groupby(timestamp_df[self.in_column], dropna=False)
                         .agg(["count", "sum"])
                         .reset_index()
                     )
+                    # statistics become zeros for categories with target=NaN
+                    stats = stats.replace({"count": 0, "sum": 0}, np.NaN)
+
                     # sum current and previous statistics
                     cumstats = pd.concat([cumstats, stats]).groupby(self.in_column, as_index=False, dropna=False).sum()
+                    # zeros appear for categories that weren't updated in previous line and whose statistics were NaN
+                    cumstats = cumstats.replace({"count": 0, "sum": 0}, np.NaN)
+
                     cur_timestamp_idx += 1
 
                 feature = (temp["cumsum"] + running_mean * self.smoothing) / (temp["cumcount"] + self.smoothing)

diff --git a/tests/test_transforms/test_encoders/test_mean_encoder_transform.py b/tests/test_transforms/test_encoders/test_mean_encoder_transform.py
@@ -31,7 +31,7 @@ def category_ts() -> TSDataset:
 def expected_micro_category_ts() -> TSDataset:
     df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2)
     df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True)
-    df["mean_encoded_regressor"] = [np.NaN, 1, 1.5, 1.5, 2.75, 2.25] + [np.NaN, 6.0, 6.25, 7, 7.625, 8.0]
+    df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 1.5, 2.75, 2.25] + [np.NaN, np.NaN, 6.25, 7, 7.625, np.NaN]
 
     ts = TSDataset(df, freq="D")
     return ts
@@ -41,7 +41,7 @@ def expected_micro_category_ts() -> TSDataset:
 def expected_micro_global_mean_ts() -> TSDataset:
     df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2)
     df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True)
-    df["mean_encoded_regressor"] = [np.NaN, 1, 1.5, 1.5, 2.5, 2.25] + [np.NaN, 6.0, 6.25, 7, 7.625, 8.0]
+    df["mean_encoded_regressor"] = [np.NaN, np.NaN, 1.5, 1.5, 2.5, 2.25] + [np.NaN, np.NaN, 6.25, 7, 7.625, 8.0]
 
     ts = TSDataset(df, freq="D")
     return ts
@@ -61,7 +61,7 @@ def expected_micro_category_make_future_ts() -> TSDataset:
 def expected_macro_category_ts() -> TSDataset:
     df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2)
     df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True)
-    df["mean_encoded_regressor"] = [np.NaN, 3.5, 4, 4.875, 4, 4.85] + [np.NaN, 3.5, 3.66, 4.875, 5.5, 4.275]
+    df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 4.875, 4, 4.851] + [np.NaN, np.NaN, 3.66, 4.875, 5.5, 4.27]
 
     ts = TSDataset(df, freq="D")
     return ts
@@ -71,7 +71,7 @@ def expected_macro_category_ts() -> TSDataset:
 def expected_macro_global_mean_ts() -> TSDataset:
     df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2)
     df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True)
-    df["mean_encoded_regressor"] = [np.NaN, 3.5, 4, 4.875, 5, 4.85] + [np.NaN, 3.5, 3.66, 4.875, 5.5, 5.55]
+    df["mean_encoded_regressor"] = [np.NaN, np.NaN, 4, 4.875, 5, 4.85] + [np.NaN, np.NaN, 3.66, 4.875, 5.5, 5.55]
 
     ts = TSDataset(df, freq="D")
     return ts
@@ -104,7 +104,7 @@ def ts_begin_nan() -> TSDataset:
 def expected_ts_begin_nan_smooth_1() -> TSDataset:
     df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=1)
     df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True)
-    df["mean_encoded_regressor"] = [np.NaN, np.NaN, 0.5, 1.16, 1.5, 2.5]
+    df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 1.75, 1.5, 2.5]
 
     ts = TSDataset(df, freq="D")
     return ts
@@ -114,12 +114,97 @@ def expected_ts_begin_nan_smooth_1() -> TSDataset:
 def expected_ts_begin_nan_smooth_2() -> TSDataset:
     df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=1)
     df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True)
-    df["mean_encoded_regressor"] = [np.NaN, np.NaN, 2 / 3, 5 / 4, 5 / 3, 2.5]
+    df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 5 / 3, 5 / 3, 2.5]
 
     ts = TSDataset(df, freq="D")
     return ts
 
 
+@pytest.fixture
+def multiple_nan_target_category_ts() -> TSDataset:
+    """Fixture with one segment whose target contains multiple NaN values:
+
+    * For `regressor="A"` the NaN targets come before its first non-NaN value
+    * For `regressor="B"` the NaN targets come after its first non-NaN value
+    """
+    df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=8)
+    df["target"] = [np.nan, 1.5, np.nan, 3.0, 4.0, np.nan, np.nan, np.nan]
+
+    # exogenous data is one timestamp longer than the target
+    df_exog = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=9)
+    df_exog.rename(columns={"target": "regressor"}, inplace=True)
+    df_exog["regressor"] = ["A", "B", "A", "A", "B", "B", "B", "A", "A"]
+
+    ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future="all")
+
+    return ts
+
+
+@pytest.fixture
+def expected_multiple_nan_target_category_ts() -> TSDataset:
+    """Expected `regressor_mean` values for a one-segment dataset with multiple NaN targets."""
+    df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=8)
+    df.rename(columns={"target": "regressor_mean"}, inplace=True)
+    df["regressor_mean"] = [np.nan, np.nan, np.nan, np.nan, 1.5, 2.75, 2.75, 3.0]
+
+    ts = TSDataset(df=df, freq="D")
+
+    return ts
+
+
+@pytest.fixture
+def mean_segment_encoder_ts() -> TSDataset:
+    """Fixture with one segment, a single NaN target and a constant `segment_feature` category."""
+    df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5)
+    df["target"] = [0, 1, np.nan, 3, 4]
+
+    df_exog = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=10)
+    df_exog.rename(columns={"target": "segment_feature"}, inplace=True)
+    df_exog["segment_feature"] = "segment_0"
+
+    ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future="all")
+
+    return ts
+
+
+@pytest.fixture
+def expected_mean_segment_encoder_ts() -> TSDataset:
+    """Expected `segment_mean` values: mean of the previous non-NaN targets `[0, 1, NaN, 3, 4]`."""
+    df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5)
+    df.rename(columns={"target": "segment_mean"}, inplace=True)
+    # last value is (0 + 1 + 3) / 3, the NaN target is skipped
+    df["segment_mean"] = [np.nan, 0, 0.5, 0.5, 4 / 3]
+
+    ts = TSDataset(df=df, freq="D")
+
+    return ts
+
+
+@pytest.fixture
+def multiple_nan_target_two_segments_ts() -> TSDataset:
+    """Fixture with two segments whose targets contain multiple NaN values:
+
+    * For `regressor="A"` the NaN targets come before its first non-NaN value
+    * For `regressor="B"` the NaN targets come after its first non-NaN value
+    """
+    df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2)
+    df["target"] = [np.nan, 2, np.nan, 4, np.nan, 5] + [np.nan, 7, np.nan, np.nan, 10, 11]
+
+    # exogenous data is one timestamp longer than the target
+    df_exog = generate_ar_df(start_time="2001-01-01", periods=7, n_segments=2)
+    df_exog.rename(columns={"target": "regressor"}, inplace=True)
+    df_exog["regressor"] = ["A", "B", "A", "A", "B", "B", "A"] + ["A", "B", "A", "B", "A", "B", "A"]
+
+    ts = TSDataset(df, df_exog=df_exog, freq="D", known_future="all")
+
+    return ts
+
+
+@pytest.fixture
+def expected_multiple_nan_target_two_segments_ts() -> TSDataset:
+    """Expected `regressor_mean` values for a two-segment dataset with multiple NaN targets."""
+    df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2)
+    df.rename(columns={"target": "regressor_mean"}, inplace=True)
+    df["regressor_mean"] = [np.nan, np.nan, np.nan, np.nan, 4.5, 4.5] + [np.nan, np.nan, np.nan, 4.5, 4, 4.5]
+
+    ts = TSDataset(df=df, freq="D")
+
+    return ts
+
+
 @pytest.mark.smoke
 @pytest.mark.parametrize("mode", ["per-segment", "macro"])
 @pytest.mark.parametrize("handle_missing", ["category", "global_mean"])
@@ -311,6 +396,56 @@ def test_ts_begin_nan_smooth_2(ts_begin_nan, expected_ts_begin_nan_smooth_2):
     )
 
 
+def test_mean_segment_encoder(mean_segment_encoder_ts, expected_mean_segment_encoder_ts):
+    """Check per-segment encoding of a constant category with a NaN target and no smoothing."""
+    transform = MeanEncoderTransform(
+        in_column="segment_feature",
+        mode="per-segment",
+        handle_missing="category",
+        smoothing=0,
+        out_column="segment_mean",
+    )
+    transform.fit_transform(mean_segment_encoder_ts)
+
+    encoded = mean_segment_encoder_ts.df.loc[:, pd.IndexSlice[:, "segment_mean"]]
+    expected = expected_mean_segment_encoder_ts.df
+    assert_frame_equal(encoded, expected, atol=0.01)
+
+
+def test_multiple_nan_target_category_ts(multiple_nan_target_category_ts, expected_multiple_nan_target_category_ts):
+    """Check per-segment encoding when the target has multiple NaN values and smoothing is off."""
+    transform = MeanEncoderTransform(
+        in_column="regressor",
+        mode="per-segment",
+        handle_missing="category",
+        smoothing=0,
+        out_column="regressor_mean",
+    )
+    transform.fit_transform(multiple_nan_target_category_ts)
+
+    encoded = multiple_nan_target_category_ts.df.loc[:, pd.IndexSlice[:, "regressor_mean"]]
+    expected = expected_multiple_nan_target_category_ts.df
+    assert_frame_equal(encoded, expected, atol=0.01)
+
+
+def test_multiple_nan_target_two_segments_ts(
+    multiple_nan_target_two_segments_ts, expected_multiple_nan_target_two_segments_ts
+):
+    """Check macro encoding on two segments whose targets have multiple NaN values, smoothing off."""
+    transform = MeanEncoderTransform(
+        in_column="regressor",
+        mode="macro",
+        handle_missing="category",
+        smoothing=0,
+        out_column="regressor_mean",
+    )
+    transform.fit_transform(multiple_nan_target_two_segments_ts)
+
+    encoded = multiple_nan_target_two_segments_ts.df.loc[:, pd.IndexSlice[:, "regressor_mean"]]
+    expected = expected_multiple_nan_target_two_segments_ts.df
+    assert_frame_equal(encoded, expected, atol=0.01)
+
+
 def test_save_load(category_ts):
     mean_encoder = MeanEncoderTransform(in_column="regressor", out_column="mean_encoded_regressor")
     assert_transformation_equals_loaded_original(transform=mean_encoder, ts=category_ts)