Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix target leakage in MeanSegmentEncoderTransform #503

Merged
merged 5 commits into from
Nov 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Optimize memory usage in `TFTNativeModel` by eliminating copying during making samples ([#494](https://github.com/etna-team/etna/pull/494))
- Optimize memory usage in `DeepStateModel` and `DeepARNativeModel` by eliminating copying during making samples ([#499](https://github.com/etna-team/etna/pull/499))
- Fix working with NaN target in `MeanEncoderTransform` ([#492](https://github.com/etna-team/etna/pull/492))
-
- Fix `target` leakage in `MeanSegmentEncoderTransform` ([#503](https://github.com/etna-team/etna/pull/503))
-
-
-
Expand Down
54 changes: 24 additions & 30 deletions etna/transforms/encoders/mean_segment_encoder.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,34 @@
import reprlib
from typing import Dict
from typing import List
from typing import Optional

import numpy as np
import pandas as pd

from etna.transforms import IrreversibleTransform
from etna.transforms.math.statistics import MeanTransform
from etna.transforms.encoders.mean_encoder import MeanEncoderTransform


class MeanSegmentEncoderTransform(IrreversibleTransform):
    """Makes expanding mean target encoding of the segment. Creates column 'segment_mean'."""

    idx = pd.IndexSlice  # shortcut for slicing the (segment, feature) MultiIndex columns
    _segment_column = "segment_column"  # name of the auxiliary feature holding each segment's label
    out_column = "segment_mean"  # name of the feature produced by the transform

    def __init__(self):
        """Init MeanSegmentEncoderTransform."""
        super().__init__(required_features=["target"])
        # NOTE(review): the next two assignments look like unified-diff residue from the
        # pre-#503 implementation (MeanTransform / global_means) left alongside the new
        # MeanEncoderTransform-based one — verify against the merged file.
        self.mean_encoder = MeanTransform(in_column="target", window=-1, out_column="segment_mean")
        self.global_means: Optional[Dict[str, float]] = None
        # delegate to MeanEncoderTransform over the segment-label column;
        # smoothing=0 keeps a plain expanding per-segment mean of target
        self._mean_encoder = MeanEncoderTransform(
            in_column=self._segment_column, mode="per-segment", out_column=self.out_column, smoothing=0
        )

def _add_segment_column(self, df: pd.DataFrame) -> pd.DataFrame:
    """Append to each segment a feature column holding that segment's own name.

    The added ``(segment, self._segment_column)`` column repeats the segment
    label down the whole index; it serves as the categorical input for the
    inner ``MeanEncoderTransform``.

    Parameters
    ----------
    df:
        Wide dataframe with ``(segment, feature)`` MultiIndex columns.

    Returns
    -------
    :
        New dataframe with the extra label column per segment, columns sorted.
    """
    segments = df.columns.get_level_values("segment").unique()
    # tile the segment labels down the rows: shape (len(df), n_segments)
    flatten_segments = np.repeat(segments.values[np.newaxis, :], len(df), axis=0)
    segment_values = pd.DataFrame(
        data=flatten_segments,
        columns=pd.MultiIndex.from_product([segments, [self._segment_column]]),
        index=df.index,
    )
    # sort_index restores the canonical (segment, feature) column order after concat
    df = pd.concat([df, segment_values], axis=1).sort_index(axis=1)
    return df

def _fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform":
"""
Expand All @@ -34,10 +44,8 @@ def _fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform":
:
Fitted transform
"""
self.mean_encoder._fit(df)
mean_values = df.loc[:, self.idx[:, "target"]].mean().to_dict()
mean_values = {key[0]: value for key, value in mean_values.items()}
self.global_means = mean_values
df = self._add_segment_column(df)
self._mean_encoder._fit(df)
return self

def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
Expand All @@ -61,25 +69,11 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
NotImplementedError:
If there are segments that weren't present during training.
"""
if self.global_means is None:
raise ValueError("The transform isn't fitted!")

segments = df.columns.get_level_values("segment").unique().tolist()
new_segments = set(segments) - self.global_means.keys()
if len(new_segments) > 0:
raise NotImplementedError(
f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}"
)

df = self.mean_encoder._transform(df)
segment = segments[0]
nan_timestamps = df[df.loc[:, self.idx[segment, "target"]].isna()].index
values_to_set = np.array([self.global_means[x] for x in segments])
# repetition isn't necessary for pandas >= 1.2
values_to_set = np.repeat(values_to_set[np.newaxis, :], len(nan_timestamps), axis=0)
df.loc[nan_timestamps, self.idx[:, "segment_mean"]] = values_to_set
return df
df = self._add_segment_column(df)
df_transformed = self._mean_encoder._transform(df)
df_transformed = df_transformed.drop(columns=[self._segment_column], level="feature")
return df_transformed

def get_regressors_info(self) -> List[str]:
"""Return the list with regressors created by the transform."""
return ["segment_mean"]
return [self.out_column]
50 changes: 23 additions & 27 deletions tests/test_transforms/test_encoders/conftest.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,34 @@
import numpy as np
import pandas as pd
import pytest

from etna.datasets import TSDataset
from etna.datasets import generate_ar_df


@pytest.fixture
def simple_ts() -> TSDataset:
df_1 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-06-07", freq="D")})
df_2 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-06-07", freq="D")})
df_1["segment"] = "Moscow"
df_1["target"] = [1.0, 2.0, 3.0, 4.0, 5.0, np.NAN, np.NAN]
df_1["exog"] = [6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
df_2["segment"] = "Omsk"
df_2["target"] = [10.0, 20.0, 30.0, 40.0, 50.0, np.NAN, np.NAN]
df_2["exog"] = [60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0]
classic_df = pd.concat([df_1, df_2], ignore_index=True)
df = TSDataset.to_dataset(classic_df)
ts = TSDataset(df, freq="D")
def mean_segment_encoder_ts() -> TSDataset:
d-a-bunin marked this conversation as resolved.
Show resolved Hide resolved
df = generate_ar_df(n_segments=2, start_time="2001-01-01", periods=5)
df["target"] = [0.0, 1.0, np.NaN, 3.0, 4.0] + [np.NaN, 1.0, 2.0, 3.0, 4.0]

ts = TSDataset(df=df, freq="D")
return ts


@pytest.fixture
def expected_mean_segment_encoder_ts() -> TSDataset:
    """Expected result of ``MeanSegmentEncoderTransform`` on ``mean_segment_encoder_ts``.

    ``segment_mean`` is the expanding mean of strictly-past targets per segment
    (NaN targets skipped), so the first value of each segment is NaN —
    e.g. 0.5 = mean(0, 1) and 1.33 ~= mean(0, 1, 3) for the first segment.
    """
    df = generate_ar_df(n_segments=2, start_time="2001-01-01", periods=5)
    df["target"] = [0.0, 1.0, np.NaN, 3.0, 4.0] + [np.NaN, 1.0, 2.0, 3.0, 4.0]
    df["segment_mean"] = [np.NaN, 0, 0.5, 0.5, 1.33] + [np.NaN, np.NaN, 1, 1.5, 2.0]

    ts = TSDataset(df=df, freq="D")
    return ts


@pytest.fixture
def transformed_simple_df() -> pd.DataFrame:
df_1 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-06-07", freq="D")})
df_2 = pd.DataFrame.from_dict({"timestamp": pd.date_range("2021-06-01", "2021-06-07", freq="D")})
df_1["segment"] = "Moscow"
df_1["target"] = [1.0, 2.0, 3.0, 4.0, 5.0, np.NAN, np.NAN]
df_1["exog"] = [6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0]
df_1["segment_mean"] = [1, 1.5, 2, 2.5, 3, 3, 3]
df_2["segment"] = "Omsk"
df_2["target"] = [10.0, 20.0, 30.0, 40.0, 50.0, np.NAN, np.NAN]
df_2["exog"] = [60.0, 70.0, 80.0, 90.0, 100.0, 110.0, 120.0]
df_2["segment_mean"] = [10.0, 15.0, 20.0, 25.0, 30, 30, 30]
classic_df = pd.concat([df_1, df_2], ignore_index=True)
df = TSDataset.to_dataset(classic_df)
return df
def expected_make_future_mean_segment_encoder_ts() -> TSDataset:
    """Expected ``make_future`` output after fitting ``MeanSegmentEncoderTransform``.

    Future points have no target; ``segment_mean`` carries the expanding mean
    over each segment's whole train part from ``mean_segment_encoder_ts``:
    2.0 = mean(0, 1, 3, 4) and 2.5 = mean(1, 2, 3, 4).
    """
    df = generate_ar_df(start_time="2001-01-06", periods=2, n_segments=2)
    df["target"] = [np.NaN, np.NaN] + [np.NaN, np.NaN]
    df["segment_mean"] = [2.0, 2.0] + [2.5, 2.5]

    ts = TSDataset(df=df, freq="D")
    return ts
31 changes: 8 additions & 23 deletions tests/test_transforms/test_encoders/test_mean_encoder_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ def expected_micro_category_ts() -> TSDataset:
df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2)
df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True)
df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 1.5, 2.75, 2.25] + [np.NaN, np.NaN, 6.25, 7, 7.625, np.NaN]

ts = TSDataset(df, freq="D")
return ts

Expand Down Expand Up @@ -151,28 +150,14 @@ def expected_multiple_nan_target_category_ts() -> TSDataset:


@pytest.fixture
def mean_segment_encoder_ts() -> TSDataset:
df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5)
df["target"] = [0, 1, np.NaN, 3, 4]

df_exog = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=10)
df_exog.rename(columns={"target": "segment_feature"}, inplace=True)
df_exog["segment_feature"] = "segment_0"

ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future="all")

return ts


@pytest.fixture
def expected_mean_segment_encoder_ts() -> TSDataset:
df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5)
df.rename(columns={"target": "segment_mean"}, inplace=True)
df["segment_mean"] = [np.NaN, 0, 0.5, 0.5, 1.33]
def mean_segment_encoder_ts(mean_segment_encoder_ts) -> TSDataset:
df = generate_ar_df(n_segments=2, start_time="2001-01-01", periods=7)
df = df.drop(columns=["target"])
df["segment_feature"] = ["segment_0"] * 7 + ["segment_1"] * 7
df_wide = TSDataset.to_dataset(df)
mean_segment_encoder_ts.add_columns_from_pandas(df_wide, update_exog=True, regressors=["segment_feature"])

ts = TSDataset(df=df, freq="D")

return ts
return mean_segment_encoder_ts


@pytest.fixture
Expand Down Expand Up @@ -407,7 +392,7 @@ def test_mean_segment_encoder(mean_segment_encoder_ts, expected_mean_segment_enc
mean_encoder.fit_transform(mean_segment_encoder_ts)
assert_frame_equal(
mean_segment_encoder_ts.df.loc[:, pd.IndexSlice[:, "segment_mean"]],
expected_mean_segment_encoder_ts.df,
expected_mean_segment_encoder_ts.df.loc[:, pd.IndexSlice[:, "segment_mean"]],
atol=0.01,
)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import pandas as pd
import pytest
from pandas.testing import assert_frame_equal

from etna.datasets import TSDataset
from etna.metrics import R2
Expand All @@ -10,44 +11,31 @@
from tests.utils import select_segments_subset


@pytest.mark.parametrize("expected_global_means", [{"Moscow": 3, "Omsk": 30}])
def test_mean_segment_encoder_fit(simple_ts, expected_global_means):
def test_mean_segment_encoder_transform(mean_segment_encoder_ts, expected_mean_segment_encoder_ts):
encoder = MeanSegmentEncoderTransform()
encoder.fit(simple_ts)
assert encoder.global_means == expected_global_means
transformed_df = encoder.fit_transform(mean_segment_encoder_ts).to_pandas()
assert_frame_equal(transformed_df, expected_mean_segment_encoder_ts.to_pandas(), atol=0.01)


def test_mean_segment_encoder_transform(simple_ts, transformed_simple_df):
encoder = MeanSegmentEncoderTransform()
transformed_df = encoder.fit_transform(simple_ts).to_pandas()
transformed_simple_df.index.freq = "D"
pd.testing.assert_frame_equal(transformed_simple_df, transformed_df)


def test_subset_segments(simple_ts):
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We have such type of tests in inference tests

train_ts = simple_ts
test_df = simple_ts.loc[:, pd.IndexSlice["Omsk", :]]
test_ts = TSDataset(df=test_df, freq=simple_ts.freq)
transform = MeanSegmentEncoderTransform()

transform.fit(train_ts)
transformed_test_df = transform.transform(test_ts).to_pandas()
def test_make_future_mean_segment_encoder_transform(
mean_segment_encoder_ts, expected_make_future_mean_segment_encoder_ts
):
mean_segment_encoder = MeanSegmentEncoderTransform()
mean_segment_encoder.fit_transform(mean_segment_encoder_ts)
future_ts = mean_segment_encoder_ts.make_future(future_steps=2, transforms=[mean_segment_encoder])

segments = sorted(transformed_test_df.columns.get_level_values("segment").unique())
features = sorted(transformed_test_df.columns.get_level_values("feature").unique())
assert segments == ["Omsk"]
assert features == ["exog", "segment_mean", "target"]
assert_frame_equal(future_ts.to_pandas(), expected_make_future_mean_segment_encoder_ts.to_pandas())


def test_not_fitted_error(simple_ts):
def test_not_fitted_error(mean_segment_encoder_ts):
encoder = MeanSegmentEncoderTransform()
with pytest.raises(ValueError, match="The transform isn't fitted"):
encoder.transform(simple_ts)
encoder.transform(mean_segment_encoder_ts)


def test_new_segments_error(simple_ts):
train_ts = select_segments_subset(ts=simple_ts, segments=["Moscow"])
test_ts = select_segments_subset(ts=simple_ts, segments=["Omsk"])
def test_new_segments_error(mean_segment_encoder_ts):
train_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_0"])
test_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_1"])
transform = MeanSegmentEncoderTransform()

transform.fit(train_ts)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,54 +2,36 @@
import pandas as pd
import pytest

from etna.datasets import TSDataset
from etna.transforms import SegmentEncoderTransform
from tests.test_transforms.utils import assert_transformation_equals_loaded_original
from tests.utils import select_segments_subset


def test_segment_encoder_transform(simple_ts):
def test_segment_encoder_transform(mean_segment_encoder_ts):
transform = SegmentEncoderTransform()
transformed_df = transform.fit_transform(simple_ts).to_pandas()
transformed_df = transform.fit_transform(mean_segment_encoder_ts).to_pandas()
assert (
len(transformed_df.loc[:, pd.IndexSlice[:, "segment_code"]].columns) == 2
), "Number of columns not the same as segments"
assert len(simple_ts.to_pandas()) == len(transformed_df), "Row missing"
assert len(mean_segment_encoder_ts.to_pandas()) == len(transformed_df), "Row missing"
codes = set()
for segment in simple_ts.segments:
for segment in mean_segment_encoder_ts.segments:
column = transformed_df.loc[:, pd.IndexSlice[segment, "segment_code"]]
assert column.dtype == "category", "Column type is not category"
assert np.all(column == column.iloc[0]), "Values are not the same for the whole column"
codes.add(column.iloc[0])
assert codes == {0, 1}, "Codes are not 0 and 1"


def test_subset_segments(simple_ts):
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The same as for MeanSegmentEncoder

train_ts = simple_ts
test_df = simple_ts.loc[:, pd.IndexSlice["Omsk", :]]
test_ts = TSDataset(df=test_df, freq=simple_ts.freq)
transform = SegmentEncoderTransform()

transform.fit(train_ts)
transformed_test_df = transform.transform(test_ts).to_pandas()

segments = sorted(transformed_test_df.columns.get_level_values("segment").unique())
features = sorted(transformed_test_df.columns.get_level_values("feature").unique())
assert segments == ["Omsk"]
assert features == ["exog", "segment_code", "target"]
values = transformed_test_df.loc[:, pd.IndexSlice[:, "segment_code"]]
assert np.all(values == values.iloc[0])


def test_not_fitted_error(simple_ts):
def test_not_fitted_error(mean_segment_encoder_ts):
encoder = SegmentEncoderTransform()
with pytest.raises(ValueError, match="The transform isn't fitted"):
encoder.transform(simple_ts)
encoder.transform(mean_segment_encoder_ts)


def test_new_segments_error(simple_ts):
train_ts = select_segments_subset(ts=simple_ts, segments=["Moscow"])
test_ts = select_segments_subset(ts=simple_ts, segments=["Omsk"])
def test_new_segments_error(mean_segment_encoder_ts):
train_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_0"])
test_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_1"])
transform = SegmentEncoderTransform()

transform.fit(train_ts)
Expand Down
Loading