From 132d07f0a903be637510132037aff742cc3f0462 Mon Sep 17 00:00:00 2001 From: Egor Baturin Date: Wed, 16 Oct 2024 07:20:55 +0300 Subject: [PATCH 1/4] fix MeanEncoder --- etna/transforms/encoders/mean_encoder.py | 33 +++++++++--- .../test_mean_encoder_transform.py | 53 ++++++++++++++++--- 2 files changed, 73 insertions(+), 13 deletions(-) diff --git a/etna/transforms/encoders/mean_encoder.py b/etna/transforms/encoders/mean_encoder.py index 207f0d734..c26d7458a 100644 --- a/etna/transforms/encoders/mean_encoder.py +++ b/etna/transforms/encoders/mean_encoder.py @@ -213,14 +213,27 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame: y = segment_df["target"] # first timestamp is NaN expanding_mean = y.expanding().mean().shift() + # generate NaN mask + first_notna_index = segment_df.loc[y.notna()].groupby(self.in_column).head(1).index + first_notna_index = pd.Series(index=first_notna_index, data=True).reindex(y.index).fillna(False) + first_appearance = segment_df.groupby(self.in_column).cumcount() == 0 + mask = ~(first_appearance | first_notna_index) # cumcount not including current timestamp - cumcount = y.groupby(segment_df[self.in_column].astype(str)).agg("cumcount") + cumcount_include_nan_index = y.groupby(segment_df[self.in_column].astype(str)).cumcount() + cumcount = ( + y.dropna() + .groupby(segment_df[self.in_column].astype(str)) + .cumcount() + .reindex(y.index) + .fillna(cumcount_include_nan_index) + ) + cumcount = cumcount.where(mask, np.nan) # cumsum not including current timestamp - cumsum = ( - y.groupby(segment_df[self.in_column].astype(str)) - .transform(lambda x: x.shift().cumsum()) - .fillna(0) + cumsum = y.groupby(segment_df[self.in_column].astype(str)).transform( + lambda x: x.shift().fillna(0).cumsum() ) + cumsum = cumsum.where(mask, np.nan) + feature = (cumsum + expanding_mean * self.smoothing) / (cumcount + self.smoothing) if self.handle_missing is MissingMode.global_mean: nan_feature_index = segment_df[segment_df[self.in_column].isnull()].index @@ -237,7 +250,7 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame: timestamps = intersected_df.index categories = pd.unique(df.loc[:, self.idx[:, self.in_column]].values.ravel()) - cumstats = pd.DataFrame(data={"sum": 0, "count": 0, self.in_column: categories}) + cumstats = pd.DataFrame(data={"sum": np.NaN, "count": np.NaN, self.in_column: categories}) cur_timestamp_idx = np.arange(0, len(timestamps) * n_segments, len(timestamps)) for _ in range(len(timestamps)): timestamp_df = flatten.loc[cur_timestamp_idx] @@ -253,9 +266,15 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame: .groupby(timestamp_df[self.in_column], dropna=False) .agg(["count", "sum"]) .reset_index() + .replace(0, np.NaN) ) # sum current and previous statistics - cumstats = pd.concat([cumstats, stats]).groupby(self.in_column, as_index=False, dropna=False).sum() + cumstats = ( + pd.concat([cumstats, stats]) + .groupby(self.in_column, as_index=False, dropna=False) + .sum() + .replace(0, np.NaN) + ) cur_timestamp_idx += 1 feature = (temp["cumsum"] + running_mean * self.smoothing) / (temp["cumcount"] + self.smoothing) diff --git a/tests/test_transforms/test_encoders/test_mean_encoder_transform.py b/tests/test_transforms/test_encoders/test_mean_encoder_transform.py index 6b2ad6279..af182239d 100644 --- a/tests/test_transforms/test_encoders/test_mean_encoder_transform.py +++ b/tests/test_transforms/test_encoders/test_mean_encoder_transform.py @@ -27,11 +27,36 @@ def category_ts() -> TSDataset: return ts +@pytest.fixture +def mean_segment_encoder_ts() -> TSDataset: + df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5) + df["target"] = [0, 1, np.NaN, 3, 4] + + df_exog = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=10) + df_exog.rename(columns={"target": "segment_feature"}, inplace=True) + df_exog["segment_feature"] = "segment_0" + + ts = TSDataset(df=df, df_exog=df_exog, freq="D", known_future="all") + + return ts + + +@pytest.fixture +def expected_mean_segment_encoder_ts() -> TSDataset: + df = generate_ar_df(n_segments=1, start_time="2001-01-01", periods=5) + df.rename(columns={"target": "segment_mean"}, inplace=True) + df["segment_mean"] = [np.NaN, 0, 0.5, 0.5, 1.33] + + ts = TSDataset(df=df, freq="D") + + return ts + + @pytest.fixture def expected_micro_category_ts() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, 1, 1.5, 1.5, 2.75, 2.25] + [np.NaN, 6.0, 6.25, 7, 7.625, 8.0] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, 1.5, 1.5, 2.75, 2.25] + [np.NaN, np.NaN, 6.25, 7, 7.625, 8.0] ts = TSDataset(df, freq="D") return ts @@ -41,7 +66,7 @@ def expected_micro_category_ts() -> TSDataset: def expected_micro_global_mean_ts() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, 1, 1.5, 1.5, 2.5, 2.25] + [np.NaN, 6.0, 6.25, 7, 7.625, 8.0] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, 1.5, 1.5, 2.5, 2.25] + [np.NaN, np.NaN, 6.25, 7, 7.625, 8.0] ts = TSDataset(df, freq="D") return ts @@ -61,7 +86,7 @@ def expected_micro_category_make_future_ts() -> TSDataset: def expected_macro_category_ts() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, 3.5, 4, 4.875, 4, 4.85] + [np.NaN, 3.5, 3.66, 4.875, 5.5, 4.275] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 4.875, 4, 4.851] + [np.NaN, np.NaN, 3.66, 4.875, 5.5, 4.27] ts = TSDataset(df, freq="D") return ts @@ -71,7 +96,7 @@ def expected_macro_category_ts() -> TSDataset: def expected_macro_global_mean_ts() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, 3.5, 4, 4.875, 5, 4.85] + [np.NaN, 3.5, 3.66, 4.875, 5.5, 5.55] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, 4, 4.875, 5, 4.85] + [np.NaN, np.NaN, 3.66, 4.875, 5.5, 5.55] ts = TSDataset(df, freq="D") return ts @@ -104,7 +129,7 @@ def ts_begin_nan() -> TSDataset: def expected_ts_begin_nan_smooth_1() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=1) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, np.NaN, 0.5, 1.16, 1.5, 2.5] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 1.75, 1.5, 2.5] ts = TSDataset(df, freq="D") return ts @@ -114,7 +139,7 @@ def expected_ts_begin_nan_smooth_1() -> TSDataset: def expected_ts_begin_nan_smooth_2() -> TSDataset: df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=1) df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) - df["mean_encoded_regressor"] = [np.NaN, np.NaN, 2 / 3, 5 / 4, 5 / 3, 2.5] + df["mean_encoded_regressor"] = [np.NaN, np.NaN, np.NaN, 5 / 3, 5 / 3, 2.5] ts = TSDataset(df, freq="D") return ts @@ -311,6 +336,22 @@ def test_ts_begin_nan_smooth_2(ts_begin_nan, expected_ts_begin_nan_smooth_2): ) +def test_mean_segment_encoder(mean_segment_encoder_ts, expected_mean_segment_encoder_ts): + mean_encoder = MeanEncoderTransform( + in_column="segment_feature", + mode="per-segment", + handle_missing="category", + smoothing=0, + out_column="segment_mean", + ) + mean_encoder.fit_transform(mean_segment_encoder_ts) + assert_frame_equal( + mean_segment_encoder_ts.df.loc[:, pd.IndexSlice[:, "segment_mean"]], + expected_mean_segment_encoder_ts.df, + atol=0.01, + ) + + def test_save_load(category_ts): mean_encoder = MeanEncoderTransform(in_column="regressor", out_column="mean_encoded_regressor") assert_transformation_equals_loaded_original(transform=mean_encoder, ts=category_ts) From 5e864cbf767fd8738a94ae4f97fd59c797f5a216 Mon Sep 17 00:00:00 2001 From: Egor Baturin Date: Thu, 7 Nov 2024 13:15:47 +0300 Subject: [PATCH 2/4] update changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fa97ef948..25f8e741c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -42,7 +42,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Optimize memory usage in `TFTNativeModel` by eliminating copying during making samples ([#494](https://github.com/etna-team/etna/pull/494)) - Optimize memory usage in `DeepStateModel` and `DeepARNativeModel` by eliminating copying during making samples ([#499](https://github.com/etna-team/etna/pull/499)) - Fix working with NaN target in `MeanEncoderTransform` ([#492](https://github.com/etna-team/etna/pull/492)) -- +- Fix `target` leakage in `MeanSegmentEncoderTransform` ([#503](https://github.com/etna-team/etna/pull/503)) - - - From ce5c3ede2ce772f0c562c704386ade1794e1ea10 Mon Sep 17 00:00:00 2001 From: Egor Baturin Date: Thu, 7 Nov 2024 14:49:55 +0300 Subject: [PATCH 3/4] fix segment encoder tests --- .../test_segment_encoder_transform.py | 36 +++++-------------- 1 file changed, 9 insertions(+), 27 deletions(-) diff --git a/tests/test_transforms/test_encoders/test_segment_encoder_transform.py b/tests/test_transforms/test_encoders/test_segment_encoder_transform.py index 60574b904..fa869d9d8 100644 --- a/tests/test_transforms/test_encoders/test_segment_encoder_transform.py +++ b/tests/test_transforms/test_encoders/test_segment_encoder_transform.py @@ -2,21 +2,20 @@ import pandas as pd import pytest -from etna.datasets import TSDataset from etna.transforms import SegmentEncoderTransform from tests.test_transforms.utils import assert_transformation_equals_loaded_original from tests.utils import select_segments_subset -def test_segment_encoder_transform(simple_ts): +def test_segment_encoder_transform(mean_segment_encoder_ts): transform = SegmentEncoderTransform() - transformed_df = transform.fit_transform(simple_ts).to_pandas() + transformed_df = transform.fit_transform(mean_segment_encoder_ts).to_pandas() assert ( len(transformed_df.loc[:, pd.IndexSlice[:, "segment_code"]].columns) == 2 ), "Number of columns not the same as segments" - assert len(simple_ts.to_pandas()) == len(transformed_df), "Row missing" + assert len(mean_segment_encoder_ts.to_pandas()) == len(transformed_df), "Row missing" codes = set() - for segment in simple_ts.segments: + for segment in mean_segment_encoder_ts.segments: column = transformed_df.loc[:, pd.IndexSlice[segment, "segment_code"]] assert column.dtype == "category", "Column type is not category" assert np.all(column == column.iloc[0]), "Values are not the same for the whole column" @@ -24,32 +23,15 @@ def test_segment_encoder_transform(simple_ts): assert codes == {0, 1}, "Codes are not 0 and 1" -def test_subset_segments(simple_ts): - train_ts = simple_ts - test_df = simple_ts.loc[:, pd.IndexSlice["Omsk", :]] - test_ts = TSDataset(df=test_df, freq=simple_ts.freq) - transform = SegmentEncoderTransform() - - transform.fit(train_ts) - transformed_test_df = transform.transform(test_ts).to_pandas() - - segments = sorted(transformed_test_df.columns.get_level_values("segment").unique()) - features = sorted(transformed_test_df.columns.get_level_values("feature").unique()) - assert segments == ["Omsk"] - assert features == ["exog", "segment_code", "target"] - values = transformed_test_df.loc[:, pd.IndexSlice[:, "segment_code"]] - assert np.all(values == values.iloc[0]) - - -def test_not_fitted_error(simple_ts): +def test_not_fitted_error(mean_segment_encoder_ts): encoder = SegmentEncoderTransform() with pytest.raises(ValueError, match="The transform isn't fitted"): - encoder.transform(simple_ts) + encoder.transform(mean_segment_encoder_ts) -def test_new_segments_error(simple_ts): - train_ts = select_segments_subset(ts=simple_ts, segments=["Moscow"]) - test_ts = select_segments_subset(ts=simple_ts, segments=["Omsk"]) +def test_new_segments_error(mean_segment_encoder_ts): + train_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_0"]) + test_ts = select_segments_subset(ts=mean_segment_encoder_ts, segments=["segment_1"]) transform = SegmentEncoderTransform() transform.fit(train_ts) From 7b8a5f75ea915d887f35f14b00b8f190930f29c6 Mon Sep 17 00:00:00 2001 From: Egor Baturin Date: Fri, 8 Nov 2024 14:44:31 +0300 Subject: [PATCH 4/4] make attributes private --- etna/transforms/encoders/mean_segment_encoder.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/etna/transforms/encoders/mean_segment_encoder.py b/etna/transforms/encoders/mean_segment_encoder.py index 02c9c6010..c5037e077 100644 --- a/etna/transforms/encoders/mean_segment_encoder.py +++ b/etna/transforms/encoders/mean_segment_encoder.py @@ -10,14 +10,13 @@ class MeanSegmentEncoderTransform(IrreversibleTransform): """Makes expanding mean target encoding of the segment. Creates column 'segment_mean'.""" - idx = pd.IndexSlice - segment_column = "segment_column" + _segment_column = "segment_column" out_column = "segment_mean" def __init__(self): super().__init__(required_features=["target"]) - self.mean_encoder = MeanEncoderTransform( - in_column=self.segment_column, mode="per-segment", out_column=self.out_column, smoothing=0 + self._mean_encoder = MeanEncoderTransform( + in_column=self._segment_column, mode="per-segment", out_column=self.out_column, smoothing=0 ) def _add_segment_column(self, df): @@ -25,7 +24,7 @@ def _add_segment_column(self, df): flatten_segments = np.repeat(segments.values[np.newaxis, :], len(df), axis=0) segment_values = pd.DataFrame( data=flatten_segments, - columns=pd.MultiIndex.from_product([segments, [self.segment_column]]), + columns=pd.MultiIndex.from_product([segments, [self._segment_column]]), index=df.index, ) df = pd.concat([df, segment_values], axis=1).sort_index(axis=1) @@ -46,7 +45,7 @@ def _fit(self, df: pd.DataFrame) -> "MeanSegmentEncoderTransform": Fitted transform """ df = self._add_segment_column(df) - self.mean_encoder._fit(df) + self._mean_encoder._fit(df) return self def _transform(self, df: pd.DataFrame) -> pd.DataFrame: @@ -71,8 +70,8 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame: If there are segments that weren't present during training. """ df = self._add_segment_column(df) - df_transformed = self.mean_encoder._transform(df) - df_transformed = df_transformed.drop(columns=[self.segment_column], level="feature") + df_transformed = self._mean_encoder._transform(df) + df_transformed = df_transformed.drop(columns=[self._segment_column], level="feature") return df_transformed def get_regressors_info(self) -> List[str]: