From 12f19fb6260eb57ba21fad55279f61c4f782bcba Mon Sep 17 00:00:00 2001 From: Egor Baturin <82458209+egoriyaa@users.noreply.github.com> Date: Fri, 12 Jul 2024 22:42:21 +0300 Subject: [PATCH] Add `MeanEncoderTransform` (#413) * add MeanEncoderTransform * fix fit * fix code * final * fix mode name * resolve changelog * resolve changelog * fix all * add comments * satisfy mypy * add tests, fix docs * fix * fix --------- Co-authored-by: Egor Baturin --- CHANGELOG.md | 2 +- docs/source/api_reference/transforms.rst | 1 + etna/transforms/__init__.py | 1 + etna/transforms/encoders/__init__.py | 1 + etna/transforms/encoders/mean_encoder.py | 313 +++++++++++++++++ .../test_mean_encoder_transform.py | 321 ++++++++++++++++++ .../test_inference/test_inverse_transform.py | 11 + .../test_inference/test_transform.py | 35 ++ 8 files changed, 684 insertions(+), 1 deletion(-) create mode 100644 etna/transforms/encoders/mean_encoder.py create mode 100644 tests/test_transforms/test_encoders/test_mean_encoder_transform.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 246537891..f36192a89 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add `get_anomalies_mad` function for anomaly detection ([#398](https://github.com/etna-team/etna/pull/398)) - Add `TSDataset.features` property to get list of all features in a dataset ([#405](https://github.com/etna-team/etna/pull/405)) - Add `MADOutlierTransform` class for anomaly detection ([#415](https://github.com/etna-team/etna/pull/415)) -- +- Add `MeanEncoderTransform` ([#413](https://github.com/etna-team/etna/pull/413)) ### Changed - Allow to change `device`, `batch_size` and `num_workers` of embedding models ([#396](https://github.com/etna-team/etna/pull/396)) diff --git a/docs/source/api_reference/transforms.rst b/docs/source/api_reference/transforms.rst index 409bf97fc..42fca2c0f 100644 --- a/docs/source/api_reference/transforms.rst +++ 
b/docs/source/api_reference/transforms.rst @@ -52,6 +52,7 @@ Categorical encoding transforms: :template: class.rst SegmentEncoderTransform + MeanEncoderTransform MeanSegmentEncoderTransform LabelEncoderTransform OneHotEncoderTransform diff --git a/etna/transforms/__init__.py b/etna/transforms/__init__.py index 59e4f536a..fb4b93896 100644 --- a/etna/transforms/__init__.py +++ b/etna/transforms/__init__.py @@ -21,6 +21,7 @@ from etna.transforms.embeddings import EmbeddingSegmentTransform from etna.transforms.embeddings import EmbeddingWindowTransform from etna.transforms.encoders import LabelEncoderTransform +from etna.transforms.encoders import MeanEncoderTransform from etna.transforms.encoders import MeanSegmentEncoderTransform from etna.transforms.encoders import OneHotEncoderTransform from etna.transforms.encoders import SegmentEncoderTransform diff --git a/etna/transforms/encoders/__init__.py b/etna/transforms/encoders/__init__.py index 2b23a01ce..7cc314b40 100644 --- a/etna/transforms/encoders/__init__.py +++ b/etna/transforms/encoders/__init__.py @@ -1,4 +1,5 @@ from etna.transforms.encoders.categorical import LabelEncoderTransform from etna.transforms.encoders.categorical import OneHotEncoderTransform +from etna.transforms.encoders.mean_encoder import MeanEncoderTransform from etna.transforms.encoders.mean_segment_encoder import MeanSegmentEncoderTransform from etna.transforms.encoders.segment_encoder import SegmentEncoderTransform diff --git a/etna/transforms/encoders/mean_encoder.py b/etna/transforms/encoders/mean_encoder.py new file mode 100644 index 000000000..207f0d734 --- /dev/null +++ b/etna/transforms/encoders/mean_encoder.py @@ -0,0 +1,313 @@ +import reprlib +from enum import Enum +from typing import Dict +from typing import List +from typing import Optional +from typing import Union +from typing import cast + +import numpy as np +import pandas as pd +from bottleneck import nanmean + +from etna.datasets import TSDataset +from etna.distributions 
import BaseDistribution +from etna.distributions import FloatDistribution +from etna.transforms import IrreversibleTransform + + +class EncoderMode(str, Enum): + """Enum for different encoding strategies.""" + + per_segment = "per-segment" + macro = "macro" + + @classmethod + def _missing_(cls, value): + raise ValueError(f"The strategy '{value}' doesn't exist") + + +class MissingMode(str, Enum): + """Enum for handle missing strategies.""" + + category = "category" + global_mean = "global_mean" + + @classmethod + def _missing_(cls, value): + raise NotImplementedError( + f"{value} is not a valid {cls.__name__}. Supported types: {', '.join([repr(m.value) for m in cls])}" + ) + + +class MeanEncoderTransform(IrreversibleTransform): + """ + Makes encoding of categorical feature. + + For timestamps that are before the last timestamp seen in ``fit`` transformations are made using the formula below: + + .. math:: + \\frac{TargetSum + RunningMean * Smoothing}{FeatureCount + Smoothing} + + where + + * TargetSum is the sum of target up to the current timestamp for the current category, not including the current timestamp + * RunningMean is target mean up to the current timestamp, not including the current timestamp + * FeatureCount is the number of categories with the same value as in the current timestamp, not including the current timestamp + + For future timestamps: + + * for known categories encodings are filled with global mean of target for these categories calculated during ``fit`` + * for unknown categories encodings are filled with global mean of target in the whole dataset calculated during ``fit`` + + All types of NaN values are considered as one category. + """ + + idx = pd.IndexSlice + + def __init__( + self, + in_column: str, + out_column: str, + mode: Union[EncoderMode, str] = "per-segment", + handle_missing: str = MissingMode.category, + smoothing: int = 1, + ): + """ + Init MeanEncoderTransform.
+ + Parameters + ---------- + in_column: + categorical column to apply transform + out_column: + name of added column + mode: + mode to encode segments + + * 'per-segment' - statistics are calculated across each segment individually + + * 'macro' - statistics are calculated across all segments. In this mode the transform can work with new segments that were not seen during ``fit`` + handle_missing: + mode to handle missing values in ``in_column`` + + * 'category' - NaNs are interpreted as a separate categorical feature + + * 'global_mean' - NaNs are filled with the running mean + smoothing: + smoothing parameter + """ + super().__init__(required_features=["target", in_column]) + self.in_column = in_column + self.out_column = out_column + self.mode = EncoderMode(mode) + self.handle_missing = MissingMode(handle_missing) + self.smoothing = smoothing + + self._global_means: Optional[Union[float, Dict[str, float]]] = None + self._global_means_category: Optional[Union[Dict[str, float], Dict[str, Dict[str, float]]]] = None + self._last_timestamp: Union[pd.Timestamp, int, None] + + def _fit(self, df: pd.DataFrame) -> "MeanEncoderTransform": + """ + Fit encoder. + + Parameters + ---------- + df: + dataframe with data to fit expanding mean target encoder.
+ + Returns + ------- + : + Fitted transform + """ + df.loc[:, pd.IndexSlice[:, self.in_column]] = df.loc[:, pd.IndexSlice[:, self.in_column]].fillna(np.NaN) + + if self.mode is EncoderMode.per_segment: + axis = 0 + segments = df.columns.get_level_values("segment").unique().tolist() + global_means = nanmean(df.loc[:, self.idx[:, "target"]], axis=axis) + global_means = dict(zip(segments, global_means)) + + global_means_category = {} + for segment in segments: + segment_df = TSDataset.to_flatten(df.loc[:, pd.IndexSlice[segment, :]]) + global_means_category[segment] = ( + segment_df[[self.in_column, "target"]] + .groupby(self.in_column, dropna=False) + .mean() + .to_dict()["target"] + ) + else: + axis = None + global_means = nanmean(df.loc[:, self.idx[:, "target"]], axis=axis) + + segment_df = TSDataset.to_flatten(df) + global_means_category = ( + segment_df[[self.in_column, "target"]].groupby(self.in_column, dropna=False).mean().to_dict()["target"] + ) + + self._global_means = global_means + self._global_means_category = global_means_category + self._last_timestamp = df.index[-1] + + return self + + @staticmethod + def _count_macro_running_mean(df, n_segments): + y = df["target"] + timestamp_count = y.groupby(df["timestamp"]).transform("count") + timestamp_sum = y.groupby(df["timestamp"]).transform("sum") + expanding_mean = timestamp_sum.iloc[::n_segments].cumsum() / timestamp_count.iloc[::n_segments].cumsum() + expanding_mean = expanding_mean.repeat(n_segments) + # first timestamp is NaN + expanding_mean = pd.Series(index=df.index, data=expanding_mean.values).shift(n_segments) + return expanding_mean + + def _transform(self, df: pd.DataFrame) -> pd.DataFrame: + """ + Get encoded values for the segment. + + Parameters + ---------- + df: + dataframe with data to transform. + + Returns + ------- + : + result dataframe + + Raises + ------ + ValueError: + If transform isn't fitted. + NotImplementedError: + If there are segments that weren't present during training. 
+ """ + if self._global_means is None: + raise ValueError("The transform isn't fitted!") + + segments = df.columns.get_level_values("segment").unique().tolist() + n_segments = len(segments) + if self.mode is EncoderMode.per_segment: + self._global_means = cast(Dict[str, float], self._global_means) + new_segments = set(segments) - self._global_means.keys() + if len(new_segments) > 0: + raise NotImplementedError( + f"This transform can't process segments that weren't present on train data: {reprlib.repr(new_segments)}" + ) + df.loc[:, self.idx[:, self.in_column]] = df.loc[:, self.idx[:, self.in_column]].fillna(np.NaN) + + future_timestamps = df.index[df.index > self._last_timestamp] + intersected_timestamps = df.index[df.index <= self._last_timestamp] + + intersected_df = df.loc[intersected_timestamps, self.idx[:, :]] + future_df = df.loc[future_timestamps, self.idx[:, :]] + + if len(intersected_df) > 0: + if self.mode is EncoderMode.per_segment: + for segment in segments: + segment_df = TSDataset.to_flatten(intersected_df.loc[:, self.idx[segment, :]]) + y = segment_df["target"] + # first timestamp is NaN + expanding_mean = y.expanding().mean().shift() + # cumcount not including current timestamp + cumcount = y.groupby(segment_df[self.in_column].astype(str)).agg("cumcount") + # cumsum not including current timestamp + cumsum = ( + y.groupby(segment_df[self.in_column].astype(str)) + .transform(lambda x: x.shift().cumsum()) + .fillna(0) + ) + feature = (cumsum + expanding_mean * self.smoothing) / (cumcount + self.smoothing) + if self.handle_missing is MissingMode.global_mean: + nan_feature_index = segment_df[segment_df[self.in_column].isnull()].index + feature.loc[nan_feature_index] = expanding_mean.loc[nan_feature_index] + intersected_df.loc[:, self.idx[segment, self.out_column]] = feature.values + + else: + flatten = TSDataset.to_flatten(intersected_df) + flatten = flatten.sort_values(["timestamp", "segment"]) + running_mean = self._count_macro_running_mean(flatten, 
n_segments) + + temp = pd.DataFrame(index=flatten.index, columns=["cumsum", "cumcount"], dtype=float) + + timestamps = intersected_df.index + categories = pd.unique(df.loc[:, self.idx[:, self.in_column]].values.ravel()) + + cumstats = pd.DataFrame(data={"sum": 0, "count": 0, self.in_column: categories}) + cur_timestamp_idx = np.arange(0, len(timestamps) * n_segments, len(timestamps)) + for _ in range(len(timestamps)): + timestamp_df = flatten.loc[cur_timestamp_idx] + # statistics from previous timestamp + cumsum_dict = dict(cumstats[[self.in_column, "sum"]].values) + cumcount_dict = dict(cumstats[[self.in_column, "count"]].values) + # map categories for current timestamp to statistics + temp.loc[cur_timestamp_idx, "cumsum"] = timestamp_df[self.in_column].map(cumsum_dict) + temp.loc[cur_timestamp_idx, "cumcount"] = timestamp_df[self.in_column].map(cumcount_dict) + # count statistics for current timestamp + stats = ( + timestamp_df["target"] + .groupby(timestamp_df[self.in_column], dropna=False) + .agg(["count", "sum"]) + .reset_index() + ) + # sum current and previous statistics + cumstats = pd.concat([cumstats, stats]).groupby(self.in_column, as_index=False, dropna=False).sum() + cur_timestamp_idx += 1 + + feature = (temp["cumsum"] + running_mean * self.smoothing) / (temp["cumcount"] + self.smoothing) + if self.handle_missing is MissingMode.global_mean: + nan_feature_index = flatten[flatten[self.in_column].isnull()].index + feature.loc[nan_feature_index] = running_mean.loc[nan_feature_index] + + feature = pd.DataFrame( + feature.values.reshape(len(timestamps), n_segments), + columns=pd.MultiIndex.from_product([segments, [self.out_column]]), + index=intersected_df.index, + ) + intersected_df = pd.concat([intersected_df, feature], axis=1) + + if len(future_df) > 0: + n_timestamps = len(future_df.index) + if self.mode is EncoderMode.per_segment: + self._global_means_category = cast(Dict[str, Dict[str, float]], self._global_means_category) + self._global_means = 
cast(Dict[str, float], self._global_means) + for segment in segments: + segment_df = TSDataset.to_flatten(future_df.loc[:, self.idx[segment, :]]) + feature = segment_df[self.in_column].map(self._global_means_category[segment]) + feature = feature.fillna(self._global_means[segment]) + future_df.loc[:, self.idx[segment, self.out_column]] = feature.values + else: + flatten = TSDataset.to_flatten(future_df) + feature = flatten[self.in_column].map(self._global_means_category) + feature = feature.fillna(self._global_means) + feature = pd.DataFrame( + feature.values.reshape(len(segments), n_timestamps).T, + columns=pd.MultiIndex.from_product([segments, [self.out_column]]), + index=future_df.index, + ) + future_df = pd.concat([future_df, feature], axis=1) + + intersected_df = intersected_df.sort_index(axis=1) + future_df = future_df.sort_index(axis=1) + transformed_df = pd.concat((intersected_df, future_df), axis=0) + return transformed_df + + def get_regressors_info(self) -> List[str]: + """Return the list with regressors created by the transform.""" + return [self.out_column] + + def params_to_tune(self) -> Dict[str, BaseDistribution]: + """Get default grid for tuning hyperparameters. + + This grid tunes ``smoothing`` parameter. Other parameters are expected to be set by the user. + + Returns + ------- + : + Grid to tune. 
+ """ + return {"smoothing": FloatDistribution(low=0, high=2)} diff --git a/tests/test_transforms/test_encoders/test_mean_encoder_transform.py b/tests/test_transforms/test_encoders/test_mean_encoder_transform.py new file mode 100644 index 000000000..6b2ad6279 --- /dev/null +++ b/tests/test_transforms/test_encoders/test_mean_encoder_transform.py @@ -0,0 +1,321 @@ +import numpy as np +import pandas as pd +import pytest +from pandas.testing import assert_frame_equal + +from etna.datasets import TSDataset +from etna.datasets import generate_ar_df +from etna.metrics import MSE +from etna.models import LinearMultiSegmentModel +from etna.pipeline import Pipeline +from etna.transforms import FilterFeaturesTransform +from etna.transforms import MeanEncoderTransform +from tests.test_transforms.utils import assert_transformation_equals_loaded_original +from tests.utils import select_segments_subset + + +@pytest.fixture +def category_ts() -> TSDataset: + df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) + df["target"] = [1, 2, 3, 4, np.NaN, 5] + [6, 7, 8, 9, 10, 11] + + df_exog = generate_ar_df(start_time="2001-01-01", periods=8, n_segments=2) + df_exog.rename(columns={"target": "regressor"}, inplace=True) + df_exog["regressor"] = ["A", "B", np.NaN, "A", pd.NA, "B", "C", "A"] + ["A", "B", "A", "A", "A", np.NaN, "A", "C"] + + ts = TSDataset(df, df_exog=df_exog, freq="D", known_future="all") + return ts + + +@pytest.fixture +def expected_micro_category_ts() -> TSDataset: + df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) + df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) + df["mean_encoded_regressor"] = [np.NaN, 1, 1.5, 1.5, 2.75, 2.25] + [np.NaN, 6.0, 6.25, 7, 7.625, 8.0] + + ts = TSDataset(df, freq="D") + return ts + + +@pytest.fixture +def expected_micro_global_mean_ts() -> TSDataset: + df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) + df.rename(columns={"target": "mean_encoded_regressor"}, 
inplace=True) + df["mean_encoded_regressor"] = [np.NaN, 1, 1.5, 1.5, 2.5, 2.25] + [np.NaN, 6.0, 6.25, 7, 7.625, 8.0] + + ts = TSDataset(df, freq="D") + return ts + + +@pytest.fixture +def expected_micro_category_make_future_ts() -> TSDataset: + df = generate_ar_df(start_time="2001-01-07", periods=2, n_segments=2) + df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) + df["mean_encoded_regressor"] = [3, 2.5] + [8.25, 8.5] + + ts = TSDataset(df, freq="D") + return ts + + +@pytest.fixture +def expected_macro_category_ts() -> TSDataset: + df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) + df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) + df["mean_encoded_regressor"] = [np.NaN, 3.5, 4, 4.875, 4, 4.85] + [np.NaN, 3.5, 3.66, 4.875, 5.5, 4.275] + + ts = TSDataset(df, freq="D") + return ts + + +@pytest.fixture +def expected_macro_global_mean_ts() -> TSDataset: + df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=2) + df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) + df["mean_encoded_regressor"] = [np.NaN, 3.5, 4, 4.875, 5, 4.85] + [np.NaN, 3.5, 3.66, 4.875, 5.5, 5.55] + + ts = TSDataset(df, freq="D") + return ts + + +@pytest.fixture +def expected_macro_category_make_future_ts() -> TSDataset: + df = generate_ar_df(start_time="2001-01-07", periods=2, n_segments=2) + df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) + df["mean_encoded_regressor"] = [6, 6.33] + [6.33, 6] + + ts = TSDataset(df, freq="D") + return ts + + +@pytest.fixture +def ts_begin_nan() -> TSDataset: + df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=1) + df["target"] = [np.NaN, 1, 2, 3, 4, 5] + + df_exog = generate_ar_df(start_time="2001-01-01", periods=8, n_segments=1) + df_exog.rename(columns={"target": "regressor"}, inplace=True) + df_exog["regressor"] = ["A", "B", "A", "A", "B", "B", "C", "A"] + + ts = TSDataset(df, df_exog=df_exog, freq="D", known_future="all") + 
return ts + + +@pytest.fixture +def expected_ts_begin_nan_smooth_1() -> TSDataset: + df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=1) + df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) + df["mean_encoded_regressor"] = [np.NaN, np.NaN, 0.5, 1.16, 1.5, 2.5] + + ts = TSDataset(df, freq="D") + return ts + + +@pytest.fixture +def expected_ts_begin_nan_smooth_2() -> TSDataset: + df = generate_ar_df(start_time="2001-01-01", periods=6, n_segments=1) + df.rename(columns={"target": "mean_encoded_regressor"}, inplace=True) + df["mean_encoded_regressor"] = [np.NaN, np.NaN, 2 / 3, 5 / 4, 5 / 3, 2.5] + + ts = TSDataset(df, freq="D") + return ts + + +@pytest.mark.smoke +@pytest.mark.parametrize("mode", ["per-segment", "macro"]) +@pytest.mark.parametrize("handle_missing", ["category", "global_mean"]) +@pytest.mark.parametrize("smoothing", [1, 2]) +def test_fit(category_ts, mode, handle_missing, smoothing): + mean_encoder = MeanEncoderTransform( + in_column="regressor", + mode=mode, + handle_missing=handle_missing, + smoothing=smoothing, + out_column="mean_encoded_regressor", + ) + mean_encoder.fit(category_ts) + + +@pytest.mark.smoke +@pytest.mark.parametrize("mode", ["per-segment", "macro"]) +@pytest.mark.parametrize("handle_missing", ["category", "global_mean"]) +@pytest.mark.parametrize("smoothing", [1, 2]) +def test_fit_transform(category_ts, mode, handle_missing, smoothing): + mean_encoder = MeanEncoderTransform( + in_column="regressor", + mode=mode, + handle_missing=handle_missing, + smoothing=smoothing, + out_column="mean_encoded_regressor", + ) + mean_encoder.fit_transform(category_ts) + + +@pytest.mark.smoke +@pytest.mark.parametrize("mode", ["per-segment", "macro"]) +@pytest.mark.parametrize("handle_missing", ["category", "global_mean"]) +@pytest.mark.parametrize("smoothing", [1, 2]) +def test_make_future(category_ts, mode, handle_missing, smoothing): + mean_encoder = MeanEncoderTransform( + in_column="regressor", + mode=mode, + 
handle_missing=handle_missing, + smoothing=smoothing, + out_column="mean_encoded_regressor", + ) + category_ts.fit_transform([mean_encoder]) + _ = category_ts.make_future(future_steps=2, transforms=[mean_encoder]) + + +@pytest.mark.smoke +@pytest.mark.parametrize("mode", ["per-segment", "macro"]) +@pytest.mark.parametrize("handle_missing", ["category", "global_mean"]) +@pytest.mark.parametrize("smoothing", [1, 2]) +def test_pipeline(category_ts, mode, handle_missing, smoothing): + mean_encoder = MeanEncoderTransform( + in_column="regressor", + mode=mode, + handle_missing=handle_missing, + smoothing=smoothing, + out_column="mean_encoded_regressor", + ) + filter_transform = FilterFeaturesTransform(exclude=["regressor"]) + pipeline = Pipeline(model=LinearMultiSegmentModel(), transforms=[mean_encoder, filter_transform], horizon=1) + pipeline.backtest(category_ts, n_folds=1, metrics=[MSE()]) + + +def test_not_fitted_error(category_ts): + mean_encoder = MeanEncoderTransform(in_column="regressor", out_column="mean_encoded_regressor") + with pytest.raises(ValueError, match="The transform isn't fitted"): + mean_encoder.transform(category_ts) + + +def test_new_segments_error(category_ts): + train_ts = select_segments_subset(ts=category_ts, segments=["segment_0"]) + test_ts = select_segments_subset(ts=category_ts, segments=["segment_1"]) + mean_encoder = MeanEncoderTransform(in_column="regressor", out_column="mean_encoded_regressor") + + mean_encoder.fit(train_ts) + with pytest.raises( + NotImplementedError, match="This transform can't process segments that weren't present on train data" + ): + _ = mean_encoder.transform(test_ts) + + +def test_transform_micro_category_expected(category_ts, expected_micro_category_ts): + mean_encoder = MeanEncoderTransform( + in_column="regressor", + mode="per-segment", + handle_missing="category", + smoothing=1, + out_column="mean_encoded_regressor", + ) + mean_encoder.fit_transform(category_ts) + assert_frame_equal( + category_ts.df.loc[:, 
pd.IndexSlice[:, "mean_encoded_regressor"]], expected_micro_category_ts.df, atol=0.01 + ) + + +def test_transform_micro_global_mean_expected(category_ts, expected_micro_global_mean_ts): + mean_encoder = MeanEncoderTransform( + in_column="regressor", + mode="per-segment", + handle_missing="global_mean", + smoothing=1, + out_column="mean_encoded_regressor", + ) + mean_encoder.fit_transform(category_ts) + assert_frame_equal( + category_ts.df.loc[:, pd.IndexSlice[:, "mean_encoded_regressor"]], expected_micro_global_mean_ts.df + ) + + +def test_transform_micro_make_future_expected(category_ts, expected_micro_category_make_future_ts): + mean_encoder = MeanEncoderTransform( + in_column="regressor", + mode="per-segment", + handle_missing="category", + smoothing=1, + out_column="mean_encoded_regressor", + ) + mean_encoder.fit_transform(category_ts) + future = category_ts.make_future(future_steps=2, transforms=[mean_encoder]) + + assert_frame_equal( + future.df.loc[:, pd.IndexSlice[:, "mean_encoded_regressor"]], expected_micro_category_make_future_ts.df + ) + + +def test_transform_macro_category_expected(category_ts, expected_macro_category_ts): + mean_encoder = MeanEncoderTransform( + in_column="regressor", mode="macro", handle_missing="category", smoothing=1, out_column="mean_encoded_regressor" + ) + mean_encoder.fit_transform(category_ts) + assert_frame_equal( + category_ts.df.loc[:, pd.IndexSlice[:, "mean_encoded_regressor"]], expected_macro_category_ts.df, atol=0.01 + ) + + +def test_transform_macro_global_mean_expected(category_ts, expected_macro_global_mean_ts): + mean_encoder = MeanEncoderTransform( + in_column="regressor", + mode="macro", + handle_missing="global_mean", + smoothing=1, + out_column="mean_encoded_regressor", + ) + mean_encoder.fit_transform(category_ts) + assert_frame_equal( + category_ts.df.loc[:, pd.IndexSlice[:, "mean_encoded_regressor"]], expected_macro_global_mean_ts.df, atol=0.02 + ) + + +def 
test_transform_macro_make_future_expected(category_ts, expected_macro_category_make_future_ts): + mean_encoder = MeanEncoderTransform( + in_column="regressor", mode="macro", handle_missing="category", smoothing=1, out_column="mean_encoded_regressor" + ) + mean_encoder.fit_transform(category_ts) + future = category_ts.make_future(future_steps=2, transforms=[mean_encoder]) + + assert_frame_equal( + future.df.loc[:, pd.IndexSlice[:, "mean_encoded_regressor"]], + expected_macro_category_make_future_ts.df, + atol=0.01, + ) + + +def test_ts_begin_nan_smooth_1(ts_begin_nan, expected_ts_begin_nan_smooth_1): + mean_encoder = MeanEncoderTransform( + in_column="regressor", + mode="per-segment", + handle_missing="category", + smoothing=1, + out_column="mean_encoded_regressor", + ) + mean_encoder.fit_transform(ts_begin_nan) + assert_frame_equal( + ts_begin_nan.df.loc[:, pd.IndexSlice[:, "mean_encoded_regressor"]], expected_ts_begin_nan_smooth_1.df, atol=0.01 + ) + + +def test_ts_begin_nan_smooth_2(ts_begin_nan, expected_ts_begin_nan_smooth_2): + mean_encoder = MeanEncoderTransform( + in_column="regressor", + mode="per-segment", + handle_missing="category", + smoothing=2, + out_column="mean_encoded_regressor", + ) + mean_encoder.fit_transform(ts_begin_nan) + assert_frame_equal( + ts_begin_nan.df.loc[:, pd.IndexSlice[:, "mean_encoded_regressor"]], expected_ts_begin_nan_smooth_2.df, atol=0.01 + ) + + +def test_save_load(category_ts): + mean_encoder = MeanEncoderTransform(in_column="regressor", out_column="mean_encoded_regressor") + assert_transformation_equals_loaded_original(transform=mean_encoder, ts=category_ts) + + +def test_params_to_tune(): + mean_encoder = MeanEncoderTransform(in_column="regressor", out_column="mean_encoded_regressor") + assert len(mean_encoder.params_to_tune()) == 1 diff --git a/tests/test_transforms/test_inference/test_inverse_transform.py b/tests/test_transforms/test_inference/test_inverse_transform.py index b3c7edc7e..34f43fcc2 100644 --- 
a/tests/test_transforms/test_inference/test_inverse_transform.py +++ b/tests/test_transforms/test_inference/test_inverse_transform.py @@ -38,6 +38,7 @@ from etna.transforms import MADTransform from etna.transforms import MaxAbsScalerTransform from etna.transforms import MaxTransform +from etna.transforms import MeanEncoderTransform from etna.transforms import MeanSegmentEncoderTransform from etna.transforms import MeanTransform from etna.transforms import MedianOutliersTransform @@ -185,6 +186,7 @@ def _test_inverse_transform_train(self, ts, transform, expected_changes): "ts_with_exog", {}, ), + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), "ts_with_exog", {}), (MeanSegmentEncoderTransform(), "regular_ts", {}), (SegmentEncoderTransform(), "regular_ts", {}), # feature_selection @@ -646,6 +648,7 @@ def test_inverse_transform_train_fail_resample(self, transform, dataset_name, ex "ts_with_exog", {}, ), + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), "ts_with_exog", {}), (MeanSegmentEncoderTransform(), "regular_ts", {}), (SegmentEncoderTransform(), "regular_ts", {}), # feature_selection @@ -1129,6 +1132,7 @@ def _test_inverse_transform_train_subset_segments(self, ts, transform, segments) # encoders (LabelEncoderTransform(in_column="weekday"), "ts_with_exog"), (OneHotEncoderTransform(in_column="weekday"), "ts_with_exog"), + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), "ts_with_exog"), (MeanSegmentEncoderTransform(), "regular_ts"), (SegmentEncoderTransform(), "regular_ts"), # feature_selection @@ -1414,6 +1418,7 @@ def _test_inverse_transform_future_subset_segments(self, ts, transform, segments # encoders (LabelEncoderTransform(in_column="weekday"), "ts_with_exog"), (OneHotEncoderTransform(in_column="weekday"), "ts_with_exog"), + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), "ts_with_exog"), (MeanSegmentEncoderTransform(), "regular_ts"), (SegmentEncoderTransform(), 
"regular_ts"), # feature_selection @@ -1691,6 +1696,7 @@ def _test_inverse_transform_train_new_segments(self, ts, transform, train_segmen "ts_with_exog", {}, ), + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder", mode="macro"), "ts_with_exog", {}), # feature_selection (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog", {}), (FilterFeaturesTransform(exclude=["year"], return_features=True), "ts_with_exog", {"create": {"year"}}), @@ -1969,6 +1975,7 @@ def test_inverse_transform_train_new_segments(self, transform, dataset_name, exp "regular_ts", ), # encoders + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), "ts_with_exog"), (MeanSegmentEncoderTransform(), "regular_ts"), (SegmentEncoderTransform(), "regular_ts"), # math @@ -2111,6 +2118,7 @@ def _test_inverse_transform_future_new_segments(self, ts, transform, train_segme "ts_with_exog", {}, ), + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder", mode="macro"), "ts_with_exog", {}), # feature_selection (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog", {}), ( @@ -2408,6 +2416,7 @@ def test_inverse_transform_future_new_segments(self, transform, dataset_name, ex "regular_ts", ), # encoders + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), "ts_with_exog"), (MeanSegmentEncoderTransform(), "regular_ts"), (SegmentEncoderTransform(), "regular_ts"), # math @@ -2633,6 +2642,7 @@ def _test_inverse_transform_future_with_target( "ts_with_exog", {}, ), + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), "ts_with_exog", {}), (MeanSegmentEncoderTransform(), "regular_ts", {}), (SegmentEncoderTransform(), "regular_ts", {}), # feature_selection @@ -3140,6 +3150,7 @@ def _test_inverse_transform_future_without_target( "ts_with_exog", {}, ), + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), "ts_with_exog", {}), (MeanSegmentEncoderTransform(), "regular_ts", {}), (SegmentEncoderTransform(), 
"regular_ts", {}), # feature_selection diff --git a/tests/test_transforms/test_inference/test_transform.py b/tests/test_transforms/test_inference/test_transform.py index f4d9215c6..3f1073a73 100644 --- a/tests/test_transforms/test_inference/test_transform.py +++ b/tests/test_transforms/test_inference/test_transform.py @@ -38,6 +38,7 @@ from etna.transforms import MADTransform from etna.transforms import MaxAbsScalerTransform from etna.transforms import MaxTransform +from etna.transforms import MeanEncoderTransform from etna.transforms import MeanSegmentEncoderTransform from etna.transforms import MeanTransform from etna.transforms import MedianOutliersTransform @@ -175,6 +176,11 @@ def _test_transform_train(self, ts, transform, expected_changes): "ts_with_exog", {"create": {"res_0", "res_1", "res_2", "res_3", "res_4", "res_5", "res_6"}}, ), + ( + MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), + "ts_with_exog", + {"create": {"mean_encoder"}}, + ), (MeanSegmentEncoderTransform(), "regular_ts", {"create": {"segment_mean"}}), (SegmentEncoderTransform(), "regular_ts", {"create": {"segment_code"}}), # feature_selection @@ -592,6 +598,11 @@ def test_transform_train_datetime_timestamp(self, transform, dataset_name, expec "ts_with_exog", {"create": {"res_0", "res_1", "res_2", "res_3", "res_4", "res_5", "res_6"}}, ), + ( + MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), + "ts_with_exog", + {"create": {"mean_encoder"}}, + ), (MeanSegmentEncoderTransform(), "regular_ts", {"create": {"segment_mean"}}), (SegmentEncoderTransform(), "regular_ts", {"create": {"segment_code"}}), # feature_selection @@ -1085,6 +1096,7 @@ def _test_transform_train_subset_segments(self, ts, transform, segments): # encoders (LabelEncoderTransform(in_column="weekday"), "ts_with_exog"), (OneHotEncoderTransform(in_column="weekday"), "ts_with_exog"), + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), "ts_with_exog"), 
(MeanSegmentEncoderTransform(), "regular_ts"), (SegmentEncoderTransform(), "regular_ts"), # feature_selection @@ -1352,6 +1364,7 @@ def _test_transform_future_subset_segments(self, ts, transform, segments, horizo # encoders (LabelEncoderTransform(in_column="weekday"), "ts_with_exog"), (OneHotEncoderTransform(in_column="weekday"), "ts_with_exog"), + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), "ts_with_exog"), (MeanSegmentEncoderTransform(), "regular_ts"), (SegmentEncoderTransform(), "regular_ts"), # feature_selection @@ -1619,6 +1632,11 @@ def _test_transform_train_new_segments(self, ts, transform, train_segments, expe "ts_with_exog", {"create": {"res_0", "res_1", "res_2", "res_3", "res_4", "res_5", "res_6"}}, ), + ( + MeanEncoderTransform(in_column="weekday", out_column="mean_encoder", mode="macro"), + "ts_with_exog", + {"create": {"mean_encoder"}}, + ), # feature_selection (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog", {"remove": {"year"}}), ( @@ -1872,6 +1890,7 @@ def test_transform_train_new_segments(self, transform, dataset_name, expected_ch "regular_ts", ), # encoders + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), "ts_with_exog"), (MeanSegmentEncoderTransform(), "regular_ts"), (SegmentEncoderTransform(), "regular_ts"), # math @@ -2001,6 +2020,11 @@ def _test_transform_future_new_segments(self, ts, transform, train_segments, exp "ts_with_exog", {"create": {"res_0", "res_1", "res_2", "res_3", "res_4", "res_5", "res_6"}}, ), + ( + MeanEncoderTransform(in_column="weekday", out_column="mean_encoder", mode="macro"), + "ts_with_exog", + {"create": {"mean_encoder"}}, + ), # feature_selection (FilterFeaturesTransform(exclude=["year"]), "ts_with_exog", {"remove": {"year"}}), ( @@ -2301,6 +2325,7 @@ def test_transform_future_new_segments(self, transform, dataset_name, expected_c "regular_ts", ), # encoders + (MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), "ts_with_exog"), 
(MeanSegmentEncoderTransform(), "regular_ts"), (SegmentEncoderTransform(), "regular_ts"), # math @@ -2468,6 +2493,11 @@ def _test_transform_future_with_target(self, ts, transform, expected_changes, ga "ts_with_exog", {"create": {"res_0", "res_1", "res_2", "res_3", "res_4", "res_5", "res_6"}}, ), + ( + MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), + "ts_with_exog", + {"create": {"mean_encoder"}}, + ), (MeanSegmentEncoderTransform(), "regular_ts", {"create": {"segment_mean"}}), (SegmentEncoderTransform(), "regular_ts", {"create": {"segment_code"}}), # feature_selection @@ -2907,6 +2937,11 @@ def _test_transform_future_without_target(self, ts, transform, expected_changes, "ts_with_exog", {"create": {"res_0", "res_1", "res_2", "res_3", "res_4", "res_5", "res_6"}}, ), + ( + MeanEncoderTransform(in_column="weekday", out_column="mean_encoder"), + "ts_with_exog", + {"create": {"mean_encoder"}}, + ), (MeanSegmentEncoderTransform(), "regular_ts", {"create": {"segment_mean"}}), (SegmentEncoderTransform(), "regular_ts", {"create": {"segment_code"}}), # feature_selection