
Add MeanEncoderTransform #413

Merged · 15 commits · Jul 12, 2024
Changes from 13 commits
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
- Add `get_anomalies_mad` function for anomaly detection ([#398](https://github.com/etna-team/etna/pull/398))
- Add `TSDataset.features` property to get list of all features in a dataset ([#405](https://github.com/etna-team/etna/pull/405))
- Add `MADOutlierTransform` class for anomaly detection ([#415](https://github.com/etna-team/etna/pull/415))
-
- Add `MeanEncoderTransform` ([#413](https://github.com/etna-team/etna/pull/413))

### Changed
- Allow to change `device`, `batch_size` and `num_workers` of embedding models ([#396](https://github.com/etna-team/etna/pull/396))
1 change: 1 addition & 0 deletions docs/source/api_reference/transforms.rst
@@ -52,6 +52,7 @@ Categorical encoding transforms:
   :template: class.rst

   SegmentEncoderTransform
   MeanEncoderTransform
   MeanSegmentEncoderTransform
   LabelEncoderTransform
   OneHotEncoderTransform
1 change: 1 addition & 0 deletions etna/transforms/__init__.py
@@ -21,6 +21,7 @@
from etna.transforms.embeddings import EmbeddingSegmentTransform
from etna.transforms.embeddings import EmbeddingWindowTransform
from etna.transforms.encoders import LabelEncoderTransform
from etna.transforms.encoders import MeanEncoderTransform
from etna.transforms.encoders import MeanSegmentEncoderTransform
from etna.transforms.encoders import OneHotEncoderTransform
from etna.transforms.encoders import SegmentEncoderTransform
1 change: 1 addition & 0 deletions etna/transforms/encoders/__init__.py
@@ -1,4 +1,5 @@
from etna.transforms.encoders.categorical import LabelEncoderTransform
from etna.transforms.encoders.categorical import OneHotEncoderTransform
from etna.transforms.encoders.mean_encoder import MeanEncoderTransform
from etna.transforms.encoders.mean_segment_encoder import MeanSegmentEncoderTransform
from etna.transforms.encoders.segment_encoder import SegmentEncoderTransform
318 changes: 318 additions & 0 deletions etna/transforms/encoders/mean_encoder.py
@@ -0,0 +1,318 @@
import reprlib
from enum import Enum
from typing import Dict
from typing import List
from typing import Optional
from typing import Union
from typing import cast

import numpy as np
import pandas as pd
from bottleneck import nanmean

from etna.datasets import TSDataset
from etna.distributions import BaseDistribution
from etna.distributions import FloatDistribution
from etna.transforms import IrreversibleTransform


class EncoderMode(str, Enum):
    """Enum for different encoding strategies."""

    per_segment = "per-segment"
    macro = "macro"

    @classmethod
    def _missing_(cls, value):
        raise ValueError(f"The strategy '{value}' doesn't exist")


class MissingMode(str, Enum):
    """Enum for strategies of handling missing values."""

    category = "category"
    global_mean = "global_mean"

    @classmethod
    def _missing_(cls, value):
        raise NotImplementedError(
            f"{value} is not a valid {cls.__name__}. Supported types: {', '.join([repr(m.value) for m in cls])}"
        )


class MeanEncoderTransform(IrreversibleTransform):
    """
    Makes expanding mean target encoding of a categorical feature.

    For timestamps not later than the last timestamp seen in ``fit``, transformations are made using the formula below:

    .. math::
        \\frac{TargetSum + RunningMean * Smoothing}{FeatureCount + Smoothing}

    where

    * TargetSum is the sum of target over previous timestamps with the same category value as the current timestamp, not including the current timestamp
    * RunningMean is the mean of target over all previous timestamps, not including the current timestamp
    * FeatureCount is the number of previous timestamps with the same category value as the current timestamp, not including the current timestamp

    For future timestamps:

    * for known categories, encodings are filled with the global mean of target for these categories calculated during ``fit``
    * for unknown categories, encodings are filled with the global mean of target in the whole dataset calculated during ``fit``

    All types of NaN values are considered as one category.
    """

    idx = pd.IndexSlice

    def __init__(
        self,
        in_column: str,
        out_column: str,
        mode: Union[EncoderMode, str] = "per-segment",
        handle_missing: Union[MissingMode, str] = MissingMode.category,
        smoothing: float = 1,
    ):
        """
        Init MeanEncoderTransform.

        Parameters
        ----------
        in_column:
            categorical column to apply the transform to
        out_column:
            name of the added column
        mode:
            mode to encode segments

            * 'per-segment' - statistics are calculated across each segment individually

            * 'macro' - statistics are calculated across all segments; in this mode the transform can work with new segments that were not seen during ``fit``
        handle_missing:
            mode to handle missing values in ``in_column``

            * 'category' - NaNs are interpreted as a separate category

            * 'global_mean' - NaNs are filled with the running mean
        smoothing:
            smoothing parameter
        """
        super().__init__(required_features=["target", in_column])
        self.in_column = in_column
        self.out_column = out_column
        self.mode = EncoderMode(mode)
        self.handle_missing = MissingMode(handle_missing)
        self.smoothing = smoothing

        self._global_means: Optional[Union[float, Dict[str, float]]] = None
        self._global_means_category: Optional[Union[Dict[str, float], Dict[str, Dict[str, float]]]] = None
        self._last_timestamp: Union[pd.Timestamp, int, None] = None
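
Since ``mode`` and ``handle_missing`` are validated eagerly through the enums, a typo fails at construction time; a small sketch (argument values are illustrative):

from etna.transforms import MeanEncoderTransform

# valid construction
MeanEncoderTransform(in_column="day_type", out_column="day_type_mean", mode="macro")

# invalid mode fails immediately with: ValueError: The strategy 'global' doesn't exist
MeanEncoderTransform(in_column="day_type", out_column="day_type_mean", mode="global")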

    def _fit(self, df: pd.DataFrame) -> "MeanEncoderTransform":
        """
        Fit encoder.

        Parameters
        ----------
        df:
            dataframe with data to fit the expanding mean target encoder.

        Returns
        -------
        :
            Fitted transform
        """
        # normalize all kinds of missing values in the categorical column to np.NaN
        df.loc[:, pd.IndexSlice[:, self.in_column]] = df.loc[:, pd.IndexSlice[:, self.in_column]].fillna(np.NaN)

        if self.mode is EncoderMode.per_segment:
            axis = 0
            segments = df.columns.get_level_values("segment").unique().tolist()
            global_means = nanmean(df.loc[:, self.idx[:, "target"]], axis=axis)
            global_means = dict(zip(segments, global_means))

            global_means_category = {}
            for segment in segments:

[Collaborator] Can't we, in theory, groupby by both "segment" and in_column to get rid of this cycle over segments?

[Collaborator] It remains valid.

                segment_df = TSDataset.to_flatten(df.loc[:, pd.IndexSlice[segment, :]])
                global_means_category[segment] = (
                    segment_df[[self.in_column, "target"]]
                    .groupby(self.in_column, dropna=False)
                    .mean()
                    .to_dict()["target"]
                )
        else:
            axis = None
            global_means = nanmean(df.loc[:, self.idx[:, "target"]], axis=axis)

            segment_df = TSDataset.to_flatten(df)
            global_means_category = (
                segment_df[[self.in_column, "target"]].groupby(self.in_column, dropna=False).mean().to_dict()["target"]
            )

        self._global_means = global_means
        self._global_means_category = global_means_category
        self._last_timestamp = df.index[-1]

        return self

    @staticmethod
    def _count_macro_running_mean(df, n_segments):
        """Count the running mean of target over all segments, not including the current timestamp.

        Expects ``df`` to be sorted by ("timestamp", "segment") with ``n_segments`` rows per timestamp.
        """
        y = df["target"]
        timestamp_count = y.groupby(df["timestamp"]).transform("count")
        timestamp_sum = y.groupby(df["timestamp"]).transform("sum")
        # one value per timestamp: cumulative sum / cumulative count over all segments
        expanding_mean = timestamp_sum.iloc[::n_segments].cumsum() / timestamp_count.iloc[::n_segments].cumsum()
        expanding_mean = expanding_mean.repeat(n_segments)
        # shift by one timestamp so the current timestamp is excluded; first timestamp is NaN
        expanding_mean = pd.Series(index=df.index, data=expanding_mean.values).shift(n_segments)
        return expanding_mean
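
A minimal sketch of what this helper returns on a toy flattened frame with two segments; it calls the private helper directly and assumes the class from this PR is importable (the frame and its values are illustrative):

import pandas as pd
from etna.transforms import MeanEncoderTransform

# 3 timestamps x 2 segments, already sorted by (timestamp, segment).
flatten = pd.DataFrame(
    {
        "timestamp": [0, 0, 1, 1, 2, 2],
        "segment": ["s1", "s2"] * 3,
        "target": [1.0, 3.0, 5.0, 7.0, 9.0, 11.0],
    }
)

running_mean = MeanEncoderTransform._count_macro_running_mean(flatten, n_segments=2)
print(running_mean.tolist())
# [nan, nan, 2.0, 2.0, 4.0, 4.0]
# timestamp 1 sees mean(1, 3) = 2.0; timestamp 2 sees mean(1, 3, 5, 7) = 4.0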

    def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Get encoded values for the segments.

        Parameters
        ----------
        df:
            dataframe with data to transform.

        Returns
        -------
        :
            result dataframe

        Raises
        ------
        ValueError:
            If transform isn't fitted.
        NotImplementedError:
            If there are segments that weren't present during training.
        """
        if self._global_means is None:
            raise ValueError("The transform isn't fitted!")

        segments = df.columns.get_level_values("segment").unique().tolist()
        n_segments = len(segments)
        if self.mode is EncoderMode.per_segment:
            self._global_means = cast(Dict[str, float], self._global_means)
            new_segments = set(segments) - self._global_means.keys()
            if len(new_segments) > 0:
                raise NotImplementedError(
                    f"This transform can't process segments that weren't present in the train data: {reprlib.repr(new_segments)}"
                )
        df.loc[:, self.idx[:, self.in_column]] = df.loc[:, self.idx[:, self.in_column]].fillna(np.NaN)

        future_timestamps = df.index[df.index > self._last_timestamp]
        intersected_timestamps = df.index[df.index <= self._last_timestamp]

        intersected_df = df.loc[intersected_timestamps, self.idx[:, :]]
        future_df = df.loc[future_timestamps, self.idx[:, :]]

        if len(intersected_df) > 0:
            if self.mode is EncoderMode.per_segment:
                for segment in segments:
                    segment_df = TSDataset.to_flatten(intersected_df.loc[:, self.idx[segment, :]])
                    y = segment_df["target"]
                    # running mean not including the current timestamp; first timestamp is NaN
                    expanding_mean = y.expanding().mean().shift()
                    # cumcount not including the current timestamp
                    cumcount = y.groupby(segment_df[self.in_column].astype(str)).agg("cumcount")
                    # cumsum not including the current timestamp
                    cumsum = (
                        y.groupby(segment_df[self.in_column].astype(str))
                        .transform(lambda x: x.shift().cumsum())
                        .fillna(0)
                    )
                    feature = (cumsum + expanding_mean * self.smoothing) / (cumcount + self.smoothing)
                    if self.handle_missing is MissingMode.global_mean:
                        nan_feature_index = segment_df[segment_df[self.in_column].isnull()].index
                        feature.loc[nan_feature_index] = expanding_mean.loc[nan_feature_index]
                    intersected_df.loc[:, self.idx[segment, self.out_column]] = feature.values
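
The shift-before-cumsum trick above is what excludes the current row from the statistics; a minimal sketch on toy values, consistent with the example after the class docstring:

import pandas as pd

y = pd.Series([2.0, 4.0, 6.0, 8.0])
cats = pd.Series(["a", "b", "a", "a"])

# number of previous occurrences of the same category, current row excluded
cumcount = y.groupby(cats).cumcount()  # [0, 0, 1, 2]
# sum of target over previous occurrences of the same category, current row excluded
cumsum = y.groupby(cats).transform(lambda x: x.shift().cumsum()).fillna(0)  # [0.0, 0.0, 2.0, 8.0]
# with smoothing=1 the last row encodes to (8 + 4) / (2 + 1) = 4.0, matching the docstring formula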

            else:
                flatten = TSDataset.to_flatten(intersected_df)
                flatten = flatten.sort_values(["timestamp", "segment"])
                running_mean = self._count_macro_running_mean(flatten, n_segments)

                temp = pd.DataFrame(index=flatten.index, columns=["cumsum", "cumcount"], dtype=float)

                timestamps = intersected_df.index
                categories = pd.unique(df.loc[:, self.idx[:, self.in_column]].values.ravel())

                cumstats = pd.DataFrame(data={"sum": 0, "count": 0, self.in_column: categories})

[Collaborator] Why are we using a dataframe here? I think it isn't really convenient or efficient.

                # to_flatten's row labels are segment-major and survive sort_values,
                # so the rows of timestamp t carry labels t, T + t, ..., (n_segments - 1) * T + t
                cur_timestamp_idx = np.arange(0, len(timestamps) * n_segments, len(timestamps))
                for timestamp in timestamps:
                    # .loc[timestamp] returns a series and .to_flatten fails.
                    timestamp_df = TSDataset.to_flatten(intersected_df.loc[timestamp:timestamp, self.idx[:, :]])
                    # statistics from the previous timestamps
                    cumsum_dict = dict(cumstats[[self.in_column, "sum"]].values)
                    cumcount_dict = dict(cumstats[[self.in_column, "count"]].values)
                    # map categories of the current timestamp to the statistics
                    temp.loc[cur_timestamp_idx, "cumsum"] = flatten.loc[cur_timestamp_idx, self.in_column].map(
                        cumsum_dict
                    )
                    temp.loc[cur_timestamp_idx, "cumcount"] = flatten.loc[cur_timestamp_idx, self.in_column].map(
                        cumcount_dict
                    )
                    # count statistics for the current timestamp
                    stats = (
                        timestamp_df["target"]
                        .groupby(timestamp_df[self.in_column], dropna=False)
                        .agg(["count", "sum"])
                        .reset_index()
                    )
                    # sum current and previous statistics
                    cumstats = pd.concat([cumstats, stats]).groupby(self.in_column, as_index=False, dropna=False).sum()
                    cur_timestamp_idx += 1

                feature = (temp["cumsum"] + running_mean * self.smoothing) / (temp["cumcount"] + self.smoothing)
                if self.handle_missing is MissingMode.global_mean:
                    nan_feature_index = flatten[flatten[self.in_column].isnull()].index
                    feature.loc[nan_feature_index] = running_mean.loc[nan_feature_index]

                feature = pd.DataFrame(
                    feature.values.reshape(len(timestamps), n_segments),
                    columns=pd.MultiIndex.from_product([segments, [self.out_column]]),
                    index=intersected_df.index,
                )
                intersected_df = pd.concat([intersected_df, feature], axis=1)

        if len(future_df) > 0:
            n_timestamps = len(future_df.index)
            if self.mode is EncoderMode.per_segment:
                self._global_means_category = cast(Dict[str, Dict[str, float]], self._global_means_category)
                self._global_means = cast(Dict[str, float], self._global_means)
                for segment in segments:
                    segment_df = TSDataset.to_flatten(future_df.loc[:, self.idx[segment, :]])
                    # known categories get their per-category mean, unknown ones the segment's global mean
                    feature = segment_df[self.in_column].map(self._global_means_category[segment])
                    feature = feature.fillna(self._global_means[segment])
                    future_df.loc[:, self.idx[segment, self.out_column]] = feature.values
            else:
                flatten = TSDataset.to_flatten(future_df)
                feature = flatten[self.in_column].map(self._global_means_category)
                feature = feature.fillna(self._global_means)
                feature = pd.DataFrame(
                    feature.values.reshape(len(segments), n_timestamps).T,
                    columns=pd.MultiIndex.from_product([segments, [self.out_column]]),
                    index=future_df.index,
                )
                future_df = pd.concat([future_df, feature], axis=1)

        intersected_df = intersected_df.sort_index(axis=1)
        future_df = future_df.sort_index(axis=1)
        transformed_df = pd.concat((intersected_df, future_df), axis=0)
        return transformed_df

    def get_regressors_info(self) -> List[str]:
        """Return the list with regressors created by the transform."""
        return [self.out_column]

    def params_to_tune(self) -> Dict[str, BaseDistribution]:
        """Get default grid for tuning hyperparameters.

        This grid tunes the ``smoothing`` parameter. Other parameters are expected to be set by the user.

        Returns
        -------
        :
            Grid to tune.
        """
        return {"smoothing": FloatDistribution(low=0, high=2)}