Skip to content

Commit

Permalink
holiday_transform (#240)
Browse files Browse the repository at this point in the history
* holiday_transform

* fixes

* fixes

* fix name of errors

* major changes in holidaytransform

* minor changes

* edit changelog and delete some checks

* some changes

* fix changes

* change_test_inference

* add tests, fix codestyle

* .

* Fix docstrings in holiday module

* add tests

* fix_tests

* major changes in define_period

* fix week logic

* final_changes

* lint

* fix tests

* change description of mode

* better lint

* upd description

* change description

* change description

* correct gen

* l

* fix: fix docs rendering

---------

Co-authored-by: Yakov Malyshev <[email protected]>
Co-authored-by: Dmitry Bunin <[email protected]>
Co-authored-by: d-a-bunin <[email protected]>
  • Loading branch information
4 people authored Mar 5, 2024
1 parent a93e1b1 commit 3e28dfa
Show file tree
Hide file tree
Showing 6 changed files with 244 additions and 27 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add dataset integrity check using hash for internal datasets ([#151](https://github.com/etna-team/etna/pull/151))
- Create page about internal datasets in documentation ([#175](https://github.com/etna-team/etna/pull/175))
- Add usage example of internal datasets in `101-get_started.ipynb` and `305-classification.ipynb` tutorials ([#202](https://github.com/etna-team/etna/pull/202))
- Add new `mode="days_count"` to `HolidayTransform` ([#239](https://github.com/etna-team/etna/issues/239))
- Add size method to `TSDataset` class ([#238](https://github.com/etna-team/etna/pull/238))
- Add the `index_only` parameter to outlier analysis functions for return type control ([#231](https://github.com/etna-team/etna/pull/231))

Expand Down
124 changes: 108 additions & 16 deletions etna/transforms/timestamp/holiday.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,64 @@
import datetime
from enum import Enum
from typing import List
from typing import Optional

import holidays
import numpy as np
import pandas as pd

from pandas.tseries.offsets import MonthBegin
from pandas.tseries.offsets import MonthEnd
from pandas.tseries.offsets import QuarterBegin
from pandas.tseries.offsets import QuarterEnd
from pandas.tseries.offsets import Week
from pandas.tseries.offsets import YearBegin
from pandas.tseries.offsets import YearEnd
from typing_extensions import assert_never

from etna.datasets import TSDataset
from etna.transforms.base import IrreversibleTransform


def bigger_than_day(freq: Optional[str]):
    """Check whether one step of ``freq`` is longer than one calendar day.

    Two-point date ranges are generated from the same anchor date, one with daily
    frequency and one with ``freq``; the frequencies are compared through the second
    timestamp of each range.

    Parameters
    ----------
    freq:
        Frequency string understood by :py:func:`pandas.date_range`.

    Returns
    -------
    :
        ``True`` if the second timestamp generated with ``freq`` lies strictly after
        the second timestamp generated with daily frequency, ``False`` otherwise.
    """
    anchor = "2000-01-01"
    second_daily_stamp = pd.date_range(start=anchor, periods=2, freq="D")[-1]
    second_freq_stamp = pd.date_range(start=anchor, periods=2, freq=freq)[-1]
    return second_freq_stamp > second_daily_stamp


def define_period(offset: pd.tseries.offsets.BaseOffset, dt: pd.Timestamp, freq: Optional[str]):
    """Define start_date and end_date of the period that ``dt`` belongs to.

    * For weekly offsets the period is the Monday-Sunday week containing ``dt``.
    * For yearly offsets the period is the calendar year (1 January - 31 December) containing ``dt``.
    * For month-end and quarter-end offsets the period ends at ``dt`` and starts one day after
      the previous step of ``offset``.
    * For month-begin and quarter-begin offsets the period starts at ``dt`` and ends one day before
      the next step of ``offset``.

    Parameters
    ----------
    offset:
        Offset object that corresponds to the dataset frequency.
    dt:
        Timestamp to find the period for.
    freq:
        Frequency string of the dataset, used only in the error message.

    Returns
    -------
    :
        Tuple ``(start_date, end_date)`` with both ends included in the period.

    Raises
    ------
    ValueError:
        if ``offset`` is not a weekly, monthly, quarterly or yearly offset
    """
    # These are the offsets that the "W" and "Y" aliases resolve to. They are built from the classes
    # directly because the "Y" alias is deprecated since pandas 2.2 and triggers a FutureWarning
    # every time it is parsed.
    sunday_week = Week(weekday=6)
    december_year_end = YearEnd(month=12)
    one_day = pd.Timedelta(days=1)

    if isinstance(offset, Week):
        # Subtracting a Sunday-anchored week lands on the closest Sunday strictly before ``dt``,
        # so adding a day gives the Monday of the week that contains ``dt``.
        start_date = dt - sunday_week + one_day
        # For Sunday-anchored data ``dt`` already closes the week.
        end_date = dt if offset.weekday == 6 else dt + sunday_week
    elif isinstance(offset, (YearBegin, YearEnd)):
        # Closest 31 December strictly before ``dt`` plus one day is 1 January of the year of ``dt``.
        start_date = dt - december_year_end + one_day
        closes_calendar_year = isinstance(offset, YearEnd) and offset.month == 12
        end_date = dt if closes_calendar_year else dt + december_year_end
    elif isinstance(offset, (MonthEnd, QuarterEnd)):
        start_date = dt - offset + one_day
        end_date = dt
    elif isinstance(offset, (MonthBegin, QuarterBegin)):
        start_date = dt
        end_date = dt + offset - one_day
    else:
        raise ValueError(
            f"Days_count mode works only with weekly, monthly, quarterly or yearly data. You have freq={freq}"
        )
    return start_date, end_date


class HolidayTransformMode(str, Enum):
"""Enum for different imputation strategy."""

binary = "binary"
category = "category"
days_count = "days_count"

@classmethod
def _missing_(cls, value):
Expand All @@ -27,8 +71,13 @@ class HolidayTransform(IrreversibleTransform):
"""
HolidayTransform generates series that indicates holidays in given dataset.
In ``binary`` mode shows the presence of holiday in that day. In ``category`` mode shows the name of the holiday
with value "NO_HOLIDAY" reserved for days without holidays.
* In ``binary`` mode shows the presence of holiday in that day.
* In ``category`` mode shows the name of the holiday with value "NO_HOLIDAY" reserved for days without holidays.
* In ``days_count`` mode shows the frequency of holidays in a given period.
* If the frequency is weekly, then we count the proportion of holidays in a week (Monday-Sunday) that contains this day.
* If the frequency is monthly, then we count the proportion of holidays in a month that contains this day.
* If the frequency is yearly, then we count the proportion of holidays in a year that contains this day.
"""

_no_holiday_name: str = "NO_HOLIDAY"
Expand All @@ -40,9 +89,10 @@ def __init__(self, iso_code: str = "RUS", mode: str = "binary", out_column: Opti
Parameters
----------
iso_code:
internationally recognised codes, designated to country for which we want to find the holidays
internationally recognised codes, designated to country for which we want to find the holidays.
mode:
`binary` to indicate holidays, `category` to specify which holiday do we have at each day
`binary` to indicate holidays, `category` to specify which holiday do we have at each day,
`days_count` to determine the proportion of holidays in a given period of time.
out_column:
name of added column. Use ``self.__repr__()`` if not given.
"""
Expand All @@ -52,6 +102,7 @@ def __init__(self, iso_code: str = "RUS", mode: str = "binary", out_column: Opti
self._mode = HolidayTransformMode(mode)
self.holidays = holidays.country_holidays(iso_code)
self.out_column = out_column
self.freq: Optional[str] = None

def _get_column_name(self) -> str:
if self.out_column:
Expand All @@ -60,19 +111,40 @@ def _get_column_name(self) -> str:
return self.__repr__()

def _fit(self, df: pd.DataFrame) -> "HolidayTransform":
"""Fit the transform.
Parameters
----------
df:
Dataset to fit the transform on.
Returns
-------
:
The fitted transform instance.
"""
Fit HolidayTransform with data from df. Does nothing in this case.
return self

def fit(self, ts: TSDataset):
"""Fit the transform.
Parameters
----------
df: pd.DataFrame
value series with index column in timestamp format
ts:
Dataset to fit the transform on.
Returns
-------
:
The fitted transform instance.
"""
super().fit(ts=ts)
self.freq = ts.freq
return self

def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Transform data from df with HolidayTransform and generate a column of holidays flags or its titles.
Transform data.
Parameters
----------
Expand All @@ -83,27 +155,47 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
-------
:
pd.DataFrame with added holidays
Raises
------
ValueError:
if the frequency is greater than daily and this is a ``binary`` or ``categorical`` mode
ValueError:
if the frequency is not weekly, monthly, quarterly or yearly and this is ``days_count`` mode
"""
if (df.index[1] - df.index[0]) > datetime.timedelta(days=1):
raise ValueError("Frequency of data should be no more than daily.")
if self.freq is None:
raise ValueError("Transform is not fitted")
if bigger_than_day(self.freq) and self._mode is not HolidayTransformMode.days_count:
raise ValueError("For binary and category modes frequency of data should be no more than daily.")

cols = df.columns.get_level_values("segment").unique()
out_column = self._get_column_name()

if self._mode is HolidayTransformMode.category:
if self._mode is HolidayTransformMode.days_count:
date_offset = pd.tseries.frequencies.to_offset(self.freq)
encoded_matrix = np.empty(0)
for dt in df.index:
start_date, end_date = define_period(date_offset, pd.Timestamp(dt), self.freq)
date_range = pd.date_range(start=start_date, end=end_date, freq="D")
count_holidays = sum(1 for d in date_range if d in self.holidays)
holidays_freq = count_holidays / date_range.size
encoded_matrix = np.append(encoded_matrix, holidays_freq)
elif self._mode is HolidayTransformMode.category:
encoded_matrix = np.array(
[self.holidays[x] if x in self.holidays else self._no_holiday_name for x in df.index]
)
else:
elif self._mode is HolidayTransformMode.binary:
encoded_matrix = np.array([int(x in self.holidays) for x in df.index])
else:
assert_never(self._mode)
encoded_matrix = encoded_matrix.reshape(-1, 1).repeat(len(cols), axis=1)
encoded_df = pd.DataFrame(
encoded_matrix,
columns=pd.MultiIndex.from_product([cols, [out_column]], names=("segment", "feature")),
index=df.index,
)
encoded_df = encoded_df.astype("category")

if self._mode is not HolidayTransformMode.days_count:
encoded_df = encoded_df.astype("category")
df = df.join(encoded_df)
df = df.sort_index(axis=1)
return df
Expand Down
22 changes: 22 additions & 0 deletions tests/test_transforms/test_inference/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,28 @@ def regular_ts(random_seed) -> TSDataset:
return tsds


@pytest.fixture
def regular_ts_one_month(random_seed) -> TSDataset:
    """Create a dataset with three segments of 100 timestamps each at ``"M"`` frequency.

    Every segment starts from the same ``"2020-01-01"`` anchor and has a ``target`` drawn
    from a uniform distribution with its own bounds.
    """
    periods = 100
    # (segment name, lower bound, upper bound); the order is kept because every draw consumes
    # the global numpy random state (presumably seeded by ``random_seed`` - confirm in conftest)
    segment_bounds = [("segment_1", 10, 20), ("segment_2", -15, 5), ("segment_3", -5, 5)]

    frames = []
    for segment, low, high in segment_bounds:
        frame = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods, freq="M")})
        frame["segment"] = segment
        frame["target"] = np.random.uniform(low, high, size=periods)
        frames.append(frame)

    long_df = pd.concat(frames).reset_index(drop=True)
    wide_df = TSDataset.to_dataset(long_df)
    return TSDataset(wide_df, freq="M")


@pytest.fixture
def ts_with_exog(regular_ts) -> TSDataset:
df = regular_ts.to_pandas(flatten=True)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -229,6 +229,7 @@ def _test_inverse_transform_train_subset_segments(self, ts, transform, segments)
(FourierTransform(period=7, order=2), "regular_ts"),
(HolidayTransform(mode="binary"), "regular_ts"),
(HolidayTransform(mode="category"), "regular_ts"),
(HolidayTransform(mode="days_count"), "regular_ts_one_month"),
(SpecialDaysTransform(), "regular_ts"),
(TimeFlagsTransform(), "regular_ts"),
(EventTransform(in_column="holiday", out_column="holiday", n_pre=1, n_post=1), "ts_with_binary_exog"),
Expand Down Expand Up @@ -450,6 +451,7 @@ def _test_inverse_transform_future_subset_segments(self, ts, transform, segments
(FourierTransform(period=7, order=2), "regular_ts"),
(HolidayTransform(mode="binary"), "regular_ts"),
(HolidayTransform(mode="category"), "regular_ts"),
(HolidayTransform(mode="days_count"), "regular_ts_one_month"),
(SpecialDaysTransform(), "regular_ts"),
(TimeFlagsTransform(), "regular_ts"),
(EventTransform(in_column="holiday", out_column="holiday", n_pre=1, n_post=1), "ts_with_binary_exog"),
Expand Down Expand Up @@ -687,6 +689,7 @@ def _test_inverse_transform_train_new_segments(self, ts, transform, train_segmen
),
(HolidayTransform(out_column="res", mode="binary"), "regular_ts", {}),
(HolidayTransform(out_column="res", mode="category"), "regular_ts", {}),
(HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {}),
(
TimeFlagsTransform(out_column="res"),
"regular_ts",
Expand Down Expand Up @@ -1031,6 +1034,7 @@ def _test_inverse_transform_future_new_segments(self, ts, transform, train_segme
),
(HolidayTransform(out_column="res", mode="binary"), "regular_ts", {}),
(HolidayTransform(out_column="res", mode="category"), "regular_ts", {}),
(HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {}),
(
TimeFlagsTransform(out_column="res"),
"regular_ts",
Expand Down Expand Up @@ -1524,6 +1528,7 @@ def _test_inverse_transform_future_with_target(
),
(HolidayTransform(out_column="res", mode="binary"), "regular_ts", {}),
(HolidayTransform(out_column="res", mode="category"), "regular_ts", {}),
(HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {}),
(
TimeFlagsTransform(out_column="res"),
"regular_ts",
Expand Down Expand Up @@ -1957,6 +1962,7 @@ def _test_inverse_transform_future_without_target(
),
(HolidayTransform(out_column="res", mode="binary"), "regular_ts", {}),
(HolidayTransform(out_column="res", mode="category"), "regular_ts", {}),
(HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {}),
(
TimeFlagsTransform(out_column="res"),
"regular_ts",
Expand Down
6 changes: 6 additions & 0 deletions tests/test_transforms/test_inference/test_transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,7 @@ def _test_transform_train_subset_segments(self, ts, transform, segments):
(FourierTransform(period=7, order=2), "regular_ts"),
(HolidayTransform(mode="binary"), "regular_ts"),
(HolidayTransform(mode="category"), "regular_ts"),
(HolidayTransform(mode="days_count"), "regular_ts_one_month"),
(SpecialDaysTransform(), "regular_ts"),
(TimeFlagsTransform(), "regular_ts"),
(EventTransform(in_column="holiday", out_column="holiday", n_pre=1, n_post=1), "ts_with_binary_exog"),
Expand Down Expand Up @@ -432,6 +433,7 @@ def _test_transform_future_subset_segments(self, ts, transform, segments, horizo
(FourierTransform(period=7, order=2), "regular_ts"),
(HolidayTransform(mode="binary"), "regular_ts"),
(HolidayTransform(mode="category"), "regular_ts"),
(HolidayTransform(mode="days_count"), "regular_ts_one_month"),
(SpecialDaysTransform(), "regular_ts"),
(TimeFlagsTransform(), "regular_ts"),
(EventTransform(in_column="holiday", out_column="holiday", n_pre=1, n_post=1), "ts_with_binary_exog"),
Expand Down Expand Up @@ -631,6 +633,7 @@ def _test_transform_train_new_segments(self, ts, transform, train_segments, expe
),
(HolidayTransform(out_column="res", mode="binary"), "regular_ts", {"create": {"res"}}),
(HolidayTransform(out_column="res", mode="category"), "regular_ts", {"create": {"res"}}),
(HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {"create": {"res"}}),
(
TimeFlagsTransform(out_column="res"),
"regular_ts",
Expand Down Expand Up @@ -969,6 +972,7 @@ def _test_transform_future_new_segments(self, ts, transform, train_segments, exp
),
(HolidayTransform(out_column="res", mode="binary"), "regular_ts", {"create": {"res"}}),
(HolidayTransform(out_column="res", mode="category"), "regular_ts", {"create": {"res"}}),
(HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {"create": {"res"}}),
(
TimeFlagsTransform(out_column="res"),
"regular_ts",
Expand Down Expand Up @@ -1383,6 +1387,7 @@ def _test_transform_future_with_target(self, ts, transform, expected_changes, ga
),
(HolidayTransform(out_column="res", mode="binary"), "regular_ts", {"create": {"res"}}),
(HolidayTransform(out_column="res", mode="category"), "regular_ts", {"create": {"res"}}),
(HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {"create": {"res"}}),
(
TimeFlagsTransform(out_column="res"),
"regular_ts",
Expand Down Expand Up @@ -1780,6 +1785,7 @@ def _test_transform_future_without_target(self, ts, transform, expected_changes,
),
(HolidayTransform(out_column="res", mode="binary"), "regular_ts", {"create": {"res"}}),
(HolidayTransform(out_column="res", mode="category"), "regular_ts", {"create": {"res"}}),
(HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {"create": {"res"}}),
(
TimeFlagsTransform(out_column="res"),
"regular_ts",
Expand Down
Loading

0 comments on commit 3e28dfa

Please sign in to comment.