diff --git a/CHANGELOG.md b/CHANGELOG.md index c7be14d56..47c2d3d35 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add dataset integrity check using hash for internal datasets ([#151](https://github.com/etna-team/etna/pull/151)) - Create page about internal datasets in documentation ([#175](https://github.com/etna-team/etna/pull/175)) - Add usage example of internal datasets in `101-get_started.ipynb` and `305-classification.ipynb` tutorials ([#202](https://github.com/etna-team/etna/pull/202)) +- Add new `mode="days_count"` to `HolidayTransform` ([#239](https://github.com/etna-team/etna/issues/239)) - Add size method to `TSDataset` class ([#238](https://github.com/etna-team/etna/pull/238)) - Add the `index_only` parameter to outlier analysis functions for return type control ([#231](https://github.com/etna-team/etna/pull/231)) diff --git a/etna/transforms/timestamp/holiday.py b/etna/transforms/timestamp/holiday.py index bc3fe6c45..453fb756f 100644 --- a/etna/transforms/timestamp/holiday.py +++ b/etna/transforms/timestamp/holiday.py @@ -1,4 +1,3 @@ -import datetime from enum import Enum from typing import List from typing import Optional @@ -6,15 +5,60 @@ import holidays import numpy as np import pandas as pd - +from pandas.tseries.offsets import MonthBegin +from pandas.tseries.offsets import MonthEnd +from pandas.tseries.offsets import QuarterBegin +from pandas.tseries.offsets import QuarterEnd +from pandas.tseries.offsets import Week +from pandas.tseries.offsets import YearBegin +from pandas.tseries.offsets import YearEnd +from typing_extensions import assert_never + +from etna.datasets import TSDataset from etna.transforms.base import IrreversibleTransform +def bigger_than_day(freq: Optional[str]): + """Compare frequency with day.""" + dt = "2000-01-01" + dates_day = pd.date_range(start=dt, periods=2, freq="D") + dates_freq = pd.date_range(start=dt, periods=2, 
freq=freq) + return dates_freq[-1] > dates_day[-1] + + +def define_period(offset: pd.tseries.offsets.BaseOffset, dt: pd.Timestamp, freq: Optional[str]): + """Define start_date and end_date of period using dataset frequency.""" + if isinstance(offset, Week) and offset.weekday == 6: + start_date = dt - pd.tseries.frequencies.to_offset("W") + pd.Timedelta(days=1) + end_date = dt + elif isinstance(offset, Week): + start_date = dt - pd.tseries.frequencies.to_offset("W") + pd.Timedelta(days=1) + end_date = dt + pd.tseries.frequencies.to_offset("W") + elif isinstance(offset, YearEnd) and offset.month == 12: + start_date = dt - pd.tseries.frequencies.to_offset("Y") + pd.Timedelta(days=1) + end_date = dt + elif isinstance(offset, (YearBegin, YearEnd)): + start_date = dt - pd.tseries.frequencies.to_offset("Y") + pd.Timedelta(days=1) + end_date = dt + pd.tseries.frequencies.to_offset("Y") + elif isinstance(offset, (MonthEnd, QuarterEnd, YearEnd)): + start_date = dt - offset + pd.Timedelta(days=1) + end_date = dt + elif isinstance(offset, (MonthBegin, QuarterBegin, YearBegin)): + start_date = dt + end_date = dt + offset - pd.Timedelta(days=1) + else: + raise ValueError( + f"Days_count mode works only with weekly, monthly, quarterly or yearly data. You have freq={freq}" + ) + return start_date, end_date + + class HolidayTransformMode(str, Enum): """Enum for different imputation strategy.""" binary = "binary" category = "category" + days_count = "days_count" @classmethod def _missing_(cls, value): @@ -27,8 +71,13 @@ class HolidayTransform(IrreversibleTransform): """ HolidayTransform generates series that indicates holidays in given dataset. - In ``binary`` mode shows the presence of holiday in that day. In ``category`` mode shows the name of the holiday - with value "NO_HOLIDAY" reserved for days without holidays. + * In ``binary`` mode shows the presence of holiday in that day. 
+ * In ``category`` mode shows the name of the holiday with value "NO_HOLIDAY" reserved for days without holidays. + * In ``days_count`` mode shows the frequency of holidays in a given period. + + * If the frequency is weekly, then we count the proportion of holidays in a week (Monday-Sunday) that contains this day. + * If the frequency is monthly, then we count the proportion of holidays in a month that contains this day. + * If the frequency is yearly, then we count the proportion of holidays in a year that contains this day. """ _no_holiday_name: str = "NO_HOLIDAY" @@ -40,9 +89,10 @@ def __init__(self, iso_code: str = "RUS", mode: str = "binary", out_column: Opti Parameters ---------- iso_code: - internationally recognised codes, designated to country for which we want to find the holidays + internationally recognised codes, designated to country for which we want to find the holidays. mode: - `binary` to indicate holidays, `category` to specify which holiday do we have at each day + `binary` to indicate holidays, `category` to specify which holiday do we have at each day, + `days_count` to determine the proportion of holidays in a given period of time. out_column: name of added column. Use ``self.__repr__()`` if not given. """ @@ -52,6 +102,7 @@ def __init__(self, iso_code: str = "RUS", mode: str = "binary", out_column: Opti self._mode = HolidayTransformMode(mode) self.holidays = holidays.country_holidays(iso_code) self.out_column = out_column + self.freq: Optional[str] = None def _get_column_name(self) -> str: if self.out_column: @@ -60,19 +111,40 @@ def _get_column_name(self) -> str: return self.__repr__() def _fit(self, df: pd.DataFrame) -> "HolidayTransform": + """Fit the transform. + + Parameters + ---------- + df: + Dataset to fit the transform on. + + Returns + ------- + : + The fitted transform instance. """ - Fit HolidayTransform with data from df. Does nothing in this case. + return self + + def fit(self, ts: TSDataset): + """Fit the transform. 
Parameters ---------- ts: + Dataset to fit the transform on. + + Returns + ------- + : + The fitted transform instance. """ + super().fit(ts=ts) + self.freq = ts.freq return self def _transform(self, df: pd.DataFrame) -> pd.DataFrame: """ - Transform data from df with HolidayTransform and generate a column of holidays flags or its titles. + Transform data. Parameters ---------- @@ -83,27 +155,47 @@ def _transform(self, df: pd.DataFrame) -> pd.DataFrame: ------- : pd.DataFrame with added holidays + + Raises + ------ + ValueError: + if the frequency is greater than daily and this is ``binary`` or ``category`` mode + ValueError: + if the frequency is not weekly, monthly, quarterly or yearly and this is ``days_count`` mode """ - if (df.index[1] - df.index[0]) > datetime.timedelta(days=1): - raise ValueError("Frequency of data should be no more than daily.") + if self.freq is None: + raise ValueError("Transform is not fitted") + if bigger_than_day(self.freq) and self._mode is not HolidayTransformMode.days_count: + raise ValueError("For binary and category modes frequency of data should be no more than daily.") cols = df.columns.get_level_values("segment").unique() out_column = self._get_column_name() - if self._mode is HolidayTransformMode.category: + if self._mode is HolidayTransformMode.days_count: + date_offset = pd.tseries.frequencies.to_offset(self.freq) + encoded_matrix = np.empty(0) + for dt in df.index: + start_date, end_date = define_period(date_offset, pd.Timestamp(dt), self.freq) + date_range = pd.date_range(start=start_date, end=end_date, freq="D") + count_holidays = sum(1 for d in date_range if d in self.holidays) + holidays_freq = count_holidays / date_range.size + encoded_matrix = np.append(encoded_matrix, holidays_freq) + elif self._mode is HolidayTransformMode.category: encoded_matrix = np.array( [self.holidays[x] if x in self.holidays else self._no_holiday_name for x in 
df.index] ) - else: + elif self._mode is HolidayTransformMode.binary: encoded_matrix = np.array([int(x in self.holidays) for x in df.index]) + else: + assert_never(self._mode) encoded_matrix = encoded_matrix.reshape(-1, 1).repeat(len(cols), axis=1) encoded_df = pd.DataFrame( encoded_matrix, columns=pd.MultiIndex.from_product([cols, [out_column]], names=("segment", "feature")), index=df.index, ) - encoded_df = encoded_df.astype("category") - + if self._mode is not HolidayTransformMode.days_count: + encoded_df = encoded_df.astype("category") df = df.join(encoded_df) df = df.sort_index(axis=1) return df diff --git a/tests/test_transforms/test_inference/conftest.py b/tests/test_transforms/test_inference/conftest.py index a2295302e..813ee4063 100644 --- a/tests/test_transforms/test_inference/conftest.py +++ b/tests/test_transforms/test_inference/conftest.py @@ -28,6 +28,28 @@ def regular_ts(random_seed) -> TSDataset: return tsds +@pytest.fixture +def regular_ts_one_month(random_seed) -> TSDataset: + periods = 100 + df_1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods, freq="M")}) + df_1["segment"] = "segment_1" + df_1["target"] = np.random.uniform(10, 20, size=periods) + + df_2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods, freq="M")}) + df_2["segment"] = "segment_2" + df_2["target"] = np.random.uniform(-15, 5, size=periods) + + df_3 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods, freq="M")}) + df_3["segment"] = "segment_3" + df_3["target"] = np.random.uniform(-5, 5, size=periods) + + df = pd.concat([df_1, df_2, df_3]).reset_index(drop=True) + df = TSDataset.to_dataset(df) + tsds = TSDataset(df, freq="M") + + return tsds + + @pytest.fixture def ts_with_exog(regular_ts) -> TSDataset: df = regular_ts.to_pandas(flatten=True) diff --git a/tests/test_transforms/test_inference/test_inverse_transform.py b/tests/test_transforms/test_inference/test_inverse_transform.py index 57cbc6731..57b7db2ec 100644 
--- a/tests/test_transforms/test_inference/test_inverse_transform.py +++ b/tests/test_transforms/test_inference/test_inverse_transform.py @@ -229,6 +229,7 @@ def _test_inverse_transform_train_subset_segments(self, ts, transform, segments) (FourierTransform(period=7, order=2), "regular_ts"), (HolidayTransform(mode="binary"), "regular_ts"), (HolidayTransform(mode="category"), "regular_ts"), + (HolidayTransform(mode="days_count"), "regular_ts_one_month"), (SpecialDaysTransform(), "regular_ts"), (TimeFlagsTransform(), "regular_ts"), (EventTransform(in_column="holiday", out_column="holiday", n_pre=1, n_post=1), "ts_with_binary_exog"), @@ -450,6 +451,7 @@ def _test_inverse_transform_future_subset_segments(self, ts, transform, segments (FourierTransform(period=7, order=2), "regular_ts"), (HolidayTransform(mode="binary"), "regular_ts"), (HolidayTransform(mode="category"), "regular_ts"), + (HolidayTransform(mode="days_count"), "regular_ts_one_month"), (SpecialDaysTransform(), "regular_ts"), (TimeFlagsTransform(), "regular_ts"), (EventTransform(in_column="holiday", out_column="holiday", n_pre=1, n_post=1), "ts_with_binary_exog"), @@ -687,6 +689,7 @@ def _test_inverse_transform_train_new_segments(self, ts, transform, train_segmen ), (HolidayTransform(out_column="res", mode="binary"), "regular_ts", {}), (HolidayTransform(out_column="res", mode="category"), "regular_ts", {}), + (HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {}), ( TimeFlagsTransform(out_column="res"), "regular_ts", @@ -1031,6 +1034,7 @@ def _test_inverse_transform_future_new_segments(self, ts, transform, train_segme ), (HolidayTransform(out_column="res", mode="binary"), "regular_ts", {}), (HolidayTransform(out_column="res", mode="category"), "regular_ts", {}), + (HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {}), ( TimeFlagsTransform(out_column="res"), "regular_ts", @@ -1524,6 +1528,7 @@ def _test_inverse_transform_future_with_target( ), 
(HolidayTransform(out_column="res", mode="binary"), "regular_ts", {}), (HolidayTransform(out_column="res", mode="category"), "regular_ts", {}), + (HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {}), ( TimeFlagsTransform(out_column="res"), "regular_ts", @@ -1957,6 +1962,7 @@ def _test_inverse_transform_future_without_target( ), (HolidayTransform(out_column="res", mode="binary"), "regular_ts", {}), (HolidayTransform(out_column="res", mode="category"), "regular_ts", {}), + (HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {}), ( TimeFlagsTransform(out_column="res"), "regular_ts", diff --git a/tests/test_transforms/test_inference/test_transform.py b/tests/test_transforms/test_inference/test_transform.py index cfb5c47cd..66e93b778 100644 --- a/tests/test_transforms/test_inference/test_transform.py +++ b/tests/test_transforms/test_inference/test_transform.py @@ -221,6 +221,7 @@ def _test_transform_train_subset_segments(self, ts, transform, segments): (FourierTransform(period=7, order=2), "regular_ts"), (HolidayTransform(mode="binary"), "regular_ts"), (HolidayTransform(mode="category"), "regular_ts"), + (HolidayTransform(mode="days_count"), "regular_ts_one_month"), (SpecialDaysTransform(), "regular_ts"), (TimeFlagsTransform(), "regular_ts"), (EventTransform(in_column="holiday", out_column="holiday", n_pre=1, n_post=1), "ts_with_binary_exog"), @@ -432,6 +433,7 @@ def _test_transform_future_subset_segments(self, ts, transform, segments, horizo (FourierTransform(period=7, order=2), "regular_ts"), (HolidayTransform(mode="binary"), "regular_ts"), (HolidayTransform(mode="category"), "regular_ts"), + (HolidayTransform(mode="days_count"), "regular_ts_one_month"), (SpecialDaysTransform(), "regular_ts"), (TimeFlagsTransform(), "regular_ts"), (EventTransform(in_column="holiday", out_column="holiday", n_pre=1, n_post=1), "ts_with_binary_exog"), @@ -631,6 +633,7 @@ def _test_transform_train_new_segments(self, ts, transform, 
train_segments, expe ), (HolidayTransform(out_column="res", mode="binary"), "regular_ts", {"create": {"res"}}), (HolidayTransform(out_column="res", mode="category"), "regular_ts", {"create": {"res"}}), + (HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {"create": {"res"}}), ( TimeFlagsTransform(out_column="res"), "regular_ts", @@ -969,6 +972,7 @@ def _test_transform_future_new_segments(self, ts, transform, train_segments, exp ), (HolidayTransform(out_column="res", mode="binary"), "regular_ts", {"create": {"res"}}), (HolidayTransform(out_column="res", mode="category"), "regular_ts", {"create": {"res"}}), + (HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {"create": {"res"}}), ( TimeFlagsTransform(out_column="res"), "regular_ts", @@ -1383,6 +1387,7 @@ def _test_transform_future_with_target(self, ts, transform, expected_changes, ga ), (HolidayTransform(out_column="res", mode="binary"), "regular_ts", {"create": {"res"}}), (HolidayTransform(out_column="res", mode="category"), "regular_ts", {"create": {"res"}}), + (HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {"create": {"res"}}), ( TimeFlagsTransform(out_column="res"), "regular_ts", @@ -1780,6 +1785,7 @@ def _test_transform_future_without_target(self, ts, transform, expected_changes, ), (HolidayTransform(out_column="res", mode="binary"), "regular_ts", {"create": {"res"}}), (HolidayTransform(out_column="res", mode="category"), "regular_ts", {"create": {"res"}}), + (HolidayTransform(out_column="res", mode="days_count"), "regular_ts_one_month", {"create": {"res"}}), ( TimeFlagsTransform(out_column="res"), "regular_ts", diff --git a/tests/test_transforms/test_timestamp/test_holiday_transform.py b/tests/test_transforms/test_timestamp/test_holiday_transform.py index 2aeac8a41..828963659 100644 --- a/tests/test_transforms/test_timestamp/test_holiday_transform.py +++ b/tests/test_transforms/test_timestamp/test_holiday_transform.py @@ 
-5,6 +5,7 @@ from etna.datasets import TSDataset from etna.datasets import generate_const_df from etna.transforms.timestamp import HolidayTransform +from etna.transforms.timestamp.holiday import define_period from tests.test_transforms.utils import assert_transformation_equals_loaded_original @@ -26,6 +27,14 @@ def simple_constant_df_daily(): return df +@pytest.fixture() +def simple_constant_df_day_15_min(): + df = pd.DataFrame({"timestamp": pd.date_range(start="2020-11-25 22:30", end="2020-12-11", freq="1D 15MIN")}) + df["target"] = 42 + df.set_index("timestamp", inplace=True) + return df + + @pytest.fixture() def two_segments_simple_ts_daily(simple_constant_df_daily: pd.DataFrame): df_1 = simple_constant_df_daily.reset_index() @@ -41,6 +50,21 @@ def two_segments_simple_ts_daily(simple_constant_df_daily: pd.DataFrame): return ts +@pytest.fixture() +def two_segments_simple_ts_day_15min(simple_constant_df_day_15_min: pd.DataFrame): + df_1 = simple_constant_df_day_15_min.reset_index() + df_2 = simple_constant_df_day_15_min.reset_index() + df_1 = df_1[3:] + + df_1["segment"] = "segment_1" + df_2["segment"] = "segment_2" + + classic_df = pd.concat([df_1, df_2], ignore_index=True) + df = TSDataset.to_dataset(classic_df) + ts = TSDataset(df, freq="1D 15MIN") + return ts + + @pytest.fixture() def simple_constant_df_hour(): df = pd.DataFrame({"timestamp": pd.date_range(start="2020-01-08 22:15", end="2020-01-10", freq="H")}) @@ -49,6 +73,44 @@ def simple_constant_df_hour(): return df +@pytest.fixture() +def simple_week_mon_df(): + df = pd.DataFrame({"timestamp": pd.date_range(start="2020-01-08 22:15", end="2020-05-12", freq="W-MON")}) + df["target"] = 7 + df.set_index("timestamp", inplace=True) + return df + + +@pytest.fixture() +def two_segments_w_mon(simple_week_mon_df: pd.DataFrame): + df_1 = simple_week_mon_df.reset_index() + df_2 = simple_week_mon_df.reset_index() + df_1 = df_1[3:] + + df_1["segment"] = "segment_1" + df_2["segment"] = "segment_2" + + classic_df = 
pd.concat([df_1, df_2], ignore_index=True) + df = TSDataset.to_dataset(classic_df) + ts = TSDataset(df, freq="W-MON") + return ts + + +@pytest.fixture() +def two_segments_simple_ts_hour(simple_constant_df_hour: pd.DataFrame): + df_1 = simple_constant_df_hour.reset_index() + df_2 = simple_constant_df_hour.reset_index() + df_1 = df_1[3:] + + df_1["segment"] = "segment_1" + df_2["segment"] = "segment_2" + + classic_df = pd.concat([df_1, df_2], ignore_index=True) + df = TSDataset.to_dataset(classic_df) + ts = TSDataset(df, freq="H") + return ts + + @pytest.fixture() def two_segments_simple_ts_hour(simple_constant_df_hour: pd.DataFrame): df_1 = simple_constant_df_hour.reset_index() @@ -194,18 +256,24 @@ def test_holidays_min(iso_code: str, answer: np.array, two_segments_simple_ts_mi assert np.array_equal(df[segment]["regressor_holidays"].values, answer) -@pytest.mark.parametrize( - "index", - ( - (pd.date_range(start="2020-11-25 22:30", end="2020-12-11", freq="1D 15MIN")), - (pd.date_range(start="2019-11-25", end="2021-02-25", freq="M")), - ), -) -def test_holidays_failed(index: pd.DatetimeIndex, two_segments_simple_ts_daily: TSDataset): - ts = two_segments_simple_ts_daily - ts.df.index = index +@pytest.mark.parametrize("ts_name", ("two_segments_w_mon", "two_segments_simple_ts_day_15min")) +def test_holidays_failed(ts_name, request): + ts = request.getfixturevalue(ts_name) holidays_finder = HolidayTransform(out_column="holiday") - with pytest.raises(ValueError, match="Frequency of data should be no more than daily."): + with pytest.raises( + ValueError, match="For binary and category modes frequency of data should be no more than daily." 
+ ): + ts = holidays_finder.fit_transform(ts) + + +@pytest.mark.parametrize("ts_name", ("two_segments_simple_ts_daily", "two_segments_simple_ts_min")) +def test_holidays_days_count_mode_failed(ts_name, request): + ts = request.getfixturevalue(ts_name) + holidays_finder = HolidayTransform(out_column="holiday", mode="days_count") + with pytest.raises( + ValueError, + match=f"Days_count mode works only with weekly, monthly, quarterly or yearly data. You have freq={ts.freq}", + ): ts = holidays_finder.fit_transform(ts) @@ -224,3 +292,25 @@ def test_save_load(example_tsds): def test_params_to_tune(): transform = HolidayTransform() assert len(transform.params_to_tune()) == 0 + + +@pytest.mark.parametrize( + "freq, timestamp, expected_result", + ( + ("Y", pd.Timestamp("2000-12-31"), [pd.Timestamp("2000-01-01"), pd.Timestamp("2000-12-31")]), + ("YS", pd.Timestamp("2000-01-01"), [pd.Timestamp("2000-01-01"), pd.Timestamp("2000-12-31")]), + ("A-OCT", pd.Timestamp("2000-10-31"), [pd.Timestamp("2000-01-01"), pd.Timestamp("2000-12-31")]), + ("AS-OCT", pd.Timestamp("2000-10-01"), [pd.Timestamp("2000-01-01"), pd.Timestamp("2000-12-31")]), + ("Q", pd.Timestamp("2000-12-31"), [pd.Timestamp("2000-10-01"), pd.Timestamp("2000-12-31")]), + ("QS", pd.Timestamp("2000-01-01"), [pd.Timestamp("2000-01-01"), pd.Timestamp("2000-03-31")]), + ("Q-NOV", pd.Timestamp("2000-11-30"), [pd.Timestamp("2000-09-01"), pd.Timestamp("2000-11-30")]), + ("QS-NOV", pd.Timestamp("2000-11-01"), [pd.Timestamp("2000-11-01"), pd.Timestamp("2001-01-31")]), + ("M", pd.Timestamp("2000-01-31"), [pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-31")]), + ("MS", pd.Timestamp("2000-01-01"), [pd.Timestamp("2000-01-01"), pd.Timestamp("2000-01-31")]), + ("W", pd.Timestamp("2000-12-03"), [pd.Timestamp("2000-11-27"), pd.Timestamp("2000-12-03")]), + ("W-THU", pd.Timestamp("2000-11-30"), [pd.Timestamp("2000-11-27"), pd.Timestamp("2000-12-03")]), + ), +) +def test_define_period_end(freq, timestamp, expected_result): + assert 
(define_period(pd.tseries.frequencies.to_offset(freq), timestamp, freq))[0] == expected_result[0] + assert (define_period(pd.tseries.frequencies.to_offset(freq), timestamp, freq))[1] == expected_result[1]