From 2b92650be85ce068b1a38f4a3f544ec21cc01788 Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Thu, 25 Jul 2024 13:04:10 +0300 Subject: [PATCH 1/9] added implementation --- etna/transforms/decomposition/dft_based.py | 199 +++++++++++++++++++++ 1 file changed, 199 insertions(+) create mode 100644 etna/transforms/decomposition/dft_based.py diff --git a/etna/transforms/decomposition/dft_based.py b/etna/transforms/decomposition/dft_based.py new file mode 100644 index 000000000..329536d87 --- /dev/null +++ b/etna/transforms/decomposition/dft_based.py @@ -0,0 +1,199 @@ +from typing import List + +import numpy as np +import pandas as pd + +from etna.datasets import TSDataset +from etna.datasets.utils import determine_num_steps +from etna.transforms import IrreversibleTransform + + +class FourierDecomposeTransform(IrreversibleTransform): + """Transform that uses Fourier transformation to estimate series decomposition. + + Note + ---- + This transform decomposes only in-sample data. For the future timestamps it produces ``NaN``. + For the dataset to be transformed, it should contain at least the minimum amount of in-sample timestamps that are required by transform. + + Warning + ------- + This transform adds new columns to the dataset, that correspond to the selected frequencies. Such columns are named with + ``dft_{i}`` suffix. Suffix index do NOT indicate any relation to the frequencies. Produced names should be thought of as + arbitrary identifiers to the produced sinusoids. + """ + + def __init__(self, k: int, in_column: str = "target", residuals: bool = False): + """Init ``FourierDecomposeTransform``. + + Parameters + ---------- + k: + how many top positive frequencies selected for the decomposition. Selection performed proportional to the amplitudes. + in_column: + name of the processed column. + residuals: + whether to add residuals after decomposition. This guarantees that all components, including residuals, sum up to the series. + """ + if k <= 0: + raise ValueError("Parameter `k` must be positive integer!") + + self.k = k + self.in_column = in_column + self.residuals = residuals + + self._first_timestamp = None + self._last_timestamp = None + + super().__init__(required_features=[in_column]) + + def get_regressors_info(self) -> List[str]: + """Return the list with regressors created by the transform.""" + return [] + + def _fit(self, df: pd.DataFrame): + """Fit transform with the dataframe.""" + pass + + def _transform(self, df: pd.DataFrame) -> pd.DataFrame: + """Transform provided dataframe.""" + pass + + @staticmethod + def _get_num_pos_freqs(series: pd.Series) -> int: + """Get number of positive frequencies for the series.""" + num_obs = len(series) + return int(np.ceil((num_obs - 1) / 2) + 1) + + def _check_segments(self, df: pd.DataFrame): + """Check if series satisfy conditions.""" + segments_with_missing = [] + min_num_pos_freq = float("inf") + for segment in df: + series = df[segment] + series = series.loc[series.first_valid_index() : series.last_valid_index()] + if series.isna().any(): + segments_with_missing.append(segment) + + min_num_pos_freq = min(min_num_pos_freq, self._get_num_pos_freqs(series)) + + if len(segments_with_missing) > 0: + raise ValueError( + f"Feature `{self.in_column}` contains missing values in segments: {segments_with_missing}!" + ) + + if self.k > min_num_pos_freq: + raise ValueError(f"Parameter `k` must not be greater then {min_num_pos_freq} for the provided dataset!") + + def _dft_components(self, series: pd.Series) -> pd.DataFrame: + """Estimate series decomposition using FFT.""" + initial_index = series.index + series = series.loc[series.first_valid_index() : series.last_valid_index()] + + num_pos_freqs = self._get_num_pos_freqs(series) + + # compute Fourier decomposition of the series + dft_series = np.fft.fft(series) + + # compute "amplitudes" for each frequency + abs_dft_series = np.abs(dft_series) + + # select top-k indices + abs_pos_dft_series = abs_dft_series[:num_pos_freqs] + top_k_idxs = np.argpartition(abs_pos_dft_series, num_pos_freqs - self.k)[-self.k :] + + # select top-k and separate each frequency + freq_matrix = np.diag(dft_series) + freq_matrix = freq_matrix[:num_pos_freqs] + selected_freqs = freq_matrix[top_k_idxs] + + # return frequencies to initial domain + components = np.fft.ifft(selected_freqs).real + + components_df = pd.DataFrame( + data=components.T, columns=[f"dft_{i}" for i in range(components.shape[0])], index=series.index + ) + + if self.residuals: + components_df["dft_residuals"] = series.values - np.sum(components, axis=0) + + if not components_df.index.equals(initial_index): + components_df = components_df.reindex(index=initial_index, fill_value=np.nan) + + return components_df + + def fit(self, ts: TSDataset) -> "FourierDecomposeTransform": + """Fit the transform and the decomposition model. + + Parameters + ---------- + ts: + dataset to fit the transform on. + + Returns + ------- + : + the fitted transform instance. + """ + self._first_timestamp = ts.index.min() + self._last_timestamp = ts.index.max() + + self._check_segments(df=ts[..., self.in_column].droplevel("feature", axis=1)) + + return self + + def transform(self, ts: TSDataset) -> TSDataset: + """Transform ``TSDataset`` inplace. + + Parameters + ---------- + ts: + Dataset to transform. + + Returns + ------- + : + Transformed ``TSDataset``. + """ + if self._first_timestamp is None: + raise ValueError("Transform is not fitted!") + + if ts.index.min() < self._first_timestamp: + raise ValueError( + f"First index of the dataset to be transformed must be larger or equal then {self._first_timestamp}!" + ) + + if ts.index.min() > self._last_timestamp: + raise ValueError( + f"Dataset to be transformed must contain historical observations in range {self._first_timestamp} - {self._last_timestamp}" + ) + + segment_df = ts[..., self.in_column].droplevel("feature", axis=1) + + ts_max_timestamp = ts.index.max() + if ts_max_timestamp > self._last_timestamp: + future_steps = determine_num_steps(self._last_timestamp, ts_max_timestamp, freq=ts.freq) + segment_df.iloc[-future_steps:] = np.nan + + self._check_segments(df=segment_df) + + segments = segment_df.columns + segment_components = [] + for segment in segments: + components_df = self._dft_components(series=segment_df[segment]) + components_df.columns = f"{self.in_column}_" + components_df.columns + + components_df.columns = pd.MultiIndex.from_product( + [[segment], components_df.columns], names=["segment", "feature"] + ) + + segment_components.append(components_df) + + segment_components = pd.concat(segment_components, axis=1) + + ts.add_columns_from_pandas(segment_components) + + return ts + + +__all__ = ["FourierDecomposeTransform"] From 37da3cd3ad497878126e915da6a8e4e252be2edb Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Thu, 25 Jul 2024 13:04:56 +0300 Subject: [PATCH 2/9] added tests --- etna/transforms/__init__.py | 1 + etna/transforms/decomposition/__init__.py | 1 + tests/conftest.py | 26 ++ .../test_decomposition/test_dft_based.py | 344 ++++++++++++++++++ 4 files changed, 372 insertions(+) create mode 100644 tests/test_transforms/test_decomposition/test_dft_based.py diff --git a/etna/transforms/__init__.py b/etna/transforms/__init__.py index 59e4f536a..d0395a9fb 100644 --- a/etna/transforms/__init__.py +++ b/etna/transforms/__init__.py @@ -12,6 +12,7 @@ from etna.transforms.decomposition import ChangePointsSegmentationTransform from etna.transforms.decomposition import ChangePointsTrendTransform from etna.transforms.decomposition import DeseasonalityTransform +from etna.transforms.decomposition import FourierDecomposeTransform from etna.transforms.decomposition import IrreversibleChangePointsTransform from etna.transforms.decomposition import LinearTrendTransform from etna.transforms.decomposition import ReversibleChangePointsTransform diff --git a/etna/transforms/decomposition/__init__.py b/etna/transforms/decomposition/__init__.py index a5516ec62..ae2558af6 100644 --- a/etna/transforms/decomposition/__init__.py +++ b/etna/transforms/decomposition/__init__.py @@ -16,4 +16,5 @@ from etna.transforms.decomposition.deseasonal import DeseasonalityTransform from etna.transforms.decomposition.detrend import LinearTrendTransform from etna.transforms.decomposition.detrend import TheilSenTrendTransform +from etna.transforms.decomposition.dft_based import FourierDecomposeTransform from etna.transforms.decomposition.stl import STLTransform diff --git a/tests/conftest.py b/tests/conftest.py index dd81e2e3c..260afbef2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -908,3 +908,29 @@ def ts_with_binary_exog() -> TSDataset: df_exog = TSDataset.to_dataset(df_exog) ts = TSDataset(df, freq="D", df_exog=df_exog, known_future="all") return ts + + +@pytest.fixture() +def outliers_solid_tsds(): + """Create TSDataset with outliers and same last date.""" + timestamp = pd.date_range("2021-01-01", end="2021-02-20", freq="D") + target1 = [np.sin(i) for i in range(len(timestamp))] + target1[10] += 10 + + target2 = [np.sin(i) for i in range(len(timestamp))] + target2[8] += 8 + target2[15] = 2 + target2[26] -= 12 + + df1 = pd.DataFrame({"timestamp": timestamp, "target": target1, "segment": "1"}) + df2 = pd.DataFrame({"timestamp": timestamp, "target": target2, "segment": "2"}) + df = pd.concat([df1, df2], ignore_index=True) + df_exog = df.copy() + df_exog.columns = ["timestamp", "regressor_1", "segment"] + ts = TSDataset( + df=TSDataset.to_dataset(df).iloc[:-10], + df_exog=TSDataset.to_dataset(df_exog), + freq="D", + known_future="all", + ) + return ts diff --git a/tests/test_transforms/test_decomposition/test_dft_based.py b/tests/test_transforms/test_decomposition/test_dft_based.py new file mode 100644 index 000000000..27fc750d3 --- /dev/null +++ b/tests/test_transforms/test_decomposition/test_dft_based.py @@ -0,0 +1,344 @@ +import numpy as np +import pandas as pd +import pytest + +from etna.datasets import TSDataset +from etna.datasets import generate_ar_df +from etna.metrics import MAE +from etna.models import CatBoostPerSegmentModel +from etna.models import HoltWintersModel +from etna.models import ProphetModel +from etna.pipeline import Pipeline +from etna.transforms import FourierDecomposeTransform +from etna.transforms import IForestOutlierTransform +from etna.transforms import TimeSeriesImputerTransform + + +def simple_pipeline_with_decompose(in_column, horizon, k): + pipeline = Pipeline( + transforms=[FourierDecomposeTransform(k=k, in_column=in_column)], + model=HoltWintersModel(), + horizon=horizon, + ) + return pipeline + + +@pytest.fixture() +def ts_with_exogs() -> TSDataset: + periods = 100 + periods_exog = periods + 10 + df = generate_ar_df(start_time="2020-01-01", periods=periods, freq="D", n_segments=2) + df_exog = generate_ar_df(start_time="2020-01-01", periods=periods_exog, freq="D", n_segments=2, random_seed=2) + df_exog.rename(columns={"target": "exog"}, inplace=True) + df_exog["holiday"] = np.random.choice([0, 1], size=periods_exog * 2) + + df = TSDataset.to_dataset(df) + df_exog = TSDataset.to_dataset(df_exog) + ts = TSDataset(df, freq="D", df_exog=df_exog, known_future="all") + return ts + + +@pytest.fixture() +def ts_with_exogs_train_test(ts_with_exogs): + return ts_with_exogs.train_test_split(test_size=20) + + +@pytest.fixture() +def forward_stride_datasets(ts_with_exogs): + train_df = ts_with_exogs.df.iloc[:-10] + test_df = ts_with_exogs.df.iloc[-20:] + + train_ts = TSDataset(df=train_df, freq=ts_with_exogs.freq) + test_ts = TSDataset(df=test_df, freq=ts_with_exogs.freq) + + return train_ts, test_ts + + +@pytest.fixture() +def ts_with_missing(ts_with_exogs): + target_df = ts_with_exogs[..., "target"] + target_df.iloc[10] = np.nan + + return TSDataset(df=target_df, freq=ts_with_exogs.freq) + + +@pytest.mark.parametrize("in_column", ("target", "feat")) +def test_init(in_column): + transform = FourierDecomposeTransform(k=5, in_column=in_column) + assert transform.required_features == [in_column] + assert transform._first_timestamp is None + assert transform._last_timestamp is None + + +@pytest.mark.parametrize("k", (-1, 0)) +def test_invalid_k(k): + with pytest.raises(ValueError, match="Parameter `k` must be positive integer!"): + FourierDecomposeTransform(k=k, in_column="target") + + +@pytest.mark.parametrize( + "series, answ", + ( + (pd.Series([1]), 1), + (pd.Series([1, 2]), 2), + (pd.Series([1, 2, 3]), 2), + (pd.Series([1, 2, 3, 4]), 3), + (pd.Series([1, 2, 3, 4, 5]), 3), + (pd.Series([1, 2, 3, 4, 5, 6]), 4), + ), +) +def test_get_num_pos_freqs(series, answ): + res = FourierDecomposeTransform._get_num_pos_freqs(series=series) + assert res == answ + + +def test_check_segments_missing_values(ts_with_missing): + df = ts_with_missing[..., "target"] + transform = FourierDecomposeTransform(k=5) + with pytest.raises(ValueError, match=f"Feature `target` contains missing values"): + transform._check_segments(df=df) + + +def test_check_segments_large_k(ts_with_exogs): + df = ts_with_exogs[..., "target"] + transform = FourierDecomposeTransform(k=100) + with pytest.raises(ValueError, match=f"Parameter `k` must not be greater then"): + transform._check_segments(df=df) + + +def test_check_segments_ok(ts_with_exogs): + df = ts_with_exogs[..., "target"] + transform = FourierDecomposeTransform(k=5) + transform._check_segments(df=df) + + +@pytest.mark.parametrize( + "series", + ( + pd.Series(np.arange(5)), + pd.Series(np.arange(10)), + pd.Series([np.nan] * 2 + list(range(5)) + [np.nan] * 3), + ), +) +def test_fft_components_out_forma(series): + expected_columns = ["dft_0", "dft_1", "dft_2", "dft_residuals"] + transform = FourierDecomposeTransform(k=3, residuals=True) + + decompose_df = transform._dft_components(series=series) + + assert isinstance(decompose_df, pd.DataFrame) + pd.testing.assert_index_equal(decompose_df.index, series.index) + assert (decompose_df.columns == expected_columns).all() + np.testing.assert_allclose(np.sum(decompose_df.values, axis=1), series.values) + + +def test_is_not_fitted(simple_tsdf): + transform = FourierDecomposeTransform(k=5, in_column="feat") + with pytest.raises(ValueError, match="Transform is not fitted!"): + transform.transform(ts=simple_tsdf) + + +@pytest.mark.parametrize( + "ts_name,in_column", + ( + ("outliers_df_with_two_columns", "target"), + ("outliers_df_with_two_columns", "feature"), + ("ts_with_exogs", "target"), + ("ts_with_exogs", "exog"), + ("ts_with_exogs", "holiday"), + ("example_tsds_int_timestamp", "target"), + ), +) +def test_fit(ts_name, in_column, request): + ts = request.getfixturevalue(ts_name) + transform = FourierDecomposeTransform(k=5, in_column=in_column) + transform.fit(ts=ts) + + assert transform._first_timestamp == ts.index.min() + assert transform._last_timestamp == ts.index.max() + + +@pytest.mark.parametrize("residuals", (True, False)) +@pytest.mark.parametrize("in_column", ("target", "exog")) +def test_add_residuals(ts_with_exogs, residuals, in_column): + ts = ts_with_exogs + + transform = FourierDecomposeTransform(k=5, in_column=in_column, residuals=residuals) + transformed = transform.fit_transform(ts=ts) + + assert (f"{in_column}_dft_residuals" in transformed.features) is residuals + + +def test_timestamp_from_history(ts_with_exogs_train_test): + test, train = ts_with_exogs_train_test + transform = FourierDecomposeTransform(k=5) + transform.fit_transform(train) + + with pytest.raises(ValueError, match="First index of the dataset to be transformed must be larger"): + transform.transform(test) + + +def test_timestamp_from_future(ts_with_exogs_train_test): + train, test = ts_with_exogs_train_test + transform = FourierDecomposeTransform(k=5) + transform.fit_transform(train) + + with pytest.raises(ValueError, match="Dataset to be transformed must contain historical observations in range"): + transform.transform(test) + + +@pytest.mark.parametrize( + "in_column", + ( + "target", + "holiday", + "exog", + ), +) +@pytest.mark.parametrize("horizon", (1, 5)) +def test_simple_pipeline_forecast(ts_with_exogs, in_column, horizon): + ts = ts_with_exogs + + pipeline = simple_pipeline_with_decompose(in_column=in_column, horizon=horizon, k=5) + + pipeline.fit(ts=ts) + forecast = pipeline.forecast() + + assert forecast.size()[0] == horizon + assert np.sum(forecast[..., "target"].isna().sum()) == 0 + + +@pytest.mark.parametrize( + "in_column", + ( + "target", + "holiday", + "exog", + ), +) +@pytest.mark.parametrize("horizon", (1, 5)) +def test_simple_pipeline_predict(ts_with_exogs, in_column, horizon): + ts = ts_with_exogs + + pipeline = simple_pipeline_with_decompose(in_column=in_column, horizon=horizon, k=5) + + pipeline.fit(ts=ts) + forecast = pipeline.predict(ts) + + assert forecast.size()[0] == ts.size()[0] + assert np.sum(forecast[..., "target"].isna().sum()) == 0 + + +@pytest.mark.parametrize( + "in_column", + ( + "target", + "holiday", + "exog", + ), +) +@pytest.mark.parametrize("horizon", (1, 5)) +def test_simple_pipeline_predict_components(ts_with_exogs, in_column, horizon): + ts = ts_with_exogs + + pipeline = simple_pipeline_with_decompose(in_column=in_column, horizon=horizon, k=5) + + pipeline.fit(ts=ts) + forecast = pipeline.predict(ts, return_components=True) + + assert forecast.size()[0] == ts.size()[0] + assert forecast.target_components_names == ("target_component_level",) + + +@pytest.mark.parametrize( + "in_column", + ( + "target", + "holiday", + "exog", + ), +) +@pytest.mark.parametrize("horizon", (1, 5)) +def test_simple_pipeline_backtest(ts_with_exogs, in_column, horizon): + ts = ts_with_exogs + + pipeline = simple_pipeline_with_decompose(in_column=in_column, horizon=horizon, k=5) + + _, forecast, _ = pipeline.backtest(ts=ts, metrics=[MAE()], n_folds=3) + + assert len(forecast) == horizon * 3 + assert np.sum(forecast.loc[:, pd.IndexSlice[:, "target"]].isna().sum()) == 0 + + +@pytest.mark.parametrize( + "ts_name,in_column", + ( + ("outliers_df_with_two_columns", "target"), + ("outliers_df_with_two_columns", "feature"), + ("ts_with_exogs", "target"), + ("ts_with_exogs", "exog"), + ), +) +@pytest.mark.parametrize("k", (1, 5, 10, 40, 51)) +@pytest.mark.parametrize("forecast_model", (ProphetModel(), CatBoostPerSegmentModel(iterations=10))) +def test_pipeline_ks(ts_name, in_column, forecast_model, k, request): + ts = request.getfixturevalue(ts_name) + + pipeline = Pipeline( + transforms=[FourierDecomposeTransform(k=5, in_column=in_column)], + model=forecast_model, + horizon=3, + ) + + pipeline.fit(ts) + forecast = pipeline.forecast() + + assert forecast.size()[0] == 3 + assert np.sum(forecast.loc[:, pd.IndexSlice[:, "target"]].isna().sum()) == 0 + + +@pytest.mark.parametrize("answer", ({"1": ["2021-01-11"], "2": ["2021-01-09"]},)) +def test_outlier_detection(outliers_solid_tsds, answer): + ts = outliers_solid_tsds + + transforms = [ + FourierDecomposeTransform(k=2, in_column="target", residuals=True), + IForestOutlierTransform( + in_column="target", + features_to_ignore=["target", "regressor_1"], + contamination=0.01, + ), + ] + ts.fit_transform(transforms) + + for segment in ts.segments: + empty_values = pd.isna(ts[:, segment, "target"]) + assert empty_values.sum() == len(answer[segment]) + assert all(empty_values[answer[segment]]) + + +def test_outlier_detection_pipeline(outliers_solid_tsds): + ts = outliers_solid_tsds + pipeline = Pipeline( + transforms=[ + FourierDecomposeTransform(k=5, in_column="target"), + IForestOutlierTransform(in_column="target"), + TimeSeriesImputerTransform(in_column="target"), + ], + model=ProphetModel(), + horizon=3, + ) + pipeline.fit(ts) + + +@pytest.mark.parametrize("k", (1, 5)) +def test_stride_transform(forward_stride_datasets, k): + train, test = forward_stride_datasets + + transform = FourierDecomposeTransform(k=k, residuals=True) + + transform.fit(train) + transformed = transform.transform(test) + + assert not transformed.df.iloc[:10].isna().any().any() + assert transformed.df.iloc[10:].isna().all().any() From 915afe635c4d7706266a88d61ef066f751ddb595 Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Thu, 25 Jul 2024 13:05:29 +0300 Subject: [PATCH 3/9] moved fixture --- .../test_outliers/test_outliers_transform.py | 26 ------------------- 1 file changed, 26 deletions(-) diff --git a/tests/test_transforms/test_outliers/test_outliers_transform.py b/tests/test_transforms/test_outliers/test_outliers_transform.py index 87ff27ecb..daad6fc8d 100644 --- a/tests/test_transforms/test_outliers/test_outliers_transform.py +++ b/tests/test_transforms/test_outliers/test_outliers_transform.py @@ -78,32 +78,6 @@ def compare_outputs(ts, in_column, method, transform_constructor, method_kwargs, assert np.all(transformed_column[transformed_column.isna()].index == nan_timestamps) -@pytest.fixture() -def outliers_solid_tsds(): - """Create TSDataset with outliers and same last date.""" - timestamp = pd.date_range("2021-01-01", end="2021-02-20", freq="D") - target1 = [np.sin(i) for i in range(len(timestamp))] - target1[10] += 10 - - target2 = [np.sin(i) for i in range(len(timestamp))] - target2[8] += 8 - target2[15] = 2 - target2[26] -= 12 - - df1 = pd.DataFrame({"timestamp": timestamp, "target": target1, "segment": "1"}) - df2 = pd.DataFrame({"timestamp": timestamp, "target": target2, "segment": "2"}) - df = pd.concat([df1, df2], ignore_index=True) - df_exog = df.copy() - df_exog.columns = ["timestamp", "regressor_1", "segment"] - ts = TSDataset( - df=TSDataset.to_dataset(df).iloc[:-10], - df_exog=TSDataset.to_dataset(df_exog), - freq="D", - known_future="all", - ) - return ts - - @pytest.fixture() def outliers_solid_tsds_with_holidays(outliers_solid_tsds): """Create TSDataset with outliers with holidays""" From a3b90ac69abf991b7131c098b5edf7f561824451 Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Thu, 25 Jul 2024 13:05:45 +0300 Subject: [PATCH 4/9] updated inference tests --- .../test_inference/test_inverse_transform.py | 9 +++++++ .../test_inference/test_transform.py | 25 +++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/tests/test_transforms/test_inference/test_inverse_transform.py b/tests/test_transforms/test_inference/test_inverse_transform.py index b3c7edc7e..56dd7c9fa 100644 --- a/tests/test_transforms/test_inference/test_inverse_transform.py +++ b/tests/test_transforms/test_inference/test_inverse_transform.py @@ -23,6 +23,7 @@ from etna.transforms import EventTransform from etna.transforms import ExogShiftTransform from etna.transforms import FilterFeaturesTransform +from etna.transforms import FourierDecomposeTransform from etna.transforms import FourierTransform from etna.transforms import GaleShapleyFeatureSelectionTransform from etna.transforms import HolidayTransform @@ -141,6 +142,7 @@ def _test_inverse_transform_train(self, ts, transform, expected_changes): "regular_ts", {}, ), + (FourierDecomposeTransform(in_column="target", k=5, residuals=True), "regular_ts", {}), # embeddings ( EmbeddingSegmentTransform( @@ -602,6 +604,7 @@ def test_inverse_transform_train_fail_resample(self, transform, dataset_name, ex "regular_ts", {}, ), + (FourierDecomposeTransform(in_column="target", k=5, residuals=True), "regular_ts", {}), # embeddings ( EmbeddingSegmentTransform( @@ -1093,6 +1096,7 @@ def _test_inverse_transform_train_subset_segments(self, ts, transform, segments) ), "regular_ts", ), + (FourierDecomposeTransform(in_column="target", k=5, residuals=True), "regular_ts"), # embeddings ( EmbeddingSegmentTransform( @@ -1378,6 +1382,8 @@ def _test_inverse_transform_future_subset_segments(self, ts, transform, segments ), "regular_ts", ), + (FourierDecomposeTransform(in_column="target", k=5, residuals=True), "regular_ts"), + (FourierDecomposeTransform(in_column="positive", k=5, residuals=True), "ts_with_exog"), # embeddings ( EmbeddingSegmentTransform( @@ -2589,6 +2595,7 @@ def _test_inverse_transform_future_with_target( "regular_ts", {}, ), + # (FourierDecomposeTransform(in_column="target", k=5, residuals=True), "regular_ts", {}), # embeddings ( EmbeddingSegmentTransform( @@ -3096,6 +3103,8 @@ def _test_inverse_transform_future_without_target( "regular_ts", {}, ), + (FourierDecomposeTransform(in_column="target", k=5, residuals=True), "regular_ts", {}), + (FourierDecomposeTransform(in_column="positive", k=5, residuals=True), "ts_with_exog", {}), # embeddings ( EmbeddingSegmentTransform( diff --git a/tests/test_transforms/test_inference/test_transform.py b/tests/test_transforms/test_inference/test_transform.py index f4d9215c6..a3c15ed6e 100644 --- a/tests/test_transforms/test_inference/test_transform.py +++ b/tests/test_transforms/test_inference/test_transform.py @@ -23,6 +23,7 @@ from etna.transforms import EventTransform from etna.transforms import ExogShiftTransform from etna.transforms import FilterFeaturesTransform +from etna.transforms import FourierDecomposeTransform from etna.transforms import FourierTransform from etna.transforms import GaleShapleyFeatureSelectionTransform from etna.transforms import HolidayTransform @@ -127,6 +128,11 @@ def _test_transform_train(self, ts, transform, expected_changes): "regular_ts", {"create": {"res"}}, ), + ( + FourierDecomposeTransform(in_column="target", k=2, residuals=True), + "regular_ts", + {"create": {"target_dft_0", "target_dft_1", "target_dft_residuals"}}, + ), # embeddings ( EmbeddingSegmentTransform( @@ -544,6 +550,11 @@ def test_transform_train_datetime_timestamp(self, transform, dataset_name, expec "regular_ts", {"create": {"res"}}, ), + ( + FourierDecomposeTransform(in_column="target", k=2, residuals=True), + "regular_ts", + {"create": {"target_dft_0", "target_dft_1", "target_dft_residuals"}}, + ), # embeddings ( EmbeddingSegmentTransform( @@ -1049,6 +1060,7 @@ def _test_transform_train_subset_segments(self, ts, transform, segments): ), "regular_ts", ), + (FourierDecomposeTransform(in_column="target", k=2, residuals=True), "regular_ts"), # embeddings ( EmbeddingSegmentTransform( @@ -1316,6 +1328,8 @@ def _test_transform_future_subset_segments(self, ts, transform, segments, horizo ), "regular_ts", ), + (FourierDecomposeTransform(in_column="target", k=2, residuals=True), "regular_ts"), + (FourierDecomposeTransform(in_column="positive", k=2, residuals=True), "ts_with_exog"), # embeddings ( EmbeddingSegmentTransform( @@ -2420,6 +2434,7 @@ def _test_transform_future_with_target(self, ts, transform, expected_changes, ga "regular_ts", {"create": {"res"}}, ), + # (FourierDecomposeTransform(in_column="target", k=2, residuals=True), "regular_ts", {"create": {"target_dft_0", "target_dft_1", "target_dft_residuals"}}), # embeddings ( EmbeddingSegmentTransform( @@ -2859,6 +2874,16 @@ def _test_transform_future_without_target(self, ts, transform, expected_changes, "regular_ts", {"create": {"res"}}, ), + ( + FourierDecomposeTransform(in_column="target", k=2, residuals=True), + "regular_ts", + {"create": {"target_dft_0", "target_dft_1", "target_dft_residuals"}}, + ), + ( + FourierDecomposeTransform(in_column="positive", k=2, residuals=True), + "ts_with_exog", + {"create": {"positive_dft_0", "positive_dft_1", "positive_dft_residuals"}}, + ), # embeddings ( EmbeddingSegmentTransform( From 35d5c44406c7b97cd146d3be12855c5e970123f1 Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Thu, 25 Jul 2024 13:05:59 +0300 Subject: [PATCH 5/9] updated doc --- docs/source/api_reference/transforms.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/api_reference/transforms.rst b/docs/source/api_reference/transforms.rst index 409bf97fc..ce8ec263a 100644 --- a/docs/source/api_reference/transforms.rst +++ b/docs/source/api_reference/transforms.rst @@ -44,6 +44,7 @@ Decomposition transforms and their utilities: decomposition.MedianPerIntervalModel decomposition.SklearnPreprocessingPerIntervalModel decomposition.SklearnRegressionPerIntervalModel + decomposition.FourierDecomposeTransform Categorical encoding transforms: From fcece80724020cdc2a7e928cc46d7304da5a228d Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Tue, 30 Jul 2024 12:15:55 +0300 Subject: [PATCH 6/9] review fixes --- etna/transforms/decomposition/dft_based.py | 2 +- tests/test_transforms/test_decomposition/test_dft_based.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/etna/transforms/decomposition/dft_based.py b/etna/transforms/decomposition/dft_based.py index 329536d87..2fe6476ca 100644 --- a/etna/transforms/decomposition/dft_based.py +++ b/etna/transforms/decomposition/dft_based.py @@ -160,7 +160,7 @@ def transform(self, ts: TSDataset) -> TSDataset: if ts.index.min() < self._first_timestamp: raise ValueError( - f"First index of the dataset to be transformed must be larger or equal then {self._first_timestamp}!" + f"First index of the dataset to be transformed must be larger or equal than {self._first_timestamp}!" ) if ts.index.min() > self._last_timestamp: diff --git a/tests/test_transforms/test_decomposition/test_dft_based.py b/tests/test_transforms/test_decomposition/test_dft_based.py index 27fc750d3..13c2ff44d 100644 --- a/tests/test_transforms/test_decomposition/test_dft_based.py +++ b/tests/test_transforms/test_decomposition/test_dft_based.py @@ -32,8 +32,6 @@ def ts_with_exogs() -> TSDataset: df_exog.rename(columns={"target": "exog"}, inplace=True) df_exog["holiday"] = np.random.choice([0, 1], size=periods_exog * 2) - df = TSDataset.to_dataset(df) - df_exog = TSDataset.to_dataset(df_exog) ts = TSDataset(df, freq="D", df_exog=df_exog, known_future="all") return ts @@ -120,7 +118,7 @@ def test_check_segments_ok(ts_with_exogs): pd.Series([np.nan] * 2 + list(range(5)) + [np.nan] * 3), ), ) -def test_fft_components_out_forma(series): +def test_fft_components_out_format(series): expected_columns = ["dft_0", "dft_1", "dft_2", "dft_residuals"] transform = FourierDecomposeTransform(k=3, residuals=True) @@ -281,7 +279,7 @@ def test_simple_pipeline_backtest(ts_with_exogs, in_column, horizon): ) @pytest.mark.parametrize("k", (1, 5, 10, 40, 51)) @pytest.mark.parametrize("forecast_model", (ProphetModel(), CatBoostPerSegmentModel(iterations=10))) -def test_pipeline_ks(ts_name, in_column, forecast_model, k, request): +def test_pipeline_parameter_k(ts_name, in_column, forecast_model, k, request): ts = request.getfixturevalue(ts_name) pipeline = Pipeline( From c266594d735e87a38c431b7d6aca72e8ebb8507f Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Tue, 30 Jul 2024 13:38:16 +0300 Subject: [PATCH 7/9] review fixes --- etna/transforms/decomposition/dft_based.py | 1 + tests/test_transforms/test_decomposition/test_dft_based.py | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/etna/transforms/decomposition/dft_based.py b/etna/transforms/decomposition/dft_based.py index 2fe6476ca..def3eeed8 100644 --- a/etna/transforms/decomposition/dft_based.py +++ b/etna/transforms/decomposition/dft_based.py @@ -117,6 +117,7 @@ def _dft_components(self, series: pd.Series) -> pd.DataFrame: if self.residuals: components_df["dft_residuals"] = series.values - np.sum(components, axis=0) + # return trailing and leading nans to the series if any existed initially if not components_df.index.equals(initial_index): components_df = components_df.reindex(index=initial_index, fill_value=np.nan) diff --git a/tests/test_transforms/test_decomposition/test_dft_based.py b/tests/test_transforms/test_decomposition/test_dft_based.py index 13c2ff44d..e773d02b5 100644 --- a/tests/test_transforms/test_decomposition/test_dft_based.py +++ b/tests/test_transforms/test_decomposition/test_dft_based.py @@ -97,9 +97,10 @@ def test_check_segments_missing_values(ts_with_missing): transform._check_segments(df=df) -def test_check_segments_large_k(ts_with_exogs): +@pytest.mark.parametrize("k", (52, 100)) +def test_check_segments_large_k(ts_with_exogs, k): df = ts_with_exogs[..., "target"] - transform = FourierDecomposeTransform(k=100) + transform = FourierDecomposeTransform(k=k) with pytest.raises(ValueError, match=f"Parameter `k` must not be greater then"): transform._check_segments(df=df) From dcdd47faaafc93d0b8669f6b8258e062fb3bf36a Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Tue, 30 Jul 2024 14:25:22 +0300 Subject: [PATCH 8/9] added tests --- .../test_inference/test_inverse_transform.py | 14 +++++++++++++- .../test_inference/test_transform.py | 18 +++++++++++++++++- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/tests/test_transforms/test_inference/test_inverse_transform.py b/tests/test_transforms/test_inference/test_inverse_transform.py index 56dd7c9fa..f734df2c7 100644 --- a/tests/test_transforms/test_inference/test_inverse_transform.py +++ b/tests/test_transforms/test_inference/test_inverse_transform.py @@ -2595,7 +2595,6 @@ def _test_inverse_transform_future_with_target( "regular_ts", {}, ), - # (FourierDecomposeTransform(in_column="target", k=5, residuals=True), "regular_ts", {}), # embeddings ( EmbeddingSegmentTransform( @@ -2993,6 +2992,19 @@ def test_inverse_transform_future_with_target_fail_difference( with pytest.raises(ValueError, match="Test should go after the train without gaps"): self._test_inverse_transform_future_with_target(ts, transform, expected_changes=expected_changes) + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + [ + (FourierDecomposeTransform(in_column="target", k=5, residuals=True), "regular_ts", {}), + ], + ) + def test_inverse_transform_future_with_target_fail_require_history( + self, transform, dataset_name, expected_changes, request + ): + ts = request.getfixturevalue(dataset_name) + with pytest.raises(ValueError, match="Dataset to be transformed must contain historical observations"): + self._test_inverse_transform_future_with_target(ts, transform, expected_changes=expected_changes) + # It is the only transform that doesn't change values back during `inverse_transform` @to_be_fixed(raises=AssertionError) @pytest.mark.parametrize( diff --git a/tests/test_transforms/test_inference/test_transform.py b/tests/test_transforms/test_inference/test_transform.py index a3c15ed6e..234d568db 100644 --- a/tests/test_transforms/test_inference/test_transform.py +++ b/tests/test_transforms/test_inference/test_transform.py @@ -2434,7 +2434,6 @@ def _test_transform_future_with_target(self, ts, transform, expected_changes, ga "regular_ts", {"create": {"res"}}, ), - # (FourierDecomposeTransform(in_column="target", k=2, residuals=True), "regular_ts", {"create": {"target_dft_0", "target_dft_1", "target_dft_residuals"}}), # embeddings ( EmbeddingSegmentTransform( @@ -2801,6 +2800,23 @@ def test_transform_future_with_target(self, transform, dataset_name, expected_ch ts = request.getfixturevalue(dataset_name) self._test_transform_future_with_target(ts, transform, expected_changes=expected_changes) + @pytest.mark.parametrize( + "transform, dataset_name, expected_changes", + ( + ( + FourierDecomposeTransform(in_column="target", k=2, residuals=True), + "regular_ts", + {"create": {"target_dft_0", "target_dft_1", "target_dft_residuals"}}, + ), + ), + ) + def test_transform_future_with_target_fail_require_history( + self, transform, dataset_name, expected_changes, request + ): + ts = request.getfixturevalue(dataset_name) + with pytest.raises(ValueError, match="Dataset to be transformed must contain historical observations"): + self._test_transform_future_with_target(ts, transform, expected_changes=expected_changes) + class TestTransformFutureWithoutTarget: """Test transform on future dataset with unknown target. From 5601e9914226128d3304c67422d6b5022382946c Mon Sep 17 00:00:00 2001 From: brsnw250 Date: Tue, 30 Jul 2024 14:31:04 +0300 Subject: [PATCH 9/9] updated changelog --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 246537891..c8cd7f6b7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,7 +15,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Add `get_anomalies_mad` function for anomaly detection ([#398](https://github.com/etna-team/etna/pull/398)) - Add `TSDataset.features` property to get list of all features in a dataset ([#405](https://github.com/etna-team/etna/pull/405)) - Add `MADOutlierTransform` class for anomaly detection ([#415](https://github.com/etna-team/etna/pull/415)) -- +- Add `FourierDecomposeTransform` transform for series decomposition using DFT ([#430](https://github.com/etna-team/etna/pull/430)) ### Changed - Allow to change `device`, `batch_size` and `num_workers` of embedding models ([#396](https://github.com/etna-team/etna/pull/396))