Merge remote-tracking branch 'origin/master' into issue-428

# Conflicts: # CHANGELOG.md
etna-team · Jul 31, 2024 · 9e1f917 · 9e1f917
2 parents f59df25 + 12c0e8f
commit 9e1f917
Show file tree

Hide file tree

Showing 15 changed files with 737 additions and 124 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -16,6 +16,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Add `TSDataset.features` property to get list of all features in a dataset ([#405](https://github.com/etna-team/etna/pull/405))
 - Add `MADOutlierTransform` class for anomaly detection ([#415](https://github.com/etna-team/etna/pull/415))
 - Add `MeanEncoderTransform` ([#413](https://github.com/etna-team/etna/pull/413))
+- Add `FourierDecomposeTransform` transform for series decomposition using DFT ([#430](https://github.com/etna-team/etna/pull/430))
 
 ### Changed
 - Allow to change `device`, `batch_size` and `num_workers` of embedding models ([#396](https://github.com/etna-team/etna/pull/396))
@@ -35,6 +36,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - Fix typo in 103 tutorial ([#408](https://github.com/etna-team/etna/pull/408))
 - Remove sorting of `ts.df` by timestamps in `plot_forecast` and `plot_forecast_decomposition` ([#410](https://github.com/etna-team/etna/pull/410))
 - Fix forecast visualization with `horizon=1` ([#426](https://github.com/etna-team/etna/pull/426))
+- Set upper bound `<2` on numpy version ([#431](https://github.com/etna-team/etna/pull/431))
+- 
 - 
 - Fix `VotingEnsemble`, `StackingEnsemble`, `DirectEnsemble` have a valid `params_to_tune` that returns empty dict ([#432](https://github.com/etna-team/etna/pull/432))
 - Fix passing custom model to `STLTransform` ([#412](https://github.com/etna-team/etna/pull/412))

diff --git a/docs/source/api_reference/transforms.rst b/docs/source/api_reference/transforms.rst
@@ -44,6 +44,7 @@ Decomposition transforms and their utilities:
    decomposition.MedianPerIntervalModel
    decomposition.SklearnPreprocessingPerIntervalModel
    decomposition.SklearnRegressionPerIntervalModel
+   decomposition.FourierDecomposeTransform
 
 Categorical encoding transforms:
 

diff --git a/etna/experimental/change_points/regularization_search.py b/etna/experimental/change_points/regularization_search.py
@@ -71,9 +71,9 @@ def _get_next_value(
         next value and its bounds
     """
     if need_greater:
-        return np.mean([now_value, lower_bound]), lower_bound, now_value
+        return float(np.mean([now_value, lower_bound])), lower_bound, now_value
     else:
-        return np.mean([now_value, upper_bound]), now_value, upper_bound
+        return float(np.mean([now_value, upper_bound])), now_value, upper_bound
 
 
 def bin_search(
@@ -121,7 +121,7 @@ def bin_search(
         raise ValueError("Impossible number of changepoints. Please, increase max_value or increase n_bkps value.")
 
     lower_bound, upper_bound = 0.0, max_value
-    now_value = np.mean([lower_bound, upper_bound])
+    now_value = float(np.mean([lower_bound, upper_bound]))
     now_n_bkps = _get_n_bkps(series, change_point_model, **{opt_param: now_value})
     iters = 0
 

diff --git a/etna/transforms/__init__.py b/etna/transforms/__init__.py
@@ -12,6 +12,7 @@
 from etna.transforms.decomposition import ChangePointsSegmentationTransform
 from etna.transforms.decomposition import ChangePointsTrendTransform
 from etna.transforms.decomposition import DeseasonalityTransform
+from etna.transforms.decomposition import FourierDecomposeTransform
 from etna.transforms.decomposition import IrreversibleChangePointsTransform
 from etna.transforms.decomposition import LinearTrendTransform
 from etna.transforms.decomposition import ReversibleChangePointsTransform

diff --git a/etna/transforms/decomposition/__init__.py b/etna/transforms/decomposition/__init__.py
@@ -16,4 +16,5 @@
 from etna.transforms.decomposition.deseasonal import DeseasonalityTransform
 from etna.transforms.decomposition.detrend import LinearTrendTransform
 from etna.transforms.decomposition.detrend import TheilSenTrendTransform
+from etna.transforms.decomposition.dft_based import FourierDecomposeTransform
 from etna.transforms.decomposition.stl import STLTransform
diff --git a/etna/transforms/decomposition/dft_based.py b/etna/transforms/decomposition/dft_based.py
@@ -0,0 +1,200 @@
+from typing import List
+
+import numpy as np
+import pandas as pd
+
+from etna.datasets import TSDataset
+from etna.datasets.utils import determine_num_steps
+from etna.transforms import IrreversibleTransform
+
+
+class FourierDecomposeTransform(IrreversibleTransform):
+    """Transform that uses Fourier transformation to estimate series decomposition.
+
+    Note
+    ----
+    This transform decomposes only in-sample data. For the future timestamps it produces ``NaN``.
+    For the dataset to be transformed, it should contain at least the minimum amount of in-sample timestamps that are required by transform.
+
+    Warning
+    -------
+    This transform adds new columns to the dataset, that correspond to the selected frequencies. Such columns are named with
+    ``dft_{i}`` suffix. Suffix index do NOT indicate any relation to the frequencies. Produced names should be thought of as
+    arbitrary identifiers to the produced sinusoids.
+    """
+
+    def __init__(self, k: int, in_column: str = "target", residuals: bool = False):
+        """Init ``FourierDecomposeTransform``.
+
+        Parameters
+        ----------
+        k:
+            how many top positive frequencies selected for the decomposition. Selection performed proportional to the amplitudes.
+        in_column:
+            name of the processed column.
+        residuals:
+            whether to add residuals after decomposition. This guarantees that all components, including residuals, sum up to the series.
+        """
+        if k <= 0:
+            raise ValueError("Parameter `k` must be positive integer!")
+
+        self.k = k
+        self.in_column = in_column
+        self.residuals = residuals
+
+        self._first_timestamp = None
+        self._last_timestamp = None
+
+        super().__init__(required_features=[in_column])
+
+    def get_regressors_info(self) -> List[str]:
+        """Return the list with regressors created by the transform."""
+        return []
+
+    def _fit(self, df: pd.DataFrame):
+        """Fit transform with the dataframe."""
+        pass
+
+    def _transform(self, df: pd.DataFrame) -> pd.DataFrame:
+        """Transform provided dataframe."""
+        pass
+
+    @staticmethod
+    def _get_num_pos_freqs(series: pd.Series) -> int:
+        """Get number of positive frequencies for the series."""
+        num_obs = len(series)
+        return int(np.ceil((num_obs - 1) / 2) + 1)
+
+    def _check_segments(self, df: pd.DataFrame):
+        """Check if series satisfy conditions."""
+        segments_with_missing = []
+        min_num_pos_freq = float("inf")
+        for segment in df:
+            series = df[segment]
+            series = series.loc[series.first_valid_index() : series.last_valid_index()]
+            if series.isna().any():
+                segments_with_missing.append(segment)
+
+            min_num_pos_freq = min(min_num_pos_freq, self._get_num_pos_freqs(series))
+
+        if len(segments_with_missing) > 0:
+            raise ValueError(
+                f"Feature `{self.in_column}` contains missing values in segments: {segments_with_missing}!"
+            )
+
+        if self.k > min_num_pos_freq:
+            raise ValueError(f"Parameter `k` must not be greater then {min_num_pos_freq} for the provided dataset!")
+
+    def _dft_components(self, series: pd.Series) -> pd.DataFrame:
+        """Estimate series decomposition using FFT."""
+        initial_index = series.index
+        series = series.loc[series.first_valid_index() : series.last_valid_index()]
+
+        num_pos_freqs = self._get_num_pos_freqs(series)
+
+        # compute Fourier decomposition of the series
+        dft_series = np.fft.fft(series)
+
+        # compute "amplitudes" for each frequency
+        abs_dft_series = np.abs(dft_series)
+
+        # select top-k indices
+        abs_pos_dft_series = abs_dft_series[:num_pos_freqs]
+        top_k_idxs = np.argpartition(abs_pos_dft_series, num_pos_freqs - self.k)[-self.k :]
+
+        # select top-k and separate each frequency
+        freq_matrix = np.diag(dft_series)
+        freq_matrix = freq_matrix[:num_pos_freqs]
+        selected_freqs = freq_matrix[top_k_idxs]
+
+        # return frequencies to initial domain
+        components = np.fft.ifft(selected_freqs).real
+
+        components_df = pd.DataFrame(
+            data=components.T, columns=[f"dft_{i}" for i in range(components.shape[0])], index=series.index
+        )
+
+        if self.residuals:
+            components_df["dft_residuals"] = series.values - np.sum(components, axis=0)
+
+        # return trailing and leading nans to the series if any existed initially
+        if not components_df.index.equals(initial_index):
+            components_df = components_df.reindex(index=initial_index, fill_value=np.nan)
+
+        return components_df
+
+    def fit(self, ts: TSDataset) -> "FourierDecomposeTransform":
+        """Fit the transform and the decomposition model.
+
+        Parameters
+        ----------
+        ts:
+            dataset to fit the transform on.
+
+        Returns
+        -------
+        :
+            the fitted transform instance.
+        """
+        self._first_timestamp = ts.index.min()
+        self._last_timestamp = ts.index.max()
+
+        self._check_segments(df=ts[..., self.in_column].droplevel("feature", axis=1))
+
+        return self
+
+    def transform(self, ts: TSDataset) -> TSDataset:
+        """Transform ``TSDataset`` inplace.
+
+        Parameters
+        ----------
+        ts:
+            Dataset to transform.
+
+        Returns
+        -------
+        :
+            Transformed ``TSDataset``.
+        """
+        if self._first_timestamp is None:
+            raise ValueError("Transform is not fitted!")
+
+        if ts.index.min() < self._first_timestamp:
+            raise ValueError(
+                f"First index of the dataset to be transformed must be larger or equal than {self._first_timestamp}!"
+            )
+
+        if ts.index.min() > self._last_timestamp:
+            raise ValueError(
+                f"Dataset to be transformed must contain historical observations in range {self._first_timestamp} - {self._last_timestamp}"
+            )
+
+        segment_df = ts[..., self.in_column].droplevel("feature", axis=1)
+
+        ts_max_timestamp = ts.index.max()
+        if ts_max_timestamp > self._last_timestamp:
+            future_steps = determine_num_steps(self._last_timestamp, ts_max_timestamp, freq=ts.freq)
+            segment_df.iloc[-future_steps:] = np.nan
+
+        self._check_segments(df=segment_df)
+
+        segments = segment_df.columns
+        segment_components = []
+        for segment in segments:
+            components_df = self._dft_components(series=segment_df[segment])
+            components_df.columns = f"{self.in_column}_" + components_df.columns
+
+            components_df.columns = pd.MultiIndex.from_product(
+                [[segment], components_df.columns], names=["segment", "feature"]
+            )
+
+            segment_components.append(components_df)
+
+        segment_components = pd.concat(segment_components, axis=1)
+
+        ts.add_columns_from_pandas(segment_components)
+
+        return ts
+
+
+__all__ = ["FourierDecomposeTransform"]