From 409e5ba4c92232d14ddedc077d9ac8323ef800fd Mon Sep 17 00:00:00 2001 From: Maverick Lin Date: Wed, 29 Jan 2020 13:54:51 -0500 Subject: [PATCH 01/10] add arima functionality --- gs_quant/test/timeseries/test_arima.py | 196 ++++++++++++++++ gs_quant/timeseries/__init__.py | 1 + gs_quant/timeseries/arima.py | 310 +++++++++++++++++++++++++ 3 files changed, 507 insertions(+) create mode 100644 gs_quant/test/timeseries/test_arima.py create mode 100644 gs_quant/timeseries/arima.py diff --git a/gs_quant/test/timeseries/test_arima.py b/gs_quant/test/timeseries/test_arima.py new file mode 100644 index 00000000..90c75cd1 --- /dev/null +++ b/gs_quant/test/timeseries/test_arima.py @@ -0,0 +1,196 @@ +""" +Copyright 2020 Goldman Sachs. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +""" +from datetime import date + +import pytest +import pandas as pd +import numpy as np + +from pandas import Timestamp +from pandas.util.testing import assert_series_equal +from numpy.testing import assert_raises, assert_array_equal, assert_allclose + +import gs_quant.timeseries as ts + +def test_arima_fit(): + test_dict = { + 'High': + {Timestamp('1989-01-03 00:00:00'): 3.575721263885498, + Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-05 00:00:00'): 3.62580132484436, + Timestamp('1989-01-06 00:00:00'): 3.62580132484436, + Timestamp('1989-01-09 00:00:00'): 3.575721263885498, + Timestamp('1989-01-10 00:00:00'): 3.575721263885498, + Timestamp('1989-01-11 00:00:00'): 3.5657050609588623, + Timestamp('1989-01-12 00:00:00'): 3.635817289352417, + Timestamp('1989-01-13 00:00:00'): 3.615785360336304, + Timestamp('1989-01-16 00:00:00'): 3.615785360336304, + Timestamp('1989-01-17 00:00:00'): 3.635817289352417, + Timestamp('1989-01-18 00:00:00'): 3.675881385803223, + Timestamp('1989-01-19 00:00:00'): 3.695913553237915, + Timestamp('1989-01-20 00:00:00'): 3.665865421295166, + Timestamp('1989-01-23 00:00:00'): 3.675881385803223, + Timestamp('1989-01-24 00:00:00'): 3.675881385803223, + Timestamp('1989-01-25 00:00:00'): 3.695913553237915, + Timestamp('1989-01-26 00:00:00'): 3.7760417461395264, + Timestamp('1989-01-27 00:00:00'): 3.8561699390411377, + Timestamp('1989-01-30 00:00:00'): 3.8561699390411377}, + 'Low': + {Timestamp('1989-01-03 00:00:00'): 3.4855768680572514, + Timestamp('1989-01-04 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-05 00:00:00'): 3.575721263885498, + Timestamp('1989-01-06 00:00:00'): 3.575721263885498, + Timestamp('1989-01-09 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-10 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-11 00:00:00'): 3.5256409645080566, + Timestamp('1989-01-12 00:00:00'): 3.5456731319427486, + Timestamp('1989-01-13 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, + Timestamp('1989-01-17 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-18 00:00:00'): 3.615785360336304, + Timestamp('1989-01-19 00:00:00'): 3.655849456787109, + Timestamp('1989-01-20 00:00:00'): 3.62580132484436, + Timestamp('1989-01-23 00:00:00'): 3.615785360336304, 
+ Timestamp('1989-01-24 00:00:00'): 3.615785360336304, + Timestamp('1989-01-25 00:00:00'): 3.655849456787109, + Timestamp('1989-01-26 00:00:00'): 3.665865421295166, + Timestamp('1989-01-27 00:00:00'): 3.79607367515564, + Timestamp('1989-01-30 00:00:00'): 3.786057710647583}, + 'Open': + {Timestamp('1989-01-03 00:00:00'): 3.575721263885498, + Timestamp('1989-01-04 00:00:00'): 3.5556890964508057, + Timestamp('1989-01-05 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-06 00:00:00'): 3.605769157409668, + Timestamp('1989-01-09 00:00:00'): 3.5456731319427486, + Timestamp('1989-01-10 00:00:00'): 3.575721263885498, + Timestamp('1989-01-11 00:00:00'): 3.5456731319427486, + Timestamp('1989-01-12 00:00:00'): 3.5456731319427486, + Timestamp('1989-01-13 00:00:00'): 3.605769157409668, + Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, + Timestamp('1989-01-17 00:00:00'): 3.5957531929016118, + Timestamp('1989-01-18 00:00:00'): 3.635817289352417, + Timestamp('1989-01-19 00:00:00'): 3.6858973503112793, + Timestamp('1989-01-20 00:00:00'): 3.665865421295166, + Timestamp('1989-01-23 00:00:00'): 3.6458332538604736, + Timestamp('1989-01-24 00:00:00'): 3.62580132484436, + Timestamp('1989-01-25 00:00:00'): 3.6858973503112793, + Timestamp('1989-01-26 00:00:00'): 3.675881385803223, + Timestamp('1989-01-27 00:00:00'): 3.79607367515564, + Timestamp('1989-01-30 00:00:00'): 3.806089639663696}, + 'Close': + {Timestamp('1989-01-03 00:00:00'): 3.5256409645080566, + Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-05 00:00:00'): 3.575721263885498, + Timestamp('1989-01-06 00:00:00'): 3.575721263885498, + Timestamp('1989-01-09 00:00:00'): 3.575721263885498, + Timestamp('1989-01-10 00:00:00'): 3.5556890964508057, + Timestamp('1989-01-11 00:00:00'): 3.5556890964508057, + Timestamp('1989-01-12 00:00:00'): 3.605769157409668, + Timestamp('1989-01-13 00:00:00'): 3.605769157409668, + Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, + Timestamp('1989-01-17 00:00:00'): 3.62580132484436, + Timestamp('1989-01-18 00:00:00'): 3.675881385803223, + Timestamp('1989-01-19 00:00:00'): 3.665865421295166, + Timestamp('1989-01-20 00:00:00'): 3.6458332538604736, + Timestamp('1989-01-23 00:00:00'): 3.62580132484436, + Timestamp('1989-01-24 00:00:00'): 3.675881385803223, + Timestamp('1989-01-25 00:00:00'): 3.675881385803223, + Timestamp('1989-01-26 00:00:00'): 3.756009578704834, + Timestamp('1989-01-27 00:00:00'): 3.79607367515564, + Timestamp('1989-01-30 00:00:00'): 3.846153736114502}, + 'Volume': + {Timestamp('1989-01-03 00:00:00'): 21873600.0, + Timestamp('1989-01-04 00:00:00'): 13487100.0, + Timestamp('1989-01-05 00:00:00'): 20733000.0, + Timestamp('1989-01-06 00:00:00'): 20654400.0, + Timestamp('1989-01-09 00:00:00'): 21478000.0, + Timestamp('1989-01-10 00:00:00'): 15541300.0, + Timestamp('1989-01-11 00:00:00'): 11465300.0, + Timestamp('1989-01-12 00:00:00'): 26481300.0, + Timestamp('1989-01-13 00:00:00'): 10236000.0, + Timestamp('1989-01-16 00:00:00'): 8888200.0, + Timestamp('1989-01-17 00:00:00'): 12934200.0, + Timestamp('1989-01-18 00:00:00'): 25965800.0, + Timestamp('1989-01-19 00:00:00'): 25556500.0, + Timestamp('1989-01-20 00:00:00'): 13779100.0, + Timestamp('1989-01-23 00:00:00'): 13680500.0, + Timestamp('1989-01-24 00:00:00'): 16870400.0, + Timestamp('1989-01-25 00:00:00'): 16959000.0, + Timestamp('1989-01-26 00:00:00'): 29040900.0, + Timestamp('1989-01-27 00:00:00'): 50615100.0, + Timestamp('1989-01-30 00:00:00'): 27567000.0}, + 'Adj Close': + {Timestamp('1989-01-03 00:00:00'): 
0.13199026882648468, + Timestamp('1989-01-04 00:00:00'): 0.13424012064933774, + Timestamp('1989-01-05 00:00:00'): 0.1338651180267334, + Timestamp('1989-01-06 00:00:00'): 0.1338651180267334, + Timestamp('1989-01-09 00:00:00'): 0.1338651180267334, + Timestamp('1989-01-10 00:00:00'): 0.13311512768268585, + Timestamp('1989-01-11 00:00:00'): 0.13311512768268585, + Timestamp('1989-01-12 00:00:00'): 0.13499003648757935, + Timestamp('1989-01-13 00:00:00'): 0.13499003648757935, + Timestamp('1989-01-16 00:00:00'): 0.13461506366729736, + Timestamp('1989-01-17 00:00:00'): 0.13573989272117615, + Timestamp('1989-01-18 00:00:00'): 0.13761481642723086, + Timestamp('1989-01-19 00:00:00'): 0.13723985850811005, + Timestamp('1989-01-20 00:00:00'): 0.13648992776870728, + Timestamp('1989-01-23 00:00:00'): 0.13573989272117615, + Timestamp('1989-01-24 00:00:00'): 0.13761481642723086, + Timestamp('1989-01-25 00:00:00'): 0.13761481642723086, + Timestamp('1989-01-26 00:00:00'): 0.14061467349529266, + Timestamp('1989-01-27 00:00:00'): 0.14211450517177582, + Timestamp('1989-01-30 00:00:00'): 0.14398930966854095}} + test_df = pd.DataFrame(test_dict) + arima = ts.arima() + arima.fit(test_df, train_size=0.8, freq='B') + transformed_test_df = arima.transform(test_df) + + for col in transformed_test_df.keys(): + count_nans = arima.best_params[col]['p'] + arima.best_params[col]['d'] + assert(count_nans == transformed_test_df[col].isna().sum()) + + # Test (1,1,0) Model + diff_test_df_high = test_df['High'].diff() + assert(transformed_test_df['High'][2] == (arima.best_params['High']['best_params']['const'] + diff_test_df_high[1] * arima.best_params['High']['best_params']['ar.L1.D.High'])) + assert(transformed_test_df['High'][3] == (arima.best_params['High']['best_params']['const'] + diff_test_df_high[2] * arima.best_params['High']['best_params']['ar.L1.D.High'])) + assert(transformed_test_df['High'][-1] == (arima.best_params['High']['best_params']['const'] + diff_test_df_high[-2] * arima.best_params['High']['best_params']['ar.L1.D.High'])) + + # Test (2,1,0) Model + diff_test_df_low = test_df['Low'].diff() + assert(transformed_test_df['Low'][3] == (arima.best_params['Low']['best_params']['const'] + diff_test_df_low[2] * arima.best_params['Low']['best_params']['ar.L1.D.Low'] + diff_test_df_low[1] * arima.best_params['Low']['best_params']['ar.L2.D.Low'])) + assert(transformed_test_df['Low'][4] == (arima.best_params['Low']['best_params']['const'] + diff_test_df_low[3] * arima.best_params['Low']['best_params']['ar.L1.D.Low'] + diff_test_df_low[2] * arima.best_params['Low']['best_params']['ar.L2.D.Low'])) + assert(transformed_test_df['Low'][-1] == (arima.best_params['Low']['best_params']['const'] + diff_test_df_low[-2] * arima.best_params['Low']['best_params']['ar.L1.D.Low'] + diff_test_df_low[-3] * arima.best_params['Low']['best_params']['ar.L2.D.Low'])) + + # Test (1,2,0) Model + diff_test_df_close = test_df['Close'].diff()[1:].diff() + first_day = pd.Series([np.nan]) + first_day.index = [diff_test_df_close.index[0] - pd.DateOffset(days=1)] + first_day.name = 'Close' + diff_test_df_close = pd.concat([first_day, diff_test_df_close]) + diff_test_df_close.index.name = "Date" + + assert(transformed_test_df['Close'][4] == (arima.best_params['Close']['best_params']['const'] + diff_test_df_close[3] * arima.best_params['Close']['best_params']['ar.L1.D2.Close'])) + assert(transformed_test_df['Close'][5] == (arima.best_params['Close']['best_params']['const'] + diff_test_df_close[4] * 
arima.best_params['Close']['best_params']['ar.L1.D2.Close'])) + assert(transformed_test_df['Close'][-1] == (arima.best_params['Close']['best_params']['const'] + diff_test_df_close[-2] * arima.best_params['Close']['best_params']['ar.L1.D2.Close'])) + + # Test (0,2,0) Model + diff_test_df_volumne = test_df['Volume'].diff()[1:].diff() + first_day = pd.Series([np.nan]) + first_day.index = [diff_test_df_volumne.index[0] - pd.DateOffset(days=1)] + first_day.name = 'Volume' + diff_test_df_volumne = pd.concat([first_day, diff_test_df_volumne]) + diff_test_df_volumne.index.name = "Date" + assert(transformed_test_df['Volume'][2] == + diff_test_df_volumne[2]) diff --git a/gs_quant/timeseries/__init__.py b/gs_quant/timeseries/__init__.py index 1b7777be..992ca9d6 100644 --- a/gs_quant/timeseries/__init__.py +++ b/gs_quant/timeseries/__init__.py @@ -22,5 +22,6 @@ from .technicals import * from .measures import * from .helper import * +from .arima import * __name__ = 'timeseries' diff --git a/gs_quant/timeseries/arima.py b/gs_quant/timeseries/arima.py new file mode 100644 index 00000000..0e69ebdd --- /dev/null +++ b/gs_quant/timeseries/arima.py @@ -0,0 +1,310 @@ +# Copyright 2020 Goldman Sachs. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# +# Chart Service will attempt to make public functions (not prefixed with _) from this module available. Such functions +# should be fully documented: docstrings should describe parameters and the return value, and provide a 1-line +# description. Type annotations should be provided for parameters. + +from __future__ import annotations +from typing import Iterable, Optional, Union, Tuple + +import pandas as pd +import numpy as np +from tqdm import tqdm +from statsmodels.tsa.arima_model import ARIMA +from sklearn.metrics import mean_squared_error + + +"""ARIMA is the Autoregressive Integrated Moving Average Model and is used +to normalize and forecast time series data. ARIMA here is used without the +moving averages component, so predictions of future values of a +series is done by regressing on its own lagged values. ARIMA has 3 +parameters: (p, d, q) where: + :p is the number of autoregressive terms + :d is the number of nonseasonal differences + :q is the number of lagged forecast errors in the prediction equation + +An ARIMA is selected from 9 possible combinations: (0,0,0), (1,0,0), +(2,0,0), (0,1,0), (1,1,0), (2,1,0), (0,2,0), (1,2,0), (2,2,0). The time +series is split into train and test sets and an ARIMA model is fit for every +combination on the training set. The model with the lowest mean-squared +error (MSE) on the test set is selected as the best model. The original +times series is then transformed by this model. + +Autoregressive components are past values of the variable of interest. 
An +AR(p) model with order p = 1 may be written as Y(t) = A(1) * Y(t-1) + E(t), +where + :X(t) is the time series under investigation + :A(1) is the autoregressive parameter + :X(t-1) is the time series lagged 1 period + :E(t) is the error term of the model or white noise + +In other words, any value in X(t) can be explained using a linear +combination of the past value T(t-1) plus some error term E(t). X(t) +could also be a linear combination of more than one past value: +X(t) = A(1) * X(t-1) + A(2) * X(t-2) + E(t). + +Differencing is a way of making a non-stationary time series stationary. This +is done by computing the differences between consuective observations +(subtracting the observation from the current period from the previous one). +Differencing can help stabilize the mean of a time series by removing changes +in the level of a time series, which reduces trend and seasonality. If the +transformation is done once, then the data has been "first differenced". The +same transformation can be done again, so the data would be "second +differenced".""" + + +class arima(): + """ + An ARIMA class used to normalize time series data. + """ + + def __init__(self): + self.best_params = {} + + def _evaluate_arima_model(self, X: Union[pd.Series, pd.DataFrame], arima_order: Tuple[int, int, int], train_size: float, freq: str) -> Tuple[float, dict]: + train_size = int(len(X) * train_size) + train, test = X[0:train_size].astype(float), X[train_size:].astype(float) + + model = ARIMA(train, order=arima_order, freq=freq) + model_fit = model.fit(disp=0) + yhat = model_fit.forecast(len(test))[0] + + model_params = model_fit.params + + # calculate test error + mse = mean_squared_error(test, yhat) + + return mse, model_params + + def fit(self, X: Union[pd.Series, pd.DataFrame], train_size: float, p_vals: list = [0,1,2], d_vals: list=[0,1,2], q_vals: list=[0], freq: str=None) -> arima: + """ + Train a combination of ARIMA models. If pandas DataFrame, finds the + best arima model parameters for each column. If pandas Series, finds + the best arima model parameters for the series. 
+ + :param X: time series to be operated on; required parameter + :param train_size: between 0.0 and 1.0 and represents the proportion of the dataset to include in the train split + :p_vals: number of autoregressive terms to search; default is [0,1,2] + :d_vals: number of differences to search; default is [0,1,2] + :q_vals: number of lagged forecast to search; always [0] + :freq: frequency of time series, default is None + :return: self + """ + + if isinstance(X, pd.DataFrame): + for series_id in tqdm(X.columns): + series = X[series_id] + best_score, best_cfg, best_params = float("inf"), None, None + for p in p_vals: + for d in d_vals: + for q in q_vals: + order = (p, d, q) + try: + mse, model_params = self._evaluate_arima_model(series, order, train_size, freq) + if mse < best_score: + best_score = mse + best_cfg = order + best_params = model_params + except Exception as e: + print(' {}'.format(e)) + continue + self.best_params[series_id] = {"p": best_cfg[0], + "d": best_cfg[1], + "q": best_cfg[2], + "best_params": best_params.to_dict(), + "first_val": series[0], + "second_val": series[1], + "third_val": series[2], + "last_val": series[-1]} + elif isinstance(X, pd.Series): + series = X + best_score, best_cfg, best_params = float("inf"), None, None + for p in p_vals: + for d in d_vals: + for q in q_vals: + order = (p, d, q) + try: + mse, model_params = self._evaluate_arima_model(series, order, train_size, freq) + if mse < best_score: + best_score = mse + best_cfg = order + best_params = model_params + except Exception as e: + print(' {}'.format(e)) + continue + + self.best_params['y'] = {"p": best_cfg[0], + "d": best_cfg[1], + "q": best_cfg[2], + "best_params": best_params.to_dict(), + "first_val": series.iloc[0], + "second_val": series.iloc[1], + "third_val": series.iloc[2], + "last_val": series.iloc[-1]} + else: + raise ValueError("Not DataFrame or Series!") + + return self + + def _arima_transform_series(self, X: pd.Series, p: int, d: int, c:float, ar1: float=None, ar2: float=None) -> pd.Series: + # Difference first + if d == 0: + pass + elif d == 1: + X = X.diff() + elif d == 2: + X = X.diff()[1:].diff() + first_day = pd.Series([np.nan]) + + first_day.index = [X.index[0] - pd.DateOffset(days=1)] + first_day.name = X.name + X = pd.concat([first_day, X]) + X.index.name = "Date" + else: + raise ValueError("d is not 0, 1, or 2") + + # Create copy of transformed array + transformed = X.copy() + + if p == 0: + return transformed + elif p == 1: + for idx, val in enumerate(list(X)[1:], start=1): + lag1_val = X.iloc[idx-1] + transformed.iloc[idx] = c + (ar1 * lag1_val) + + transformed.iloc[0] = np.nan + + return transformed + elif p == 2: + for idx, val in enumerate(list(X)[2:], start=2): + lag1_val = X.iloc[idx-1] + lag2_val = X.iloc[idx-2] + + transformed.iloc[idx] = c + (ar1 * lag1_val) + (ar2 * lag2_val) + + transformed.iloc[0] = np.nan + transformed.iloc[1] = np.nan + + return transformed + + + def _arima_transform_df(self, X: pd.DataFrame, p: int, d: int, c:float, ar1: float=None, ar2: float=None) -> pd.DataFrame: + # Difference first + if d == 0: + pass + elif d == 1: + X = X.diff() + elif d == 2: + X = X.diff()[1:].diff() + first_day = pd.Series([np.nan]) + + first_day.index = [X.index[0] - pd.DateOffset(days=1)] + first_day.name = X.name + X = pd.concat([first_day, X]) + X.index.name = "Date" + else: + raise ValueError("d is not 0, 1, or 2") + + # Create copy of transformed array + transformed = X.copy() + + if p == 0: + return transformed + elif p == 1: + for idx, val in 
enumerate(list(X.iteritems())[1:], start=1): + curr_date = val[0] + lag1_val = X.loc[X.index[idx-1]] + transformed[curr_date] = c + (ar1 * lag1_val) + transformed[transformed.index[0]] = np.nan + return transformed + elif p == 2: + for idx, val in enumerate(list(X.iteritems())[2:], start=2): + curr_date = val[0] + lag1_val = X.loc[X.index[idx-1]] + lag2_val = X.loc[X.index[idx-2]] + + transformed[curr_date] = c + (ar1 * lag1_val) + (ar2 * lag2_val) + + transformed[transformed.index[0]] = np.nan + transformed[transformed.index[1]] = np.nan + + return transformed + + + def transform(self, X: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]: + """ + Transform a series based on the best ARIMA found from .fit(). If input + is DataFrame, returns a transformed DataFrame. + If Series, returns a transformed Series. + + :param X: time series to be operated on; required parameter + :return: DataFrame or Series + """ + + series = {} + for series_id in self.best_params.keys(): + p = self.best_params[series_id]["p"] + d = self.best_params[series_id]["d"] + q = self.best_params[series_id]["q"] + + first_val = self.best_params[series_id]['first_val'] + second_val = self.best_params[series_id]['second_val'] + third_val = self.best_params[series_id]['third_val'] + last_val = self.best_params[series_id]['last_val'] + + try: + const = self.best_params[series_id]["best_params"]["const"] + except: + const = 0 + + if d == 0: + if p == 0: + lag1_coeff = 0 + lag2_coeff = 0 + elif p == 1: + lag1_coeff = self.best_params[series_id]["best_params"]["ar.L1.{}".format(series_id)] + lag2_coeff = 0 + elif p == 2: + lag1_coeff = self.best_params[series_id]["best_params"]["ar.L1.{}".format(series_id)] + lag2_coeff = self.best_params[series_id]["best_params"]["ar.L2.{}".format(series_id)] + elif d == 1: + if p == 0: + lag1_coeff = 0 + lag2_coeff = 0 + elif p == 1: + lag1_coeff = self.best_params[series_id]["best_params"]["ar.L1.D.{}".format(series_id)] + lag2_coeff = 0 + elif p == 2: + lag1_coeff = self.best_params[series_id]["best_params"]["ar.L1.D.{}".format(series_id)] + lag2_coeff = self.best_params[series_id]["best_params"]["ar.L2.D.{}".format(series_id)] + elif d == 2: + if p == 0: + lag1_coeff = 0 + lag2_coeff = 0 + elif p == 1: + lag1_coeff = self.best_params[series_id]["best_params"]["ar.L1.D2.{}".format(series_id)] + lag2_coeff = 0 + elif p == 2: + lag1_coeff = self.best_params[series_id]["best_params"]["ar.L1.D2.{}".format(series_id)] + lag2_coeff = self.best_params[series_id]["best_params"]["ar.L2.D2.{}".format(series_id)] + + if isinstance(X, pd.DataFrame): + new_series = self._arima_transform_df(X[series_id], p=p, d=d, c=const, ar1=lag1_coeff, ar2=lag2_coeff) + series[series_id] = new_series + elif isinstance(X, pd.Series): + new_series = self._arima_transform_series(X, p=p, d=d, c=const, ar1=lag1_coeff, ar2=lag2_coeff) + return new_series + + return pd.DataFrame.from_dict(series) From e6ab97631d7e2d914751298831310b3d7c2a84f3 Mon Sep 17 00:00:00 2001 From: Maverick Lin Date: Thu, 13 Feb 2020 12:16:00 -0500 Subject: [PATCH 02/10] fixed arima functionality --- gs_quant/test/timeseries/test_arima.py | 27 +- gs_quant/timeseries/arima.py | 336 ++++++++++--------------- setup.py | 3 + 3 files changed, 154 insertions(+), 212 deletions(-) diff --git a/gs_quant/test/timeseries/test_arima.py b/gs_quant/test/timeseries/test_arima.py index 90c75cd1..220505a2 100644 --- a/gs_quant/test/timeseries/test_arima.py +++ b/gs_quant/test/timeseries/test_arima.py @@ -14,6 +14,7 @@ under the License. 
""" from datetime import date +from math import isclose import pytest import pandas as pd @@ -155,24 +156,24 @@ def test_arima_fit(): Timestamp('1989-01-30 00:00:00'): 0.14398930966854095}} test_df = pd.DataFrame(test_dict) arima = ts.arima() - arima.fit(test_df, train_size=0.8, freq='B') + arima.fit(test_df, train_size=0.8, freq='B', q_vals=[0]) transformed_test_df = arima.transform(test_df) for col in transformed_test_df.keys(): - count_nans = arima.best_params[col]['p'] + arima.best_params[col]['d'] + count_nans = arima.best_params[col].p + arima.best_params[col].d assert(count_nans == transformed_test_df[col].isna().sum()) # Test (1,1,0) Model diff_test_df_high = test_df['High'].diff() - assert(transformed_test_df['High'][2] == (arima.best_params['High']['best_params']['const'] + diff_test_df_high[1] * arima.best_params['High']['best_params']['ar.L1.D.High'])) - assert(transformed_test_df['High'][3] == (arima.best_params['High']['best_params']['const'] + diff_test_df_high[2] * arima.best_params['High']['best_params']['ar.L1.D.High'])) - assert(transformed_test_df['High'][-1] == (arima.best_params['High']['best_params']['const'] + diff_test_df_high[-2] * arima.best_params['High']['best_params']['ar.L1.D.High'])) + assert(transformed_test_df['High'][2] == (arima.best_params['High'].const + diff_test_df_high[1] * arima.best_params['High'].ar_coef[0])) + assert(transformed_test_df['High'][3] == (arima.best_params['High'].const + diff_test_df_high[2] * arima.best_params['High'].ar_coef[0])) + assert(transformed_test_df['High'][-1] == (arima.best_params['High'].const + diff_test_df_high[-2] * arima.best_params['High'].ar_coef[0])) # Test (2,1,0) Model diff_test_df_low = test_df['Low'].diff() - assert(transformed_test_df['Low'][3] == (arima.best_params['Low']['best_params']['const'] + diff_test_df_low[2] * arima.best_params['Low']['best_params']['ar.L1.D.Low'] + diff_test_df_low[1] * arima.best_params['Low']['best_params']['ar.L2.D.Low'])) - assert(transformed_test_df['Low'][4] == (arima.best_params['Low']['best_params']['const'] + diff_test_df_low[3] * arima.best_params['Low']['best_params']['ar.L1.D.Low'] + diff_test_df_low[2] * arima.best_params['Low']['best_params']['ar.L2.D.Low'])) - assert(transformed_test_df['Low'][-1] == (arima.best_params['Low']['best_params']['const'] + diff_test_df_low[-2] * arima.best_params['Low']['best_params']['ar.L1.D.Low'] + diff_test_df_low[-3] * arima.best_params['Low']['best_params']['ar.L2.D.Low'])) + assert(isclose(transformed_test_df['Low'][3], (arima.best_params['Low'].const + diff_test_df_low[2] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[1] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Low'][4], (arima.best_params['Low'].const + diff_test_df_low[3] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[2] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Low'][-1], (arima.best_params['Low'].const + diff_test_df_low[-2] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[-3] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) # Test (1,2,0) Model diff_test_df_close = test_df['Close'].diff()[1:].diff() @@ -182,9 +183,9 @@ def test_arima_fit(): diff_test_df_close = pd.concat([first_day, diff_test_df_close]) diff_test_df_close.index.name = "Date" - assert(transformed_test_df['Close'][4] == (arima.best_params['Close']['best_params']['const'] + diff_test_df_close[3] * arima.best_params['Close']['best_params']['ar.L1.D2.Close'])) - 
assert(transformed_test_df['Close'][5] == (arima.best_params['Close']['best_params']['const'] + diff_test_df_close[4] * arima.best_params['Close']['best_params']['ar.L1.D2.Close'])) - assert(transformed_test_df['Close'][-1] == (arima.best_params['Close']['best_params']['const'] + diff_test_df_close[-2] * arima.best_params['Close']['best_params']['ar.L1.D2.Close'])) + assert(transformed_test_df['Close'][4] == (arima.best_params['Close'].const + diff_test_df_close[3] * arima.best_params['Close'].ar_coef[0])) + assert(transformed_test_df['Close'][5] == (arima.best_params['Close'].const + diff_test_df_close[4] * arima.best_params['Close'].ar_coef[0])) + assert(transformed_test_df['Close'][-1] == (arima.best_params['Close'].const+ diff_test_df_close[-2] * arima.best_params['Close'].ar_coef[0])) # Test (0,2,0) Model diff_test_df_volumne = test_df['Volume'].diff()[1:].diff() @@ -193,4 +194,6 @@ def test_arima_fit(): first_day.name = 'Volume' diff_test_df_volumne = pd.concat([first_day, diff_test_df_volumne]) diff_test_df_volumne.index.name = "Date" - assert(transformed_test_df['Volume'][2] == + diff_test_df_volumne[2]) + assert(transformed_test_df['Volume'][2] == arima.best_params['Volume'].const + diff_test_df_volumne[2]) + +test_arima_fit() \ No newline at end of file diff --git a/gs_quant/timeseries/arima.py b/gs_quant/timeseries/arima.py index 0e69ebdd..c0177529 100644 --- a/gs_quant/timeseries/arima.py +++ b/gs_quant/timeseries/arima.py @@ -16,8 +16,11 @@ # description. Type annotations should be provided for parameters. from __future__ import annotations +from dataclasses import dataclass from typing import Iterable, Optional, Union, Tuple +import itertools +import datetime as dt import pandas as pd import numpy as np from tqdm import tqdm @@ -25,24 +28,34 @@ from sklearn.metrics import mean_squared_error +@dataclass +class ARIMA_BestParams: + freq: str = '' + p: int = None + d: int = None + q: int = None + const: float = None + ar_coef: list = None + ma_coef: list = None + resid: list = None + series: pd.Series = None + + """ARIMA is the Autoregressive Integrated Moving Average Model and is used -to normalize and forecast time series data. ARIMA here is used without the -moving averages component, so predictions of future values of a -series is done by regressing on its own lagged values. ARIMA has 3 -parameters: (p, d, q) where: +to normalize and forecast time series data. ARIMA has 3 parameters: (p, d, q) +where: :p is the number of autoregressive terms :d is the number of nonseasonal differences :q is the number of lagged forecast errors in the prediction equation -An ARIMA is selected from 9 possible combinations: (0,0,0), (1,0,0), -(2,0,0), (0,1,0), (1,1,0), (2,1,0), (0,2,0), (1,2,0), (2,2,0). The time -series is split into train and test sets and an ARIMA model is fit for every -combination on the training set. The model with the lowest mean-squared +An ARIMA model is selected from the Catesian product of sets p, q, and d. The +time series is split into train and test sets and an ARIMA model is fit for +every combination on the training set. The model with the lowest mean-squared error (MSE) on the test set is selected as the best model. The original -times series is then transformed by this model. +times series can then be transformed by the best model. Autoregressive components are past values of the variable of interest. 
An -AR(p) model with order p = 1 may be written as Y(t) = A(1) * Y(t-1) + E(t), +AR(p) model with order p = 1 may be written as X(t) = A(1) * X(t-1) + E(t), where :X(t) is the time series under investigation :A(1) is the autoregressive parameter @@ -61,7 +74,13 @@ in the level of a time series, which reduces trend and seasonality. If the transformation is done once, then the data has been "first differenced". The same transformation can be done again, so the data would be "second -differenced".""" +differenced". + +Moving average components uses past forecast errors E(t). In other words, +X(t) can be thought of as a weighted moving average of the past forecast +errors: X(t) = c + E(t) + W(1)*E(t-1) + ... + W(q)*E(t-q). + +""" class arima(): @@ -69,25 +88,32 @@ class arima(): An ARIMA class used to normalize time series data. """ + def __init__(self): self.best_params = {} + def _evaluate_arima_model(self, X: Union[pd.Series, pd.DataFrame], arima_order: Tuple[int, int, int], train_size: float, freq: str) -> Tuple[float, dict]: train_size = int(len(X) * train_size) train, test = X[0:train_size].astype(float), X[train_size:].astype(float) model = ARIMA(train, order=arima_order, freq=freq) model_fit = model.fit(disp=0) - yhat = model_fit.forecast(len(test))[0] + ar_coef = model_fit.arparams + ma_coef = model_fit.maparams + resid = model_fit.resid - model_params = model_fit.params + model_params = model_fit.params.to_dict() + const = model_params.get("const", 0) # calculate test error - mse = mean_squared_error(test, yhat) - - return mse, model_params - - def fit(self, X: Union[pd.Series, pd.DataFrame], train_size: float, p_vals: list = [0,1,2], d_vals: list=[0,1,2], q_vals: list=[0], freq: str=None) -> arima: + yhat = model_fit.forecast(len(test))[0] + error = mean_squared_error(test, yhat) + + return error, const, ar_coef, ma_coef, resid + + + def fit(self, X: Union[pd.Series, pd.DataFrame], train_size: float, p_vals: list = [0,1,2], d_vals: list=[0,1,2], q_vals: list=[0,1,2], freq: str=None) -> arima: """ Train a combination of ARIMA models. If pandas DataFrame, finds the best arima model parameters for each column. 
If pandas Series, finds @@ -97,214 +123,124 @@ def fit(self, X: Union[pd.Series, pd.DataFrame], train_size: float, p_vals: list :param train_size: between 0.0 and 1.0 and represents the proportion of the dataset to include in the train split :p_vals: number of autoregressive terms to search; default is [0,1,2] :d_vals: number of differences to search; default is [0,1,2] - :q_vals: number of lagged forecast to search; always [0] + :q_vals: number of lagged forecast to search; always [0,1,2] :freq: frequency of time series, default is None :return: self """ + if isinstance(X, pd.Series): X = X.to_frame() + if isinstance(X, pd.DataFrame): for series_id in tqdm(X.columns): series = X[series_id] - best_score, best_cfg, best_params = float("inf"), None, None - for p in p_vals: - for d in d_vals: - for q in q_vals: - order = (p, d, q) - try: - mse, model_params = self._evaluate_arima_model(series, order, train_size, freq) - if mse < best_score: - best_score = mse - best_cfg = order - best_params = model_params - except Exception as e: - print(' {}'.format(e)) - continue - self.best_params[series_id] = {"p": best_cfg[0], - "d": best_cfg[1], - "q": best_cfg[2], - "best_params": best_params.to_dict(), - "first_val": series[0], - "second_val": series[1], - "third_val": series[2], - "last_val": series[-1]} - elif isinstance(X, pd.Series): - series = X - best_score, best_cfg, best_params = float("inf"), None, None - for p in p_vals: - for d in d_vals: - for q in q_vals: - order = (p, d, q) - try: - mse, model_params = self._evaluate_arima_model(series, order, train_size, freq) - if mse < best_score: - best_score = mse - best_cfg = order - best_params = model_params - except Exception as e: - print(' {}'.format(e)) - continue - - self.best_params['y'] = {"p": best_cfg[0], - "d": best_cfg[1], - "q": best_cfg[2], - "best_params": best_params.to_dict(), - "first_val": series.iloc[0], - "second_val": series.iloc[1], - "third_val": series.iloc[2], - "last_val": series.iloc[-1]} + best_score, best_order, best_const, best_ar_coef, best_ma_coef, best_resid = float("inf"), None, None, None, None, None + for order in list(itertools.product(*[p_vals, d_vals, q_vals])): + try: + mse, const, ar_coef, ma_coef, resid = self._evaluate_arima_model(series, order, train_size, freq) + if mse < best_score: + best_score = mse + best_order = order + best_const = const + best_ar_coef = ar_coef + best_ma_coef = ma_coef + best_resid = resid + except Exception as e: + print(' {}'.format(e)) + continue + + p, d, q = best_order[0], best_order[1], best_order[2] + assert(p == len(best_ar_coef)) + + self.best_params[series_id] = ARIMA_BestParams(freq=freq, + p=p, + d=d, + q=q, + const=best_const, + ar_coef=best_ar_coef, + ma_coef=best_ma_coef, + resid=best_resid, + series=series) else: raise ValueError("Not DataFrame or Series!") - return self - def _arima_transform_series(self, X: pd.Series, p: int, d: int, c:float, ar1: float=None, ar2: float=None) -> pd.Series: - # Difference first + + # Helper Function to Difference Time Series n Times + def _difference(self, X: pd.Series, d: int): if d == 0: - pass + return X elif d == 1: - X = X.diff() - elif d == 2: - X = X.diff()[1:].diff() - first_day = pd.Series([np.nan]) - - first_day.index = [X.index[0] - pd.DateOffset(days=1)] - first_day.name = X.name - X = pd.concat([first_day, X]) - X.index.name = "Date" + return X.diff() else: - raise ValueError("d is not 0, 1, or 2") + return self._difference(X.diff(), d-1) - # Create copy of transformed array - transformed = X.copy() + # Helper 
Function to Calculate AutoRegressive(AR) Component + def _lagged_values(self, X: pd.Series, p: int, ar_coef: list): if p == 0: - return transformed - elif p == 1: - for idx, val in enumerate(list(X)[1:], start=1): - lag1_val = X.iloc[idx-1] - transformed.iloc[idx] = c + (ar1 * lag1_val) - - transformed.iloc[0] = np.nan - - return transformed - elif p == 2: - for idx, val in enumerate(list(X)[2:], start=2): - lag1_val = X.iloc[idx-1] - lag2_val = X.iloc[idx-2] - - transformed.iloc[idx] = c + (ar1 * lag1_val) + (ar2 * lag2_val) - - transformed.iloc[0] = np.nan - transformed.iloc[1] = np.nan - - return transformed + return X + elif p > 0: + transformed_df = pd.concat([X.copy().shift(periods=i) for i in range(1, p+1)], axis=1) + transformed_df = transformed_df.dot(ar_coef) + return transformed_df - def _arima_transform_df(self, X: pd.DataFrame, p: int, d: int, c:float, ar1: float=None, ar2: float=None) -> pd.DataFrame: + # Helper Function to Calculate Moving Average(MA) Component + def _moving_average(self, X: pd.Series, p: int, d: int, q: int, ma_coef: list, resid: list, freq: str): + raise NotImplementedError("MA Component Transformation Not Implemented!") + + + # Helper Function to Transform Series + def _arima_transform_series(self, X: pd.Series, p: int, d: int, q:int, const:float, ar_coef:list, ma_coef:list, resid:list, freq:str) -> pd.Series: + X_original = X.copy(deep=True) + # Difference first - if d == 0: - pass - elif d == 1: - X = X.diff() - elif d == 2: - X = X.diff()[1:].diff() - first_day = pd.Series([np.nan]) - - first_day.index = [X.index[0] - pd.DateOffset(days=1)] - first_day.name = X.name - X = pd.concat([first_day, X]) - X.index.name = "Date" + X_differenced = self._difference(X_original, d) + + # Calculate Autoregressive Component + X_autoregressive = self._lagged_values(X_differenced, p, ar_coef) + + # Calculate MA Component + if q == 0: + ARIMA = const + X_autoregressive else: - raise ValueError("d is not 0, 1, or 2") + X_MA = self._moving_average(X, p, d, q, ma_coef, resid, freq) + ARIMA = const + X_autoregressive + X_MA + return ARIMA - # Create copy of transformed array - transformed = X.copy() - if p == 0: - return transformed - elif p == 1: - for idx, val in enumerate(list(X.iteritems())[1:], start=1): - curr_date = val[0] - lag1_val = X.loc[X.index[idx-1]] - transformed[curr_date] = c + (ar1 * lag1_val) - transformed[transformed.index[0]] = np.nan - return transformed - elif p == 2: - for idx, val in enumerate(list(X.iteritems())[2:], start=2): - curr_date = val[0] - lag1_val = X.loc[X.index[idx-1]] - lag2_val = X.loc[X.index[idx-2]] - - transformed[curr_date] = c + (ar1 * lag1_val) + (ar2 * lag2_val) - - transformed[transformed.index[0]] = np.nan - transformed[transformed.index[1]] = np.nan + # Helper Function to Transform DataFrame + def _arima_transform_df(self, X: pd.DataFrame) -> pd.DataFrame: + series = {} + for series_id in X.columns: + freq = self.best_params[series_id].freq + p = self.best_params[series_id].p + d = self.best_params[series_id].d + q = self.best_params[series_id].q + const = self.best_params[series_id].const + ar_coef = self.best_params[series_id].ar_coef + ma_coef = self.best_params[series_id].ma_coef + resid = self.best_params[series_id].resid + + series[series_id] = self._arima_transform_series(X[series_id], p=p, d=d, q=q, const=const, ar_coef=ar_coef, ma_coef=ma_coef, resid=resid, freq=freq) + return pd.DataFrame(series) + - return transformed - - - def transform(self, X: Union[pd.Series, pd.DataFrame]) -> Union[pd.Series, pd.DataFrame]: 
+ def transform(self, X: Union[pd.Series, pd.DataFrame]) -> Union[pd.DataFrame]: """ - Transform a series based on the best ARIMA found from .fit(). If input - is DataFrame, returns a transformed DataFrame. - If Series, returns a transformed Series. + Transform a series based on the best ARIMA found from fit(). + Does not support tranformation using MA components. :param X: time series to be operated on; required parameter - :return: DataFrame or Series + :return: DataFrame """ - series = {} - for series_id in self.best_params.keys(): - p = self.best_params[series_id]["p"] - d = self.best_params[series_id]["d"] - q = self.best_params[series_id]["q"] - - first_val = self.best_params[series_id]['first_val'] - second_val = self.best_params[series_id]['second_val'] - third_val = self.best_params[series_id]['third_val'] - last_val = self.best_params[series_id]['last_val'] - - try: - const = self.best_params[series_id]["best_params"]["const"] - except: - const = 0 - - if d == 0: - if p == 0: - lag1_coeff = 0 - lag2_coeff = 0 - elif p == 1: - lag1_coeff = self.best_params[series_id]["best_params"]["ar.L1.{}".format(series_id)] - lag2_coeff = 0 - elif p == 2: - lag1_coeff = self.best_params[series_id]["best_params"]["ar.L1.{}".format(series_id)] - lag2_coeff = self.best_params[series_id]["best_params"]["ar.L2.{}".format(series_id)] - elif d == 1: - if p == 0: - lag1_coeff = 0 - lag2_coeff = 0 - elif p == 1: - lag1_coeff = self.best_params[series_id]["best_params"]["ar.L1.D.{}".format(series_id)] - lag2_coeff = 0 - elif p == 2: - lag1_coeff = self.best_params[series_id]["best_params"]["ar.L1.D.{}".format(series_id)] - lag2_coeff = self.best_params[series_id]["best_params"]["ar.L2.D.{}".format(series_id)] - elif d == 2: - if p == 0: - lag1_coeff = 0 - lag2_coeff = 0 - elif p == 1: - lag1_coeff = self.best_params[series_id]["best_params"]["ar.L1.D2.{}".format(series_id)] - lag2_coeff = 0 - elif p == 2: - lag1_coeff = self.best_params[series_id]["best_params"]["ar.L1.D2.{}".format(series_id)] - lag2_coeff = self.best_params[series_id]["best_params"]["ar.L2.D2.{}".format(series_id)] - - if isinstance(X, pd.DataFrame): - new_series = self._arima_transform_df(X[series_id], p=p, d=d, c=const, ar1=lag1_coeff, ar2=lag2_coeff) - series[series_id] = new_series - elif isinstance(X, pd.Series): - new_series = self._arima_transform_series(X, p=p, d=d, c=const, ar1=lag1_coeff, ar2=lag2_coeff) - return new_series - - return pd.DataFrame.from_dict(series) + if isinstance(X, pd.Series): + X = X.to_frame() + + if isinstance(X, pd.DataFrame): + transformed = self._arima_transform_df(X) + else: + raise ValueError("Not DataFrame or Series!") + + return transformed diff --git a/setup.py b/setup.py index 2f93ba20..2938d88e 100644 --- a/setup.py +++ b/setup.py @@ -63,9 +63,12 @@ "pandas", "python-dateutil>=2.7.0", "requests", + "scikit-learn", "scipy", "six", + "statsmodels", "typing;python_version<'3.7'" + "tqdm" ], extras_require={ "internal": ["gs_quant_internal>=0.4.1", "requests_kerberos"], From a577fa57d40bb9cc2e479b0d8fb7201fec532946 Mon Sep 17 00:00:00 2001 From: Maverick Lin Date: Wed, 26 Feb 2020 13:23:32 -0500 Subject: [PATCH 03/10] added moving average component to arima --- gs_quant/test/timeseries/test_arima.py | 22 ++-- gs_quant/timeseries/arima.py | 136 +++++++++++++------------ setup.py | 1 - 3 files changed, 80 insertions(+), 79 deletions(-) diff --git a/gs_quant/test/timeseries/test_arima.py b/gs_quant/test/timeseries/test_arima.py index 220505a2..53548578 100644 --- 
a/gs_quant/test/timeseries/test_arima.py +++ b/gs_quant/test/timeseries/test_arima.py @@ -163,10 +163,10 @@ def test_arima_fit(): count_nans = arima.best_params[col].p + arima.best_params[col].d assert(count_nans == transformed_test_df[col].isna().sum()) - # Test (1,1,0) Model - diff_test_df_high = test_df['High'].diff() - assert(transformed_test_df['High'][2] == (arima.best_params['High'].const + diff_test_df_high[1] * arima.best_params['High'].ar_coef[0])) + # Test (1,2,0) Model + diff_test_df_high = test_df['High'].diff().diff() assert(transformed_test_df['High'][3] == (arima.best_params['High'].const + diff_test_df_high[2] * arima.best_params['High'].ar_coef[0])) + assert(transformed_test_df['High'][4] == (arima.best_params['High'].const + diff_test_df_high[3] * arima.best_params['High'].ar_coef[0])) assert(transformed_test_df['High'][-1] == (arima.best_params['High'].const + diff_test_df_high[-2] * arima.best_params['High'].ar_coef[0])) # Test (2,1,0) Model @@ -187,13 +187,13 @@ def test_arima_fit(): assert(transformed_test_df['Close'][5] == (arima.best_params['Close'].const + diff_test_df_close[4] * arima.best_params['Close'].ar_coef[0])) assert(transformed_test_df['Close'][-1] == (arima.best_params['Close'].const+ diff_test_df_close[-2] * arima.best_params['Close'].ar_coef[0])) - # Test (0,2,0) Model - diff_test_df_volumne = test_df['Volume'].diff()[1:].diff() - first_day = pd.Series([np.nan]) - first_day.index = [diff_test_df_volumne.index[0] - pd.DateOffset(days=1)] - first_day.name = 'Volume' - diff_test_df_volumne = pd.concat([first_day, diff_test_df_volumne]) - diff_test_df_volumne.index.name = "Date" - assert(transformed_test_df['Volume'][2] == arima.best_params['Volume'].const + diff_test_df_volumne[2]) + # # Test (0,2,0) Model + # diff_test_df_volumne = test_df['Volume'].diff()[1:].diff() + # first_day = pd.Series([np.nan]) + # first_day.index = [diff_test_df_volumne.index[0] - pd.DateOffset(days=1)] + # first_day.name = 'Volume' + # diff_test_df_volumne = pd.concat([first_day, diff_test_df_volumne]) + # diff_test_df_volumne.index.name = "Date" + # assert(transformed_test_df['Volume'][2] == arima.best_params['Volume'].const + diff_test_df_volumne[2]) test_arima_fit() \ No newline at end of file diff --git a/gs_quant/timeseries/arima.py b/gs_quant/timeseries/arima.py index c0177529..527e818d 100644 --- a/gs_quant/timeseries/arima.py +++ b/gs_quant/timeseries/arima.py @@ -9,11 +9,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# -# -# Chart Service will attempt to make public functions (not prefixed with _) from this module available. Such functions -# should be fully documented: docstrings should describe parameters and the return value, and provide a 1-line -# description. Type annotations should be provided for parameters. from __future__ import annotations from dataclasses import dataclass @@ -25,7 +20,7 @@ import numpy as np from tqdm import tqdm from statsmodels.tsa.arima_model import ARIMA -from sklearn.metrics import mean_squared_error +from statsmodels.tools.eval_measures import mse @dataclass @@ -40,55 +35,48 @@ class ARIMA_BestParams: resid: list = None series: pd.Series = None - -"""ARIMA is the Autoregressive Integrated Moving Average Model and is used -to normalize and forecast time series data. 
ARIMA has 3 parameters: (p, d, q) -where: - :p is the number of autoregressive terms - :d is the number of nonseasonal differences - :q is the number of lagged forecast errors in the prediction equation - -An ARIMA model is selected from the Catesian product of sets p, q, and d. The -time series is split into train and test sets and an ARIMA model is fit for -every combination on the training set. The model with the lowest mean-squared -error (MSE) on the test set is selected as the best model. The original -times series can then be transformed by the best model. - -Autoregressive components are past values of the variable of interest. An -AR(p) model with order p = 1 may be written as X(t) = A(1) * X(t-1) + E(t), -where - :X(t) is the time series under investigation - :A(1) is the autoregressive parameter - :X(t-1) is the time series lagged 1 period - :E(t) is the error term of the model or white noise - -In other words, any value in X(t) can be explained using a linear -combination of the past value T(t-1) plus some error term E(t). X(t) -could also be a linear combination of more than one past value: -X(t) = A(1) * X(t-1) + A(2) * X(t-2) + E(t). - -Differencing is a way of making a non-stationary time series stationary. This -is done by computing the differences between consuective observations -(subtracting the observation from the current period from the previous one). -Differencing can help stabilize the mean of a time series by removing changes -in the level of a time series, which reduces trend and seasonality. If the -transformation is done once, then the data has been "first differenced". The -same transformation can be done again, so the data would be "second -differenced". - -Moving average components uses past forecast errors E(t). In other words, -X(t) can be thought of as a weighted moving average of the past forecast -errors: X(t) = c + E(t) + W(1)*E(t-1) + ... + W(q)*E(t-q). - -""" - - class arima(): """ - An ARIMA class used to normalize time series data. + ARIMA is the Autoregressive Integrated Moving Average Model and is used + to normalize and forecast time series data. ARIMA has 3 parameters: (p, d, q) + where: + :p is the number of autoregressive terms + :d is the number of nonseasonal differences + :q is the number of lagged forecast errors in the prediction equation + + An ARIMA model is selected from the Catesian product of sets p, q, and d. The + time series is split into train and test sets and an ARIMA model is fit for + every combination on the training set. The model with the lowest mean-squared + error (MSE) on the test set is selected as the best model. The original + times series can then be transformed by the best model. + + Autoregressive components are past values of the variable of interest. An + AR(p) model with order p = 1 may be written as X(t) = A(1) * X(t-1) + E(t), + where + :X(t) is the time series under investigation + :A(1) is the autoregressive parameter + :X(t-1) is the time series lagged 1 period + :E(t) is the error term of the model or white noise + + In other words, any value in X(t) can be explained using a linear + combination of the past value T(t-1) plus some error term E(t). X(t) + could also be a linear combination of more than one past value: + X(t) = A(1) * X(t-1) + A(2) * X(t-2) + E(t). + + Differencing is a way of making a non-stationary time series stationary. This + is done by computing the differences between consuective observations + (subtracting the observation from the current period from the previous one). 
+ Differencing can help stabilize the mean of a time series by removing changes + in the level of a time series, which reduces trend and seasonality. If the + transformation is done once, then the data has been "first differenced". The + same transformation can be done again, so the data would be "second + differenced". + + Moving average components uses past forecast errors E(t). In other words, + X(t) can be thought of as a weighted moving average of the past forecast + errors: X(t) = c + E(t) + W(1)*E(t-1) + ... + W(q)*E(t-q). """ - def __init__(self): self.best_params = {} @@ -98,7 +86,7 @@ def _evaluate_arima_model(self, X: Union[pd.Series, pd.DataFrame], arima_order: train, test = X[0:train_size].astype(float), X[train_size:].astype(float) model = ARIMA(train, order=arima_order, freq=freq) - model_fit = model.fit(disp=0) + model_fit = model.fit(disp=False, method='css', trend="nc") ar_coef = model_fit.arparams ma_coef = model_fit.maparams resid = model_fit.resid @@ -108,7 +96,7 @@ def _evaluate_arima_model(self, X: Union[pd.Series, pd.DataFrame], arima_order: # calculate test error yhat = model_fit.forecast(len(test))[0] - error = mean_squared_error(test, yhat) + error = mse(test, yhat) return error, const, ar_coef, ma_coef, resid @@ -148,7 +136,7 @@ def fit(self, X: Union[pd.Series, pd.DataFrame], train_size: float, p_vals: list print(' {}'.format(e)) continue - p, d, q = best_order[0], best_order[1], best_order[2] + p, d, q = best_order assert(p == len(best_ar_coef)) self.best_params[series_id] = ARIMA_BestParams(freq=freq, @@ -184,29 +172,43 @@ def _lagged_values(self, X: pd.Series, p: int, ar_coef: list): transformed_df = transformed_df.dot(ar_coef) return transformed_df + # Helper Function to Calculate Residuals/MA Component + def _calculate_residuals(self, X_ar: pd.Series, X_diff: pd.Series, p: int, d: int, q: int, ar_coef: list, ma_coef: list, freq: str): + ma_coef = ma_coef[::-1] + + resid = X_ar.copy(deep=True) + resid[:] = 0 + + X_ma = X_ar.copy(deep=True) + X_ma[:] = np.nan + + for x in range(p + d, len(X_ar)): + ma_component = resid[x-q: x].dot(ma_coef) + prediction = X_ar[x] + ma_component + residual = X_diff[x] - prediction + resid[x] = residual + X_ma[x] = prediction - # Helper Function to Calculate Moving Average(MA) Component - def _moving_average(self, X: pd.Series, p: int, d: int, q: int, ma_coef: list, resid: list, freq: str): - raise NotImplementedError("MA Component Transformation Not Implemented!") + return resid, X_ma # Helper Function to Transform Series def _arima_transform_series(self, X: pd.Series, p: int, d: int, q:int, const:float, ar_coef:list, ma_coef:list, resid:list, freq:str) -> pd.Series: - X_original = X.copy(deep=True) # Difference first - X_differenced = self._difference(X_original, d) + X_diff = self._difference(X, d) # Calculate Autoregressive Component - X_autoregressive = self._lagged_values(X_differenced, p, ar_coef) + X_diff_ar = self._lagged_values(X_diff, p, ar_coef) + + # Caluclate Residuals and Moving Average Component + calcualted_resid, X_diff_ar_ma = self._calculate_residuals(X_diff_ar, X_diff, p, d, q, ar_coef, ma_coef, freq) - # Calculate MA Component - if q == 0: - ARIMA = const + X_autoregressive - else: - X_MA = self._moving_average(X, p, d, q, ma_coef, resid, freq) - ARIMA = const + X_autoregressive + X_MA - return ARIMA + # Check that calculated residuals are close with ARIMA statsmodels residuals + residuals_df = pd.concat([calcualted_resid, resid], axis=1, join='inner') + 
assert(np.allclose(residuals_df[residuals_df.columns[0]], residuals_df[residuals_df.columns[1]])) + + return X_diff_ar_ma # Helper Function to Transform DataFrame diff --git a/setup.py b/setup.py index 2938d88e..7f6a1050 100644 --- a/setup.py +++ b/setup.py @@ -63,7 +63,6 @@ "pandas", "python-dateutil>=2.7.0", "requests", - "scikit-learn", "scipy", "six", "statsmodels", From 59679567615ebc1db51635016302c38eb30c458b Mon Sep 17 00:00:00 2001 From: Maverick Lin Date: Wed, 18 Mar 2020 11:46:38 -0400 Subject: [PATCH 04/10] additional changes, pep8 formatting --- gs_quant/test/timeseries/test_arima.py | 75 ++++---- gs_quant/timeseries/arima.py | 226 +++++++++++++++---------- setup.py | 1 - 3 files changed, 171 insertions(+), 131 deletions(-) diff --git a/gs_quant/test/timeseries/test_arima.py b/gs_quant/test/timeseries/test_arima.py index 53548578..3926ff70 100644 --- a/gs_quant/test/timeseries/test_arima.py +++ b/gs_quant/test/timeseries/test_arima.py @@ -26,30 +26,31 @@ import gs_quant.timeseries as ts + def test_arima_fit(): test_dict = { - 'High': - {Timestamp('1989-01-03 00:00:00'): 3.575721263885498, - Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-05 00:00:00'): 3.62580132484436, - Timestamp('1989-01-06 00:00:00'): 3.62580132484436, - Timestamp('1989-01-09 00:00:00'): 3.575721263885498, - Timestamp('1989-01-10 00:00:00'): 3.575721263885498, - Timestamp('1989-01-11 00:00:00'): 3.5657050609588623, - Timestamp('1989-01-12 00:00:00'): 3.635817289352417, - Timestamp('1989-01-13 00:00:00'): 3.615785360336304, - Timestamp('1989-01-16 00:00:00'): 3.615785360336304, - Timestamp('1989-01-17 00:00:00'): 3.635817289352417, - Timestamp('1989-01-18 00:00:00'): 3.675881385803223, - Timestamp('1989-01-19 00:00:00'): 3.695913553237915, - Timestamp('1989-01-20 00:00:00'): 3.665865421295166, - Timestamp('1989-01-23 00:00:00'): 3.675881385803223, - Timestamp('1989-01-24 00:00:00'): 3.675881385803223, - Timestamp('1989-01-25 00:00:00'): 3.695913553237915, - Timestamp('1989-01-26 00:00:00'): 3.7760417461395264, - Timestamp('1989-01-27 00:00:00'): 3.8561699390411377, - Timestamp('1989-01-30 00:00:00'): 3.8561699390411377}, - 'Low': + 'High': + {Timestamp('1989-01-03 00:00:00'): 3.575721263885498, + Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-05 00:00:00'): 3.62580132484436, + Timestamp('1989-01-06 00:00:00'): 3.62580132484436, + Timestamp('1989-01-09 00:00:00'): 3.575721263885498, + Timestamp('1989-01-10 00:00:00'): 3.575721263885498, + Timestamp('1989-01-11 00:00:00'): 3.5657050609588623, + Timestamp('1989-01-12 00:00:00'): 3.635817289352417, + Timestamp('1989-01-13 00:00:00'): 3.615785360336304, + Timestamp('1989-01-16 00:00:00'): 3.615785360336304, + Timestamp('1989-01-17 00:00:00'): 3.635817289352417, + Timestamp('1989-01-18 00:00:00'): 3.675881385803223, + Timestamp('1989-01-19 00:00:00'): 3.695913553237915, + Timestamp('1989-01-20 00:00:00'): 3.665865421295166, + Timestamp('1989-01-23 00:00:00'): 3.675881385803223, + Timestamp('1989-01-24 00:00:00'): 3.675881385803223, + Timestamp('1989-01-25 00:00:00'): 3.695913553237915, + Timestamp('1989-01-26 00:00:00'): 3.7760417461395264, + Timestamp('1989-01-27 00:00:00'): 3.8561699390411377, + Timestamp('1989-01-30 00:00:00'): 3.8561699390411377}, + 'Low': {Timestamp('1989-01-03 00:00:00'): 3.4855768680572514, Timestamp('1989-01-04 00:00:00'): 3.5356571674346924, Timestamp('1989-01-05 00:00:00'): 3.575721263885498, @@ -70,7 +71,7 @@ def test_arima_fit(): Timestamp('1989-01-26 00:00:00'): 3.665865421295166, 
Timestamp('1989-01-27 00:00:00'): 3.79607367515564, Timestamp('1989-01-30 00:00:00'): 3.786057710647583}, - 'Open': + 'Open': {Timestamp('1989-01-03 00:00:00'): 3.575721263885498, Timestamp('1989-01-04 00:00:00'): 3.5556890964508057, Timestamp('1989-01-05 00:00:00'): 3.5857372283935547, @@ -91,7 +92,7 @@ def test_arima_fit(): Timestamp('1989-01-26 00:00:00'): 3.675881385803223, Timestamp('1989-01-27 00:00:00'): 3.79607367515564, Timestamp('1989-01-30 00:00:00'): 3.806089639663696}, - 'Close': + 'Close': {Timestamp('1989-01-03 00:00:00'): 3.5256409645080566, Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, Timestamp('1989-01-05 00:00:00'): 3.575721263885498, @@ -112,7 +113,7 @@ def test_arima_fit(): Timestamp('1989-01-26 00:00:00'): 3.756009578704834, Timestamp('1989-01-27 00:00:00'): 3.79607367515564, Timestamp('1989-01-30 00:00:00'): 3.846153736114502}, - 'Volume': + 'Volume': {Timestamp('1989-01-03 00:00:00'): 21873600.0, Timestamp('1989-01-04 00:00:00'): 13487100.0, Timestamp('1989-01-05 00:00:00'): 20733000.0, @@ -133,7 +134,7 @@ def test_arima_fit(): Timestamp('1989-01-26 00:00:00'): 29040900.0, Timestamp('1989-01-27 00:00:00'): 50615100.0, Timestamp('1989-01-30 00:00:00'): 27567000.0}, - 'Adj Close': + 'Adj Close': {Timestamp('1989-01-03 00:00:00'): 0.13199026882648468, Timestamp('1989-01-04 00:00:00'): 0.13424012064933774, Timestamp('1989-01-05 00:00:00'): 0.1338651180267334, @@ -171,29 +172,21 @@ def test_arima_fit(): # Test (2,1,0) Model diff_test_df_low = test_df['Low'].diff() - assert(isclose(transformed_test_df['Low'][3], (arima.best_params['Low'].const + diff_test_df_low[2] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[1] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) - assert(isclose(transformed_test_df['Low'][4], (arima.best_params['Low'].const + diff_test_df_low[3] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[2] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) - assert(isclose(transformed_test_df['Low'][-1], (arima.best_params['Low'].const + diff_test_df_low[-2] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[-3] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Low'][3], (arima.best_params['Low'].const + diff_test_df_low[2] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[1] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Low'][4], (arima.best_params['Low'].const + diff_test_df_low[3] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[2] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Low'][-1], (arima.best_params['Low'].const + diff_test_df_low[-2] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[-3] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) # Test (1,2,0) Model diff_test_df_close = test_df['Close'].diff()[1:].diff() first_day = pd.Series([np.nan]) first_day.index = [diff_test_df_close.index[0] - pd.DateOffset(days=1)] first_day.name = 'Close' - diff_test_df_close = pd.concat([first_day, diff_test_df_close]) + diff_test_df_close = pd.concat([first_day, diff_test_df_close]) diff_test_df_close.index.name = "Date" assert(transformed_test_df['Close'][4] == (arima.best_params['Close'].const + diff_test_df_close[3] * arima.best_params['Close'].ar_coef[0])) assert(transformed_test_df['Close'][5] == (arima.best_params['Close'].const + diff_test_df_close[4] * arima.best_params['Close'].ar_coef[0])) - assert(transformed_test_df['Close'][-1] == 
(arima.best_params['Close'].const+ diff_test_df_close[-2] * arima.best_params['Close'].ar_coef[0])) - - # # Test (0,2,0) Model - # diff_test_df_volumne = test_df['Volume'].diff()[1:].diff() - # first_day = pd.Series([np.nan]) - # first_day.index = [diff_test_df_volumne.index[0] - pd.DateOffset(days=1)] - # first_day.name = 'Volume' - # diff_test_df_volumne = pd.concat([first_day, diff_test_df_volumne]) - # diff_test_df_volumne.index.name = "Date" - # assert(transformed_test_df['Volume'][2] == arima.best_params['Volume'].const + diff_test_df_volumne[2]) + assert(transformed_test_df['Close'][-1] == (arima.best_params['Close'].const + diff_test_df_close[-2] * arima.best_params['Close'].ar_coef[0])) -test_arima_fit() \ No newline at end of file + print("All tests passed!") +test_arima_fit() diff --git a/gs_quant/timeseries/arima.py b/gs_quant/timeseries/arima.py index 527e818d..30869fc7 100644 --- a/gs_quant/timeseries/arima.py +++ b/gs_quant/timeseries/arima.py @@ -18,7 +18,6 @@ import datetime as dt import pandas as pd import numpy as np -from tqdm import tqdm from statsmodels.tsa.arima_model import ARIMA from statsmodels.tools.eval_measures import mse @@ -35,64 +34,52 @@ class ARIMA_BestParams: resid: list = None series: pd.Series = None + class arima(): """ - ARIMA is the Autoregressive Integrated Moving Average Model and is used - to normalize and forecast time series data. ARIMA has 3 parameters: (p, d, q) - where: + ARIMA is the Autoregressive Integrated Moving Average Model and is used + to normalize and forecast time series data. ARIMA has 3 parameters: + (p, d, q) where: :p is the number of autoregressive terms :d is the number of nonseasonal differences :q is the number of lagged forecast errors in the prediction equation - - An ARIMA model is selected from the Catesian product of sets p, q, and d. The - time series is split into train and test sets and an ARIMA model is fit for - every combination on the training set. The model with the lowest mean-squared - error (MSE) on the test set is selected as the best model. The original - times series can then be transformed by the best model. - - Autoregressive components are past values of the variable of interest. An - AR(p) model with order p = 1 may be written as X(t) = A(1) * X(t-1) + E(t), - where - :X(t) is the time series under investigation - :A(1) is the autoregressive parameter - :X(t-1) is the time series lagged 1 period - :E(t) is the error term of the model or white noise - - In other words, any value in X(t) can be explained using a linear - combination of the past value T(t-1) plus some error term E(t). X(t) - could also be a linear combination of more than one past value: - X(t) = A(1) * X(t-1) + A(2) * X(t-2) + E(t). - - Differencing is a way of making a non-stationary time series stationary. This - is done by computing the differences between consuective observations - (subtracting the observation from the current period from the previous one). - Differencing can help stabilize the mean of a time series by removing changes - in the level of a time series, which reduces trend and seasonality. If the - transformation is done once, then the data has been "first differenced". The - same transformation can be done again, so the data would be "second - differenced". - - Moving average components uses past forecast errors E(t). In other words, - X(t) can be thought of as a weighted moving average of the past forecast - errors: X(t) = c + E(t) + W(1)*E(t-1) + ... + W(q)*E(t-q). 
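# A toy numeric sketch of the components described in the paragraph above
# (differencing, the AR term, and the MA term). The series and the
# coefficients A(1) = 0.5 and W(1) = 0.3 are made up purely for illustration.
import pandas as pd

x = pd.Series([10.0, 12.0, 11.0, 13.0, 16.0])

x_diff = x.diff()                 # first difference: NaN, 2.0, -1.0, 2.0, 3.0
ar_term = 0.5 * x_diff.shift(1)   # AR(1) piece: A(1) * X(t-1)
resid = x_diff - ar_term          # one-step errors E(t)
ma_term = 0.3 * resid.shift(1)    # MA(1) piece: W(1) * E(t-1)
fitted = ar_term + ma_term        # an ARIMA(1,1,1)-style fitted value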
+ + An ARIMA model is selected from the Cartesian product of sets p, q, and d. + The time series is split into train and test sets and an ARIMA model is fit + for every combination on the training set. The model with the lowest + mean-squared error (MSE) on the test set is selected as the best model. The + original time series can then be transformed by the best model. """ def __init__(self): self.best_params = {} - - def _evaluate_arima_model(self, X: Union[pd.Series, pd.DataFrame], arima_order: Tuple[int, int, int], train_size: float, freq: str) -> Tuple[float, dict]: - train_size = int(len(X) * train_size) - train, test = X[0:train_size].astype(float), X[train_size:].astype(float) + def _evaluate_arima_model( + self, X: + Union[pd.Series, pd.DataFrame], + arima_order: Tuple[int, int, int], + train_size: Union[float, int, None], + freq: str + ) -> Tuple[float, dict]: + if type(train_size) == float: + train_size = int(len(X) * train_size) + train, test = X[:train_size].astype(float), X[train_size:].astype(float) + elif type(train_size) == int: + train, test = X[:train_size].astype(float), X[train_size:].astype(float) + elif train_size is None: + train_size = int(len(X) * 0.75) + train, test = X[:train_size].astype(float), X[train_size:].astype(float) + else: + raise ValueError('train_size is not int, float, or None') model = ARIMA(train, order=arima_order, freq=freq) - model_fit = model.fit(disp=False, method='css', trend="nc") + model_fit = model.fit(disp=False, method='css', trend='nc') ar_coef = model_fit.arparams ma_coef = model_fit.maparams resid = model_fit.resid model_params = model_fit.params.to_dict() - const = model_params.get("const", 0) + const = model_params.get('const', 0) # calculate test error yhat = model_fit.forecast(len(test))[0] @@ -100,15 +87,25 @@ def _evaluate_arima_model(self, X: Union[pd.Series, pd.DataFrame], arima_order: return error, const, ar_coef, ma_coef, resid - - def fit(self, X: Union[pd.Series, pd.DataFrame], train_size: float, p_vals: list = [0,1,2], d_vals: list=[0,1,2], q_vals: list=[0,1,2], freq: str=None) -> arima: + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + train_size: Union[float, int, None]=None, + p_vals: list=[0, 1, 2], + d_vals: list=[0, 1, 2], + q_vals: list=[0, 1, 2], + freq: str=None + ) -> arima: """ - Train a combination of ARIMA models. If pandas DataFrame, finds the - best arima model parameters for each column. If pandas Series, finds + Train a combination of ARIMA models. If pandas DataFrame, finds the + best arima model parameters for each column. If pandas Series, finds the best arima model parameters for the series. :param X: time series to be operated on; required parameter - :param train_size: between 0.0 and 1.0 and represents the proportion of the dataset to include in the train split + :param train_size: if float, should be between 0.0 and 1.0 and + represent the proportion of the dataset to include in the train split. + If int, represents the absolute number of train samples.
If None, + the value is automatically set 0.75 :p_vals: number of autoregressive terms to search; default is [0,1,2] :d_vals: number of differences to search; default is [0,1,2] :q_vals: number of lagged forecast to search; always [0,1,2] @@ -116,12 +113,18 @@ def fit(self, X: Union[pd.Series, pd.DataFrame], train_size: float, p_vals: list :return: self """ - if isinstance(X, pd.Series): X = X.to_frame() + if isinstance(X, pd.Series): + X = X.to_frame() if isinstance(X, pd.DataFrame): - for series_id in tqdm(X.columns): + for series_id in X.columns: series = X[series_id] - best_score, best_order, best_const, best_ar_coef, best_ma_coef, best_resid = float("inf"), None, None, None, None, None + best_score = float('inf') + best_order = None + best_const = None + best_ar_coef = None + best_ma_coef = None + best_resid = None for order in list(itertools.product(*[p_vals, d_vals, q_vals])): try: mse, const, ar_coef, ma_coef, resid = self._evaluate_arima_model(series, order, train_size, freq) @@ -139,22 +142,23 @@ def fit(self, X: Union[pd.Series, pd.DataFrame], train_size: float, p_vals: list p, d, q = best_order assert(p == len(best_ar_coef)) - self.best_params[series_id] = ARIMA_BestParams(freq=freq, - p=p, - d=d, - q=q, - const=best_const, - ar_coef=best_ar_coef, - ma_coef=best_ma_coef, - resid=best_resid, - series=series) + self.best_params[series_id] = ARIMA_BestParams( + freq=freq, + p=p, + d=d, + q=q, + const=best_const, + ar_coef=best_ar_coef, + ma_coef=best_ma_coef, + resid=best_resid, + series=series) else: - raise ValueError("Not DataFrame or Series!") - return self - + raise ValueError('Not DataFrame or Series!') + return self - # Helper Function to Difference Time Series n Times def _difference(self, X: pd.Series, d: int): + """Helper Function to Difference Time Series n Times""" + if d == 0: return X elif d == 1: @@ -162,23 +166,36 @@ def _difference(self, X: pd.Series, d: int): else: return self._difference(X.diff(), d-1) - - # Helper Function to Calculate AutoRegressive(AR) Component def _lagged_values(self, X: pd.Series, p: int, ar_coef: list): + """Helper Function to Calculate AutoRegressive(AR) Component""" + if p == 0: return X elif p > 0: - transformed_df = pd.concat([X.copy().shift(periods=i) for i in range(1, p+1)], axis=1) + transformed_df = pd.concat([X.copy().shift(periods=i) + for i in range(1, p+1)], axis=1) transformed_df = transformed_df.dot(ar_coef) + else: + raise ValueError("p should not be less than 0!") return transformed_df - # Helper Function to Calculate Residuals/MA Component - def _calculate_residuals(self, X_ar: pd.Series, X_diff: pd.Series, p: int, d: int, q: int, ar_coef: list, ma_coef: list, freq: str): + def _calculate_residuals( + self, + X_ar: pd.Series, + X_diff: pd.Series, + p: int, + d: int, + q: int, + ar_coef: list, + ma_coef: list, + freq: str + ): + """Helper Function to Calculate Residuals/MA Component""" + ma_coef = ma_coef[::-1] - resid = X_ar.copy(deep=True) resid[:] = 0 - + X_ma = X_ar.copy(deep=True) X_ma[:] = np.nan @@ -191,28 +208,46 @@ def _calculate_residuals(self, X_ar: pd.Series, X_diff: pd.Series, p: int, d: in return resid, X_ma + def _arima_transform_series( + self, + X: pd.Series, + p: int, + d: int, + q: int, + const: float, + ar_coef: list, + ma_coef: list, + resid: list, + freq: str + ) -> pd.Series: + """Helper Function to Transform Series""" - # Helper Function to Transform Series - def _arima_transform_series(self, X: pd.Series, p: int, d: int, q:int, const:float, ar_coef:list, ma_coef:list, resid:list, freq:str) -> 
pd.Series: - # Difference first X_diff = self._difference(X, d) - + # Calculate Autoregressive Component X_diff_ar = self._lagged_values(X_diff, p, ar_coef) # Calculate Residuals and Moving Average Component - calcualted_resid, X_diff_ar_ma = self._calculate_residuals(X_diff_ar, X_diff, p, d, q, ar_coef, ma_coef, freq) - - # Check that calculated residuals are close with ARIMA statsmodels residuals - residuals_df = pd.concat([calcualted_resid, resid], axis=1, join='inner') - assert(np.allclose(residuals_df[residuals_df.columns[0]], residuals_df[residuals_df.columns[1]])) - - return X_diff_ar_ma + calcualted_resid, X_diff_ar_ma = self._calculate_residuals(X_diff_ar, + X_diff, + p, + d, + q, + ar_coef, + ma_coef, + freq) + + # Check calculated residuals are close with ARIMA statsmodels residuals + resid_df = pd.concat([calcualted_resid, resid], axis=1, join='inner') + assert(np.allclose(resid_df[resid_df.columns[0]], + resid_df[resid_df.columns[1]])) + return X_diff_ar_ma - # Helper Function to Transform DataFrame def _arima_transform_df(self, X: pd.DataFrame) -> pd.DataFrame: + """Helper Function to Transform DataFrame""" + series = {} for series_id in X.columns: freq = self.best_params[series_id].freq @@ -223,26 +258,39 @@ def _arima_transform_df(self, X: pd.DataFrame) -> pd.DataFrame: ar_coef = self.best_params[series_id].ar_coef ma_coef = self.best_params[series_id].ma_coef resid = self.best_params[series_id].resid - - series[series_id] = self._arima_transform_series(X[series_id], p=p, d=d, q=q, const=const, ar_coef=ar_coef, ma_coef=ma_coef, resid=resid, freq=freq) + + series[series_id] = self._arima_transform_series( + X[series_id], + p=p, + d=d, + q=q, + const=const, + ar_coef=ar_coef, + ma_coef=ma_coef, + resid=resid, + freq=freq + ) + return pd.DataFrame(series) - - def transform(self, X: Union[pd.Series, pd.DataFrame]) -> Union[pd.DataFrame]: + def transform( + self, + X: Union[pd.Series, pd.DataFrame] + ) -> Union[pd.DataFrame]: """ - Transform a series based on the best ARIMA found from fit(). + Transform a series based on the best ARIMA found from fit(). Does not support transformation using MA components.
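# A quick usage sketch of the fit/transform flow documented here, assuming
# the class is exposed as gs_quant.timeseries.arima (as in the unit test).
# The synthetic business-day price series and the parameter choices are
# illustrative only; q_vals=[0] keeps the search to AR/differencing models.
import numpy as np
import pandas as pd
import gs_quant.timeseries as ts

dates = pd.date_range('2020-01-01', periods=100, freq='B')
prices = pd.Series(100 + np.random.randn(100).cumsum(), index=dates)

model = ts.arima()
model.fit(prices, train_size=0.8, freq='B', q_vals=[0])  # search candidate (p, d, q) orders by test MSE
normalized = model.transform(prices)                     # returns a DataFrame of transformed values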
:param X: time series to be operated on; required parameter :return: DataFrame """ - if isinstance(X, pd.Series): + if isinstance(X, pd.Series): X = X.to_frame() if isinstance(X, pd.DataFrame): transformed = self._arima_transform_df(X) else: - raise ValueError("Not DataFrame or Series!") + raise ValueError('Not DataFrame or Series!') return transformed diff --git a/setup.py b/setup.py index 7f6a1050..96238384 100644 --- a/setup.py +++ b/setup.py @@ -67,7 +67,6 @@ "six", "statsmodels", "typing;python_version<'3.7'" - "tqdm" ], extras_require={ "internal": ["gs_quant_internal>=0.4.1", "requests_kerberos"], From d9918e052d691f054874b4c3f768dd2c9c0635bf Mon Sep 17 00:00:00 2001 From: Maverick Lin Date: Thu, 2 Apr 2020 12:06:49 -0400 Subject: [PATCH 05/10] additional changes, modified test file --- gs_quant/test/timeseries/test_arima.py | 193 +++++++++---------------- gs_quant/timeseries/arima.py | 82 +++++------ 2 files changed, 106 insertions(+), 169 deletions(-) diff --git a/gs_quant/test/timeseries/test_arima.py b/gs_quant/test/timeseries/test_arima.py index 3926ff70..16805bcd 100644 --- a/gs_quant/test/timeseries/test_arima.py +++ b/gs_quant/test/timeseries/test_arima.py @@ -29,132 +29,68 @@ def test_arima_fit(): test_dict = { - 'High': - {Timestamp('1989-01-03 00:00:00'): 3.575721263885498, - Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-05 00:00:00'): 3.62580132484436, - Timestamp('1989-01-06 00:00:00'): 3.62580132484436, - Timestamp('1989-01-09 00:00:00'): 3.575721263885498, - Timestamp('1989-01-10 00:00:00'): 3.575721263885498, - Timestamp('1989-01-11 00:00:00'): 3.5657050609588623, - Timestamp('1989-01-12 00:00:00'): 3.635817289352417, - Timestamp('1989-01-13 00:00:00'): 3.615785360336304, - Timestamp('1989-01-16 00:00:00'): 3.615785360336304, - Timestamp('1989-01-17 00:00:00'): 3.635817289352417, - Timestamp('1989-01-18 00:00:00'): 3.675881385803223, - Timestamp('1989-01-19 00:00:00'): 3.695913553237915, - Timestamp('1989-01-20 00:00:00'): 3.665865421295166, - Timestamp('1989-01-23 00:00:00'): 3.675881385803223, - Timestamp('1989-01-24 00:00:00'): 3.675881385803223, - Timestamp('1989-01-25 00:00:00'): 3.695913553237915, - Timestamp('1989-01-26 00:00:00'): 3.7760417461395264, - Timestamp('1989-01-27 00:00:00'): 3.8561699390411377, - Timestamp('1989-01-30 00:00:00'): 3.8561699390411377}, - 'Low': - {Timestamp('1989-01-03 00:00:00'): 3.4855768680572514, - Timestamp('1989-01-04 00:00:00'): 3.5356571674346924, - Timestamp('1989-01-05 00:00:00'): 3.575721263885498, - Timestamp('1989-01-06 00:00:00'): 3.575721263885498, - Timestamp('1989-01-09 00:00:00'): 3.5356571674346924, - Timestamp('1989-01-10 00:00:00'): 3.5356571674346924, - Timestamp('1989-01-11 00:00:00'): 3.5256409645080566, - Timestamp('1989-01-12 00:00:00'): 3.5456731319427486, - Timestamp('1989-01-13 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, - Timestamp('1989-01-17 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-18 00:00:00'): 3.615785360336304, - Timestamp('1989-01-19 00:00:00'): 3.655849456787109, - Timestamp('1989-01-20 00:00:00'): 3.62580132484436, - Timestamp('1989-01-23 00:00:00'): 3.615785360336304, - Timestamp('1989-01-24 00:00:00'): 3.615785360336304, - Timestamp('1989-01-25 00:00:00'): 3.655849456787109, - Timestamp('1989-01-26 00:00:00'): 3.665865421295166, - Timestamp('1989-01-27 00:00:00'): 3.79607367515564, - Timestamp('1989-01-30 00:00:00'): 3.786057710647583}, - 'Open': - {Timestamp('1989-01-03 00:00:00'): 3.575721263885498, - 
Timestamp('1989-01-04 00:00:00'): 3.5556890964508057, - Timestamp('1989-01-05 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-06 00:00:00'): 3.605769157409668, - Timestamp('1989-01-09 00:00:00'): 3.5456731319427486, - Timestamp('1989-01-10 00:00:00'): 3.575721263885498, - Timestamp('1989-01-11 00:00:00'): 3.5456731319427486, - Timestamp('1989-01-12 00:00:00'): 3.5456731319427486, - Timestamp('1989-01-13 00:00:00'): 3.605769157409668, - Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, - Timestamp('1989-01-17 00:00:00'): 3.5957531929016118, - Timestamp('1989-01-18 00:00:00'): 3.635817289352417, - Timestamp('1989-01-19 00:00:00'): 3.6858973503112793, - Timestamp('1989-01-20 00:00:00'): 3.665865421295166, - Timestamp('1989-01-23 00:00:00'): 3.6458332538604736, - Timestamp('1989-01-24 00:00:00'): 3.62580132484436, - Timestamp('1989-01-25 00:00:00'): 3.6858973503112793, - Timestamp('1989-01-26 00:00:00'): 3.675881385803223, - Timestamp('1989-01-27 00:00:00'): 3.79607367515564, - Timestamp('1989-01-30 00:00:00'): 3.806089639663696}, - 'Close': - {Timestamp('1989-01-03 00:00:00'): 3.5256409645080566, - Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-05 00:00:00'): 3.575721263885498, - Timestamp('1989-01-06 00:00:00'): 3.575721263885498, - Timestamp('1989-01-09 00:00:00'): 3.575721263885498, - Timestamp('1989-01-10 00:00:00'): 3.5556890964508057, - Timestamp('1989-01-11 00:00:00'): 3.5556890964508057, - Timestamp('1989-01-12 00:00:00'): 3.605769157409668, - Timestamp('1989-01-13 00:00:00'): 3.605769157409668, - Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, - Timestamp('1989-01-17 00:00:00'): 3.62580132484436, - Timestamp('1989-01-18 00:00:00'): 3.675881385803223, - Timestamp('1989-01-19 00:00:00'): 3.665865421295166, - Timestamp('1989-01-20 00:00:00'): 3.6458332538604736, - Timestamp('1989-01-23 00:00:00'): 3.62580132484436, - Timestamp('1989-01-24 00:00:00'): 3.675881385803223, - Timestamp('1989-01-25 00:00:00'): 3.675881385803223, - Timestamp('1989-01-26 00:00:00'): 3.756009578704834, - Timestamp('1989-01-27 00:00:00'): 3.79607367515564, - Timestamp('1989-01-30 00:00:00'): 3.846153736114502}, - 'Volume': - {Timestamp('1989-01-03 00:00:00'): 21873600.0, - Timestamp('1989-01-04 00:00:00'): 13487100.0, - Timestamp('1989-01-05 00:00:00'): 20733000.0, - Timestamp('1989-01-06 00:00:00'): 20654400.0, - Timestamp('1989-01-09 00:00:00'): 21478000.0, - Timestamp('1989-01-10 00:00:00'): 15541300.0, - Timestamp('1989-01-11 00:00:00'): 11465300.0, - Timestamp('1989-01-12 00:00:00'): 26481300.0, - Timestamp('1989-01-13 00:00:00'): 10236000.0, - Timestamp('1989-01-16 00:00:00'): 8888200.0, - Timestamp('1989-01-17 00:00:00'): 12934200.0, - Timestamp('1989-01-18 00:00:00'): 25965800.0, - Timestamp('1989-01-19 00:00:00'): 25556500.0, - Timestamp('1989-01-20 00:00:00'): 13779100.0, - Timestamp('1989-01-23 00:00:00'): 13680500.0, - Timestamp('1989-01-24 00:00:00'): 16870400.0, - Timestamp('1989-01-25 00:00:00'): 16959000.0, - Timestamp('1989-01-26 00:00:00'): 29040900.0, - Timestamp('1989-01-27 00:00:00'): 50615100.0, - Timestamp('1989-01-30 00:00:00'): 27567000.0}, - 'Adj Close': - {Timestamp('1989-01-03 00:00:00'): 0.13199026882648468, - Timestamp('1989-01-04 00:00:00'): 0.13424012064933774, - Timestamp('1989-01-05 00:00:00'): 0.1338651180267334, - Timestamp('1989-01-06 00:00:00'): 0.1338651180267334, - Timestamp('1989-01-09 00:00:00'): 0.1338651180267334, - Timestamp('1989-01-10 00:00:00'): 0.13311512768268585, - Timestamp('1989-01-11 00:00:00'): 0.13311512768268585, - 
Timestamp('1989-01-12 00:00:00'): 0.13499003648757935, - Timestamp('1989-01-13 00:00:00'): 0.13499003648757935, - Timestamp('1989-01-16 00:00:00'): 0.13461506366729736, - Timestamp('1989-01-17 00:00:00'): 0.13573989272117615, - Timestamp('1989-01-18 00:00:00'): 0.13761481642723086, - Timestamp('1989-01-19 00:00:00'): 0.13723985850811005, - Timestamp('1989-01-20 00:00:00'): 0.13648992776870728, - Timestamp('1989-01-23 00:00:00'): 0.13573989272117615, - Timestamp('1989-01-24 00:00:00'): 0.13761481642723086, - Timestamp('1989-01-25 00:00:00'): 0.13761481642723086, - Timestamp('1989-01-26 00:00:00'): 0.14061467349529266, - Timestamp('1989-01-27 00:00:00'): 0.14211450517177582, - Timestamp('1989-01-30 00:00:00'): 0.14398930966854095}} + 'High': {Timestamp('1989-01-03 00:00:00'): 3.575721263885498, + Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-05 00:00:00'): 3.62580132484436, + Timestamp('1989-01-06 00:00:00'): 3.62580132484436, + Timestamp('1989-01-09 00:00:00'): 3.575721263885498, + Timestamp('1989-01-10 00:00:00'): 3.575721263885498, + Timestamp('1989-01-11 00:00:00'): 3.5657050609588623, + Timestamp('1989-01-12 00:00:00'): 3.635817289352417, + Timestamp('1989-01-13 00:00:00'): 3.615785360336304, + Timestamp('1989-01-16 00:00:00'): 3.615785360336304, + Timestamp('1989-01-17 00:00:00'): 3.635817289352417, + Timestamp('1989-01-18 00:00:00'): 3.675881385803223, + Timestamp('1989-01-19 00:00:00'): 3.695913553237915, + Timestamp('1989-01-20 00:00:00'): 3.665865421295166, + Timestamp('1989-01-23 00:00:00'): 3.675881385803223, + Timestamp('1989-01-24 00:00:00'): 3.675881385803223, + Timestamp('1989-01-25 00:00:00'): 3.695913553237915, + Timestamp('1989-01-26 00:00:00'): 3.7760417461395264, + Timestamp('1989-01-27 00:00:00'): 3.8561699390411377, + Timestamp('1989-01-30 00:00:00'): 3.8561699390411377}, + 'Low': {Timestamp('1989-01-03 00:00:00'): 3.4855768680572514, + Timestamp('1989-01-04 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-05 00:00:00'): 3.575721263885498, + Timestamp('1989-01-06 00:00:00'): 3.575721263885498, + Timestamp('1989-01-09 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-10 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-11 00:00:00'): 3.5256409645080566, + Timestamp('1989-01-12 00:00:00'): 3.5456731319427486, + Timestamp('1989-01-13 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, + Timestamp('1989-01-17 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-18 00:00:00'): 3.615785360336304, + Timestamp('1989-01-19 00:00:00'): 3.655849456787109, + Timestamp('1989-01-20 00:00:00'): 3.62580132484436, + Timestamp('1989-01-23 00:00:00'): 3.615785360336304, + Timestamp('1989-01-24 00:00:00'): 3.615785360336304, + Timestamp('1989-01-25 00:00:00'): 3.655849456787109, + Timestamp('1989-01-26 00:00:00'): 3.665865421295166, + Timestamp('1989-01-27 00:00:00'): 3.79607367515564, + Timestamp('1989-01-30 00:00:00'): 3.786057710647583}, + 'Close': {Timestamp('1989-01-03 00:00:00'): 3.5256409645080566, + Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-05 00:00:00'): 3.575721263885498, + Timestamp('1989-01-06 00:00:00'): 3.575721263885498, + Timestamp('1989-01-09 00:00:00'): 3.575721263885498, + Timestamp('1989-01-10 00:00:00'): 3.5556890964508057, + Timestamp('1989-01-11 00:00:00'): 3.5556890964508057, + Timestamp('1989-01-12 00:00:00'): 3.605769157409668, + Timestamp('1989-01-13 00:00:00'): 3.605769157409668, + Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, + 
Timestamp('1989-01-17 00:00:00'): 3.62580132484436, + Timestamp('1989-01-18 00:00:00'): 3.675881385803223, + Timestamp('1989-01-19 00:00:00'): 3.665865421295166, + Timestamp('1989-01-20 00:00:00'): 3.6458332538604736, + Timestamp('1989-01-23 00:00:00'): 3.62580132484436, + Timestamp('1989-01-24 00:00:00'): 3.675881385803223, + Timestamp('1989-01-25 00:00:00'): 3.675881385803223, + Timestamp('1989-01-26 00:00:00'): 3.756009578704834, + Timestamp('1989-01-27 00:00:00'): 3.79607367515564, + Timestamp('1989-01-30 00:00:00'): 3.846153736114502}, + } + test_df = pd.DataFrame(test_dict) arima = ts.arima() arima.fit(test_df, train_size=0.8, freq='B', q_vals=[0]) @@ -188,5 +124,6 @@ def test_arima_fit(): assert(transformed_test_df['Close'][5] == (arima.best_params['Close'].const + diff_test_df_close[4] * arima.best_params['Close'].ar_coef[0])) assert(transformed_test_df['Close'][-1] == (arima.best_params['Close'].const + diff_test_df_close[-2] * arima.best_params['Close'].ar_coef[0])) - print("All tests passed!") -test_arima_fit() + +if __name__ == "__main__": + pytest.main(args=["test_arima.py"]) diff --git a/gs_quant/timeseries/arima.py b/gs_quant/timeseries/arima.py index 30869fc7..7639e69e 100644 --- a/gs_quant/timeseries/arima.py +++ b/gs_quant/timeseries/arima.py @@ -10,7 +10,6 @@ # specific language governing permissions and limitations # under the License. -from __future__ import annotations from dataclasses import dataclass from typing import Iterable, Optional, Union, Tuple @@ -95,15 +94,15 @@ def fit( d_vals: list=[0, 1, 2], q_vals: list=[0, 1, 2], freq: str=None - ) -> arima: + ) -> 'arima': """ Train a combination of ARIMA models. If pandas DataFrame, finds the best arima model parameters for each column. If pandas Series, finds the best arima model parameters for the series. :param X: time series to be operated on; required parameter - :param train_size: if float, should be between 0.0 and 1.0 and - represent the proportion of the dataset to include in the train split. + :param train_size: if float, should be between 0.0 and 1.0 and + represent the proportion of the dataset to include in the train split. If int, represents the absolute number of train samples. 
If None, the value is automatically set 0.75 :p_vals: number of autoregressive terms to search; default is [0,1,2] @@ -116,44 +115,45 @@ def fit( if isinstance(X, pd.Series): X = X.to_frame() - if isinstance(X, pd.DataFrame): - for series_id in X.columns: - series = X[series_id] - best_score = float('inf') - best_order = None - best_const = None - best_ar_coef = None - best_ma_coef = None - best_resid = None - for order in list(itertools.product(*[p_vals, d_vals, q_vals])): - try: - mse, const, ar_coef, ma_coef, resid = self._evaluate_arima_model(series, order, train_size, freq) - if mse < best_score: - best_score = mse - best_order = order - best_const = const - best_ar_coef = ar_coef - best_ma_coef = ma_coef - best_resid = resid - except Exception as e: - print(' {}'.format(e)) - continue - - p, d, q = best_order - assert(p == len(best_ar_coef)) - - self.best_params[series_id] = ARIMA_BestParams( - freq=freq, - p=p, - d=d, - q=q, - const=best_const, - ar_coef=best_ar_coef, - ma_coef=best_ma_coef, - resid=best_resid, - series=series) - else: + if not isinstance(X, pd.DataFrame): raise ValueError('Not DataFrame or Series!') + + for series_id in X.columns: + series = X[series_id] + best_score = float('inf') + best_order = None + best_const = None + best_ar_coef = None + best_ma_coef = None + best_resid = None + for order in list(itertools.product(*[p_vals, d_vals, q_vals])): + try: + mse, const, ar_coef, ma_coef, resid = self._evaluate_arima_model(series, order, train_size, freq) + if mse < best_score: + best_score = mse + best_order = order + best_const = const + best_ar_coef = ar_coef + best_ma_coef = ma_coef + best_resid = resid + except Exception as e: + print(' {}'.format(e)) + continue + + p, d, q = best_order + assert(p == len(best_ar_coef)) + + self.best_params[series_id] = ARIMA_BestParams( + freq=freq, + p=p, + d=d, + q=q, + const=best_const, + ar_coef=best_ar_coef, + ma_coef=best_ma_coef, + resid=best_resid, + series=series) + return self def _difference(self, X: pd.Series, d: int): From 68d3cc2259bf57722319e9ddea2d93f097b6d8d7 Mon Sep 17 00:00:00 2001 From: Maverick Lin Date: Tue, 7 Apr 2020 12:13:59 -0400 Subject: [PATCH 06/10] moved arima normalization to econometrics.py --- gs_quant/test/timeseries/test_arima.py | 129 -------- gs_quant/test/timeseries/test_econometrics.py | 101 ++++++ gs_quant/timeseries/arima.py | 296 ------------------ gs_quant/timeseries/econometrics.py | 291 +++++++++++++++++ 4 files changed, 392 insertions(+), 425 deletions(-) delete mode 100644 gs_quant/test/timeseries/test_arima.py delete mode 100644 gs_quant/timeseries/arima.py diff --git a/gs_quant/test/timeseries/test_arima.py b/gs_quant/test/timeseries/test_arima.py deleted file mode 100644 index 16805bcd..00000000 --- a/gs_quant/test/timeseries/test_arima.py +++ /dev/null @@ -1,129 +0,0 @@ -""" -Copyright 2020 Goldman Sachs. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, -software distributed under the License is distributed on an -"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -KIND, either express or implied. See the License for the -specific language governing permissions and limitations -under the License. 
-""" -from datetime import date -from math import isclose - -import pytest -import pandas as pd -import numpy as np - -from pandas import Timestamp -from pandas.util.testing import assert_series_equal -from numpy.testing import assert_raises, assert_array_equal, assert_allclose - -import gs_quant.timeseries as ts - - -def test_arima_fit(): - test_dict = { - 'High': {Timestamp('1989-01-03 00:00:00'): 3.575721263885498, - Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-05 00:00:00'): 3.62580132484436, - Timestamp('1989-01-06 00:00:00'): 3.62580132484436, - Timestamp('1989-01-09 00:00:00'): 3.575721263885498, - Timestamp('1989-01-10 00:00:00'): 3.575721263885498, - Timestamp('1989-01-11 00:00:00'): 3.5657050609588623, - Timestamp('1989-01-12 00:00:00'): 3.635817289352417, - Timestamp('1989-01-13 00:00:00'): 3.615785360336304, - Timestamp('1989-01-16 00:00:00'): 3.615785360336304, - Timestamp('1989-01-17 00:00:00'): 3.635817289352417, - Timestamp('1989-01-18 00:00:00'): 3.675881385803223, - Timestamp('1989-01-19 00:00:00'): 3.695913553237915, - Timestamp('1989-01-20 00:00:00'): 3.665865421295166, - Timestamp('1989-01-23 00:00:00'): 3.675881385803223, - Timestamp('1989-01-24 00:00:00'): 3.675881385803223, - Timestamp('1989-01-25 00:00:00'): 3.695913553237915, - Timestamp('1989-01-26 00:00:00'): 3.7760417461395264, - Timestamp('1989-01-27 00:00:00'): 3.8561699390411377, - Timestamp('1989-01-30 00:00:00'): 3.8561699390411377}, - 'Low': {Timestamp('1989-01-03 00:00:00'): 3.4855768680572514, - Timestamp('1989-01-04 00:00:00'): 3.5356571674346924, - Timestamp('1989-01-05 00:00:00'): 3.575721263885498, - Timestamp('1989-01-06 00:00:00'): 3.575721263885498, - Timestamp('1989-01-09 00:00:00'): 3.5356571674346924, - Timestamp('1989-01-10 00:00:00'): 3.5356571674346924, - Timestamp('1989-01-11 00:00:00'): 3.5256409645080566, - Timestamp('1989-01-12 00:00:00'): 3.5456731319427486, - Timestamp('1989-01-13 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, - Timestamp('1989-01-17 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-18 00:00:00'): 3.615785360336304, - Timestamp('1989-01-19 00:00:00'): 3.655849456787109, - Timestamp('1989-01-20 00:00:00'): 3.62580132484436, - Timestamp('1989-01-23 00:00:00'): 3.615785360336304, - Timestamp('1989-01-24 00:00:00'): 3.615785360336304, - Timestamp('1989-01-25 00:00:00'): 3.655849456787109, - Timestamp('1989-01-26 00:00:00'): 3.665865421295166, - Timestamp('1989-01-27 00:00:00'): 3.79607367515564, - Timestamp('1989-01-30 00:00:00'): 3.786057710647583}, - 'Close': {Timestamp('1989-01-03 00:00:00'): 3.5256409645080566, - Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-05 00:00:00'): 3.575721263885498, - Timestamp('1989-01-06 00:00:00'): 3.575721263885498, - Timestamp('1989-01-09 00:00:00'): 3.575721263885498, - Timestamp('1989-01-10 00:00:00'): 3.5556890964508057, - Timestamp('1989-01-11 00:00:00'): 3.5556890964508057, - Timestamp('1989-01-12 00:00:00'): 3.605769157409668, - Timestamp('1989-01-13 00:00:00'): 3.605769157409668, - Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, - Timestamp('1989-01-17 00:00:00'): 3.62580132484436, - Timestamp('1989-01-18 00:00:00'): 3.675881385803223, - Timestamp('1989-01-19 00:00:00'): 3.665865421295166, - Timestamp('1989-01-20 00:00:00'): 3.6458332538604736, - Timestamp('1989-01-23 00:00:00'): 3.62580132484436, - Timestamp('1989-01-24 00:00:00'): 3.675881385803223, - Timestamp('1989-01-25 00:00:00'): 3.675881385803223, - 
Timestamp('1989-01-26 00:00:00'): 3.756009578704834, - Timestamp('1989-01-27 00:00:00'): 3.79607367515564, - Timestamp('1989-01-30 00:00:00'): 3.846153736114502}, - } - - test_df = pd.DataFrame(test_dict) - arima = ts.arima() - arima.fit(test_df, train_size=0.8, freq='B', q_vals=[0]) - transformed_test_df = arima.transform(test_df) - - for col in transformed_test_df.keys(): - count_nans = arima.best_params[col].p + arima.best_params[col].d - assert(count_nans == transformed_test_df[col].isna().sum()) - - # Test (1,2,0) Model - diff_test_df_high = test_df['High'].diff().diff() - assert(transformed_test_df['High'][3] == (arima.best_params['High'].const + diff_test_df_high[2] * arima.best_params['High'].ar_coef[0])) - assert(transformed_test_df['High'][4] == (arima.best_params['High'].const + diff_test_df_high[3] * arima.best_params['High'].ar_coef[0])) - assert(transformed_test_df['High'][-1] == (arima.best_params['High'].const + diff_test_df_high[-2] * arima.best_params['High'].ar_coef[0])) - - # Test (2,1,0) Model - diff_test_df_low = test_df['Low'].diff() - assert(isclose(transformed_test_df['Low'][3], (arima.best_params['Low'].const + diff_test_df_low[2] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[1] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) - assert(isclose(transformed_test_df['Low'][4], (arima.best_params['Low'].const + diff_test_df_low[3] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[2] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) - assert(isclose(transformed_test_df['Low'][-1], (arima.best_params['Low'].const + diff_test_df_low[-2] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[-3] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) - - # Test (1,2,0) Model - diff_test_df_close = test_df['Close'].diff()[1:].diff() - first_day = pd.Series([np.nan]) - first_day.index = [diff_test_df_close.index[0] - pd.DateOffset(days=1)] - first_day.name = 'Close' - diff_test_df_close = pd.concat([first_day, diff_test_df_close]) - diff_test_df_close.index.name = "Date" - - assert(transformed_test_df['Close'][4] == (arima.best_params['Close'].const + diff_test_df_close[3] * arima.best_params['Close'].ar_coef[0])) - assert(transformed_test_df['Close'][5] == (arima.best_params['Close'].const + diff_test_df_close[4] * arima.best_params['Close'].ar_coef[0])) - assert(transformed_test_df['Close'][-1] == (arima.best_params['Close'].const + diff_test_df_close[-2] * arima.best_params['Close'].ar_coef[0])) - - -if __name__ == "__main__": - pytest.main(args=["test_arima.py"]) diff --git a/gs_quant/test/timeseries/test_econometrics.py b/gs_quant/test/timeseries/test_econometrics.py index e30882be..ad5f74d2 100644 --- a/gs_quant/test/timeseries/test_econometrics.py +++ b/gs_quant/test/timeseries/test_econometrics.py @@ -14,7 +14,10 @@ under the License. 
""" +from math import isclose + import pytest +from pandas import Timestamp from pandas.util.testing import assert_series_equal from gs_quant.timeseries import * @@ -366,5 +369,103 @@ def test_max_drawdown(): assert_series_equal(output_window, pd.Series([0.0, 0.0, 0.0, -0.2, -0.2, -0.75]), obj="Max drawdown window") +def test_arima_fit(): + test_dict = { + 'High': {Timestamp('1989-01-03 00:00:00'): 3.575721263885498, + Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-05 00:00:00'): 3.62580132484436, + Timestamp('1989-01-06 00:00:00'): 3.62580132484436, + Timestamp('1989-01-09 00:00:00'): 3.575721263885498, + Timestamp('1989-01-10 00:00:00'): 3.575721263885498, + Timestamp('1989-01-11 00:00:00'): 3.5657050609588623, + Timestamp('1989-01-12 00:00:00'): 3.635817289352417, + Timestamp('1989-01-13 00:00:00'): 3.615785360336304, + Timestamp('1989-01-16 00:00:00'): 3.615785360336304, + Timestamp('1989-01-17 00:00:00'): 3.635817289352417, + Timestamp('1989-01-18 00:00:00'): 3.675881385803223, + Timestamp('1989-01-19 00:00:00'): 3.695913553237915, + Timestamp('1989-01-20 00:00:00'): 3.665865421295166, + Timestamp('1989-01-23 00:00:00'): 3.675881385803223, + Timestamp('1989-01-24 00:00:00'): 3.675881385803223, + Timestamp('1989-01-25 00:00:00'): 3.695913553237915, + Timestamp('1989-01-26 00:00:00'): 3.7760417461395264, + Timestamp('1989-01-27 00:00:00'): 3.8561699390411377, + Timestamp('1989-01-30 00:00:00'): 3.8561699390411377}, + 'Low': {Timestamp('1989-01-03 00:00:00'): 3.4855768680572514, + Timestamp('1989-01-04 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-05 00:00:00'): 3.575721263885498, + Timestamp('1989-01-06 00:00:00'): 3.575721263885498, + Timestamp('1989-01-09 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-10 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-11 00:00:00'): 3.5256409645080566, + Timestamp('1989-01-12 00:00:00'): 3.5456731319427486, + Timestamp('1989-01-13 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, + Timestamp('1989-01-17 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-18 00:00:00'): 3.615785360336304, + Timestamp('1989-01-19 00:00:00'): 3.655849456787109, + Timestamp('1989-01-20 00:00:00'): 3.62580132484436, + Timestamp('1989-01-23 00:00:00'): 3.615785360336304, + Timestamp('1989-01-24 00:00:00'): 3.615785360336304, + Timestamp('1989-01-25 00:00:00'): 3.655849456787109, + Timestamp('1989-01-26 00:00:00'): 3.665865421295166, + Timestamp('1989-01-27 00:00:00'): 3.79607367515564, + Timestamp('1989-01-30 00:00:00'): 3.786057710647583}, + 'Close': {Timestamp('1989-01-03 00:00:00'): 3.5256409645080566, + Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-05 00:00:00'): 3.575721263885498, + Timestamp('1989-01-06 00:00:00'): 3.575721263885498, + Timestamp('1989-01-09 00:00:00'): 3.575721263885498, + Timestamp('1989-01-10 00:00:00'): 3.5556890964508057, + Timestamp('1989-01-11 00:00:00'): 3.5556890964508057, + Timestamp('1989-01-12 00:00:00'): 3.605769157409668, + Timestamp('1989-01-13 00:00:00'): 3.605769157409668, + Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, + Timestamp('1989-01-17 00:00:00'): 3.62580132484436, + Timestamp('1989-01-18 00:00:00'): 3.675881385803223, + Timestamp('1989-01-19 00:00:00'): 3.665865421295166, + Timestamp('1989-01-20 00:00:00'): 3.6458332538604736, + Timestamp('1989-01-23 00:00:00'): 3.62580132484436, + Timestamp('1989-01-24 00:00:00'): 3.675881385803223, + Timestamp('1989-01-25 00:00:00'): 3.675881385803223, + 
Timestamp('1989-01-26 00:00:00'): 3.756009578704834, + Timestamp('1989-01-27 00:00:00'): 3.79607367515564, + Timestamp('1989-01-30 00:00:00'): 3.846153736114502}, + } + + test_df = pd.DataFrame(test_dict) + arima = econometrics.arima() + arima.fit(test_df, train_size=0.8, freq='B', q_vals=[0]) + transformed_test_df = arima.transform(test_df) + + for col in transformed_test_df.keys(): + count_nans = arima.best_params[col].p + arima.best_params[col].d + assert(count_nans == transformed_test_df[col].isna().sum()) + + # Test (1,2,0) Model + diff_test_df_high = test_df['High'].diff().diff() + assert(transformed_test_df['High'][3] == (arima.best_params['High'].const + diff_test_df_high[2] * arima.best_params['High'].ar_coef[0])) + assert(transformed_test_df['High'][4] == (arima.best_params['High'].const + diff_test_df_high[3] * arima.best_params['High'].ar_coef[0])) + assert(transformed_test_df['High'][-1] == (arima.best_params['High'].const + diff_test_df_high[-2] * arima.best_params['High'].ar_coef[0])) + + # Test (2,1,0) Model + diff_test_df_low = test_df['Low'].diff() + assert(isclose(transformed_test_df['Low'][3], (arima.best_params['Low'].const + diff_test_df_low[2] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[1] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Low'][4], (arima.best_params['Low'].const + diff_test_df_low[3] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[2] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Low'][-1], (arima.best_params['Low'].const + diff_test_df_low[-2] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[-3] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + + # Test (1,2,0) Model + diff_test_df_close = test_df['Close'].diff()[1:].diff() + first_day = pd.Series([np.nan]) + first_day.index = [diff_test_df_close.index[0] - pd.DateOffset(days=1)] + first_day.name = 'Close' + diff_test_df_close = pd.concat([first_day, diff_test_df_close]) + diff_test_df_close.index.name = "Date" + + assert(transformed_test_df['Close'][4] == (arima.best_params['Close'].const + diff_test_df_close[3] * arima.best_params['Close'].ar_coef[0])) + assert(transformed_test_df['Close'][5] == (arima.best_params['Close'].const + diff_test_df_close[4] * arima.best_params['Close'].ar_coef[0])) + assert(transformed_test_df['Close'][-1] == (arima.best_params['Close'].const + diff_test_df_close[-2] * arima.best_params['Close'].ar_coef[0])) + + if __name__ == "__main__": pytest.main(args=["test_econometrics.py"]) diff --git a/gs_quant/timeseries/arima.py b/gs_quant/timeseries/arima.py deleted file mode 100644 index 7639e69e..00000000 --- a/gs_quant/timeseries/arima.py +++ /dev/null @@ -1,296 +0,0 @@ -# Copyright 2020 Goldman Sachs. -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# http://www.apache.org/licenses/LICENSE-2.0 -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from dataclasses import dataclass -from typing import Iterable, Optional, Union, Tuple - -import itertools -import datetime as dt -import pandas as pd -import numpy as np -from statsmodels.tsa.arima_model import ARIMA -from statsmodels.tools.eval_measures import mse - - -@dataclass -class ARIMA_BestParams: - freq: str = '' - p: int = None - d: int = None - q: int = None - const: float = None - ar_coef: list = None - ma_coef: list = None - resid: list = None - series: pd.Series = None - - -class arima(): - """ - ARIMA is the Autoregressive Integrated Moving Average Model and is used - to normalize and forecast time series data. ARIMA has 3 parameters: - (p, d, q) where: - :p is the number of autoregressive terms - :d is the number of nonseasonal differences - :q is the number of lagged forecast errors in the prediction equation - - An ARIMA model is selected from the Catesian product of sets p, q, and d. - The time series is split into train and test sets and an ARIMA model is fit - for every combination on the training set. The model with the lowest - mean-squared error (MSE) on the test set is selected as the best model. The - original times series can then be transformed by the best model. - """ - - def __init__(self): - self.best_params = {} - - def _evaluate_arima_model( - self, X: - Union[pd.Series, pd.DataFrame], - arima_order: Tuple[int, int, int], - train_size: Union[float, int, None], - freq: str - ) -> Tuple[float, dict]: - if type(train_size) == float: - train_size = int(len(X) * train_size) - train, test = X[:train_size].astype(float), X[train_size:].astype(float) - elif type(train_size) == int: - train, test = X[:train_size].astype(float), X[train_size:].astype(float) - elif train_size is None: - train_size = int(len(X) * 0.75) - train, test = X[:train_size].astype(float), X[train_size:].astype(float) - else: - raise ValueError('train_size is not int, float, or None') - - model = ARIMA(train, order=arima_order, freq=freq) - model_fit = model.fit(disp=False, method='css', trend='nc') - ar_coef = model_fit.arparams - ma_coef = model_fit.maparams - resid = model_fit.resid - - model_params = model_fit.params.to_dict() - const = model_params.get('const', 0) - - # calculate test error - yhat = model_fit.forecast(len(test))[0] - error = mse(test, yhat) - - return error, const, ar_coef, ma_coef, resid - - def fit( - self, - X: Union[pd.Series, pd.DataFrame], - train_size: Union[float, int, None]=None, - p_vals: list=[0, 1, 2], - d_vals: list=[0, 1, 2], - q_vals: list=[0, 1, 2], - freq: str=None - ) -> 'arima': - """ - Train a combination of ARIMA models. If pandas DataFrame, finds the - best arima model parameters for each column. If pandas Series, finds - the best arima model parameters for the series. - - :param X: time series to be operated on; required parameter - :param train_size: if float, should be between 0.0 and 1.0 and - represent the proportion of the dataset to include in the train split. - If int, represents the absolute number of train samples. 
If None, - the value is automatically set 0.75 - :p_vals: number of autoregressive terms to search; default is [0,1,2] - :d_vals: number of differences to search; default is [0,1,2] - :q_vals: number of lagged forecast to search; always [0,1,2] - :freq: frequency of time series, default is None - :return: self - """ - - if isinstance(X, pd.Series): - X = X.to_frame() - - if not isinstance(X, pd.DataFrame): - raise ValueError('Not DataFrame or Series!') - - for series_id in X.columns: - series = X[series_id] - best_score = float('inf') - best_order = None - best_const = None - best_ar_coef = None - best_ma_coef = None - best_resid = None - for order in list(itertools.product(*[p_vals, d_vals, q_vals])): - try: - mse, const, ar_coef, ma_coef, resid = self._evaluate_arima_model(series, order, train_size, freq) - if mse < best_score: - best_score = mse - best_order = order - best_const = const - best_ar_coef = ar_coef - best_ma_coef = ma_coef - best_resid = resid - except Exception as e: - print(' {}'.format(e)) - continue - - p, d, q = best_order - assert(p == len(best_ar_coef)) - - self.best_params[series_id] = ARIMA_BestParams( - freq=freq, - p=p, - d=d, - q=q, - const=best_const, - ar_coef=best_ar_coef, - ma_coef=best_ma_coef, - resid=best_resid, - series=series) - - return self - - def _difference(self, X: pd.Series, d: int): - """Helper Function to Difference Time Series n Times""" - - if d == 0: - return X - elif d == 1: - return X.diff() - else: - return self._difference(X.diff(), d-1) - - def _lagged_values(self, X: pd.Series, p: int, ar_coef: list): - """Helper Function to Calculate AutoRegressive(AR) Component""" - - if p == 0: - return X - elif p > 0: - transformed_df = pd.concat([X.copy().shift(periods=i) - for i in range(1, p+1)], axis=1) - transformed_df = transformed_df.dot(ar_coef) - else: - raise ValueError("p should not be less than 0!") - return transformed_df - - def _calculate_residuals( - self, - X_ar: pd.Series, - X_diff: pd.Series, - p: int, - d: int, - q: int, - ar_coef: list, - ma_coef: list, - freq: str - ): - """Helper Function to Calculate Residuals/MA Component""" - - ma_coef = ma_coef[::-1] - resid = X_ar.copy(deep=True) - resid[:] = 0 - - X_ma = X_ar.copy(deep=True) - X_ma[:] = np.nan - - for x in range(p + d, len(X_ar)): - ma_component = resid[x-q: x].dot(ma_coef) - prediction = X_ar[x] + ma_component - residual = X_diff[x] - prediction - resid[x] = residual - X_ma[x] = prediction - - return resid, X_ma - - def _arima_transform_series( - self, - X: pd.Series, - p: int, - d: int, - q: int, - const: float, - ar_coef: list, - ma_coef: list, - resid: list, - freq: str - ) -> pd.Series: - """Helper Function to Transform Series""" - - # Difference first - X_diff = self._difference(X, d) - - # Calculate Autoregressive Component - X_diff_ar = self._lagged_values(X_diff, p, ar_coef) - - # Caluclate Residuals and Moving Average Component - calcualted_resid, X_diff_ar_ma = self._calculate_residuals(X_diff_ar, - X_diff, - p, - d, - q, - ar_coef, - ma_coef, - freq) - - # Check calculated residuals are close with ARIMA statsmodels residuals - resid_df = pd.concat([calcualted_resid, resid], axis=1, join='inner') - assert(np.allclose(resid_df[resid_df.columns[0]], - resid_df[resid_df.columns[1]])) - - return X_diff_ar_ma - - def _arima_transform_df(self, X: pd.DataFrame) -> pd.DataFrame: - """Helper Function to Transform DataFrame""" - - series = {} - for series_id in X.columns: - freq = self.best_params[series_id].freq - p = self.best_params[series_id].p - d = 
self.best_params[series_id].d - q = self.best_params[series_id].q - const = self.best_params[series_id].const - ar_coef = self.best_params[series_id].ar_coef - ma_coef = self.best_params[series_id].ma_coef - resid = self.best_params[series_id].resid - - series[series_id] = self._arima_transform_series( - X[series_id], - p=p, - d=d, - q=q, - const=const, - ar_coef=ar_coef, - ma_coef=ma_coef, - resid=resid, - freq=freq - ) - - return pd.DataFrame(series) - - def transform( - self, - X: Union[pd.Series, pd.DataFrame] - ) -> Union[pd.DataFrame]: - """ - Transform a series based on the best ARIMA found from fit(). - Does not support tranformation using MA components. - - :param X: time series to be operated on; required parameter - :return: DataFrame - """ - - if isinstance(X, pd.Series): - X = X.to_frame() - - if isinstance(X, pd.DataFrame): - transformed = self._arima_transform_df(X) - else: - raise ValueError('Not DataFrame or Series!') - - return transformed diff --git a/gs_quant/timeseries/econometrics.py b/gs_quant/timeseries/econometrics.py index 95c07f1d..76909ef3 100644 --- a/gs_quant/timeseries/econometrics.py +++ b/gs_quant/timeseries/econometrics.py @@ -15,6 +15,16 @@ # should be fully documented: docstrings should describe parameters and the return value, and provide a 1-line # description. Type annotations should be provided for parameters. +import itertools +import datetime as dt +from dataclasses import dataclass +from typing import Iterable, Optional, Union, Tuple + +import numpy as np +import pandas as pd +from statsmodels.tsa.arima_model import ARIMA +from statsmodels.tools.eval_measures import mse + from .statistics import * from ..errors import * @@ -500,3 +510,284 @@ def max_drawdown(x: pd.Series, w: Union[Window, int] = Window(None, 0)) -> pd.Se rolling_max = x.rolling(w.w, 0).max() result = (x / rolling_max - 1).rolling(w.w, 0).min() return apply_ramp(result, w) + + +@dataclass +class ARIMA_BestParams: + freq: str = '' + p: int = None + d: int = None + q: int = None + const: float = None + ar_coef: list = None + ma_coef: list = None + resid: list = None + series: pd.Series = None + + +class arima(): + """ + ARIMA is the Autoregressive Integrated Moving Average Model and is used + to normalize and forecast time series data. ARIMA has 3 parameters: + (p, d, q) where: + :p is the number of autoregressive terms + :d is the number of nonseasonal differences + :q is the number of lagged forecast errors in the prediction equation + + An ARIMA model is selected from the Cartesian product of sets p, q, and d. + The time series is split into train and test sets and an ARIMA model is fit + for every combination on the training set. The model with the lowest + mean-squared error (MSE) on the test set is selected as the best model. The + original time series can then be transformed by the best model.
+ + **Examples** + >>> series = generate_series(100) + >>> arima = econometrics.arima() + >>> arima.fit(series, train_size=0.8) + >>> transformed_time_series = arima.transform(series) + """ + + def __init__(self): + self.best_params = {} + + def _evaluate_arima_model( + self, X: + Union[pd.Series, pd.DataFrame], + arima_order: Tuple[int, int, int], + train_size: Union[float, int, None], + freq: str + ) -> Tuple[float, dict]: + if type(train_size) == float: + train_size = int(len(X) * train_size) + train, test = X[:train_size].astype(float), X[train_size:].astype(float) + elif type(train_size) == int: + train, test = X[:train_size].astype(float), X[train_size:].astype(float) + elif train_size is None: + train_size = int(len(X) * 0.75) + train, test = X[:train_size].astype(float), X[train_size:].astype(float) + else: + raise ValueError('train_size is not int, float, or None') + + model = ARIMA(train, order=arima_order, freq=freq) + model_fit = model.fit(disp=False, method='css', trend='nc') + ar_coef = model_fit.arparams + ma_coef = model_fit.maparams + resid = model_fit.resid + + model_params = model_fit.params.to_dict() + const = model_params.get('const', 0) + + # calculate test error + yhat = model_fit.forecast(len(test))[0] + error = mse(test, yhat) + + return error, const, ar_coef, ma_coef, resid + + def fit( + self, + X: Union[pd.Series, pd.DataFrame], + train_size: Union[float, int, None]=None, + p_vals: list=[0, 1, 2], + d_vals: list=[0, 1, 2], + q_vals: list=[0, 1, 2], + freq: str=None + ) -> 'arima': + """ + Train a combination of ARIMA models. If pandas DataFrame, finds the + best arima model parameters for each column. If pandas Series, finds + the best arima model parameters for the series. + + :param X: time series to be operated on; required parameter + :param train_size: if float, should be between 0.0 and 1.0 and + represent the proportion of the dataset to include in the train split. + If int, represents the absolute number of train samples. 
If None, + the value is automatically set 0.75 + :p_vals: number of autoregressive terms to search; default is [0,1,2] + :d_vals: number of differences to search; default is [0,1,2] + :q_vals: number of lagged forecast to search; always [0,1,2] + :freq: frequency of time series, default is None + :return: self + """ + + if isinstance(X, pd.Series): + X = X.to_frame() + + if not isinstance(X, pd.DataFrame): + raise ValueError('Not DataFrame or Series!') + + for series_id in X.columns: + series = X[series_id] + best_score = float('inf') + best_order = None + best_const = None + best_ar_coef = None + best_ma_coef = None + best_resid = None + for order in list(itertools.product(*[p_vals, d_vals, q_vals])): + try: + mse, const, ar_coef, ma_coef, resid = self._evaluate_arima_model(series, order, train_size, freq) + if mse < best_score: + best_score = mse + best_order = order + best_const = const + best_ar_coef = ar_coef + best_ma_coef = ma_coef + best_resid = resid + except Exception as e: + print(' {}'.format(e)) + continue + + p, d, q = best_order + assert(p == len(best_ar_coef)) + + self.best_params[series_id] = ARIMA_BestParams( + freq=freq, + p=p, + d=d, + q=q, + const=best_const, + ar_coef=best_ar_coef, + ma_coef=best_ma_coef, + resid=best_resid, + series=series) + + return self + + def _difference(self, X: pd.Series, d: int): + """Helper Function to Difference Time Series n Times""" + + if d == 0: + return X + elif d == 1: + return X.diff() + else: + return self._difference(X.diff(), d-1) + + def _lagged_values(self, X: pd.Series, p: int, ar_coef: list): + """Helper Function to Calculate AutoRegressive(AR) Component""" + + if p == 0: + return X + elif p > 0: + transformed_df = pd.concat([X.copy().shift(periods=i) + for i in range(1, p+1)], axis=1) + transformed_df = transformed_df.dot(ar_coef) + else: + raise ValueError("p should not be less than 0!") + return transformed_df + + def _calculate_residuals( + self, + X_ar: pd.Series, + X_diff: pd.Series, + p: int, + d: int, + q: int, + ar_coef: list, + ma_coef: list, + freq: str + ): + """Helper Function to Calculate Residuals/MA Component""" + + ma_coef = ma_coef[::-1] + resid = X_ar.copy(deep=True) + resid[:] = 0 + + X_ma = X_ar.copy(deep=True) + X_ma[:] = np.nan + + for x in range(p + d, len(X_ar)): + ma_component = resid[x-q: x].dot(ma_coef) + prediction = X_ar[x] + ma_component + residual = X_diff[x] - prediction + resid[x] = residual + X_ma[x] = prediction + + return resid, X_ma + + def _arima_transform_series( + self, + X: pd.Series, + p: int, + d: int, + q: int, + const: float, + ar_coef: list, + ma_coef: list, + resid: list, + freq: str + ) -> pd.Series: + """Helper Function to Transform Series""" + + # Difference first + X_diff = self._difference(X, d) + + # Calculate Autoregressive Component + X_diff_ar = self._lagged_values(X_diff, p, ar_coef) + + # Caluclate Residuals and Moving Average Component + calcualted_resid, X_diff_ar_ma = self._calculate_residuals(X_diff_ar, + X_diff, + p, + d, + q, + ar_coef, + ma_coef, + freq) + + # Check calculated residuals are close with ARIMA statsmodels residuals + resid_df = pd.concat([calcualted_resid, resid], axis=1, join='inner') + assert(np.allclose(resid_df[resid_df.columns[0]], + resid_df[resid_df.columns[1]])) + + return X_diff_ar_ma + + def _arima_transform_df(self, X: pd.DataFrame) -> pd.DataFrame: + """Helper Function to Transform DataFrame""" + + series = {} + for series_id in X.columns: + freq = self.best_params[series_id].freq + p = self.best_params[series_id].p + d = 
self.best_params[series_id].d + q = self.best_params[series_id].q + const = self.best_params[series_id].const + ar_coef = self.best_params[series_id].ar_coef + ma_coef = self.best_params[series_id].ma_coef + resid = self.best_params[series_id].resid + + series[series_id] = self._arima_transform_series( + X[series_id], + p=p, + d=d, + q=q, + const=const, + ar_coef=ar_coef, + ma_coef=ma_coef, + resid=resid, + freq=freq + ) + + return pd.DataFrame(series) + + def transform( + self, + X: Union[pd.Series, pd.DataFrame] + ) -> Union[pd.DataFrame]: + """ + Transform a series based on the best ARIMA found from fit(). + Does not support tranformation using MA components. + + :param X: time series to be operated on; required parameter + :return: DataFrame + """ + + if isinstance(X, pd.Series): + X = X.to_frame() + + if isinstance(X, pd.DataFrame): + transformed = self._arima_transform_df(X) + else: + raise ValueError('Not DataFrame or Series!') + + return transformed From 36d3294605b44a55a8a0b59f283c13eb4adcbf16 Mon Sep 17 00:00:00 2001 From: Maverick Lin Date: Wed, 8 Apr 2020 17:38:10 -0400 Subject: [PATCH 07/10] remove arima from timeseries __init__.py --- gs_quant/timeseries/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/gs_quant/timeseries/__init__.py b/gs_quant/timeseries/__init__.py index 992ca9d6..1b7777be 100644 --- a/gs_quant/timeseries/__init__.py +++ b/gs_quant/timeseries/__init__.py @@ -22,6 +22,5 @@ from .technicals import * from .measures import * from .helper import * -from .arima import * __name__ = 'timeseries' From 94a1596980ea708cfa984892bd18f6dbb41bb3b7 Mon Sep 17 00:00:00 2001 From: Maverick Lin Date: Mon, 20 Apr 2020 15:06:03 -0400 Subject: [PATCH 08/10] pep8 and added code coverage for econometrics --- gs_quant/test/timeseries/test_econometrics.py | 238 +++++++++++------- gs_quant/timeseries/econometrics.py | 38 +-- 2 files changed, 158 insertions(+), 118 deletions(-) diff --git a/gs_quant/test/timeseries/test_econometrics.py b/gs_quant/test/timeseries/test_econometrics.py index ad5f74d2..4a8cfb15 100644 --- a/gs_quant/test/timeseries/test_econometrics.py +++ b/gs_quant/test/timeseries/test_econometrics.py @@ -371,100 +371,156 @@ def test_max_drawdown(): def test_arima_fit(): test_dict = { - 'High': {Timestamp('1989-01-03 00:00:00'): 3.575721263885498, - Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-05 00:00:00'): 3.62580132484436, - Timestamp('1989-01-06 00:00:00'): 3.62580132484436, - Timestamp('1989-01-09 00:00:00'): 3.575721263885498, - Timestamp('1989-01-10 00:00:00'): 3.575721263885498, - Timestamp('1989-01-11 00:00:00'): 3.5657050609588623, - Timestamp('1989-01-12 00:00:00'): 3.635817289352417, - Timestamp('1989-01-13 00:00:00'): 3.615785360336304, - Timestamp('1989-01-16 00:00:00'): 3.615785360336304, - Timestamp('1989-01-17 00:00:00'): 3.635817289352417, - Timestamp('1989-01-18 00:00:00'): 3.675881385803223, - Timestamp('1989-01-19 00:00:00'): 3.695913553237915, - Timestamp('1989-01-20 00:00:00'): 3.665865421295166, - Timestamp('1989-01-23 00:00:00'): 3.675881385803223, - Timestamp('1989-01-24 00:00:00'): 3.675881385803223, - Timestamp('1989-01-25 00:00:00'): 3.695913553237915, - Timestamp('1989-01-26 00:00:00'): 3.7760417461395264, - Timestamp('1989-01-27 00:00:00'): 3.8561699390411377, - Timestamp('1989-01-30 00:00:00'): 3.8561699390411377}, - 'Low': {Timestamp('1989-01-03 00:00:00'): 3.4855768680572514, - Timestamp('1989-01-04 00:00:00'): 3.5356571674346924, - Timestamp('1989-01-05 00:00:00'): 3.575721263885498, - 
Timestamp('1989-01-06 00:00:00'): 3.575721263885498, - Timestamp('1989-01-09 00:00:00'): 3.5356571674346924, - Timestamp('1989-01-10 00:00:00'): 3.5356571674346924, - Timestamp('1989-01-11 00:00:00'): 3.5256409645080566, - Timestamp('1989-01-12 00:00:00'): 3.5456731319427486, - Timestamp('1989-01-13 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, - Timestamp('1989-01-17 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-18 00:00:00'): 3.615785360336304, - Timestamp('1989-01-19 00:00:00'): 3.655849456787109, - Timestamp('1989-01-20 00:00:00'): 3.62580132484436, - Timestamp('1989-01-23 00:00:00'): 3.615785360336304, - Timestamp('1989-01-24 00:00:00'): 3.615785360336304, - Timestamp('1989-01-25 00:00:00'): 3.655849456787109, - Timestamp('1989-01-26 00:00:00'): 3.665865421295166, - Timestamp('1989-01-27 00:00:00'): 3.79607367515564, - Timestamp('1989-01-30 00:00:00'): 3.786057710647583}, - 'Close': {Timestamp('1989-01-03 00:00:00'): 3.5256409645080566, - Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, - Timestamp('1989-01-05 00:00:00'): 3.575721263885498, - Timestamp('1989-01-06 00:00:00'): 3.575721263885498, - Timestamp('1989-01-09 00:00:00'): 3.575721263885498, - Timestamp('1989-01-10 00:00:00'): 3.5556890964508057, - Timestamp('1989-01-11 00:00:00'): 3.5556890964508057, - Timestamp('1989-01-12 00:00:00'): 3.605769157409668, - Timestamp('1989-01-13 00:00:00'): 3.605769157409668, - Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, - Timestamp('1989-01-17 00:00:00'): 3.62580132484436, - Timestamp('1989-01-18 00:00:00'): 3.675881385803223, - Timestamp('1989-01-19 00:00:00'): 3.665865421295166, - Timestamp('1989-01-20 00:00:00'): 3.6458332538604736, - Timestamp('1989-01-23 00:00:00'): 3.62580132484436, - Timestamp('1989-01-24 00:00:00'): 3.675881385803223, - Timestamp('1989-01-25 00:00:00'): 3.675881385803223, - Timestamp('1989-01-26 00:00:00'): 3.756009578704834, - Timestamp('1989-01-27 00:00:00'): 3.79607367515564, - Timestamp('1989-01-30 00:00:00'): 3.846153736114502}, - } + 'High': { + Timestamp('1989-01-03 00:00:00'): 3.575721263885498, + Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-05 00:00:00'): 3.62580132484436, + Timestamp('1989-01-06 00:00:00'): 3.62580132484436, + Timestamp('1989-01-09 00:00:00'): 3.575721263885498, + Timestamp('1989-01-10 00:00:00'): 3.575721263885498, + Timestamp('1989-01-11 00:00:00'): 3.5657050609588623, + Timestamp('1989-01-12 00:00:00'): 3.635817289352417, + Timestamp('1989-01-13 00:00:00'): 3.615785360336304, + Timestamp('1989-01-16 00:00:00'): 3.615785360336304, + Timestamp('1989-01-17 00:00:00'): 3.635817289352417, + Timestamp('1989-01-18 00:00:00'): 3.675881385803223, + Timestamp('1989-01-19 00:00:00'): 3.695913553237915, + Timestamp('1989-01-20 00:00:00'): 3.665865421295166, + Timestamp('1989-01-23 00:00:00'): 3.675881385803223, + Timestamp('1989-01-24 00:00:00'): 3.675881385803223, + Timestamp('1989-01-25 00:00:00'): 3.695913553237915, + Timestamp('1989-01-26 00:00:00'): 3.7760417461395264, + Timestamp('1989-01-27 00:00:00'): 3.8561699390411377, + Timestamp('1989-01-30 00:00:00'): 3.8561699390411377}, + 'Low': { + Timestamp('1989-01-03 00:00:00'): 3.4855768680572514, + Timestamp('1989-01-04 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-05 00:00:00'): 3.575721263885498, + Timestamp('1989-01-06 00:00:00'): 3.575721263885498, + Timestamp('1989-01-09 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-10 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-11 00:00:00'): 
3.5256409645080566, + Timestamp('1989-01-12 00:00:00'): 3.5456731319427486, + Timestamp('1989-01-13 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, + Timestamp('1989-01-17 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-18 00:00:00'): 3.615785360336304, + Timestamp('1989-01-19 00:00:00'): 3.655849456787109, + Timestamp('1989-01-20 00:00:00'): 3.62580132484436, + Timestamp('1989-01-23 00:00:00'): 3.615785360336304, + Timestamp('1989-01-24 00:00:00'): 3.615785360336304, + Timestamp('1989-01-25 00:00:00'): 3.655849456787109, + Timestamp('1989-01-26 00:00:00'): 3.665865421295166, + Timestamp('1989-01-27 00:00:00'): 3.79607367515564, + Timestamp('1989-01-30 00:00:00'): 3.786057710647583}, + 'Close': { + Timestamp('1989-01-03 00:00:00'): 3.5256409645080566, + Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-05 00:00:00'): 3.575721263885498, + Timestamp('1989-01-06 00:00:00'): 3.575721263885498, + Timestamp('1989-01-09 00:00:00'): 3.575721263885498, + Timestamp('1989-01-10 00:00:00'): 3.5556890964508057, + Timestamp('1989-01-11 00:00:00'): 3.5556890964508057, + Timestamp('1989-01-12 00:00:00'): 3.605769157409668, + Timestamp('1989-01-13 00:00:00'): 3.605769157409668, + Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, + Timestamp('1989-01-17 00:00:00'): 3.62580132484436, + Timestamp('1989-01-18 00:00:00'): 3.675881385803223, + Timestamp('1989-01-19 00:00:00'): 3.665865421295166, + Timestamp('1989-01-20 00:00:00'): 3.6458332538604736, + Timestamp('1989-01-23 00:00:00'): 3.62580132484436, + Timestamp('1989-01-24 00:00:00'): 3.675881385803223, + Timestamp('1989-01-25 00:00:00'): 3.675881385803223, + Timestamp('1989-01-26 00:00:00'): 3.756009578704834, + Timestamp('1989-01-27 00:00:00'): 3.79607367515564, + Timestamp('1989-01-30 00:00:00'): 3.846153736114502}, + } test_df = pd.DataFrame(test_dict) arima = econometrics.arima() - arima.fit(test_df, train_size=0.8, freq='B', q_vals=[0]) - transformed_test_df = arima.transform(test_df) - - for col in transformed_test_df.keys(): - count_nans = arima.best_params[col].p + arima.best_params[col].d - assert(count_nans == transformed_test_df[col].isna().sum()) - - # Test (1,2,0) Model - diff_test_df_high = test_df['High'].diff().diff() - assert(transformed_test_df['High'][3] == (arima.best_params['High'].const + diff_test_df_high[2] * arima.best_params['High'].ar_coef[0])) - assert(transformed_test_df['High'][4] == (arima.best_params['High'].const + diff_test_df_high[3] * arima.best_params['High'].ar_coef[0])) - assert(transformed_test_df['High'][-1] == (arima.best_params['High'].const + diff_test_df_high[-2] * arima.best_params['High'].ar_coef[0])) - - # Test (2,1,0) Model - diff_test_df_low = test_df['Low'].diff() - assert(isclose(transformed_test_df['Low'][3], (arima.best_params['Low'].const + diff_test_df_low[2] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[1] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) - assert(isclose(transformed_test_df['Low'][4], (arima.best_params['Low'].const + diff_test_df_low[3] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[2] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) - assert(isclose(transformed_test_df['Low'][-1], (arima.best_params['Low'].const + diff_test_df_low[-2] * arima.best_params['Low'].ar_coef[0] + diff_test_df_low[-3] * arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) - - # Test (1,2,0) Model - diff_test_df_close = test_df['Close'].diff()[1:].diff() - first_day = pd.Series([np.nan]) - 
first_day.index = [diff_test_df_close.index[0] - pd.DateOffset(days=1)] - first_day.name = 'Close' - diff_test_df_close = pd.concat([first_day, diff_test_df_close]) - diff_test_df_close.index.name = "Date" - - assert(transformed_test_df['Close'][4] == (arima.best_params['Close'].const + diff_test_df_close[3] * arima.best_params['Close'].ar_coef[0])) - assert(transformed_test_df['Close'][5] == (arima.best_params['Close'].const + diff_test_df_close[4] * arima.best_params['Close'].ar_coef[0])) - assert(transformed_test_df['Close'][-1] == (arima.best_params['Close'].const + diff_test_df_close[-2] * arima.best_params['Close'].ar_coef[0])) + + train_size_values = [0.75, int(0.75*len(test_df)), None] + for train_size in train_size_values: + arima.fit(test_df, train_size=train_size, freq='B', q_vals=[0]) + transformed_test_df = arima.transform(test_df) + + for col in transformed_test_df.keys(): + count_nans = arima.best_params[col].p + arima.best_params[col].d + assert(count_nans == transformed_test_df[col].isna().sum()) + + # Test (2,1,0) Model + test_df_high = test_df['High'].diff() + assert(isclose(transformed_test_df['High'][3], (arima.best_params['High'].const + test_df_high[2] * + arima.best_params['High'].ar_coef[0] + test_df_high[1] * + arima.best_params['High'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['High'][4], (arima.best_params['High'].const + test_df_high[3] * + arima.best_params['High'].ar_coef[0] + test_df_high[2] * + arima.best_params['High'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['High'][-1], (arima.best_params['High'].const + test_df_high[-2] * + arima.best_params['High'].ar_coef[0] + test_df_high[-3] * + arima.best_params['High'].ar_coef[1]), abs_tol=1e-8)) + + # Test (2,2,0) Model + test_df_low = test_df['Low'].diff().diff() + assert(isclose(transformed_test_df['Low'][4], (arima.best_params['Low'].const + test_df_low[3] * + arima.best_params['Low'].ar_coef[0] + test_df_low[2] * + arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Low'][5], (arima.best_params['Low'].const + test_df_low[4] * + arima.best_params['Low'].ar_coef[0] + test_df_low[3] * + arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Low'][-1], (arima.best_params['Low'].const + test_df_low[-2] * + arima.best_params['Low'].ar_coef[0] + test_df_low[-3] * + arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + + # Test (2,1,0) Model + test_df_close = test_df['Close'].diff() + assert(isclose(transformed_test_df['Close'][3], (arima.best_params['Close'].const + test_df_close[2] * + arima.best_params['Close'].ar_coef[0] + test_df_close[1] * + arima.best_params['Close'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Close'][4], (arima.best_params['Close'].const + test_df_close[3] * + arima.best_params['Close'].ar_coef[0] + test_df_close[2] * + arima.best_params['Close'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Close'][-1], (arima.best_params['Close'].const + test_df_close[-2] * + arima.best_params['Close'].ar_coef[0] + test_df_close[-3] * + arima.best_params['Close'].ar_coef[1]), abs_tol=1e-8)) + + # Test if input is pd.Series + test_high_series = pd.Series(test_df['High']) + arima.fit(test_high_series, train_size=0.75, freq='B', q_vals=[0]) + transformed_test_series = arima.transform(test_high_series) + test_series_high = test_df['High'].diff() + assert(isclose(transformed_test_series['High'][3], (arima.best_params['High'].const + 
test_series_high[2] * + arima.best_params['High'].ar_coef[0] + test_series_high[1] * + arima.best_params['High'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_series['High'][4], (arima.best_params['High'].const + test_series_high[3] * + arima.best_params['High'].ar_coef[0] + test_series_high[2] * + arima.best_params['High'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_series['High'][-1], (arima.best_params['High'].const + test_series_high[-2] * + arima.best_params['High'].ar_coef[0] + test_series_high[-3] * + arima.best_params['High'].ar_coef[1]), abs_tol=1e-8)) + + # Test if p=0 and d=0 + new_arima = econometrics.arima() + zero_resid = test_high_series.copy(deep=True) + zero_resid[:] = 0 + new_arima.best_params = {'High': econometrics.ARIMA_BestParams(p=0, q=0, d=0, + const=0, ar_coef=[0], ma_coef=[], resid=zero_resid)} + + transformed_test_df = new_arima.transform(test_high_series) + assert_series_equal(transformed_test_df['High'], test_df['High']) + + # Test if train_size is str + with pytest.raises(ValueError) as e: + arima.fit(test_df, train_size='str', freq='B', q_vals=[0]) + + # Test if input is list + with pytest.raises(ValueError) as e: + arima.fit([1, 2, 3, 4], train_size=0.75, freq='B', q_vals=[0]) + + # Test transform with list + with pytest.raises(ValueError) as e: + arima.fit(test_df, train_size=train_size, freq='B', q_vals=[0]) + transformed_test_df = arima.transform([1, 2, 3, 4]) if __name__ == "__main__": diff --git a/gs_quant/timeseries/econometrics.py b/gs_quant/timeseries/econometrics.py index 76909ef3..0b08c817 100644 --- a/gs_quant/timeseries/econometrics.py +++ b/gs_quant/timeseries/econometrics.py @@ -565,8 +565,6 @@ def _evaluate_arima_model( elif train_size is None: train_size = int(len(X) * 0.75) train, test = X[:train_size].astype(float), X[train_size:].astype(float) - else: - raise ValueError('train_size is not int, float, or None') model = ARIMA(train, order=arima_order, freq=freq) model_fit = model.fit(disp=False, method='css', trend='nc') @@ -615,6 +613,9 @@ def fit( if not isinstance(X, pd.DataFrame): raise ValueError('Not DataFrame or Series!') + if not isinstance(train_size, (float, int, type(None))): + raise ValueError('train_size is not int, float, or None') + for series_id in X.columns: series = X[series_id] best_score = float('inf') @@ -640,15 +641,8 @@ def fit( p, d, q = best_order assert(p == len(best_ar_coef)) - self.best_params[series_id] = ARIMA_BestParams( - freq=freq, - p=p, - d=d, - q=q, - const=best_const, - ar_coef=best_ar_coef, - ma_coef=best_ma_coef, - resid=best_resid, + self.best_params[series_id] = ARIMA_BestParams(freq=freq, p=p, d=d, q=q, const=best_const, + ar_coef=best_ar_coef, ma_coef=best_ma_coef, resid=best_resid, series=series) return self @@ -661,7 +655,7 @@ def _difference(self, X: pd.Series, d: int): elif d == 1: return X.diff() else: - return self._difference(X.diff(), d-1) + return self._difference(X.diff(), d - 1) def _lagged_values(self, X: pd.Series, p: int, ar_coef: list): """Helper Function to Calculate AutoRegressive(AR) Component""" @@ -670,10 +664,9 @@ def _lagged_values(self, X: pd.Series, p: int, ar_coef: list): return X elif p > 0: transformed_df = pd.concat([X.copy().shift(periods=i) - for i in range(1, p+1)], axis=1) + for i in range(1, p + 1)], axis=1) transformed_df = transformed_df.dot(ar_coef) - else: - raise ValueError("p should not be less than 0!") + return transformed_df def _calculate_residuals( @@ -697,7 +690,7 @@ def _calculate_residuals( X_ma[:] = np.nan for x in 
range(p + d, len(X_ar)): - ma_component = resid[x-q: x].dot(ma_coef) + ma_component = resid[x - q: x].dot(ma_coef) prediction = X_ar[x] + ma_component residual = X_diff[x] - prediction resid[x] = residual @@ -756,17 +749,8 @@ def _arima_transform_df(self, X: pd.DataFrame) -> pd.DataFrame: ma_coef = self.best_params[series_id].ma_coef resid = self.best_params[series_id].resid - series[series_id] = self._arima_transform_series( - X[series_id], - p=p, - d=d, - q=q, - const=const, - ar_coef=ar_coef, - ma_coef=ma_coef, - resid=resid, - freq=freq - ) + series[series_id] = self._arima_transform_series(X[series_id], p=p, d=d, q=q, const=const, ar_coef=ar_coef, + ma_coef=ma_coef, resid=resid, freq=freq) return pd.DataFrame(series) From e835ddc9198c07972b008aa9d9c0d61d6dd3b9c5 Mon Sep 17 00:00:00 2001 From: Maverick Lin Date: Mon, 20 Apr 2020 15:34:03 -0400 Subject: [PATCH 09/10] pep8 changes --- gs_quant/test/timeseries/test_econometrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gs_quant/test/timeseries/test_econometrics.py b/gs_quant/test/timeseries/test_econometrics.py index 4a8cfb15..391bdf80 100644 --- a/gs_quant/test/timeseries/test_econometrics.py +++ b/gs_quant/test/timeseries/test_econometrics.py @@ -510,15 +510,15 @@ def test_arima_fit(): assert_series_equal(transformed_test_df['High'], test_df['High']) # Test if train_size is str - with pytest.raises(ValueError) as e: + with pytest.raises(ValueError): arima.fit(test_df, train_size='str', freq='B', q_vals=[0]) # Test if input is list - with pytest.raises(ValueError) as e: + with pytest.raises(ValueError): arima.fit([1, 2, 3, 4], train_size=0.75, freq='B', q_vals=[0]) # Test transform with list - with pytest.raises(ValueError) as e: + with pytest.raises(ValueError): arima.fit(test_df, train_size=train_size, freq='B', q_vals=[0]) transformed_test_df = arima.transform([1, 2, 3, 4]) From 6b9cc409f059258fe60f4d7446eb82ad7fc713bc Mon Sep 17 00:00:00 2001 From: Maverick Lin Date: Mon, 20 Apr 2020 15:39:32 -0400 Subject: [PATCH 10/10] more code coverage --- gs_quant/test/timeseries/test_econometrics.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gs_quant/test/timeseries/test_econometrics.py b/gs_quant/test/timeseries/test_econometrics.py index 391bdf80..19f74934 100644 --- a/gs_quant/test/timeseries/test_econometrics.py +++ b/gs_quant/test/timeseries/test_econometrics.py @@ -510,15 +510,15 @@ def test_arima_fit(): assert_series_equal(transformed_test_df['High'], test_df['High']) # Test if train_size is str - with pytest.raises(ValueError): + with pytest.raises(ValueError, match='train_size is not int, float, or None'): arima.fit(test_df, train_size='str', freq='B', q_vals=[0]) # Test if input is list - with pytest.raises(ValueError): + with pytest.raises(ValueError, match='Not DataFrame or Series!'): arima.fit([1, 2, 3, 4], train_size=0.75, freq='B', q_vals=[0]) # Test transform with list - with pytest.raises(ValueError): + with pytest.raises(ValueError, match='Not DataFrame or Series!'): arima.fit(test_df, train_size=train_size, freq='B', q_vals=[0]) transformed_test_df = arima.transform([1, 2, 3, 4])
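
A note on usage: the fit/transform API exercised by test_arima_fit above can be driven end to end as in the sketch below. This is a minimal illustration only, assuming gs_quant with this patch series applied, the arima class and its best_params records exposed from gs_quant.timeseries.econometrics (as the final tests use them), and placeholder price data; it is not part of the patch.

import pandas as pd

from gs_quant.timeseries import econometrics

# Placeholder daily closes on a business-day index (matches freq='B').
prices = pd.DataFrame(
    {'Close': [3.52, 3.58, 3.57, 3.57, 3.57, 3.55, 3.55, 3.60, 3.60, 3.59,
               3.62, 3.67, 3.66, 3.64, 3.62, 3.67, 3.67, 3.75, 3.79, 3.84]},
    index=pd.bdate_range('1989-01-03', periods=20))

model = econometrics.arima()

# Grid-search candidate (p, d, q) orders; each candidate is scored by the MSE
# returned from _evaluate_arima_model and the best order per column is kept.
model.fit(prices, train_size=0.75, freq='B', q_vals=[0])

best = model.best_params['Close']   # fitted order, constant and AR coefficients
print(best.p, best.d, best.q, best.const, best.ar_coef)

# In-sample transform of the differenced series using the fitted AR terms; the
# first best.p + best.d values are NaN, as the tests assert.
transformed = model.transform(prices)
print(transformed.head())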
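
For the arithmetic behind the isclose(...) assertions above, the self-contained pandas sketch below reproduces the ARIMA(p, d, 0) in-sample transform that _difference and _lagged_values implement: difference the series d times, then add the constant to the dot product of the p most recent lags with the AR coefficients. The series values and coefficients here are illustrative placeholders, not fitted outputs.

import numpy as np
import pandas as pd

# Placeholder series and hand-picked ARIMA(2, 1, 0) parameters.
prices = pd.Series(
    [3.52, 3.58, 3.57, 3.57, 3.57, 3.55, 3.55, 3.60, 3.60, 3.59],
    index=pd.bdate_range('1989-01-03', periods=10))
p, d, const, ar_coef = 2, 1, 0.001, [0.3, -0.1]

# Difference d times, as _difference does.
diffed = prices
for _ in range(d):
    diffed = diffed.diff()

# AR component, as _lagged_values does: dot the p most recent lags with ar_coef.
lags = pd.concat([diffed.shift(i) for i in range(1, p + 1)], axis=1)
transformed = const + lags.dot(ar_coef)

# The first p + d values are NaN (the NaN count the tests check), and each later
# value matches the manual expansion used in the isclose(...) assertions:
assert transformed.isna().sum() == p + d
assert np.isclose(transformed.iloc[3],
                  const + diffed.iloc[2] * ar_coef[0] + diffed.iloc[1] * ar_coef[1])
print(transformed)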