diff --git a/gs_quant/test/timeseries/test_econometrics.py b/gs_quant/test/timeseries/test_econometrics.py index e30882be..19f74934 100644 --- a/gs_quant/test/timeseries/test_econometrics.py +++ b/gs_quant/test/timeseries/test_econometrics.py @@ -14,7 +14,10 @@ under the License. """ +from math import isclose + import pytest +from pandas import Timestamp from pandas.util.testing import assert_series_equal from gs_quant.timeseries import * @@ -366,5 +369,159 @@ def test_max_drawdown(): assert_series_equal(output_window, pd.Series([0.0, 0.0, 0.0, -0.2, -0.2, -0.75]), obj="Max drawdown window") +def test_arima_fit(): + test_dict = { + 'High': { + Timestamp('1989-01-03 00:00:00'): 3.575721263885498, + Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-05 00:00:00'): 3.62580132484436, + Timestamp('1989-01-06 00:00:00'): 3.62580132484436, + Timestamp('1989-01-09 00:00:00'): 3.575721263885498, + Timestamp('1989-01-10 00:00:00'): 3.575721263885498, + Timestamp('1989-01-11 00:00:00'): 3.5657050609588623, + Timestamp('1989-01-12 00:00:00'): 3.635817289352417, + Timestamp('1989-01-13 00:00:00'): 3.615785360336304, + Timestamp('1989-01-16 00:00:00'): 3.615785360336304, + Timestamp('1989-01-17 00:00:00'): 3.635817289352417, + Timestamp('1989-01-18 00:00:00'): 3.675881385803223, + Timestamp('1989-01-19 00:00:00'): 3.695913553237915, + Timestamp('1989-01-20 00:00:00'): 3.665865421295166, + Timestamp('1989-01-23 00:00:00'): 3.675881385803223, + Timestamp('1989-01-24 00:00:00'): 3.675881385803223, + Timestamp('1989-01-25 00:00:00'): 3.695913553237915, + Timestamp('1989-01-26 00:00:00'): 3.7760417461395264, + Timestamp('1989-01-27 00:00:00'): 3.8561699390411377, + Timestamp('1989-01-30 00:00:00'): 3.8561699390411377}, + 'Low': { + Timestamp('1989-01-03 00:00:00'): 3.4855768680572514, + Timestamp('1989-01-04 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-05 00:00:00'): 3.575721263885498, + Timestamp('1989-01-06 00:00:00'): 3.575721263885498, + Timestamp('1989-01-09 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-10 00:00:00'): 3.5356571674346924, + Timestamp('1989-01-11 00:00:00'): 3.5256409645080566, + Timestamp('1989-01-12 00:00:00'): 3.5456731319427486, + Timestamp('1989-01-13 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, + Timestamp('1989-01-17 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-18 00:00:00'): 3.615785360336304, + Timestamp('1989-01-19 00:00:00'): 3.655849456787109, + Timestamp('1989-01-20 00:00:00'): 3.62580132484436, + Timestamp('1989-01-23 00:00:00'): 3.615785360336304, + Timestamp('1989-01-24 00:00:00'): 3.615785360336304, + Timestamp('1989-01-25 00:00:00'): 3.655849456787109, + Timestamp('1989-01-26 00:00:00'): 3.665865421295166, + Timestamp('1989-01-27 00:00:00'): 3.79607367515564, + Timestamp('1989-01-30 00:00:00'): 3.786057710647583}, + 'Close': { + Timestamp('1989-01-03 00:00:00'): 3.5256409645080566, + Timestamp('1989-01-04 00:00:00'): 3.5857372283935547, + Timestamp('1989-01-05 00:00:00'): 3.575721263885498, + Timestamp('1989-01-06 00:00:00'): 3.575721263885498, + Timestamp('1989-01-09 00:00:00'): 3.575721263885498, + Timestamp('1989-01-10 00:00:00'): 3.5556890964508057, + Timestamp('1989-01-11 00:00:00'): 3.5556890964508057, + Timestamp('1989-01-12 00:00:00'): 3.605769157409668, + Timestamp('1989-01-13 00:00:00'): 3.605769157409668, + Timestamp('1989-01-16 00:00:00'): 3.5957531929016118, + Timestamp('1989-01-17 00:00:00'): 3.62580132484436, + Timestamp('1989-01-18 00:00:00'): 3.675881385803223, + 
Timestamp('1989-01-19 00:00:00'): 3.665865421295166, + Timestamp('1989-01-20 00:00:00'): 3.6458332538604736, + Timestamp('1989-01-23 00:00:00'): 3.62580132484436, + Timestamp('1989-01-24 00:00:00'): 3.675881385803223, + Timestamp('1989-01-25 00:00:00'): 3.675881385803223, + Timestamp('1989-01-26 00:00:00'): 3.756009578704834, + Timestamp('1989-01-27 00:00:00'): 3.79607367515564, + Timestamp('1989-01-30 00:00:00'): 3.846153736114502}, + } + + test_df = pd.DataFrame(test_dict) + arima = econometrics.arima() + + train_size_values = [0.75, int(0.75*len(test_df)), None] + for train_size in train_size_values: + arima.fit(test_df, train_size=train_size, freq='B', q_vals=[0]) + transformed_test_df = arima.transform(test_df) + + for col in transformed_test_df.keys(): + count_nans = arima.best_params[col].p + arima.best_params[col].d + assert(count_nans == transformed_test_df[col].isna().sum()) + + # Test (2,1,0) Model + test_df_high = test_df['High'].diff() + assert(isclose(transformed_test_df['High'][3], (arima.best_params['High'].const + test_df_high[2] * + arima.best_params['High'].ar_coef[0] + test_df_high[1] * + arima.best_params['High'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['High'][4], (arima.best_params['High'].const + test_df_high[3] * + arima.best_params['High'].ar_coef[0] + test_df_high[2] * + arima.best_params['High'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['High'][-1], (arima.best_params['High'].const + test_df_high[-2] * + arima.best_params['High'].ar_coef[0] + test_df_high[-3] * + arima.best_params['High'].ar_coef[1]), abs_tol=1e-8)) + + # Test (2,2,0) Model + test_df_low = test_df['Low'].diff().diff() + assert(isclose(transformed_test_df['Low'][4], (arima.best_params['Low'].const + test_df_low[3] * + arima.best_params['Low'].ar_coef[0] + test_df_low[2] * + arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Low'][5], (arima.best_params['Low'].const + test_df_low[4] * + arima.best_params['Low'].ar_coef[0] + test_df_low[3] * + arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Low'][-1], (arima.best_params['Low'].const + test_df_low[-2] * + arima.best_params['Low'].ar_coef[0] + test_df_low[-3] * + arima.best_params['Low'].ar_coef[1]), abs_tol=1e-8)) + + # Test (2,1,0) Model + test_df_close = test_df['Close'].diff() + assert(isclose(transformed_test_df['Close'][3], (arima.best_params['Close'].const + test_df_close[2] * + arima.best_params['Close'].ar_coef[0] + test_df_close[1] * + arima.best_params['Close'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Close'][4], (arima.best_params['Close'].const + test_df_close[3] * + arima.best_params['Close'].ar_coef[0] + test_df_close[2] * + arima.best_params['Close'].ar_coef[1]), abs_tol=1e-8)) + assert(isclose(transformed_test_df['Close'][-1], (arima.best_params['Close'].const + test_df_close[-2] * + arima.best_params['Close'].ar_coef[0] + test_df_close[-3] * + arima.best_params['Close'].ar_coef[1]), abs_tol=1e-8)) + + # Test if input is pd.Series + test_high_series = pd.Series(test_df['High']) + arima.fit(test_high_series, train_size=0.75, freq='B', q_vals=[0]) + transformed_test_series = arima.transform(test_high_series) + test_series_high = test_df['High'].diff() + assert(isclose(transformed_test_series['High'][3], (arima.best_params['High'].const + test_series_high[2] * + arima.best_params['High'].ar_coef[0] + test_series_high[1] * + arima.best_params['High'].ar_coef[1]), abs_tol=1e-8)) + 
+    assert(isclose(transformed_test_series['High'][4], (arima.best_params['High'].const + test_series_high[3] *
+                                                        arima.best_params['High'].ar_coef[0] + test_series_high[2] *
+                                                        arima.best_params['High'].ar_coef[1]), abs_tol=1e-8))
+    assert(isclose(transformed_test_series['High'][-1], (arima.best_params['High'].const + test_series_high[-2] *
+                                                         arima.best_params['High'].ar_coef[0] + test_series_high[-3] *
+                                                         arima.best_params['High'].ar_coef[1]), abs_tol=1e-8))
+
+    # Test if p=0 and d=0
+    new_arima = econometrics.arima()
+    zero_resid = test_high_series.copy(deep=True)
+    zero_resid[:] = 0
+    new_arima.best_params = {'High': econometrics.ARIMA_BestParams(p=0, q=0, d=0, const=0, ar_coef=[0],
+                                                                   ma_coef=[], resid=zero_resid)}
+
+    transformed_test_df = new_arima.transform(test_high_series)
+    assert_series_equal(transformed_test_df['High'], test_df['High'])
+
+    # Test if train_size is str
+    with pytest.raises(ValueError, match='train_size is not int, float, or None'):
+        arima.fit(test_df, train_size='str', freq='B', q_vals=[0])
+
+    # Test if input is list
+    with pytest.raises(ValueError, match='Not DataFrame or Series!'):
+        arima.fit([1, 2, 3, 4], train_size=0.75, freq='B', q_vals=[0])
+
+    # Test transform with list
+    with pytest.raises(ValueError, match='Not DataFrame or Series!'):
+        arima.fit(test_df, train_size=train_size, freq='B', q_vals=[0])
+        transformed_test_df = arima.transform([1, 2, 3, 4])
+
+
 if __name__ == "__main__":
     pytest.main(args=["test_econometrics.py"])
diff --git a/gs_quant/timeseries/econometrics.py b/gs_quant/timeseries/econometrics.py
index 95c07f1d..0b08c817 100644
--- a/gs_quant/timeseries/econometrics.py
+++ b/gs_quant/timeseries/econometrics.py
@@ -15,6 +15,16 @@
 # should be fully documented: docstrings should describe parameters and the return value, and provide a 1-line
 # description. Type annotations should be provided for parameters.
 
+import itertools
+import datetime as dt
+from dataclasses import dataclass
+from typing import Iterable, Optional, Union, Tuple
+
+import numpy as np
+import pandas as pd
+from statsmodels.tsa.arima_model import ARIMA
+from statsmodels.tools.eval_measures import mse
+
 from .statistics import *
 from ..errors import *
 
@@ -500,3 +510,268 @@ def max_drawdown(x: pd.Series, w: Union[Window, int] = Window(None, 0)) -> pd.Se
     rolling_max = x.rolling(w.w, 0).max()
     result = (x / rolling_max - 1).rolling(w.w, 0).min()
     return apply_ramp(result, w)
+
+
+@dataclass
+class ARIMA_BestParams:
+    freq: str = ''
+    p: int = None
+    d: int = None
+    q: int = None
+    const: float = None
+    ar_coef: list = None
+    ma_coef: list = None
+    resid: list = None
+    series: pd.Series = None
+
+
+class arima():
+    """
+    ARIMA is the Autoregressive Integrated Moving Average Model and is used
+    to normalize and forecast time series data. ARIMA has 3 parameters:
+    (p, d, q) where:
+    :p is the number of autoregressive terms
+    :d is the number of nonseasonal differences
+    :q is the number of lagged forecast errors in the prediction equation
+
+    An ARIMA model is selected from the Cartesian product of the sets p, d, and q.
+    The time series is split into train and test sets and an ARIMA model is fit
+    for every combination on the training set. The model with the lowest
+    mean-squared error (MSE) on the test set is selected as the best model. The
+    original time series can then be transformed by the best model.
+
+    **Examples**
+    >>> series = generate_series(100)
+    >>> arima = econometrics.arima()
+    >>> arima.fit(series, train_size=0.8)
+    >>> transformed_time_series = arima.transform(series)
+    """
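The selection procedure described in the docstring above amounts to a small grid search: fit one candidate model per (p, d, q) combination on a training slice of the series and keep the order with the lowest out-of-sample MSE. A minimal sketch of that idea, written against the same legacy statsmodels ARIMA class the patch imports; the helper name select_best_order is illustrative and not part of the PR:

```python
import itertools

import pandas as pd
from statsmodels.tools.eval_measures import mse
from statsmodels.tsa.arima_model import ARIMA  # legacy API, as used by this patch


def select_best_order(series: pd.Series, p_vals, d_vals, q_vals, train_frac: float = 0.75):
    """Return the (p, d, q) order with the lowest MSE on the held-out tail of the series."""
    split = int(len(series) * train_frac)
    train, test = series[:split].astype(float), series[split:].astype(float)
    best_error, best_order = float('inf'), None
    for order in itertools.product(p_vals, d_vals, q_vals):
        try:
            fit = ARIMA(train, order=order).fit(disp=False, method='css', trend='nc')
            error = mse(test, fit.forecast(len(test))[0])
        except Exception:  # some orders fail to converge; skip them
            continue
        if error < best_error:
            best_error, best_order = error, order
    return best_order
```

Orders that fail to converge are simply skipped, which mirrors the try/except around _evaluate_arima_model in fit() below.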
+
+    def __init__(self):
+        self.best_params = {}
+
+    def _evaluate_arima_model(
+        self,
+        X: Union[pd.Series, pd.DataFrame],
+        arima_order: Tuple[int, int, int],
+        train_size: Union[float, int, None],
+        freq: str
+    ) -> Tuple[float, float, np.ndarray, np.ndarray, pd.Series]:
+        if type(train_size) == float:
+            train_size = int(len(X) * train_size)
+            train, test = X[:train_size].astype(float), X[train_size:].astype(float)
+        elif type(train_size) == int:
+            train, test = X[:train_size].astype(float), X[train_size:].astype(float)
+        elif train_size is None:
+            train_size = int(len(X) * 0.75)
+            train, test = X[:train_size].astype(float), X[train_size:].astype(float)
+
+        model = ARIMA(train, order=arima_order, freq=freq)
+        model_fit = model.fit(disp=False, method='css', trend='nc')
+        ar_coef = model_fit.arparams
+        ma_coef = model_fit.maparams
+        resid = model_fit.resid
+
+        model_params = model_fit.params.to_dict()
+        const = model_params.get('const', 0)
+
+        # calculate test error
+        yhat = model_fit.forecast(len(test))[0]
+        error = mse(test, yhat)
+
+        return error, const, ar_coef, ma_coef, resid
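The branching on train_size above (and in the fit() signature below) accepts a proportion, an absolute row count, or None; all three resolve to a single integer split index. A small illustrative sketch of that mapping, with resolve_train_size being a hypothetical helper rather than part of the patch:

```python
from typing import Optional, Union

import pandas as pd


def resolve_train_size(x: Union[pd.Series, pd.DataFrame],
                       train_size: Optional[Union[float, int]]) -> int:
    """Map train_size (proportion, absolute count, or None) to an integer split index."""
    if train_size is None:
        return int(len(x) * 0.75)        # default: 75% of the rows go to the training set
    if isinstance(train_size, float):
        return int(len(x) * train_size)  # interpreted as a proportion of the rows
    if isinstance(train_size, int):
        return train_size                # interpreted as an absolute number of rows
    raise ValueError('train_size is not int, float, or None')


# split = resolve_train_size(df, 0.75); train, test = df[:split], df[split:]
```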
+
+    def fit(
+        self,
+        X: Union[pd.Series, pd.DataFrame],
+        train_size: Union[float, int, None] = None,
+        p_vals: list = [0, 1, 2],
+        d_vals: list = [0, 1, 2],
+        q_vals: list = [0, 1, 2],
+        freq: str = None
+    ) -> 'arima':
+        """
+        Train a combination of ARIMA models. If given a pandas DataFrame, finds the
+        best ARIMA model parameters for each column; if given a pandas Series, finds
+        the best ARIMA model parameters for that series.
+
+        :param X: time series to be operated on; required parameter
+        :param train_size: if float, should be between 0.0 and 1.0 and
+            represent the proportion of the dataset to include in the train split.
+            If int, represents the absolute number of train samples. If None,
+            the value is automatically set to 0.75.
+        :param p_vals: autoregressive terms (p) to search over; default is [0, 1, 2]
+        :param d_vals: nonseasonal differences (d) to search over; default is [0, 1, 2]
+        :param q_vals: lagged forecast errors (q) to search over; default is [0, 1, 2]
+        :param freq: frequency of the time series; default is None
+        :return: self
+        """
+
+        if isinstance(X, pd.Series):
+            X = X.to_frame()
+
+        if not isinstance(X, pd.DataFrame):
+            raise ValueError('Not DataFrame or Series!')
+
+        if not isinstance(train_size, (float, int, type(None))):
+            raise ValueError('train_size is not int, float, or None')
+
+        for series_id in X.columns:
+            series = X[series_id]
+            best_score = float('inf')
+            best_order = None
+            best_const = None
+            best_ar_coef = None
+            best_ma_coef = None
+            best_resid = None
+            for order in list(itertools.product(*[p_vals, d_vals, q_vals])):
+                try:
+                    mse, const, ar_coef, ma_coef, resid = self._evaluate_arima_model(series, order, train_size, freq)
+                    if mse < best_score:
+                        best_score = mse
+                        best_order = order
+                        best_const = const
+                        best_ar_coef = ar_coef
+                        best_ma_coef = ma_coef
+                        best_resid = resid
+                except Exception as e:
+                    print(' {}'.format(e))
+                    continue
+
+            p, d, q = best_order
+            assert(p == len(best_ar_coef))
+
+            self.best_params[series_id] = ARIMA_BestParams(freq=freq, p=p, d=d, q=q, const=best_const,
+                                                           ar_coef=best_ar_coef, ma_coef=best_ma_coef,
+                                                           resid=best_resid, series=series)
+
+        return self
+
+    def _difference(self, X: pd.Series, d: int):
+        """Helper function to difference a time series d times"""
+
+        if d == 0:
+            return X
+        elif d == 1:
+            return X.diff()
+        else:
+            return self._difference(X.diff(), d - 1)
+
+    def _lagged_values(self, X: pd.Series, p: int, ar_coef: list):
+        """Helper function to calculate the autoregressive (AR) component"""
+
+        if p == 0:
+            return X
+        elif p > 0:
+            transformed_df = pd.concat([X.copy().shift(periods=i)
+                                        for i in range(1, p + 1)], axis=1)
+            transformed_df = transformed_df.dot(ar_coef)
+
+        return transformed_df
+
+    def _calculate_residuals(
+        self,
+        X_ar: pd.Series,
+        X_diff: pd.Series,
+        p: int,
+        d: int,
+        q: int,
+        ar_coef: list,
+        ma_coef: list,
+        freq: str
+    ):
+        """Helper function to calculate the residuals and the moving average (MA) component"""
+
+        ma_coef = ma_coef[::-1]
+        resid = X_ar.copy(deep=True)
+        resid[:] = 0
+
+        X_ma = X_ar.copy(deep=True)
+        X_ma[:] = np.nan
+
+        for x in range(p + d, len(X_ar)):
+            ma_component = resid[x - q: x].dot(ma_coef)
+            prediction = X_ar[x] + ma_component
+            residual = X_diff[x] - prediction
+            resid[x] = residual
+            X_ma[x] = prediction
+
+        return resid, X_ma
+
+    def _arima_transform_series(
+        self,
+        X: pd.Series,
+        p: int,
+        d: int,
+        q: int,
+        const: float,
+        ar_coef: list,
+        ma_coef: list,
+        resid: list,
+        freq: str
+    ) -> pd.Series:
+        """Helper function to transform a single series"""
+
+        # Difference the series d times
+        X_diff = self._difference(X, d)
+
+        # Calculate the autoregressive component
+        X_diff_ar = self._lagged_values(X_diff, p, ar_coef)
+
+        # Calculate the residuals and the moving average component
+        calculated_resid, X_diff_ar_ma = self._calculate_residuals(X_diff_ar,
+                                                                   X_diff,
+                                                                   p,
+                                                                   d,
+                                                                   q,
+                                                                   ar_coef,
+                                                                   ma_coef,
+                                                                   freq)
+
+        # Check that the calculated residuals match the statsmodels ARIMA residuals
+        resid_df = pd.concat([calculated_resid, resid], axis=1, join='inner')
+        assert(np.allclose(resid_df[resid_df.columns[0]],
+                           resid_df[resid_df.columns[1]]))
+
+        return X_diff_ar_ma
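For a (p, d, 0) model, which is the only case the new unit test exercises (it passes q_vals=[0]), the helpers above reduce to differencing the series d times and applying the AR recursion. A standalone sketch of that reduced case, handy for reproducing the hand-computed isclose assertions in the test; ar_only_transform is an illustrative name, not part of the patch:

```python
import numpy as np
import pandas as pd


def ar_only_transform(series: pd.Series, d: int, const: float, ar_coef) -> pd.Series:
    """AR(p) one-step predictions on the d-times differenced series (the q == 0 case)."""
    diffed = series.copy()
    for _ in range(d):
        diffed = diffed.diff()
    # Column i of the lag matrix holds the differenced series shifted by i + 1 periods,
    # so row t of the dot product is sum_i ar_coef[i] * diffed[t - (i + 1)].
    lagged = pd.concat([diffed.shift(i) for i in range(1, len(ar_coef) + 1)], axis=1)
    # Note: the patch fits with trend='nc', so the fitted constant is zero and adding
    # const here matches both the transform output and the test assertions.
    return const + lagged.dot(np.asarray(ar_coef))
```

The first p + d values of the result are NaN, which is exactly what the count_nans check in the new test counts.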
+
+    def _arima_transform_df(self, X: pd.DataFrame) -> pd.DataFrame:
+        """Helper function to transform each column of a DataFrame"""
+
+        series = {}
+        for series_id in X.columns:
+            freq = self.best_params[series_id].freq
+            p = self.best_params[series_id].p
+            d = self.best_params[series_id].d
+            q = self.best_params[series_id].q
+            const = self.best_params[series_id].const
+            ar_coef = self.best_params[series_id].ar_coef
+            ma_coef = self.best_params[series_id].ma_coef
+            resid = self.best_params[series_id].resid
+
+            series[series_id] = self._arima_transform_series(X[series_id], p=p, d=d, q=q, const=const,
+                                                             ar_coef=ar_coef, ma_coef=ma_coef, resid=resid,
+                                                             freq=freq)
+
+        return pd.DataFrame(series)
+
+    def transform(
+        self,
+        X: Union[pd.Series, pd.DataFrame]
+    ) -> pd.DataFrame:
+        """
+        Transform a time series based on the best ARIMA model found by fit().
+        Does not support transformation using MA components.
+
+        :param X: time series to be operated on; required parameter
+        :return: DataFrame of transformed series
+        """
+
+        if isinstance(X, pd.Series):
+            X = X.to_frame()
+
+        if isinstance(X, pd.DataFrame):
+            transformed = self._arima_transform_df(X)
+        else:
+            raise ValueError('Not DataFrame or Series!')
+
+        return transformed
diff --git a/setup.py b/setup.py
index 2f93ba20..96238384 100644
--- a/setup.py
+++ b/setup.py
@@ -65,6 +65,7 @@
         "requests",
         "scipy",
         "six",
+        "statsmodels",
         "typing;python_version<'3.7'"
     ],
     extras_require={
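To close the loop, a hedged end-to-end usage sketch of the new class (not part of the patch). It assumes generate_series, which the class docstring example already uses, is importable from gs_quant.timeseries, and it restricts the search to q = 0 since transform() only applies the AR part:

```python
import pandas as pd

from gs_quant.timeseries import econometrics, generate_series

# Two synthetic price series to mimic a small multi-column input.
df = pd.DataFrame({'a': generate_series(100), 'b': generate_series(100)})

model = econometrics.arima()
model.fit(df, train_size=0.8, q_vals=[0])   # grid-search (p, d) with q fixed to 0
transformed = model.transform(df)

# Each transformed column starts with p + d NaNs for its selected (p, d, q) order.
print({col: (model.best_params[col].p, model.best_params[col].d) for col in df.columns})
```

The statsmodels entry added to setup.py above is what makes these imports resolvable in a fresh install.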