Skip to content

basic example - finance data #5

Open
@andrewczgithub

Description

@andrewczgithub

Hi @hrayrhar !

Amazing algorithm! I am trying to use it on a basic two-dimensional dataset.
Please see my attempt below -

from __future__ import print_function
from __future__ import absolute_import

from tcorex.experiments.data import load_modular_sudden_change
from tcorex.experiments import baselines
from tcorex import base
from tcorex import TCorex
from tcorex import covariance as cov_utils

import numpy as np
import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt

import yfinance as yf
data = yf.download("SPY GOOGL", start="2014-01-01", end="2019-04-30")
data
return_target=data['Close'].pct_change().dropna()

nv = 2        # number of observed variables
m = 1           # number of hidden variables
nt = 10         # number of time periods
train_cnt = 16  # number of training samples for each time period
val_cnt = 4     # number of validation samples for each time period

# Generate some data with a sudden change in the middle.
#data, ground_truth_sigma = load_modular_sudden_change(nv=nv, m=m, nt=nt, ns=(train_cnt + val_cnt))

data =return_target.values

# Split it into train and validation.
#train_data = [X[:train_cnt] for X in data]

train_data=data
#val_data = [X[train_cnt:] for X in data]

# NOTE: the load_modular_sudden_change function above creates data where the time axis
# is already divided into time periods. If your data is not divided into time periods
# you can use the following procedure to do that:
# bucketed_data, index_to_bucket = make_buckets(data, window=train_cnt + val_cnt, stride='full')
# where the make_buckets function can be found at tcorex.experiments.data

# The core method we have is the tcorex.TCorex class.
tc = TCorex(nt=nt,
         nv=nv,
         n_hidden=m,
         max_iter=500,
         device='cpu',  # for GPU set 'cuda',
         l1=0.3,        # coefficient of temporal regularization term
         gamma=0.3,     # parameter that controls sample weights
         verbose=1,     # 0, 1, 2
         )

# # Fit the parameters of T-CorEx.
tc.fit(train_data)

Activity

andrewczgithub

andrewczgithub commented on Jan 26, 2020

@andrewczgithub
Author

I am getting the below error. Please help.

```python


IndexError Traceback (most recent call last)
in
34
35 # # Fit the parameters of T-CorEx.
---> 36 tc.fit(train_data)

~/tutorial-env1/lib/python3.7/site-packages/T_CorEx-1.0-py3.7.egg/tcorex/tcorex.py in fit(self, x)
250 self.theta[t] = (mean_prior, std_prior)
251
--> 252 x = self.preprocess(x, fit=False) # standardize the data using the better estimates
253 x = [np.array(xt, dtype=np.float32) for xt in x] # convert to np.float32
254 self.x_input = x # to have an access to input

~/tutorial-env1/lib/python3.7/site-packages/T_CorEx-1.0-py3.7.egg/tcorex/base.py in preprocess(self, X, fit)
224 std = np.sqrt(np.sum((x - mean) ** 2, axis=0) / n_obs).clip(1e-10)
225 self.theta.append((mean, std))
--> 226 x = ((x - self.theta[t][0]) / self.theta[t][1])
227 if np.max(np.abs(x)) > 6 and self.verbose > 0:
228 warnings.append("Warning: outliers more than 6 stds away from mean. "

IndexError: list index out of range

```

andrewczgithub

andrewczgithub commented on Jan 27, 2020

@andrewczgithub
Author

just looping in all authors @gregversteeg

gregversteeg

gregversteeg commented on Jan 27, 2020

@gregversteeg
Collaborator

Hmmm, that's strange. It's still in the numpy preprocessing. Can you just print out "train_data.shape" to be sure it really is an array of size (2, number of timesteps)? (Not something like (2, samples per time period, number of time periods).)

hrayrhar

hrayrhar commented on Jan 27, 2020

@hrayrhar
Owner

Hi @andrewczgithub,

The fit() function of T-CorEx expects train_data to be a list of T 2D arrays of shape (n_samples, n_variables). The T above is the number of time periods.

andrewczgithub

andrewczgithub commented on Jan 29, 2020

@andrewczgithub
Author

Hi All!!
Thank you for your help!

This is what I have below -

from __future__ import print_function
from __future__ import absolute_import

from tcorex.experiments.data import load_modular_sudden_change
from tcorex.experiments import baselines
from tcorex import base
from tcorex import TCorex
from tcorex import covariance as cov_utils

import numpy as np
import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt


import yfinance as yf
data = yf.download("SPY GOOGL", start="2014-01-01", end="2019-04-30")
data

return_target=data['Close'].pct_change().dropna()

L=return_target.to_numpy()
L

nv = 2        # number of observed variables
m = 1           # number of hidden variables
nt = 10         # number of time periods
train_cnt = 16  # number of training samples for each time period
val_cnt = 4     # number of validation samples for each time period

# Generate some data with a sudden change in the middle.
#data, ground_truth_sigma = load_modular_sudden_change(nv=nv, m=m, nt=nt, ns=(train_cnt + val_cnt))


# Split it into train and validation.
#train_data = [X[:train_cnt] for X in data]


#val_data = [X[train_cnt:] for X in data]

# NOTE: the load_modular_sudden_change function above creates data where the time axis
# is already divided into time periods. If your data is not divided into time periods
# you can use the following procedure to do that:
# bucketed_data, index_to_bucket = make_buckets(data, window=train_cnt + val_cnt, stride='full')
# where the make_buckets function can be found at tcorex.experiments.data

# The core method we have is the tcorex.TCorex class.
tc = TCorex(nt=nt,
        nv=nv,
        n_hidden=m,
        max_iter=500,
        device='cpu',  # for GPU set 'cuda',
        l1=0.3,        # coefficient of temporal regularization term
        gamma=0.3,     # parameter that controls sample weights
        verbose=1,     # 0, 1, 2
        )

# # Fit the parameters of T-CorEx.
tc.fit(L)
andrewczgithub

andrewczgithub commented on Jan 29, 2020

@andrewczgithub
Author

I get the error

---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-15-22093ca59768> in <module>
     57 
     58 # # Fit the parameters of T-CorEx.
---> 59 tc.fit(L)

~/T-CorEx/tcorex/tcorex.py in fit(self, x)
    250             self.theta[t] = (mean_prior, std_prior)
    251 
--> 252         x = self.preprocess(x, fit=False)  # standardize the data using the better estimates
    253         x = [np.array(xt, dtype=np.float32) for xt in x]  # convert to np.float32
    254         self.x_input = x  # to have an access to input

~/T-CorEx/tcorex/base.py in preprocess(self, X, fit)
    224                     std = np.sqrt(np.sum((x - mean) ** 2, axis=0) / n_obs).clip(1e-10)
    225                     self.theta.append((mean, std))
--> 226                 x = ((x - self.theta[t][0]) / self.theta[t][1])
    227                 if np.max(np.abs(x)) > 6 and self.verbose > 0:
    228                     warnings.append("Warning: outliers more than 6 stds away from mean. "

IndexError: list index out of range
andrewczgithub

andrewczgithub commented on Jan 29, 2020

@andrewczgithub
Author

Please help @hrayrhar @gregversteeg, I am not sure what I am doing wrong :(

cheers,
Andrew

hrayrhar

hrayrhar commented on Jan 29, 2020

@hrayrhar
Owner

I think L above is a 2D array of form (n_samples, n_stocks). To apply T-CorEx you need data to be split into some number of time periods and have the shape (n_time_periods, n_samples, n_stocks).

If you want to ignore the temporal aspect of the data, you can use the Corex class instead of TCorex. That class expects a 2D array. Passing L above to it should work.

andrewczgithub

andrewczgithub commented on Jan 30, 2020

@andrewczgithub
Author

Hi @hrayrhar & @gregversteeg !

Thank you so much for your help!
So I have tried to create the list data structure as you have said but I am still getting errors.

Could you please assist?

from __future__ import print_function
from __future__ import absolute_import

from tcorex.experiments.data import load_modular_sudden_change
from tcorex.experiments import baselines
from tcorex import base
from tcorex import TCorex
from tcorex import covariance as cov_utils

import numpy as np
import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt


import yfinance as yf
data = yf.download("SPY GOOGL", start="2014-01-01", end="2019-04-30")
data
return_target=data['Close'].pct_change().dropna()



return_target.index = return_target.index.astype(str)
lll=return_target.reset_index().values





nv = 2         # number of observed variables
m = 1           # number of hidden variables
nt = 10         # number of time periods
train_cnt = 16  # number of training samples for each time period
val_cnt = 4     # number of validation samples for each time period


# The core method we have is the tcorex.TCorex class.
tc = TCorex(nt=nt,
            nv=nv,
            n_hidden=m,
            max_iter=500,
            device='cpu',  # for GPU set 'cuda',
            l1=0.3,        # coefficient of temporal regularization term
            gamma=0.3,     # parameter that controls sample weights
            verbose=1,     # 0, 1, 2
            )

# Fit the parameters of T-CorEx.
tc.fit(lll)
andrewczgithub

andrewczgithub commented on Jan 31, 2020

@andrewczgithub
Author

I also tried to use the bucketed data function:

from __future__ import print_function
from __future__ import absolute_import

from tcorex.experiments.data import load_modular_sudden_change
from tcorex.experiments.data import make_buckets
from tcorex.experiments import baselines
from tcorex import base
from tcorex import TCorex
from tcorex import covariance as cov_utils

import numpy as np
import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt


import yfinance as yf

nv = 2         # number of observed variables
m = 1           # number of hidden variables
nt = 10         # number of time periods
train_cnt = 16  # number of training samples for each time period
val_cnt = 4     # number of validation samples for each time period

data = yf.download("SPY GOOGL", start="2014-01-01", end="2019-04-30")
data
return_target=data['Close'].pct_change().dropna()


bucketed_data, index_to_bucket = make_buckets(return_target, window=train_cnt + val_cnt, stride='full')


#return_target.index = return_target.index.astype(str)
#lll=return_target.reset_index().values


# The core method we have is the tcorex.TCorex class.
tc = TCorex(nt=nt,
            nv=nv,
            n_hidden=m,
            max_iter=500,
            device='cpu',  # for GPU set 'cuda',
            l1=0.3,        # coefficient of temporal regularization term
            gamma=0.3,     # parameter that controls sample weights
            verbose=1,     # 0, 1, 2
            )

# Fit the parameters of T-CorEx.
tc.fit(bucketed_data)
andrewczgithub

andrewczgithub commented on Feb 3, 2020

@andrewczgithub
Author

Hi @gregversteeg @hrayrhar

I was able to get the algorithm to run, but the output covariance matrix is all 0's.

I am not sure how this can be.

from tcorex.experiments.data import load_modular_sudden_change
from tcorex.experiments.data import make_buckets
from tcorex.experiments import baselines
from tcorex import base
from tcorex import TCorex
from tcorex import Corex
from tcorex import covariance as cov_utils

import numpy as np
import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt
import yfinance as yf
import numpy as np
data = yf.download("SPY GOOGL", start="2014-01-01", end="2019-04-30")
data
return_target=data['Close'].pct_change().dropna()
return_target

rt=return_target.to_numpy()

split_array= return_target.to_numpy()
sa=np.array_split(split_array, 1, axis=1)
sa

nv = 2         # number of observed variables
m = 1           # number of hidden variables
nt = 1        # number of time periods
train_cnt = 5  # number of training samples for each time period
val_cnt = 1     # number of validation samples for each time period


# Split it into train and validation.
#train_data = list([X[:train_cnt] for X in rt])
#val_data = list([X[train_cnt:] for X in rt])

#bucketed_data, index_to_bucket = make_buckets(sa, window=train_cnt + val_cnt, stride='full')





# The core method we have is the tcorex.TCorex class.
tc = TCorex(nt=nt,
            nv=nv,
            n_hidden=m,
            max_iter=500,
            device='cpu',  # for GPU set 'cuda',
            l1=0.3,        # coefficient of temporal regularization term
            gamma=0.3,     # parameter that controls sample weights
            verbose=1,     # 0, 1, 2
            )

# Fit the parameters of T-CorEx.
tc.fit(sa)
tc.get_covariance()

[array([[0., 0.],
[0., 0.]])]
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

      Development

      No branches or pull requests

        Participants

        @hrayrhar @gregversteeg @andrewczgithub

        Issue actions

          basic example - finance data · Issue #5 · hrayrhar/T-CorEx