-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathUntitled-1.py
148 lines (129 loc) · 4.86 KB
/
Untitled-1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# %%
from modelling_functions import *
# %%
# Model zoo: each entry pairs an estimator with the hyper-parameter grid that
# GridSearchCV (inside test_hypothesis) will search, plus a short label used
# in result tables/plots.
models_and_param_grids = [
    {
        'model': DecisionTreeRegressor(random_state=42),
        'param_grid': {
            'max_depth': [None, 10, 20, 30, 50],
            'min_samples_split': [2, 5, 10],
            # 'auto' was deprecated and then removed for tree regressors in
            # scikit-learn 1.3; None (use all features) is the equivalent value.
            'max_features': [None],
            'min_samples_leaf': [1, 3, 5, 10],
        },
        'model_name': 'CART',
    },
    {
        'model': KNeighborsRegressor(),
        'param_grid': {
            'n_neighbors': [3, 5, 7, 9, 11],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto'],
            'leaf_size': [10, 30, 50],
        },
        'model_name': 'KNN',
    },
    {
        'model': lgb.LGBMRegressor(max_depth=-1, random_state=42),
        'param_grid': {
            # NOTE: these keys were previously prefixed with 'lgbmregressor__',
            # which is the Pipeline naming convention. With a bare LGBMRegressor
            # GridSearchCV rejects prefixed names ("Invalid parameter"), so the
            # plain parameter names are used here.
            'n_estimators': [100, 200],
            'learning_rate': [0.01],
            'max_depth': [5, 10, 20],  # grid value overrides the constructor's -1
            'num_leaves': [35, 50],
        },
        'model_name': 'GBR',
    },
    {
        'model': xgb.XGBRegressor(random_state=42),
        'param_grid': {
            'n_estimators': [100, 200],
            'learning_rate': [0.01, 0.1],
            'max_depth': [3, 4, 5],
            'min_child_weight': [1, 3, 5],
        },
        'model_name': 'XGB',
    },
    {
        'model': RandomForestRegressor(random_state=42),
        'param_grid': {
            'n_estimators': [10, 50, 100, 200],
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            # See CART note above: 'auto' was removed in scikit-learn 1.3;
            # None means "consider all features at each split" for regressors.
            'max_features': [None],
        },
        'model_name': 'RF',
    },
    # Disabled candidates kept for reference — re-enable by uncommenting.
    # {
    #     'model': MLPRegressor(random_state=42),
    #     'param_grid': {
    #         'hidden_layer_sizes': [(50,), (100,), (50, 50),(100, 100), (50, 50, 50), (100, 100, 100), (50, 50, 50, 50), (100, 100, 100, 100)],
    #         'activation': ['tanh', 'relu', 'logistic'],
    #         'solver': ['sgd'],
    #         'alpha': [0.00005, 0.0005, 0.005],
    #         'early_stopping': [True],
    #         'max_iter': [600],
    #         'shuffle': [False],
    #     },
    #     'model_name': 'MLP',
    # },
    # {
    #     'model': GaussianProcessRegressor(random_state=42),
    #     'param_grid': {
    #         'kernel': [RBF(), DotProduct()+ WhiteKernel()],
    #         'alpha': [1e-10, 1e-5, 1e-2, 1],
    #         'n_restarts_optimizer': [0, 1, 3],
    #     },
    #     'model_name': 'GPR',
    # },
]
# Input data files — presumably the same series before and after imputation
# of missing values (TODO confirm against the data-preparation step).
file_name_raw = "Test_data.csv"
file_name_clean = "imputed_df.csv"
# Load both time series; transformation and train/test splitting happen in
# the cells below.
ts_raw = load_data_file(file_name_raw)
ts_imputed = load_data_file(file_name_clean)
# %% [markdown]
# Calculate naive benchmark using MC simulation
# %%
frequency = 'D'  # resample/aggregate to daily frequency
n_trials = 100   # number of Monte Carlo simulation paths for the benchmark
# Keep raw levels here (do_log_returns=False): the GBM benchmark below
# simulates price paths directly rather than modelling log returns.
ts_raw_transformed = transform_data(ts_raw, frequency, do_log_returns=False)
ts_imputed_transformed = transform_data(ts_imputed, frequency, do_log_returns=False)
# %% [markdown]
# Benchmark prediction using geometric brownian motion.
# S[t] = S[t-1] * (1 + ε)
# S[t] is the stock price at time t
# S[t-1] is the stock price at time t-1
# ε is a normally distributed random variable with mean 'mu' and standard deviation 'std'
#
# %%
# Run the Monte Carlo (GBM) benchmark with n_trials simulated paths on both
# the raw and the imputed series, plotting the resulting predictions.
run_predictions_and_plot(ts_raw_transformed, n_trials)
run_predictions_and_plot(ts_imputed_transformed, n_trials)
# %% [markdown]
# Model fitting, training and testing
# %%
split_date = '2011-12-31'  # observations up to this date train; later ones test
frequency = 'D'            # daily frequency (re-declared for this section)
LOG_RETURNS = True         # model log returns here, unlike the GBM benchmark above
test_size = 14             # observations per CV test fold (14 daily points)
n_splits = 30              # number of rolling-origin cross-validation folds
# Chronological splitter: preserves temporal order, no shuffling.
tscv = TimeSeriesSplit(n_splits=n_splits, test_size=test_size)
lags = 3                   # number of lagged observations used as features
ts_raw_transformed = transform_data(ts_raw, frequency, do_log_returns=LOG_RETURNS)
ts_imputed_transformed = transform_data(ts_imputed, frequency, do_log_returns=LOG_RETURNS)
train_raw, test_raw = split_data(ts_raw_transformed, split_date)
train_imp, test_imp = split_data(ts_imputed_transformed, split_date)
# %% [markdown]
# Hypotheses models testing for four hypotheses with four cases of features:
# - missing values removed without outliers marked
# - missing values removed with outliers marked
# - missing values imputed without outliers marked
# - missing values imputed with outliers marked
# %%
# Each hypothesis pairs a training series with an evaluation series
# (H1/H2 are same-series cases; H3/H4 cross the series).
_hypothesis_cases = [
    ('H1', 'ts1', 'ts1'),
    ('H2', 'ts2', 'ts2'),
    ('H3', 'ts1', 'ts2'),
    ('H4', 'ts2', 'ts1'),
]
h1_results, h2_results, h3_results, h4_results = [
    test_hypothesis(tag, fit_series, eval_series, models_and_param_grids,
                    tscv, lags, train_raw, test_raw, train_imp,
                    n_splits, test_size)
    for tag, fit_series, eval_series in _hypothesis_cases
]
# Plot a per-hypothesis comparison of all experiment results.
for outcome, tag in zip((h1_results, h2_results, h3_results, h4_results),
                        ('H1', 'H2', 'H3', 'H4')):
    plot_all_experiment_results_comparison(outcome, tag)