-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathfeature_engineering.py
82 lines (58 loc) · 2.23 KB
/
feature_engineering.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from config import *
import pandas as pd
import numpy as np
from tqdm import tqdm
# Register tqdm's pandas integration so that `progress_apply` (used in
# make_all_features) is available on groupby objects.
tqdm.pandas()
def load_raw_data(file_name="SPY2011.csv", nrows=None):
    """Read the raw high-frequency trade file from the project input folder.

    Args:
        file_name: Name of the CSV file inside ``folder_structure.path_input``.
            Defaults to the file that was previously hard-coded.
        nrows: Optional number of rows to read; use it to work on only a
            selection of the data. ``None`` reads the whole file.

    Returns:
        A ``pandas.DataFrame`` with the contents of the CSV file.
    """
    import os

    csv_path = os.path.join(folder_structure.path_input, file_name)
    return pd.read_csv(csv_path, nrows=nrows)
def etl(df, n_intervals=78):
    """Compute realized variance and semivariances for one block of trades.

    The trades are restricted to 09:30:00-16:00:00 (inclusive), thinned to
    every k-th trade with ``k = number_of_trades // n_intervals``, and the
    squared log returns of the thinned prices are summed.

    Args:
        df: DataFrame with at least the columns ``TIME_M`` (time of day,
            either numeric nanoseconds or strings parseable by
            ``pandas.to_timedelta``) and ``PRICE``. Other columns are ignored.
        n_intervals: Target number of sampling intervals; the default of 78
            is the value that was previously hard-coded.

    Returns:
        A ``pandas.Series`` with integer index ``0, 1, 2`` holding, in order,
        the sum of squared returns, the sum of squared positive returns and
        the sum of squared negative returns.

    Raises:
        ValueError: If ``n_intervals`` is smaller than 1.
    """
    if n_intervals < 1:
        raise ValueError("n_intervals must be at least 1")

    # A unit may only be given for numeric input; pandas rejects it for strings.
    time_of_day = df["TIME_M"]
    if pd.api.types.is_numeric_dtype(time_of_day):
        time_of_day = pd.to_timedelta(time_of_day, unit="ns")
    else:
        time_of_day = pd.to_timedelta(time_of_day)

    # Keep only trades inside opening hours (both bounds inclusive).
    market_open = pd.to_timedelta("09:30:00")
    market_close = pd.to_timedelta("16:00:00")
    in_session = (time_of_day >= market_open) & (time_of_day <= market_close)
    prices = df.loc[in_session.to_numpy(), "PRICE"]

    # Integer step, never below 1: with fewer than n_intervals trades the
    # step would otherwise be 0 and no sampling grid could be built.
    step = max(len(prices) // n_intervals, 1)
    sampled_prices = prices.iloc[::step]

    # Log returns; the first one is NaN and is skipped by the sums below.
    returns = np.log(sampled_prices).diff()
    squared_returns = returns ** 2

    rv = squared_returns.sum()
    rsv_plus = squared_returns[returns > 0].sum()
    rsv_minus = squared_returns[returns < 0].sum()
    return pd.Series([rv, rsv_plus, rsv_minus])
def make_all_features(high_frequency_data_set):
    """Build one row of features per trading date.

    The input is grouped by its ``DATE`` column and ``etl`` is applied to
    each group with a tqdm progress bar. The three values returned by
    ``etl`` become the columns ``RV``, ``RSV_plus`` and ``RSV_minus``, and
    the grouping key is moved from the index back into a regular column.
    """
    column_names = {0: "RV", 1: "RSV_plus", 2: "RSV_minus"}
    grouped_by_date = high_frequency_data_set.groupby(high_frequency_data_set.DATE)
    daily_features = grouped_by_date.progress_apply(etl)
    return daily_features.rename(columns=column_names).reset_index(level=0)
def save_data_features(df):
    """Write the feature table to ``DataFeatures_2.csv`` in the input folder.

    After writing, the folder the file was exported to is printed.
    """
    target_folder = folder_structure.path_input
    target_file = target_folder + "/" + "DataFeatures_2.csv"
    df.to_csv(target_file)  # adding a unique identifier?
    print("Data exported in {}".format(target_folder))
def run_feature_engineering():
    """Run the full pipeline: load the raw trades, build features, save them.

    Nothing is returned; the result is only written to disk.
    """
    features = make_all_features(load_raw_data())
    save_data_features(features)
    # return features
# NOTE(review): this call runs the whole pipeline whenever the module is
# imported, not only when it is executed as a script; consider an
# `if __name__ == "__main__":` guard if no importer relies on that.
run_feature_engineering()