-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathload_data.py
More file actions
62 lines (41 loc) · 1.61 KB
/
load_data.py
File metadata and controls
62 lines (41 loc) · 1.61 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import pandas as pd
import numpy as np
# load data and preprocess
def data_preparation(path, type='excel'):
    """Load a table and encode its binary columns as padded index sequences.

    The table must contain a ``y`` column, which becomes the target array.
    Among the remaining columns, those for which ``Series.unique()`` reports
    exactly two values are treated as binary features.  Zeros in those
    columns are replaced by NaN so that ``bin_gen`` (which keeps non-null
    cells) lists only the features that are "on" for each row, and
    ``bin_pad_convert`` then turns those lists into a zero-padded matrix.

    Args:
        path: Location of the input file, handed to the pandas reader.
        type: Input format, either ``'csv'`` or ``'excel'`` (default).
            The name shadows the builtin but is kept for caller
            compatibility.

    Returns:
        Tuple ``(x_bin_features, feats, tokens, feat_max, y)`` where the
        first four items are the outputs of ``bin_pad_convert`` (index
        matrix, vocabulary starting with ``'pad'``, vocabulary size, padded
        row length) and ``y`` is the ``y`` column as a NumPy array.

    Raises:
        ValueError: If ``type`` is neither ``'csv'`` nor ``'excel'``.
    """
    # Compare by value: ``is`` tests object identity, which is not
    # guaranteed to hold for two equal strings.
    if type == 'csv':
        df = pd.read_csv(path)
    elif type == 'excel':
        df = pd.read_excel(path)
    else:
        raise ValueError(
            f"Unsupported file type {type!r}; expected 'csv' or 'excel'"
        )

    # x and y
    y = np.array(df['y'])
    x = df.drop(['y'], axis=1)

    # Keep only the columns that take exactly two distinct values.
    bin_feats = [col for col in x.columns if len(x[col].unique()) == 2]

    # A zero means "feature absent"; turning it into NaN lets bin_gen skip it.
    # NOTE(review): a two-valued column without zeros (e.g. 1/2) is listed
    # for every row -- confirm inputs are 0/1 coded.
    x_bin = x[bin_feats].replace(0, np.nan)

    # For each row, convert the present features to a list of names.
    x_bin_features = bin_gen(x_bin)

    # Pad to the maximum length and convert to an index matrix.
    x_bin_features, feats, tokens, feat_max = bin_pad_convert(
        x_bin_features, bin_feats
    )
    return x_bin_features, feats, tokens, feat_max, y
# convert to a list of words for binary features
def bin_gen(x):
    """Collect, for every row of ``x``, the names of its non-null columns.

    Args:
        x: DataFrame in which a null cell means the column's feature is
            absent for that row.

    Returns:
        A NumPy array with one entry per row, in row order.  When rows hold
        different numbers of names the result is a 1-D object array whose
        elements are arrays of column names.  When every row holds the same
        number of names the result is whatever ``np.array`` builds from the
        per-row arrays (a 2-D array of shape ``(n_rows, n_names)``), as in
        the original implementation.
    """
    # Compute the null mask once for the whole frame, then pick names per row.
    present = x.notnull().to_numpy()
    x_features = [np.array(x.columns[row_mask]) for row_mask in present]

    # Uniform row lengths (or no rows at all) stack cleanly.
    if len({len(names) for names in x_features}) <= 1:
        return np.array(x_features)

    # Rows of differing length cannot be stacked: recent NumPy releases
    # raise ValueError for ragged input unless an object array is requested,
    # so fill one element by element.
    ragged = np.empty(len(x_features), dtype=object)
    for row, names in enumerate(x_features):
        ragged[row] = names
    return ragged
# convert to matrix and generate descriptions (binary features)
def bin_pad_convert(txt_features, bin_feat_list):
    """Encode per-sample feature-name lists as a zero-padded index matrix.

    The vocabulary is ``'pad'`` followed by ``bin_feat_list``, so index 0 is
    the padding value and the k-th feature name maps to index ``k + 1``.
    Each row of the result holds the indices of that sample's features,
    left-aligned, with unused trailing positions left at 0.

    Args:
        txt_features: Sequence with one sequence of feature names per sample.
        bin_feat_list: Feature names making up the vocabulary, in index
            order.  Any sequence is accepted; it is copied into a list.

    Returns:
        Tuple ``(x_features, bin_feats, tokens, bin_feat_max)``:
        the ``int32`` matrix of shape ``(n_samples, bin_feat_max)``, the
        vocabulary list including ``'pad'``, the vocabulary size, and the
        longest per-sample feature count (0 when there are no samples).

    Raises:
        KeyError: If a sample contains a name missing from the vocabulary.
    """
    # default=0 keeps an empty sample list from crashing max().
    bin_feat_max = max((len(feat) for feat in txt_features), default=0)

    # list(...) so that arrays / Index objects concatenate instead of
    # broadcasting the '+' operator element-wise.
    bin_feats = ['pad'] + list(bin_feat_list)
    tokens = len(bin_feats)

    x_features = np.zeros((len(txt_features), bin_feat_max), dtype='int32')
    feat_index = {name: idx for idx, name in enumerate(bin_feats)}

    for row, names in enumerate(txt_features):
        for col, name in enumerate(names):
            x_features[row, col] = feat_index[name]
    return x_features, bin_feats, tokens, bin_feat_max