-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathdata_loader.py
52 lines (42 loc) · 1.96 KB
/
data_loader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import random
import numpy as np
import pandas as pd
def __append_data(data, features_list, label_list, label_dict):
    """Append the features and label ids of ``data`` to the given lists.

    For every row, all elements but the last are appended to
    ``features_list``; the last element is lower-cased, looked up in
    ``label_dict`` and the resulting value is appended to ``label_list``.
    Both lists are modified in place and nothing is returned.
    """
    for row in data:
        # Features first, then the label, one row at a time.
        features_list.append(row[:-1])
        label_name = row[-1].lower()
        label_list.append(label_dict[label_name])
def _holdout_size(pool_size, rate, class_size):
    """Return how many of ``pool_size`` samples to hold out at ``rate``.

    A class with at least three samples always gives up at least one sample.
    The result is clamped to ``[0, pool_size]`` so it is a valid slice length.
    """
    size = int(pool_size * rate)
    if size == 0 and class_size >= 3:
        size = 1
    return min(max(size, 0), pool_size)


def _stack_features(features_list, feature_count, dtype):
    """Stack feature rows into one array; an empty split yields shape (0, feature_count)."""
    if not features_list:
        # np.stack raises ValueError on an empty sequence.
        return np.empty((0, feature_count), dtype=dtype)
    return np.stack(features_list)


def load_dataset(file_path, valid_rate=0.1, test_rate=0.1):
    """Load a CSV file and split it per class into train/validation/test sets.

    The last column of the CSV is the label (a string); every other column is
    a feature. Rows are shuffled with a fixed seed, grouped by label, and each
    group is split separately: the tail of the group goes to the test set, the
    tail of what remains goes to the validation set, and the rest is training
    data. A class with at least three samples always contributes at least one
    sample to both the test and the validation set; smaller classes are kept
    entirely in the training set.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        valid_rate: Fraction of each class's non-test samples used for
            validation.
        test_rate: Fraction of each class's samples used for testing.

    Returns:
        A 7-tuple ``(train_features, train_labels, valid_features,
        valid_labels, test_features, test_labels, label_dict)``. The feature
        and label entries are numpy arrays; ``label_dict`` maps each
        lower-cased label name to its integer id.
    """
    data_frame = pd.read_csv(file_path).sample(frac=1, random_state=11)
    samples = data_frame.values
    feature_count = samples.shape[1] - 1

    # Sort the names so the name -> id mapping is identical on every run;
    # iterating a set of strings directly depends on hash randomisation.
    label_names = sorted({key.lower() for key in data_frame.iloc[:, -1].unique()})
    label_dict = {name: index for index, name in enumerate(label_names)}

    # Group the shuffled rows by their raw label value.
    label_data_dict = {}
    for sample in samples:
        label_data_dict.setdefault(sample[-1], []).append(sample)

    train_features_list, train_label_list = [], []
    valid_features_list, valid_label_list = [], []
    test_features_list, test_label_list = [], []

    for data in label_data_dict.values():
        data_size = len(data)
        test_size = _holdout_size(data_size, test_rate, data_size)
        # Use explicit split points instead of negative slices: with a size of
        # 0, ``data[:-0]`` is empty and ``data[-0:]`` is the whole list, which
        # would move every sample of a small class into the hold-out set.
        train_end = data_size - test_size
        train_data = data[:train_end]
        valid_size = _holdout_size(len(train_data), valid_rate, data_size)
        valid_start = len(train_data) - valid_size

        __append_data(train_data[:valid_start], train_features_list, train_label_list, label_dict)
        __append_data(train_data[valid_start:], valid_features_list, valid_label_list, label_dict)
        __append_data(data[train_end:], test_features_list, test_label_list, label_dict)

    # Mix the classes in the training set while keeping features and labels
    # paired. This uses the global ``random`` state, which is not seeded here.
    train_pairs = list(zip(train_features_list, train_label_list))
    random.shuffle(train_pairs)
    train_features = [features for features, _ in train_pairs]
    train_labels = [label for _, label in train_pairs]

    dtype = samples.dtype
    return (
        _stack_features(train_features, feature_count, dtype),
        np.array(train_labels, dtype=int),
        _stack_features(valid_features_list, feature_count, dtype),
        np.array(valid_label_list, dtype=int),
        _stack_features(test_features_list, feature_count, dtype),
        np.array(test_label_list, dtype=int),
        label_dict,
    )