-
Notifications
You must be signed in to change notification settings - Fork 23
Expand file tree
/
Copy pathdata_split.py
More file actions
32 lines (29 loc) · 1.1 KB
/
data_split.py
File metadata and controls
32 lines (29 loc) · 1.1 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
'''
Data splitting is not performed while running either the resnet or the lstm training.
Instead, it is done once, up front, before any training is started.
'''
import sys
import os
import pandas as pd
import random
# Positional command-line arguments, in order:
#   1: path of the full data file, 2: train ratio, 3: test ratio, 4: random seed.
# The three numeric arguments are parsed as integers.
total_data, train_ratio, test_ratio, random_seed = (
    sys.argv[1],
    int(sys.argv[2]),
    int(sys.argv[3]),
    int(sys.argv[4]),
)
def _write_split(path, rows):
    '''
    write the sequence/RT header line followed by one row per line to path
    '''
    with open(path, 'w') as f:
        f.write('sequence\tRT\n')
        # Terminate every row individually so that an empty split produces a
        # header-only file instead of a header followed by a stray blank line.
        f.writelines(row + '\n' for row in rows)


def split_data(data_path, train_ratio, test_ratio, seed):
    '''
    generate the training and testing data using original data and random seed

    The first column of the file, as parsed by pandas.read_csv with its
    default settings, is taken as the list of rows. The rows are shuffled
    after seeding the random module with `seed`, and the first
    int(len(rows) * train_ratio / (train_ratio + test_ratio)) of them go to
    the training file; the rest go to the testing file.
    NOTE(review): the output header is tab-separated, so this presumably
    expects a tab-separated input whose whole lines end up in the first
    column -- confirm against the real data files.

    Output files are written next to the input as
    <input without extension>_train_<seed>.txt and
    <input without extension>_test_<seed>.txt.

    Args:
        data_path: path of the original data file.
        train_ratio: non-negative weight of the training part.
        test_ratio: non-negative weight of the testing part.
        seed: seed passed to random.seed; also used in the output file names.

    Raises:
        ValueError: if a ratio is negative or both ratios are zero.
    '''
    total_ratio = train_ratio + test_ratio
    if train_ratio < 0 or test_ratio < 0 or total_ratio <= 0:
        raise ValueError(
            'train_ratio and test_ratio must be non-negative and not both zero, got '
            + str(train_ratio) + ' and ' + str(test_ratio)
        )

    data = pd.read_csv(data_path).iloc[:, 0].values.tolist()
    random.seed(seed)
    random.shuffle(data)
    cut = int(len(data) * train_ratio / total_ratio)

    # Strip whatever extension the input has instead of assuming it is
    # exactly four characters long (e.g. '.txt' / '.csv').
    base = os.path.splitext(data_path)[0]
    _write_split(base + '_train_' + str(seed) + '.txt', data[:cut])
    _write_split(base + '_test_' + str(seed) + '.txt', data[cut:])
# Announce the seed, run the split on the command-line arguments, then report completion.
print('Now split the data into training and testing data sets with the seed:', random_seed, '\n', sep='')
split_data(
    total_data,
    train_ratio,
    test_ratio,
    random_seed,
)
print('Done.')