# preprocessor.py
import math
import time

import numpy as np
import pandas as pd
from PIL import Image, ImageFile
from sklearn.decomposition import IncrementalPCA

# Median dimensions of the lesions after segmentation.
# NOTE: these are *not* the median dimensions of the ISIC dataset alone;
# they are the medians of ISIC combined with another dataset.
HEIGHT = 510
WIDTH = 766
CHANNELS = 3

DATA_DIR = 'bound_box_final_2/'
N_COMPS = 10
BATCH_SIZE = 100
OUTPUT_NAME = 'final_isic_thres'


def timer(f):
    """Decorator that prints how long the wrapped function took to run."""
    def wrapper(*args, **kwargs):
        t1 = time.time()
        # Forward the wrapped function's return value.
        result = f(*args, **kwargs)
        t2 = time.time()
        print("Function took {} seconds.\n".format(t2 - t1))
        return result
    return wrapper


def build_dataset(dataset, ignore_set):
    """Load every usable image listed in <dataset>.csv into one matrix.

    Each row holds the flattened HEIGHT x WIDTH x CHANNELS pixel values
    followed by the class label in the final column.
    """
    print("Building dataset: {}".format(dataset))
    data = pd.read_csv(dataset + '.csv')
    # Images are not reliably complete, so let PIL load truncated files.
    ImageFile.LOAD_TRUNCATED_IMAGES = True
    # Flattened image vector has HEIGHT * WIDTH * CHANNELS entries,
    # plus one extra column for the class label.
    data_matrix = np.zeros((data.shape[0], HEIGHT * WIDTH * CHANNELS + 1))
    for idx, row in data.iterrows():
        name = row.iloc[0]
        # Skipped images leave their row in data_matrix as zeros.
        if name + ".jpg" in ignore_set or "isic" not in name:
            print("ignoring " + name + ".jpg")
            continue
        img = Image.open(DATA_DIR + name + '.jpg')
        # PIL's resize() takes (width, height); LANCZOS is the
        # antialiasing filter (formerly Image.ANTIALIAS).
        resized_img = img.resize((WIDTH, HEIGHT), Image.LANCZOS)
        image_vector = np.array(resized_img).flatten()
        image_vector_with_class = np.concatenate(
            (image_vector, [row['melanoma']]), axis=0)
        data_matrix[idx] = image_vector_with_class
        if idx % 50 == 0:
            print("{} images done...".format(idx))
    return data_matrix
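
# A minimal sketch of the CSV layout build_dataset() expects. The header
# name "image" and the ids shown are hypothetical; the code only reads the
# first column (the image id, without extension) and the 'melanoma' column
# (the class label):
#
#   image,melanoma
#   isic_0000001,0
#   isic_0000002,1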


def get_next_batch(data, i, batch_size):
    """Return the i-th batch of rows; the last batch may be smaller."""
    if (i + 1) * batch_size > data.shape[0]:
        return data[i * batch_size:data.shape[0], :]
    return data[i * batch_size:(i + 1) * batch_size, :]


def reduce_dimensionality(dataset):
    """Fit IncrementalPCA on a saved dataset and return the reduced matrix."""
    print("Reducing dimensionality for dataset: {}".format(dataset))
    print("Loading {}".format(dataset))
    data_with_class = np.load(dataset)
    print("Finished loading {}".format(dataset))
    data_no_class, y = extract_features_and_class(data_with_class)
    # Regular PCA needs the whole matrix in memory at once and seg-faults
    # on a dataset this size, hence IncrementalPCA.
    # The batch size here is somewhat arbitrary.
    pca = IncrementalPCA(n_components=N_COMPS)
    num_batches = int(math.ceil(y.shape[0] / float(BATCH_SIZE)))
    print("Beginning to fit dataset")
    for i in range(num_batches):
        batch = get_next_batch(data_no_class, i, BATCH_SIZE)
        pca.partial_fit(batch)
        if i % 10 == 0:
            print("{}% complete.".format(float(i) / num_batches * 100))
    print("Beginning to transform dataset")
    reduced_data = None
    for i in range(num_batches):
        batch = get_next_batch(data_no_class, i, BATCH_SIZE)
        transformed_chunk = pca.transform(batch)
        # 'is None', not '== None': comparing an ndarray with == is
        # elementwise and cannot be used as a truth value.
        if reduced_data is None:
            reduced_data = transformed_chunk
        else:
            reduced_data = np.vstack((reduced_data, transformed_chunk))
        if i % 10 == 0:
            print("{}% complete.".format(float(i) / num_batches * 100))
    print("PCA complete for {} components. Explained variance: {}".format(
        N_COMPS, np.sum(pca.explained_variance_ratio_)))
    print(reduced_data.shape)
    print(y.shape)
    reduced_data_with_class = np.hstack((reduced_data, y))
    return reduced_data_with_class
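
# A minimal sketch of consuming the reduced matrix this function returns
# (not part of the pipeline itself; the file name follows
# build_and_write_reduced_dataset() below):
#
#   reduced = np.load(OUTPUT_NAME + "_reduced.npy")
#   X, y = extract_features_and_class(reduced)
#   # X: (n_samples, N_COMPS) PCA features; y: (n_samples, 1) labels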


def extract_features_and_class(data_with_class):
    """Split a matrix whose last column is the label into (X, y_column)."""
    y = data_with_class[:, -1]
    # Reshape into a column vector instead of a row.
    y_col = y.reshape(y.size, 1)
    n_columns = data_with_class.shape[1] - 1
    data_no_class = data_with_class[:, 0:n_columns]
    return data_no_class, y_col


def get_ignored_images():
    """Read failed_thres_img.txt and return its file names as a set."""
    with open("failed_thres_img.txt") as f:
        return set(line.rstrip("\n") for line in f)
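
# failed_thres_img.txt is assumed to hold one file name per line, including
# the .jpg extension, to match the ignore check in build_dataset() (the
# names below are hypothetical):
#
#   isic_0000042.jpg
#   isic_0000117.jpg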


@timer
def build_and_write_dataset(dataset):
    """Build the raw image matrix and save it as <dataset>.npy."""
    dataset_file = dataset + ".npy"
    ignore_set = get_ignored_images()
    dataset_matrix = build_dataset(dataset, ignore_set)
    print("Saving {}".format(dataset_file))
    np.save(dataset_file, dataset_matrix)


@timer
def build_and_write_reduced_dataset(dataset):
    """Run PCA on <dataset>.npy and save it as <dataset>_reduced.npy."""
    reduced_dataset_file = dataset + "_reduced.npy"
    reduced_matrix = reduce_dimensionality(dataset + '.npy')
    print("Saving {}".format(reduced_dataset_file))
    np.save(reduced_dataset_file, reduced_matrix)


def main(dataset):
    build_and_write_dataset(dataset)
    build_and_write_reduced_dataset(dataset)


if __name__ == "__main__":
    if BATCH_SIZE < N_COMPS:
        # See https://github.com/scikit-learn/scikit-learn/issues/6452
        raise ValueError("Number of components must not exceed batch size.")
    main(OUTPUT_NAME)
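
# Usage (assuming the segmented .jpg images live in bound_box_final_2/ and
# final_isic_thres.csv sits next to this script):
#
#   python preprocessor.py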