-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathkNN.py
133 lines (99 loc) · 4.19 KB
/
kNN.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#%% Import libraries and data
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial import distance
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
import statistics
import pandas as pd
import numpy as np
#%% Original data
# Load the pre-PCA dataset: column 0 is the class label, all remaining
# columns are features.  (Assumes the CSV carries no index column — TODO confirm.)
original_data = pd.read_csv("data_before_pca.csv")
o_features = original_data.iloc[:,1:]
o_labels = original_data.iloc[:,0]
# 70/30 split; random_state is pinned so every cell that re-splits with the
# same seed reproduces the same partition of rows.
ft_train, ft_test, l_train, l_test = train_test_split(o_features, o_labels, test_size=0.3, random_state=42)
print("Original data uploaded successfully")
#%% Normalized original data
# Fit the scaler on the TRAINING portion only, then apply it to the test
# portion.  The previous version called fit_transform on the full matrix
# before splitting, which leaks the test set's min/max into the scaling
# (data leakage).
scaler = MinMaxScaler()
ft_train, ft_test, l_train, l_test = train_test_split(o_features, o_labels, test_size=0.3, random_state=42)
ft_train = scaler.fit_transform(ft_train)
ft_test = scaler.transform(ft_test)
# Keep a scaled copy of the full feature matrix for the cross-validation
# cell below, preserving that cell's existing behaviour.  NOTE(review):
# scaling before CV still leaks across folds — an sklearn Pipeline would be
# the rigorous fix.
o_features = scaler.fit_transform(o_features)
print("Normalized original data uploaded successfully")
#%% PCA data
# Load the post-PCA dataset.  NOTE(review): the labels are taken from
# `original_data`, not from the PCA file — this assumes both CSVs keep their
# rows in the same order; verify against the PCA export step.
pca_data = pd.read_csv("data_after_pca.csv")
pca_features = pca_data.iloc[:,1:]
pca_labels = original_data.iloc[:,0]
# This re-split OVERWRITES ft_train/ft_test/l_train/l_test, so any cell run
# after this one sees the PCA split, not the original-data split.
ft_train, ft_test, l_train, l_test = train_test_split(pca_features, pca_labels, test_size=0.3, random_state=42)
print("PCA data uploaded successfully")
#%% KNN
def knn(train, valid, k):
    """Predict a label for each row of ``valid`` via brute-force k-NN voting.

    Parameters
    ----------
    train : pd.DataFrame
        Training examples.  Every column except the LAST is a feature; the
        last column is the class label.
    valid : pd.DataFrame
        Query examples; feature columns only (no label column).
    k : int
        Number of nearest neighbours that vote on each prediction.

    Returns
    -------
    list
        Predicted label for each row of ``valid``, in order.
    """
    # Hoist the DataFrame -> ndarray conversion out of the loops: `.values`
    # rebuilds the whole array on every access, so the original version paid
    # one full conversion per (query, training-row) pair.
    train_values = train.values
    train_features = train_values[:, :-1]  # all but the label column
    train_labels = train_values[:, -1]
    predictions = []
    for query in valid.values:
        # Distance from this query to every training example.  (The loop
        # variable is NOT named `distance`, to avoid shadowing the imported
        # scipy.spatial.distance module.)
        dists = [(distance.euclidean(example, query), idx)
                 for idx, example in enumerate(train_features)]
        # The k closest rows; ties in distance are broken by row index
        # because the tuples sort lexicographically.
        k_nearest = sorted(dists)[:k]
        # Majority vote among the k neighbours' labels.  NOTE: on
        # Python < 3.8 statistics.mode raises StatisticsError for a tie.
        k_labels = [train_labels[idx] for _, idx in k_nearest]
        predictions.append(statistics.mode(k_labels))
    return predictions
#%% KNN original data
# NOTE(review): when the cells are executed top-to-bottom, ft_train/ft_test/
# l_train/l_test were last assigned by the PCA cell above, so despite this
# cell's title it evaluates the PCA split.  Re-run the "Original data" cell
# immediately before this one to evaluate the original features — confirm
# the intended execution order.
# knn() expects the label as the LAST column, which pd.concat produces here.
train_o = pd.concat([ft_train, l_train], axis = 1)
predictions_o=knn(train_o,ft_test,5)
cm = metrics.confusion_matrix(l_test, predictions_o)
acc = metrics.accuracy_score(l_test, predictions_o)
plt.figure(figsize=(9,9))
# fmt=".3f" renders the integer confusion-matrix counts with three decimals.
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'PuRd_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(round(acc,2))
plt.title(all_sample_title, size = 15)
#%% KNN PCA data
# NOTE(review): this cell performs the same computation as the previous one —
# both concatenate the CURRENT ft_train/l_train globals — so after a
# sequential run both cells evaluate the PCA split and produce identical
# results.  Confirm which split each cell is meant to use.
train_pca = pd.concat([ft_train, l_train], axis = 1)
predictions_pca=knn(train_pca,ft_test,5)
cm = metrics.confusion_matrix(l_test, predictions_pca)
acc = metrics.accuracy_score(l_test, predictions_pca)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'PuRd_r')
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(round(acc,2))
plt.title(all_sample_title, size = 15)
#%% Sensitivity of kNN to k
# Compare test accuracy of distance-weighted vs. uniform-weighted kNN as the
# neighbourhood size k grows from 1 to 14.
k_range = range(1,15)
scores_d = []  # accuracy with weights='distance'
scores_u = []  # accuracy with weights='uniform'
for k in k_range:
    # Fit one classifier per weighting scheme at this k, in the same order
    # as before: 'distance' first, then 'uniform'.
    for weighting, bucket in (('distance', scores_d), ('uniform', scores_u)):
        model = KNeighborsClassifier(n_neighbors=k, weights=weighting)
        model.fit(ft_train, l_train)
        bucket.append(model.score(ft_test, l_test))
plt.figure()
plt.xlabel('k')
plt.ylabel('accuracy')
plt.scatter(k_range, scores_d, label="distance")
plt.scatter(k_range, scores_u, label="uniform")
plt.legend()
plt.show()
#%% Cross Validation
# Cross-validated ROC-AUC for uniform-weight kNN at a few k values.
# NOTE(review): scoring="roc_auc" requires a binary label column — confirm.
# The features were scaled once before CV (see the normalisation cell), which
# leaks fold statistics; an sklearn Pipeline would avoid that.
# Dead code removed: `means2` and the commented-out PCA cross-validation were
# never executed.
k_range = [1, 3, 5, 10]
means = []
for k in k_range:
    clf = KNeighborsClassifier(n_neighbors=k, weights='uniform')
    cv = cross_val_score(clf, o_features, o_labels, scoring="roc_auc")
    means.append(round(np.mean(cv), 3))
print(means)