-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathknn_main.py
executable file
·138 lines (127 loc) · 4.87 KB
/
knn_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/python3
"""
this script allows user to train and evaluate the performance of Direct-CS-kNN
"""
# Author: Dec4f
# License: GPLv3
import sys
import os
from copy import deepcopy
import numpy as np
import mldata # ignored private C.45 parser
from cs_knn import DirectCSkNN
from cs_stacking import BaseStacking
from eval_tools import *
from split_tools import KFoldCV
N_FOLD = 5
COST_MATRIX = np.array([[0.0, 0.25],
[0.6, 0.0]])
N_NEIGHBORS = 5
BASE_CONFIG = [COST_MATRIX, N_NEIGHBORS, 'euclidean']
SMOOTHING_CONFIG = [COST_MATRIX, N_NEIGHBORS, 'euclidean', 100, 0.8]
STACKING_CONFIG = [[0.01, 100, 10], N_FOLD]
STK_BASE_CONFIG = [[COST_MATRIX, 1, 'euclidean'],
[COST_MATRIX, 10, 'euclidean'],
[COST_MATRIX, 50, 'euclidean']]
def main():
    """
    Train and evaluate a Direct-CS-kNN model from command-line input.

    Expected argv: <file_path> <use_full_sample> <opt_method>
        file_path       : path to the C4.5 dataset
        use_full_sample : "1" to fit once on the full dataset, "0" for k-fold CV
        opt_method      : one of 'base', 'smooth', 'stack'
    """
    # Guard against missing arguments so the user sees a usage line instead
    # of an opaque unpacking ValueError.
    if len(sys.argv) < 4:
        raise SystemExit("usage: knn_main.py <file_path> <use_full_sample> <opt_method>")
    file_path, use_full_sample, opt_method = sys.argv[1:4]
    examples = get_dataset(file_path)
    clf = parse_enhance_method(opt_method)
    # Stacking always goes through CV; base learners may fit on the full set.
    if int(use_full_sample) and not isinstance(clf, BaseStacking):
        # Column 0 is skipped (presumably a row id — TODO confirm against
        # the C4.5 parser); last column is the target label.
        samples = examples[:, 1:-1]
        targets = examples[:, -1]
        clf.fit(samples, targets)
    else:
        # Cross-validate and report mean ± std of each metric across folds.
        metrics_list = cv_eval(clf, examples)
        m_acc, m_prec, m_rec, m_spec, m_cost = [np.mean(metri) for metri in metrics_list]
        std_acc, std_prec, std_rec, std_spec, std_cost = [np.std(metri) for metri in metrics_list]
        print(("Mean Accuracy: %.3f ± %.3f " + os.linesep +
               "Precision: %.3f ± %.3f " + os.linesep +
               "Recall: %.3f ± %.3f" + os.linesep +
               "Specificity: %.3f ± %.3f" + os.linesep +
               "Misclassification Cost: %.3f ± %.3f") %
              (m_acc, std_acc,
               m_prec, std_prec,
               m_rec, std_rec,
               m_spec, std_spec,
               m_cost, std_cost))
def get_dataset(file_path):
    """
    Load a C4.5-formatted dataset and return it as an object-dtype ndarray.
    ----------
    file_path : String
        the path to the dataset; its final path component names the dataset
    """
    dataset_name = file_path.split(os.sep)[-1]
    parsed_rows = mldata.parse_c45(dataset_name, file_path)
    return np.array(parsed_rows, dtype=object)
def parse_enhance_method(opt_method):
    """
    Map a user-supplied method name to a configured classifier instance.
    ----------
    opt_method : String
        the optimization method of choice: 'base', 'smooth', or 'stack'

    Returns a DirectCSkNN or BaseStacking instance.
    Raises ValueError when opt_method is not a recognized method name.
    """
    if opt_method == 'base':
        print("Learning with Base Model...")
        return DirectCSkNN(*BASE_CONFIG)
    if opt_method == 'smooth':
        # m-smoothing variant: extra smoothing constant and weight in config
        print("Learning with M-Smoothing...")
        return DirectCSkNN(*SMOOTHING_CONFIG)
    if opt_method == 'stack':
        print("Learning with Stacking...")
        return BaseStacking(*STACKING_CONFIG)
    # ValueError (a subclass of Exception) replaces the original broad
    # Exception so callers can catch the failure precisely; the original
    # trailing else branch was unreachable and has been removed.
    raise ValueError("Error Code: OPT_METHOD_UNAVAILABLE")
def cv_eval(clf, examples):
    """
    Perform k-fold cross validation and collect metric data from each fold.
    ----------
    clf : classifier template (DirectCSkNN or BaseStacking), deep-copied per fold
    examples : ndarray whose first column is skipped and last column is the label

    Returns [acc, prec, rec, spec, cost], each an ndarray of length N_FOLD.
    The cost array stays all-zero for stacking models.
    """
    acc, prec, rec, spec, cost = [np.zeros(N_FOLD, dtype=float) for _ in range(5)]
    kcv = KFoldCV(N_FOLD, shuffle=True, seed=12345)
    for i, (train_idx, test_idx) in enumerate(kcv.split(examples)):
        train_set = examples[train_idx]
        test_set = examples[test_idx]
        # Fresh copy each fold so per-fold fitting never leaks state.
        model = deepcopy(clf)
        if isinstance(model, BaseStacking):
            clfs = [DirectCSkNN(*STK_BASE_CONFIG[j]) for j in range(3)]
            acc[i], prec[i], rec[i], spec[i] = cv_with_stacking(model, train_set, test_set, clfs)
        else:
            acc[i], prec[i], rec[i], spec[i], cost[i] = cv_non_stacking(model, train_set, test_set)
    print("accuracy = ", acc)
    print("precision = ", prec)
    print("recall = ", rec)
    print("specificity = ", spec)
    # Check the template, not the loop variable: referencing `model` here
    # raised NameError whenever the split produced no folds.
    if not isinstance(clf, BaseStacking):
        print("misclassification cost = ", cost)  # no cost for base stacking
    return [acc, prec, rec, spec, cost]
def cv_with_stacking(model, train_set, test_set, clfs):
    """
    Fit the stacking model on one fold and score its predictions.

    Returns (accuracy, precision, recall, specificity) for the fold.
    """
    truth = test_set[:, -1]
    predicted = model.fit_pred(train_set, test_set, clfs)
    scores = (accuracy(truth, predicted),
              precision(truth, predicted),
              recall(truth, predicted),
              specificity(truth, predicted))
    return scores
def cv_non_stacking(model, train_set, test_set):
    """
    Fit a base learner on one fold and score its row-by-row predictions.

    Returns (accuracy, precision, recall, specificity, mean cost) for the fold.
    """
    # First column is skipped, last column holds the labels.
    features_train, labels_train = train_set[:, 1:-1], train_set[:, -1]
    features_test, labels_test = test_set[:, 1:-1], test_set[:, -1]
    model.fit(features_train, labels_train)
    n_test = len(labels_test)
    predictions = np.zeros(n_test, dtype=bool)
    costs = np.zeros(n_test, dtype=float)
    # predict() yields a (label, cost) pair per test row.
    for idx in range(n_test):
        predictions[idx], costs[idx] = model.predict(features_test[idx])
    return (accuracy(labels_test, predictions),
            precision(labels_test, predictions),
            recall(labels_test, predictions),
            specificity(labels_test, predictions),
            np.mean(costs))
# Script entry point: run only when executed directly, not when imported.
if __name__ == '__main__':
    main()