forked from KnowledgeCaptureAndDiscovery/somef
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtraining.py
More file actions
102 lines (88 loc) · 4.37 KB
/
training.py
File metadata and controls
102 lines (88 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# training.py
# parameters:
## category: {description, installation, invocation, citation}
## model_name
# output:
## to console: Classification Report
## to SM2KG/models/`model_name`: pickled model
# Constants: the four excerpt categories a binary classifier can be trained for.
categories = ('description', 'installation', 'invocation', 'citation')
import argparse
import random
import pandas as pd
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
argparser = argparse.ArgumentParser(description=f"Train binary classifier for one of categories: {categories}.")
argparser.add_argument('-c', '--category', choices=categories, required=True, help='category in question')
argparser.add_argument('-o', '--output', help='output pickled model')
argv = argparser.parse_args()
selected_category = argv.category

# One CSV of labelled excerpts per category; rows from the selected category
# become positives, a quarter-size sample of each other category becomes negatives.
categories_df = {cat: pd.read_csv(f"./data/{cat}.csv") for cat in categories}
negative_sample_size = int(len(categories_df[selected_category]) / 4)
print(f"Selected Category: {selected_category}")
for category in categories_df:
    # BUG FIX: drop() no longer accepts a positional axis argument (removed in pandas 2.0).
    categories_df[category].drop(columns='URL', inplace=True)
    if category != selected_category:
        categories_df[category] = categories_df[category].sample(negative_sample_size)
    # Binary label column named after the selected category: True only for its own rows.
    categories_df[category] = categories_df[category].assign(**{selected_category: category == selected_category})
    print("{} has {} samples;".format(category, len(categories_df[category])))
# Extra negative background text drawn from the Penn Treebank corpus.
# BUG FIX: the label column was hard-coded to description=False, which produced a
# stray NaN-filled column when training any other category; key it on selected_category.
treebank_background = pd.DataFrame(
    map(lambda sent: ' '.join(sent), random.sample(list(treebank.sents()), negative_sample_size)),
    columns=["excerpt"],
).assign(**{selected_category: False})
corpus = pd.concat(categories_df.values(), ignore_index=True, sort=False)
# BUG FIX: DataFrame.append returned a NEW frame (and is removed in pandas 2.0);
# the original discarded the result, so the treebank negatives were never used.
corpus = pd.concat([corpus, treebank_background], ignore_index=True, sort=False)
corpus.fillna(value='', inplace=True)
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear'))
X, y = corpus.excerpt, corpus[selected_category]
# 5-fold cross-validation as a sanity check before the held-out evaluation below.
cv_results = cross_validate(pipeline, X, y, cv=5)
print(cv_results['test_score'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
# ## Count Vectorizer and Logistic Regression in Pipeline
def display_accuracy_score(y_test, y_pred_class):
    """Compute the accuracy of the predicted classes, print it, and return it."""
    acc = accuracy_score(y_test, y_pred_class)
    print('accuracy score: %s' % '{:.2%}'.format(acc))
    return acc
def display_null_accuracy(y_test):
    """Print and return the null accuracy: the accuracy achieved by always
    predicting the most frequent class in y_test.

    y_test may be any array-like of labels; it must be non-empty.
    """
    # BUG FIX: the top-level pd.value_counts() helper is deprecated and removed
    # in modern pandas — use the Series method instead (identical result).
    value_counts = pd.Series(y_test).value_counts()
    null_accuracy = max(value_counts) / float(len(y_test))
    print('null accuracy: %s' % '{:.2%}'.format(null_accuracy))
    return null_accuracy
def display_accuracy_difference(y_test, y_pred_class):
    """Print how the model's accuracy compares to the null accuracy and
    return the pair (null_accuracy, model_accuracy)."""
    null_accuracy = display_null_accuracy(y_test)
    # BUG FIX (shadowing): the local was named `accuracy_score`, shadowing the
    # sklearn function imported at module level — renamed to avoid confusion.
    model_accuracy = display_accuracy_score(y_test, y_pred_class)
    difference = model_accuracy - null_accuracy
    if difference > 0:
        print('model is %s more accurate than null accuracy' % '{:.2%}'.format(difference))
    elif difference < 0:
        print('model is %s less accurate than null accuracy' % '{:.2%}'.format(abs(difference)))
    else:
        print('model is exactly as accurate as null accuracy')
    return null_accuracy, model_accuracy
# Fit on the training split, evaluate on the held-out split.
pipeline.fit(X_train, y_train)
y_pred_class = pipeline.predict(X_test)
y_pred_vals = pipeline.predict_proba(X_test)
results_df = pd.DataFrame({"x_test": X_test, "y_TF_pred": y_pred_class, "y_actual": y_test})
print('-' * 75 + '\nClassification Report\n')
print(classification_report(y_test, y_pred_class))
display_accuracy_difference(y_test, y_pred_class)
# Default output path mirrors the category name under models/.
if argv.output is not None:
    out_file = argv.output
else:
    out_file = f'models/{selected_category}.sk'
print(f"Saving model to {out_file}")
import pickle
# BUG FIX: the original left the pickle file handle open ('wb+' never closed);
# a context manager guarantees the handle is flushed and closed.
with open(out_file, 'wb') as model_file:
    pickle.dump(pipeline, model_file)