forked from KnowledgeCaptureAndDiscovery/somef
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtraining.py
More file actions
102 lines (88 loc) · 4.37 KB
/
training.py
File metadata and controls
102 lines (88 loc) · 4.37 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
# training.py
# parameters:
## category: {description, installation, invocation, citation}
## model_name
# output:
## to console: Classification Report
## to SM2KG/models/`model_name`: pickled model
# Constants: the four excerpt categories a binary classifier can be trained for.
categories = ('description', 'installation', 'invocation', 'citation')
import argparse
import random
import pandas as pd
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
argparser = argparse.ArgumentParser(description=f"Train binary classifier for one of categories: {categories}.")
argparser.add_argument('-c', '--category', choices=categories, required=True, help='category in question')
argparser.add_argument('-o', '--output', help='output pickled model')
argv = argparser.parse_args()
selected_category = argv.category

# One CSV of labelled excerpts per category; rows from the selected category
# become positives, a quarter-size sample of each other category becomes negatives.
categories_df = {cat: pd.read_csv(f"./data/{cat}.csv") for cat in categories}
negative_sample_size = int(len(categories_df[selected_category]) / 4)
print(f"Selected Category: {selected_category}")
for category in categories_df:
    # BUG FIX: drop() no longer accepts a positional axis argument (removed in pandas 2.0).
    categories_df[category].drop(columns='URL', inplace=True)
    if category != selected_category:
        categories_df[category] = categories_df[category].sample(negative_sample_size)
    # Binary label column named after the selected category: True only for its own rows.
    categories_df[category] = categories_df[category].assign(**{selected_category: category == selected_category})
    print("{} has {} samples;".format(category, len(categories_df[category])))
# Extra negative background text drawn from the Penn Treebank corpus.
# BUG FIX: the label column was hard-coded to description=False, which produced a
# stray NaN-filled column when training any other category; key it on selected_category.
treebank_background = pd.DataFrame(
    map(lambda sent: ' '.join(sent), random.sample(list(treebank.sents()), negative_sample_size)),
    columns=["excerpt"],
).assign(**{selected_category: False})
corpus = pd.concat(categories_df.values(), ignore_index=True, sort=False)
# BUG FIX: DataFrame.append returned a NEW frame (and is removed in pandas 2.0);
# the original discarded the result, so the treebank negatives were never used.
corpus = pd.concat([corpus, treebank_background], ignore_index=True, sort=False)
corpus.fillna(value='', inplace=True)
pipeline = make_pipeline(TfidfVectorizer(), LogisticRegression(solver='liblinear'))
X, y = corpus.excerpt, corpus[selected_category]
# 5-fold cross-validation as a sanity check before the held-out evaluation below.
cv_results = cross_validate(pipeline, X, y, cv=5)
print(cv_results['test_score'])
X_train, X_test, y_train, y_test = train_test_split(X, y)
# ## Count Vectorizer and Logistic Regression in Pipeline
def display_accuracy_score(y_test, y_pred_class):
    """Compute the accuracy of the predicted classes, print it, and return it."""
    acc = accuracy_score(y_test, y_pred_class)
    print('accuracy score: %s' % '{:.2%}'.format(acc))
    return acc
def display_null_accuracy(y_test):
    """Print and return the null accuracy: the accuracy achieved by always
    predicting the most frequent class in y_test.

    y_test may be any array-like of labels; it must be non-empty.
    """
    # BUG FIX: the top-level pd.value_counts() helper is deprecated and removed
    # in modern pandas — use the Series method instead (identical result).
    value_counts = pd.Series(y_test).value_counts()
    null_accuracy = max(value_counts) / float(len(y_test))
    print('null accuracy: %s' % '{:.2%}'.format(null_accuracy))
    return null_accuracy
def display_accuracy_difference(y_test, y_pred_class):
    """Print how the model's accuracy compares to the null accuracy and
    return the pair (null_accuracy, model_accuracy)."""
    null_accuracy = display_null_accuracy(y_test)
    # BUG FIX (shadowing): the local was named `accuracy_score`, shadowing the
    # sklearn function imported at module level — renamed to avoid confusion.
    model_accuracy = display_accuracy_score(y_test, y_pred_class)
    difference = model_accuracy - null_accuracy
    if difference > 0:
        print('model is %s more accurate than null accuracy' % '{:.2%}'.format(difference))
    elif difference < 0:
        print('model is %s less accurate than null accuracy' % '{:.2%}'.format(abs(difference)))
    else:
        print('model is exactly as accurate as null accuracy')
    return null_accuracy, model_accuracy
# Fit on the training split, evaluate on the held-out split.
pipeline.fit(X_train, y_train)
y_pred_class = pipeline.predict(X_test)
y_pred_vals = pipeline.predict_proba(X_test)
results_df = pd.DataFrame({"x_test": X_test, "y_TF_pred": y_pred_class, "y_actual": y_test})
print('-' * 75 + '\nClassification Report\n')
print(classification_report(y_test, y_pred_class))
display_accuracy_difference(y_test, y_pred_class)
# Default output path mirrors the category name under models/.
if argv.output is not None:
    out_file = argv.output
else:
    out_file = f'models/{selected_category}.sk'
print(f"Saving model to {out_file}")
import pickle
# BUG FIX: the original left the pickle file handle open ('wb+' never closed);
# a context manager guarantees the handle is flushed and closed.
with open(out_file, 'wb') as model_file:
    pickle.dump(pipeline, model_file)