-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathClassifier.py
More file actions
80 lines (69 loc) · 2.7 KB
/
Classifier.py
File metadata and controls
80 lines (69 loc) · 2.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
from sklearn.metrics import metrics
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import zero_one_loss
# Module metadata: release version, license, and author contact details.
__version__ = "1.0"
__license__ = "MIT"
__author__ = "Rushil Goyal"
__author_email__ = "rg761@scarletmail.rutgers.edu"
def fileRead(path):
    """Return the contents of a text file as a list of lines.

    Args:
        path: Location of the file to read.

    Returns:
        A list with one string per line. Each string keeps its line
        terminator, exactly as produced by ``readlines``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when reading raises,
    # which a separate open()/close() pair does not guarantee.
    with open(path, mode='r') as f:
        return f.readlines()
# Fetches the news articles named in the list and maps each one to its labels
def extractData(dataList, txtList, txtLabels, labelList, fpath):
    """Load every article named in the index and record its label ids.

    Each entry of ``dataList`` is one comma-separated row: the first field
    is a file name that is appended to ``fpath``, and the remaining fields
    are the labels of that article. The output lists are filled in place.

    Args:
        dataList: Iterable of index rows (strings).
        txtList: List that receives the text of each article, built by
            joining the lines of its file with single spaces.
        txtLabels: List that receives, per article, a list of integer
            positions into ``labelList``.
        labelList: List of known label names; a label seen for the first
            time is appended to it.
        fpath: Prefix prepended to each file name.

    Returns:
        None. Results are appended to ``txtList``, ``txtLabels`` and
        ``labelList``.
    """
    for row in dataList:
        # Strip every field: the last field of a row still carries the line
        # terminator, and without stripping "x\n" and "x" would be registered
        # as two different labels (or produce an unusable file name).
        fields = [field.strip() for field in row.split(',')]
        fileName = fields[0]
        if not fileName:
            # Blank row in the index: there is no article to load.
            continue
        text = " ".join(fileRead(fpath + fileName))
        labelIds = []
        for label in fields[1:]:
            if not label:
                # Empty field, e.g. from trailing commas padding the row.
                continue
            if label not in labelList:
                labelList.append(label)
            labelIds.append(labelList.index(label))
        txtLabels.append(labelIds)
        txtList.append(text)
# Trains a Multi-Label classifier and generates a report on its predictions
def Classify(txtList, txtLabels, fileName, labelList, trainSize=300):
    """Train a multi-label text classifier and write an evaluation report.

    The first ``trainSize`` documents are used for training and all
    remaining documents for testing. The model is a pipeline of
    ``CountVectorizer``, ``TfidfTransformer`` and a one-vs-rest
    ``LinearSVC``.

    Args:
        txtList: Document texts.
        txtLabels: Per-document label ids, parallel to ``txtList``.
        fileName: Path of the report file; it is overwritten.
        labelList: Label names, passed as ``target_names`` to the
            classification report.
        trainSize: Number of leading documents used for training.
            Defaults to 300.

    Returns:
        None. The report is written to ``fileName``.
    """
    # Split at a single index so that no document falls between the two
    # sets (slicing [0:300] and [301:] would silently drop document 300).
    x_train = np.array(txtList[:trainSize])
    y_train = np.array(txtLabels[:trainSize])
    x_test = np.array(txtList[trainSize:])
    y_test = np.array(txtLabels[trainSize:])
    # NOTE(review): labels are passed as per-document lists of label ids;
    # recent scikit-learn releases may expect a binary indicator matrix
    # (see MultiLabelBinarizer) -- confirm against the installed version.

    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC()))])
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)

    # Compute every metric before opening the file, so a failing metric
    # cannot leave a half-written report behind.
    report = [
        metrics.classification_report(y_test, predicted, target_names=labelList),
        '\nNumber of Labels:' + str(len(labelList)),
        '\nhamming loss : ' + str(metrics.hamming_loss(y_test, predicted)),
        '\nf-beta(beta=0.5 - biased towards Precision) : '
        + str(metrics.fbeta_score(y_test, predicted, beta=0.5)),
        '\nzero-loss:' + str(zero_one_loss(y_test, predicted)),
        '\nAccuracy score:' + str(metrics.accuracy_score(y_test, predicted)),
    ]
    with open(fileName, 'w') as f:
        f.write("".join(report))
def main():
    """Read the article index, load the articles, then train and report."""
    # Prefix of the stored article files
    articleDir = "../DataDump/"
    # CSV file listing the news articles and their labels
    indexFile = "../DataList/All_890.csv"
    # File name and path of the results
    reportFile = 'results.txt'

    # Output containers filled in place by extractData
    labels, texts, textLabels = [], [], []

    indexRows = fileRead(indexFile)
    extractData(indexRows, texts, textLabels, labels, articleDir)
    Classify(texts, textLabels, reportFile, labels)
if __name__ == "__main