-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassifier.py
More file actions
executable file
·143 lines (101 loc) · 3.25 KB
/
classifier.py
File metadata and controls
executable file
·143 lines (101 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/python
""" This classifier program will use the spamicity of each token of the message
and then determine the classification of the email as spam/ham """
""" Usage : Enter the file in the command line argument """
import json
import sys
import email
from BeautifulSoup import BeautifulSoup
import re
import operator #sorting dict accoring to the spamicity
def fun(x):
    """Return the complement probability 1 - x of a spamicity value."""
    complement = 1.0 - x
    return complement
def main():
spamicity= {}
f = open(sys.argv[1])
msg = parser.parse(f)
f.close()
text = msg.get_payload()
try:
text = text.lower()
except AttributeError:
for check in msg.get_payload():
q = check['Content-Type']
if check['Content-Transfer-Encoding']=='base64':
print 'This program is not trained with base64 encoding . Hence it cannot classify'
sys.exit(0)
text = ' '
for ex in msg.get_payload():
try:
text = text+ex.get_payload()
except TypeError:
print 'This program is facing difficulty in reading this mail....Terminating'
sys.exit(0)
text = text.lower()
try:
tokens = ' '.join(BeautifulSoup(text).text.split())
# temp_special = ' '.join(reg2.findall(tokens)) #extracting special characters
# special_char = temp_special.split()
tokens = reg.findall(tokens) #+ special_char
except TypeError:
print 'SomeThing got messed up during regex'
print ' Terminating .... '
sys.exit(0)
for words in tokens:
if words not in ham_prob and words not in spam_prob:
spamicity[words]=0.4
continue
if words not in combined_spam and ham[words]<=5:
spamicity[words]=0.4
continue
if words not in ham and combined_spam[words]<=5:
spamicity[words]=0.4
continue
if words in ham and words in combined_spam:
if ham[words]<=5 and combined_spam[words]<=5:
spamicity[words]= 0.4
continue
if words not in spam_prob:
spam_prob[words]=0.0
if words not in ham_prob:
ham_prob[words]=0.0
spamicity[words] = spam_prob[words]/(spam_prob[words]+ham_prob[words])
for i,j in spamicity.iteritems():
print i,':',j
print 'Done tokenzing and determing the spamicity of tokens'
best_tokens = []
for i in spamicity.itervalues():
best_tokens.append(i)
best_tokens = sorted(best_tokens,reverse = True)[:-10] #taking the top 15 spamicity words
not_best = map(fun,best_tokens) #gettin a list of [1-x1,1-x2,1-x3,...]
print ' Best ' + str(best_tokens)
print 'Not Best ' + str(not_best)
num = reduce(lambda x, y:x*y,best_tokens)
den = reduce(lambda x, y:x*y,best_tokens) + reduce(lambda x,y:x*y,not_best)
result = num/den
if result>0.5:
print 'Its a Spam'
else:
print 'its a Ham'
print 'The ans is result of {0}/{1} = {2}'.format(num,den,result)
if __name__ == '__main__':
    # Shared parser and tokenizer objects read by main().
    parser = email.Parser.Parser()
    reg = re.compile('[a-z]+', re.IGNORECASE)
    reg2 = re.compile(r'\W+')  # kept for the (commented-out) special-character pass

    def _load_json(path):
        # Load one JSON-encoded training table; 'with' guarantees the
        # handle is closed even if json.load raises.
        with open(path) as fh:
            return json.load(fh)

    spam_prob = _load_json('spam_token_contribution.db')  # per-token spam contribution
    ham_prob = _load_json('ham_token_contribution.db')    # per-token ham contribution
    ham = _load_json('ham_dict.txt')                      # raw ham token counts
    spam2 = _load_json('more_spam_dict.txt')              # extra spam token counts
    spam = _load_json('spam_dict.txt')                    # raw spam token counts
    # Merge both spam count tables (spam2 entries win on key collision,
    # matching the original items()-concatenation order).
    combined_spam = dict(spam.items() + spam2.items())
    main()