-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclassifier.py
More file actions
executable file
·143 lines (101 loc) · 3.25 KB
/
classifier.py
File metadata and controls
executable file
·143 lines (101 loc) · 3.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/python
""" This classifier program will use the spamicity of each token of the message
and then determine the classification of the email as spam/ham """
""" Usage : Enter the file in the command line argument """
import json
import sys
import email
from BeautifulSoup import BeautifulSoup
import re
import operator #sorting dict accoring to the spamicity
def fun(x):
    """Return the complement probability 1 - x of a spamicity value."""
    complement = 1.0 - x
    return complement
def main():
spamicity= {}
f = open(sys.argv[1])
msg = parser.parse(f)
f.close()
text = msg.get_payload()
try:
text = text.lower()
except AttributeError:
for check in msg.get_payload():
q = check['Content-Type']
if check['Content-Transfer-Encoding']=='base64':
print 'This program is not trained with base64 encoding . Hence it cannot classify'
sys.exit(0)
text = ' '
for ex in msg.get_payload():
try:
text = text+ex.get_payload()
except TypeError:
print 'This program is facing difficulty in reading this mail....Terminating'
sys.exit(0)
text = text.lower()
try:
tokens = ' '.join(BeautifulSoup(text).text.split())
# temp_special = ' '.join(reg2.findall(tokens)) #extracting special characters
# special_char = temp_special.split()
tokens = reg.findall(tokens) #+ special_char
except TypeError:
print 'SomeThing got messed up during regex'
print ' Terminating .... '
sys.exit(0)
for words in tokens:
if words not in ham_prob and words not in spam_prob:
spamicity[words]=0.4
continue
if words not in combined_spam and ham[words]<=5:
spamicity[words]=0.4
continue
if words not in ham and combined_spam[words]<=5:
spamicity[words]=0.4
continue
if words in ham and words in combined_spam:
if ham[words]<=5 and combined_spam[words]<=5:
spamicity[words]= 0.4
continue
if words not in spam_prob:
spam_prob[words]=0.0
if words not in ham_prob:
ham_prob[words]=0.0
spamicity[words] = spam_prob[words]/(spam_prob[words]+ham_prob[words])
for i,j in spamicity.iteritems():
print i,':',j
print 'Done tokenzing and determing the spamicity of tokens'
best_tokens = []
for i in spamicity.itervalues():
best_tokens.append(i)
best_tokens = sorted(best_tokens,reverse = True)[:-10] #taking the top 15 spamicity words
not_best = map(fun,best_tokens) #gettin a list of [1-x1,1-x2,1-x3,...]
print ' Best ' + str(best_tokens)
print 'Not Best ' + str(not_best)
num = reduce(lambda x, y:x*y,best_tokens)
den = reduce(lambda x, y:x*y,best_tokens) + reduce(lambda x,y:x*y,not_best)
result = num/den
if result>0.5:
print 'Its a Spam'
else:
print 'its a Ham'
print 'The ans is result of {0}/{1} = {2}'.format(num,den,result)
if __name__ == '__main__':
    # Shared parser and tokenizer objects read by main().
    parser = email.Parser.Parser()
    reg = re.compile('[a-z]+', re.IGNORECASE)
    reg2 = re.compile(r'\W+')  # kept for the (commented-out) special-character pass

    def _load_json(path):
        # Load one JSON-encoded training table; 'with' guarantees the
        # handle is closed even if json.load raises.
        with open(path) as fh:
            return json.load(fh)

    spam_prob = _load_json('spam_token_contribution.db')  # per-token spam contribution
    ham_prob = _load_json('ham_token_contribution.db')    # per-token ham contribution
    ham = _load_json('ham_dict.txt')                      # raw ham token counts
    spam2 = _load_json('more_spam_dict.txt')              # extra spam token counts
    spam = _load_json('spam_dict.txt')                    # raw spam token counts
    # Merge both spam count tables (spam2 entries win on key collision,
    # matching the original items()-concatenation order).
    combined_spam = dict(spam.items() + spam2.items())
    main()