forked from ahmedshabib/twit-miner
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathInitialConfiguration.py
More file actions
104 lines (95 loc) · 4.23 KB
/
Copy pathInitialConfiguration.py
File metadata and controls
104 lines (95 loc) · 4.23 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from TweetsClassifier import Classifier
from TweetsValidator import Validator
from TweetsProcessor import TweetProcessor
import json
# This script should be executed once, during the initial configuration / start-up.
def main():
    """Run the one-off bootstrap: seed the training sets, then refine them.

    After seeding, the training sets are classified once and then refined in
    three passes. Each pass handles the training, validation and test files
    in turn: both category training sets are cleaned, the classifier is
    rebuilt, and the validator writes its output file for that input.

    ``a``, ``b`` and ``c`` are the multipliers that ``cleanUpTrainingSet``
    applies to hashtags, mentions and plain words respectively; the same
    values are handed to ``Validator.validateToFile``.
    """
    # Has to be executed only once: creates the per-category training sets.
    addTrainingToTweets()

    a, b, c = 7.5, 5.8, 1.3
    print("Please Be patient while we make things ready will take around 5 mins\nGrab a coffee till then")

    Classifier().categorisetweets()

    # (input file, validator output file) handled in this order on every pass.
    file_pairs = (
        (r'training.txt', 'Outputtraining.txt'),
        (r'validation.txt', 'Outputvalidation.txt'),
        (r'test.txt', 'OutputTest.txt'),
    )

    # NOTE(review): the loop extent is inferred from the progress print using
    # ``i``; confirm against the original indentation.
    for i in range(3):
        for source_name, output_name in file_pairs:
            for category in ("sports", "politics"):
                cleanUpTrainingSet(a, b, c, category)
            Classifier().categorisetweets()
            Validator().validateToFile(source_name, output_name, a, b, c)
        # Floor division keeps the whole-number percentages (33, 66, 100).
        print(str((i + 1) * 100 // 3) + "% complete")

    print("Awesome you are done.\nNow you can run RunTwitMiner.py as many time you want :) \nNote the input file for the program should be in test.txt and you will get the output in OutputTest.txt")
def _load_json(path):
    """Return the parsed JSON content of *path*, closing the file afterwards."""
    with open(path) as handle:
        return json.load(handle)


def _category_scores(words, tables, a, b, c):
    """Return ``(sports_score, politics_score)`` for the tokens of one tweet.

    *tables* maps ``"hashtags"``, ``"mentions"`` and ``"words"`` to a
    ``(sports_table, politics_table)`` pair of dicts. Each token adds its
    per-category share (with +1.0 smoothing on both sides) multiplied by
    ``a`` for hashtags, ``b`` for mentions and ``c`` for any other word.
    """
    sports_total = 0.0
    politics_total = 0.0
    for word in words:
        if word == '':
            continue
        single_token = len(word.split()) < 2
        if word[0] == '#' and single_token:
            multiplier = a
            sports_table, politics_table = tables["hashtags"]
        elif word[0] == '@' and single_token:
            multiplier = b
            sports_table, politics_table = tables["mentions"]
        else:
            multiplier = c
            sports_table, politics_table = tables["words"]

        sport_weight = sports_table.get(word, 0.0) + 1.0
        politic_weight = politics_table.get(word, 0.0) + 1.0
        combined = sport_weight + politic_weight
        # The original guard compared sportwordweight with 0 twice; what the
        # division below actually needs is a non-zero denominator.
        if combined == 0:
            continue
        sports_total += multiplier * (sport_weight / combined)
        politics_total += multiplier * (politic_weight / combined)
    return sports_total, politics_total


def cleanUpTrainingSet(a,b,c,cat):
    """Remove tweets from one training set that score higher for the other category.

    Reads the JSON weight tables ``sportswords``, ``politicswords``,
    ``sportshashtags``, ``politicshashtags``, ``sportsmentions`` and
    ``politicsmentions`` plus the tweet list ``<cat>trainingset`` from the
    current working directory, scores every tweet for both categories, and
    rewrites ``<cat>trainingset`` with only the tweets that were kept.

    Args:
        a: Multiplier for the share contributed by each hashtag token.
        b: Multiplier for the share contributed by each mention token.
        c: Multiplier for the share contributed by every other word.
        cat: ``"sports"`` or ``"politics"``. Any other value filters nothing,
            but the file is still rewritten.
    """
    tweetprocessor = TweetProcessor()
    tables = {
        "words": (_load_json("sportswords"), _load_json("politicswords")),
        "hashtags": (_load_json("sportshashtags"), _load_json("politicshashtags")),
        "mentions": (_load_json("sportsmentions"), _load_json("politicsmentions")),
    }
    training_path = cat + "trainingset"
    tweets = _load_json(training_path)

    # Collect survivors in a new list. Removing from ``tweets`` while
    # iterating over it made the loop skip the tweet after every removal.
    kept = []
    for actualtweet in tweets:
        tweet = tweetprocessor.processTweet(actualtweet)
        words = tweetprocessor.getwords(tweet)
        sports_score, politics_score = _category_scores(words, tables, a, b, c)
        if cat == "politics" and sports_score > politics_score:
            continue
        if cat == "sports" and sports_score < politics_score:
            continue
        kept.append(actualtweet)

    # JSON is text, so the file is opened in text mode; indent=1 produces the
    # same layout as the previous indent=True.
    with open(training_path, 'w') as handle:
        json.dump(kept, handle, indent=1)
def addTrainingToTweets():
    """Create the per-category training sets from ``training.txt``.

    Every line whose second whitespace-separated token is ``Sports`` or
    ``Politics`` contributes its tweet text, passed through
    ``TweetProcessor.processTweet``, to ``sportstrainingset`` or
    ``politicstrainingset`` respectively (JSON lists written to the current
    working directory). Blank lines, single-token lines and lines with any
    other label are skipped.
    """
    with open(r'training.txt', 'r') as handle:
        trainingset = handle.read().splitlines()

    raw_texts = {'Sports': [], 'Politics': []}
    for line in trainingset:
        fields = line.split()
        # Blank or single-token lines have no label; indexing fields[1]
        # unguarded used to raise IndexError here.
        if len(fields) < 2:
            continue
        label = fields[1]
        if label not in raw_texts:
            continue
        # The text starts after the first space, the label, the space after
        # it and one more character, and the line's last character is
        # dropped. This gives the former hard-coded offsets: 9 for Sports,
        # 11 for Politics.
        # NOTE(review): the two dropped characters are presumably quotes
        # wrapping the tweet; confirm against training.txt.
        text_start = line.find(" ") + len(label) + 3
        raw_texts[label].append(line[text_start:-1])

    # Process all sports tweets first, then all politics tweets, which is the
    # order in which the original two loops called the processor.
    tweetprocessor = TweetProcessor()
    sportstweets = [tweetprocessor.processTweet(text) for text in raw_texts['Sports']]
    politicstweets = [tweetprocessor.processTweet(text) for text in raw_texts['Politics']]

    # JSON is text, so the files are opened in text mode; indent=1 produces
    # the same layout as the previous indent=True.
    with open("sportstrainingset", 'w') as handle:
        json.dump(sportstweets, handle, indent=1)
    with open("politicstrainingset", 'w') as handle:
        json.dump(politicstweets, handle, indent=1)
# Run the bootstrap only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()