Tokenizer.py
import re
import cPickle as pickle
from nltk import stem
from Queue import Queue
import threading
from collections import defaultdict

# Strip @mentions, URLs, and #hashtags from tweets.
tweet_filter = re.compile(r'(@[A-Za-z0-9]+)|(\w+:\/\/\S+)|(#[^ ]+)')
# Collapse runs of repeated characters down to two (e.g. "soooo" -> "soo").
word_normalize = re.compile(r'(\w)\1+')
stop = pickle.load(open("stopwords.p", 'rb'))  # load the pickled stop words
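# NOTE (sketch, not part of the original script): stopwords.p is assumed to
# hold a pickled container of stop words. One way such a file could be
# produced, e.g. from NLTK's stop-word corpus:
#
#   from nltk.corpus import stopwords
#   pickle.dump(set(stopwords.words('english')), open("stopwords.p", 'wb'))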
# Split the data file into lines and insert them into the work queue.
workq = Queue()

def make_Queue(filename):
    with open(filename, 'rb') as f:
        for line in f:
            workq.put(line)
# Extract features from each line and place them on a queue for word counting.
processq = Queue()

class ThreadExtract(threading.Thread):
    def __init__(self, workq, processq, id):
        threading.Thread.__init__(self)
        self.workq = workq
        self.processq = processq
        self.id = id
        self.stemmer = stem.PorterStemmer()  # one stemmer per thread, not one per word

    def run(self):
        while True:
            line = self.workq.get()
            # Normalize, filter, and split the line into a set of unique words.
            words = set(tweet_filter.sub("", word_normalize.sub(r'\1\1', line.lower())).split())
            for word in words:
                stemmed = self.stemmer.stem(word.strip('!,.:;/` '))
                if stemmed not in stop:
                    self.processq.put(stemmed)
            self.workq.task_done()
dic = defaultdict(int)
lock = threading.RLock()

# Count the word frequencies.
class ThreadFreq(threading.Thread):
    def __init__(self, processq, id):
        threading.Thread.__init__(self)
        self.processq = processq
        self.id = id

    def run(self):
        while True:
            word = self.processq.get()  # block outside the lock so other counters are not stalled
            with lock:
                dic[word] += 1
            self.processq.task_done()
for i in range(5):
    t = ThreadExtract(workq, processq, i)
    t.setDaemon(True)
    t.start()

print "Started filling the Queue"
make_Queue("data.txt")
print "Queue full"

for i in range(5):
    tf = ThreadFreq(processq, i)
    tf.setDaemon(True)
    tf.start()

print "Processing data ..."
workq.join()
processq.join()

print "Finished tokenizing. Saving data ..."
print "Only words with frequency greater than 1 are saved!"

# Keep only words that occur more than once, then pickle the result.
posdic = defaultdict(int)
for word in dic:
    if dic[word] > 1:
        posdic[word] = dic[word]
pickle.dump(posdic, open("datafrequency.p", 'wb'))
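# Usage sketch (an assumption, not part of the original script): the saved
# frequency table can be read back and inspected like this:
#
#   freqs = pickle.load(open("datafrequency.p", 'rb'))
#   for word in sorted(freqs, key=freqs.get, reverse=True)[:10]:
#       print word, freqs[word]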