-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathIter_Corpus.py
More file actions
46 lines (32 loc) · 1.12 KB
/
Iter_Corpus.py
File metadata and controls
46 lines (32 loc) · 1.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
import pdb
class Tokenize_Sent(object):
    """Streaming iterable over a whitespace-tokenized corpus file.

    Each iteration yields one sentence as a list of lowercased tokens
    (the line is right-stripped, lowercased, and split on single spaces).
    The file is read line-by-line, so the corpus is never fully loaded
    into memory.
    """

    def __init__(self, filename, corpus_size):
        """Remember the corpus path and determine its size.

        Args:
            filename: path to the tokenized corpus (one sentence per line).
            corpus_size: known number of sentences, or -1 to count them
                by scanning the file once.
        """
        self.filename = filename
        if corpus_size == -1:
            self.size = self.getCorpusSize(filename)
        else:
            self.size = corpus_size

    def getCorpusSize(self, filename):
        """Return the number of lines (sentences) in *filename*.

        Bug fix: the original ignored the *filename* argument and always
        read ``self.filename``; now the argument is honored.
        """
        with open(filename, 'r') as f:
            # sum over a generator counts lines at C speed without a
            # hand-rolled counter loop.
            return sum(1 for _ in f)

    def __iter__(self):
        """Yield each sentence as a list of lowercased tokens."""
        with open(self.filename, 'r') as f:
            for line in f:
                yield line.rstrip().lower().split(' ')
def load_Bulk_Corpus(filename):
    """Load the entire corpus into memory at once.

    Args:
        filename: path to the tokenized corpus (one sentence per line).

    Returns:
        A list of sentences, each a list of lowercased tokens produced by
        right-stripping the line and splitting on single spaces.
    """
    with open(filename, 'r') as f:
        # Comprehension replaces the manual append loop (same tokenization
        # as Tokenize_Sent.__iter__).
        return [line.rstrip().lower().split(' ') for line in f]
if __name__ == '__main__':
    # Smoke test: stream the corpus and print each tokenized sentence.
    corpus_path = '../../data/wordEmbed_data/Tokenized_Sentences.txt'
    gen_sent = Tokenize_Sent(corpus_path, corpus_size=-1)
    for item in gen_sent:
        print(item)
    # NOTE(review): removed leftover `pdb.set_trace()` — it dropped every
    # non-interactive run into the debugger after the loop finished.