-
Notifications
You must be signed in to change notification settings - Fork 52
/
Copy pathdata_gen.py
35 lines (27 loc) · 1.02 KB
/
data_gen.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
import re
class Corpus(object):
    """Re-iterable stream of ``(text, targets)`` pairs read from two parallel files.

    Line *k* of ``in_file`` is paired with line *k* of ``target_file``;
    iteration stops at the end of the shorter file.  Nothing is opened until
    the object is actually iterated, and every ``iter()`` call re-reads the
    files from the start.

    Args:
        in_file: Path of the input file, one comma-separated example per line.
        target_file: Path of the target file, one comma-separated label list
            per line.  It defaults to ``None`` but must be set to a real path
            before iterating, because it is passed straight to ``open``.
    """

    def __init__(self, in_file,
                 target_file=None):
        self.in_file = in_file
        self.target_file = target_file

    def __iter__(self):
        """Yield ``(text, targets)`` for each pair of lines.

        ``text`` is the input line with every ``'eos'`` substring removed,
        surrounding whitespace stripped, ``'-'`` replaced by a space, and the
        comma-separated fields re-joined with single spaces.  ``targets`` is
        the stripped target line split on ``','``.
        """
        # The context manager closes both handles once the generator is
        # exhausted, closed, or garbage-collected.
        with open(self.in_file) as in_handle, \
                open(self.target_file) as target_handle:
            for line, target_list in zip(in_handle, target_handle):
                # 'eos' is stripped as a raw substring, so it is also removed
                # when it occurs inside a longer token.
                line = line.replace('eos', '')
                fields = line.strip().replace('-', ' ').split(',')
                yield ' '.join(fields), target_list.strip().split(',')
class hierarchicalCorpus(object):
    """Re-iterable stream of ``(sentences, targets)`` pairs from two parallel files.

    Line *k* of ``in_file`` is paired with line *k* of ``target_file``;
    iteration stops at the end of the shorter file.  Each input line is split
    into a list of sentences on the ``'eos'`` marker.  Nothing is opened
    until the object is actually iterated, and every ``iter()`` call re-reads
    the files from the start.

    Args:
        in_file: Path of the input file, one example per line, with
            comma-separated tokens and ``'eos'`` between/after sentences.
        target_file: Path of the target file, one comma-separated label list
            per line.  It defaults to ``None`` but must be set to a real path
            before iterating, because it is passed straight to ``open``.
    """

    def __init__(self, in_file,
                 target_file=None):
        self.in_file = in_file
        self.target_file = target_file

    def __iter__(self):
        """Yield ``(sentences, targets)`` for each pair of lines.

        ``sentences`` is a list of strings: the input line is right-stripped,
        ``'-'`` is replaced by a space, the line is split on ``'eos'``, and
        the comma-separated fields of each segment are re-joined with single
        spaces.  ``targets`` is the stripped target line split on ``','``.
        """
        # The context manager closes both handles once the generator is
        # exhausted, closed, or garbage-collected.
        with open(self.in_file) as in_handle, \
                open(self.target_file) as target_handle:
            for line, target_list in zip(in_handle, target_handle):
                # The segment after the last 'eos' is always dropped.
                # NOTE(review): presumably every line ends with 'eos'; if one
                # does not, its final sentence is silently discarded.
                segments = line.rstrip().replace('-', ' ').split('eos')[:-1]
                sentences = [' '.join(segment.split(','))
                             for segment in segments]
                yield sentences, target_list.strip().split(',')