-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtokenizer.py
More file actions
executable file
·31 lines (23 loc) · 813 Bytes
/
tokenizer.py
File metadata and controls
executable file
·31 lines (23 loc) · 813 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
from nltk.tokenize.treebank import TreebankWordTokenizer
class TreebankSpanTokenizer(TreebankWordTokenizer):
    """Treebank word tokenizer that can also report character offsets.

    Tokenization is delegated to a wrapped ``TreebankWordTokenizer``; this
    class adds the mapping from each token back to its ``(start, end)``
    position in the original text.
    """

    # The Treebank tokenizer rewrites a straight double quote as `` (opening)
    # or '' (closing), so a quote token may appear in the source text in any
    # of these forms and cannot be located by a literal search alone.
    _QUOTE_FORMS = ('"', "``", "''")

    def __init__(self):
        super(TreebankSpanTokenizer, self).__init__()
        self._word_tokenizer = TreebankWordTokenizer()

    @classmethod
    def _align(cls, tokens, text):
        """Yield the ``(start, end)`` offsets in *text* of each token.

        Tokens are matched left to right, each search starting where the
        previous token ended.  Quote tokens are matched against the nearest
        quote form present in the text, because the tokenizer may have
        rewritten the original character(s).

        Raises:
            ValueError: if a token cannot be located in the remaining text
                (instead of silently yielding a bogus negative offset).
        """
        cursor = 0
        for token in tokens:
            if token in cls._QUOTE_FORMS:
                # Take whichever quote spelling occurs first after the
                # cursor; its length in the text may differ from the token's.
                candidates = [
                    (text.find(form, cursor), len(form))
                    for form in cls._QUOTE_FORMS
                ]
                found = [hit for hit in candidates if hit[0] != -1]
                start, length = min(found) if found else (-1, 0)
            else:
                start, length = text.find(token, cursor), len(token)
            if start == -1:
                raise ValueError(
                    "token {0!r} not found in text at or after offset {1}"
                    .format(token, cursor)
                )
            cursor = start + length
            yield (start, cursor)

    def span_tokenize(self, text):
        """Lazily yield ``(start, end)`` character offsets for each token."""
        for span in self._align(self.tokenize(text), text):
            yield span

    def tokenize(self, text, withSpans=False):
        """Split *text* into Treebank tokens.

        Args:
            text: the string to tokenize.
            withSpans: when true, pair every token with its offsets.

        Returns:
            A list of token strings, or, when *withSpans* is true, a list of
            ``(token, (start, end))`` pairs.  The token in each pair is the
            tokenizer's output, which for double quotes may differ from the
            characters found at that span in *text*.
        """
        tokens = self._word_tokenizer.tokenize(text)
        if not withSpans:
            return tokens
        # Materialize so the result is reusable and consistent with the
        # plain-token branch (zip is a one-shot iterator on Python 3).
        return list(zip(tokens, self._align(tokens, text)))