-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathexample_tokenizer.py
More file actions
41 lines (34 loc) · 1.43 KB
/
example_tokenizer.py
File metadata and controls
41 lines (34 loc) · 1.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import generic_token as tkn
import re
class SpamTokenizer(object):
    """Turn the lines of an email body into a list of unique tokens.

    The input is a list of strings (one per line of the email body).
    Each line is split on whitespace, every word is cleaned and filtered by
    ``parse_pre_tokens``, and each surviving word is wrapped in a
    ``tkn.Token`` exactly once.
    """

    # Punctuation trimmed from both ends of every word.  Both parentheses
    # are listed so that "(word)" is cleaned on either side; the backslash
    # is escaped explicitly instead of relying on an invalid "\/" escape.
    _STRIP_CHARS = "'\"()<>*%$#@!:?,.\\/"

    # Any word containing "spam" (case-insensitive) is discarded.
    _SPAM_RE = re.compile('spam', re.I)

    # A usable word must contain a run of at least three ASCII letters.
    _WORD_RE = re.compile("[a-zA-Z]{3,}")

    # Cleaned words must be strictly shorter than this many characters.
    _MAX_TOKEN_LEN = 13

    def __init__(self, items):
        """Store the lines to tokenize and start with an empty token list.

        Accepts: a list of email 'body' lines (strings).
        """
        self.items = items
        self.tokens = []
        # Cleaned strings that already have a Token in ``self.tokens``.
        # Kept separately because Token objects are not guaranteed to
        # compare equal to plain strings.
        self._seen_symbols = set()

    def parse_pre_tokens(self, symbol):
        """Clean one whitespace-delimited word and decide whether to keep it.

        A word is rejected (``None`` is returned) when it:
          * contains '=' or '><' (html cruft),
          * contains 'spam' in any letter case,
          * has no run of three or more ASCII letters after cleaning, or
          * is 13 characters or longer after cleaning.

        Accepts: a single word (string).
        Returns: the word with surrounding punctuation removed, or None.
        """
        if ('=' in symbol) or ('><' in symbol):
            return None
        if self._SPAM_RE.search(symbol):
            return None
        token = symbol.strip(self._STRIP_CHARS)
        if self._WORD_RE.search(token) and len(token) < self._MAX_TOKEN_LEN:
            return token
        return None

    def tokenize(self):
        """Build the token list from ``self.items``.

        Words are processed in input order; each distinct cleaned word is
        appended to ``self.tokens`` as a ``tkn.Token`` only the first time
        it is seen.

        Returns: ``self.tokens``, the list of Token objects.
        """
        for item in self.items:
            for pre_token in item.split():
                symbol = self.parse_pre_tokens(pre_token)
                if not symbol or symbol in self._seen_symbols:
                    continue
                self._seen_symbols.add(symbol)
                self.tokens.append(tkn.Token(symbol))
        return self.tokens