-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreProcess.py
More file actions
executable file
·32 lines (22 loc) · 872 Bytes
/
preProcess.py
File metadata and controls
executable file
·32 lines (22 loc) · 872 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
import re
from string import printable
import pyaspell
from config import stop_words
# Whitelist used by remove_all_non_printable: everything in string.printable
# except newline, tab, carriage return, semicolon and both quote characters.
printable_characters = set(printable) - {"\n", "\t", "\r", ";", '"', "'"}
# Module-level Aspell instance created with the ("lang", "en") option;
# spellcheck() uses it to check words and fetch suggestions.
spellchecker = pyaspell.Aspell(("lang", "en"))
def remove_all_non_printable(text):
    """Return ``text`` with every character not in ``printable_characters`` removed.

    Characters are dropped outright (nothing is substituted for them), and
    the relative order of the remaining characters is kept.
    """
    allowed = printable_characters
    return "".join(char for char in text if char in allowed)
def remove_all_non_characters(text):
    """Replace every character that is not an ASCII letter or whitespace with a space.

    The substitution is one-for-one, so the result has the same length as
    ``text``; digits, punctuation and non-ASCII characters each become a
    single space, while existing whitespace is left as it is.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r"[^a-zA-Z\s]", " ", text)
def spellcheck(text):
    """Return ``text`` with each misspelled word replaced by its first suggestion.

    A "word" is a maximal run of non-whitespace characters. Every word is
    passed to ``spellchecker.check``; when the check fails and
    ``spellchecker.suggest`` returns at least one candidate, the word is
    replaced by the first candidate. Words with no suggestions are left
    unchanged, and all whitespace between words is preserved.
    """
    # Cache per call so a word that occurs many times is only looked up once.
    corrections = {}

    def _correct(match):
        word = match.group(0)
        if word not in corrections:
            fixed = word
            if not spellchecker.check(word):
                suggestions = spellchecker.suggest(word)
                if suggestions:
                    fixed = suggestions[0]
            corrections[word] = fixed
        return corrections[word]

    # Substitute token by token instead of calling str.replace on the whole
    # text: a plain substring replace would also rewrite longer words that
    # merely contain the misspelled token, and could re-edit text that an
    # earlier correction had just inserted.
    return re.sub(r"\S+", _correct, text)
def remove_multispaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Any whitespace character counts (spaces, tabs, newlines, ...), and even
    a lone non-space whitespace character is turned into a space. Leading
    and trailing runs are collapsed to one space, not stripped.
    """
    # Raw string avoids the invalid "\s" escape warning of a normal literal;
    # r"\s+" matches exactly what the former "[\s]+" class did.
    return re.sub(r"\s+", " ", text)
def remove_stopwords(text):
    """Return ``text`` without the tokens that appear in ``stop_words``.

    ``text`` is split on whitespace, tokens found in ``stop_words`` are
    discarded (exact, case-sensitive match), and the remaining tokens are
    rejoined with single spaces, so the original spacing is not preserved.
    """
    kept_tokens = (token for token in text.split() if token not in stop_words)
    return " ".join(kept_tokens)