-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDictionary.py
More file actions
70 lines (57 loc) · 2.02 KB
/
Dictionary.py
File metadata and controls
70 lines (57 loc) · 2.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import time
from bisect import bisect_left
from slangCleaner import SlangCleaner
class Encoder:
    """Map words to integer ids using a sorted word list loaded from a file.

    Id 0 is reserved for padding (slot 0 of the index holds the empty
    string) and ``self.count`` -- one past the last valid index -- is the id
    used for words that are not in the dictionary.

    Lookups use binary search, so the dictionary file must list its words in
    ascending (Python string) order.
    """

    # Id used to fill unused slots of a fixed-length encoding.
    PADDING = 0
    # Default length of the vectors produced by ``encodelist``.
    MAX_LENGTH = 250
    # Words that ``binarysearch`` failed to find. Deliberately class-level:
    # the list is shared by every Encoder instance.
    couldnt_find = []

    def __init__(self, file: str = "resources/dataset/dictionary"):
        """Load the dictionary, one word per line, from *file*.

        Blank lines are skipped rather than treated as end-of-file, so a
        stray empty line cannot silently truncate the dictionary.
        """
        self.cleaner = SlangCleaner()
        start = time.time()
        # Slot 0 is the empty string so that index 0 doubles as PADDING.
        self.index = [""]
        # Invariant: self.count == len(self.index).
        self.count = 1
        with open(file) as dict_file:
            for line in dict_file:
                word = line.strip('\n')
                if not word:
                    continue
                if self.count % 5000 == 0:
                    print("Count at", self.count, "with word", word)
                self.index.append(word)
                self.count += 1
        finish = time.time()
        print("Loaded a dictionary with size:", self.count, "in", finish - start, "seconds")

    def _lookup(self, word: str) -> int:
        """Return the index of *word*, or ``self.count`` if it is absent."""
        pos = bisect_left(self.index, word)
        # bisect_left only yields an insertion point; confirm an exact match.
        if pos < len(self.index) and self.index[pos] == word:
            return pos
        return self.count

    def binarysearch(self, word: str) -> int:
        """Return the index of *word* in the dictionary.

        If the word is missing, a message is printed, the word is appended
        to ``Encoder.couldnt_find`` and ``self.count`` (the unknown id) is
        returned.
        """
        found = self._lookup(word)
        if found == self.count:
            print("Could not find", word)
            Encoder.couldnt_find.append(word)
        return found

    def encodelist(self, word_list: list, max_len: int = MAX_LENGTH) -> list:
        """Encode *word_list* as a list of exactly *max_len* integer ids.

        Each word is lower-cased before lookup. Words not present in the
        dictionary are encoded as ``self.count``. Lists shorter than
        *max_len* are right-padded with ``PADDING``; longer lists are
        truncated to the first *max_len* words.
        """
        encoded = [self.PADDING] * max_len
        for i, word in enumerate(word_list[:max_len]):
            encoded[i] = self._lookup(word.lower())
        return encoded

    def encode(self, text: str) -> list:
        """Clean *text* with the slang cleaner and encode the resulting words."""
        word_list = self.cleaner.clean(text)
        return self.encodelist(word_list)

    def decode(self, encoded: list) -> str:
        """Turn a list of ids back into a space-separated string.

        The unknown id (``self.count``) becomes ``<UNKNOWN>``; every other
        id is replaced by the dictionary entry at that index, so padding
        ids decode to empty strings.
        """
        return " ".join(
            "<UNKNOWN>" if num == self.count else self.index[num]
            for num in encoded
        )