inverted_index.py
from __future__ import unicode_literals
from hazm import Normalizer
from parsivar import FindStems, Tokenizer
import json
from doc_pos import DocPos
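
# Assumed interface of DocPos (inferred from its use in tokenize() below;
# the actual doc_pos module may differ):
#   doc_pos.my_map                    -> dict mapping doc_id to [positions]
#   doc_pos.new_doc_id(doc_id, pos)   -> create the postings entry for doc_id
#   doc_pos.add_position(doc_id, pos) -> append pos to an existing entry
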
class DataPreprocess:
    def __init__(self):
        self.file_path = 'IR_data_news_12k.json'
        self.all_data = {}  # doc id -> {'title', 'content', 'url'}

    def read_data(self):
        """Load the first 200 documents and return (all_data, contents)."""
        contents = []
        num_of_docs = 0
        with open(self.file_path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        for doc_id in data:  # JSON keys are already strings
            if num_of_docs >= 200:
                break
            self.all_data[doc_id] = {'title': data[doc_id]['title'],
                                     'content': data[doc_id]['content'],
                                     'url': data[doc_id]['url'],
                                     }
            contents.append(data[doc_id]['content'])
            num_of_docs += 1
        return self.all_data, contents
    def stemming(self, tokens):
        """Reduce each token to its stem using parsivar's FindStems."""
        my_stemmer = FindStems()
        return [my_stemmer.convert_to_stem(token) for token in tokens]
    def stopwords_removing(self, tokens):
        """Drop tokens that appear in stopwords.txt (one word per line)."""
        with open("stopwords.txt", encoding="utf-8") as f:
            stop_words = set(f.read().splitlines())
        return [token for token in tokens if token not in stop_words]
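
    # Example stopwords.txt layout (hypothetical contents; the real file ships
    # with the project) -- one token per line, UTF-8:
    #   و
    #   در
    #   به
    #   از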
    def tokenize(self, contents):
        """Build a positional inverted index: term -> DocPos postings."""
        my_normalizer = Normalizer()
        my_tokenizer = Tokenizer()
        my_dictionary = {}  # term -> DocPos (doc ids and positions)
        for doc_id, content in enumerate(contents):
            # normalize, tokenize, stem, then remove stop words
            tokens_of_a_sentence = my_tokenizer.tokenize_words(my_normalizer.normalize(content))
            stemmed = self.stemming(tokens_of_a_sentence)
            final_tokens = self.stopwords_removing(stemmed)
            for index_of_a_token, token in enumerate(final_tokens):
                if token in my_dictionary:
                    doc_pos_of_token = my_dictionary[token]
                    if doc_id in doc_pos_of_token.my_map:
                        # term already seen in this document: record another position
                        doc_pos_of_token.add_position(doc_id, index_of_a_token)
                    else:
                        # first occurrence of the term in this document
                        doc_pos_of_token.new_doc_id(doc_id, index_of_a_token)
                else:
                    # first occurrence of the term in the whole collection
                    doc_pos = DocPos()
                    doc_pos.new_doc_id(doc_id, index_of_a_token)
                    my_dictionary[token] = doc_pos
        return my_dictionary
    def delete_stop_words(self):
        # unused placeholder; stop-word removal is handled by stopwords_removing()
        pass
    def execute(self):
        all_data, contents = self.read_data()
        print(all_data)
        main_dictionary = self.tokenize(contents)
        for term in main_dictionary:
            print("")
            print(f'{term}-> {main_dictionary[term].my_map}')

if __name__ == '__main__':
    data_proc = DataPreprocess()
    data_proc.execute()
    print("end")