
Commit 2919e71

Update search_engine2.py
1 parent 925946c commit 2919e71

File tree

1 file changed: +84 -75 lines changed


search_engine/search_engine.py

Lines changed: 84 additions & 75 deletions
@@ -1,90 +1,99 @@
 import re
-from math import log2
+import math
 
 
-def search(documents, query):
-    inverted_index = get_inverted_index(documents)
-    result_with_relevance = {}
-    query_tokens = tokenize(query)
-    for query_token in query_tokens:
-        documents_has_token = inverted_index.get(query_token)
-        if not documents_has_token:
-            continue
-        for document in documents_has_token:
-            result_with_relevance.setdefault(document['id'], 0)
-            token_tf_idf = get_tf_idf(document['id'], documents_has_token)
-            result_with_relevance[document['id']] += token_tf_idf
-    result = sorted(
-        result_with_relevance,
-        key=result_with_relevance.get,
-        reverse=True)
-    return result
+def compare_by_if_idf(item1, item2):
+    if item1['TFIDF'] < item2['TFIDF']:
+        return -1
+    elif item1['TFIDF'] == item2['TFIDF']:
+        return 0
+    else:
+        return 1
 
 
-def get_inverted_index(documents):
-    inverted_index = {}
-    tokens_all = set()
-    documents_as_tokens = []
-    for document in documents:
-        document_tokens = tokenize(document['text'])
-        current_document_tokenized = {
-            'id': document['id'],
-            'tokens': document_tokens
-        }
-        documents_as_tokens.append(current_document_tokenized)
-        tokens_all.update(document_tokens)
-    for token in tokens_all:
-        inverted_index[token] = []
-        idf = get_idf(documents_as_tokens, token)
-        for document in documents_as_tokens:
-            if token in document['tokens']:
-                tf = get_tf(document['tokens'], token)
-                current_document_with_relevance = {
-                    'id': document['id'],
-                    'tf-idf': round(tf * idf, 4)
-                }
-                inverted_index[token].append(current_document_with_relevance)
-    return inverted_index
+def quickSort(items, comparator, direction='asc'):
+    items_length = len(items)
 
+    if items_length == 0:
+        return []
 
-def tokenize(text):
-    tokens = []
-    text_lines = text.split('\n')
-    for text_line in text_lines:
-        text_line_tokenized = [
-            get_term(token)
-            for token in text_line.split(' ') if token
-        ]
-        tokens.extend(text_line_tokenized)
-    return tokens
+    index = items_length // 2
+    element = items[index]
 
+    smaller_items = [
+        items[i]
+        for i in range(items_length)
+        if comparator(items[i], element) < 0 and i != index
+    ]
 
-def get_term(token):
-    return re.sub(r'[^\w\s]', '', token).lower()
+    bigger_items = [
+        items[i]
+        for i in range(items_length)
+        if comparator(items[i], element) >= 0 and i != index
+    ]
 
+    sorted_smaller_items = quickSort(smaller_items, comparator, direction)
+    sorted_bigger_items = quickSort(bigger_items, comparator, direction)
 
-def get_tf_idf(document_id, documents_has_token):
-    filter_document_has_token = filter(
-        lambda document: document['id'] == document_id, documents_has_token
-    )
-    document_has_token = list(filter_document_has_token)[0]
-    tf_idf = document_has_token['tf-idf']
-    return tf_idf
+    if direction == 'asc':
+        return [*sorted_smaller_items, element, *sorted_bigger_items]
+    return [*sorted_bigger_items, element, *sorted_smaller_items]
 
 
-def get_tf(document_as_tokens, token):
-    document_tokens_count = len(document_as_tokens)
-    token_in_document_count = document_as_tokens.count(token)
-    tf = token_in_document_count / document_tokens_count
-    return tf
+def get_index(docs):
+    index = {}
+    docs_count = len(docs)
+    for doc in docs:
+        temp_dict = {}
+        number_words = 0
+        for token in doc['text'].split():
+            term = re.findall(r'\w+', token)
+            index_key = ''.join(term).lower()
+            if index_key not in temp_dict:
+                temp_dict[index_key] = 0
+            temp_dict[index_key] += 1
+            number_words += 1
+        for key, TF in temp_dict.items():
+            if key not in index:
+                index[key] = [{'id': doc['id'], 'TF': TF / number_words}]
+            else:
+                index[key].append({'id': doc['id'], 'TF': TF / number_words})
+    for key, list_doc in index.items():
+        docs_with_term = len(list_doc)
+        # IDF = math.log10( docs_count / docs_with_term )
+        # Math.log2(1 + (docsCount - termCount + 1) / (termCount + 0.5));
+        # docsCount is the total number of documents
+        # termCount is the number of documents containing the search term
+        # This is a somewhat "smoothed" variant of the basic formula
+        # the linter requires satisfying W503 and W504 at the same time
+        part_idf = (docs_count - docs_with_term + 1) / (docs_with_term + 0.5)
+        IDF = math.log2(1 + part_idf)
+        for doc in list_doc:
+            doc['TFIDF'] = doc['TF'] * IDF
+    return index
 
 
-def get_idf(documents_as_tokens, token):
-    documents_count = len(documents_as_tokens)
-    filter_documents_has_token = filter(
-        lambda document: token in document['tokens'], documents_as_tokens
-    )
-    documents_has_token = list(filter_documents_has_token)
-    documents_has_token_count = len(documents_has_token)
-    idf = log2((documents_count + 1) / (documents_has_token_count + 0.5))
-    return idf
+def search(docs: dict, search_pattern: str):
+    keys = []
+    index = get_index(docs)
+    search_results = {}
+    search_worlds = search_pattern.split()
+    for search_world in search_worlds:
+        term = re.findall(r'\w+', search_world)
+        search_world_lower = ''.join(term).lower()
+        if search_world_lower in index:
+            for doc in index[search_world_lower]:
+                if doc['id'] not in search_results:
+                    search_results[doc['id']] = doc['TFIDF']
+                else:
+                    search_results[doc['id']] += doc['TFIDF']
+    search_results_list = []
+    for doc, TFIDF in search_results.items():
+        search_results_list.append({'id': doc, 'TFIDF': TFIDF})
+    search_results = quickSort(search_results_list,
+                               compare_by_if_idf,
+                               'desc')
+    print(search_results)
+    for result in search_results:
+        keys.append(result['id'])
+    return keys
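
For reference, here is a minimal usage sketch of the rewritten module. It is not part of the commit: the import path is assumed from the file location search_engine/search_engine.py, and the document ids and texts are made up; the only shape taken from the diff is that each document is a dict with 'id' and 'text' keys. The new search splits the query on whitespace, strips non-word characters from each word, sums TF-IDF per matching document, and returns the ids ordered by descending relevance via quickSort. With the sample data below, 'hello' occurs in 2 of the 3 documents, so its smoothed IDF is math.log2(1 + (3 - 2 + 1) / (2 + 0.5)) = log2(1.8), roughly 0.85.

from search_engine.search_engine import search  # assumed import path

# Hypothetical documents; only the {'id': ..., 'text': ...} shape comes from the diff.
docs = [
    {'id': 'doc1', 'text': 'I cannot say hello to the world'},
    {'id': 'doc2', 'text': 'Hello, hello world!'},
    {'id': 'doc3', 'text': 'world of warcraft'},
]

# Ids come back sorted by summed TF-IDF, most relevant first
# (the leftover print() inside search also echoes the intermediate list).
print(search(docs, 'hello world!'))  # ['doc2', 'doc1', 'doc3']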

0 commit comments
