
Commit 925946c

Update search_engine.py
1 parent d4ac606 commit 925946c

1 file changed: +83 -97 lines changed

search_engine/search_engine.py

Lines changed: 83 additions & 97 deletions
@@ -1,104 +1,90 @@
 import re
-from collections import defaultdict, Counter
-from itertools import chain
-from math import log
-
-def preprocess(text):
-    return re.sub(r'[^\w\s]|(?<!\w)_', '', text).lower()
-
-
-def search(docs: list, words: str) -> list:
-    """
-    Searches for words in the documents and returns their IDs,
-    ranked by the total count of word occurrences in the document text.
-    """
-    words = preprocess(words).split()
-    result = []
-
-    for doc in docs:
-        cleaned_text = preprocess(doc['text'])
-        word_count = sum(Counter(cleaned_text.split())[word] for word in words)
-        if word_count:
-            result.append({'id': doc['id'], 'word_count': word_count})
-
-    result.sort(key=lambda x: x['word_count'], reverse=True)
-
-    return [item['id'] for item in result]
-
-
-def get_inverted_index(docs: list) -> dict:
-    """
-    Builds an inverted index from a list of documents.
-    """
-    inverted_index = defaultdict(list)
-
-    # Create an index entry for each word
-    for doc in docs:
-        words = preprocess(doc['text']).split()
-        word_counts = Counter(words)
-
-        for word, count in word_counts.items():
-            if word not in inverted_index:
-                inverted_index[word] = [doc['id']]
-            else:
-                inverted_index[word].append(doc['id'])
-
-    return dict(inverted_index)
-
-
-def get_tf(text: str) -> dict:
-    """ Compute TF """
-    cleaned_text = preprocess(text).split()
-    total_words = len(cleaned_text)
-    tf = defaultdict(float)
-
-    for word in cleaned_text:
-        tf[word] += 1
-
-    for word in tf:
-        tf[word] /= total_words
-
-    return tf
-
-
-def get_idf(docs):
-    """ Compute IDF """
-    N = len(docs)
-    idf = {}
-
-    for doc in docs:
-        words_in_doc = preprocess(doc['text']).split()
-        for word in words_in_doc:
-            q_of_docs_with_word = len(search(docs, word))
-            idf[word] = log(N / (1 + q_of_docs_with_word)) + 1
-
-    return idf
-
-
-def get_tf_idf(docs):
-    """ Compute TF-IDF """
-    idf = get_idf(docs)
-    tf_idf = {}
-
-    for doc in docs:
-        doc_id = doc['id']
-        tf = get_tf(doc['text'])
-        tf_idf[doc_id] = {
-            word: round(tf[word] * idf[word], 4)
-            for word in tf
+from math import log2
+
+
+def search(documents, query):
+    inverted_index = get_inverted_index(documents)
+    result_with_relevance = {}
+    query_tokens = tokenize(query)
+    for query_token in query_tokens:
+        documents_has_token = inverted_index.get(query_token)
+        if not documents_has_token:
+            continue
+        for document in documents_has_token:
+            result_with_relevance.setdefault(document['id'], 0)
+            token_tf_idf = get_tf_idf(document['id'], documents_has_token)
+            result_with_relevance[document['id']] += token_tf_idf
+    result = sorted(
+        result_with_relevance,
+        key=result_with_relevance.get,
+        reverse=True)
+    return result
+
+
+def get_inverted_index(documents):
+    inverted_index = {}
+    tokens_all = set()
+    documents_as_tokens = []
+    for document in documents:
+        document_tokens = tokenize(document['text'])
+        current_document_tokenized = {
+            'id': document['id'],
+            'tokens': document_tokens
         }
-
+        documents_as_tokens.append(current_document_tokenized)
+        tokens_all.update(document_tokens)
+    for token in tokens_all:
+        inverted_index[token] = []
+        idf = get_idf(documents_as_tokens, token)
+        for document in documents_as_tokens:
+            if token in document['tokens']:
+                tf = get_tf(document['tokens'], token)
+                current_document_with_relevance = {
+                    'id': document['id'],
+                    'tf-idf': round(tf * idf, 4)
+                }
+                inverted_index[token].append(current_document_with_relevance)
+    return inverted_index
+
+
+def tokenize(text):
+    tokens = []
+    text_lines = text.split('\n')
+    for text_line in text_lines:
+        text_line_tokenized = [
+            get_term(token)
+            for token in text_line.split(' ') if token
+        ]
+        tokens.extend(text_line_tokenized)
+    return tokens
+
+
+def get_term(token):
+    return re.sub(r'[^\w\s]', '', token).lower()
+
+
+def get_tf_idf(document_id, documents_has_token):
+    filter_document_has_token = filter(
+        lambda document: document['id'] == document_id, documents_has_token
+    )
+    document_has_token = list(filter_document_has_token)[0]
+    tf_idf = document_has_token['tf-idf']
     return tf_idf


-# doc1 = "I can't shoot straight unless I've had a pint!"
-# doc2 = "Don't shoot shoot shoot that thing at me."
-# doc3 = "I'm your shooter."
+def get_tf(document_as_tokens, token):
+    document_tokens_count = len(document_as_tokens)
+    token_in_document_count = document_as_tokens.count(token)
+    tf = token_in_document_count / document_tokens_count
+    return tf

-# docs = [
-#     {'id': 'doc1', 'text': doc1},
-#     {'id': 'doc2', 'text': doc2},
-#     {'id': 'doc3', 'text': doc3},
-# ]

-# pprint(get_tf_idf(docs))
+def get_idf(documents_as_tokens, token):
+    documents_count = len(documents_as_tokens)
+    filter_documents_has_token = filter(
+        lambda document: token in document['tokens'], documents_as_tokens
+    )
+    documents_has_token = list(filter_documents_has_token)
+    documents_has_token_count = len(documents_has_token)
+    idf = log2((documents_count + 1) / (documents_has_token_count + 0.5))
+    return idf
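
The rewritten search() is easy to exercise against the sample documents that this commit deleted from the trailing comments. Below is a minimal usage sketch, not part of the commit: the import path search_engine.search_engine is an assumption based on the file location, and the query string 'shoot' is a hypothetical choice; everything else follows from the code above.

# Usage sketch. Assumptions: the module is importable from its repo path
# (search_engine/search_engine.py), and the query 'shoot' is hypothetical.
# The sample documents are the ones removed from the old commented-out demo.
from search_engine.search_engine import search

docs = [
    {'id': 'doc1', 'text': "I can't shoot straight unless I've had a pint!"},
    {'id': 'doc2', 'text': "Don't shoot shoot shoot that thing at me."},
    {'id': 'doc3', 'text': "I'm your shooter."},
]

# The term 'shoot' occurs in 2 of the 3 documents, so
#     idf = log2((3 + 1) / (2 + 0.5)) = log2(1.6) ≈ 0.678.
# doc2 tokenizes to 8 terms, 3 of them 'shoot': tf = 3/8, tf-idf ≈ 0.2543.
# doc1 tokenizes to 9 terms, 1 of them 'shoot': tf = 1/9, tf-idf ≈ 0.0753.
# doc3 only yields the distinct term 'shooter', so it never matches 'shoot'.
print(search(docs, 'shoot'))  # expected: ['doc2', 'doc1']

Note the smoothing in the new IDF: since (N + 1) / (df + 0.5) exceeds 1 for any document frequency df ≤ N, log2 of it stays positive even for a term that appears in every document, so a ubiquitous term can never push a document's relevance score negative.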
