Skip to content

Commit d4ac606

Browse files
authored
add probe
1 parent 71dcc80 commit d4ac606

File tree

1 file changed

+17
-26
lines changed

1 file changed

+17
-26
lines changed

search_engine/search_engine.py

Lines changed: 17 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,7 @@
11
import re
2-
from collections import defaultdict
2+
from collections import defaultdict, Counter
33
from itertools import chain
44
from math import log
5-
from pprint import pprint
6-
75

86
def preprocess(text):
97
return re.sub(r'[^\w\s]|(?<!\w)_', '', text).lower()
@@ -14,22 +12,14 @@ def search(docs: list, words: str) -> list:
1412
Searches for words in the documents and returns their IDs,
1513
ranked by the total count of word occurrences in the document text.
1614
"""
17-
18-
result = []
1915
words = preprocess(words).split()
16+
result = []
2017

2118
for doc in docs:
2219
cleaned_text = preprocess(doc['text'])
23-
word_count = 0
24-
25-
for word in words:
26-
word_count += cleaned_text.split().count(word)
27-
20+
word_count = sum(Counter(cleaned_text.split())[word] for word in words)
2821
if word_count:
29-
result.append({
30-
'id': doc['id'],
31-
'word_count': word_count
32-
})
22+
result.append({'id': doc['id'], 'word_count': word_count})
3323

3424
result.sort(key=lambda x: x['word_count'], reverse=True)
3525

@@ -39,20 +29,21 @@ def search(docs: list, words: str) -> list:
3929
def get_inverted_index(docs: list) -> dict:
4030
"""
4131
Builds an inverted index from a list of documents.
42-
43-
An inverted index maps each unique word found in the documents to a list of
44-
document IDs where that word appears.
4532
"""
46-
result = {}
47-
48-
words_pit = set(chain.from_iterable(
49-
(preprocess(dic['text'])).split()
50-
for dic in docs))
33+
inverted_index = defaultdict(list)
5134

52-
for word in words_pit:
53-
result[word] = search(docs, word)
54-
55-
return result
35+
# Build the index entry for each word
36+
for doc in docs:
37+
words = preprocess(doc['text']).split()
38+
word_counts = Counter(words)
39+
40+
for word, count in word_counts.items():
41+
if word not in inverted_index:
42+
inverted_index[word] = [doc['id']]
43+
else:
44+
inverted_index[word].append(doc['id'])
45+
46+
return dict(inverted_index)
5647

5748

5849
def get_tf(text: str) -> dict:

0 commit comments

Comments
 (0)