Skip to content

Commit 53351e8

Browse files
author
shmelevik
committed
add search for multiple words
1 parent b81fbf6 commit 53351e8

File tree

2 files changed

+41
-7
lines changed

2 files changed

+41
-7
lines changed

search_engine/search_engine.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,32 @@
11
import re
2+
from collections import defaultdict
23

3-
def search(docs: list, word: str) -> list:
4+
5+
def search(docs: list, words: str) -> list:
46
"""
5-
Searches for a word in the documents and returns their IDs.
7+
Searches for words in the documents and returns their IDs,
8+
ranked by the total count of word occurrences in the document text.
69
"""
710
def preprocess(text):
811
return re.sub(r'[^\w\s]', '', text).lower()
912

1013
result = []
11-
word = preprocess(word)
14+
words = preprocess(words).split()
15+
1216

1317
for doc in docs:
1418
cleaned_text = preprocess(doc['text'])
15-
word_count = cleaned_text.split().count(word)
19+
word_count = 0
20+
21+
for word in words:
22+
word_count += cleaned_text.split().count(word)
1623

1724
if word_count:
18-
result.append((doc['id'], word_count))
25+
result.append({
26+
'id': doc['id'],
27+
'word_count': word_count
28+
})
29+
30+
result.sort(key=lambda x: x['word_count'], reverse=True)
1931

20-
result.sort(key=lambda x: x[1], reverse=True)
21-
return [id for id, _ in result]
32+
return [item['id'] for item in result]

tests/test_search_engine.py

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,4 +94,27 @@ def test_search_ranging(self):
9494
]
9595

9696
result = search(docs, 'shoot')
97+
assert result == ['doc2', 'doc1']
98+
99+
def test_search_multiple_words(self):
100+
"""
101+
Test that the search function returns documents containing any of the words
102+
in the search query, and sorts them based on the frequency of the words' occurrence.
103+
104+
The test ensures the following:
105+
- The search can handle multiple words (e.g., 'shoot at me').
106+
- Documents are returned in the order of their relevance (based on word frequency).
107+
- Words in the search query are treated independently (i.e., not as a single phrase).
108+
"""
109+
doc1 = "I can't shoot straight unless I've had a pint!"
110+
doc2 = "Don't shoot shoot shoot that thing at me."
111+
doc3 = "I'm your shooter."
112+
113+
docs = [
114+
{'id': 'doc1', 'text': doc1},
115+
{'id': 'doc2', 'text': doc2},
116+
{'id': 'doc3', 'text': doc3},
117+
]
118+
119+
result = search(docs, 'shoot at me')
97120
assert result == ['doc2', 'doc1']

0 commit comments

Comments
 (0)