Skip to content

Commit ad51492

Browse files
author
shmelevik
committed
add inverted_index, split search into 2 func
1 parent 53351e8 commit ad51492

File tree

2 files changed

+40
-5
lines changed

2 files changed

+40
-5
lines changed

search_engine/search_engine.py

Lines changed: 24 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,19 +1,21 @@
11
import re
22
from collections import defaultdict
3+
from itertools import chain
4+
5+
6+
def preprocess(text):
    """Normalize *text* for word matching.

    Removes every character that is neither a word character (letters,
    digits, underscore) nor whitespace, then lowercases what is left.
    Whitespace is left untouched, so callers can still split on it.
    """
    punctuation_free = re.sub(r'[^\w\s]', '', text)
    return punctuation_free.lower()
38

49

510
def search(docs: list, words: str) -> list:
611
"""
712
Searches for words in the documents and returns their IDs,
813
ranked by the total count of word occurrences in the document text.
914
"""
10-
def preprocess(text):
11-
return re.sub(r'[^\w\s]', '', text).lower()
1215

1316
result = []
1417
words = preprocess(words).split()
1518

16-
1719
for doc in docs:
1820
cleaned_text = preprocess(doc['text'])
1921
word_count = 0
@@ -30,3 +32,22 @@ def preprocess(text):
3032
result.sort(key=lambda x: x['word_count'], reverse=True)
3133

3234
return [item['id'] for item in result]
35+
36+
37+
def get_inverted_index(docs: list) -> dict:
    """
    Builds an inverted index from a list of documents.

    An inverted index maps each unique word found in the documents to a list of
    document IDs where that word appears.

    Each document is a dict with an 'id' and a 'text' key. Words are taken
    from the preprocessed text (punctuation removed, lowercased) split on
    whitespace. For every word the document IDs are ordered by how often the
    word occurs in the document, most frequent first; documents with the same
    count keep their order from ``docs``.

    The index is built in a single pass over the documents instead of running
    a full search over the whole corpus once per unique word.
    """
    # NOTE(review): the ordering mirrors the ranking described in search()'s
    # docstring and assumes search() matches whole words only - confirm
    # against its implementation.

    # word -> {doc_id: occurrence count}. The inner dicts preserve insertion
    # order, i.e. the order in which documents were first seen.
    occurrences = defaultdict(dict)

    for doc in docs:
        doc_id = doc['id']
        for word in preprocess(doc['text']).split():
            per_doc = occurrences[word]
            per_doc[doc_id] = per_doc.get(doc_id, 0) + 1

    # sorted() is stable even with reverse=True, so equal counts stay in
    # document order.
    return {
        word: sorted(per_doc, key=per_doc.get, reverse=True)
        for word, per_doc in occurrences.items()
    }

tests/test_search_engine.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import pytest
2-
from search_engine.search_engine import search
2+
from search_engine.search_engine import preprocess, search, get_inverted_index
33

44
docs = [
55
{'id': 1, 'text': 'hello world'},
@@ -117,4 +117,18 @@ def test_search_multiple_words(self):
117117
]
118118

119119
result = search(docs, 'shoot at me')
120-
assert result == ['doc2', 'doc1']
120+
assert result == ['doc2', 'doc1']
121+
122+
123+
def test_get_inverted_index(self):
    """Every word maps to the IDs of the documents that contain it."""
    corpus = [
        {'id': 'doc1', 'text': 'some text'},
        {'id': 'doc2', 'text': 'some text too'},
    ]

    expected = {
        'some': ['doc1', 'doc2'],
        'text': ['doc1', 'doc2'],
        'too': ['doc2'],
    }

    actual = get_inverted_index(corpus)

    assert actual == expected

0 commit comments

Comments
 (0)