File tree Expand file tree Collapse file tree 2 files changed +40
-5
lines changed Expand file tree Collapse file tree 2 files changed +40
-5
lines changed Original file line number Diff line number Diff line change 11import re
22from collections import defaultdict
3+ from itertools import chain
4+
5+
def preprocess(text):
    """Return *text* with punctuation removed and all letters lowercased.

    Every character that is neither a regex word character nor whitespace
    is dropped; whitespace itself is left untouched so the result can
    still be split into words.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    return without_punctuation.lower()
38
49
510def search (docs : list , words : str ) -> list :
611 """
712 Searches for words in the documents and returns their IDs,
813 ranked by the total count of word occurrences in the document text.
914 """
10- def preprocess (text ):
11- return re .sub (r'[^\w\s]' , '' , text ).lower ()
1215
1316 result = []
1417 words = preprocess (words ).split ()
1518
16-
1719 for doc in docs :
1820 cleaned_text = preprocess (doc ['text' ])
1921 word_count = 0
@@ -30,3 +32,22 @@ def preprocess(text):
3032 result .sort (key = lambda x : x ['word_count' ], reverse = True )
3133
3234 return [item ['id' ] for item in result ]
35+
36+
def get_inverted_index(docs: list) -> dict:
    """
    Builds an inverted index from a list of documents.

    An inverted index maps each unique word found in the documents to a list of
    document IDs where that word appears. Words are taken from the
    preprocessed document text (punctuation stripped, lowercased) split on
    whitespace.

    For every word the IDs are ordered by how many times the word occurs in
    the document, most occurrences first; documents with the same count keep
    their relative order from ``docs``.

    Each item of ``docs`` is expected to be a mapping with ``'id'`` and
    ``'text'`` keys. An empty ``docs`` list yields an empty dict.
    """
    # NOTE(review): the previous version called search(docs, word) once per
    # unique word, re-preprocessing every document each time. This single
    # pass assumes search() counts whole-token occurrences -- confirm.

    # word -> [(doc_id, occurrences), ...], filled in document order.
    postings = defaultdict(list)

    for doc in docs:
        occurrences = defaultdict(int)
        for token in preprocess(doc['text']).split():
            occurrences[token] += 1
        for token, count in occurrences.items():
            postings[token].append((doc['id'], count))

    # sorted() is stable, also with reverse=True, so equal counts stay in
    # document order.
    return {
        word: [
            doc_id
            for doc_id, _ in sorted(
                entries, key=lambda entry: entry[1], reverse=True
            )
        ]
        for word, entries in postings.items()
    }
Original file line number Diff line number Diff line change 11import pytest
2- from search_engine .search_engine import search
2+ from search_engine .search_engine import preprocess , search , get_inverted_index
33
44docs = [
55 {'id' : 1 , 'text' : 'hello world' },
@@ -117,4 +117,18 @@ def test_search_multiple_words(self):
117117 ]
118118
119119 result = search (docs , 'shoot at me' )
120- assert result == ['doc2' , 'doc1' ]
120+ assert result == ['doc2' , 'doc1' ]
121+
122+
def test_get_inverted_index(self):
    """Each word maps to the IDs of the documents whose text contains it."""
    corpus = [
        {'id': 'doc1', 'text': 'some text'},
        {'id': 'doc2', 'text': 'some text too'},
    ]
    expected = {
        'some': ['doc1', 'doc2'],
        'text': ['doc1', 'doc2'],
        'too': ['doc2'],
    }

    assert get_inverted_index(corpus) == expected
You can’t perform that action at this time.
0 commit comments