11import re
2- from collections import defaultdict
2+ from collections import defaultdict , Counter
33from itertools import chain
44from math import log
5- from pprint import pprint
6-
75
def preprocess(text: str) -> str:
    """
    Normalizes text before it is split into words.

    Removes every character that is neither a word character nor whitespace,
    as well as any underscore that is not directly preceded by a word
    character, and lowercases the result. Whitespace is left untouched, so
    removing punctuation between two spaces leaves both spaces in place.

    Args:
        text: The raw text to normalize.

    Returns:
        The lowercased text with the matched characters removed.
    """
    return re.sub(r'[^\w\s]|(?<!\w)_', '', text).lower()
@@ -14,22 +12,14 @@ def search(docs: list, words: str) -> list:
1412 Searches for words in the documents and returns their IDs,
1513 ranked by the total count of word occurrences in the document text.
1614 """
17-
18- result = []
1915 words = preprocess (words ).split ()
16+ result = []
2017
2118 for doc in docs :
2219 cleaned_text = preprocess (doc ['text' ])
23- word_count = 0
24-
25- for word in words :
26- word_count += cleaned_text .split ().count (word )
27-
20+ word_count = sum (Counter (cleaned_text .split ())[word ] for word in words )
2821 if word_count :
29- result .append ({
30- 'id' : doc ['id' ],
31- 'word_count' : word_count
32- })
22+ result .append ({'id' : doc ['id' ], 'word_count' : word_count })
3323
3424 result .sort (key = lambda x : x ['word_count' ], reverse = True )
3525
@@ -39,20 +29,21 @@ def search(docs: list, words: str) -> list:
def get_inverted_index(docs: list) -> dict:
    """
    Builds an inverted index from a list of documents.

    Each document's 'text' is normalized with preprocess() and split on
    whitespace. Every distinct word is mapped to the list of 'id' values of
    the documents that contain it, in the order the documents are given.

    Args:
        docs: Documents to index; each one is read through its 'text' and
            'id' keys.

    Returns:
        A plain dict mapping each word to a list of document IDs. A document
        ID appears at most once per word for each document processed.
    """
    inverted_index = defaultdict(list)

    for doc in docs:
        # dict.fromkeys de-duplicates the words while keeping their
        # first-occurrence order, so a document is recorded once per word.
        for word in dict.fromkeys(preprocess(doc['text']).split()):
            inverted_index[word].append(doc['id'])

    # Hand back a regular dict so that looking up an unknown word raises
    # KeyError instead of silently inserting an empty list.
    return dict(inverted_index)
5647
5748
5849def get_tf (text : str ) -> dict :
0 commit comments