1 | 1 | import re |
2 | | -from collections import defaultdict, Counter |
3 | | -from itertools import chain |
4 | | -from math import log |
5 | | - |
6 | | -def preprocess(text): |
7 | | - return re.sub(r'[^\w\s]|(?<!\w)_', '', text).lower() |
8 | | - |
9 | | - |
10 | | -def search(docs: list, words: str) -> list: |
11 | | - """ |
12 | | - Searches for words in the documents and returns their IDs, |
13 | | - ranked by the total count of word occurrences in the document text. |
14 | | - """ |
15 | | - words = preprocess(words).split() |
16 | | - result = [] |
17 | | - |
18 | | - for doc in docs: |
19 | | - cleaned_text = preprocess(doc['text']) |
20 | | - word_count = sum(Counter(cleaned_text.split())[word] for word in words) |
21 | | - if word_count: |
22 | | - result.append({'id': doc['id'], 'word_count': word_count}) |
23 | | - |
24 | | - result.sort(key=lambda x: x['word_count'], reverse=True) |
25 | | - |
26 | | - return [item['id'] for item in result] |
27 | | - |
28 | | - |
29 | | -def get_inverted_index(docs: list) -> dict: |
30 | | - """ |
31 | | - Builds an inverted index from a list of documents. |
32 | | - """ |
33 | | - inverted_index = defaultdict(list) |
34 | | - |
35 | | - # Create an index for each word |
36 | | - for doc in docs: |
37 | | - words = preprocess(doc['text']).split() |
38 | | - word_counts = Counter(words) |
39 | | - |
40 | | - for word, count in word_counts.items(): |
41 | | - if word not in inverted_index: |
42 | | - inverted_index[word] = [doc['id']] |
43 | | - else: |
44 | | - inverted_index[word].append(doc['id']) |
45 | | - |
46 | | - return dict(inverted_index) |
47 | | - |
48 | | - |
49 | | -def get_tf(text: str) -> dict: |
50 | | - """ Compute TF """ |
51 | | - cleaned_text = preprocess(text).split() |
52 | | - total_words = len(cleaned_text) |
53 | | - tf = defaultdict(float) |
54 | | - |
55 | | - for word in cleaned_text: |
56 | | - tf[word] += 1 |
57 | | - |
58 | | - for word in tf: |
59 | | - tf[word] /= total_words |
60 | | - |
61 | | - return tf |
62 | | - |
63 | | - |
64 | | -def get_idf(docs): |
65 | | - """ Compute IDF """ |
66 | | - N = len(docs) |
67 | | - idf = {} |
68 | | - |
69 | | - for doc in docs: |
70 | | - words_in_doc = preprocess(doc['text']).split() |
71 | | - for word in words_in_doc: |
72 | | - q_of_docs_with_word = len(search(docs, word)) |
73 | | - idf[word] = log(N / (1 + q_of_docs_with_word)) + 1 |
74 | | - |
75 | | - return idf |
76 | | - |
77 | | - |
78 | | -def get_tf_idf(docs): |
79 | | - """ Compute TF-IDF """ |
80 | | - idf = get_idf(docs) |
81 | | - tf_idf = {} |
82 | | - |
83 | | - for doc in docs: |
84 | | - doc_id = doc['id'] |
85 | | - tf = get_tf(doc['text']) |
86 | | - tf_idf[doc_id] = { |
87 | | - word: round(tf[word] * idf[word], 4) |
88 | | - for word in tf |
| 2 | +from math import log2 |
| 3 | + |
| 4 | + |
| 5 | +def search(documents, query): |
| 6 | + inverted_index = get_inverted_index(documents) |
| 7 | + result_with_relevance = {} |
| 8 | + query_tokens = tokenize(query) |
| 9 | + for query_token in query_tokens: |
| 10 | + documents_has_token = inverted_index.get(query_token) |
| 11 | + if not documents_has_token: |
| 12 | + continue |
| 13 | + for document in documents_has_token: |
| 14 | + result_with_relevance.setdefault(document['id'], 0) |
| 15 | + token_tf_idf = get_tf_idf(document['id'], documents_has_token) |
| 16 | + result_with_relevance[document['id']] += token_tf_idf |
| 17 | + result = sorted( |
| 18 | + result_with_relevance, |
| 19 | + key=result_with_relevance.get, |
| 20 | + reverse=True) |
| 21 | + return result |
| 22 | + |
| 23 | + |
| 24 | +def get_inverted_index(documents): |
| 25 | + inverted_index = {} |
| 26 | + tokens_all = set() |
| 27 | + documents_as_tokens = [] |
| 28 | + for document in documents: |
| 29 | + document_tokens = tokenize(document['text']) |
| 30 | + current_document_tokenized = { |
| 31 | + 'id': document['id'], |
| 32 | + 'tokens': document_tokens |
89 | 33 | } |
90 | | - |
| 34 | + documents_as_tokens.append(current_document_tokenized) |
| 35 | + tokens_all.update(document_tokens) |
| 36 | + for token in tokens_all: |
| 37 | + inverted_index[token] = [] |
| 38 | + idf = get_idf(documents_as_tokens, token) |
| 39 | + for document in documents_as_tokens: |
| 40 | + if token in document['tokens']: |
| 41 | + tf = get_tf(document['tokens'], token) |
| 42 | + current_document_with_relevance = { |
| 43 | + 'id': document['id'], |
| 44 | + 'tf-idf': round(tf * idf, 4) |
| 45 | + } |
| 46 | + inverted_index[token].append(current_document_with_relevance) |
| 47 | + return inverted_index |
| 48 | + |
| 49 | + |
| 50 | +def tokenize(text): |
| 51 | + tokens = [] |
| 52 | + text_lines = text.split('\n') |
| 53 | + for text_line in text_lines: |
| 54 | + text_line_tokenized = [ |
| 55 | + get_term(token) |
| 56 | + for token in text_line.split(' ') if token |
| 57 | + ] |
| 58 | + tokens.extend(text_line_tokenized) |
| 59 | + return tokens |
| 60 | + |
| 61 | + |
| 62 | +def get_term(token): |
| 63 | + return re.sub(r'[^\w\s]', '', token).lower() |
| 64 | + |
| 65 | + |
| 66 | +def get_tf_idf(document_id, documents_has_token): |
| 67 | + filter_document_has_token = filter( |
| 68 | + lambda document: document['id'] == document_id, documents_has_token |
| 69 | + ) |
| 70 | + document_has_token = list(filter_document_has_token)[0] |
| 71 | + tf_idf = document_has_token['tf-idf'] |
91 | 72 | return tf_idf |
92 | 73 |
93 | 74 |
94 | | -# doc1 = "I can't shoot straight unless I've had a pint!" |
95 | | -# doc2 = "Don't shoot shoot shoot that thing at me." |
96 | | -# doc3 = "I'm your shooter." |
| 75 | +def get_tf(document_as_tokens, token): |
| 76 | + document_tokens_count = len(document_as_tokens) |
| 77 | + token_in_document_count = document_as_tokens.count(token) |
| 78 | + tf = token_in_document_count / document_tokens_count |
| 79 | + return tf |
97 | 80 |
98 | | -# docs = [ |
99 | | -# {'id': 'doc1', 'text': doc1}, |
100 | | -# {'id': 'doc2', 'text': doc2}, |
101 | | -# {'id': 'doc3', 'text': doc3}, |
102 | | -# ] |
103 | 81 |
104 | | -# pprint(get_tf_idf(docs)) |
| 82 | +def get_idf(documents_as_tokens, token): |
| 83 | + documents_count = len(documents_as_tokens) |
| 84 | + filter_documents_has_token = filter( |
| 85 | + lambda document: token in document['tokens'], documents_as_tokens |
| 86 | + ) |
| 87 | + documents_has_token = list(filter_documents_has_token) |
| 88 | + documents_has_token_count = len(documents_has_token) |
| 89 | + idf = log2((documents_count + 1) / (documents_has_token_count + 0.5)) |
| 90 | + return idf |
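
The removed file ended with a commented-out usage example; the sketch below reuses those same documents against the new API. It is a minimal smoke test, not part of the diff, and it assumes it runs in (or imports from) the module this change modifies:

```python
from pprint import pprint

# Example documents reused from the old commented-out block.
doc1 = "I can't shoot straight unless I've had a pint!"
doc2 = "Don't shoot shoot shoot that thing at me."
doc3 = "I'm your shooter."

docs = [
    {'id': 'doc1', 'text': doc1},
    {'id': 'doc2', 'text': doc2},
    {'id': 'doc3', 'text': doc3},
]

# Each posting list holds the document id and its rounded TF-IDF weight.
# For 'shoot': idf = log2((3 + 1) / (2 + 0.5)) ≈ 0.678, giving doc2 a weight
# of 0.2543 (tf = 3/8) and doc1 a weight of 0.0753 (tf = 1/9).
pprint(get_inverted_index(docs)['shoot'])

# doc2 ranks first: 'shoot' occurs three times there, and it is the only
# document containing 'at' and 'me'.
print(search(docs, 'shoot at me'))  # ['doc2', 'doc1']
```

The smoothed IDF in get_idf, log2((documents_count + 1) / (documents_has_token_count + 0.5)), stays positive even when a token occurs in every document, so common words still contribute a small weight instead of zeroing out.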