raw_ready

shmelevik · shmelevik · commit 1b5a52af3880 · 2025-02-27T19:56:51.000+03:00
diff --git a/search_engine/search_engine.py b/search_engine/search_engine.py
@@ -1,6 +1,8 @@
 import re
 from collections import defaultdict
 from itertools import chain
+from math import log
+from pprint import pprint
 
 
 def preprocess(text):
@@ -53,36 +55,59 @@ def get_inverted_index(docs: list) -> dict:
     return result
 
 
-def get_tf(doc: list, all_terms:dict = None):
+def get_tf(text: str) -> dict:
     """ Compute TF """
-    cleaned_text = preprocess(doc['text'])
-    unique_terms = cleaned_text.split()
+    cleaned_text = preprocess(text).split()  # whitespace-separated tokens
+    total_words = len(cleaned_text)
+    tf = defaultdict(float)  # unseen terms start from 0.0
 
-    for word in unique_terms:
-        all_terms[word]['quant_in_doc'] += cleaned_text.split().count(word)
-    
-    return all_terms
+    for word in cleaned_text:
+        tf[word] += 1  # raw occurrence count
+
+    for word in tf:  # empty when there are no tokens, so never divides by zero
+        tf[word] /= total_words  # count -> share of the document's tokens
+
+    return tf
+
+
+def get_idf(docs):
+    """ Compute IDF per term: log(N / (1 + df)) + 1, df = len(search(docs, term)) """
+    N = len(docs)
+    idf = {}
+
+    for doc in docs:
+        for word in preprocess(doc['text']).split():
+            # df is corpus-wide, so one search() per distinct term is enough
+            if word not in idf:
+                idf[word] = log(N / (1 + len(search(docs, word)))) + 1
+
+    return idf
 
 
 def get_tf_idf(docs):
     """ Compute TF-IDF """
-    N
+    idf = get_idf(docs)  # corpus-wide weights, computed once for all docs
+    tf_idf = {}  # doc id -> {term: tf-idf weight}
 
     for doc in docs:
-        unique_words_in_doc = set(preprocess(doc['text']).split())
-        for word in unique_words_in_doc:
+        doc_id = doc['id']
+        tf = get_tf(doc['text'])
+        tf_idf[doc_id] = {
+            word: round(tf[word] * idf[word], 4)  # rounded to 4 decimal places
+            for word in tf
+        }
 
-        pass
+    return tf_idf
 
 
-doc1 = "I can't shoot straight unless I've had a pint!"
-doc2 = "Don't shoot shoot shoot that thing at me."
-doc3 = "I'm your shooter."
+# doc1 = "I can't shoot straight unless I've had a pint!"
+# doc2 = "Don't shoot shoot shoot that thing at me."
+# doc3 = "I'm your shooter."
 
-docs = [
-    {'id': 'doc1', 'text': doc1},
-    {'id': 'doc2', 'text': doc2},
-    {'id': 'doc3', 'text': doc3},
-]
+# docs = [
+#     {'id': 'doc1', 'text': doc1},
+#     {'id': 'doc2', 'text': doc2},
+#     {'id': 'doc3', 'text': doc3},
+# ]
 
-print(get_tf_idf(docs))
+# pprint(get_tf_idf(docs))