Skip to content

Commit 1b5a52a

Browse files
author
shmelevik
committed
raw_ready
1 parent f9b67ff commit 1b5a52a

File tree

1 file changed

+45
-20
lines changed

1 file changed

+45
-20
lines changed

search_engine/search_engine.py

Lines changed: 45 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,8 @@
11
import re
22
from collections import defaultdict
33
from itertools import chain
4+
from math import log
5+
from pprint import pprint
46

57

68
def preprocess(text):
@@ -53,36 +55,59 @@ def get_inverted_index(docs: list) -> dict:
5355
return result
5456

5557

56-
def get_tf(text: str) -> dict:
    """Compute the term frequency of every token in *text*.

    The text is passed through ``preprocess`` and split on whitespace;
    each distinct token is mapped to its occurrence count divided by the
    total number of tokens.

    Returns a ``defaultdict(float)`` keyed by token. Text that yields no
    tokens produces an empty mapping.
    """
    tokens = preprocess(text).split()
    token_total = len(tokens)

    # Float-valued tallies, in order of first appearance.
    tallies = defaultdict(float)
    for token in tokens:
        tallies[token] = tallies[token] + 1

    # Normalise each tally by the token total.
    return defaultdict(
        float,
        {token: tally / token_total for token, tally in tallies.items()},
    )
71+
72+
73+
def get_idf(docs):
    """Compute an inverse document frequency score for every term in *docs*.

    Each element of *docs* is indexed with ``doc['text']``. The text is
    passed through ``preprocess`` and split on whitespace; every distinct
    token found is scored as ``log(N / (1 + df)) + 1``, where ``N`` is
    ``len(docs)`` and ``df`` is the number of results that
    ``search(docs, token)`` returns (presumably the documents containing
    the token; confirm against ``search``).

    Returns a dict mapping token to score, in order of first appearance.
    An empty *docs* yields an empty dict.
    """
    n_docs = len(docs)
    idf = {}

    for doc in docs:
        for word in preprocess(doc['text']).split():
            if word in idf:
                # The score depends only on (docs, word), so a repeated word
                # would get the same value again; skip the redundant search.
                # NOTE(review): assumes search() is deterministic for fixed
                # arguments.
                continue
            docs_with_word = len(search(docs, word))
            idf[word] = log(n_docs / (1 + docs_with_word)) + 1

    return idf
6585

6686

6787
def get_tf_idf(docs):
    """Compute TF-IDF weights for every document in *docs*.

    Each element of *docs* is indexed with ``doc['id']`` and
    ``doc['text']``. IDF scores are computed once for the whole
    collection via ``get_idf``; each document's term frequencies come
    from ``get_tf``.

    Returns a dict mapping each document id to a dict of
    ``{term: tf * idf}``, with every weight rounded to 4 decimal places.
    """
    idf_by_term = get_idf(docs)
    weights_by_doc = {}

    for document in docs:
        key = document['id']
        term_freqs = get_tf(document['text'])

        weights = {}
        for term in term_freqs:
            weights[term] = round(term_freqs[term] * idf_by_term[term], 4)
        weights_by_doc[key] = weights

    return weights_by_doc
76101

77102

78-
if __name__ == "__main__":
    # Demo corpus. Guarded so that importing this module has no side
    # effects; run the file directly to print the TF-IDF table.
    doc1 = "I can't shoot straight unless I've had a pint!"
    doc2 = "Don't shoot shoot shoot that thing at me."
    doc3 = "I'm your shooter."

    docs = [
        {'id': 'doc1', 'text': doc1},
        {'id': 'doc2', 'text': doc2},
        {'id': 'doc3', 'text': doc3},
    ]

    pprint(get_tf_idf(docs))

0 commit comments

Comments
 (0)