11import re
22from collections import defaultdict
33from itertools import chain
4+ from math import log
5+ from pprint import pprint
46
57
68def preprocess (text ):
@@ -53,36 +55,59 @@ def get_inverted_index(docs: list) -> dict:
5355 return result
5456
5557
def get_tf(text: str) -> dict:
    """Compute the term frequency of every token in *text*.

    The text is normalised with ``preprocess`` and split on whitespace.
    Each distinct token is mapped to its occurrence count divided by the
    total number of tokens.

    Args:
        text: Raw document text.

    Returns:
        A ``defaultdict(float)`` mapping token -> relative frequency;
        empty when the preprocessed text contains no tokens.
    """
    tokens = preprocess(text).split()
    token_count = len(tokens)

    # Raw occurrence counts, kept as floats and in first-seen order.
    occurrences = defaultdict(float)
    for token in tokens:
        occurrences[token] += 1

    # Normalise by document length; an empty token list yields no entries,
    # so the division is never reached with token_count == 0.
    return defaultdict(
        float,
        ((token, hits / token_count) for token, hits in occurrences.items()),
    )
71+
72+
def get_idf(docs):
    """Compute a smoothed inverse document frequency for every term.

    Every token found in any document's preprocessed text is mapped to
    ``log(N / (1 + df)) + 1``, where ``N`` is ``len(docs)`` and ``df`` is
    the number of results returned by ``search(docs, token)``.

    Args:
        docs: Sequence of dicts, each providing a ``'text'`` entry.

    Returns:
        dict mapping token -> IDF weight; empty when *docs* is empty.
    """
    total_docs = len(docs)
    idf = {}

    for doc in docs:
        for word in preprocess(doc['text']).split():
            # The weight depends only on the corpus and the term, so each
            # term is looked up once rather than once per occurrence.
            # NOTE(review): assumes search() is deterministic for the same
            # (docs, word) arguments - confirm against its definition.
            if word in idf:
                continue
            docs_with_word = len(search(docs, word))
            idf[word] = log(total_docs / (1 + docs_with_word)) + 1

    return idf
6585
6686
def get_tf_idf(docs):
    """Compute TF-IDF weights for every document in *docs*.

    The IDF table is built once for the whole corpus via ``get_idf``.
    Each document's term frequencies from ``get_tf`` are then multiplied
    by the matching IDF weight and rounded to four decimal places.

    Args:
        docs: Sequence of dicts, each providing ``'id'`` and ``'text'``.

    Returns:
        dict mapping document id -> {token: rounded TF-IDF weight}.
    """
    corpus_idf = get_idf(docs)
    weights_by_doc = {}

    for doc in docs:
        doc_id = doc['id']
        term_freqs = get_tf(doc['text'])

        doc_weights = {}
        for term, freq in term_freqs.items():
            doc_weights[term] = round(freq * corpus_idf[term], 4)

        weights_by_doc[doc_id] = doc_weights

    return weights_by_doc
76101
77102
78- doc1 = "I can't shoot straight unless I've had a pint!"
79- doc2 = "Don't shoot shoot shoot that thing at me."
80- doc3 = "I'm your shooter."
103+ # doc1 = "I can't shoot straight unless I've had a pint!"
104+ # doc2 = "Don't shoot shoot shoot that thing at me."
105+ # doc3 = "I'm your shooter."
81106
82- docs = [
83- {'id' : 'doc1' , 'text' : doc1 },
84- {'id' : 'doc2' , 'text' : doc2 },
85- {'id' : 'doc3' , 'text' : doc3 },
86- ]
107+ # docs = [
108+ # {'id': 'doc1', 'text': doc1},
109+ # {'id': 'doc2', 'text': doc2},
110+ # {'id': 'doc3', 'text': doc3},
111+ # ]
87112
88- print (get_tf_idf (docs ))
113+ # pprint (get_tf_idf(docs))
0 commit comments