@@ -1,5 +1,6 @@
 from openai import Embedding, Completion
-from openai.embeddings_utils import distances_from_embeddings
+from typing import List
+from scipy import spatial
 import pickle
 from pathlib import Path
 from sklearn.feature_extraction.text import TfidfVectorizer
@@ -25,6 +26,24 @@ def load_pickle_file():
 global_df = load_pickle_file()


+def distances_from_embeddings(
+    query_embedding: List[float],
+    embeddings: List[List[float]],
+    distance_metric="cosine",
+) -> List[float]:
+    distance_metrics = {
+        "cosine": spatial.distance.cosine,
+        "L1": spatial.distance.cityblock,
+        "L2": spatial.distance.euclidean,
+        "Linf": spatial.distance.chebyshev,
+    }
+    distances = [
+        distance_metrics[distance_metric](query_embedding, embedding)
+        for embedding in embeddings
+    ]
+    return distances
+
+
 def create_context(question, df, max_len=1800, size="ada"):
     """
     Create a context for a question by finding the most similar context from the dataframe
@@ -108,13 +127,13 @@ def preprocess_text(text: str) -> str:
     # Remove punctuation marks
     text = text.translate(str.maketrans("", "", string.punctuation))
     # Remove numbers
-    text = re.sub(r'\d+', '', text)
+    text = re.sub(r"\d+", "", text)
     # Remove extra whitespaces
-    text = re.sub(r'\s+', ' ', text)
+    text = re.sub(r"\s+", " ", text)
     # Remove stopwords
-    nltk.download('stopwords')
-    stop_words = set(stopwords.words('english'))
-    text = ' '.join([word for word in text.split() if word not in stop_words])
+    nltk.download("stopwords")
+    stop_words = set(stopwords.words("english"))
+    text = " ".join([word for word in text.split() if word not in stop_words])
     return text


@@ -123,9 +142,10 @@ def getPostChunks(post: Post, chunk_size: int = 1800) -> list:
     Split the post content into chunks of specified size
     """
     content = preprocess_text(post.content)
-    chunks = [content[i: i + chunk_size] for i in range(0, len(content), chunk_size)]
+    chunks = [content[i : i + chunk_size] for i in range(0, len(content), chunk_size)]
     return chunks

+
 def compute_similarity(post_id: int) -> None:
     post = Post.objects.get(id=post_id)
     other_posts = Post.objects.exclude(id=post_id).exclude(content="")
@@ -135,7 +155,8 @@ def compute_similarity(post_id: int) -> None:

     # Create a list of (chunk, post_pk) tuples for all other posts
     combined_texts_and_pks = [
-        (chunk, other_post.pk) for other_post in other_posts
+        (chunk, other_post.pk)
+        for other_post in other_posts
         for chunk in getPostChunks(other_post)
     ]

@@ -152,18 +173,24 @@ def compute_similarity(post_id: int) -> None:
         return

     cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])
-
+
     # Calculate the number of similar posts to consider
     num_similar_posts = min(len(set(post_pks)) - 1, 3)  # Exclude the target post itself

     # Get the top indices, but make sure to map them back to unique post PKs
     top_indices = np.argsort(-cosine_sim[0])[:num_similar_posts]
-    unique_top_pks = {post_pks[i + 1] for i in top_indices}  # +1 to skip the first post itself
+    unique_top_pks = {
+        post_pks[i + 1] for i in top_indices
+    }  # +1 to skip the first post itself

     for pk in unique_top_pks:
-        idx = combined_texts_and_pks.index(next(filter(lambda x: x[1] == pk, combined_texts_and_pks)))
+        idx = combined_texts_and_pks.index(
+            next(filter(lambda x: x[1] == pk, combined_texts_and_pks))
+        )
         Similarity.objects.update_or_create(
             post1=post,
             post2=Post.objects.get(pk=pk),
-            defaults={"score": cosine_sim[0][idx - 1]},  # Adjust index for cosine_sim offset
-        )
+            defaults={
+                "score": cosine_sim[0][idx - 1]
+            },  # Adjust index for cosine_sim offset
+        )
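
For context on the first hunk: the added distances_from_embeddings helper vendors the function that the removed openai.embeddings_utils module used to provide, mapping a metric name onto the matching scipy.spatial.distance function. A minimal sketch of the idea with made-up toy vectors (real embeddings would come from the OpenAI API; none of these values are from the commit):

from scipy import spatial

# Toy vectors standing in for real embeddings (illustrative only)
query = [0.1, 0.2, 0.3]
corpus = [[0.1, 0.2, 0.3], [0.3, 0.2, 0.1], [0.0, 1.0, 0.0]]

# Cosine distance = 1 - cosine similarity, so 0.0 means "same direction"
distances = [spatial.distance.cosine(query, emb) for emb in corpus]
print(distances)  # first entry is 0.0: identical to the query

The smallest distance marks the most similar embedding, which is how create_context can pick the best-matching rows from the dataframe.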
0 commit comments
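The compute_similarity hunks are formatting-only, but the TF-IDF ranking they reformat is easier to follow in isolation. A standalone sketch with invented texts (the real code builds its corpus from chunked Django Post objects and persists Similarity rows; the strings below are hypothetical):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

target = "django signals and post similarity"
others = ["django models and signals", "baking sourdough bread", "tf-idf text similarity"]

# Row 0 is the target; rows 1..n are the candidates, matching the code above
tfidf_matrix = TfidfVectorizer().fit_transform([target] + others)
cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

# Highest-scoring candidates first; indices refer into `others`
top_indices = np.argsort(-cosine_sim[0])[:3]
print([(others[i], round(float(cosine_sim[0][i]), 3)) for i in top_indices])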