Skip to content

Commit a23288b

Browse files
committed
finished the discussion
1 parent a3369b3 commit a23288b

4 files changed

Lines changed: 373 additions & 203 deletions

File tree

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,5 @@
22
data/*
33
models/*
44
.vscode/*
5-
*__pycache__*
5+
*__pycache__*
6+
*pkl

notebooks/queries_comparison.ipynb

Lines changed: 102 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -149,22 +149,51 @@
149149
},
150150
{
151151
"cell_type": "code",
152-
"execution_count": 26,
152+
"execution_count": 39,
153+
"id": "d1b256c3",
154+
"metadata": {},
155+
"outputs": [
156+
{
157+
"name": "stdout",
158+
"output_type": "stream",
159+
"text": [
160+
"Total vectors indexed: 701528\n"
161+
]
162+
}
163+
],
164+
"source": [
165+
"import faiss\n",
166+
"\n",
167+
"dimension = corpus_embeddings.shape[1] # 384 for all-MiniLM-L6-v2\n",
168+
"\n",
169+
"index = faiss.IndexFlatL2(dimension) # L2 (Euclidean) distance\n",
170+
"# or use cosine similarity:\n",
171+
"# index = faiss.IndexFlatIP(dimension) # Inner product (cosine if normalized)\n",
172+
"\n",
173+
"index.add(corpus_embeddings)\n",
174+
"print(f\"Total vectors indexed: {index.ntotal}\")"
175+
]
176+
},
177+
{
178+
"cell_type": "code",
179+
"execution_count": 42,
153180
"id": "c9f4f122",
154181
"metadata": {},
155182
"outputs": [],
156183
"source": [
157184
"def semantic_search(query, top_k=5):\n",
158-
" query_embedding = model.encode([query])\n",
159-
" scores = cosine_similarity(query_embedding, corpus_embeddings)[0]\n",
160-
" top_indices = scores.argsort()[::-1][:top_k]\n",
161-
" result['score'] = scores # add scores to result for display\n",
162-
" return result.iloc[top_indices][['product_title', 'text', 'rating', 'score']].reset_index(drop=True)"
185+
" query_embedding = model.encode([query]).astype('float32')\n",
186+
" distances, indices = index.search(query_embedding, top_k)\n",
187+
" \n",
188+
" results = result.iloc[indices[0]].copy()\n",
189+
" results['distance'] = distances[0]\n",
190+
" \n",
191+
" return pd.DataFrame(results)[['product_title', 'text', 'rating', 'distance']].reset_index(drop=True)"
163192
]
164193
},
165194
{
166195
"cell_type": "code",
167-
"execution_count": 33,
196+
"execution_count": 44,
168197
"id": "df953299",
169198
"metadata": {},
170199
"outputs": [
@@ -192,72 +221,72 @@
192221
" <th>product_title</th>\n",
193222
" <th>text</th>\n",
194223
" <th>rating</th>\n",
195-
" <th>score</th>\n",
224+
" <th>distance</th>\n",
196225
" </tr>\n",
197226
" </thead>\n",
198227
" <tbody>\n",
199228
" <tr>\n",
200229
" <th>0</th>\n",
201-
" <td>Beard Straightener Brush For Men,BEENLE Electr...</td>\n",
202-
" <td>Nice, sharp, strong.</td>\n",
230+
" <td>(10-Pack+Bonus) Premium 4\"x4\" Waffle-Weave Mic...</td>\n",
231+
" <td>Amazing quality, and enviroment friendly. They...</td>\n",
203232
" <td>5.0</td>\n",
204-
" <td>0.562476</td>\n",
233+
" <td>0.506990</td>\n",
205234
" </tr>\n",
206235
" <tr>\n",
207236
" <th>1</th>\n",
208-
" <td>Segbeauty empty bottle 160083</td>\n",
209-
" <td>Love this product! The water comes out as a fi...</td>\n",
210-
" <td>5.0</td>\n",
211-
" <td>0.562476</td>\n",
237+
" <td>Wundercover: Tattoo Covers and Skin Shields (4...</td>\n",
238+
" <td>Works great, strong stick on skin and doesn't ...</td>\n",
239+
" <td>4.0</td>\n",
240+
" <td>0.522685</td>\n",
212241
" </tr>\n",
213242
" <tr>\n",
214243
" <th>2</th>\n",
215-
" <td>Walker Beauty Culture Scent &amp; Shine Coconut Oi...</td>\n",
216-
" <td>Very tiny bottle. The cost should be reduce.</td>\n",
217-
" <td>3.0</td>\n",
218-
" <td>0.562476</td>\n",
244+
" <td>Style Factor Wigout Leave in Detangler Sunset ...</td>\n",
245+
" <td>I love this product and it smells amazing</td>\n",
246+
" <td>5.0</td>\n",
247+
" <td>0.528536</td>\n",
219248
" </tr>\n",
220249
" <tr>\n",
221250
" <th>3</th>\n",
222-
" <td>OZNaturals Retinol Serum</td>\n",
223-
" <td>I cannot tell that it does anything-- just&nbsp;&nbsp;pu...</td>\n",
224-
" <td>2.0</td>\n",
225-
" <td>0.300436</td>\n",
251+
" <td>As I Am Curling Jelly - 4 ounce - Curl &amp; Coil ...</td>\n",
252+
" <td>Product does truly promote moisture which my h...</td>\n",
253+
" <td>4.0</td>\n",
254+
" <td>0.532994</td>\n",
226255
" </tr>\n",
227256
" <tr>\n",
228257
" <th>4</th>\n",
229-
" <td>2x Easy Fan Volume Lash Extensions 0.07 D curl...</td>\n",
230-
" <td>Affordable</td>\n",
258+
" <td>Bath Body Dry Brush Best for Lymphatic Drainag...</td>\n",
259+
" <td>Great!</td>\n",
231260
" <td>5.0</td>\n",
232-
" <td>0.294509</td>\n",
261+
" <td>0.555010</td>\n",
233262
" </tr>\n",
234263
" </tbody>\n",
235264
"</table>\n",
236265
"</div>"
237266
],
238267
"text/plain": [
239268
" product_title \\\n",
240-
"0 Beard Straightener Brush For Men,BEENLE Electr... \n",
241-
"1 Segbeauty empty bottle 160083 \n",
242-
"2 Walker Beauty Culture Scent & Shine Coconut Oi... \n",
243-
"3 OZNaturals Retinol Serum \n",
244-
"4 2x Easy Fan Volume Lash Extensions 0.07 D curl... \n",
269+
"0 (10-Pack+Bonus) Premium 4\"x4\" Waffle-Weave Mic... \n",
270+
"1 Wundercover: Tattoo Covers and Skin Shields (4... \n",
271+
"2 Style Factor Wigout Leave in Detangler Sunset ... \n",
272+
"3 As I Am Curling Jelly - 4 ounce - Curl & Coil ... \n",
273+
"4 Bath Body Dry Brush Best for Lymphatic Drainag... \n",
245274
"\n",
246-
" text rating score \n",
247-
"0 Nice, sharp, strong. 5.0 0.562476 \n",
248-
"1 Love this product! The water comes out as a fi... 5.0 0.562476 \n",
249-
"2 Very tiny bottle. The cost should be reduce. 3.0 0.562476 \n",
250-
"3 I cannot tell that it does anything-- just pu... 2.0 0.300436 \n",
251-
"4 Affordable 5.0 0.294509 "
275+
" text rating distance \n",
276+
"0 Amazing quality, and enviroment friendly. They... 5.0 0.506990 \n",
277+
"1 Works great, strong stick on skin and doesn't ... 4.0 0.522685 \n",
278+
"2 I love this product and it smells amazing 5.0 0.528536 \n",
279+
"3 Product does truly promote moisture which my h... 4.0 0.532994 \n",
280+
"4 Great! 5.0 0.555010 "
252281
]
253282
},
254-
"execution_count": 33,
283+
"execution_count": 44,
255284
"metadata": {},
256285
"output_type": "execute_result"
257286
}
258287
],
259288
"source": [
260-
"semantic_search(\"\")"
289+
"semantic_search(\"sunscreen that doesn't make my hair greasy\")"
261290
]
262291
},
263292
{
@@ -2228,6 +2257,40 @@
22282257
" display(q['semantic'])"
22292258
]
22302259
},
2260+
{
2261+
"cell_type": "code",
2262+
"execution_count": null,
2263+
"id": "331e64bf",
2264+
"metadata": {},
2265+
"outputs": [
2266+
{
2267+
"name": "stdout",
2268+
"output_type": "stream",
2269+
"text": [
2270+
"<class 'pandas.DataFrame'>\n",
2271+
"RangeIndex: 701528 entries, 0 to 701527\n",
2272+
"Data columns (total 10 columns):\n",
2273+
" # Column Non-Null Count Dtype \n",
2274+
"--- ------ -------------- ----- \n",
2275+
" 0 rating 701528 non-null float64\n",
2276+
" 1 title 701528 non-null str \n",
2277+
" 2 text 701528 non-null str \n",
2278+
" 3 verified_purchase 701528 non-null bool \n",
2279+
" 4 product_title 701528 non-null str \n",
2280+
" 5 average_rating 701528 non-null float64\n",
2281+
" 6 price 701528 non-null str \n",
2282+
" 7 description 701528 non-null object \n",
2283+
" 8 store 651636 non-null str \n",
2284+
" 9 details 701528 non-null str \n",
2285+
"dtypes: bool(1), float64(2), object(1), str(6)\n",
2286+
"memory usage: 414.8+ MB\n"
2287+
]
2288+
}
2289+
],
2290+
"source": [
2291+
"result.info()"
2292+
]
2293+
},
22312294
{
22322295
"cell_type": "code",
22332296
"execution_count": null,

0 commit comments

Comments
 (0)