|
149 | 149 | }, |
150 | 150 | { |
151 | 151 | "cell_type": "code", |
152 | | - "execution_count": 26, |
| 152 | + "execution_count": 39, |
| 153 | + "id": "d1b256c3", |
| 154 | + "metadata": {}, |
| 155 | + "outputs": [ |
| 156 | + { |
| 157 | + "name": "stdout", |
| 158 | + "output_type": "stream", |
| 159 | + "text": [ |
| 160 | + "Total vectors indexed: 701528\n" |
| 161 | + ] |
| 162 | + } |
| 163 | + ], |
| 164 | + "source": [ |
| 165 | + "import faiss\n", |
| 166 | + "\n", |
| 167 | + "dimension = corpus_embeddings.shape[1] # 384 for all-MiniLM-L6-v2\n", |
| 168 | + "\n", |
| 169 | + "index = faiss.IndexFlatL2(dimension) # exact (brute-force) search; reports squared L2 (Euclidean) distances\n", |
| 170 | + "# or use cosine similarity:\n", |
| 171 | + "# index = faiss.IndexFlatIP(dimension) # Inner product (cosine if normalized)\n", |
| 172 | + "\n", |
| 173 | + "index.add(corpus_embeddings)\n", |
| 174 | + "print(f\"Total vectors indexed: {index.ntotal}\")" |
| 175 | + ] |
| 176 | + }, |
| 177 | + { |
| 178 | + "cell_type": "code", |
| 179 | + "execution_count": 42, |
153 | 180 | "id": "c9f4f122", |
154 | 181 | "metadata": {}, |
155 | 182 | "outputs": [], |
156 | 183 | "source": [ |
157 | 184 | "def semantic_search(query, top_k=5):\n", |
158 | | - " query_embedding = model.encode([query])\n", |
159 | | - " scores = cosine_similarity(query_embedding, corpus_embeddings)[0]\n", |
160 | | - " top_indices = scores.argsort()[::-1][:top_k]\n", |
161 | | - " result['score'] = scores # add scores to result for display\n", |
162 | | - " return result.iloc[top_indices][['product_title', 'text', 'rating', 'score']].reset_index(drop=True)" |
| 185 | + " query_embedding = model.encode([query]).astype('float32')\n", |
| 186 | + " distances, indices = index.search(query_embedding, top_k)\n", |
| 187 | + " \n", |
| 188 | + " results = result.iloc[indices[0]].copy()\n", |
| 189 | + " results['distance'] = distances[0]\n", |
| 190 | + " \n", |
| 191 | + " return pd.DataFrame(results)[['product_title', 'text', 'rating', 'distance']].reset_index(drop=True)" |
163 | 192 | ] |
164 | 193 | }, |
165 | 194 | { |
166 | 195 | "cell_type": "code", |
167 | | - "execution_count": 33, |
| 196 | + "execution_count": 44, |
168 | 197 | "id": "df953299", |
169 | 198 | "metadata": {}, |
170 | 199 | "outputs": [ |
|
192 | 221 | " <th>product_title</th>\n", |
193 | 222 | " <th>text</th>\n", |
194 | 223 | " <th>rating</th>\n", |
195 | | - " <th>score</th>\n", |
| 224 | + " <th>distance</th>\n", |
196 | 225 | " </tr>\n", |
197 | 226 | " </thead>\n", |
198 | 227 | " <tbody>\n", |
199 | 228 | " <tr>\n", |
200 | 229 | " <th>0</th>\n", |
201 | | - " <td>Beard Straightener Brush For Men,BEENLE Electr...</td>\n", |
202 | | - " <td>Nice, sharp, strong.</td>\n", |
| 230 | + " <td>(10-Pack+Bonus) Premium 4\"x4\" Waffle-Weave Mic...</td>\n", |
| 231 | + " <td>Amazing quality, and enviroment friendly. They...</td>\n", |
203 | 232 | " <td>5.0</td>\n", |
204 | | - " <td>0.562476</td>\n", |
| 233 | + " <td>0.506990</td>\n", |
205 | 234 | " </tr>\n", |
206 | 235 | " <tr>\n", |
207 | 236 | " <th>1</th>\n", |
208 | | - " <td>Segbeauty empty bottle 160083</td>\n", |
209 | | - " <td>Love this product! The water comes out as a fi...</td>\n", |
210 | | - " <td>5.0</td>\n", |
211 | | - " <td>0.562476</td>\n", |
| 237 | + " <td>Wundercover: Tattoo Covers and Skin Shields (4...</td>\n", |
| 238 | + " <td>Works great, strong stick on skin and doesn't ...</td>\n", |
| 239 | + " <td>4.0</td>\n", |
| 240 | + " <td>0.522685</td>\n", |
212 | 241 | " </tr>\n", |
213 | 242 | " <tr>\n", |
214 | 243 | " <th>2</th>\n", |
215 | | - " <td>Walker Beauty Culture Scent & Shine Coconut Oi...</td>\n", |
216 | | - " <td>Very tiny bottle. The cost should be reduce.</td>\n", |
217 | | - " <td>3.0</td>\n", |
218 | | - " <td>0.562476</td>\n", |
| 244 | + " <td>Style Factor Wigout Leave in Detangler Sunset ...</td>\n", |
| 245 | + " <td>I love this product and it smells amazing</td>\n", |
| 246 | + " <td>5.0</td>\n", |
| 247 | + " <td>0.528536</td>\n", |
219 | 248 | " </tr>\n", |
220 | 249 | " <tr>\n", |
221 | 250 | " <th>3</th>\n", |
222 | | - " <td>OZNaturals Retinol Serum</td>\n", |
223 | | - " <td>I cannot tell that it does anything-- just pu...</td>\n", |
224 | | - " <td>2.0</td>\n", |
225 | | - " <td>0.300436</td>\n", |
| 251 | + " <td>As I Am Curling Jelly - 4 ounce - Curl & Coil ...</td>\n", |
| 252 | + " <td>Product does truly promote moisture which my h...</td>\n", |
| 253 | + " <td>4.0</td>\n", |
| 254 | + " <td>0.532994</td>\n", |
226 | 255 | " </tr>\n", |
227 | 256 | " <tr>\n", |
228 | 257 | " <th>4</th>\n", |
229 | | - " <td>2x Easy Fan Volume Lash Extensions 0.07 D curl...</td>\n", |
230 | | - " <td>Affordable</td>\n", |
| 258 | + " <td>Bath Body Dry Brush Best for Lymphatic Drainag...</td>\n", |
| 259 | + " <td>Great!</td>\n", |
231 | 260 | " <td>5.0</td>\n", |
232 | | - " <td>0.294509</td>\n", |
| 261 | + " <td>0.555010</td>\n", |
233 | 262 | " </tr>\n", |
234 | 263 | " </tbody>\n", |
235 | 264 | "</table>\n", |
236 | 265 | "</div>" |
237 | 266 | ], |
238 | 267 | "text/plain": [ |
239 | 268 | " product_title \\\n", |
240 | | - "0 Beard Straightener Brush For Men,BEENLE Electr... \n", |
241 | | - "1 Segbeauty empty bottle 160083 \n", |
242 | | - "2 Walker Beauty Culture Scent & Shine Coconut Oi... \n", |
243 | | - "3 OZNaturals Retinol Serum \n", |
244 | | - "4 2x Easy Fan Volume Lash Extensions 0.07 D curl... \n", |
| 269 | + "0 (10-Pack+Bonus) Premium 4\"x4\" Waffle-Weave Mic... \n", |
| 270 | + "1 Wundercover: Tattoo Covers and Skin Shields (4... \n", |
| 271 | + "2 Style Factor Wigout Leave in Detangler Sunset ... \n", |
| 272 | + "3 As I Am Curling Jelly - 4 ounce - Curl & Coil ... \n", |
| 273 | + "4 Bath Body Dry Brush Best for Lymphatic Drainag... \n", |
245 | 274 | "\n", |
246 | | - " text rating score \n", |
247 | | - "0 Nice, sharp, strong. 5.0 0.562476 \n", |
248 | | - "1 Love this product! The water comes out as a fi... 5.0 0.562476 \n", |
249 | | - "2 Very tiny bottle. The cost should be reduce. 3.0 0.562476 \n", |
250 | | - "3 I cannot tell that it does anything-- just pu... 2.0 0.300436 \n", |
251 | | - "4 Affordable 5.0 0.294509 " |
| 275 | + " text rating distance \n", |
| 276 | + "0 Amazing quality, and enviroment friendly. They... 5.0 0.506990 \n", |
| 277 | + "1 Works great, strong stick on skin and doesn't ... 4.0 0.522685 \n", |
| 278 | + "2 I love this product and it smells amazing 5.0 0.528536 \n", |
| 279 | + "3 Product does truly promote moisture which my h... 4.0 0.532994 \n", |
| 280 | + "4 Great! 5.0 0.555010 " |
252 | 281 | ] |
253 | 282 | }, |
254 | | - "execution_count": 33, |
| 283 | + "execution_count": 44, |
255 | 284 | "metadata": {}, |
256 | 285 | "output_type": "execute_result" |
257 | 286 | } |
258 | 287 | ], |
259 | 288 | "source": [ |
260 | | - "semantic_search(\"\")" |
| 289 | + "semantic_search(\"sunscreen that doesn't make my hair greasy\")" |
261 | 290 | ] |
262 | 291 | }, |
263 | 292 | { |
|
2228 | 2257 | " display(q['semantic'])" |
2229 | 2258 | ] |
2230 | 2259 | }, |
| 2260 | + { |
| 2261 | + "cell_type": "code", |
| 2262 | + "execution_count": null, |
| 2263 | + "id": "331e64bf", |
| 2264 | + "metadata": {}, |
| 2265 | + "outputs": [ |
| 2266 | + { |
| 2267 | + "name": "stdout", |
| 2268 | + "output_type": "stream", |
| 2269 | + "text": [ |
| 2270 | + "<class 'pandas.DataFrame'>\n", |
| 2271 | + "RangeIndex: 701528 entries, 0 to 701527\n", |
| 2272 | + "Data columns (total 10 columns):\n", |
| 2273 | + " # Column Non-Null Count Dtype \n", |
| 2274 | + "--- ------ -------------- ----- \n", |
| 2275 | + " 0 rating 701528 non-null float64\n", |
| 2276 | + " 1 title 701528 non-null str \n", |
| 2277 | + " 2 text 701528 non-null str \n", |
| 2278 | + " 3 verified_purchase 701528 non-null bool \n", |
| 2279 | + " 4 product_title 701528 non-null str \n", |
| 2280 | + " 5 average_rating 701528 non-null float64\n", |
| 2281 | + " 6 price 701528 non-null str \n", |
| 2282 | + " 7 description 701528 non-null object \n", |
| 2283 | + " 8 store 651636 non-null str \n", |
| 2284 | + " 9 details 701528 non-null str \n", |
| 2285 | + "dtypes: bool(1), float64(2), object(1), str(6)\n", |
| 2286 | + "memory usage: 414.8+ MB\n" |
| 2287 | + ] |
| 2288 | + } |
| 2289 | + ], |
| 2290 | + "source": [ |
| 2291 | + "result.info()" |
| 2292 | + ] |
| 2293 | + }, |
2231 | 2294 | { |
2232 | 2295 | "cell_type": "code", |
2233 | 2296 | "execution_count": null, |
|
0 commit comments