|
66 | 66 | }, |
67 | 67 | { |
68 | 68 | "cell_type": "code", |
69 | | - "execution_count": 3, |
| 69 | + "execution_count": null, |
70 | 70 | "metadata": { |
71 | 71 | "colab": { |
72 | 72 | "base_uri": "https://localhost:8080/" |
|
187 | 187 | } |
188 | 188 | ], |
189 | 189 | "source": [ |
190 | | - "from pathlib import Path\n", |
191 | | - "dataset_name = \"madoss/wsl_library_filtered\"\n", |
| 190 | + "dataset_name = \"EdouardCallet/wsl-policy-10k\"\n", |
192 | 191 | "dataset_split = \"train\"\n", |
193 | 192 | "dataset = load_dataset(dataset_name, split=dataset_split)\n", |
194 | | - "\n", |
195 | | - "df = dataset.select_columns([\"text\"]).to_pandas()\n", |
| 193 | + "text_field = \"single_policy_item\"\n", |
| 194 | + "dataset = dataset.filter(lambda example: bool(example[text_field]))\n", |
| 195 | + "df: pd.DataFrame = dataset.select_columns([text_field]).to_pandas()\n", |
196 | 196 | "print(df.shape)\n", |
197 | 197 | "df.head()" |
198 | 198 | ] |
199 | 199 | }, |
| 200 | + { |
| 201 | + "cell_type": "code", |
| 202 | + "execution_count": null, |
| 203 | + "metadata": {}, |
| 204 | + "outputs": [], |
| 205 | + "source": [ |
| 206 | + "print(df.isna().sum())\n", |
| 207 | + "df = df.dropna()" |
| 208 | + ] |
| 209 | + }, |
200 | 210 | { |
201 | 211 | "cell_type": "markdown", |
202 | 212 | "metadata": { |
|
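Taken together, the two cells edited above switch the source dataset, drop rows with an empty `single_policy_item` before the pandas conversion, and then clear residual NaN. A consolidated sketch of the same flow (assuming the `datasets` and `pandas` imports from the notebook's setup cells):

```python
import pandas as pd
from datasets import load_dataset

dataset_name = "EdouardCallet/wsl-policy-10k"
text_field = "single_policy_item"

# Load the train split and keep only rows with a non-empty policy string.
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.filter(lambda example: bool(example[text_field]))

# filter() catches "" and None, but NaN can still surface after the pandas
# conversion, hence the explicit dropna() in the follow-up cell.
df: pd.DataFrame = dataset.select_columns([text_field]).to_pandas()
df = df.dropna()
print(df.shape)
```
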
290 | 300 | }, |
291 | 301 | { |
292 | 302 | "cell_type": "code", |
293 | | - "execution_count": 7, |
| 303 | + "execution_count": null, |
294 | 304 | "metadata": {}, |
295 | 305 | "outputs": [ |
296 | 306 | { |
|
308 | 318 | "import matplotlib.pyplot as plt\n", |
309 | 319 | "from wordcloud import WordCloud\n", |
310 | 320 | "\n", |
311 | | - "text = \" \".join(df['text'].tolist())\n", |
| 321 | + "text = \" \".join(df[text_field].tolist())\n", |
312 | 322 | "\n", |
313 | 323 | "wordcloud = WordCloud(width = 600, height = 400, \n", |
314 | 324 | " background_color ='white', \n", |
|
326 | 336 | }, |
327 | 337 | { |
328 | 338 | "cell_type": "code", |
329 | | - "execution_count": 8, |
| 339 | + "execution_count": null, |
330 | 340 | "metadata": { |
331 | 341 | "colab": { |
332 | 342 | "base_uri": "https://localhost:8080/" |
|
351 | 361 | "# Build the vocabulary\n",
352 | 362 | "all_vocab = collections.Counter()\n", |
353 | 363 | "tokenizer = CountVectorizer().build_tokenizer()\n", |
354 | | - "for doc in tqdm(df['text'].tolist()):\n", |
| 364 | + "for doc in tqdm(df[text_field].tolist()):\n", |
355 | 365 | " all_vocab.update(tokenizer(doc))" |
356 | 366 | ] |
357 | 367 | }, |
358 | 368 | { |
359 | 369 | "cell_type": "code", |
360 | | - "execution_count": 10, |
| 370 | + "execution_count": null, |
361 | 371 | "metadata": {}, |
362 | 372 | "outputs": [ |
363 | 373 | { |
|
378 | 388 | "most_common_words = all_vocab.most_common(top_n)\n", |
379 | 389 | "\n", |
380 | 390 | "# Separate the words and their frequencies\n", |
381 | | - "words, frequencies = zip(*most_common_words)\n", |
| 391 | + "words, frequencies = zip(*most_common_words, strict=True)\n", |
382 | 392 | "\n", |
383 | 393 | "# Create a bar plot\n", |
384 | 394 | "\n", |
|
444 | 454 | }, |
445 | 455 | { |
446 | 456 | "cell_type": "code", |
447 | | - "execution_count": 15, |
| 457 | + "execution_count": null, |
448 | 458 | "metadata": { |
449 | 459 | "id": "OvnyNuYJzy_J" |
450 | 460 | }, |
451 | | - "outputs": [ |
452 | | - { |
453 | | - "ename": "OutOfMemoryError", |
454 | | - "evalue": "CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 3.68 GiB of which 5.25 MiB is free. Including non-PyTorch memory, this process has 3.66 GiB memory in use. Of the allocated memory 3.58 GiB is allocated by PyTorch, and 4.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", |
455 | | - "output_type": "error", |
456 | | - "traceback": [ |
457 | | - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", |
458 | | - "\u001b[31mOutOfMemoryError\u001b[39m Traceback (most recent call last)", |
459 | | - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 10\u001b[39m\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mbertopic\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mrepresentation\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m MaximalMarginalRelevance\n\u001b[32m 7\u001b[39m \u001b[38;5;66;03m# Preparation des modèles\u001b[39;00m\n\u001b[32m 8\u001b[39m \n\u001b[32m 9\u001b[39m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m embedding_model = \u001b[43mSentenceTransformer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mEmbeddings\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[38;5;66;03m# UMAP\u001b[39;00m\n\u001b[32m 12\u001b[39m umap_model = UMAP(n_components=\u001b[32m8\u001b[39m, n_neighbors=\u001b[32m10\u001b[39m, random_state=\u001b[32m42\u001b[39m,\n\u001b[32m 13\u001b[39m metric=\u001b[33m\"\u001b[39m\u001b[33mcosine\u001b[39m\u001b[33m\"\u001b[39m, verbose=\u001b[38;5;28;01mTrue\u001b[39;00m)\n", |
460 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/sentence_transformers/SentenceTransformer.py:367\u001b[39m, in \u001b[36mSentenceTransformer.__init__\u001b[39m\u001b[34m(self, model_name_or_path, modules, device, prompts, default_prompt_name, similarity_fn_name, cache_folder, trust_remote_code, revision, local_files_only, token, use_auth_token, truncate_dim, model_kwargs, tokenizer_kwargs, config_kwargs, model_card_data, backend)\u001b[39m\n\u001b[32m 364\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m:\n\u001b[32m 365\u001b[39m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m367\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mto\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 368\u001b[39m \u001b[38;5;28mself\u001b[39m.is_hpu_graph_enabled = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m 370\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.default_prompt_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.default_prompt_name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.prompts:\n", |
461 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1371\u001b[39m, in \u001b[36mModule.to\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1368\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1369\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1371\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n", |
462 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:930\u001b[39m, in \u001b[36mModule._apply\u001b[39m\u001b[34m(self, fn, recurse)\u001b[39m\n\u001b[32m 928\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[32m 929\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.children():\n\u001b[32m--> \u001b[39m\u001b[32m930\u001b[39m \u001b[43mmodule\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 932\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied) -> \u001b[38;5;28mbool\u001b[39m:\n\u001b[32m 933\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m torch._has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[32m 934\u001b[39m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[32m 935\u001b[39m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 940\u001b[39m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[32m 941\u001b[39m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n", |
463 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:930\u001b[39m, in \u001b[36mModule._apply\u001b[39m\u001b[34m(self, fn, recurse)\u001b[39m\n\u001b[32m 928\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[32m 929\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.children():\n\u001b[32m--> \u001b[39m\u001b[32m930\u001b[39m \u001b[43mmodule\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 932\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied) -> \u001b[38;5;28mbool\u001b[39m:\n\u001b[32m 933\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m torch._has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[32m 934\u001b[39m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[32m 935\u001b[39m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 940\u001b[39m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[32m 941\u001b[39m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n", |
464 | | - " \u001b[31m[... skipping similar frames: Module._apply at line 930 (3 times)]\u001b[39m\n", |
465 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:930\u001b[39m, in \u001b[36mModule._apply\u001b[39m\u001b[34m(self, fn, recurse)\u001b[39m\n\u001b[32m 928\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[32m 929\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.children():\n\u001b[32m--> \u001b[39m\u001b[32m930\u001b[39m \u001b[43mmodule\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 932\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied) -> \u001b[38;5;28mbool\u001b[39m:\n\u001b[32m 933\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m torch._has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[32m 934\u001b[39m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[32m 935\u001b[39m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 940\u001b[39m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[32m 941\u001b[39m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n", |
466 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:957\u001b[39m, in \u001b[36mModule._apply\u001b[39m\u001b[34m(self, fn, recurse)\u001b[39m\n\u001b[32m 953\u001b[39m \u001b[38;5;66;03m# Tensors stored in modules are graph leaves, and we don't want to\u001b[39;00m\n\u001b[32m 954\u001b[39m \u001b[38;5;66;03m# track autograd history of `param_applied`, so we have to use\u001b[39;00m\n\u001b[32m 955\u001b[39m \u001b[38;5;66;03m# `with torch.no_grad():`\u001b[39;00m\n\u001b[32m 956\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m torch.no_grad():\n\u001b[32m--> \u001b[39m\u001b[32m957\u001b[39m param_applied = \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mparam\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 958\u001b[39m p_should_use_set_data = compute_should_use_set_data(param, param_applied)\n\u001b[32m 960\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01m_subclasses\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mfake_tensor\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m FakeTensor\n", |
467 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1357\u001b[39m, in \u001b[36mModule.to.<locals>.convert\u001b[39m\u001b[34m(t)\u001b[39m\n\u001b[32m 1350\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m convert_to_format \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m t.dim() \u001b[38;5;129;01min\u001b[39;00m (\u001b[32m4\u001b[39m, \u001b[32m5\u001b[39m):\n\u001b[32m 1351\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m t.to(\n\u001b[32m 1352\u001b[39m device,\n\u001b[32m 1353\u001b[39m dtype \u001b[38;5;28;01mif\u001b[39;00m t.is_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t.is_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 1354\u001b[39m non_blocking,\n\u001b[32m 1355\u001b[39m memory_format=convert_to_format,\n\u001b[32m 1356\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m1357\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mt\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1358\u001b[39m \u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1359\u001b[39m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m.\u001b[49m\u001b[43mis_floating_point\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m.\u001b[49m\u001b[43mis_complex\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 1360\u001b[39m \u001b[43m \u001b[49m\u001b[43mnon_blocking\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1361\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1362\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 1363\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(e) == \u001b[33m\"\u001b[39m\u001b[33mCannot copy out of meta tensor; no data!\u001b[39m\u001b[33m\"\u001b[39m:\n", |
468 | | - "\u001b[31mOutOfMemoryError\u001b[39m: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 3.68 GiB of which 5.25 MiB is free. Including non-PyTorch memory, this process has 3.66 GiB memory in use. Of the allocated memory 3.58 GiB is allocated by PyTorch, and 4.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)" |
469 | | - ] |
470 | | - } |
471 | | - ], |
| 461 | + "outputs": [], |
472 | 462 | "source": [ |
473 | 463 | "from umap import UMAP\n", |
474 | 464 | "import hdbscan\n", |
|
496 | 486 | " vectorizer_model=vectorizer_model,\n", |
497 | 487 | " verbose=True,\n", |
498 | 488 | " representation_model= MaximalMarginalRelevance(diversity=0.3),\n", |
499 | | - ").fit(df[\"text\"].tolist(), embeddings=embeddings)" |
| 489 | + ").fit(df[text_field].tolist(), embeddings=embeddings)" |
500 | 490 | ] |
501 | 491 | }, |
502 | 492 | { |
|
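The output cleared above was a CUDA OOM raised while moving the SentenceTransformer onto a 3.68 GiB GPU; the deleted traceback itself suggests `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`. A hedged sketch of a defensive loader with a CPU fallback — `Embeddings` is the model-name variable defined earlier in the notebook, and the fallback strategy is an assumption, not something this commit adopts:

```python
import os

# Per the removed traceback: must be set before the first CUDA allocation.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch
from sentence_transformers import SentenceTransformer

def load_embedding_model(name: str) -> SentenceTransformer:
    """Try the GPU first; fall back to CPU when memory is too tight."""
    if torch.cuda.is_available():
        try:
            return SentenceTransformer(name, device="cuda")
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
    return SentenceTransformer(name, device="cpu")

embedding_model = load_embedding_model(Embeddings)  # `Embeddings`: notebook variable
```
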
4540 | 4530 | } |
4541 | 4531 | ], |
4542 | 4532 | "source": [ |
4543 | | - "hierarchical_topics = topic_model.hierarchical_topics(df[\"text\"].tolist())" |
| 4533 | + "hierarchical_topics = topic_model.hierarchical_topics(df[text_field].tolist())" |
4544 | 4534 | ] |
4545 | 4535 | }, |
4546 | 4536 | { |
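
Once `hierarchical_topics` is computed, BERTopic can render the merge order directly; a short usage sketch with the library's built-in plotly view:

```python
# Interactive dendrogram of the hierarchy computed above.
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
fig.show()
```
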
@@ -15258,11 +15248,11 @@ |
15258 | 15248 | }, |
15259 | 15249 | { |
15260 | 15250 | "cell_type": "code", |
15261 | | - "execution_count": 24, |
| 15251 | + "execution_count": null, |
15262 | 15252 | "metadata": {}, |
15263 | 15253 | "outputs": [], |
15264 | 15254 | "source": [ |
15265 | | - "def fit_bertopic_model(docs=df[\"policy\"].tolist(), embeddings=embeddings, vocab=vocab, stopword=stopword, \n", |
| 15255 | + "def fit_bertopic_model(docs=df[text_field].tolist(), embeddings=embeddings, vocab=vocab, stopword=stopword, \n", |
15266 | 15256 | " embedding_model_name=Embeddings, representation_model=None, y=None):\n", |
15267 | 15257 | " \"\"\"\n", |
15268 | 15258 | " Fits a BERTopic model with custom configurations.\n", |
|
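One caveat with the updated signature: defaults such as `docs=df[text_field].tolist()` are evaluated once, when the `def` cell runs, so later changes to `df` do not propagate into subsequent calls. A toy demonstration of the pitfall and the usual `None`-default fix (names here are illustrative, not from the notebook):

```python
import pandas as pd

df = pd.DataFrame({"text": ["a", "b"]})

def snapshot(docs=df["text"].tolist()):  # default frozen at definition time
    return docs

df = pd.DataFrame({"text": ["c"]})
print(snapshot())  # ['a', 'b'] -- the stale snapshot

def snapshot_lazy(docs=None):
    if docs is None:  # resolved at call time instead
        docs = df["text"].tolist()
    return docs

print(snapshot_lazy())  # ['c']
```
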
15544 | 15534 | }, |
15545 | 15535 | { |
15546 | 15536 | "cell_type": "code", |
15547 | | - "execution_count": 26, |
| 15537 | + "execution_count": null, |
15548 | 15538 | "metadata": { |
15549 | 15539 | "id": "XupETxy89uqd" |
15550 | 15540 | }, |
|
15563 | 15553 | "color_key = {str(topic): next(colors) for topic in set(topic_model.topics_) if topic != -1}\n", |
15564 | 15554 | "\n", |
15565 | 15555 | "# Convert the embeddings to a dataframe\n",
15566 | | - "docs = df[\"policy\"].tolist()\n", |
| 15556 | + "docs = df[text_field].tolist()\n", |
15567 | 15557 | "data = pd.DataFrame({\"x\": reduced_embeddings_2d[:, 0], \"y\": reduced_embeddings_2d[:, 1],\n", |
15568 | 15558 | " \"Topic\": [str(t) for t in topic_model.topics_]})\n", |
15569 | 15559 | "\n", |
|
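The cell above stops at assembling `data` and `color_key`; a guess at the scatter step that presumably follows, assuming `colors` yields matplotlib-compatible color specs:

```python
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 8))
for topic, group in data.groupby("Topic"):
    if topic == "-1":
        continue  # -1 is the HDBSCAN outlier bucket, excluded from color_key
    ax.scatter(group["x"], group["y"], s=4, color=color_key[topic], label=topic)
ax.set_axis_off()
plt.show()
```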