|
66 | 66 | }, |
67 | 67 | { |
68 | 68 | "cell_type": "code", |
69 | | - "execution_count": 3, |
| 69 | + "execution_count": null, |
70 | 70 | "metadata": { |
71 | 71 | "colab": { |
72 | 72 | "base_uri": "https://localhost:8080/" |
|
187 | 187 | } |
188 | 188 | ], |
189 | 189 | "source": [ |
190 | | - "from pathlib import Path\n", |
191 | | - "dataset_name = \"madoss/wsl_library_filtered\"\n", |
| 190 | + "dataset_name = \"EdouardCallet/wsl-policy-10k\"\n", |
192 | 191 | "dataset_split = \"train\"\n", |
193 | 192 | "dataset = load_dataset(dataset_name, split=dataset_split)\n", |
194 | | - "\n", |
195 | | - "df = dataset.select_columns([\"text\"]).to_pandas()\n", |
| 193 | + "text_field = \"single_policy_item\"\n", |
| 194 | + "dataset = dataset.filter(lambda example: bool(example[text_field]))\n", |
| 195 | + "df: pd.DataFrame = dataset.select_columns([text_field]).to_pandas()\n", |
196 | 196 | "print(df.shape)\n", |
197 | 197 | "df.head()" |
198 | 198 | ] |
199 | 199 | }, |
| 200 | + { |
| 201 | + "cell_type": "code", |
| 202 | + "execution_count": null, |
| 203 | + "metadata": {}, |
| 204 | + "outputs": [], |
| 205 | + "source": [ |
| 206 | + "print(df.isna().sum())\n", |
| 207 | + "df = df.dropna()" |
| 208 | + ] |
| 209 | + }, |
200 | 210 | { |
201 | 211 | "cell_type": "markdown", |
202 | 212 | "metadata": { |
|
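Taken together, the two cells edited above switch the source dataset, drop rows with an empty `single_policy_item` before the pandas conversion, and then clear residual NaN. A consolidated sketch of the same flow (assuming the `datasets` and `pandas` imports from the notebook's setup cells):

```python
import pandas as pd
from datasets import load_dataset

dataset_name = "EdouardCallet/wsl-policy-10k"
text_field = "single_policy_item"

# Load the train split and keep only rows with a non-empty policy string.
dataset = load_dataset(dataset_name, split="train")
dataset = dataset.filter(lambda example: bool(example[text_field]))

# filter() catches "" and None, but NaN can still surface after the pandas
# conversion, hence the explicit dropna() in the follow-up cell.
df: pd.DataFrame = dataset.select_columns([text_field]).to_pandas()
df = df.dropna()
print(df.shape)
```
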
290 | 300 | }, |
291 | 301 | { |
292 | 302 | "cell_type": "code", |
293 | | - "execution_count": 7, |
| 303 | + "execution_count": null, |
294 | 304 | "metadata": {}, |
295 | 305 | "outputs": [ |
296 | 306 | { |
|
308 | 318 | "import matplotlib.pyplot as plt\n", |
309 | 319 | "from wordcloud import WordCloud\n", |
310 | 320 | "\n", |
311 | | - "text = \" \".join(df['text'].tolist())\n", |
| 321 | + "text = \" \".join(df[text_field].tolist())\n", |
312 | 322 | "\n", |
313 | 323 | "wordcloud = WordCloud(width = 600, height = 400, \n", |
314 | 324 | " background_color ='white', \n", |
|
326 | 336 | }, |
327 | 337 | { |
328 | 338 | "cell_type": "code", |
329 | | - "execution_count": 8, |
| 339 | + "execution_count": null, |
330 | 340 | "metadata": { |
331 | 341 | "colab": { |
332 | 342 | "base_uri": "https://localhost:8080/" |
|
351 | 361 | "# Build the vocabulary\n",
352 | 362 | "all_vocab = collections.Counter()\n", |
353 | 363 | "tokenizer = CountVectorizer().build_tokenizer()\n", |
354 | | - "for doc in tqdm(df['text'].tolist()):\n", |
| 364 | + "for doc in tqdm(df[text_field].tolist()):\n", |
355 | 365 | " all_vocab.update(tokenizer(doc))" |
356 | 366 | ] |
357 | 367 | }, |
358 | 368 | { |
359 | 369 | "cell_type": "code", |
360 | | - "execution_count": 10, |
| 370 | + "execution_count": null, |
361 | 371 | "metadata": {}, |
362 | 372 | "outputs": [ |
363 | 373 | { |
|
378 | 388 | "most_common_words = all_vocab.most_common(top_n)\n", |
379 | 389 | "\n", |
380 | 390 | "# Separate the words and their frequencies\n", |
381 | | - "words, frequencies = zip(*most_common_words)\n", |
| 391 | + "words, frequencies = zip(*most_common_words, strict=True)\n", |
382 | 392 | "\n", |
383 | 393 | "# Create a bar plot\n", |
384 | 394 | "\n", |
|
444 | 454 | }, |
445 | 455 | { |
446 | 456 | "cell_type": "code", |
447 | | - "execution_count": 15, |
| 457 | + "execution_count": null, |
448 | 458 | "metadata": { |
449 | 459 | "id": "OvnyNuYJzy_J" |
450 | 460 | }, |
451 | | - "outputs": [ |
452 | | - { |
453 | | - "ename": "OutOfMemoryError", |
454 | | - "evalue": "CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 3.68 GiB of which 5.25 MiB is free. Including non-PyTorch memory, this process has 3.66 GiB memory in use. Of the allocated memory 3.58 GiB is allocated by PyTorch, and 4.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", |
455 | | - "output_type": "error", |
456 | | - "traceback": [ |
457 | | - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", |
458 | | - "\u001b[31mOutOfMemoryError\u001b[39m Traceback (most recent call last)", |
459 | | - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[15]\u001b[39m\u001b[32m, line 10\u001b[39m\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mbertopic\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mrepresentation\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m MaximalMarginalRelevance\n\u001b[32m 7\u001b[39m \u001b[38;5;66;03m# Preparation des modèles\u001b[39;00m\n\u001b[32m 8\u001b[39m \n\u001b[32m 9\u001b[39m \u001b[38;5;66;03m#\u001b[39;00m\n\u001b[32m---> \u001b[39m\u001b[32m10\u001b[39m embedding_model = \u001b[43mSentenceTransformer\u001b[49m\u001b[43m(\u001b[49m\u001b[43mEmbeddings\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 11\u001b[39m \u001b[38;5;66;03m# UMAP\u001b[39;00m\n\u001b[32m 12\u001b[39m umap_model = UMAP(n_components=\u001b[32m8\u001b[39m, n_neighbors=\u001b[32m10\u001b[39m, random_state=\u001b[32m42\u001b[39m,\n\u001b[32m 13\u001b[39m metric=\u001b[33m\"\u001b[39m\u001b[33mcosine\u001b[39m\u001b[33m\"\u001b[39m, verbose=\u001b[38;5;28;01mTrue\u001b[39;00m)\n", |
460 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/sentence_transformers/SentenceTransformer.py:367\u001b[39m, in \u001b[36mSentenceTransformer.__init__\u001b[39m\u001b[34m(self, model_name_or_path, modules, device, prompts, default_prompt_name, similarity_fn_name, cache_folder, trust_remote_code, revision, local_files_only, token, use_auth_token, truncate_dim, model_kwargs, tokenizer_kwargs, config_kwargs, model_card_data, backend)\u001b[39m\n\u001b[32m 364\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mStopIteration\u001b[39;00m:\n\u001b[32m 365\u001b[39m \u001b[38;5;28;01mpass\u001b[39;00m\n\u001b[32m--> \u001b[39m\u001b[32m367\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43mto\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 368\u001b[39m \u001b[38;5;28mself\u001b[39m.is_hpu_graph_enabled = \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[32m 370\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m.default_prompt_name \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m.default_prompt_name \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.prompts:\n", |
461 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1371\u001b[39m, in \u001b[36mModule.to\u001b[39m\u001b[34m(self, *args, **kwargs)\u001b[39m\n\u001b[32m 1368\u001b[39m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[32m 1369\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m\n\u001b[32m-> \u001b[39m\u001b[32m1371\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[43m.\u001b[49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mconvert\u001b[49m\u001b[43m)\u001b[49m\n", |
462 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:930\u001b[39m, in \u001b[36mModule._apply\u001b[39m\u001b[34m(self, fn, recurse)\u001b[39m\n\u001b[32m 928\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[32m 929\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.children():\n\u001b[32m--> \u001b[39m\u001b[32m930\u001b[39m \u001b[43mmodule\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 932\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied) -> \u001b[38;5;28mbool\u001b[39m:\n\u001b[32m 933\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m torch._has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[32m 934\u001b[39m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[32m 935\u001b[39m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 940\u001b[39m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[32m 941\u001b[39m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n", |
463 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:930\u001b[39m, in \u001b[36mModule._apply\u001b[39m\u001b[34m(self, fn, recurse)\u001b[39m\n\u001b[32m 928\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[32m 929\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.children():\n\u001b[32m--> \u001b[39m\u001b[32m930\u001b[39m \u001b[43mmodule\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 932\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied) -> \u001b[38;5;28mbool\u001b[39m:\n\u001b[32m 933\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m torch._has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[32m 934\u001b[39m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[32m 935\u001b[39m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 940\u001b[39m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[32m 941\u001b[39m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n", |
464 | | - " \u001b[31m[... skipping similar frames: Module._apply at line 930 (3 times)]\u001b[39m\n", |
465 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:930\u001b[39m, in \u001b[36mModule._apply\u001b[39m\u001b[34m(self, fn, recurse)\u001b[39m\n\u001b[32m 928\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m recurse:\n\u001b[32m 929\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m module \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m.children():\n\u001b[32m--> \u001b[39m\u001b[32m930\u001b[39m \u001b[43mmodule\u001b[49m\u001b[43m.\u001b[49m\u001b[43m_apply\u001b[49m\u001b[43m(\u001b[49m\u001b[43mfn\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 932\u001b[39m \u001b[38;5;28;01mdef\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34mcompute_should_use_set_data\u001b[39m(tensor, tensor_applied) -> \u001b[38;5;28mbool\u001b[39m:\n\u001b[32m 933\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m torch._has_compatible_shallow_copy_type(tensor, tensor_applied):\n\u001b[32m 934\u001b[39m \u001b[38;5;66;03m# If the new tensor has compatible tensor type as the existing tensor,\u001b[39;00m\n\u001b[32m 935\u001b[39m \u001b[38;5;66;03m# the current behavior is to change the tensor in-place using `.data =`,\u001b[39;00m\n\u001b[32m (...)\u001b[39m\u001b[32m 940\u001b[39m \u001b[38;5;66;03m# global flag to let the user control whether they want the future\u001b[39;00m\n\u001b[32m 941\u001b[39m \u001b[38;5;66;03m# behavior of overwriting the existing tensor or not.\u001b[39;00m\n", |
466 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:957\u001b[39m, in \u001b[36mModule._apply\u001b[39m\u001b[34m(self, fn, recurse)\u001b[39m\n\u001b[32m 953\u001b[39m \u001b[38;5;66;03m# Tensors stored in modules are graph leaves, and we don't want to\u001b[39;00m\n\u001b[32m 954\u001b[39m \u001b[38;5;66;03m# track autograd history of `param_applied`, so we have to use\u001b[39;00m\n\u001b[32m 955\u001b[39m \u001b[38;5;66;03m# `with torch.no_grad():`\u001b[39;00m\n\u001b[32m 956\u001b[39m \u001b[38;5;28;01mwith\u001b[39;00m torch.no_grad():\n\u001b[32m--> \u001b[39m\u001b[32m957\u001b[39m param_applied = \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[43mparam\u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 958\u001b[39m p_should_use_set_data = compute_should_use_set_data(param, param_applied)\n\u001b[32m 960\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01m_subclasses\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mfake_tensor\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m FakeTensor\n", |
467 | | - "\u001b[36mFile \u001b[39m\u001b[32m~/OpenSource/13_democratiser_sobriete/policy_analysis/.venv/lib/python3.12/site-packages/torch/nn/modules/module.py:1357\u001b[39m, in \u001b[36mModule.to.<locals>.convert\u001b[39m\u001b[34m(t)\u001b[39m\n\u001b[32m 1350\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m convert_to_format \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mand\u001b[39;00m t.dim() \u001b[38;5;129;01min\u001b[39;00m (\u001b[32m4\u001b[39m, \u001b[32m5\u001b[39m):\n\u001b[32m 1351\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m t.to(\n\u001b[32m 1352\u001b[39m device,\n\u001b[32m 1353\u001b[39m dtype \u001b[38;5;28;01mif\u001b[39;00m t.is_floating_point() \u001b[38;5;129;01mor\u001b[39;00m t.is_complex() \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[32m 1354\u001b[39m non_blocking,\n\u001b[32m 1355\u001b[39m memory_format=convert_to_format,\n\u001b[32m 1356\u001b[39m )\n\u001b[32m-> \u001b[39m\u001b[32m1357\u001b[39m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mt\u001b[49m\u001b[43m.\u001b[49m\u001b[43mto\u001b[49m\u001b[43m(\u001b[49m\n\u001b[32m 1358\u001b[39m \u001b[43m \u001b[49m\u001b[43mdevice\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1359\u001b[39m \u001b[43m \u001b[49m\u001b[43mdtype\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mif\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m.\u001b[49m\u001b[43mis_floating_point\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;129;43;01mor\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[43mt\u001b[49m\u001b[43m.\u001b[49m\u001b[43mis_complex\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01melse\u001b[39;49;00m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[32m 1360\u001b[39m \u001b[43m \u001b[49m\u001b[43mnon_blocking\u001b[49m\u001b[43m,\u001b[49m\n\u001b[32m 1361\u001b[39m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[32m 1362\u001b[39m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mNotImplementedError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n\u001b[32m 1363\u001b[39m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mstr\u001b[39m(e) == \u001b[33m\"\u001b[39m\u001b[33mCannot copy out of meta tensor; no data!\u001b[39m\u001b[33m\"\u001b[39m:\n", |
468 | | - "\u001b[31mOutOfMemoryError\u001b[39m: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacity of 3.68 GiB of which 5.25 MiB is free. Including non-PyTorch memory, this process has 3.66 GiB memory in use. Of the allocated memory 3.58 GiB is allocated by PyTorch, and 4.72 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)" |
469 | | - ] |
470 | | - } |
471 | | - ], |
| 461 | + "outputs": [], |
472 | 462 | "source": [ |
473 | 463 | "from umap import UMAP\n", |
474 | 464 | "import hdbscan\n", |
|
496 | 486 | " vectorizer_model=vectorizer_model,\n", |
497 | 487 | " verbose=True,\n", |
498 | 488 | " representation_model= MaximalMarginalRelevance(diversity=0.3),\n", |
499 | | - ").fit(df[\"text\"].tolist(), embeddings=embeddings)" |
| 489 | + ").fit(df[text_field].tolist(), embeddings=embeddings)" |
500 | 490 | ] |
501 | 491 | }, |
502 | 492 | { |
|
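The output cleared above was a CUDA OOM raised while moving the SentenceTransformer onto a 3.68 GiB GPU; the deleted traceback itself suggests `PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True`. A hedged sketch of a defensive loader with a CPU fallback — `Embeddings` is the model-name variable defined earlier in the notebook, and the fallback strategy is an assumption, not something this commit adopts:

```python
import os

# Per the removed traceback: must be set before the first CUDA allocation.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True")

import torch
from sentence_transformers import SentenceTransformer

def load_embedding_model(name: str) -> SentenceTransformer:
    """Try the GPU first; fall back to CPU when memory is too tight."""
    if torch.cuda.is_available():
        try:
            return SentenceTransformer(name, device="cuda")
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
    return SentenceTransformer(name, device="cpu")

embedding_model = load_embedding_model(Embeddings)  # `Embeddings`: notebook variable
```
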
4540 | 4530 | } |
4541 | 4531 | ], |
4542 | 4532 | "source": [ |
4543 | | - "hierarchical_topics = topic_model.hierarchical_topics(df[\"text\"].tolist())" |
| 4533 | + "hierarchical_topics = topic_model.hierarchical_topics(df[text_field].tolist())" |
4544 | 4534 | ] |
4545 | 4535 | }, |
4546 | 4536 | { |
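
Once `hierarchical_topics` is computed, BERTopic can render the merge order directly; a short usage sketch with the library's built-in plotly view:

```python
# Interactive dendrogram of the hierarchy computed above.
fig = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
fig.show()
```
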
@@ -15258,11 +15248,11 @@ |
15258 | 15248 | }, |
15259 | 15249 | { |
15260 | 15250 | "cell_type": "code", |
15261 | | - "execution_count": 24, |
| 15251 | + "execution_count": null, |
15262 | 15252 | "metadata": {}, |
15263 | 15253 | "outputs": [], |
15264 | 15254 | "source": [ |
15265 | | - "def fit_bertopic_model(docs=df[\"policy\"].tolist(), embeddings=embeddings, vocab=vocab, stopword=stopword, \n", |
| 15255 | + "def fit_bertopic_model(docs=df[text_field].tolist(), embeddings=embeddings, vocab=vocab, stopword=stopword, \n", |
15266 | 15256 | " embedding_model_name=Embeddings, representation_model=None, y=None):\n", |
15267 | 15257 | " \"\"\"\n", |
15268 | 15258 | " Fits a BERTopic model with custom configurations.\n", |
|
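One caveat with the updated signature: defaults such as `docs=df[text_field].tolist()` are evaluated once, when the `def` cell runs, so later changes to `df` do not propagate into subsequent calls. A toy demonstration of the pitfall and the usual `None`-default fix (names here are illustrative, not from the notebook):

```python
import pandas as pd

df = pd.DataFrame({"text": ["a", "b"]})

def snapshot(docs=df["text"].tolist()):  # default frozen at definition time
    return docs

df = pd.DataFrame({"text": ["c"]})
print(snapshot())  # ['a', 'b'] -- the stale snapshot

def snapshot_lazy(docs=None):
    if docs is None:  # resolved at call time instead
        docs = df["text"].tolist()
    return docs

print(snapshot_lazy())  # ['c']
```
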
15544 | 15534 | }, |
15545 | 15535 | { |
15546 | 15536 | "cell_type": "code", |
15547 | | - "execution_count": 26, |
| 15537 | + "execution_count": null, |
15548 | 15538 | "metadata": { |
15549 | 15539 | "id": "XupETxy89uqd" |
15550 | 15540 | }, |
|
15563 | 15553 | "color_key = {str(topic): next(colors) for topic in set(topic_model.topics_) if topic != -1}\n", |
15564 | 15554 | "\n", |
15565 | 15555 | "# Convert the embeddings to a dataframe\n",
15566 | | - "docs = df[\"policy\"].tolist()\n", |
| 15556 | + "docs = df[text_field].tolist()\n", |
15567 | 15557 | "data = pd.DataFrame({\"x\": reduced_embeddings_2d[:, 0], \"y\": reduced_embeddings_2d[:, 1],\n", |
15568 | 15558 | " \"Topic\": [str(t) for t in topic_model.topics_]})\n", |
15569 | 15559 | "\n", |
|
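The cell above stops at assembling `data` and `color_key`; a guess at the scatter step that presumably follows, assuming `colors` yields matplotlib-compatible color specs:

```python
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(10, 8))
for topic, group in data.groupby("Topic"):
    if topic == "-1":
        continue  # -1 is the HDBSCAN outlier bucket, excluded from color_key
    ax.scatter(group["x"], group["y"], s=4, color=color_key[topic], label=topic)
ax.set_axis_off()
plt.show()
```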