NLP-AI-Wizards
diff --git a/‎.python-version‎
Lines changed: 1 addition & 0 deletions b/‎.python-version‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎confusionmatrix.png‎
-2.05 KB b/‎confusionmatrix.png‎
-2.05 KB
diff --git a/‎dataset_augmentation_esp.py‎
Lines changed: 8 additions & 14 deletions b/‎dataset_augmentation_esp.py‎
Lines changed: 8 additions & 14 deletions
diff --git a/‎dataset_augmentation_ita.py‎
Lines changed: 8 additions & 14 deletions b/‎dataset_augmentation_ita.py‎
Lines changed: 8 additions & 14 deletions
diff --git a/‎double_pipeline.ipynb‎
Lines changed: 39 additions & 47 deletions b/‎double_pipeline.ipynb‎
Lines changed: 39 additions & 47 deletions
@@ -0,0 +1 @@
+3.10
@@ -9,10 +9,7 @@
 ### Setup ###
 console = Console()
 load_dotenv()
-client = OpenAI(
-    api_key=os.environ.get('DEEPSEEK_API_KEY'), 
-    base_url="https://api.deepseek.com"
-)
+client = OpenAI(api_key=os.environ.get("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")
 
 system_prompt = """Eres un anotador para una tarea de clasificación. En la entrada recibirás la biografía de un usuario de Twitter y algunos de sus tuits.  
 Tu tarea es decidir si el usuario en cuestión forma parte o no de la comunidad LGBT.  
@@ -36,31 +33,28 @@
 iterable = zip(ita["text"], ita["bio"])
 
 for text, bio in track(iterable, description="[cyan]Processing entries...[/cyan]", total=len(ita)):
-    
+
     bio = "" if str(bio) == "nan" else bio
     user_message = f'"{text}" - "{bio}"'
-    
+
     try:
         response = client.chat.completions.create(
             model="deepseek-chat",
-            messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": user_message}
-            ],
-            stream=False
+            messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}],
+            stream=False,
         )
         answer = response.choices[0].message.content.strip()
     except Exception as e:
         console.print(f"[bold red]An error occurred: {e}\nRetrying in 2 minutes...[/bold red]")
         answer = "error"
         time.sleep(120)
-    
+
     if answer in ["1", "0"]:
         answer = int(answer)
     else:
         # Mark ambiguous/error responses
         answer = 0.5
-    
+
     lgbt.append(answer)
     time.sleep(0.5)
 
@@ -69,4 +63,4 @@
 console.print("\n[bold yellow]Saving augmented dataset...[/bold yellow]")
 ita.to_csv("augmented_es.csv", index=False)
 console.print("[bold green]:white_check_mark: File saved as [cyan]augmented_es.csv[/cyan][/bold green]")
-console.print(ita.head())
+console.print(ita.head())
@@ -9,10 +9,7 @@
 ### Setup ###
 console = Console()
 load_dotenv()
-client = OpenAI(
-    api_key=os.environ.get('DEEPSEEK_API_KEY'), 
-    base_url="https://api.deepseek.com"
-)
+client = OpenAI(api_key=os.environ.get("DEEPSEEK_API_KEY"), base_url="https://api.deepseek.com")
 
 system_prompt = """Sei un annotatore per un task di classificazione. In input riceverai la bio di un utente Twitter e alcuni suoi tweet.
 Il tuo compito è decidere se l'utente in questione fa parte o meno della comunità LGBT.
@@ -36,31 +33,28 @@
 iterable = zip(ita["text"], ita["bio"])
 
 for text, bio in track(iterable, description="[cyan]Processing entries...[/cyan]", total=len(ita)):
-    
+
     bio = "" if str(bio) == "nan" else bio
     user_message = f'"{text}" - "{bio}"'
-    
+
     try:
         response = client.chat.completions.create(
             model="deepseek-chat",
-            messages=[
-                {"role": "system", "content": system_prompt},
-                {"role": "user", "content": user_message}
-            ],
-            stream=False
+            messages=[{"role": "system", "content": system_prompt}, {"role": "user", "content": user_message}],
+            stream=False,
         )
         answer = response.choices[0].message.content.strip()
     except Exception as e:
         console.print(f"[bold red]An error occurred: {e}\nRetrying in 2 minutes...[/bold red]")
         answer = "error"
         time.sleep(120)
-    
+
     if answer in ["1", "0"]:
         answer = int(answer)
     else:
         # Mark ambiguous/error responses
         answer = 0.5
-    
+
     lgbt.append(answer)
     time.sleep(0.5)
 
@@ -69,4 +63,4 @@
 console.print("\n[bold yellow]Saving augmented dataset...[/bold yellow]")
 ita.to_csv("augmented_it.csv", index=False)
 console.print("[bold green]:white_check_mark: File saved as [cyan]augmented_it.csv[/cyan][/bold green]")
-console.print(ita.head())
+console.print(ita.head())
@@ -17,7 +17,7 @@
     "    Trainer,\n",
     "    AutoModelForSequenceClassification,\n",
     "    TrainingArguments,\n",
-    "    EarlyStoppingCallback\n",
+    "    EarlyStoppingCallback,\n",
     ")\n",
     "from datasets import Dataset as HFDataset\n",
     "from evaluate import load as load_metric\n",
@@ -28,9 +28,11 @@
     "\n",
     "\n",
     "from huggingface_hub.utils import disable_progress_bars\n",
+    "\n",
     "disable_progress_bars()\n",
     "\n",
     "import os\n",
+    "\n",
     "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
     "os.environ[\"TRANSFORMERS_NO_ADVISORY_WARNINGS\"] = \"true\""
    ]
@@ -57,9 +59,9 @@
    "outputs": [],
    "source": [
     "ita = pd.read_csv(\"dataset/augmented_it.csv\")\n",
-    "#For the moment, it works only on italian\n",
+    "# For the moment, it works only on Italian\n",
     "dataset = pd.concat([ita])\n",
-    "dataset['bio'] = dataset['bio'].fillna('')"
+    "dataset[\"bio\"] = dataset[\"bio\"].fillna(\"\")"
    ]
   },
   {
@@ -243,12 +245,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "pre_train_df, pre_test_df = train_test_split(\n",
-    "    dataset,\n",
-    "    test_size=0.3,\n",
-    "    stratify=dataset[\"lgbt\"],\n",
-    "    random_state=42\n",
-    ")"
+    "pre_train_df, pre_test_df = train_test_split(dataset, test_size=0.3, stratify=dataset[\"lgbt\"], random_state=42)"
    ]
   },
   {
@@ -283,7 +280,7 @@
     "        batch[\"bio\"],\n",
     "        truncation=True,\n",
     "        padding=\"max_length\",\n",
-    "        max_length=128, # Lunghezza massima per i testi\n",
+    "        max_length=128,  # Maximum length for the texts\n",
     "    )"
    ]
   },
@@ -645,6 +642,7 @@
    "source": [
     "class DualEncoderForSequenceClassification(PreTrainedModel):\n",
     "    config_class = AutoConfig\n",
+    "\n",
     "    def __init__(self, config):\n",
     "        super().__init__(config)\n",
     "        self.num_labels = config.num_labels\n",
@@ -654,10 +652,7 @@
     "\n",
     "        # Gating layer: it weights the two source of informations for the final classification\n",
     "        self.gate_layer = nn.Sequential(\n",
-    "            nn.Linear(hidden_size * 2, hidden_size),\n",
-    "            nn.Tanh(),\n",
-    "            nn.Linear(hidden_size, hidden_size),\n",
-    "            nn.Sigmoid()\n",
+    "            nn.Linear(hidden_size * 2, hidden_size), nn.Tanh(), nn.Linear(hidden_size, hidden_size), nn.Sigmoid()\n",
     "        )\n",
     "\n",
     "        # Final classifier\n",
@@ -686,14 +681,14 @@
     "            return_dict=return_dict,\n",
     "        )\n",
     "\n",
-    "        #Obtain last hidden state of the bert model\n",
+    "        # Obtain last hidden state of the bert model\n",
     "        h_text = outputs_text.last_hidden_state[:, 0]\n",
     "        h_bio = outputs_bio.last_hidden_state[:, 0]\n",
     "\n",
-    "        #Concat the hidden states\n",
+    "        # Concat the hidden states\n",
     "        combined = torch.cat((h_text, h_bio), dim=-1)\n",
-    "        #Gate the informations\n",
-    "        gate = self.gate_layer(combined) \n",
+    "        # Gate the information\n",
+    "        gate = self.gate_layer(combined)\n",
     "        h_final = gate * h_text + (1 - gate) * h_bio\n",
     "\n",
     "        # Classification\n",
@@ -703,7 +698,7 @@
     "        # Loss computation\n",
     "        loss = None\n",
     "        if labels is not None:\n",
-    "            if hasattr(self.config, 'class_weights') and self.config.class_weights is not None:\n",
+    "            if hasattr(self.config, \"class_weights\") and self.config.class_weights is not None:\n",
     "                class_weights = torch.tensor(self.config.class_weights, device=self.device)\n",
     "            else:\n",
     "                class_weights = None\n",
@@ -727,12 +722,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "train_df, test_df = train_test_split(\n",
-    "    dataset,\n",
-    "    test_size=0.3,\n",
-    "    stratify=dataset[\"label\"],\n",
-    "    random_state=42\n",
-    ")"
+    "train_df, test_df = train_test_split(dataset, test_size=0.3, stratify=dataset[\"label\"], random_state=42)"
    ]
   },
   {
@@ -864,6 +854,7 @@
    "source": [
     "del trainer.model, trainer\n",
     "import gc\n",
+    "\n",
     "gc.collect()\n",
     "torch.cuda.empty_cache()"
    ]
@@ -1009,7 +1000,7 @@
     "    save_strategy=\"epoch\",\n",
     "    learning_rate=2e-5,\n",
     "    per_device_train_batch_size=16,\n",
-    "    #gradient_accumulation_steps=2, #Since higly unbalanced, this should provide also negative examples\n",
+    "    # gradient_accumulation_steps=2,  # Since highly imbalanced, this should also provide negative examples\n",
     "    per_device_eval_batch_size=4,\n",
     "    num_train_epochs=8,\n",
     "    weight_decay=0.1,\n",
@@ -1018,7 +1009,7 @@
     "    logging_dir=\"./logs_weighted\",\n",
     "    logging_steps=50,\n",
     "    save_total_limit=2,\n",
-    "    #label_smoothing_factor=0.1,\n",
+    "    # label_smoothing_factor=0.1,\n",
     ")"
    ]
   },
@@ -1053,7 +1044,7 @@
     "    eval_dataset=test_ds,\n",
     "    tokenizer=tokenizer,\n",
     "    compute_metrics=compute_metrics,\n",
-    "    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)] \n",
+    "    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],\n",
     ")"
    ]
   },
@@ -1346,20 +1337,20 @@
     "true_labels = predictions_output.label_ids\n",
     "predicted_labels = np.argmax(logits, axis=-1)\n",
     "\n",
-    "# Compute the confidence of the model \n",
+    "# Compute the confidence of the model\n",
     "probabilities = softmax(logits, axis=1)\n",
     "confidence_scores = np.max(probabilities, axis=1)\n",
     "\n",
     "results_df = test_df.copy()\n",
     "\n",
-    "results_df['predicted_label'] = predicted_labels\n",
-    "results_df['true_label'] = true_labels\n",
-    "results_df['confidence'] = confidence_scores\n",
+    "results_df[\"predicted_label\"] = predicted_labels\n",
+    "results_df[\"true_label\"] = true_labels\n",
+    "results_df[\"confidence\"] = confidence_scores\n",
     "\n",
-    "results_df['is_correct'] = (results_df['true_label'] == results_df['predicted_label'])\n",
+    "results_df[\"is_correct\"] = results_df[\"true_label\"] == results_df[\"predicted_label\"]\n",
     "\n",
     "output_filename = \"error_analysis_results.csv\"\n",
-    "results_df.to_csv(output_filename, index=False, encoding='utf-8-sig')\n",
+    "results_df.to_csv(output_filename, index=False, encoding=\"utf-8-sig\")\n",
     "\n",
     "print(f\"\\nRisultati salvati in '{output_filename}'\")\n",
     "print(\"\\nAnteprima del DataFrame con i risultati:\")\n",
@@ -1471,14 +1462,15 @@
    ],
    "source": [
     "# Shows only errors\n",
-    "errors_df = results_df[results_df['is_correct'] == False].copy()\n",
+    "errors_df = results_df[results_df[\"is_correct\"] == False].copy()\n",
     "\n",
     "print(f\"{len(errors_df)}/{len(results_df)} errors.\")\n",
     "\n",
+    "\n",
     "def print_error_details(dataframe):\n",
     "    if dataframe.empty:\n",
     "        return\n",
-    "        \n",
+    "\n",
     "    for index, row in dataframe.iterrows():\n",
     "        print(\"-\" * 50)\n",
     "        print(f\"Confidence: {row['confidence']:.2%}\")\n",
@@ -1487,17 +1479,18 @@
     "        print(f\"Text:\\n\\\"{row['text']}\\\"\")\n",
     "        print(\"-\" * 50 + \"\\n\")\n",
     "\n",
+    "\n",
     "# Errors with high confidence\n",
     "N = 5\n",
-    "high_confidence_errors = errors_df.sort_values(by='confidence', ascending=False).head(N)\n",
+    "high_confidence_errors = errors_df.sort_values(by=\"confidence\", ascending=False).head(N)\n",
     "\n",
-    "print(\"\\n\" + \"=\"*20 + \" TOP 5 ERRORS WITH HIGH CONFIDENCE \" + \"=\"*20)\n",
+    "print(\"\\n\" + \"=\" * 20 + \" TOP 5 ERRORS WITH HIGH CONFIDENCE \" + \"=\" * 20)\n",
     "print_error_details(high_confidence_errors)\n",
     "\n",
     "\n",
-    "low_confidence_errors = errors_df.sort_values(by='confidence', ascending=True).head(N)\n",
-    "print(\"\\n\" + \"=\"*20 + \" TOP 5 ERRORS LOW CONFIDENCE \" + \"=\"*20)\n",
-    "print_error_details(low_confidence_errors)\n"
+    "low_confidence_errors = errors_df.sort_values(by=\"confidence\", ascending=True).head(N)\n",
+    "print(\"\\n\" + \"=\" * 20 + \" TOP 5 ERRORS LOW CONFIDENCE \" + \"=\" * 20)\n",
+    "print_error_details(low_confidence_errors)"
    ]
   },
   {
@@ -1524,15 +1517,14 @@
     "\n",
     "cm = confusion_matrix(true_labels, predicted_labels)\n",
     "\n",
-    "class_labels = ['Non-Reclamatory', 'Reclamatory']\n",
+    "class_labels = [\"Non-Reclamatory\", \"Reclamatory\"]\n",
     "\n",
     "plt.figure(figsize=(8, 6))\n",
-    "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', \n",
-    "            xticklabels=class_labels, yticklabels=class_labels)\n",
+    "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\", xticklabels=class_labels, yticklabels=class_labels)\n",
     "\n",
-    "plt.title('Confusion matrix')\n",
-    "plt.ylabel('True Label')\n",
-    "plt.xlabel('Predicted Label')\n",
+    "plt.title(\"Confusion matrix\")\n",
+    "plt.ylabel(\"True Label\")\n",
+    "plt.xlabel(\"Predicted Label\")\n",
     "\n",
     "plt.show()"
    ]