-added command for physics entities calculation

Stanoja · Stanoja · commit 3803ba2c9988 · 2026-04-15T18:09:05.000+02:00
-updated gitignore with the reporting path
diff --git a/.gitignore b/.gitignore
@@ -3,4 +3,4 @@ __pycache__
 .idea
 **/*.sqlite3
 **/.env
-.web/classifier_reports
+web/classifier_reports/
diff --git a/run-physics-control.sh b/run-physics-control.sh
@@ -0,0 +1,7 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+cd "$(dirname "$0")/web"
+
+python manage.py physics_control \
+    --session "test-003-20260407"
diff --git a/web/categorizer/management/commands/evaluate_classifier.py b/web/categorizer/management/commands/evaluate_classifier.py
@@ -139,6 +139,7 @@ def handle(self, *args, **options):
         auc_c, auc_d = self._plot_roc_curve(
             output_dir, tp_results, obs_results, ground_truth
         )
+        self._plot_roc_curve_labeled(output_dir, tp_preds, obs_preds, results_limit)
         self._plot_precision_recall_curve(
             output_dir, tp_results, obs_results, ground_truth
         )
@@ -284,24 +285,74 @@ def _plot_roc_curve(self, output_dir, tp_results, obs_results, ground_truth):
         fpr_d, tpr_d, _ = roc_curve(labels_d, scores_d)
         auc_d = auc(fpr_d, tpr_d)
 
+        fig, ax = plt.subplots(figsize=(8, 8))
+        ax.plot(fpr_c, tpr_c, linewidth=2, label=f"Included (AUC={auc_c:.3f})")
+        ax.plot(fpr_d, tpr_d, linewidth=2, label=f"Excluded (AUC={auc_d:.3f})")
+        ax.plot([0, 1], [0, 1], "k--", linewidth=1, label="Random")
+        ax.set_xlabel("False Positive Rate")
+        ax.set_ylabel("True Positive Rate")
+        ax.set_title("ROC Curve (MathWorld identifier included vs. excluded)")
+        ax.legend(loc="lower right")
+        fig.tight_layout()
+        fig.savefig(os.path.join(output_dir, "roc_curve.png"), dpi=150)
+        plt.close(fig)
+
+        return auc_c, auc_d
+
+    def _plot_roc_curve_labeled(self, output_dir, tp_preds, obs_preds, limit):
+        """Plot ROC curves for both score sets against the ``tp_preds`` answers.
+
+        Uses the ids present in both ``tp_preds`` and ``obs_preds`` (sorted,
+        then truncated to ``limit``). Element ``[0]`` of each ``tp_preds``
+        entry supplies the binary label; element ``[1]`` of the ``tp_preds``
+        and ``obs_preds`` entries, divided by 100, supplies the score for the
+        "with" and "without" curve respectively.
+
+        Saves ``roc_curve_labeled.png`` into ``output_dir`` and returns
+        ``None``. Writes a warning to stdout and skips the plot when there are
+        no common ids or when the labels contain only one class.
+        """
+        common_ids = sorted(set(tp_preds) & set(obs_preds))[:limit]
+        if not common_ids:
+            self.stdout.write(
+                self.style.WARNING("Skipping labeled ROC: no common items")
+            )
+            return
+
+        # The same label vector is used for both curves.
+        labels = np.array([1 if tp_preds[i][0] else 0 for i in common_ids], dtype=int)
+        # NOTE(review): scores are divided by 100 — presumably they are stored
+        # as 0-100 confidences; confirm against the caller.
+        scores_with = np.array(
+            [tp_preds[i][1] / 100.0 for i in common_ids], dtype=float
+        )
+        scores_without = np.array(
+            [obs_preds[i][1] / 100.0 for i in common_ids], dtype=float
+        )
+
+        # A ROC curve needs both classes present in the labels.
+        if len(np.unique(labels)) < 2:
+            self.stdout.write(
+                self.style.WARNING(
+                    "Skipping labeled ROC: include-MW-ID answers are single-class"
+                )
+            )
+            return
+
+        fpr_w, tpr_w, _ = roc_curve(labels, scores_with)
+        auc_w = auc(fpr_w, tpr_w)
+        fpr_wo, tpr_wo, _ = roc_curve(labels, scores_without)
+        auc_wo = auc(fpr_wo, tpr_wo)
+
         fig, ax = plt.subplots(figsize=(8, 8))
         ax.plot(
-            fpr_c, tpr_c, linewidth=2, label=f"Table C: tp+results (AUC={auc_c:.3f})"
+            fpr_w,
+            tpr_w,
+            linewidth=2,
+            label=f"With MathWorld ID (AUC={auc_w:.3f})",
         )
         ax.plot(
-            fpr_d, tpr_d, linewidth=2, label=f"Table D: obs+results (AUC={auc_d:.3f})"
+            fpr_wo,
+            tpr_wo,
+            linewidth=2,
+            label=f"Without MathWorld ID (AUC={auc_wo:.3f})",
         )
         ax.plot([0, 1], [0, 1], "k--", linewidth=1, label="Random")
         ax.set_xlabel("False Positive Rate")
         ax.set_ylabel("True Positive Rate")
-        ax.set_title("ROC Curve")
+        ax.set_title(
+            f"ROC Curve (n={len(common_ids)}, "
+            "labels = include-MW-ID aggregated answer)"
+        )
         ax.legend(loc="lower right")
         fig.tight_layout()
-        fig.savefig(os.path.join(output_dir, "roc_curve.png"), dpi=150)
+        fig.savefig(os.path.join(output_dir, "roc_curve_labeled.png"), dpi=150)
         plt.close(fig)
 
-        return auc_c, auc_d
-
     def _plot_precision_recall_curve(
         self, output_dir, tp_results, obs_results, ground_truth
     ):
diff --git a/web/categorizer/management/commands/physics_control.py b/web/categorizer/management/commands/physics_control.py
@@ -0,0 +1,120 @@
+from concepts.models import CategorizerResult, Item
+from django.core.management.base import BaseCommand
+
+
+class Command(BaseCommand):
+    help = (
+        "Summarize the physics-concepts control experiment: distribution of "
+        "'math' votes per item, mean confidence per group, and the full list "
+        "of items that received a unanimous 'math' vote."
+    )
+
+    def add_arguments(self, parser):
+        parser.add_argument(
+            "--session",
+            type=str,
+            required=True,
+            help="Session name of the physics-concepts categorization run.",
+        )
+
+    def handle(self, *args, **options):
+        session = options["session"]
+
+        by_item = {}
+        for item_id, answer, confidence in CategorizerResult.objects.filter(
+            session_name=session
+        ).values_list("item_id", "result_answer", "result_confidence"):
+            by_item.setdefault(item_id, []).append((bool(answer), int(confidence)))
+
+        if not by_item:
+            self.stdout.write(
+                self.style.ERROR(f"No CategorizerResult rows for session '{session}'")
+            )
+            return
+
+        # Per-item aggregation
+        groups = {0: [], 1: [], 2: [], 3: []}  # yes_votes -> list[(item_id, [conf])]
+        for item_id, judgments in by_item.items():
+            yes_votes = sum(1 for ans, _ in judgments if ans)
+            confidences = [c for _, c in judgments]
+            groups.setdefault(yes_votes, []).append((item_id, confidences))
+
+        total_items = sum(len(v) for v in groups.values())
+
+        self.stdout.write(
+            f"\nPhysics control — session '{session}' — {total_items} items\n"
+        )
+
+        # ----- Distribution + mean confidence per group -----
+        rows = []
+        for k in sorted(groups):
+            items = groups[k]
+            n = len(items)
+            if n == 0:
+                mean_conf = 0.0
+            else:
+                all_confs = [c for _, confs in items for c in confs]
+                mean_conf = sum(all_confs) / len(all_confs)
+            rows.append((k, n, mean_conf))
+
+        header = f"{'Judges voting math':>20} {'Items':>8} {'Mean confidence':>18}"
+        self.stdout.write(header)
+        self.stdout.write("-" * len(header))
+        for k, n, mean_conf in rows:
+            self.stdout.write(f"{k:>20} {n:>8} {mean_conf:>17.1f}")
+
+        # LaTeX tabular for the augmented distribution table
+        self.stdout.write("\nLaTeX tabular (vote distribution + mean confidence):\n")
+        self.stdout.write(
+            "\\begin{tabular}"
+            "{>{\\raggedleft\\arraybackslash}p{0.28\\textwidth}"
+            ">{\\raggedleft\\arraybackslash}p{0.20\\textwidth}"
+            ">{\\raggedleft\\arraybackslash}p{0.20\\textwidth}}"
+        )
+        self.stdout.write("    \\toprule")
+        self.stdout.write(
+            "    Judges voting ``math'' & Number of items & " "Mean confidence \\\\"
+        )
+        self.stdout.write("    \\midrule")
+        for k, n, mean_conf in rows:
+            self.stdout.write(f"    {k} & {n:>3} & {mean_conf:5.1f} \\\\")
+        self.stdout.write("    \\bottomrule")
+        self.stdout.write("\\end{tabular}\n")
+
+        # ----- Exhaustive list of items with unanimous 'math' votes -----
+        unanimous = groups.get(3, [])
+        if not unanimous:
+            self.stdout.write("\nNo items received a unanimous 'math' vote.\n")
+            return
+
+        unanimous_ids = [item_id for item_id, _ in unanimous]
+        items_by_id = {i.id: i for i in Item.objects.filter(id__in=unanimous_ids)}
+
+        # Attach name + mean confidence per item, ordered by name
+        enriched = []
+        for item_id, confs in unanimous:
+            item = items_by_id.get(item_id)
+            name = item.name if item and item.name else f"(item #{item_id})"
+            mean_conf = sum(confs) / len(confs) if confs else 0.0
+            enriched.append((name, mean_conf))
+        enriched.sort(key=lambda x: (x[0] or "").lower())
+
+        self.stdout.write(f"\nItems with unanimous 'math' vote ({len(enriched)}):\n")
+        for name, mean_conf in enriched:
+            self.stdout.write(f"  - {name}  (mean confidence {mean_conf:.1f})")
+
+        # LaTeX tabular for the unanimous-items list
+        self.stdout.write("\nLaTeX tabular (unanimous 'math' items):\n")
+        self.stdout.write(
+            "\\begin{tabular}"
+            "{>{\\raggedright\\arraybackslash}p{0.60\\textwidth}"
+            ">{\\raggedleft\\arraybackslash}p{0.20\\textwidth}}"
+        )
+        self.stdout.write("    \\toprule")
+        self.stdout.write("    Concept & Mean confidence \\\\")
+        self.stdout.write("    \\midrule")
+        for name, mean_conf in enriched:
+            safe = (name or "").replace("&", "\\&").replace("_", "\\_")
+            self.stdout.write(f"    {safe} & {mean_conf:5.1f} \\\\")
+        self.stdout.write("    \\bottomrule")
+        self.stdout.write("\\end{tabular}\n")