Skip to content

Commit efd4e04 (1 parent: 987a34e)

Browse files

Committed: Add GPT-5 eval results to leaderboard

3 files changed

Lines changed: 92 additions & 8 deletions

File tree

docs/leaderboard.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,8 @@ Maintainer policy:
99

1010
| Rank | Label | Model | Provider | Version | Type | Composite | Validity | Control | Editing | Date |
1111
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
12-
| 1 | eval-baseline-0.2.0 | baseline-deterministic | local | 0.2.0 | baseline | 0.9797 | 1.0000 | 0.9475 | 0.9917 | 2026-04-01 |
12+
| 1 | gpt-5-eval-2026-04-02 | gpt-5 | openai | 2026-04-02 | community | 0.9442 | 0.9900 | 0.9500 | 0.8926 | 2026-04-01 |
13+
| 2 | gpt-5-mini-eval-2026-04-02 | gpt-5-mini | openai | 2026-04-02 | community | 0.9075 | 0.8918 | 0.9175 | 0.9133 | 2026-04-01 |
14+
| 3 | gpt-5-nano-eval-2026-04-02 | gpt-5-nano | openai | 2026-04-02 | community | 0.7900 | 0.7019 | 0.8800 | 0.7880 | 2026-04-01 |
15+
| 4 | eval-baseline-0.2.0 | baseline-deterministic | local | 0.2.0 | baseline | 0.9797 | 1.0000 | 0.9475 | 0.9917 | 2026-04-01 |
1316

leaderboard/results.json

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,87 @@
2626
"validity_check": 1.0
2727
},
2828
"composite_score": 0.9797
29+
},
30+
{
31+
"label": "gpt-5-eval-2026-04-02",
32+
"model_name": "gpt-5",
33+
"provider": "openai",
34+
"model_version": "2026-04-02",
35+
"benchmark_name": "ABC-GenBench",
36+
"benchmark_version": "0.2.0-eval",
37+
"benchmark_split": "eval",
38+
"run_type": "community",
39+
"submission_date": "2026-04-01",
40+
"notes": "",
41+
"aggregate_scores": {
42+
"constraint_following": 0.95,
43+
"editing_continuation": 0.8926,
44+
"validity_renderability": 0.99
45+
},
46+
"task_type_scores": {
47+
"controlled_generation": 0.95,
48+
"error_correction": 1.0,
49+
"free_continuation": 0.6574,
50+
"middle_infilling": 0.9875,
51+
"next_bar_choice": 1.0,
52+
"style_variation": 0.9215,
53+
"validity_check": 0.99
54+
},
55+
"composite_score": 0.9442
56+
},
57+
{
58+
"label": "gpt-5-mini-eval-2026-04-02",
59+
"model_name": "gpt-5-mini",
60+
"provider": "openai",
61+
"model_version": "2026-04-02",
62+
"benchmark_name": "ABC-GenBench",
63+
"benchmark_version": "0.2.0-eval",
64+
"benchmark_split": "eval",
65+
"run_type": "community",
66+
"submission_date": "2026-04-01",
67+
"notes": "",
68+
"aggregate_scores": {
69+
"constraint_following": 0.9175,
70+
"editing_continuation": 0.9133,
71+
"validity_renderability": 0.8918
72+
},
73+
"task_type_scores": {
74+
"controlled_generation": 0.9175,
75+
"error_correction": 0.9536,
76+
"free_continuation": 0.7676,
77+
"middle_infilling": 0.9792,
78+
"next_bar_choice": 1.0,
79+
"style_variation": 0.9305,
80+
"validity_check": 0.8918
81+
},
82+
"composite_score": 0.9075
83+
},
84+
{
85+
"label": "gpt-5-nano-eval-2026-04-02",
86+
"model_name": "gpt-5-nano",
87+
"provider": "openai",
88+
"model_version": "2026-04-02",
89+
"benchmark_name": "ABC-GenBench",
90+
"benchmark_version": "0.2.0-eval",
91+
"benchmark_split": "eval",
92+
"run_type": "community",
93+
"submission_date": "2026-04-01",
94+
"notes": "",
95+
"aggregate_scores": {
96+
"constraint_following": 0.88,
97+
"editing_continuation": 0.788,
98+
"validity_renderability": 0.7019
99+
},
100+
"task_type_scores": {
101+
"controlled_generation": 0.88,
102+
"error_correction": 0.9745,
103+
"free_continuation": 0.3235,
104+
"middle_infilling": 0.8671,
105+
"next_bar_choice": 1.0,
106+
"style_variation": 0.934,
107+
"validity_check": 0.7019
108+
},
109+
"composite_score": 0.79
29110
}
30111
]
31112
}

src/abcgenbench/leaderboard.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@
99

1010

1111
VALID_RUN_TYPES = {"official", "community", "baseline"}
12+
RUN_TYPE_ORDER = {"official": 0, "community": 1, "baseline": 2}
13+
14+
15+
def _sort_key(row: dict[str, Any]) -> tuple[Any, ...]:
16+
return (row["benchmark_split"], RUN_TYPE_ORDER.get(row["run_type"], 99), -row["composite_score"], row["label"])
1217

1318

1419
def load_leaderboard(leaderboard_path: str | Path) -> dict[str, Any]:
@@ -67,9 +72,7 @@ def ingest_report(
6772
payload = load_leaderboard(leaderboard_path)
6873
payload["entries"] = [row for row in payload["entries"] if row["label"] != label]
6974
payload["entries"].append(entry)
70-
payload["entries"].sort(
71-
key=lambda row: (row["benchmark_split"], -row["composite_score"], row["label"])
72-
)
75+
payload["entries"].sort(key=_sort_key)
7376

7477
errors = validate_document(payload, "leaderboard_results.schema.json")
7578
if errors:
@@ -79,10 +82,7 @@ def ingest_report(
7982

8083

8184
def render_leaderboard_markdown(payload: dict[str, Any]) -> str:
82-
entries = sorted(
83-
payload["entries"],
84-
key=lambda row: (row["benchmark_split"], -row["composite_score"], row["label"]),
85-
)
85+
entries = sorted(payload["entries"], key=_sort_key)
8686
lines = [
8787
"# Leaderboard",
8888
"",

Comments (0)