Skip to content

Commit efd4e04 (1 parent: 987a34e)

Browse files

Committed: Add GPT-5 eval results to leaderboard

3 files changed

Lines changed: 92 additions & 8 deletions

File tree

docs/leaderboard.md

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,8 @@ Maintainer policy:
99

1010
| Rank | Label | Model | Provider | Version | Type | Composite | Validity | Control | Editing | Date |
1111
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
12-
| 1 | eval-baseline-0.2.0 | baseline-deterministic | local | 0.2.0 | baseline | 0.9797 | 1.0000 | 0.9475 | 0.9917 | 2026-04-01 |
12+
| 1 | gpt-5-eval-2026-04-02 | gpt-5 | openai | 2026-04-02 | community | 0.9442 | 0.9900 | 0.9500 | 0.8926 | 2026-04-01 |
13+
| 2 | gpt-5-mini-eval-2026-04-02 | gpt-5-mini | openai | 2026-04-02 | community | 0.9075 | 0.8918 | 0.9175 | 0.9133 | 2026-04-01 |
14+
| 3 | gpt-5-nano-eval-2026-04-02 | gpt-5-nano | openai | 2026-04-02 | community | 0.7900 | 0.7019 | 0.8800 | 0.7880 | 2026-04-01 |
15+
| 4 | eval-baseline-0.2.0 | baseline-deterministic | local | 0.2.0 | baseline | 0.9797 | 1.0000 | 0.9475 | 0.9917 | 2026-04-01 |
1316

leaderboard/results.json

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,87 @@
2626
"validity_check": 1.0
2727
},
2828
"composite_score": 0.9797
29+
},
30+
{
31+
"label": "gpt-5-eval-2026-04-02",
32+
"model_name": "gpt-5",
33+
"provider": "openai",
34+
"model_version": "2026-04-02",
35+
"benchmark_name": "ABC-GenBench",
36+
"benchmark_version": "0.2.0-eval",
37+
"benchmark_split": "eval",
38+
"run_type": "community",
39+
"submission_date": "2026-04-01",
40+
"notes": "",
41+
"aggregate_scores": {
42+
"constraint_following": 0.95,
43+
"editing_continuation": 0.8926,
44+
"validity_renderability": 0.99
45+
},
46+
"task_type_scores": {
47+
"controlled_generation": 0.95,
48+
"error_correction": 1.0,
49+
"free_continuation": 0.6574,
50+
"middle_infilling": 0.9875,
51+
"next_bar_choice": 1.0,
52+
"style_variation": 0.9215,
53+
"validity_check": 0.99
54+
},
55+
"composite_score": 0.9442
56+
},
57+
{
58+
"label": "gpt-5-mini-eval-2026-04-02",
59+
"model_name": "gpt-5-mini",
60+
"provider": "openai",
61+
"model_version": "2026-04-02",
62+
"benchmark_name": "ABC-GenBench",
63+
"benchmark_version": "0.2.0-eval",
64+
"benchmark_split": "eval",
65+
"run_type": "community",
66+
"submission_date": "2026-04-01",
67+
"notes": "",
68+
"aggregate_scores": {
69+
"constraint_following": 0.9175,
70+
"editing_continuation": 0.9133,
71+
"validity_renderability": 0.8918
72+
},
73+
"task_type_scores": {
74+
"controlled_generation": 0.9175,
75+
"error_correction": 0.9536,
76+
"free_continuation": 0.7676,
77+
"middle_infilling": 0.9792,
78+
"next_bar_choice": 1.0,
79+
"style_variation": 0.9305,
80+
"validity_check": 0.8918
81+
},
82+
"composite_score": 0.9075
83+
},
84+
{
85+
"label": "gpt-5-nano-eval-2026-04-02",
86+
"model_name": "gpt-5-nano",
87+
"provider": "openai",
88+
"model_version": "2026-04-02",
89+
"benchmark_name": "ABC-GenBench",
90+
"benchmark_version": "0.2.0-eval",
91+
"benchmark_split": "eval",
92+
"run_type": "community",
93+
"submission_date": "2026-04-01",
94+
"notes": "",
95+
"aggregate_scores": {
96+
"constraint_following": 0.88,
97+
"editing_continuation": 0.788,
98+
"validity_renderability": 0.7019
99+
},
100+
"task_type_scores": {
101+
"controlled_generation": 0.88,
102+
"error_correction": 0.9745,
103+
"free_continuation": 0.3235,
104+
"middle_infilling": 0.8671,
105+
"next_bar_choice": 1.0,
106+
"style_variation": 0.934,
107+
"validity_check": 0.7019
108+
},
109+
"composite_score": 0.79
29110
}
30111
]
31112
}

src/abcgenbench/leaderboard.py

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,11 @@
99

1010

1111
VALID_RUN_TYPES = {"official", "community", "baseline"}
12+
RUN_TYPE_ORDER = {"official": 0, "community": 1, "baseline": 2}
13+
14+
15+
def _sort_key(row: dict[str, Any]) -> tuple[Any, ...]:
16+
return (row["benchmark_split"], RUN_TYPE_ORDER.get(row["run_type"], 99), -row["composite_score"], row["label"])
1217

1318

1419
def load_leaderboard(leaderboard_path: str | Path) -> dict[str, Any]:
@@ -67,9 +72,7 @@ def ingest_report(
6772
payload = load_leaderboard(leaderboard_path)
6873
payload["entries"] = [row for row in payload["entries"] if row["label"] != label]
6974
payload["entries"].append(entry)
70-
payload["entries"].sort(
71-
key=lambda row: (row["benchmark_split"], -row["composite_score"], row["label"])
72-
)
75+
payload["entries"].sort(key=_sort_key)
7376

7477
errors = validate_document(payload, "leaderboard_results.schema.json")
7578
if errors:
@@ -79,10 +82,7 @@ def ingest_report(
7982

8083

8184
def render_leaderboard_markdown(payload: dict[str, Any]) -> str:
82-
entries = sorted(
83-
payload["entries"],
84-
key=lambda row: (row["benchmark_split"], -row["composite_score"], row["label"]),
85-
)
85+
entries = sorted(payload["entries"], key=_sort_key)
8686
lines = [
8787
"# Leaderboard",
8888
"",

Comments (0)