-
Notifications
You must be signed in to change notification settings - Fork 5.5k
Expand file tree
/
Copy pathleaderboard.py
More file actions
34 lines (25 loc) · 1.11 KB
/
leaderboard.py
File metadata and controls
34 lines (25 loc) · 1.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
from __future__ import annotations
from dataclasses import dataclass
@dataclass(frozen=True)
class HighScore:
    """Reference leaderboard entry used as a comparison baseline.

    The dataclass is frozen, so assigning to a field after construction
    raises ``dataclasses.FrozenInstanceError``; instances are also hashable
    and compare by value.
    """

    # Display name of the leaderboard entry that holds the score.
    label: str
    # Scalar score recorded for that entry.
    value: float
# Hand-picked reference scores taken from each benchmark's local leaderboard
# constants. Every entry is deliberately reduced to one number -- the best
# comparable primary metric for that benchmark -- so that separate runs can be
# compared against it by a simple delta.
# NOTE(review): the values appear to use different scales (e.g. 82.1 vs 0.95);
# presumably each follows its benchmark's native unit -- confirm with callers.
HIGH_SCORES: dict[str, HighScore] = {
    "realm": HighScore(label="GPT-4-Turbo", value=82.1),
    "context_bench": HighScore(label="claude-opus-4-7", value=0.95),
    "swe_bench": HighScore(
        label="SWE-bench Lite:OpenHands + Claude Sonnet 4.6",
        value=53.0,
    ),
    "mint": HighScore(label="gpt-4-0613", value=0.72),
    "vending_bench": HighScore(label="grok_4", value=4694.15),
    "tau_bench": HighScore(label="gpt-5", value=0.4735),
    "bfcl": HighScore(label="gpt-5", value=0.891),
}
def best_high_score(benchmark_id: str) -> HighScore | None:
    """Return the curated high score for *benchmark_id*, or ``None``.

    This is an exact-match key lookup in ``HIGH_SCORES``; an id with no
    entry yields ``None`` instead of raising ``KeyError``.
    """
    return HIGH_SCORES.get(benchmark_id)
def delta_to_high_score(
    benchmark_id: str,
    score: float | None,
) -> tuple[str | None, float | None, float | None]:
    """Compare *score* with the curated high score for a benchmark.

    Args:
        benchmark_id: Key used to look up the reference entry.
        score: The score to compare, or ``None`` when no score is available.

    Returns:
        A ``(label, high_value, delta)`` tuple, where ``delta`` is
        ``score - high_value`` (negative when *score* is below the
        reference). All three items are ``None`` when the benchmark has no
        reference entry or *score* is ``None``.
    """
    high = best_high_score(benchmark_id)
    # Without both a reference entry and a score there is nothing to compare.
    if high is None or score is None:
        return None, None, None
    return high.label, high.value, score - high.value