Skip to content

Commit e2ccfaa

Browse files
authored
feat: add support for alpha benchmarks in evaluation commands (#92)
1 parent: e5743cb · commit: e2ccfaa

File tree

3 files changed: 86 additions (+86) and 27 deletions (-27)

3 files changed

+86
-27
lines changed

src/openbench/_cli/eval_command.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -302,6 +302,14 @@ def run_eval(
302302
envvar="BENCH_HUB_PRIVATE",
303303
),
304304
] = False,
305+
alpha: Annotated[
306+
bool,
307+
typer.Option(
308+
"--alpha",
309+
help="Allow running experimental/alpha benchmarks",
310+
envvar="BENCH_ALPHA",
311+
),
312+
] = False,
305313
) -> None:
306314
"""
307315
Run a benchmark on a model.
@@ -333,7 +341,7 @@ def run_eval(
333341
tasks = []
334342
for benchmark in benchmarks:
335343
try:
336-
task = load_task(benchmark)
344+
task = load_task(benchmark, allow_alpha=alpha)
337345
tasks.append(task)
338346
except (ValueError, ImportError, AttributeError) as e:
339347
raise typer.BadParameter(str(e))

src/openbench/_cli/list_command.py

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,9 @@ def list_evals(
3232
tags: bool = typer.Option(
3333
False, "--tags", "-t", help="Show tags for each benchmark"
3434
),
35+
alpha: bool = typer.Option(
36+
False, "--alpha", help="Include experimental/alpha benchmarks"
37+
),
3538
) -> None:
3639
"""List available benchmark evaluations with enhanced UI."""
3740
console = Console()
@@ -42,10 +45,10 @@ def list_evals(
4245
console.print(f"\n❌ [red]Unknown category: {category}[/red]")
4346
console.print(f" Available: {', '.join(sorted(get_categories()))}\n")
4447
return
45-
benchmarks = get_benchmarks_by_category(category)
48+
benchmarks = get_benchmarks_by_category(category, include_alpha=alpha)
4649
evals = [benchmark_to_eval_config(meta) for meta in benchmarks.values()]
4750
else:
48-
all_benchmarks = get_all_benchmarks()
51+
all_benchmarks = get_all_benchmarks(include_alpha=alpha)
4952
evals = [benchmark_to_eval_config(meta) for meta in all_benchmarks.values()]
5053

5154
# Apply search filter
@@ -86,7 +89,7 @@ def list_evals(
8689
# Get task names for this category
8790
cat_evals_with_keys = [
8891
(k, v)
89-
for k, v in get_all_benchmarks().items()
92+
for k, v in get_all_benchmarks(include_alpha=alpha).items()
9093
if v.name in [e.name for e in categories[cat_name]]
9194
]
9295
cat_evals_with_keys = sorted(cat_evals_with_keys, key=lambda x: x[0])
@@ -125,9 +128,11 @@ def list_evals(
125128
# Footer with stats and help
126129
total_count = len(evals)
127130
console.print("─" * 60)
128-
console.print(
129-
f"[dim]Total: {total_count} benchmark{'s' if total_count != 1 else ''}[/dim]"
130-
)
131+
status_msg = f"[dim]Total: {total_count} benchmark{'s' if total_count != 1 else ''}"
132+
if not alpha:
133+
status_msg += " (use --alpha to see experimental benchmarks)"
134+
status_msg += "[/dim]"
135+
console.print(status_msg)
131136
console.print()
132137
console.print("[dim]Commands:[/dim]")
133138
console.print(" bench describe <name> - Show detailed information")

src/openbench/config.py

Lines changed: 66 additions & 20 deletions
Original file line number | Diff line number | Diff line change
@@ -28,33 +28,39 @@ class BenchmarkMetadata:
2828
module_path: str
2929
function_name: str
3030

31+
# Alpha/experimental flag
32+
is_alpha: bool = False # Whether this benchmark is experimental/alpha
33+
3134

3235
# Benchmark metadata - minimal, no duplication
3336
BENCHMARKS = {
34-
# Graphwalks benchmarks
37+
# Graphwalks benchmarks (alpha)
3538
"graphwalks": BenchmarkMetadata(
3639
name="GraphWalks",
3740
description="Multi-hop reasoning on graphs - both BFS and parent finding tasks",
3841
category="core",
39-
tags=["long-context", "graphs", "reasoning"],
42+
tags=["long-context", "graphs", "reasoning", "alpha"],
4043
module_path="openbench.evals.graphwalks",
4144
function_name="graphwalks",
45+
is_alpha=True,
4246
),
4347
"graphwalks_bfs": BenchmarkMetadata(
4448
name="GraphWalks BFS",
4549
description="Multi-hop reasoning on graphs - BFS traversal tasks only",
4650
category="core",
47-
tags=["long-context", "graphs", "reasoning", "bfs"],
51+
tags=["long-context", "graphs", "reasoning", "bfs", "alpha"],
4852
module_path="openbench.evals.graphwalks",
4953
function_name="graphwalks_bfs",
54+
is_alpha=True,
5055
),
5156
"graphwalks_parents": BenchmarkMetadata(
5257
name="GraphWalks Parents",
5358
description="Multi-hop reasoning on graphs - parent finding tasks only",
5459
category="core",
55-
tags=["long-context", "graphs", "reasoning", "parents"],
60+
tags=["long-context", "graphs", "reasoning", "parents", "alpha"],
5661
module_path="openbench.evals.graphwalks",
5762
function_name="graphwalks_parents",
63+
is_alpha=True,
5864
),
5965
# Core benchmarks
6066
"mmlu": BenchmarkMetadata(
@@ -350,12 +356,13 @@ class BenchmarkMetadata:
350356
function_name="hmmt_feb_2025",
351357
),
352358
"scicode": BenchmarkMetadata(
353-
name="SCICode",
354-
description="SCICode",
359+
name="SciCode",
360+
description="Scientific computing and programming challenges",
355361
category="core",
356-
tags=["code-generation"],
362+
tags=["code-generation", "science", "alpha"],
357363
module_path="openbench.evals.scicode",
358364
function_name="scicode",
365+
is_alpha=True,
359366
),
360367
}
361368

@@ -365,29 +372,54 @@ def get_benchmark_metadata(name: str) -> Optional[BenchmarkMetadata]:
365372
return BENCHMARKS.get(name)
366373

367374

368-
def get_all_benchmarks() -> dict[str, BenchmarkMetadata]:
369-
"""Get all benchmark metadata."""
370-
return BENCHMARKS
375+
def get_all_benchmarks(include_alpha: bool = False) -> dict[str, BenchmarkMetadata]:
376+
"""Get all benchmark metadata.
377+
378+
Args:
379+
include_alpha: Whether to include alpha/experimental benchmarks
380+
"""
381+
if include_alpha:
382+
return BENCHMARKS
383+
return {name: meta for name, meta in BENCHMARKS.items() if not meta.is_alpha}
384+
371385

386+
def get_benchmarks_by_category(
387+
category: str, include_alpha: bool = False
388+
) -> dict[str, BenchmarkMetadata]:
389+
"""Get all benchmarks in a category.
372390
373-
def get_benchmarks_by_category(category: str) -> dict[str, BenchmarkMetadata]:
374-
"""Get all benchmarks in a category."""
375-
return {
391+
Args:
392+
category: Category to filter by
393+
include_alpha: Whether to include alpha/experimental benchmarks
394+
"""
395+
results = {
376396
name: meta for name, meta in BENCHMARKS.items() if meta.category == category
377397
}
398+
if not include_alpha:
399+
results = {name: meta for name, meta in results.items() if not meta.is_alpha}
400+
return results
378401

379402

380403
def get_categories() -> List[str]:
381404
"""Get all available categories."""
382405
return sorted(list(set(meta.category for meta in BENCHMARKS.values())))
383406

384407

385-
def search_benchmarks(query: str) -> dict[str, BenchmarkMetadata]:
386-
"""Search benchmarks by name, description, or tags."""
408+
def search_benchmarks(
409+
query: str, include_alpha: bool = False
410+
) -> dict[str, BenchmarkMetadata]:
411+
"""Search benchmarks by name, description, or tags.
412+
413+
Args:
414+
query: Search query
415+
include_alpha: Whether to include alpha/experimental benchmarks
416+
"""
387417
query = query.lower()
388418
results = {}
389419

390420
for name, meta in BENCHMARKS.items():
421+
if not include_alpha and meta.is_alpha:
422+
continue
391423
if (
392424
query in meta.name.lower()
393425
or query in meta.description.lower()
@@ -403,15 +435,20 @@ def search_benchmarks(query: str) -> dict[str, BenchmarkMetadata]:
403435
# ============================================================================
404436

405437

406-
def _generate_task_registry():
407-
"""Generate task registry from config."""
438+
def _generate_task_registry(include_alpha: bool = True):
439+
"""Generate task registry from config.
440+
441+
Args:
442+
include_alpha: Whether to include alpha/experimental benchmarks
443+
"""
408444
registry = {}
409-
for name, metadata in get_all_benchmarks().items():
445+
for name, metadata in get_all_benchmarks(include_alpha=include_alpha).items():
410446
registry[name] = f"{metadata.module_path}.{metadata.function_name}"
411447
return registry
412448

413449

414-
TASK_REGISTRY = _generate_task_registry()
450+
# Full registry including alpha benchmarks for backward compatibility
451+
TASK_REGISTRY = _generate_task_registry(include_alpha=True)
415452

416453

417454
def _import_module_from_path(path: Path) -> ModuleType:
@@ -457,12 +494,13 @@ def _import_module_from_path(path: Path) -> ModuleType:
457494

458495

459496
@lru_cache()
460-
def load_task(benchmark_name: str) -> Callable:
497+
def load_task(benchmark_name: str, allow_alpha: bool = False) -> Callable:
461498
"""
462499
Loads a task by benchmark name using the registry or from a local path.
463500
464501
Args:
465502
benchmark_name (str): The name of the benchmark or path to a local eval.
503+
allow_alpha (bool): Whether to allow loading alpha/experimental benchmarks.
466504
467505
Returns:
468506
Callable: The imported function object.
@@ -472,6 +510,14 @@ def load_task(benchmark_name: str) -> Callable:
472510
ImportError: If the module cannot be imported.
473511
AttributeError: If the function does not exist in the module.
474512
"""
513+
# Check if this is an alpha benchmark
514+
benchmark_meta = get_benchmark_metadata(benchmark_name)
515+
if benchmark_meta and benchmark_meta.is_alpha and not allow_alpha:
516+
raise ValueError(
517+
f"'{benchmark_name}' is an experimental/alpha benchmark. "
518+
f"Use --alpha flag to run it."
519+
)
520+
475521
# Try registry first (registry names take precedence)
476522
import_path = TASK_REGISTRY.get(benchmark_name)
477523
if import_path:

0 commit comments

Comments
 (0)