feat: add support for alpha benchmarks in evaluation commands (#92)

AarushSah · web-flow · commit e2ccfaa0faf9 · 2025-08-14T13:43:56.000-07:00
diff --git a/src/openbench/_cli/eval_command.py b/src/openbench/_cli/eval_command.py
@@ -302,6 +302,14 @@ def run_eval(
             envvar="BENCH_HUB_PRIVATE",
         ),
     ] = False,
+    alpha: Annotated[
+        bool,
+        typer.Option(
+            "--alpha",
+            help="Allow running experimental/alpha benchmarks",
+            envvar="BENCH_ALPHA",
+        ),
+    ] = False,
 ) -> None:
     """
     Run a benchmark on a model.
@@ -333,7 +341,7 @@ def run_eval(
     tasks = []
     for benchmark in benchmarks:
         try:
-            task = load_task(benchmark)
+            task = load_task(benchmark, allow_alpha=alpha)
             tasks.append(task)
         except (ValueError, ImportError, AttributeError) as e:
             raise typer.BadParameter(str(e))
diff --git a/src/openbench/_cli/list_command.py b/src/openbench/_cli/list_command.py
@@ -32,6 +32,9 @@ def list_evals(
     tags: bool = typer.Option(
         False, "--tags", "-t", help="Show tags for each benchmark"
     ),
+    alpha: bool = typer.Option(
+        False, "--alpha", help="Include experimental/alpha benchmarks"
+    ),
 ) -> None:
     """List available benchmark evaluations with enhanced UI."""
     console = Console()
@@ -42,10 +45,10 @@ def list_evals(
             console.print(f"\n❌ [red]Unknown category: {category}[/red]")
             console.print(f"   Available: {', '.join(sorted(get_categories()))}\n")
             return
-        benchmarks = get_benchmarks_by_category(category)
+        benchmarks = get_benchmarks_by_category(category, include_alpha=alpha)
         evals = [benchmark_to_eval_config(meta) for meta in benchmarks.values()]
     else:
-        all_benchmarks = get_all_benchmarks()
+        all_benchmarks = get_all_benchmarks(include_alpha=alpha)
         evals = [benchmark_to_eval_config(meta) for meta in all_benchmarks.values()]
 
     # Apply search filter
@@ -86,7 +89,7 @@ def list_evals(
         # Get task names for this category
         cat_evals_with_keys = [
             (k, v)
-            for k, v in get_all_benchmarks().items()
+            for k, v in get_all_benchmarks(include_alpha=alpha).items()
             if v.name in [e.name for e in categories[cat_name]]
         ]
         cat_evals_with_keys = sorted(cat_evals_with_keys, key=lambda x: x[0])
@@ -125,9 +128,11 @@ def list_evals(
     # Footer with stats and help
     total_count = len(evals)
     console.print("─" * 60)
-    console.print(
-        f"[dim]Total: {total_count} benchmark{'s' if total_count != 1 else ''}[/dim]"
-    )
+    status_msg = f"[dim]Total: {total_count} benchmark{'s' if total_count != 1 else ''}"
+    if not alpha:
+        status_msg += " (use --alpha to see experimental benchmarks)"
+    status_msg += "[/dim]"
+    console.print(status_msg)
     console.print()
     console.print("[dim]Commands:[/dim]")
     console.print("   bench describe <name> - Show detailed information")
diff --git a/src/openbench/config.py b/src/openbench/config.py
@@ -28,33 +28,39 @@ class BenchmarkMetadata:
     module_path: str
     function_name: str
 
+    # Alpha/experimental flag
+    is_alpha: bool = False  # Whether this benchmark is experimental/alpha
+
 
 # Benchmark metadata - minimal, no duplication
 BENCHMARKS = {
-    # Graphwalks benchmarks
+    # Graphwalks benchmarks (alpha)
     "graphwalks": BenchmarkMetadata(
         name="GraphWalks",
         description="Multi-hop reasoning on graphs - both BFS and parent finding tasks",
         category="core",
-        tags=["long-context", "graphs", "reasoning"],
+        tags=["long-context", "graphs", "reasoning", "alpha"],
         module_path="openbench.evals.graphwalks",
         function_name="graphwalks",
+        is_alpha=True,
     ),
     "graphwalks_bfs": BenchmarkMetadata(
         name="GraphWalks BFS",
         description="Multi-hop reasoning on graphs - BFS traversal tasks only",
         category="core",
-        tags=["long-context", "graphs", "reasoning", "bfs"],
+        tags=["long-context", "graphs", "reasoning", "bfs", "alpha"],
         module_path="openbench.evals.graphwalks",
         function_name="graphwalks_bfs",
+        is_alpha=True,
     ),
     "graphwalks_parents": BenchmarkMetadata(
         name="GraphWalks Parents",
         description="Multi-hop reasoning on graphs - parent finding tasks only",
         category="core",
-        tags=["long-context", "graphs", "reasoning", "parents"],
+        tags=["long-context", "graphs", "reasoning", "parents", "alpha"],
         module_path="openbench.evals.graphwalks",
         function_name="graphwalks_parents",
+        is_alpha=True,
     ),
     # Core benchmarks
     "mmlu": BenchmarkMetadata(
@@ -350,12 +356,13 @@ class BenchmarkMetadata:
         function_name="hmmt_feb_2025",
     ),
     "scicode": BenchmarkMetadata(
-        name="SCICode",
-        description="SCICode",
+        name="SciCode",
+        description="Scientific computing and programming challenges",
         category="core",
-        tags=["code-generation"],
+        tags=["code-generation", "science", "alpha"],
         module_path="openbench.evals.scicode",
         function_name="scicode",
+        is_alpha=True,
     ),
 }
 
@@ -365,29 +372,54 @@ def get_benchmark_metadata(name: str) -> Optional[BenchmarkMetadata]:
     return BENCHMARKS.get(name)
 
 
-def get_all_benchmarks() -> dict[str, BenchmarkMetadata]:
-    """Get all benchmark metadata."""
-    return BENCHMARKS
+def get_all_benchmarks(include_alpha: bool = False) -> dict[str, BenchmarkMetadata]:
+    """Get benchmark metadata, excluding alpha benchmarks by default.
+
+    Args:
+        include_alpha: Whether to include alpha/experimental benchmarks
+    """
+    if include_alpha:
+        return BENCHMARKS
+    return {name: meta for name, meta in BENCHMARKS.items() if not meta.is_alpha}
+
 
+def get_benchmarks_by_category(
+    category: str, include_alpha: bool = False
+) -> dict[str, BenchmarkMetadata]:
+    """Get benchmarks in a category, excluding alpha benchmarks by default.
 
-def get_benchmarks_by_category(category: str) -> dict[str, BenchmarkMetadata]:
-    """Get all benchmarks in a category."""
-    return {
+    Args:
+        category: Category to filter by
+        include_alpha: Whether to include alpha/experimental benchmarks
+    """
+    results = {
         name: meta for name, meta in BENCHMARKS.items() if meta.category == category
     }
+    if not include_alpha:
+        results = {name: meta for name, meta in results.items() if not meta.is_alpha}
+    return results
 
 
 def get_categories() -> List[str]:
     """Get all available categories."""
     return sorted(list(set(meta.category for meta in BENCHMARKS.values())))
 
 
-def search_benchmarks(query: str) -> dict[str, BenchmarkMetadata]:
-    """Search benchmarks by name, description, or tags."""
+def search_benchmarks(
+    query: str, include_alpha: bool = False
+) -> dict[str, BenchmarkMetadata]:
+    """Search benchmarks by name, description, or tags.
+
+    Args:
+        query: Search query
+        include_alpha: Whether to include alpha/experimental benchmarks
+    """
     query = query.lower()
     results = {}
 
     for name, meta in BENCHMARKS.items():
+        if not include_alpha and meta.is_alpha:
+            continue
         if (
             query in meta.name.lower()
             or query in meta.description.lower()
@@ -403,15 +435,20 @@ def search_benchmarks(query: str) -> dict[str, BenchmarkMetadata]:
 # ============================================================================
 
 
-def _generate_task_registry():
-    """Generate task registry from config."""
+def _generate_task_registry(include_alpha: bool = True):
+    """Generate task registry from config.
+
+    Args:
+        include_alpha: Whether to include alpha/experimental benchmarks
+    """
     registry = {}
-    for name, metadata in get_all_benchmarks().items():
+    for name, metadata in get_all_benchmarks(include_alpha=include_alpha).items():
         registry[name] = f"{metadata.module_path}.{metadata.function_name}"
     return registry
 
 
-TASK_REGISTRY = _generate_task_registry()
+# Full registry including alpha benchmarks for backward compatibility
+TASK_REGISTRY = _generate_task_registry(include_alpha=True)
 
 
 def _import_module_from_path(path: Path) -> ModuleType:
@@ -457,12 +494,13 @@ def _import_module_from_path(path: Path) -> ModuleType:
 
 
 @lru_cache()
-def load_task(benchmark_name: str) -> Callable:
+def load_task(benchmark_name: str, allow_alpha: bool = False) -> Callable:
     """
     Loads a task by benchmark name using the registry or from a local path.
 
     Args:
         benchmark_name (str): The name of the benchmark or path to a local eval.
+        allow_alpha (bool): Whether to allow loading alpha/experimental benchmarks.
 
     Returns:
         Callable: The imported function object.
@@ -472,6 +510,14 @@ def load_task(benchmark_name: str) -> Callable:
         ImportError: If the module cannot be imported.
         AttributeError: If the function does not exist in the module.
     """
+    # Reject alpha benchmarks unless the caller explicitly allowed them
+    benchmark_meta = get_benchmark_metadata(benchmark_name)
+    if benchmark_meta and benchmark_meta.is_alpha and not allow_alpha:
+        raise ValueError(
+            f"'{benchmark_name}' is an experimental/alpha benchmark. "
+            f"Use --alpha flag to run it."
+        )
+
     # Try registry first (registry names take precedence)
     import_path = TASK_REGISTRY.get(benchmark_name)
     if import_path: