@@ -28,33 +28,39 @@ class BenchmarkMetadata:
2828 module_path : str
2929 function_name : str
3030
31+ # Alpha/experimental flag
32+ is_alpha : bool = False # Whether this benchmark is experimental/alpha
33+
3134
3235# Benchmark metadata - minimal, no duplication
3336BENCHMARKS = {
34- # Graphwalks benchmarks
37+ # Graphwalks benchmarks (alpha)
3538 "graphwalks" : BenchmarkMetadata (
3639 name = "GraphWalks" ,
3740 description = "Multi-hop reasoning on graphs - both BFS and parent finding tasks" ,
3841 category = "core" ,
39- tags = ["long-context" , "graphs" , "reasoning" ],
42+ tags = ["long-context" , "graphs" , "reasoning" , "alpha" ],
4043 module_path = "openbench.evals.graphwalks" ,
4144 function_name = "graphwalks" ,
45+ is_alpha = True ,
4246 ),
4347 "graphwalks_bfs" : BenchmarkMetadata (
4448 name = "GraphWalks BFS" ,
4549 description = "Multi-hop reasoning on graphs - BFS traversal tasks only" ,
4650 category = "core" ,
47- tags = ["long-context" , "graphs" , "reasoning" , "bfs" ],
51+ tags = ["long-context" , "graphs" , "reasoning" , "bfs" , "alpha" ],
4852 module_path = "openbench.evals.graphwalks" ,
4953 function_name = "graphwalks_bfs" ,
54+ is_alpha = True ,
5055 ),
5156 "graphwalks_parents" : BenchmarkMetadata (
5257 name = "GraphWalks Parents" ,
5358 description = "Multi-hop reasoning on graphs - parent finding tasks only" ,
5459 category = "core" ,
55- tags = ["long-context" , "graphs" , "reasoning" , "parents" ],
60+ tags = ["long-context" , "graphs" , "reasoning" , "parents" , "alpha" ],
5661 module_path = "openbench.evals.graphwalks" ,
5762 function_name = "graphwalks_parents" ,
63+ is_alpha = True ,
5864 ),
5965 # Core benchmarks
6066 "mmlu" : BenchmarkMetadata (
@@ -350,12 +356,13 @@ class BenchmarkMetadata:
350356 function_name = "hmmt_feb_2025" ,
351357 ),
352358 "scicode" : BenchmarkMetadata (
353- name = "SCICode " ,
354- description = "SCICode " ,
359+ name = "SciCode " ,
360+ description = "Scientific computing and programming challenges " ,
355361 category = "core" ,
356- tags = ["code-generation" ],
362+ tags = ["code-generation" , "science" , "alpha" ],
357363 module_path = "openbench.evals.scicode" ,
358364 function_name = "scicode" ,
365+ is_alpha = True ,
359366 ),
360367}
361368
@@ -365,29 +372,54 @@ def get_benchmark_metadata(name: str) -> Optional[BenchmarkMetadata]:
365372 return BENCHMARKS .get (name )
366373
367374
def get_all_benchmarks(include_alpha: bool = False) -> dict[str, BenchmarkMetadata]:
    """Return the benchmark registry, optionally hiding alpha entries.

    Args:
        include_alpha: When True, alpha/experimental benchmarks are part of
            the result. When False (the default) every entry whose
            ``is_alpha`` flag is set is left out.

    Returns:
        Mapping of benchmark key to its metadata. With ``include_alpha=True``
        this is the module-level ``BENCHMARKS`` dict itself (not a copy);
        otherwise it is a newly built dict.
    """
    if not include_alpha:
        return {key: entry for key, entry in BENCHMARKS.items() if not entry.is_alpha}
    return BENCHMARKS
384+
371385
def get_benchmarks_by_category(
    category: str, include_alpha: bool = False
) -> dict[str, BenchmarkMetadata]:
    """Return the benchmarks whose ``category`` equals the given one.

    Args:
        category: Category name to match exactly against each entry.
        include_alpha: When False (the default), entries flagged ``is_alpha``
            are dropped from the result even if their category matches.

    Returns:
        A new dict mapping benchmark key to metadata, in ``BENCHMARKS`` order.
    """
    # Single pass: the category must match, and alpha entries only survive
    # when the caller explicitly opted in.
    return {
        key: entry
        for key, entry in BENCHMARKS.items()
        if entry.category == category and (include_alpha or not entry.is_alpha)
    }
378401
379402
def get_categories() -> List[str]:
    """Return the distinct benchmark categories in sorted order.

    Every entry in ``BENCHMARKS`` contributes its category, including
    entries flagged as alpha; no filtering is applied here.

    Returns:
        Alphabetically sorted list of unique category names.
    """
    # A set comprehension deduplicates; sorted() accepts any iterable, so no
    # intermediate list is needed.
    return sorted({meta.category for meta in BENCHMARKS.values()})
383406
384407
385- def search_benchmarks (query : str ) -> dict [str , BenchmarkMetadata ]:
386- """Search benchmarks by name, description, or tags."""
408+ def search_benchmarks (
409+ query : str , include_alpha : bool = False
410+ ) -> dict [str , BenchmarkMetadata ]:
411+ """Search benchmarks by name, description, or tags.
412+
413+ Args:
414+ query: Search query
415+ include_alpha: Whether to include alpha/experimental benchmarks
416+ """
387417 query = query .lower ()
388418 results = {}
389419
390420 for name , meta in BENCHMARKS .items ():
421+ if not include_alpha and meta .is_alpha :
422+ continue
391423 if (
392424 query in meta .name .lower ()
393425 or query in meta .description .lower ()
@@ -403,15 +435,20 @@ def search_benchmarks(query: str) -> dict[str, BenchmarkMetadata]:
403435# ============================================================================
404436
405437
def _generate_task_registry(include_alpha: bool = True):
    """Build the mapping from benchmark key to dotted import path.

    Args:
        include_alpha: Forwarded to ``get_all_benchmarks``; when True (the
            default here) alpha/experimental benchmarks are registered too.

    Returns:
        Dict mapping each benchmark key to ``"<module_path>.<function_name>"``.
    """
    selected = get_all_benchmarks(include_alpha=include_alpha)
    return {
        key: f"{entry.module_path}.{entry.function_name}"
        for key, entry in selected.items()
    }
412448
413449
# The registry deliberately contains alpha benchmarks as well (backward
# compatibility): load_task rejects alpha names via the benchmark metadata
# before it looks anything up here.
TASK_REGISTRY = _generate_task_registry(include_alpha=True)
415452
416453
417454def _import_module_from_path (path : Path ) -> ModuleType :
@@ -457,12 +494,13 @@ def _import_module_from_path(path: Path) -> ModuleType:
457494
458495
459496@lru_cache ()
460- def load_task (benchmark_name : str ) -> Callable :
497+ def load_task (benchmark_name : str , allow_alpha : bool = False ) -> Callable :
461498 """
462499 Loads a task by benchmark name using the registry or from a local path.
463500
464501 Args:
465502 benchmark_name (str): The name of the benchmark or path to a local eval.
503+ allow_alpha (bool): Whether to allow loading alpha/experimental benchmarks.
466504
467505 Returns:
468506 Callable: The imported function object.
@@ -472,6 +510,14 @@ def load_task(benchmark_name: str) -> Callable:
472510 ImportError: If the module cannot be imported.
473511 AttributeError: If the function does not exist in the module.
474512 """
513+ # Check if this is an alpha benchmark
514+ benchmark_meta = get_benchmark_metadata (benchmark_name )
515+ if benchmark_meta and benchmark_meta .is_alpha and not allow_alpha :
516+ raise ValueError (
517+ f"'{ benchmark_name } ' is an experimental/alpha benchmark. "
518+ f"Use --alpha flag to run it."
519+ )
520+
475521 # Try registry first (registry names take precedence)
476522 import_path = TASK_REGISTRY .get (benchmark_name )
477523 if import_path :
0 commit comments