Skip to content

Commit 51fa258

Browse files
author
Shaw
committed
chore: commit benchmark training updates
1 parent f6f1669 commit 51fa258

8 files changed

Lines changed: 264 additions & 55 deletions

File tree

bun.lock

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/benchmarks/orchestrator/adapters.py

Lines changed: 19 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -432,13 +432,30 @@ def _has_vision_language_bundle(tier: str = "eliza-1-9b") -> bool:
432432
manifest_payload = json.loads(manifest.read_text(encoding="utf-8"))
433433
except Exception:
434434
return False
435-
runtime = manifest_payload.get("runtime") if isinstance(manifest_payload, dict) else None
436-
if not isinstance(runtime, dict) or "dflash" not in runtime:
435+
if not isinstance(manifest_payload, dict):
436+
return False
437+
runtime = manifest_payload.get("runtime")
438+
kernels = manifest_payload.get("kernels")
439+
files = manifest_payload.get("files")
440+
has_dflash_runtime = isinstance(runtime, dict) and "dflash" in runtime
441+
has_dflash_kernel = (
442+
isinstance(kernels, dict)
443+
and isinstance(kernels.get("required"), list)
444+
and "dflash" in {str(item) for item in kernels["required"]}
445+
)
446+
has_dflash_file = (
447+
isinstance(files, dict)
448+
and isinstance(files.get("dflash"), list)
449+
and any(isinstance(item, dict) and item.get("path") for item in files["dflash"])
450+
)
451+
if not (has_dflash_runtime or has_dflash_kernel or has_dflash_file):
437452
return False
438453
slug = tier.removeprefix("eliza-1-")
439454
text_candidates = [
440455
bundle / "text" / f"eliza-1-{slug}-64k.gguf",
441456
bundle / "text" / f"eliza-1-{slug}-32k.gguf",
457+
bundle / "text" / f"eliza-1-{slug}-128k.gguf",
458+
bundle / "text" / f"eliza-1-{slug}-256k.gguf",
442459
bundle / "text" / f"eliza-1-{slug}.gguf",
443460
]
444461
vision = bundle / "vision" / f"mmproj-{slug}.gguf"

packages/benchmarks/orchestrator/runner.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -714,6 +714,12 @@ def _publication_quarantine_reason(
714714
return "sample_task_set"
715715
if metrics.get("demo_mode") is True or metrics.get("demoMode") is True:
716716
return "demo_mode"
717+
failed_scenarios = metrics.get("failed_scenarios")
718+
if isinstance(failed_scenarios, (int, float)) and not isinstance(failed_scenarios, bool):
719+
if failed_scenarios > 0:
720+
return "failed_scenarios"
721+
if metrics.get("interrupted") is True:
722+
return "interrupted_run"
717723
return None
718724

719725

packages/benchmarks/orchestrator/tests/test_adapter_discovery.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3167,6 +3167,39 @@ def test_vision_language_harness_runtime_requires_multimodal_model(
31673167
assert orchestrator_adapters._has_vision_language_harness_runtime() is True
31683168

31693169

3170+
def test_vision_language_bundle_accepts_current_manifest_schema(
3171+
tmp_path: Path,
3172+
monkeypatch: pytest.MonkeyPatch,
3173+
) -> None:
3174+
bundle = (
3175+
tmp_path
3176+
/ ".eliza"
3177+
/ "local-inference"
3178+
/ "models"
3179+
/ "eliza-1-9b.bundle"
3180+
)
3181+
(bundle / "text").mkdir(parents=True)
3182+
(bundle / "vision").mkdir()
3183+
(bundle / "dflash").mkdir()
3184+
(bundle / "text" / "eliza-1-9b-128k.gguf").write_text("text", encoding="utf-8")
3185+
(bundle / "vision" / "mmproj-9b.gguf").write_text("vision", encoding="utf-8")
3186+
(bundle / "dflash" / "drafter-9b.gguf").write_text("dflash", encoding="utf-8")
3187+
(bundle / "eliza-1.manifest.json").write_text(
3188+
json.dumps(
3189+
{
3190+
"id": "eliza-1-9b",
3191+
"runtime": {"dflash": {"enabled": True}},
3192+
"kernels": {"required": ["dflash"]},
3193+
"files": {"dflash": [{"path": "dflash/drafter-9b.gguf"}]},
3194+
}
3195+
),
3196+
encoding="utf-8",
3197+
)
3198+
monkeypatch.setenv("ELIZA_STATE_DIR", str(tmp_path / ".eliza"))
3199+
3200+
assert orchestrator_adapters._has_vision_language_bundle("eliza-1-9b") is True
3201+
3202+
31703203
def test_rlm_registry_forwards_model_to_root_and_subcall(tmp_path: Path) -> None:
31713204
entry = {item.id: item for item in get_benchmark_registry(_workspace_root())}[
31723205
"rlm_bench"

packages/training/curriculum_state.json

Lines changed: 99 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,22 @@
6969
"synth-346eec6c": 3,
7070
"synth-e772ae68": 1,
7171
"synth-6566db2a": 1,
72-
"synth-7893eeb5": 1
72+
"synth-7893eeb5": 1,
73+
"synth-21a34891": 3,
74+
"synth-b061248a": 3,
75+
"synth-5f90212e": 1,
76+
"synth-f5851987": 1,
77+
"synth-bc69589c": 1,
78+
"synth-f4caa2cf": 3,
79+
"synth-c3511b12": 3,
80+
"synth-9b2e3a4d": 1,
81+
"synth-ee0956b5": 1,
82+
"synth-37f41a4a": 1,
83+
"synth-e07a70e4": 3,
84+
"synth-9444efb1": 3,
85+
"synth-2c42736e": 1,
86+
"synth-fd34ff03": 1,
87+
"synth-017755da": 1
7388
},
7489
"scores": {
7590
"synth-871fc8d4": [
@@ -337,37 +352,100 @@
337352
],
338353
"synth-7893eeb5": [
339354
0.9
355+
],
356+
"synth-21a34891": [
357+
0.9,
358+
0.9,
359+
0.9
360+
],
361+
"synth-b061248a": [
362+
0.9,
363+
0.9,
364+
0.9
365+
],
366+
"synth-5f90212e": [
367+
0.5
368+
],
369+
"synth-f5851987": [
370+
0.7
371+
],
372+
"synth-bc69589c": [
373+
0.9
374+
],
375+
"synth-f4caa2cf": [
376+
0.9,
377+
0.9,
378+
0.9
379+
],
380+
"synth-c3511b12": [
381+
0.9,
382+
0.9,
383+
0.9
384+
],
385+
"synth-9b2e3a4d": [
386+
0.5
387+
],
388+
"synth-ee0956b5": [
389+
0.7
390+
],
391+
"synth-37f41a4a": [
392+
0.9
393+
],
394+
"synth-e07a70e4": [
395+
0.9,
396+
0.9,
397+
0.9
398+
],
399+
"synth-9444efb1": [
400+
0.9,
401+
0.9,
402+
0.9
403+
],
404+
"synth-2c42736e": [
405+
0.5
406+
],
407+
"synth-fd34ff03": [
408+
0.7
409+
],
410+
"synth-017755da": [
411+
0.9
340412
]
341413
},
342414
"solved": [
415+
"synth-0909a501",
416+
"synth-63d72ad4",
417+
"synth-f4caa2cf",
418+
"synth-b061248a",
419+
"synth-114dabcb",
420+
"synth-22309d43",
421+
"synth-9444efb1",
422+
"synth-ea1adeb6",
423+
"synth-7be2922d",
343424
"synth-3570ed6f",
344425
"synth-2bdf09a1",
345-
"synth-67f61a80",
346-
"synth-10466b18",
347-
"synth-22309d43",
348-
"synth-808575e8",
349-
"synth-43c4906f",
350426
"synth-215e5aca",
351-
"synth-06eb2699",
352-
"synth-c5ff1314",
353-
"synth-9b83cfc8",
354-
"synth-2fdb5a5c",
355427
"synth-46111142",
356-
"synth-10043ebf",
357-
"synth-e82700bb",
428+
"synth-c3511b12",
429+
"synth-43c4906f",
358430
"synth-4ef1ca8f",
359-
"synth-0909a501",
360-
"synth-7be2922d",
431+
"synth-10466b18",
432+
"synth-e82700bb",
433+
"synth-06eb2699",
434+
"synth-147a7802",
361435
"synth-93db52f4",
362-
"synth-5d4e07bc",
363-
"synth-63d72ad4",
364436
"synth-871fc8d4",
365-
"synth-111403d8",
366-
"synth-114dabcb",
367-
"synth-147a7802",
437+
"synth-67f61a80",
368438
"synth-346eec6c",
439+
"synth-c5ff1314",
440+
"synth-5d4e07bc",
441+
"synth-111403d8",
369442
"synth-36ceaf32",
370-
"synth-ea1adeb6"
443+
"synth-e07a70e4",
444+
"synth-808575e8",
445+
"synth-2fdb5a5c",
446+
"synth-10043ebf",
447+
"synth-9b83cfc8",
448+
"synth-21a34891"
371449
],
372-
"last_updated": "2026-05-20T05:17:37.940076+00:00"
450+
"last_updated": "2026-05-20T05:20:37.501685+00:00"
373451
}

0 commit comments

Comments
 (0)