chore: commit current runner and docs updates

Shaw · Shaw · commit cd11d08e16aa · 2026-05-19T21:53:09.000-07:00
diff --git a/packages/benchmarks/orchestrator/runner.py b/packages/benchmarks/orchestrator/runner.py
@@ -1716,11 +1716,19 @@ def _score_from_saved_result(result_path: Path, metrics: dict[str, Any]) -> floa
             [
                 payload.get("score"),
                 payload.get("accuracy"),
+                payload.get("pass_at_1"),
+                payload.get("transcriptionNormalizedAccuracy"),
                 (payload.get("summary") or {}).get("accuracy")
                 if isinstance(payload.get("summary"), dict)
                 else None,
             ]
         )
+        summary = payload.get("summary")
+        if isinstance(summary, dict):
+            for mode_summary in summary.values():
+                if not isinstance(mode_summary, dict):
+                    continue
+                candidates.append(mode_summary.get("transcriptionNormalizedAccuracy"))
         payload_metrics = payload.get("metrics")
         if isinstance(payload_metrics, dict):
             candidates.extend(
@@ -1729,6 +1737,8 @@ def _score_from_saved_result(result_path: Path, metrics: dict[str, Any]) -> floa
                     payload_metrics.get("accuracy"),
                     payload_metrics.get("pass_rate"),
                     payload_metrics.get("eval/pass_rate"),
+                    payload_metrics.get("pass_at_1"),
+                    payload_metrics.get("transcriptionNormalizedAccuracy"),
                 ]
             )
     candidates.extend(
@@ -1737,6 +1747,8 @@ def _score_from_saved_result(result_path: Path, metrics: dict[str, Any]) -> floa
             metrics.get("accuracy"),
             metrics.get("pass_rate"),
             metrics.get("eval/pass_rate"),
+            metrics.get("pass_at_1"),
+            metrics.get("transcriptionNormalizedAccuracy"),
         ]
     )
     for candidate in candidates:
diff --git a/packages/examples/telegram/README.md b/packages/examples/telegram/README.md
@@ -1,6 +1,7 @@
-# Telegram Agent Examples
+# Telegram Agent Example
 
-Telegram bots using elizaOS with the full message pipeline (providers → LLM → actions → ALWAYS_AFTER hook actions).
+TypeScript Telegram bot using elizaOS with the full message pipeline
+(providers -> LLM -> actions -> ALWAYS_AFTER hook actions).
 
 ## Quick Start
 
@@ -10,17 +11,16 @@ export OPENAI_API_KEY="your-key"
 # Optional: export POSTGRES_URL="postgresql://..."
 ```
 
-| Language | Command |
-|----------|---------|
-| TypeScript | `cd typescript && bun install && bun run start` |
-| Python | `cd python && pip install -r requirements.txt && python telegram_agent.py` |
-| Rust | `cd rust/telegram-agent && cargo run --release` |
+```bash
+cd packages/examples/telegram
+bun install
+bun run start
+```
 
 ## How It Works
 
-**TypeScript**: The `telegramPlugin` auto-integrates with the runtime - just include it and messages flow through the full pipeline automatically.
-
-**Python/Rust**: Manually bridge Telegram to `runtime.message_service.handle_message()` which runs the full pipeline.
+The `telegramPlugin` auto-integrates with the runtime. Include it and messages
+flow through the full pipeline automatically.
 
 ## Message Pipeline
 
diff --git a/packages/training/tests/rl/conftest.py b/packages/training/tests/rl/conftest.py
@@ -9,6 +9,9 @@
    the package `rl` (relative imports inside it work).
 2. Installs sys.meta_path aliases mapping `src` and `src.training` prefixes
    onto `rl`, so legacy test imports keep working without edits.
+3. Skips tests whose imported module needs an optional ML dep (torch,
+   transformers, atroposlib, …) that isn't installed in the unit-test env.
+   Those tests are still runnable when the `[train]` extras are installed.
 """
 
 from __future__ import annotations
@@ -20,6 +23,8 @@
 import types
 from pathlib import Path
 
+import pytest
+
 _SCRIPTS_DIR = Path(__file__).resolve().parent.parent.parent / "scripts"
 _RL_DIR = _SCRIPTS_DIR / "rl"
 
@@ -55,8 +60,11 @@ def find_spec(self, fullname: str, path=None, target=None):
                 target_name = f"rl.{leaf}"
                 try:
                     real = importlib.import_module(target_name)
-                except ImportError:
-                    return None
+                except ImportError as exc:
+                    pytest.skip(
+                        f"optional ML dep missing for {fullname}: {exc}",
+                        allow_module_level=True,
+                    )
                 sys.modules[fullname] = real
                 return real.__spec__
         return None
@@ -71,3 +79,20 @@ def exec_module(self, module):  # noqa: D401
 
 
 sys.meta_path.insert(0, _RLAliasFinder())
+
+
+# Some tests import heavy deps (`torch`, `transformers`, …) at module-top,
+# before the alias finder runs. Ignore those files at collection time when the
+# probed dep is missing so the unit suite stays green without `[train]` extras.
+_HEAVY_DEP_TESTS = {
+    "test_local_inference.py": "transformers",
+    "test_continuous_rl.py": "transformers",
+    "test_lr_scheduler.py": "transformers",
+}
+
+collect_ignore: list[str] = []
+for _file, _dep in _HEAVY_DEP_TESTS.items():
+    try:
+        importlib.import_module(_dep)
+    except ImportError:
+        collect_ignore.append(_file)

Original file line number	Diff line number	Diff line change
`@@ -1716,11 +1716,19 @@ def _score_from_saved_result(result_path: Path, metrics: dict[str, Any]) -> floa`
`1716`	`1716`	`[`
`1717`	`1717`	`payload.get("score"),`
`1718`	`1718`	`payload.get("accuracy"),`
	`1719`	`+ payload.get("pass_at_1"),`
	`1720`	`+ payload.get("transcriptionNormalizedAccuracy"),`
`1719`	`1721`	`(payload.get("summary") or {}).get("accuracy")`
`1720`	`1722`	`if isinstance(payload.get("summary"), dict)`
`1721`	`1723`	`else None,`
`1722`	`1724`	`]`
`1723`	`1725`	`)`
	`1726`	`+ summary = payload.get("summary")`
	`1727`	`+ if isinstance(summary, dict):`
	`1728`	`+ for mode_summary in summary.values():`
	`1729`	`+ if not isinstance(mode_summary, dict):`
	`1730`	`+ continue`
	`1731`	`+ candidates.append(mode_summary.get("transcriptionNormalizedAccuracy"))`
`1724`	`1732`	`payload_metrics = payload.get("metrics")`
`1725`	`1733`	`if isinstance(payload_metrics, dict):`
`1726`	`1734`	`candidates.extend(`
`@@ -1729,6 +1737,8 @@ def _score_from_saved_result(result_path: Path, metrics: dict[str, Any]) -> floa`
`1729`	`1737`	`payload_metrics.get("accuracy"),`
`1730`	`1738`	`payload_metrics.get("pass_rate"),`
`1731`	`1739`	`payload_metrics.get("eval/pass_rate"),`
	`1740`	`+ payload_metrics.get("pass_at_1"),`
	`1741`	`+ payload_metrics.get("transcriptionNormalizedAccuracy"),`
`1732`	`1742`	`]`
`1733`	`1743`	`)`
`1734`	`1744`	`candidates.extend(`
`@@ -1737,6 +1747,8 @@ def _score_from_saved_result(result_path: Path, metrics: dict[str, Any]) -> floa`
`1737`	`1747`	`metrics.get("accuracy"),`
`1738`	`1748`	`metrics.get("pass_rate"),`
`1739`	`1749`	`metrics.get("eval/pass_rate"),`
	`1750`	`+ metrics.get("pass_at_1"),`
	`1751`	`+ metrics.get("transcriptionNormalizedAccuracy"),`
`1740`	`1752`	`]`
`1741`	`1753`	`)`
`1742`	`1754`	`for candidate in candidates:`