elizaOS
diff --git a/packages/app-core/src/benchmark/server.ts b/packages/app-core/src/benchmark/server.ts
Lines changed: 16 additions & 0 deletions
diff --git a/packages/benchmarks/eliza-adapter/eliza_adapter/client.py b/packages/benchmarks/eliza-adapter/eliza_adapter/client.py
Lines changed: 115 additions & 12 deletions
diff --git a/packages/benchmarks/hermes-adapter/conftest.py b/packages/benchmarks/hermes-adapter/conftest.py
Lines changed: 22 additions & 0 deletions
diff --git a/packages/benchmarks/hermes-adapter/hermes_adapter/client.py b/packages/benchmarks/hermes-adapter/hermes_adapter/client.py
Lines changed: 60 additions & 3 deletions
diff --git a/packages/benchmarks/lib/__init__.py b/packages/benchmarks/lib/__init__.py
Lines changed: 28 additions & 2 deletions
@@ -1643,13 +1643,29 @@ export async function startBenchmarkServer() {
           });
           trajectoriesBySession.set(key, trajectory);
 
+          // Propagate Cerebras / OpenAI token-usage so the Python adapter
+          // (eliza_adapter.client.ElizaClient) can compute per-turn cost.
+          // Shape mirrors the lifeops_bench/message handler so adapter code
+          // reads either endpoint the same way (camelCase keys + nullable
+          // cacheRead/cacheCreation fields). W2-9 surfaced this as the cause
+          // of cost: $0.0000 on eliza runs — the bench server had the data
+          // (turnUsage from MODEL_USED events) but never forwarded it.
+          const usagePayload = {
+            promptTokens: turnUsage.promptTokens,
+            completionTokens: turnUsage.completionTokens,
+            totalTokens: turnUsage.totalTokens,
+            cacheReadInputTokens:
+              turnUsage.cachedTokens > 0 ? turnUsage.cachedTokens : null,
+            cacheCreationInputTokens: null,
+          };
           res.writeHead(200, { "Content-Type": "application/json" });
           res.end(
             JSON.stringify({
               text: responseText,
               thought,
               actions,
               params,
+              usage: usagePayload,
               benchmark: session.benchmark,
               task_id: session.taskId,
               room_id: session.roomId,
 
@@ -12,6 +12,12 @@
 from typing import Mapping
 from urllib.parse import urlparse
 
+from benchmarks.lib.base_benchmark_client import (
+    CEREBRAS_GPT_OSS_120B_PRICING,
+    BaseBenchmarkClient,
+    ModelPricing,
+)
+
 logger = logging.getLogger(__name__)
 
 
@@ -25,18 +31,54 @@ class MessageResponse:
     params: dict[str, object]
 
 
-class ElizaClient:
+def _resolve_pricing(provider: str | None, model: str | None) -> ModelPricing | None:
+    """Return the pricing entry for ``(provider, model)``, or ``None`` if unknown.
+
+    Both inputs are stripped and lower-cased before matching (``None`` reads as
+    ""). Only Cerebras ``gpt-oss-120b`` is wired; anything else is left unpriced.
+    """
+    p = (provider or "").strip().lower()
+    m = (model or "").strip().lower()
+    if p == "cerebras" and m == "gpt-oss-120b":
+        return CEREBRAS_GPT_OSS_120B_PRICING
+    return None  # no guess: an unknown model must not be silently mispriced
+
+
+class ElizaClient(BaseBenchmarkClient[MessageResponse]):
     """HTTP client for the eliza benchmark server.
 
     All communication uses stdlib ``urllib`` so there are no extra
-    dependencies to install.
+    dependencies to install. Inherits :class:`BaseBenchmarkClient` for
+    concurrency limiting, cost computation, and per-turn telemetry capture.
     """
 
     def __init__(
         self,
         base_url: str | None = None,
         token: str | None = None,
+        *,
+        concurrency: int = 4,
+        provider: str | None = None,
+        model: str | None = None,
     ) -> None:
+        resolved_provider = (
+            provider
+            or os.environ.get("BENCHMARK_MODEL_PROVIDER")
+            or "cerebras"
+        ).strip().lower()
+        resolved_model = (
+            model
+            or os.environ.get("BENCHMARK_MODEL_NAME")
+            or os.environ.get("MODEL_NAME")
+            or os.environ.get("CEREBRAS_MODEL")
+            or "gpt-oss-120b"
+        ).strip()
+        super().__init__(
+            concurrency=concurrency,
+            pricing=_resolve_pricing(resolved_provider, resolved_model),
+            model=resolved_model,
+            provider=resolved_provider,
+        )
         self._delegate = _build_delegate_client()
         resolved_url = (
             base_url
@@ -151,20 +193,36 @@ def send_message(
         text: str,
         context: Mapping[str, object] | None = None,
     ) -> MessageResponse:
-        """POST /api/benchmark/message — send a message and get response."""
+        """POST /api/benchmark/message — send a message and get response.
+
+        Captures per-turn telemetry (latency_ms, prompt/completion tokens,
+        cost_usd) into ``self.telemetry_history`` so callers that want token
+        accounting can read it back; the original delegate-aware path is
+        preserved for the Hermes / OpenClaw harness routing.
+        """
         if self._delegate is not None:
             return self._delegate.send_message(text, context)
+        started = time.time()
         body: dict[str, object] = {"text": text}
         if context is not None:
             body["context"] = dict(context)
-
         raw = self._post("/api/benchmark/message", body)
-        return MessageResponse(
-            text=str(raw.get("text", "")),
-            thought=raw.get("thought") if isinstance(raw.get("thought"), str) else None,
-            actions=list(raw.get("actions", [])),
-            params=dict(raw.get("params", {})),
+        finished = time.time()
+        response = _message_response_from_raw(raw)
+        # The TS bench server emits a top-level ``usage`` field on the JSON
+        # response (added 2026-05 to surface Cerebras token counts). Pull it
+        # into telemetry; if it's missing, record zeros (telemetry still has
+        # latency).
+        raw_usage = raw.get("usage")
+        usage_map: Mapping[str, object] | None = (
+            raw_usage if isinstance(raw_usage, Mapping) else None
+        )
+        self.record_telemetry(
+            started_at_epoch=started,
+            finished_at_epoch=finished,
+            usage=usage_map,
         )
+        return response
 
     def is_ready(self) -> bool:
         if self._delegate is not None:
@@ -221,14 +279,36 @@ def wait_until_ready(self, timeout: float = 120.0, poll: float = 1.0) -> None:
             f"Eliza benchmark server not ready after {timeout}s: {last_err}"
         )
 
+    # ------------------------------------------------------------------
+    # Subclass override of BaseBenchmarkClient._send.
+    # ------------------------------------------------------------------
+
+    def _send(
+        self,
+        text: str,
+        context: Mapping[str, object] | None,
+    ) -> MessageResponse:
+        """POST ``text`` (and optional ``context``) and parse the reply.
+
+        Unlike the public ``send_message``, this never consults
+        ``self._delegate`` and records no telemetry itself. It exists for
+        callers that want the telemetry-tracked path, which (per the base
+        class, not shown here) is :meth:`send_message_tracked`.
+        """
+        body: dict[str, object] = {"text": text}
+        if context is not None:
+            body["context"] = dict(context)  # shallow copy of the caller's mapping
+        raw = self._post("/api/benchmark/message", body)
+        return _message_response_from_raw(raw)
+
     # ------------------------------------------------------------------
     # Internals
     # ------------------------------------------------------------------
 
     def _auth_headers(self) -> dict[str, str]:
-        if self._token:
-            return {"Authorization": f"Bearer {self._token}"}
-        return {}
+        # Build the headers for ``self._token`` via the base-class helper so all
+        # three adapters (eliza, hermes, openclaw) build the same Bearer header.
+        return self.build_auth_headers(self._token)
 
     def _get(self, path: str) -> dict[str, object]:
         url = f"{self.base_url}{path}"
@@ -267,6 +347,29 @@ def _do(req: urllib.request.Request) -> dict[str, object]:
             ) from exc
 
 
+def _message_response_from_raw(raw: Mapping[str, object]) -> MessageResponse:
+    """Build a :class:`MessageResponse` from the bench server's parsed JSON body.
+
+    Shared by :meth:`ElizaClient.send_message` and :meth:`ElizaClient._send`.
+    Mistyped ``thought``/``actions``/``params`` become ``None``/``[]``/``{}``.
+    """
+    thought = raw.get("thought") if isinstance(raw.get("thought"), str) else None
+    actions_raw = raw.get("actions") or []  # any falsy value (absent, null, empty) -> []
+    actions = (
+        [str(a) for a in actions_raw]  # every entry is stringified
+        if isinstance(actions_raw, list)
+        else []
+    )
+    params_raw = raw.get("params") or {}  # any falsy value (absent, null, empty) -> {}
+    params = dict(params_raw) if isinstance(params_raw, dict) else {}
+    return MessageResponse(
+        text=str(raw.get("text", "")),  # absent -> ""; other values go through str()
+        thought=thought,
+        actions=actions,
+        params=params,
+    )
+
+
 def _build_delegate_client():
     """Return the selected non-Eliza harness client, if any.
 
 
@@ -0,0 +1,22 @@
+"""Pytest bootstrap for the hermes-adapter test suite.
+
+``hermes_adapter.client`` imports ``benchmarks.lib.base_benchmark_client`` for
+the shared retry / cost / telemetry scaffold. That import resolves cleanly
+only when ``packages/`` (which contains the top-level ``benchmarks`` namespace
+package) is on ``sys.path``. When pytest is invoked from inside this adapter
+directory — or with a tight ``--rootdir`` — that ancestor isn't picked up
+automatically, which surfaces as ``ModuleNotFoundError: No module named
+'benchmarks'`` at collection time.
+
+Surface that path here so adapter tests work the same way whether they're
+run standalone or as part of the wider ``packages/benchmarks`` suite.
+"""
+
+from __future__ import annotations
+
+import sys
+from pathlib import Path
+
+_PACKAGES_ROOT = Path(__file__).resolve().parents[2]  # [0] hermes-adapter, [1] benchmarks, [2] packages
+if str(_PACKAGES_ROOT) not in sys.path:  # skip when already present, to avoid duplicate entries
+    sys.path.insert(0, str(_PACKAGES_ROOT))  # index 0: searched before every existing entry
@@ -29,10 +29,27 @@
     is_retryable_status,
     parse_retry_after,
 )
+from benchmarks.lib.base_benchmark_client import (
+    CEREBRAS_GPT_OSS_120B_PRICING,
+    BaseBenchmarkClient,
+    ModelPricing,
+)
 
 logger = logging.getLogger(__name__)
 
 
+# Default concurrency for Hermes. W2-9 observed Cerebras 429s at concurrency=4
+# on the hermes suite; lowering to 2 cut the 429 rate to near zero without a
+# material throughput hit. Callers can override via the constructor.
+_HERMES_DEFAULT_CONCURRENCY = 2
+
+
+def _hermes_pricing(provider: str, model: str) -> ModelPricing | None:
+    if provider.strip().lower() == "cerebras" and model.strip().lower() == "gpt-oss-120b":
+        return CEREBRAS_GPT_OSS_120B_PRICING  # matched ignoring case and outer whitespace
+    return None  # any other (provider, model) pair has no pricing wired here
+
+
 def _retry_after_from_openai_exception(exc: object) -> float | None:
     """Pull a ``Retry-After`` header from an openai-SDK exception, if present."""
     response = getattr(exc, "response", None)
@@ -60,7 +77,7 @@ class MessageResponse:
     params: dict[str, object]
 
 
-class HermesClient:
+class HermesClient(BaseBenchmarkClient[MessageResponse]):
     """Client for one-shot turns against hermes-agent.
 
     ``mode='subprocess'`` (default): spawn a one-shot Python script using the
@@ -69,6 +86,11 @@ class HermesClient:
 
     ``mode='in_process'``: import hermes-agent in the current process. Only
     works if the parent Python already has hermes-agent installed.
+
+    Inherits :class:`BaseBenchmarkClient` for shared concurrency / cost /
+    telemetry handling. ``concurrency`` defaults to 2 — W2-9 observed
+    Cerebras 429s at 4 on the hermes suite; the lower cap eliminates them
+    without a material throughput hit.
     """
 
     def __init__(
@@ -82,18 +104,24 @@ def __init__(
         base_url: str | None = None,
         mode: str = "subprocess",
         timeout_s: float = 1200.0,
+        concurrency: int = _HERMES_DEFAULT_CONCURRENCY,
     ) -> None:
         if mode not in {"subprocess", "in_process"}:
             raise ValueError(f"Unknown mode {mode!r}; expected 'subprocess' or 'in_process'")
 
+        super().__init__(
+            concurrency=concurrency,
+            pricing=_hermes_pricing(provider, model),
+            model=model,
+            provider=provider,
+        )
+
         self.repo_path = Path(repo_path) if repo_path else DEFAULT_REPO_PATH
         if venv_python is not None:
             self.venv_python = Path(venv_python)
         else:
             self.venv_python = self.repo_path / ".venv" / "bin" / "python"
 
-        self.provider = provider
-        self.model = model
         self.api_key = api_key if api_key is not None else os.environ.get("CEREBRAS_API_KEY", "")
         self.base_url = (
             base_url
@@ -195,7 +223,36 @@ def send_message(
              importable in the venv, drives ``HermesAgentLoop`` for one turn).
           4. Emits a single JSON line on stdout in the shape
              ``{"text", "thought", "actions", "params"}``.
+
+        Captures per-turn telemetry (latency_ms, prompt/completion tokens,
+        cost_usd) via the base class. Cerebras's OpenAI-compatible response
+        carries ``usage`` which we surface in ``params["usage"]`` on both
+        transports — this method reads it back into telemetry.
         """
+        started = time.time()
+        try:
+            result = self._send(text, context)
+        finally:
+            finished = time.time()
+        usage_obj = result.params.get("usage") if result.params else None
+        usage_map: Mapping[str, object] | None = (
+            usage_obj if isinstance(usage_obj, Mapping) else None
+        )
+        self.record_telemetry(
+            started_at_epoch=started,
+            finished_at_epoch=finished,
+            usage=usage_map,
+        )
+        return result
+
+    # Required by BaseBenchmarkClient: the raw transport call, dispatched on
+    # ``self.mode`` (in-process vs subprocess). send_message_tracked goes unused
+    # because send_message above already wraps this with cost/latency capture.
+    def _send(
+        self,
+        text: str,
+        context: Mapping[str, object] | None,
+    ) -> MessageResponse:
         if self.mode == "in_process":
             return self._send_in_process(text, context)
         return self._send_subprocess(text, context)
 
@@ -1,10 +1,26 @@
 """
 Shared library modules for the elizaOS benchmarks package.
 
-Currently exposes :class:`ResultsStore` for storing benchmark run history
-in a local SQLite database. See ``results_store.py``.
+Exposes:
+
+  - :class:`ResultsStore` — store benchmark run history in a local SQLite db.
+  - :class:`BaseBenchmarkClient` — shared scaffolding (retry, cost, telemetry,
+    concurrency) that the hermes / openclaw / eliza adapter clients all
+    subclass. See ``base_benchmark_client.py``.
 """
 
+from .base_benchmark_client import (
+    CEREBRAS_GPT_OSS_120B_PRICING,
+    MAX_ATTEMPTS,
+    BaseBenchmarkClient,
+    ModelPricing,
+    RetryExhaustedError,
+    TurnTelemetry,
+    backoff_seconds,
+    compute_cost_usd,
+    is_retryable_status,
+    parse_retry_after,
+)
 from .results_store import (
     BenchmarkRun,
     ComparisonResult,
@@ -13,8 +29,18 @@
 )
 
 __all__ = [
+    "BaseBenchmarkClient",
     "BenchmarkRun",
+    "CEREBRAS_GPT_OSS_120B_PRICING",
     "ComparisonResult",
+    "MAX_ATTEMPTS",
+    "ModelPricing",
     "ResultsStore",
+    "RetryExhaustedError",
+    "TurnTelemetry",
+    "backoff_seconds",
+    "compute_cost_usd",
     "default_db_path",
+    "is_retryable_status",
+    "parse_retry_after",
 ]