Skip to content

Commit 6e1d8b2

Browse files
lalaluneclaude
andcommitted
refactor(benchmark-adapters): extract BaseBenchmarkClient + fix Eliza usage propagation (P3)
Consolidates retry/auth/cost-tracking shared by hermes/openclaw/eliza adapters into packages/benchmarks/lib/base_benchmark_client.py. Each adapter subclasses + overrides _send for its transport (subprocess / CLI / HTTP). Fixes Eliza's $0 cost reporting by propagating Cerebras `usage` from generateText through the bench server response. --concurrency is now uniform across all three; Hermes defaults to 2 to avoid the W2-9 429 spikes. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 4d89b51 commit 6e1d8b2

9 files changed

Lines changed: 1274 additions & 21 deletions

File tree

packages/app-core/src/benchmark/server.ts

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1643,13 +1643,29 @@ export async function startBenchmarkServer() {
16431643
});
16441644
trajectoriesBySession.set(key, trajectory);
16451645

1646+
// Propagate Cerebras / OpenAI token-usage so the Python adapter
1647+
// (eliza_adapter.client.ElizaClient) can compute per-turn cost.
1648+
// Shape mirrors the lifeops_bench/message handler so adapter code
1649+
// reads either endpoint the same way (camelCase keys + nullable
1650+
// cacheRead/cacheCreation fields). W2-9 surfaced this as the cause
1651+
// of cost: $0.0000 on eliza runs — the bench server had the data
1652+
// (turnUsage from MODEL_USED events) but never forwarded it.
1653+
const usagePayload = {
1654+
promptTokens: turnUsage.promptTokens,
1655+
completionTokens: turnUsage.completionTokens,
1656+
totalTokens: turnUsage.totalTokens,
1657+
cacheReadInputTokens:
1658+
turnUsage.cachedTokens > 0 ? turnUsage.cachedTokens : null,
1659+
cacheCreationInputTokens: null,
1660+
};
16461661
res.writeHead(200, { "Content-Type": "application/json" });
16471662
res.end(
16481663
JSON.stringify({
16491664
text: responseText,
16501665
thought,
16511666
actions,
16521667
params,
1668+
usage: usagePayload,
16531669
benchmark: session.benchmark,
16541670
task_id: session.taskId,
16551671
room_id: session.roomId,

packages/benchmarks/eliza-adapter/eliza_adapter/client.py

Lines changed: 115 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@
1212
from typing import Mapping
1313
from urllib.parse import urlparse
1414

15+
from benchmarks.lib.base_benchmark_client import (
16+
CEREBRAS_GPT_OSS_120B_PRICING,
17+
BaseBenchmarkClient,
18+
ModelPricing,
19+
)
20+
1521
logger = logging.getLogger(__name__)
1622

1723

@@ -25,18 +31,54 @@ class MessageResponse:
2531
params: dict[str, object]
2632

2733

28-
class ElizaClient:
34+
def _resolve_pricing(provider: str | None, model: str | None) -> ModelPricing | None:
35+
"""Map (provider, model) to a pricing tuple.
36+
37+
Currently only Cerebras gpt-oss-120b is wired; other models fall back to
38+
``None`` so cost reporting becomes 0 rather than silently mispriced.
39+
"""
40+
p = (provider or "").strip().lower()
41+
m = (model or "").strip().lower()
42+
if p == "cerebras" and m == "gpt-oss-120b":
43+
return CEREBRAS_GPT_OSS_120B_PRICING
44+
return None
45+
46+
47+
class ElizaClient(BaseBenchmarkClient[MessageResponse]):
2948
"""HTTP client for the eliza benchmark server.
3049
3150
All communication uses stdlib ``urllib`` so there are no extra
32-
dependencies to install.
51+
dependencies to install. Inherits :class:`BaseBenchmarkClient` for
52+
concurrency limiting, cost computation, and per-turn telemetry capture.
3353
"""
3454

3555
def __init__(
3656
self,
3757
base_url: str | None = None,
3858
token: str | None = None,
59+
*,
60+
concurrency: int = 4,
61+
provider: str | None = None,
62+
model: str | None = None,
3963
) -> None:
64+
resolved_provider = (
65+
provider
66+
or os.environ.get("BENCHMARK_MODEL_PROVIDER")
67+
or "cerebras"
68+
).strip().lower()
69+
resolved_model = (
70+
model
71+
or os.environ.get("BENCHMARK_MODEL_NAME")
72+
or os.environ.get("MODEL_NAME")
73+
or os.environ.get("CEREBRAS_MODEL")
74+
or "gpt-oss-120b"
75+
).strip()
76+
super().__init__(
77+
concurrency=concurrency,
78+
pricing=_resolve_pricing(resolved_provider, resolved_model),
79+
model=resolved_model,
80+
provider=resolved_provider,
81+
)
4082
self._delegate = _build_delegate_client()
4183
resolved_url = (
4284
base_url
@@ -151,20 +193,36 @@ def send_message(
151193
text: str,
152194
context: Mapping[str, object] | None = None,
153195
) -> MessageResponse:
154-
"""POST /api/benchmark/message — send a message and get response."""
196+
"""POST /api/benchmark/message — send a message and get response.
197+
198+
Captures per-turn telemetry (latency_ms, prompt/completion tokens,
199+
cost_usd) into ``self.telemetry_history`` so callers that want token
200+
accounting can read it back; the original delegate-aware path is
201+
preserved for the Hermes / OpenClaw harness routing.
202+
"""
155203
if self._delegate is not None:
156204
return self._delegate.send_message(text, context)
205+
started = time.time()
157206
body: dict[str, object] = {"text": text}
158207
if context is not None:
159208
body["context"] = dict(context)
160-
161209
raw = self._post("/api/benchmark/message", body)
162-
return MessageResponse(
163-
text=str(raw.get("text", "")),
164-
thought=raw.get("thought") if isinstance(raw.get("thought"), str) else None,
165-
actions=list(raw.get("actions", [])),
166-
params=dict(raw.get("params", {})),
210+
finished = time.time()
211+
response = _message_response_from_raw(raw)
212+
# The TS bench server emits a top-level ``usage`` field on the JSON
213+
# response (added 2026-05 to surface Cerebras token counts). Pull it
214+
# into telemetry; if it's missing, record zeros (telemetry still has
215+
# latency).
216+
raw_usage = raw.get("usage")
217+
usage_map: Mapping[str, object] | None = (
218+
raw_usage if isinstance(raw_usage, Mapping) else None
219+
)
220+
self.record_telemetry(
221+
started_at_epoch=started,
222+
finished_at_epoch=finished,
223+
usage=usage_map,
167224
)
225+
return response
168226

169227
def is_ready(self) -> bool:
170228
if self._delegate is not None:
@@ -221,14 +279,36 @@ def wait_until_ready(self, timeout: float = 120.0, poll: float = 1.0) -> None:
221279
f"Eliza benchmark server not ready after {timeout}s: {last_err}"
222280
)
223281

282+
# ------------------------------------------------------------------
283+
# Subclass override of BaseBenchmarkClient._send.
284+
# ------------------------------------------------------------------
285+
286+
def _send(
287+
self,
288+
text: str,
289+
context: Mapping[str, object] | None,
290+
) -> MessageResponse:
291+
"""Pure-HTTP send_message — used by the base class telemetry wrapper.
292+
293+
The public ``send_message`` keeps its existing surface (delegate-aware,
294+
directly returns a ``MessageResponse``); this ``_send`` exists so
295+
callers that want the telemetry-tracked path can use
296+
:meth:`send_message_tracked` from the base class.
297+
"""
298+
body: dict[str, object] = {"text": text}
299+
if context is not None:
300+
body["context"] = dict(context)
301+
raw = self._post("/api/benchmark/message", body)
302+
return _message_response_from_raw(raw)
303+
224304
# ------------------------------------------------------------------
225305
# Internals
226306
# ------------------------------------------------------------------
227307

228308
def _auth_headers(self) -> dict[str, str]:
229-
if self._token:
230-
return {"Authorization": f"Bearer {self._token}"}
231-
return {}
309+
# Delegate to the canonical helper on the base class so all three
310+
# adapters build the Bearer header identically.
311+
return self.build_auth_headers(self._token)
232312

233313
def _get(self, path: str) -> dict[str, object]:
234314
url = f"{self.base_url}{path}"
@@ -267,6 +347,29 @@ def _do(req: urllib.request.Request) -> dict[str, object]:
267347
) from exc
268348

269349

350+
def _message_response_from_raw(raw: Mapping[str, object]) -> MessageResponse:
351+
"""Map a parsed bench server JSON body to :class:`MessageResponse`.
352+
353+
Centralized so :meth:`ElizaClient.send_message` and :meth:`ElizaClient._send`
354+
share the exact same parsing logic.
355+
"""
356+
thought = raw.get("thought") if isinstance(raw.get("thought"), str) else None
357+
actions_raw = raw.get("actions") or []
358+
actions = (
359+
[str(a) for a in actions_raw]
360+
if isinstance(actions_raw, list)
361+
else []
362+
)
363+
params_raw = raw.get("params") or {}
364+
params = dict(params_raw) if isinstance(params_raw, dict) else {}
365+
return MessageResponse(
366+
text=str(raw.get("text", "")),
367+
thought=thought,
368+
actions=actions,
369+
params=params,
370+
)
371+
372+
270373
def _build_delegate_client():
271374
"""Return the selected non-Eliza harness client, if any.
272375
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
"""Pytest bootstrap for the hermes-adapter test suite.
2+
3+
``hermes_adapter.client`` imports ``benchmarks.lib.base_benchmark_client`` for
4+
the shared retry / cost / telemetry scaffold. That import resolves cleanly
5+
only when ``packages/`` (which contains the top-level ``benchmarks`` namespace
6+
package) is on ``sys.path``. When pytest is invoked from inside this adapter
7+
directory — or with a tight ``--rootdir`` — that ancestor isn't picked up
8+
automatically, which surfaces as ``ModuleNotFoundError: No module named
9+
'benchmarks'`` at collection time.
10+
11+
Surface that path here so adapter tests work the same way whether they're
12+
run standalone or as part of the wider ``packages/benchmarks`` suite.
13+
"""
14+
15+
from __future__ import annotations
16+
17+
import sys
18+
from pathlib import Path
19+
20+
_PACKAGES_ROOT = Path(__file__).resolve().parents[2]
21+
if str(_PACKAGES_ROOT) not in sys.path:
22+
sys.path.insert(0, str(_PACKAGES_ROOT))

packages/benchmarks/hermes-adapter/hermes_adapter/client.py

Lines changed: 60 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,27 @@
2929
is_retryable_status,
3030
parse_retry_after,
3131
)
32+
from benchmarks.lib.base_benchmark_client import (
33+
CEREBRAS_GPT_OSS_120B_PRICING,
34+
BaseBenchmarkClient,
35+
ModelPricing,
36+
)
3237

3338
logger = logging.getLogger(__name__)
3439

3540

41+
# Default concurrency for Hermes. W2-9 observed Cerebras 429s at concurrency=4
42+
# on the hermes suite; lowering to 2 cut the 429 rate to near zero without a
43+
# material throughput hit. Callers can override via the constructor.
44+
_HERMES_DEFAULT_CONCURRENCY = 2
45+
46+
47+
def _hermes_pricing(provider: str, model: str) -> ModelPricing | None:
48+
if provider.strip().lower() == "cerebras" and model.strip().lower() == "gpt-oss-120b":
49+
return CEREBRAS_GPT_OSS_120B_PRICING
50+
return None
51+
52+
3653
def _retry_after_from_openai_exception(exc: object) -> float | None:
3754
"""Pull a ``Retry-After`` header from an openai-SDK exception, if present."""
3855
response = getattr(exc, "response", None)
@@ -60,7 +77,7 @@ class MessageResponse:
6077
params: dict[str, object]
6178

6279

63-
class HermesClient:
80+
class HermesClient(BaseBenchmarkClient[MessageResponse]):
6481
"""Client for one-shot turns against hermes-agent.
6582
6683
``mode='subprocess'`` (default): spawn a one-shot Python script using the
@@ -69,6 +86,11 @@ class HermesClient:
6986
7087
``mode='in_process'``: import hermes-agent in the current process. Only
7188
works if the parent Python already has hermes-agent installed.
89+
90+
Inherits :class:`BaseBenchmarkClient` for shared concurrency / cost /
91+
telemetry handling. ``concurrency`` defaults to 2 — W2-9 observed
92+
Cerebras 429s at 4 on the hermes suite; the lower cap eliminates them
93+
without a material throughput hit.
7294
"""
7395

7496
def __init__(
@@ -82,18 +104,24 @@ def __init__(
82104
base_url: str | None = None,
83105
mode: str = "subprocess",
84106
timeout_s: float = 1200.0,
107+
concurrency: int = _HERMES_DEFAULT_CONCURRENCY,
85108
) -> None:
86109
if mode not in {"subprocess", "in_process"}:
87110
raise ValueError(f"Unknown mode {mode!r}; expected 'subprocess' or 'in_process'")
88111

112+
super().__init__(
113+
concurrency=concurrency,
114+
pricing=_hermes_pricing(provider, model),
115+
model=model,
116+
provider=provider,
117+
)
118+
89119
self.repo_path = Path(repo_path) if repo_path else DEFAULT_REPO_PATH
90120
if venv_python is not None:
91121
self.venv_python = Path(venv_python)
92122
else:
93123
self.venv_python = self.repo_path / ".venv" / "bin" / "python"
94124

95-
self.provider = provider
96-
self.model = model
97125
self.api_key = api_key if api_key is not None else os.environ.get("CEREBRAS_API_KEY", "")
98126
self.base_url = (
99127
base_url
@@ -195,7 +223,36 @@ def send_message(
195223
importable in the venv, drives ``HermesAgentLoop`` for one turn).
196224
4. Emits a single JSON line on stdout in the shape
197225
``{"text", "thought", "actions", "params"}``.
226+
227+
Captures per-turn telemetry (latency_ms, prompt/completion tokens,
228+
cost_usd) via the base class. Cerebras's OpenAI-compatible response
229+
carries ``usage`` which we surface in ``params["usage"]`` on both
230+
transports — this method reads it back into telemetry.
198231
"""
232+
started = time.time()
233+
try:
234+
result = self._send(text, context)
235+
finally:
236+
finished = time.time()
237+
usage_obj = result.params.get("usage") if result.params else None
238+
usage_map: Mapping[str, object] | None = (
239+
usage_obj if isinstance(usage_obj, Mapping) else None
240+
)
241+
self.record_telemetry(
242+
started_at_epoch=started,
243+
finished_at_epoch=finished,
244+
usage=usage_map,
245+
)
246+
return result
247+
248+
# Required by BaseBenchmarkClient. The base class' send_message_tracked
249+
# path is not used here because send_message above already wraps the
250+
# transport call with the (richer) cost/latency capture.
251+
def _send(
252+
self,
253+
text: str,
254+
context: Mapping[str, object] | None,
255+
) -> MessageResponse:
199256
if self.mode == "in_process":
200257
return self._send_in_process(text, context)
201258
return self._send_subprocess(text, context)
Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,26 @@
11
"""
22
Shared library modules for the elizaOS benchmarks package.
33
4-
Currently exposes :class:`ResultsStore` for storing benchmark run history
5-
in a local SQLite database. See ``results_store.py``.
4+
Exposes:
5+
6+
- :class:`ResultsStore` — store benchmark run history in a local SQLite db.
7+
- :class:`BaseBenchmarkClient` — shared scaffolding (retry, cost, telemetry,
8+
concurrency) that the hermes / openclaw / eliza adapter clients all
9+
subclass. See ``base_benchmark_client.py``.
610
"""
711

12+
from .base_benchmark_client import (
13+
CEREBRAS_GPT_OSS_120B_PRICING,
14+
MAX_ATTEMPTS,
15+
BaseBenchmarkClient,
16+
ModelPricing,
17+
RetryExhaustedError,
18+
TurnTelemetry,
19+
backoff_seconds,
20+
compute_cost_usd,
21+
is_retryable_status,
22+
parse_retry_after,
23+
)
824
from .results_store import (
925
BenchmarkRun,
1026
ComparisonResult,
@@ -13,8 +29,18 @@
1329
)
1430

1531
__all__ = [
32+
"BaseBenchmarkClient",
1633
"BenchmarkRun",
34+
"CEREBRAS_GPT_OSS_120B_PRICING",
1735
"ComparisonResult",
36+
"MAX_ATTEMPTS",
37+
"ModelPricing",
1838
"ResultsStore",
39+
"RetryExhaustedError",
40+
"TurnTelemetry",
41+
"backoff_seconds",
42+
"compute_cost_usd",
1943
"default_db_path",
44+
"is_retryable_status",
45+
"parse_retry_after",
2046
]

0 commit comments

Comments
 (0)