Skip to content

Commit 6030ee0

Browse files
committed
updates
1 parent 24e3129 commit 6030ee0

36 files changed

Lines changed: 1941 additions & 468 deletions

File tree

bun.lock

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/app-core/src/benchmark/lifeops-bench-handler.ts

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,16 @@ export interface PlannerInvocationResult {
6868
promptTokens?: number;
6969
completionTokens?: number;
7070
totalTokens?: number;
71+
/**
72+
* Provider-reported prompt-cache reads (Anthropic
73+
* ``cache_read_input_tokens`` / OpenAI + Cerebras
74+
* ``prompt_tokens_details.cached_tokens``). Optional because not every
75+
* provider supports prompt caching; nullable upstream stays nullable
76+
* here — no silent 0 fallback, per AGENTS.md Cmd #8.
77+
*/
78+
cacheReadInputTokens?: number;
79+
/** Anthropic-only ``cache_creation_input_tokens``. */
80+
cacheCreationInputTokens?: number;
7181
};
7282
}
7383

packages/app-core/src/benchmark/server.ts

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1036,13 +1036,34 @@ export async function startBenchmarkServer() {
10361036
};
10371037
});
10381038

1039+
// Sum the per-call cache-read tokens across every LLM call that fired
1040+
// during this turn. A call with `cachedTokens === undefined` means the
1041+
// provider didn't report it — those calls do NOT contribute to the sum
1042+
// and do NOT collapse the value to 0. If no call in the turn reported
1043+
// cache info, we pass `undefined` through so the wire shape preserves
1044+
// "we don't know" (AGENTS.md Cmd #8). Cerebras gpt-oss-120b reports
1045+
// `prompt_tokens_details.cached_tokens` by default; Anthropic reports
1046+
// `cache_read_input_tokens` natively.
1047+
const anyCacheReported = turnUsageBuffer.some(
1048+
(c) => typeof c.cachedTokens === "number",
1049+
);
1050+
const cacheReadInputTokens = anyCacheReported
1051+
? turnUsageBuffer.reduce(
1052+
(s, c) =>
1053+
s + (typeof c.cachedTokens === "number" ? c.cachedTokens : 0),
1054+
0,
1055+
)
1056+
: undefined;
10391057
const usage = {
10401058
promptTokens: turnUsageBuffer.reduce((s, c) => s + c.promptTokens, 0),
10411059
completionTokens: turnUsageBuffer.reduce(
10421060
(s, c) => s + c.completionTokens,
10431061
0,
10441062
),
10451063
totalTokens: turnUsageBuffer.reduce((s, c) => s + c.totalTokens, 0),
1064+
...(cacheReadInputTokens !== undefined
1065+
? { cacheReadInputTokens }
1066+
: {}),
10461067
};
10471068

10481069
// Touch the backend so unused-import linters do not strip the

packages/app-core/test/helpers/live-provider.ts

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,8 @@ export type LiveProviderName =
8080
| "openai"
8181
| "anthropic"
8282
| "google"
83-
| "openrouter";
83+
| "openrouter"
84+
| "local-llama-cpp";
8485

8586
export type LiveProviderConfig = {
8687
name: LiveProviderName;
@@ -194,6 +195,24 @@ const PROVIDERS: Array<{
194195
defaultSmallModel: "google/gemini-2.0-flash-001",
195196
defaultLargeModel: "google/gemini-2.0-flash-001",
196197
},
198+
{
199+
// Local OpenAI-compatible server (dflash llama-server fork or Ollama).
200+
// The dflash fork at ~/.cache/eliza-dflash/milady-llama-cpp is preferred
201+
// when present; otherwise PARALLAX_OPENCODE_BASE_URL points at Ollama
202+
// (default http://localhost:11434/v1). No real API key is required, but
203+
// the selector requires a non-empty key string, so callers must set
204+
// LOCAL_LLAMA_CPP_API_KEY=local (or rely on the explicit
205+
// selectLiveProvider("local-llama-cpp") path which seeds the sentinel).
206+
name: "local-llama-cpp",
207+
plugin: "@elizaos/plugin-openai",
208+
keyEnvVars: ["LOCAL_LLAMA_CPP_API_KEY"],
209+
baseUrlEnvVar: "OPENAI_BASE_URL",
210+
defaultBaseUrl: "http://localhost:11434/v1",
211+
smallModelEnvVar: "OPENAI_SMALL_MODEL",
212+
largeModelEnvVar: "OPENAI_LARGE_MODEL",
213+
defaultSmallModel: "qwen3-0.6b-q8_0",
214+
defaultLargeModel: "qwen3-1.7b-q4_k_m",
215+
},
197216
];
198217

199218
for (const provider of PROVIDERS) {

packages/benchmarks/eliza-adapter/eliza_adapter/lifeops_bench.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,30 @@ async def _agent_fn(
147147
value = usage.get(key)
148148
if isinstance(value, (int, float)):
149149
setattr(turn, attr, int(value))
150+
# Cache telemetry comes from the TS bench server's MODEL_USED
151+
# buffer rollup. `cacheReadInputTokens` is omitted when no LLM
152+
# call in the turn reported cache info — we propagate that as
153+
# ``None`` so the runner records "unknown" rather than a silent
154+
# 0. Per AGENTS.md Cmd #8.
155+
cache_read_raw = usage.get("cacheReadInputTokens")
156+
cache_creation_raw = usage.get("cacheCreationInputTokens")
157+
setattr(
158+
turn,
159+
"cache_read_input_tokens",
160+
int(cache_read_raw)
161+
if isinstance(cache_read_raw, (int, float))
162+
else None,
163+
)
164+
setattr(
165+
turn,
166+
"cache_creation_input_tokens",
167+
int(cache_creation_raw)
168+
if isinstance(cache_creation_raw, (int, float))
169+
else None,
170+
)
171+
# Eliza routes through plugin-openai (OpenAI / Cerebras) or
172+
# plugin-anthropic — both support prompt caching.
173+
setattr(turn, "cache_supported", True)
150174
# Stash model identity so result records can attribute spend.
151175
if model_name:
152176
setattr(turn, "model_name", model_name)

packages/benchmarks/hermes-adapter/hermes_adapter/client.py

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -310,11 +310,22 @@ def _send_in_process(
310310
for tc in tool_calls
311311
if getattr(getattr(tc, "function", None), "name", "")
312312
]
313+
# Surface the provider-reported usage block so the lifeops_bench adapter
314+
# can parse cache_read_input_tokens (OpenAI / Cerebras shape:
315+
# ``usage.prompt_tokens_details.cached_tokens``). Mirrors the subprocess
316+
# path's payload shape; downstream callers read ``params['usage']``.
317+
usage_obj = getattr(completion, "usage", None)
318+
if usage_obj is not None and hasattr(usage_obj, "model_dump"):
319+
usage_payload: dict[str, object] = usage_obj.model_dump()
320+
elif isinstance(usage_obj, Mapping):
321+
usage_payload = dict(usage_obj)
322+
else:
323+
usage_payload = {}
313324
return MessageResponse(
314325
text=str(msg.content or ""),
315326
thought=getattr(msg, "reasoning_content", None) or None,
316327
actions=actions,
317-
params={"tool_calls": parsed_tool_calls},
328+
params={"tool_calls": parsed_tool_calls, "usage": usage_payload},
318329
)
319330

320331
@staticmethod

packages/benchmarks/hermes-adapter/hermes_adapter/lifeops_bench.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -107,6 +107,64 @@ async def _agent_fn(
107107
)
108108
if model_name:
109109
setattr(turn, "model_name", model_name)
110+
# Surface usage + cache telemetry on the returned MessageTurn so the
111+
# LifeOpsBench runner can populate TurnResult.cache_read_input_tokens
112+
# / cache_creation_input_tokens / cache_hit_pct via getattr(). The
113+
# hermes-agent OpenAI-compat surface exposes:
114+
# * OpenAI / Cerebras shape: usage.prompt_tokens_details.cached_tokens
115+
# Anthropic-shaped responses (cache_read_input_tokens /
116+
# cache_creation_input_tokens) are forwarded verbatim when present.
117+
usage = resp.params.get("usage") if isinstance(resp.params, dict) else None
118+
if isinstance(usage, dict):
119+
_attach_usage_cache_fields(turn, usage)
110120
return turn
111121

112122
return _agent_fn
123+
124+
125+
def _attach_usage_cache_fields(turn: Any, usage: dict[str, Any]) -> None:
126+
"""Parse OpenAI / Cerebras / Anthropic-shaped usage onto the MessageTurn.
127+
128+
Sets ``input_tokens`` / ``output_tokens`` / ``cache_read_input_tokens`` /
129+
``cache_creation_input_tokens`` / ``cache_supported`` as attributes on
130+
``turn`` (via ``setattr``) so the LifeOpsBench runner can pick them up
131+
with ``getattr``. Cache fields stay ``None`` when the provider does not
132+
report them — per AGENTS.md Cmd #8, no silent ``0`` fallback.
133+
"""
134+
prompt = usage.get("prompt_tokens")
135+
completion = usage.get("completion_tokens")
136+
# Anthropic shape: input_tokens / output_tokens.
137+
if not isinstance(prompt, (int, float)):
138+
prompt = usage.get("input_tokens")
139+
if not isinstance(completion, (int, float)):
140+
completion = usage.get("output_tokens")
141+
if isinstance(prompt, (int, float)):
142+
setattr(turn, "input_tokens", int(prompt))
143+
if isinstance(completion, (int, float)):
144+
setattr(turn, "output_tokens", int(completion))
145+
146+
# OpenAI / Cerebras: usage.prompt_tokens_details.cached_tokens
147+
prompt_details = usage.get("prompt_tokens_details") or {}
148+
cache_read_raw = (
149+
prompt_details.get("cached_tokens")
150+
if isinstance(prompt_details, dict)
151+
else None
152+
)
153+
# Anthropic: cache_read_input_tokens at the usage root.
154+
if cache_read_raw is None:
155+
cache_read_raw = usage.get("cache_read_input_tokens")
156+
cache_creation_raw = usage.get("cache_creation_input_tokens")
157+
158+
cache_read_value: int | None = (
159+
int(cache_read_raw) if isinstance(cache_read_raw, (int, float)) else None
160+
)
161+
cache_creation_value: int | None = (
162+
int(cache_creation_raw)
163+
if isinstance(cache_creation_raw, (int, float))
164+
else None
165+
)
166+
setattr(turn, "cache_read_input_tokens", cache_read_value)
167+
setattr(turn, "cache_creation_input_tokens", cache_creation_value)
168+
# Hermes-template servers fronting Cerebras gpt-oss-120b or Anthropic
169+
# support prompt caching, so cache_supported is set unconditionally to True here.
170+
setattr(turn, "cache_supported", True)

packages/benchmarks/lib/package.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,9 @@
88
"types": "./src/index.ts",
99
"exports": {
1010
".": "./src/index.ts",
11-
"./metrics-schema": "./src/metrics-schema.ts"
11+
"./metrics-schema": "./src/metrics-schema.ts",
12+
"./model-tiers": "./src/model-tiers.ts",
13+
"./local-llama-cpp": "./src/local-llama-cpp.ts"
1214
},
1315
"scripts": {
1416
"typecheck": "tsc --noEmit",
Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
import { describe, expect, it } from "vitest";
2+
import { existsSync } from "node:fs";
3+
4+
import {
5+
DFLASH_BINARY_PATH,
6+
expandHome,
7+
probeDflashFork,
8+
resolveLocalBaseUrl,
9+
startLocalServer,
10+
} from "../local-llama-cpp.ts";
11+
12+
describe("expandHome", () => {
13+
it("expands leading ~/ to the home dir", () => {
14+
const expanded = expandHome("~/foo/bar");
15+
expect(expanded).not.toContain("~");
16+
expect(expanded.endsWith("/foo/bar")).toBe(true);
17+
});
18+
19+
it("returns absolute paths verbatim", () => {
20+
expect(expandHome("/abs/path")).toBe("/abs/path");
21+
});
22+
23+
it("handles bare ~", () => {
24+
const expanded = expandHome("~");
25+
expect(expanded).not.toBe("~");
26+
});
27+
});
28+
29+
describe("probeDflashFork", () => {
30+
it("returns null when the binary is absent, otherwise the absolute path", () => {
31+
const result = probeDflashFork();
32+
if (result === null) {
33+
// Binary not present in this environment — confirm the default path
34+
// was the one checked.
35+
expect(existsSync(DFLASH_BINARY_PATH)).toBe(false);
36+
} else {
37+
expect(result).toBe(DFLASH_BINARY_PATH);
38+
expect(existsSync(result)).toBe(true);
39+
}
40+
});
41+
});
42+
43+
describe("resolveLocalBaseUrl", () => {
44+
it("uses PARALLAX_OPENCODE_BASE_URL when set", () => {
45+
const result = resolveLocalBaseUrl({
46+
env: { PARALLAX_OPENCODE_BASE_URL: "http://example:5555/v1" },
47+
});
48+
expect(result.baseUrl).toBe("http://example:5555/v1");
49+
expect(result.source).toBe("ollama-env");
50+
});
51+
52+
it("falls back to localhost:11434 when no override is set", () => {
53+
const result = resolveLocalBaseUrl({ env: {} });
54+
expect(result.baseUrl).toBe("http://localhost:11434/v1");
55+
expect(result.source).toBe("ollama-default");
56+
});
57+
58+
it("ignores empty/whitespace override values", () => {
59+
const result = resolveLocalBaseUrl({
60+
env: { PARALLAX_OPENCODE_BASE_URL: " " },
61+
});
62+
expect(result.source).toBe("ollama-default");
63+
});
64+
});
65+
66+
describe("startLocalServer", () => {
67+
it("throws a helpful error when the dflash fork is not present", async () => {
68+
if (probeDflashFork() !== null) {
69+
// Binary IS present — return early (this test then passes without asserting anything); the next test covers that case.
70+
return;
71+
}
72+
await expect(
73+
startLocalServer({ bundlePath: "/nonexistent" }),
74+
).rejects.toThrow(/dflash llama-server binary not found/);
75+
});
76+
77+
it("throws when the bundle path does not exist (binary present)", async () => {
78+
if (probeDflashFork() === null) {
79+
// No binary — return early (passes without asserting anything); the previous test covers that case.
80+
return;
81+
}
82+
await expect(
83+
startLocalServer({ bundlePath: "/nonexistent-bundle-xyz.gguf" }),
84+
).rejects.toThrow(/dflash bundle path does not exist/);
85+
});
86+
});

0 commit comments

Comments
 (0)