Skip to content

Commit ce4f13a

Browse files
lalalune and claude committed
inference: tests + ledger for the fused-server merged /v1/audio/speech route
- dflash-server.test.ts: "fused-vs-two-process spawn selection" — resolveFusedDflashBinary() / resolveDflashBinary() prefer the *-fused llama-server when one is installed, fall back to the stock two-process path otherwise, honor ELIZA_DFLASH_DISABLE_FUSED_SERVER and the ELIZA_DFLASH_LLAMA_SERVER override, and ignore a fused dir whose CAPABILITIES.json doesn't advertise fusion; findBundleOmnivoiceAssets() resolves tts/ GGUFs from the text model path.
- dflash-server-fused.integration.test.ts: spawns the real *-fused llama-server against a staged small text GGUF, hits POST /completion (1-token gen) AND POST /v1/audio/speech from the same PID (503 "not configured" when no OmniVoice GGUF is wired — proves the route is mounted in-process), then cancels an in-flight generation and asserts the server is still healthy. SKIPs when no fused build / staged GGUF is on disk.
- engine.voice-turn.test.ts: pass `asrStreamSupported: true` to the injected fakeFfi so the wired-pipeline tests use the fused streaming ASR path (the W7 transcriber-chain change otherwise falls them through to whisper.cpp, which isn't installed in CI).
- remaining-work-ledger.md: mark P0 #3's merged-route item DONE (the *-fused llama-server serves text/DFlash + /v1/audio/speech from one process; spawn layer prefers it), and note what's left (weight-backed TTS smoke against a real bundle's tts/, fused metal/vulkan builds, iOS/macOS fused-server packaging, routing engine TTS to synthesizeSpeech()).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 4bd3fb5 commit ce4f13a

4 files changed

Lines changed: 370 additions & 11 deletions

File tree

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,181 @@
1+
/**
2+
* Integration smoke for the fused omnivoice `llama-server`: one process,
3+
* one llama.cpp build (packages/inference/AGENTS.md §4 — no IPC second TTS
4+
* process; remaining-work-ledger P0 #3 merged-route item).
5+
*
6+
* The test spawns the real fused `llama-server` (the `*-fused` build
7+
* produced by `node packages/app-core/scripts/build-llama-cpp-dflash.mjs
8+
* --target <triple>-fused`) against a small staged text GGUF and asserts:
9+
* 1. `POST /completion` does a 1-token text generation,
10+
* 2. `POST /v1/audio/speech` is mounted and answers from the *same PID*
11+
* (returns the structured 503 "not configured" body when no OmniVoice
12+
* GGUF is wired, which still proves the route is live and in-process —
13+
* with `--omnivoice-model` / `--omnivoice-codec` it synthesizes), and
14+
* 3. cancelling an in-flight generation drains cleanly (the cancel signal
15+
* aborts the request without leaving the server wedged).
16+
*
17+
* It SKIPS when no fused build is on disk for this host's backend or no
18+
* staged text GGUF is found — this is a smoke test against real artifacts,
19+
* not a hermetic unit test. The unit-level "fused-vs-two-process spawn
20+
* selection" coverage lives in `dflash-server.test.ts`.
21+
*/
22+
23+
import { existsSync, readdirSync } from "node:fs";
24+
import os from "node:os";
25+
import path from "node:path";
26+
import { afterAll, describe, expect, it } from "vitest";
27+
28+
/**
 * Resolve the state directory that holds the local-inference artifacts.
 *
 * Precedence: `ELIZA_STATE_DIR`, then `MILADY_STATE_DIR` (each trimmed; a
 * blank or unset value is skipped), then `.eliza` under the user's home dir.
 *
 * @returns The first non-blank override, or the home-dir default.
 */
function elizaStateDir(): string {
  for (const key of ["ELIZA_STATE_DIR", "MILADY_STATE_DIR"]) {
    const override = process.env[key]?.trim();
    if (override) return override;
  }
  return path.join(os.homedir(), ".eliza");
}
35+
36+
/**
 * Backend segment of the fused build directory name expected on this host.
 *
 * @returns `"metal"` on macOS; `"cuda"` when `CUDA_VISIBLE_DEVICES` is set to
 *   a non-empty value other than `"-1"`; `"cpu"` otherwise.
 */
function backendKey(): string {
  if (process.platform === "darwin") return "metal";
  const cudaDevices = process.env.CUDA_VISIBLE_DEVICES;
  // Unset and empty both count as "no CUDA selection"; "-1" is treated as CUDA off.
  const cudaSelected = Boolean(cudaDevices) && cudaDevices !== "-1";
  return cudaSelected ? "cuda" : "cpu";
}
43+
44+
/**
 * Directory in which the fused build for this host is expected:
 * `<state dir>/local-inference/bin/dflash/<platform>-<arch>-<backend>-fused`.
 *
 * @returns The joined directory path; the directory is not checked for existence.
 */
function fusedDir(): string {
  const stateDir = elizaStateDir();
  const target = `${process.platform}-${process.arch}-${backendKey()}-fused`;
  return path.join(stateDir, "local-inference", "bin", "dflash", target);
}
53+
54+
/**
 * Pick a small stand-in text GGUF from the local-inference models dir.
 *
 * Drafter / tokenizer / repaired / mmproj sidecars are never returned, even
 * when their file name carries a small-model tag (e.g. a `*-tokenizer-0.6b.gguf`).
 * Among the remaining `*.gguf` files, one whose name carries a small-model tag
 * is preferred; otherwise the first remaining file is used. Names are sorted
 * before picking so the result does not depend on directory enumeration order.
 * No file sizes are compared — "small" is judged from the name only.
 *
 * @param modelsDir Directory to scan; defaults to
 *   `<state dir>/local-inference/models`.
 * @returns Path of the chosen GGUF inside `modelsDir`, or `null` when the
 *   directory cannot be read or holds no candidate.
 */
function findSmallTextGguf(
  modelsDir: string = path.join(elizaStateDir(), "local-inference", "models"),
): string | null {
  let entries: string[];
  try {
    entries = readdirSync(modelsDir);
  } catch {
    // Missing or unreadable models dir: treat as "nothing staged".
    return null;
  }
  const candidates = entries
    .filter(
      (name) =>
        name.endsWith(".gguf") && !/drafter|tokenizer|repaired|mmproj/i.test(name),
    )
    .sort();
  // Prefer an explicit small stand-in; fall back to any remaining text GGUF.
  const chosen =
    candidates.find((name) => /smol|360m|0_6b|0\.6b|1_7b/i.test(name)) ?? candidates[0];
  return chosen === undefined ? null : path.join(modelsDir, chosen);
}
72+
73+
const FUSED_BIN = path.join(fusedDir(), "llama-server");
const TEXT_GGUF = findSmallTextGguf();
// Both real artifacts must be on disk; otherwise the whole suite is skipped.
const haveArtifacts = existsSync(FUSED_BIN) && TEXT_GGUF !== null;

// eslint-disable-next-line vitest/no-conditional-tests
const maybe = haveArtifacts ? describe : describe.skip;

maybe("fused llama-server: text + /v1/audio/speech from one process", () => {
  // Timeout for the single test below. Spawning a real llama-server can take
  // a while to load weights.
  const STARTUP_MS = 90_000;

  let server: import("./dflash-server").DflashLlamaServer;

  // First-seen value of every env var the test overrides, so afterAll can put
  // the process environment back and not leak settings into later suites.
  const savedEnv = new Map<string, string | undefined>();
  const setEnv = (key: string, value: string): void => {
    if (!savedEnv.has(key)) savedEnv.set(key, process.env[key]);
    process.env[key] = value;
  };

  // PID of the spawned child, read through a cast because `child` is not part
  // of the server's public surface.
  const childPid = (): number | undefined =>
    (server as unknown as { child: { pid: number } | null }).child?.pid;

  afterAll(async () => {
    try {
      // Also covers the case where an assertion failed before the in-test stop().
      if (server) await server.stop();
    } finally {
      for (const [key, value] of savedEnv) {
        if (value === undefined) delete process.env[key];
        else process.env[key] = value;
      }
    }
  });

  it("spawns the fused binary, serves /completion and /v1/audio/speech, then cancels cleanly", async () => {
    // Point the runtime at the real .eliza state dir, enable DFlash, and
    // make the bundled shared libs resolvable for the spawned child.
    setEnv("ELIZA_STATE_DIR", elizaStateDir());
    setEnv("ELIZA_DFLASH_ENABLED", "1");
    setEnv("ELIZA_DFLASH_METAL_AUTO", "1"); // no-op off macOS
    const libVar = process.platform === "darwin" ? "DYLD_LIBRARY_PATH" : "LD_LIBRARY_PATH";
    setEnv(
      libVar,
      [fusedDir(), process.env[libVar] ?? ""].filter(Boolean).join(path.delimiter),
    );

    // Imported only after the environment is prepared.
    const mod = await import("./dflash-server");

    // The fused binary must be the one resolveDflashBinary() picks.
    expect(mod.resolveFusedDflashBinary()).toBe(FUSED_BIN);
    expect(mod.resolveDflashBinary()).toBe(FUSED_BIN);

    server = mod.dflashLlamaServer;
    await server.start({
      targetModelPath: TEXT_GGUF as string,
      drafterModelPath: TEXT_GGUF as string, // unused: disableDrafter below
      contextSize: 512,
      draftContextSize: 512,
      draftMin: 0,
      draftMax: 0,
      gpuLayers: 0,
      draftGpuLayers: 0,
      disableThinking: false,
      disableDrafter: true, // standalone text GGUF — no -md
    });
    const baseUrl = server.currentBaseUrl() as string;
    expect(baseUrl).toBeTruthy();
    const pid = childPid();
    expect(typeof pid).toBe("number");

    // 1) text generation
    const completionRes = await fetch(`${baseUrl}/completion`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ prompt: "Hello", n_predict: 1 }),
    });
    expect(completionRes.ok).toBe(true);
    const completion = (await completionRes.json()) as { tokens_predicted?: number };
    expect(completion.tokens_predicted).toBeGreaterThanOrEqual(1);

    // 2) /v1/audio/speech mounted on the SAME process. No OmniVoice GGUF is
    //    wired in this smoke (the stand-in text bundle has no tts/), so the
    //    route answers with the structured "not configured" 503 — which
    //    proves it is live and in-process (a stock llama-server returns 404).
    const speechRoute = server.audioSpeechRoute();
    expect(speechRoute).not.toBeNull();
    expect(speechRoute?.fused).toBe(true);
    expect(speechRoute?.baseUrl).toBe(baseUrl);
    const speechRes = await fetch(`${baseUrl}${speechRoute?.speechPath}`, {
      method: "POST",
      headers: { "content-type": "application/json" },
      body: JSON.stringify({ input: "hello there" }),
    });
    // 503 = route present but TTS not configured (no GGUF). 200 = configured
    // and synthesized. Either proves the route is mounted in this process.
    expect([200, 503]).toContain(speechRes.status);
    const speechBody = await speechRes.text();
    if (speechRes.status === 503) {
      expect(speechBody).toContain("omnivoice");
    }

    // Confirm both responses came from the same PID (the server we spawned).
    expect(childPid()).toBe(pid);

    // 3) barge-in / cancel: an in-flight generation aborted via the request
    //    signal must not wedge the server — a follow-up request succeeds.
    const ac = new AbortController();
    const longGen = server
      .generate({
        prompt: "Tell me a long story about the sea.",
        maxTokens: 256,
        signal: ac.signal,
      })
      .catch((e: unknown) => e); // abort surfaces as a rejection
    const abortTimer = setTimeout(() => ac.abort(), 50);
    try {
      await longGen;
    } finally {
      // Do not leave a pending timer behind if the generation settles first.
      clearTimeout(abortTimer);
    }
    // Server is still healthy after the cancel.
    const healthRes = await fetch(`${baseUrl}/health`);
    expect(healthRes.ok).toBe(true);

    await server.stop();
    expect(server.hasLoadedModel()).toBe(false);
  }, STARTUP_MS);
});

packages/app-core/src/services/local-inference/dflash-server.test.ts

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,13 @@ import {
1212
dflashLlamaServer,
1313
extractStreamingChatDelta,
1414
extractVerifierRejectRange,
15+
findBundleOmnivoiceAssets,
1516
getDflashRuntimeStatus,
1617
logDflashDevDisabledWarning,
1718
parseDflashMetrics,
1819
resolveDflashBinary,
1920
resolveDflashKvOffload,
21+
resolveFusedDflashBinary,
2022
} from "./dflash-server";
2123

2224
const originalEnv = { ...process.env };
@@ -140,6 +142,140 @@ describe("DFlash runtime discovery", () => {
140142
});
141143
});
142144

145+
describe("fused-vs-two-process spawn selection", () => {
  // These fixtures only model two backends: metal on macOS, cpu everywhere else.
  const fusedBackend = process.platform === "darwin" ? "metal" : "cpu";

  /** Directory name of the fused build for this host, e.g. `<platform>-<arch>-cpu-fused`. */
  function fusedBackendKey(): string {
    return `${process.platform}-${process.arch}-${fusedBackend}-fused`;
  }

  /**
   * Stage a fake fused build under `root`: an executable `llama-server` stub
   * plus a `CAPABILITIES.json` that advertises fusion by default.
   *
   * @param root State dir to stage under.
   * @param caps Fields spread over the default capabilities (e.g. `{ fused: false }`).
   * @returns The build directory and the path of the stub binary.
   */
  function makeFusedBinary(
    root: string,
    caps: Record<string, unknown> = {},
  ): { dir: string; bin: string } {
    const dir = path.join(root, "local-inference", "bin", "dflash", fusedBackendKey());
    fs.mkdirSync(dir, { recursive: true });
    const bin = path.join(dir, "llama-server");
    fs.writeFileSync(bin, "#!/bin/sh\n", "utf8");
    fs.chmodSync(bin, 0o755);
    const capabilities = {
      target: fusedBackendKey(),
      platform: process.platform,
      arch: process.arch,
      backend: fusedBackend,
      builtAt: new Date().toISOString(),
      fork: "elizaOS/llama.cpp",
      forkCommit: "test",
      kernels: {
        dflash: true,
        turbo3: true,
        turbo4: true,
        turbo3_tcq: false,
        qjl_full: false,
        polarquant: false,
        lookahead: true,
        ngramDraft: true,
      },
      binaries: ["llama-cli", "llama-omnivoice-server", "llama-server"],
      fused: true,
      omnivoice: { commit: "test" },
      // Caller overrides go last so they win over the defaults above.
      ...caps,
    };
    fs.writeFileSync(path.join(dir, "CAPABILITIES.json"), JSON.stringify(capabilities), "utf8");
    return { dir, bin };
  }

  /** Remove every env var these tests set or that is cleared to pin backend selection. */
  function clearEnv() {
    delete process.env.ELIZA_DFLASH_ENABLED;
    delete process.env.ELIZA_DFLASH_DISABLED;
    delete process.env.ELIZA_DFLASH_METAL_AUTO;
    delete process.env.ELIZA_DFLASH_METAL_ENABLED;
    delete process.env.ELIZA_DFLASH_DISABLE_FUSED_SERVER;
    delete process.env.ELIZA_DFLASH_LLAMA_SERVER;
    delete process.env.HIP_VISIBLE_DEVICES;
    delete process.env.ROCR_VISIBLE_DEVICES;
    delete process.env.CUDA_VISIBLE_DEVICES;
  }

  /** Create a throwaway state dir, point ELIZA_STATE_DIR at it, and reset the selection env. */
  function stageStateDir(): string {
    const root = fs.mkdtempSync(path.join(os.tmpdir(), "eliza-fused-test-"));
    process.env.ELIZA_STATE_DIR = root;
    clearEnv();
    return root;
  }

  it("prefers the fused llama-server when a fused build is installed", () => {
    const root = stageStateDir();
    const { bin } = makeFusedBinary(root);
    expect(resolveFusedDflashBinary()).toBe(bin);
    // resolveDflashBinary() should pick the fused binary over the (absent)
    // stock binary, so the spawn layer launches the single fused server.
    expect(resolveDflashBinary()).toBe(bin);
  });

  it("falls back to the stock two-process path when no fused build exists", () => {
    const root = stageStateDir();
    const stock = makeManagedBinary(root);
    expect(resolveFusedDflashBinary()).toBe(null);
    expect(resolveDflashBinary()).toBe(stock);
  });

  it("ignores a fused dir whose CAPABILITIES.json does not advertise fusion", () => {
    const root = stageStateDir();
    makeFusedBinary(root, { fused: false, omnivoice: null, binaries: ["llama-server"] });
    expect(resolveFusedDflashBinary()).toBe(null);
  });

  it("ELIZA_DFLASH_DISABLE_FUSED_SERVER forces the stock path", () => {
    const root = stageStateDir();
    makeFusedBinary(root);
    const stock = makeManagedBinary(root);
    process.env.ELIZA_DFLASH_DISABLE_FUSED_SERVER = "1";
    expect(resolveFusedDflashBinary()).toBe(null);
    // With both builds staged, the kill-switch must route the spawn layer to
    // the stock binary — not merely hide the fused one.
    expect(resolveDflashBinary()).toBe(stock);
  });

  it("ELIZA_DFLASH_LLAMA_SERVER override wins over the fused binary", () => {
    const root = stageStateDir();
    makeFusedBinary(root);
    const explicitDir = path.join(root, "explicit");
    fs.mkdirSync(explicitDir, { recursive: true });
    const explicit = path.join(explicitDir, "llama-server");
    fs.writeFileSync(explicit, "#!/bin/sh\n", "utf8");
    fs.chmodSync(explicit, 0o755);
    process.env.ELIZA_DFLASH_LLAMA_SERVER = explicit;
    expect(resolveDflashBinary()).toBe(explicit);
  });

  it("findBundleOmnivoiceAssets resolves tts/ GGUFs from the text model path", () => {
    const root = fs.mkdtempSync(path.join(os.tmpdir(), "eliza-bundle-test-"));
    const bundle = path.join(root, "eliza-1-1_7b.bundle");
    const textModel = path.join(bundle, "text", "eliza-1-1_7b-32k.gguf");
    const ttsModel = path.join(bundle, "tts", "omnivoice-0.6b.gguf");
    const ttsCodec = path.join(bundle, "tts", "omnivoice-tokenizer-0.6b.gguf");
    fs.mkdirSync(path.join(bundle, "text"), { recursive: true });
    fs.mkdirSync(path.join(bundle, "tts"), { recursive: true });
    for (const file of [textModel, ttsModel, ttsCodec]) {
      fs.writeFileSync(file, "x");
    }
    const assets = findBundleOmnivoiceAssets(textModel);
    expect(assets).not.toBeNull();
    expect(assets?.modelPath).toBe(ttsModel);
    expect(assets?.codecPath).toBe(ttsCodec);
    // A non-bundle layout (no text/ parent) returns null.
    expect(findBundleOmnivoiceAssets(path.join(root, "model.gguf"))).toBeNull();
  });
});
278+
143279
describe("MILADY_DFLASH_DISABLE developer kill-switch", () => {
144280
it("disables DFlash even when ELIZA_DFLASH_ENABLED forces it on", () => {
145281
delete process.env.MILADY_DFLASH_DISABLE;

packages/app-core/src/services/local-inference/engine.voice-turn.test.ts

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ describe("EngineVoiceBridge.runVoiceTurn (wired pipeline)", () => {
124124
});
125125
// Inject a fused FFI so the transcriber path is the real
126126
// `FfiStreamingTranscriber` (asrAvailable + ffi present).
127-
(bridge as unknown as { ffi: unknown }).ffi = fakeFfi("hi there");
127+
(bridge as unknown as { ffi: unknown }).ffi = fakeFfi("hi there", { asrStreamSupported: true });
128128
(
129129
bridge as unknown as { ffiContextRef: { ensure(): bigint } | null }
130130
).ffiContextRef = { ensure: () => 1n };
@@ -179,7 +179,7 @@ describe("EngineVoiceBridge.runVoiceTurn (wired pipeline)", () => {
179179
backendOverride: new StubBackend(),
180180
lifecycleLoaders: loadersOk(),
181181
});
182-
(bridge as unknown as { ffi: unknown }).ffi = fakeFfi("a b c d e f");
182+
(bridge as unknown as { ffi: unknown }).ffi = fakeFfi("a b c d e f", { asrStreamSupported: true });
183183
(
184184
bridge as unknown as { ffiContextRef: { ensure(): bigint } | null }
185185
).ffiContextRef = { ensure: () => 1n };

0 commit comments

Comments
 (0)