inference: tests + ledger for the fused-server merged /v1/audio/speech route

lalalune · claude · lalalune · commit ce4f13a06939 · 2026-05-11T09:39:12.000-07:00
- dflash-server.test.ts: "fused-vs-two-process spawn selection" — resolveFusedDflashBinary() / resolveDflashBinary() prefer the *-fused llama-server when one is installed, fall back to the stock two-process path otherwise, honor ELIZA_DFLASH_DISABLE_FUSED_SERVER and the ELIZA_DFLASH_LLAMA_SERVER override, ignore a fused dir whose CAPABILITIES.json doesn't advertise fusion, and findBundleOmnivoiceAssets() resolves tts/ GGUFs from the text model path.
- dflash-server-fused.integration.test.ts: spawns the real *-fused llama-server against a staged small text GGUF, hits POST /completion (1-token gen) AND POST /v1/audio/speech from the same PID (503 "not configured" when no OmniVoice GGUF is wired — proves the route is mounted in-process), then cancels an in-flight generation and asserts the server is still healthy. SKIPs when no fused build / staged GGUF is on disk.
- engine.voice-turn.test.ts: pass `asrStreamSupported: true` to the injected fakeFfi so the wired-pipeline tests use the fused streaming ASR path (the W7 transcriber-chain change otherwise falls them through to whisper.cpp, which isn't installed in CI).
- remaining-work-ledger.md: mark P0 #3's merged-route item DONE (the *-fused llama-server serves text/DFlash + /v1/audio/speech from one process; spawn layer prefers it), note what's left (weight-backed TTS smoke against a real bundle's tts/, fused metal/vulkan builds, iOS/macOS fused-server packaging, routing engine TTS to synthesizeSpeech()).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
diff --git a/packages/app-core/src/services/local-inference/dflash-server-fused.integration.test.ts b/packages/app-core/src/services/local-inference/dflash-server-fused.integration.test.ts
@@ -0,0 +1,181 @@
+/**
+ * Integration smoke for the fused omnivoice `llama-server`: one process,
+ * one llama.cpp build (packages/inference/AGENTS.md §4 — no IPC second TTS
+ * process; remaining-work-ledger P0 #3 merged-route item).
+ *
+ * The test spawns the real fused `llama-server` (the `*-fused` build
+ * produced by `node packages/app-core/scripts/build-llama-cpp-dflash.mjs
+ * --target <triple>-fused`) against a small staged text GGUF and asserts:
+ *   1. `POST /completion` does a 1-token text generation,
+ *   2. `POST /v1/audio/speech` is mounted and answers from the *same PID*
+ *      (returns the structured 503 "not configured" body when no OmniVoice
+ *      GGUF is wired, which still proves the route is live and in-process —
+ *      with `--omnivoice-model` / `--omnivoice-codec` it synthesizes), and
+ *   3. cancelling an in-flight generation drains cleanly (the cancel signal
+ *      aborts the request without leaving the server wedged).
+ *
+ * It SKIPS when no fused build is on disk for this host's backend or no
+ * staged text GGUF is found — this is a smoke test against real artifacts,
+ * not a hermetic unit test. The unit-level "fused-vs-two-process spawn
+ * selection" coverage lives in `dflash-server.test.ts`.
+ */
+
+import { existsSync, readdirSync } from "node:fs";
+import os from "node:os";
+import path from "node:path";
+import { afterAll, describe, expect, it } from "vitest";
+
+/** State dir: ELIZA_STATE_DIR, else MILADY_STATE_DIR, else `~/.eliza`. */
+function elizaStateDir(): string {
+  const fromEnv =
+    process.env.ELIZA_STATE_DIR?.trim() || process.env.MILADY_STATE_DIR?.trim();
+  // `||` rather than `??`: a blank or whitespace-only variable falls through.
+  return fromEnv || path.join(os.homedir(), ".eliza");
+}
+
+/** Backend segment of the fused build dir name: "metal", "cuda" or "cpu". */
+function backendKey(): string {
+  if (process.platform === "darwin") return "metal";
+  const visibleDevices = process.env.CUDA_VISIBLE_DEVICES;
+  // Unset, empty, or "-1" all select the CPU build.
+  return visibleDevices && visibleDevices !== "-1" ? "cuda" : "cpu";
+}
+
+/**
+ * Where the fused build for this host is expected to be installed:
+ * `<state dir>/local-inference/bin/dflash/<platform>-<arch>-<backend>-fused`.
+ */
+function fusedDir(): string {
+  const dflashBin = path.join(elizaStateDir(), "local-inference", "bin", "dflash");
+  const buildName = `${process.platform}-${process.arch}-${backendKey()}-fused`;
+  return path.join(dflashBin, buildName);
+}
+
+/**
+ * Pick a text GGUF from the local-inference models dir, preferring names
+ * that look like a small model. Drafter / tokenizer / repaired / mmproj
+ * sidecars are never picked, even when their name carries a size tag.
+ * Returns null when the dir is unreadable or holds no candidate.
+ */
+function findSmallTextGguf(): string | null {
+  const dir = path.join(elizaStateDir(), "local-inference", "models");
+  let candidates: string[];
+  try {
+    const isSidecar = /drafter|tokenizer|repaired|mmproj/i;
+    candidates = readdirSync(dir).filter((e) => e.endsWith(".gguf") && !isSidecar.test(e));
+  } catch {
+    return null;
+  }
+  const pick = candidates.find((e) => /smol|360m|0_6b|0\.6b|1_7b/i.test(e)) ?? candidates[0];
+  return pick ? path.join(dir, pick) : null;
+}
+
+const FUSED_BIN = path.join(fusedDir(), "llama-server");
+const TEXT_GGUF = findSmallTextGguf();
+const haveArtifacts = existsSync(FUSED_BIN) && TEXT_GGUF !== null;
+// Both real artifacts are required; without them the suite is skipped, not failed.
+// eslint-disable-next-line vitest/no-conditional-tests
+const maybe = haveArtifacts ? describe : describe.skip;
+
+maybe("fused llama-server: text + /v1/audio/speech from one process", () => {
+  // Timeout for the whole test below; a real llama-server can take a while to load weights.
+  const STARTUP_MS = 90_000;
+
+  let mod: typeof import("./dflash-server");
+  let server: import("./dflash-server").DflashLlamaServer;
+  let baseUrl: string;
+
+  afterAll(async () => {
+    if (server) await server.stop();
+  });
+
+  it("spawns the fused binary, serves /completion and /v1/audio/speech, then cancels cleanly", async () => {
+    // Point the runtime at the real .eliza state dir, enable DFlash, and
+    // make the bundled shared libs resolvable for the spawned child.
+    process.env.ELIZA_STATE_DIR = elizaStateDir();
+    process.env.ELIZA_DFLASH_ENABLED = "1";
+    process.env.ELIZA_DFLASH_METAL_AUTO = "1"; // no-op off macOS
+    const sep = process.platform === "win32" ? ";" : ":";
+    const libVar = process.platform === "darwin" ? "DYLD_LIBRARY_PATH" : "LD_LIBRARY_PATH";
+    process.env[libVar] = [fusedDir(), process.env[libVar] ?? ""].filter(Boolean).join(sep);
+
+    mod = await import("./dflash-server");
+
+    // The fused binary must be the one resolveDflashBinary() picks.
+    const resolved = mod.resolveFusedDflashBinary();
+    expect(resolved).toBe(FUSED_BIN);
+    expect(mod.resolveDflashBinary()).toBe(FUSED_BIN);
+
+    server = mod.dflashLlamaServer;
+    await server.start({
+      targetModelPath: TEXT_GGUF as string,
+      drafterModelPath: TEXT_GGUF as string, // unused: disableDrafter below
+      contextSize: 512,
+      draftContextSize: 512,
+      draftMin: 0,
+      draftMax: 0,
+      gpuLayers: 0,
+      draftGpuLayers: 0,
+      disableThinking: false,
+      disableDrafter: true, // standalone text GGUF — no -md
+    });
+    baseUrl = server.currentBaseUrl() as string;
+    expect(baseUrl).toBeTruthy();
+    const pid = (server as unknown as { child: { pid: number } | null }).child?.pid;
+    expect(typeof pid).toBe("number");
+
+    // 1) text generation
+    const completionRes = await fetch(`${baseUrl}/completion`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({ prompt: "Hello", n_predict: 1 }),
+    });
+    expect(completionRes.ok).toBe(true);
+    const completion = (await completionRes.json()) as { tokens_predicted?: number };
+    expect(completion.tokens_predicted).toBeGreaterThanOrEqual(1);
+
+    // 2) /v1/audio/speech mounted on the SAME process. No OmniVoice GGUF is
+    //    wired in this smoke (the stand-in text bundle has no tts/), so the
+    //    route answers with the structured "not configured" 503 — which
+    //    proves it is live and in-process (a stock llama-server returns 404).
+    const speechRoute = server.audioSpeechRoute();
+    expect(speechRoute).not.toBeNull();
+    expect(speechRoute?.fused).toBe(true);
+    expect(speechRoute?.baseUrl).toBe(baseUrl);
+    const speechRes = await fetch(`${baseUrl}${speechRoute?.speechPath}`, {
+      method: "POST",
+      headers: { "content-type": "application/json" },
+      body: JSON.stringify({ input: "hello there" }),
+    });
+    // 503 = route present but TTS not configured (no GGUF). 200 = configured
+    // and synthesized. Either proves the route is mounted in this process.
+    expect([200, 503]).toContain(speechRes.status);
+    const speechBody = await speechRes.text();
+    if (speechRes.status === 503) {
+      expect(speechBody).toContain("omnivoice");
+    }
+
+    // Same child PID as before the two requests, so both were answered by the server we spawned.
+    const stillPid = (server as unknown as { child: { pid: number } | null }).child?.pid;
+    expect(stillPid).toBe(pid);
+
+    // 3) barge-in / cancel: an in-flight generation aborted via the request
+    //    signal must not wedge the server — a follow-up request succeeds.
+    const ac = new AbortController();
+    const longGen = server
+      .generate({
+        prompt: "Tell me a long story about the sea.",
+        maxTokens: 256,
+        signal: ac.signal,
+      })
+      .catch((e: unknown) => e); // abort surfaces as a rejection
+    setTimeout(() => ac.abort(), 50);
+    await longGen;
+    // Server is still healthy after the cancel.
+    const healthRes = await fetch(`${baseUrl}/health`);
+    expect(healthRes.ok).toBe(true);
+
+    await server.stop();
+    expect(server.hasLoadedModel()).toBe(false);
+  }, STARTUP_MS);
+});
diff --git a/packages/app-core/src/services/local-inference/dflash-server.test.ts b/packages/app-core/src/services/local-inference/dflash-server.test.ts
@@ -12,11 +12,13 @@ import {
   dflashLlamaServer,
   extractStreamingChatDelta,
   extractVerifierRejectRange,
+  findBundleOmnivoiceAssets,
   getDflashRuntimeStatus,
   logDflashDevDisabledWarning,
   parseDflashMetrics,
   resolveDflashBinary,
   resolveDflashKvOffload,
+  resolveFusedDflashBinary,
 } from "./dflash-server";
 
 const originalEnv = { ...process.env };
@@ -140,6 +142,140 @@ describe("DFlash runtime discovery", () => {
   });
 });
 
+describe("fused-vs-two-process spawn selection", () => {
+  /** Backend the fixture build claims: metal on macOS, cpu elsewhere. */
+  const backend = process.platform === "darwin" ? "metal" : "cpu";
+
+  /** Name of the fused build dir: `<platform>-<arch>-<backend>-fused`. */
+  function fusedBackendKey(): string {
+    return `${process.platform}-${process.arch}-${backend}-fused`;
+  }
+
+  /**
+   * Stage a fake fused build under `root`: an executable `llama-server` stub
+   * and a CAPABILITIES.json marked `fused: true`; `caps` overrides its fields.
+   */
+  function makeFusedBinary(
+    root: string,
+    caps: Record<string, unknown> = {},
+  ): { dir: string; bin: string } {
+    const target = fusedBackendKey();
+    const dir = path.join(root, "local-inference", "bin", "dflash", target);
+    const bin = path.join(dir, "llama-server");
+    fs.mkdirSync(dir, { recursive: true });
+    fs.writeFileSync(bin, "#!/bin/sh\n", "utf8");
+    fs.chmodSync(bin, 0o755);
+    const manifest = {
+      target,
+      platform: process.platform,
+      arch: process.arch,
+      backend,
+      builtAt: new Date().toISOString(),
+      fork: "elizaOS/llama.cpp",
+      forkCommit: "test",
+      kernels: {
+        dflash: true,
+        turbo3: true,
+        turbo4: true,
+        turbo3_tcq: false,
+        qjl_full: false,
+        polarquant: false,
+        lookahead: true,
+        ngramDraft: true,
+      },
+      binaries: ["llama-cli", "llama-omnivoice-server", "llama-server"],
+      fused: true,
+      omnivoice: { commit: "test" },
+      ...caps,
+    };
+    const manifestPath = path.join(dir, "CAPABILITIES.json");
+    fs.writeFileSync(manifestPath, JSON.stringify(manifest), "utf8");
+    return { dir, bin };
+  }
+
+  /** Unset every DFlash / GPU-visibility variable these tests depend on. */
+  function clearEnv() {
+    const knobs = [
+      "ELIZA_DFLASH_ENABLED",
+      "ELIZA_DFLASH_DISABLED",
+      "ELIZA_DFLASH_METAL_AUTO",
+      "ELIZA_DFLASH_METAL_ENABLED",
+      "ELIZA_DFLASH_DISABLE_FUSED_SERVER",
+      "ELIZA_DFLASH_LLAMA_SERVER",
+      "HIP_VISIBLE_DEVICES",
+      "ROCR_VISIBLE_DEVICES",
+      "CUDA_VISIBLE_DEVICES",
+    ];
+    for (const name of knobs) delete process.env[name];
+  }
+
+  /** Fresh temp dir exported as ELIZA_STATE_DIR, with the knobs above unset. */
+  function freshStateDir(): string {
+    const root = fs.mkdtempSync(path.join(os.tmpdir(), "eliza-fused-test-"));
+    process.env.ELIZA_STATE_DIR = root;
+    clearEnv();
+    return root;
+  }
+
+  it("prefers the fused llama-server when a fused build is installed", () => {
+    const root = freshStateDir();
+    const { bin } = makeFusedBinary(root);
+    expect(resolveFusedDflashBinary()).toBe(bin);
+    // With no stock binary staged, the general resolver must also pick it.
+    expect(resolveDflashBinary()).toBe(bin);
+  });
+
+  it("falls back to the stock two-process path when no fused build exists", () => {
+    const root = freshStateDir();
+    const stock = makeManagedBinary(root);
+    expect(resolveFusedDflashBinary()).toBe(null);
+    expect(resolveDflashBinary()).toBe(stock);
+  });
+
+  it("ignores a fused dir whose CAPABILITIES.json does not advertise fusion", () => {
+    const root = freshStateDir();
+    makeFusedBinary(root, { fused: false, omnivoice: null, binaries: ["llama-server"] });
+    expect(resolveFusedDflashBinary()).toBe(null);
+  });
+
+  it("ELIZA_DFLASH_DISABLE_FUSED_SERVER forces the stock path", () => {
+    const root = freshStateDir();
+    makeFusedBinary(root);
+    makeManagedBinary(root);
+    process.env.ELIZA_DFLASH_DISABLE_FUSED_SERVER = "1";
+    expect(resolveFusedDflashBinary()).toBe(null);
+  });
+
+  it("ELIZA_DFLASH_LLAMA_SERVER override wins over the fused binary", () => {
+    const root = freshStateDir();
+    makeFusedBinary(root);
+    const explicit = path.join(root, "explicit", "llama-server");
+    fs.mkdirSync(path.dirname(explicit), { recursive: true });
+    fs.writeFileSync(explicit, "#!/bin/sh\n", "utf8");
+    fs.chmodSync(explicit, 0o755);
+    process.env.ELIZA_DFLASH_LLAMA_SERVER = explicit;
+    expect(resolveDflashBinary()).toBe(explicit);
+  });
+
+  it("findBundleOmnivoiceAssets resolves tts/ GGUFs from the text model path", () => {
+    const root = fs.mkdtempSync(path.join(os.tmpdir(), "eliza-bundle-test-"));
+    const bundle = path.join(root, "eliza-1-1_7b.bundle");
+    const textGguf = path.join(bundle, "text", "eliza-1-1_7b-32k.gguf");
+    const ttsModel = path.join(bundle, "tts", "omnivoice-0.6b.gguf");
+    const ttsCodec = path.join(bundle, "tts", "omnivoice-tokenizer-0.6b.gguf");
+    for (const file of [textGguf, ttsModel, ttsCodec]) {
+      fs.mkdirSync(path.dirname(file), { recursive: true });
+      fs.writeFileSync(file, "x");
+    }
+    const assets = findBundleOmnivoiceAssets(textGguf);
+    expect(assets).not.toBeNull();
+    expect(assets?.modelPath).toBe(ttsModel);
+    expect(assets?.codecPath).toBe(ttsCodec);
+    // A non-bundle layout (no text/ parent) returns null.
+    expect(findBundleOmnivoiceAssets(path.join(root, "model.gguf"))).toBeNull();
+  });
+});
+
 describe("MILADY_DFLASH_DISABLE developer kill-switch", () => {
   it("disables DFlash even when ELIZA_DFLASH_ENABLED forces it on", () => {
     delete process.env.MILADY_DFLASH_DISABLE;
diff --git a/packages/app-core/src/services/local-inference/engine.voice-turn.test.ts b/packages/app-core/src/services/local-inference/engine.voice-turn.test.ts
@@ -124,7 +124,7 @@ describe("EngineVoiceBridge.runVoiceTurn (wired pipeline)", () => {
     });
     // Inject a fused FFI so the transcriber path is the real
     // `FfiStreamingTranscriber` (asrAvailable + ffi present).
-    (bridge as unknown as { ffi: unknown }).ffi = fakeFfi("hi there");
+    (bridge as unknown as { ffi: unknown }).ffi = fakeFfi("hi there", { asrStreamSupported: true });
     (
       bridge as unknown as { ffiContextRef: { ensure(): bigint } | null }
     ).ffiContextRef = { ensure: () => 1n };
@@ -179,7 +179,7 @@ describe("EngineVoiceBridge.runVoiceTurn (wired pipeline)", () => {
       backendOverride: new StubBackend(),
       lifecycleLoaders: loadersOk(),
     });
-    (bridge as unknown as { ffi: unknown }).ffi = fakeFfi("a b c d e f");
+    (bridge as unknown as { ffi: unknown }).ffi = fakeFfi("a b c d e f", { asrStreamSupported: true });
     (
       bridge as unknown as { ffiContextRef: { ensure(): bigint } | null }
     ).ffiContextRef = { ensure: () => 1n };
diff --git a/packages/inference/reports/porting/2026-05-11/remaining-work-ledger.md b/packages/inference/reports/porting/2026-05-11/remaining-work-ledger.md