Skip to content

Commit 318e7b6

Browse files
committed
merge: backend unification + cache bridge integration (agents #1 + #2)
Resolve conflicts on dflash-server.ts and engine.ts so both deliverables land coherently:
- backend.ts: GenerateArgs gains optional cacheKey threaded by both backends.
- dflash-server.ts: keep cache bridge slot persistence + the new appendOptimizationFlags/LocalInferenceBackend interface; resolveParallel honors ELIZA_LOCAL_PARALLEL > ELIZA_DFLASH_PARALLEL > catalog.
- engine.ts: NodeLlamaCppBackend.load applies catalog optimizations (mmap/mlock/flash-attention) before building the session pool; LocalInferenceEngine.describeSessionPool delegates to the node backend so service.ts's local-cache stats endpoint stays callable.

Verified: 50/50 local-inference tests pass (cache-bridge + backend + dflash-server + engine + readiness + recommendation + downloader).
2 parents a2e2f3e + 3717809 commit 318e7b6

34 files changed

Lines changed: 2026 additions & 66 deletions
Lines changed: 257 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,257 @@
1+
import { afterEach, describe, expect, it } from "vitest";
2+
import {
3+
BackendDispatcher,
4+
decideBackend,
5+
type LocalInferenceBackend,
6+
readBackendOverride,
7+
} from "./backend";
8+
import type { CatalogModel } from "./types";
9+
/** Snapshot of the environment variables taken when this module is loaded. */
const ORIGINAL_ENV = { ...process.env };

/**
 * Undo any environment changes a test made.
 *
 * The live `process.env` object is repaired in place rather than replaced:
 * assigning a plain object to `process.env` would drop the special assignment
 * behaviour Node gives that object (values coerced to strings) and would leave
 * any code holding a reference to the earlier object reading stale data.
 */
afterEach(() => {
  // Drop variables that did not exist when the snapshot was taken.
  for (const key of Object.keys(process.env)) {
    if (!Object.prototype.hasOwnProperty.call(ORIGINAL_ENV, key)) {
      delete process.env[key];
    }
  }
  // Put back every variable that was deleted or overwritten.
  for (const [key, value] of Object.entries(ORIGINAL_ENV)) {
    if (value !== undefined) {
      process.env[key] = value;
    }
  }
});
15+
/**
 * Minimal catalog entry shared by the tests in this file.
 *
 * It deliberately has no `runtime` field; tests that need one build a copy
 * with `withRuntime`.
 */
const BASE_CATALOG: CatalogModel = {
  id: "test-model",
  displayName: "Test Model",
  hfRepo: "fake/Test-GGUF",
  ggufFile: "Test-Q4.gguf",
  params: "4B",
  quant: "Q4_K_M",
  sizeGb: 2.5,
  minRamGb: 5,
  category: "chat",
  bucket: "small",
  blurb: "test",
};
29+
/**
 * Return a copy of a catalog entry with its `runtime` field set.
 *
 * @param base - Entry whose own properties are copied (shallowly) into the result.
 * @param runtime - Value stored under `runtime` in the copy, replacing any
 *   `runtime` that `base` already had.
 * @returns A new object; `base` itself is not modified.
 */
function withRuntime(
  base: CatalogModel,
  runtime: CatalogModel["runtime"],
): CatalogModel {
  return { ...base, runtime };
}
36+
// Checks how readBackendOverride() reacts to the ELIZA_LOCAL_BACKEND
// environment variable: unset, unrecognised, and each explicit backend id.
describe("readBackendOverride", () => {
  it("returns 'auto' when unset", () => {
    delete process.env.ELIZA_LOCAL_BACKEND;
    expect(readBackendOverride()).toBe("auto");
  });

  it("returns 'auto' for unknown values", () => {
    process.env.ELIZA_LOCAL_BACKEND = "magic";
    expect(readBackendOverride()).toBe("auto");
  });

  it("respects explicit overrides", () => {
    process.env.ELIZA_LOCAL_BACKEND = "node-llama-cpp";
    expect(readBackendOverride()).toBe("node-llama-cpp");
    process.env.ELIZA_LOCAL_BACKEND = "llama-server";
    expect(readBackendOverride()).toBe("llama-server");
  });
});
55+
// Checks the backend and reason that decideBackend() reports for different
// combinations of env override, catalog runtime settings, llama-server
// availability and the dflashRequired flag.
describe("decideBackend", () => {
  it("defaults to node-llama-cpp for stock GGUFs", () => {
    const decision = decideBackend({
      override: "auto",
      catalog: BASE_CATALOG,
      llamaServerAvailable: true,
      dflashRequired: false,
    });
    expect(decision.backend).toBe("node-llama-cpp");
    expect(decision.reason).toBe("default");
  });

  it("routes to llama-server when a kernel is required", () => {
    const catalog = withRuntime(BASE_CATALOG, {
      optimizations: { requiresKernel: ["dflash"] },
    });
    // llamaServerAvailable is false here, yet llama-server is still expected.
    const decision = decideBackend({
      override: "auto",
      catalog,
      llamaServerAvailable: false,
      dflashRequired: false,
    });
    expect(decision.backend).toBe("llama-server");
    expect(decision.reason).toBe("kernel-required");
    expect(decision.kernels).toEqual(["dflash"]);
  });

  it("env override wins over default", () => {
    const decision = decideBackend({
      override: "llama-server",
      catalog: BASE_CATALOG,
      llamaServerAvailable: true,
      dflashRequired: false,
    });
    expect(decision.backend).toBe("llama-server");
    expect(decision.reason).toBe("env-override");
  });

  it("env override is overridden by hard kernel requirement", () => {
    const catalog = withRuntime(BASE_CATALOG, {
      optimizations: { requiresKernel: ["turbo3"] },
    });
    const decision = decideBackend({
      override: "node-llama-cpp",
      catalog,
      llamaServerAvailable: true,
      dflashRequired: false,
    });
    // The user can't ask the in-process binding to run turbo3.
    expect(decision.backend).toBe("llama-server");
    expect(decision.reason).toBe("kernel-required");
  });

  it("respects preferredBackend=llama-server when binary available", () => {
    const catalog = withRuntime(BASE_CATALOG, {
      preferredBackend: "llama-server",
    });
    const decision = decideBackend({
      override: "auto",
      catalog,
      llamaServerAvailable: true,
      dflashRequired: false,
    });
    expect(decision.backend).toBe("llama-server");
    expect(decision.reason).toBe("preferred-backend");
  });

  it("falls back to node-llama-cpp when preferredBackend=llama-server but binary missing and DFlash not required", () => {
    const catalog = withRuntime(BASE_CATALOG, {
      preferredBackend: "llama-server",
    });
    const decision = decideBackend({
      override: "auto",
      catalog,
      llamaServerAvailable: false,
      dflashRequired: false,
    });
    expect(decision.backend).toBe("node-llama-cpp");
    expect(decision.reason).toBe("default");
  });

  it("forces llama-server when DFlash is required and configured, even if binary probe is false", () => {
    const catalog = withRuntime(BASE_CATALOG, {
      preferredBackend: "llama-server",
      dflash: {
        drafterModelId: "x",
        specType: "dflash",
        contextSize: 8192,
        draftContextSize: 256,
        draftMin: 1,
        draftMax: 16,
        gpuLayers: "auto",
        draftGpuLayers: "auto",
        disableThinking: true,
      },
    });
    const decision = decideBackend({
      override: "auto",
      catalog,
      llamaServerAvailable: false,
      dflashRequired: true,
    });
    expect(decision.backend).toBe("llama-server");
    expect(decision.reason).toBe("dflash-required");
  });

  it("returns default when no catalog entry is supplied", () => {
    const decision = decideBackend({
      override: "auto",
      catalog: undefined,
      llamaServerAvailable: true,
      dflashRequired: false,
    });
    expect(decision.backend).toBe("node-llama-cpp");
    expect(decision.reason).toBe("default");
  });
});
173+
/**
 * In-memory stand-in for a real inference backend.
 *
 * It performs no inference; it only records how it was driven so tests can
 * assert on load/unload activity.
 */
class FakeBackend implements LocalInferenceBackend {
  /** True between a `load()` call and the next `unload()` call. */
  loaded = false;
  /** Number of times `unload()` has been called. */
  unloads = 0;
  /** Model paths passed to `load()`, in call order. */
  loadCalls: string[] = [];

  /** @param id - Backend identifier this fake reports and echoes in replies. */
  constructor(public readonly id: "node-llama-cpp" | "llama-server") {}

  /** Always reports the backend as available. */
  async available(): Promise<boolean> {
    return true;
  }

  /** Marks the backend as loaded and records the requested model path. */
  async load(plan: { modelPath: string }): Promise<void> {
    this.loaded = true;
    this.loadCalls.push(plan.modelPath);
  }

  /** Marks the backend as unloaded and counts the call. */
  async unload(): Promise<void> {
    this.loaded = false;
    this.unloads += 1;
  }

  /** Returns a fixed reply tagged with this backend's id. */
  async generate(): Promise<string> {
    return `${this.id}:reply`;
  }

  /** Whether a model is currently marked as loaded. */
  hasLoadedModel(): boolean {
    return this.loaded;
  }

  /** Most recently loaded model path, or `null` when nothing is loaded. */
  currentModelPath(): string | null {
    return this.loaded ? (this.loadCalls.at(-1) ?? null) : null;
  }
}
207+
// Drives BackendDispatcher with two FakeBackend instances and checks which
// one ends up active after load(), and that generate() fails before any load.
// NOTE(review): the two callbacks passed to the constructor presumably supply
// "llama-server available" and "dflash required" — confirm against backend.ts.
describe("BackendDispatcher", () => {
  it("loads node-llama-cpp by default", async () => {
    const node = new FakeBackend("node-llama-cpp");
    const server = new FakeBackend("llama-server");
    const d = new BackendDispatcher(
      node,
      server,
      () => true,
      () => false,
    );
    await d.load({ modelPath: "/m.gguf", catalog: BASE_CATALOG });
    expect(d.activeBackendId()).toBe("node-llama-cpp");
    expect(node.loaded).toBe(true);
    expect(server.loaded).toBe(false);
    expect(await d.generate({ prompt: "hi" })).toBe("node-llama-cpp:reply");
  });

  it("switches backends when the decision differs and unloads the previous", async () => {
    const node = new FakeBackend("node-llama-cpp");
    const server = new FakeBackend("llama-server");
    const d = new BackendDispatcher(
      node,
      server,
      () => true,
      () => false,
    );
    // First load: a catalog entry without a runtime section.
    await d.load({ modelPath: "/m.gguf", catalog: BASE_CATALOG });
    expect(d.activeBackendId()).toBe("node-llama-cpp");

    // Second load: a catalog entry that lists a required kernel.
    const kernelCatalog = withRuntime(BASE_CATALOG, {
      optimizations: { requiresKernel: ["dflash"] },
    });
    await d.load({ modelPath: "/m2.gguf", catalog: kernelCatalog });
    expect(d.activeBackendId()).toBe("llama-server");
    expect(node.unloads).toBe(1);
    expect(server.loaded).toBe(true);
  });

  it("throws on generate before load", async () => {
    const d = new BackendDispatcher(
      new FakeBackend("node-llama-cpp"),
      new FakeBackend("llama-server"),
      () => true,
      () => false,
    );
    await expect(d.generate({ prompt: "x" })).rejects.toThrow(
      /No backend loaded/,
    );
  });
});

0 commit comments

Comments
 (0)