Skip to content

Commit 42d645a

Browse files
lalaluneclaude
andcommitted
feat(local-inference): expose lookahead/ngram/MoE offload via runtime acceleration
Wires the new optimization knobs into the llama-server spawn args:

  ELIZA_LOCAL_LOOKAHEAD=N         → --lookahead N
  ELIZA_LOCAL_NGRAM=on            → --draft-min/max/min-prob
  ELIZA_LOCAL_PARALLEL=N          → --parallel N (generalised over the
                                    pre-existing ELIZA_DFLASH_PARALLEL)
  ELIZA_LOCAL_MOE_OFFLOAD=cpu     → -ot ".*=CPU"
  ELIZA_LOCAL_MLOCK=1             → --mlock
  ELIZA_LOCAL_NO_MMAP=1           → --no-mmap
  ELIZA_LOCAL_MMPROJ=<path>       → --mmproj <path>
  ELIZA_LOCAL_ALIAS=<name>        → --alias <name>
  ELIZA_LOCAL_FLASH_ATTENTION=on  → -fa on

Each knob also has a catalog form under runtime.optimizations so models can declare safe defaults (e.g. moeOffload: "cpu" for Qwen3 Coder MoE). Env wins over catalog when both are set.

Refactors engine.ts so `LocalInferenceEngine` is a thin wrapper over the dispatcher. The DFlash-or-node-llama-cpp branching that used to live in the engine is now the dispatcher's pure decision tree. The public engine API (`load`, `unload`, `generate`, `available`, `hasLoadedModel`, `currentModelPath`) is unchanged so service.ts, router-handler.ts, active-model.ts, and ensure-local-inference-handler.ts keep working without modification.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent 5875f69 commit 42d645a

2 files changed

Lines changed: 336 additions & 43 deletions

File tree

packages/app-core/src/services/local-inference/dflash-server.ts

Lines changed: 199 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,14 @@ import fs from "node:fs";
1313
import net from "node:net";
1414
import os from "node:os";
1515
import path from "node:path";
16+
import type {
17+
GenerateArgs as BackendGenerateArgs,
18+
BackendPlan,
19+
LocalInferenceBackend,
20+
} from "./backend";
21+
import { findCatalogModel } from "./catalog";
1622
import { localInferenceRoot } from "./paths";
23+
import type { LocalRuntimeOptimizations } from "./types";
1724

1825
export interface DflashServerPlan {
1926
targetModelPath: string;
@@ -329,7 +336,113 @@ async function fetchJson(
329336
}
330337
}
331338

332-
export class DflashLlamaServer {
339+
/**
 * Parse a boolean-style environment variable.
 *
 * The value is trimmed and lower-cased before matching. `1`, `true`, `yes`
 * and `on` map to `true`; `0`, `false`, `no` and `off` map to `false`.
 *
 * @param name - Name of the environment variable to read.
 * @returns The parsed boolean, or `undefined` when the variable is unset,
 *   empty, or holds an unrecognised value (so callers can fall back to
 *   another source with `??`).
 */
function readBoolFlag(name: string): boolean | undefined {
  const raw = process.env[name]?.trim().toLowerCase();
  if (raw === undefined) return undefined;
  if (raw === "1" || raw === "true" || raw === "yes" || raw === "on") {
    return true;
  }
  if (raw === "0" || raw === "false" || raw === "no" || raw === "off") {
    return false;
  }
  return undefined;
}

/**
 * Append optimization flags driven by env overrides + catalog metadata to a
 * llama-server arg list. Env wins over the catalog when both supply the
 * same knob — the operator's escape hatch. An env value that is empty or
 * cannot be interpreted is treated as "not set" and the catalog value is
 * used instead.
 *
 * Env mapping:
 *
 *   ELIZA_LOCAL_LOOKAHEAD=N         → --lookahead N (N <= 0 disables it)
 *   ELIZA_LOCAL_NGRAM=on            → --draft-min/--draft-max/--draft-min-prob
 *                                     (uses optimizations.ngramDraft when
 *                                     set, else conservative defaults);
 *                                     `off` suppresses a catalog drafter
 *   ELIZA_LOCAL_MOE_OFFLOAD=cpu     → -ot ".*=CPU"
 *   ELIZA_LOCAL_MLOCK=1             → --mlock
 *   ELIZA_LOCAL_NO_MMAP=1           → --no-mmap
 *   ELIZA_LOCAL_MMPROJ=<path>       → --mmproj <path>
 *   ELIZA_LOCAL_ALIAS=<name>        → --alias <name>
 *   ELIZA_LOCAL_FLASH_ATTENTION=on  → -fa on
 *
 * `--parallel` is deliberately NOT emitted here; the call site owns it so
 * its position in the arg list is preserved.
 *
 * @param args - Arg list to extend. Mutated in place.
 * @param optimizations - Catalog-declared defaults, or `null` when the
 *   catalog entry declares none.
 * @returns The same `args` array, for chaining-style call sites.
 */
export function appendOptimizationFlags(
  args: string[],
  optimizations: LocalRuntimeOptimizations | null,
): string[] {
  // --lookahead N. An unset/unparseable env value yields NaN and defers to
  // the catalog; a parseable value always wins, so `0` still disables a
  // catalog-declared lookahead.
  const lookaheadEnv = Number.parseInt(
    process.env.ELIZA_LOCAL_LOOKAHEAD?.trim() ?? "",
    10,
  );
  const lookaheadValue = Number.isNaN(lookaheadEnv)
    ? optimizations?.lookahead
    : lookaheadEnv;
  if (
    typeof lookaheadValue === "number" &&
    Number.isFinite(lookaheadValue) &&
    lookaheadValue > 0
  ) {
    args.push("--lookahead", String(lookaheadValue));
  }

  // N-gram drafter — only meaningful when DFlash is NOT in use (mutually
  // exclusive). Caller is responsible for not setting ngramDraft on a
  // DFlash-configured catalog entry.
  const ngramEnvOn = readBoolFlag("ELIZA_LOCAL_NGRAM");
  const ngramConfig = optimizations?.ngramDraft;
  const ngramEffective =
    ngramEnvOn === false
      ? null
      : (ngramConfig ?? (ngramEnvOn ? { min: 4, max: 8, minProb: 0.5 } : null));
  if (ngramEffective) {
    args.push("--draft-min", String(ngramEffective.min));
    args.push("--draft-max", String(ngramEffective.max));
    args.push("--draft-min-prob", String(ngramEffective.minProb));
  }

  // -ot ".*=CPU" — MoE expert offload to CPU. `||` (not `??`) so an
  // empty/whitespace-only env value falls through to the catalog, matching
  // the mmproj/alias handling below. Any other non-empty env value (e.g.
  // "off") still overrides the catalog.
  const moeEnv = process.env.ELIZA_LOCAL_MOE_OFFLOAD?.trim().toLowerCase();
  const moeMode = moeEnv || optimizations?.moeOffload;
  if (moeMode === "cpu") {
    args.push("-ot", ".*=CPU");
  }

  // --mlock
  const mlockEnv = readBoolFlag("ELIZA_LOCAL_MLOCK");
  const mlock = mlockEnv ?? optimizations?.mlock;
  if (mlock === true) args.push("--mlock");

  // --no-mmap
  const noMmapEnv = readBoolFlag("ELIZA_LOCAL_NO_MMAP");
  const noMmap = noMmapEnv ?? optimizations?.noMmap;
  if (noMmap === true) args.push("--no-mmap");

  // --mmproj <path>
  const mmprojEnv = process.env.ELIZA_LOCAL_MMPROJ?.trim();
  const mmproj = mmprojEnv || optimizations?.mmproj;
  if (mmproj) args.push("--mmproj", mmproj);

  // --alias <name>
  const aliasEnv = process.env.ELIZA_LOCAL_ALIAS?.trim();
  const alias = aliasEnv || optimizations?.alias;
  if (alias) args.push("--alias", alias);

  // -fa on — opt-in only. Nothing is emitted when the effective value is
  // false or unset, so the server's own default applies and existing
  // behaviour is unchanged unless the operator or catalog opts in.
  const faEnv = readBoolFlag("ELIZA_LOCAL_FLASH_ATTENTION");
  const fa = faEnv ?? optimizations?.flashAttention;
  if (fa === true) args.push("-fa", "on");

  return args;
}
442+
443+
export class DflashLlamaServer implements LocalInferenceBackend {
444+
readonly id = "llama-server" as const;
445+
333446
private child: ChildProcess | null = null;
334447
private baseUrl: string | null = null;
335448
private stderrTail: string[] = [];
@@ -343,7 +456,77 @@ export class DflashLlamaServer {
343456
return this.loadedPlan?.targetModelPath ?? null;
344457
}
345458

346-
async start(plan: DflashServerPlan): Promise<void> {
459+
/**
 * Soft availability probe for the backend contract.
 *
 * Reports the `enabled` field of `getDflashRuntimeStatus()`.
 * NOTE(review): presumably that status also reflects whether the
 * llama-server binary resolves — confirm in `getDflashRuntimeStatus`.
 */
async available(): Promise<boolean> {
  const status = getDflashRuntimeStatus();
  return status.enabled;
}
463+
464+
/**
 * Unified backend contract entry point.
 *
 * Resolves the catalog entry for the plan (`plan.catalog`, else a lookup by
 * `plan.modelId`), locates the installed target model and its companion
 * drafter in the registry, and delegates to `start()` with the catalog's
 * DFlash settings plus any `runtime.optimizations` block.
 *
 * A `runtime.dflash` block is currently mandatory: catalog entries that
 * declare only `optimizations` are rejected rather than launched.
 *
 * @param plan - Backend plan identifying the model by path and/or id.
 * @throws Error when the catalog entry has no `runtime.dflash` block, when
 *   no installed model matches the plan, or when the companion drafter is
 *   not installed.
 */
async load(plan: BackendPlan): Promise<void> {
  const catalog =
    plan.catalog ??
    (plan.modelId ? findCatalogModel(plan.modelId) : undefined);
  const dflash = catalog?.runtime?.dflash;
  const optimizations = catalog?.runtime?.optimizations ?? null;

  if (!dflash) {
    throw new Error(
      `[dflash] llama-server backend currently requires a catalog 'runtime.dflash' block. Model '${plan.modelId ?? plan.modelPath}' has none — declare DFlash or route this model through node-llama-cpp.`,
    );
  }

  // The engine no longer pre-builds the DFlash plan, so the target and its
  // drafter are resolved from the registry here. The import is inline to
  // avoid the engine ↔ dflash-server import cycle.
  const { listInstalledModels } = await import("./registry");
  const installed = await listInstalledModels();
  // Prefer an exact path match; fall back to the id only when the plan
  // actually carries one (same guard as the catalog lookup above).
  const target =
    installed.find((m) => m.path === plan.modelPath) ??
    (plan.modelId
      ? installed.find((m) => m.id === plan.modelId)
      : undefined);
  if (!target) {
    throw new Error(
      `[dflash] No installed model matched plan path/id (${plan.modelPath}; ${plan.modelId ?? "no id"}).`,
    );
  }
  const drafter = installed.find((m) => m.id === dflash.drafterModelId);
  if (!drafter) {
    throw new Error(
      `[dflash] ${target.displayName} requires companion drafter ${dflash.drafterModelId}; install it first.`,
    );
  }

  await this.start(
    {
      targetModelPath: target.path,
      drafterModelPath: drafter.path,
      contextSize: dflash.contextSize,
      draftContextSize: dflash.draftContextSize,
      draftMin: dflash.draftMin,
      draftMax: dflash.draftMax,
      gpuLayers: dflash.gpuLayers,
      draftGpuLayers: dflash.draftGpuLayers,
      disableThinking: dflash.disableThinking,
    },
    optimizations,
  );
}
520+
521+
/**
 * Backend-contract name for shutting the server down.
 * Delegates to `stop()` and resolves once it has settled.
 */
async unload(): Promise<void> {
  const stopping = this.stop();
  await stopping;
}
525+
526+
async start(
527+
plan: DflashServerPlan,
528+
optimizations?: LocalRuntimeOptimizations | null,
529+
): Promise<void> {
347530
if (
348531
this.child &&
349532
this.loadedPlan?.targetModelPath === plan.targetModelPath &&
@@ -365,6 +548,14 @@ export class DflashLlamaServer {
365548
);
366549
const port = await resolvePort();
367550
const host = process.env.ELIZA_DFLASH_HOST?.trim() || DEFAULT_HOST;
551+
// Parallel batching default. Backwards compat: ELIZA_DFLASH_PARALLEL
552+
// remains the original DFlash-specific knob; ELIZA_LOCAL_PARALLEL is
553+
// the generalised name shared with the Cache Bridge agent's runtime
554+
// bump. The generalised env wins when both are set.
555+
const parallelEnv =
556+
process.env.ELIZA_LOCAL_PARALLEL?.trim() ||
557+
process.env.ELIZA_DFLASH_PARALLEL?.trim() ||
558+
String(optimizations?.parallel ?? 1);
368559
const args = [
369560
"--model",
370561
plan.targetModelPath,
@@ -389,7 +580,7 @@ export class DflashLlamaServer {
389580
"--draft-max",
390581
String(plan.draftMax),
391582
"--parallel",
392-
process.env.ELIZA_DFLASH_PARALLEL?.trim() || "1",
583+
parallelEnv,
393584
"--metrics",
394585
"--jinja",
395586
];
@@ -414,6 +605,8 @@ export class DflashLlamaServer {
414605
args.push("--cache-type-v", cacheTypeV);
415606
}
416607

608+
appendOptimizationFlags(args, optimizations ?? null);
609+
417610
const extra = process.env.ELIZA_DFLASH_LLAMA_ARGS?.trim();
418611
if (extra && isMetalDflashRuntime()) {
419612
for (const cacheType of METAL_UNSUPPORTED_CACHE_TYPES) {
@@ -464,7 +657,9 @@ export class DflashLlamaServer {
464657
]);
465658
}
466659

467-
async generate(args: DflashGenerateArgs): Promise<string> {
660+
async generate(
661+
args: DflashGenerateArgs | BackendGenerateArgs,
662+
): Promise<string> {
468663
if (!this.baseUrl) {
469664
throw new Error("[dflash] llama-server is not running");
470665
}

0 commit comments

Comments
 (0)