elizaOS
diff --git a/‎packages/app-core/src/services/local-inference/dflash-server.ts‎
Lines changed: 199 additions & 4 deletions b/‎packages/app-core/src/services/local-inference/dflash-server.ts‎
Lines changed: 199 additions & 4 deletions
@@ -13,7 +13,14 @@ import fs from "node:fs";
 import net from "node:net";
 import os from "node:os";
 import path from "node:path";
+import type {
+  GenerateArgs as BackendGenerateArgs,
+  BackendPlan,
+  LocalInferenceBackend,
+} from "./backend";
+import { findCatalogModel } from "./catalog";
 import { localInferenceRoot } from "./paths";
+import type { LocalRuntimeOptimizations } from "./types";
 
 export interface DflashServerPlan {
   targetModelPath: string;
@@ -329,7 +336,113 @@ async function fetchJson(
   }
 }
 
-export class DflashLlamaServer {
+/**
+ * Append optimization flags driven by env overrides + catalog metadata to a
+ * llama-server arg list. Env wins over the catalog when both supply the
+ * same knob — the operator's escape hatch.
+ *
+ * Returns the same array (mutated in place) for chaining-style call sites.
+ *
+ * Env mapping (per AGENTS.md / task brief):
+ *
+ *   ELIZA_LOCAL_LOOKAHEAD=N        → --lookahead N
+ *   ELIZA_LOCAL_NGRAM=on           → enable n-gram drafter (uses
+ *                                    optimizations.ngramDraft when set,
+ *                                    else conservative defaults)
+ *   ELIZA_LOCAL_PARALLEL=N         → --parallel N (handled at the call
+ *                                    site so the existing default order
+ *                                    is preserved; not redone here)
+ *   ELIZA_LOCAL_MOE_OFFLOAD=cpu    → -ot ".*=CPU"
+ *   ELIZA_LOCAL_MLOCK=1            → --mlock
+ *   ELIZA_LOCAL_NO_MMAP=1          → --no-mmap
+ *   ELIZA_LOCAL_FLASH_ATTENTION=on → -fa on (DFlash already implies it via
+ *                                    spec config; this is for non-DFlash
+ *                                    llama-server use cases)
+ */
+function readBoolFlag(name: string): boolean | undefined {
+  const raw = process.env[name]?.trim().toLowerCase();
+  if (raw === undefined) return undefined;
+  if (raw === "1" || raw === "true" || raw === "yes" || raw === "on") {
+    return true;
+  }
+  if (raw === "0" || raw === "false" || raw === "no" || raw === "off") {
+    return false;
+  }
+  return undefined;
+}
+
+export function appendOptimizationFlags(
+  args: string[],
+  optimizations: LocalRuntimeOptimizations | null,
+): string[] {
+  // --lookahead N
+  const lookaheadEnv = process.env.ELIZA_LOCAL_LOOKAHEAD?.trim();
+  const lookaheadValue = lookaheadEnv
+    ? Number.parseInt(lookaheadEnv, 10)
+    : optimizations?.lookahead;
+  if (
+    typeof lookaheadValue === "number" &&
+    Number.isFinite(lookaheadValue) &&
+    lookaheadValue > 0
+  ) {
+    args.push("--lookahead", String(lookaheadValue));
+  }
+
+  // N-gram drafter — only meaningful when DFlash is NOT in use (mutually
+  // exclusive). Caller is responsible for not setting ngramDraft on a
+  // DFlash-configured catalog entry.
+  const ngramEnvOn = readBoolFlag("ELIZA_LOCAL_NGRAM");
+  const ngramConfig = optimizations?.ngramDraft;
+  const ngramEffective =
+    ngramEnvOn === false
+      ? null
+      : (ngramConfig ?? (ngramEnvOn ? { min: 4, max: 8, minProb: 0.5 } : null));
+  if (ngramEffective) {
+    args.push("--draft-min", String(ngramEffective.min));
+    args.push("--draft-max", String(ngramEffective.max));
+    args.push("--draft-min-prob", String(ngramEffective.minProb));
+  }
+
+  // -ot ".*=CPU" — MoE expert offload to CPU.
+  const moeEnv = process.env.ELIZA_LOCAL_MOE_OFFLOAD?.trim().toLowerCase();
+  const moeMode = moeEnv ?? optimizations?.moeOffload;
+  if (moeMode === "cpu") {
+    args.push("-ot", ".*=CPU");
+  }
+
+  // --mlock
+  const mlockEnv = readBoolFlag("ELIZA_LOCAL_MLOCK");
+  const mlock = mlockEnv ?? optimizations?.mlock;
+  if (mlock === true) args.push("--mlock");
+
+  // --no-mmap
+  const noMmapEnv = readBoolFlag("ELIZA_LOCAL_NO_MMAP");
+  const noMmap = noMmapEnv ?? optimizations?.noMmap;
+  if (noMmap === true) args.push("--no-mmap");
+
+  // --mmproj <path>
+  const mmprojEnv = process.env.ELIZA_LOCAL_MMPROJ?.trim();
+  const mmproj = mmprojEnv || optimizations?.mmproj;
+  if (mmproj) args.push("--mmproj", mmproj);
+
+  // --alias <name>
+  const aliasEnv = process.env.ELIZA_LOCAL_ALIAS?.trim();
+  const alias = aliasEnv || optimizations?.alias;
+  if (alias) args.push("--alias", alias);
+
+  // -fa on / -fa off (catalog default off so existing DFlash behaviour
+  // — which compiles flash attention into the spec config — is unchanged
+  // unless the operator opts in).
+  const faEnv = readBoolFlag("ELIZA_LOCAL_FLASH_ATTENTION");
+  const fa = faEnv ?? optimizations?.flashAttention;
+  if (fa === true) args.push("-fa", "on");
+
+  return args;
+}
+
+export class DflashLlamaServer implements LocalInferenceBackend {
+  readonly id = "llama-server" as const;
+
   private child: ChildProcess | null = null;
   private baseUrl: string | null = null;
   private stderrTail: string[] = [];
@@ -343,7 +456,77 @@ export class DflashLlamaServer {
     return this.loadedPlan?.targetModelPath ?? null;
   }
 
-  async start(plan: DflashServerPlan): Promise<void> {
+  /** Soft probe — does the binary resolve and is DFlash enabled. */
+  async available(): Promise<boolean> {
+    return getDflashRuntimeStatus().enabled;
+  }
+
+  /**
+   * Unified backend contract entry point. Resolves the catalog entry from
+   * the plan and delegates to `start()` if a DFlash plan is configured.
+   * For non-DFlash llama-server use (e.g. `requiresKernel` for turbo3
+   * without spec decoding), the catalog can declare an `optimizations`
+   * block without `dflash` and we still launch the server here.
+   */
+  async load(plan: BackendPlan): Promise<void> {
+    const catalog =
+      plan.catalog ??
+      (plan.modelId ? findCatalogModel(plan.modelId) : undefined);
+    const dflash = catalog?.runtime?.dflash;
+    const optimizations = catalog?.runtime?.optimizations ?? null;
+
+    if (!dflash) {
+      throw new Error(
+        `[dflash] llama-server backend currently requires a catalog 'runtime.dflash' block. Model '${plan.modelId ?? plan.modelPath}' has none — declare DFlash or route this model through node-llama-cpp.`,
+      );
+    }
+
+    // The drafter is resolved from the registry by the engine before this
+    // dispatcher call, but the engine no longer pre-builds the dflash plan,
+    // so we resolve it here. Inline import avoids the engine ↔ dflash-server
+    // import cycle.
+    const { listInstalledModels } = await import("./registry");
+    const installed = await listInstalledModels();
+    const target =
+      installed.find((m) => m.path === plan.modelPath) ??
+      installed.find((m) => m.id === plan.modelId);
+    if (!target) {
+      throw new Error(
+        `[dflash] No installed model matched plan path/id (${plan.modelPath}; ${plan.modelId ?? "no id"}).`,
+      );
+    }
+    const drafter = installed.find((m) => m.id === dflash.drafterModelId);
+    if (!drafter) {
+      throw new Error(
+        `[dflash] ${target.displayName} requires companion drafter ${dflash.drafterModelId}; install it first.`,
+      );
+    }
+
+    await this.start(
+      {
+        targetModelPath: target.path,
+        drafterModelPath: drafter.path,
+        contextSize: dflash.contextSize,
+        draftContextSize: dflash.draftContextSize,
+        draftMin: dflash.draftMin,
+        draftMax: dflash.draftMax,
+        gpuLayers: dflash.gpuLayers,
+        draftGpuLayers: dflash.draftGpuLayers,
+        disableThinking: dflash.disableThinking,
+      },
+      optimizations,
+    );
+  }
+
+  /** Backend interface alias for stop(). */
+  async unload(): Promise<void> {
+    await this.stop();
+  }
+
+  async start(
+    plan: DflashServerPlan,
+    optimizations?: LocalRuntimeOptimizations | null,
+  ): Promise<void> {
     if (
       this.child &&
       this.loadedPlan?.targetModelPath === plan.targetModelPath &&
@@ -365,6 +548,14 @@ export class DflashLlamaServer {
     );
     const port = await resolvePort();
     const host = process.env.ELIZA_DFLASH_HOST?.trim() || DEFAULT_HOST;
+    // Parallel batching default. Backwards compat: ELIZA_DFLASH_PARALLEL
+    // remains the original DFlash-specific knob; ELIZA_LOCAL_PARALLEL is
+    // the generalised name shared with the Cache Bridge agent's runtime
+    // bump. The generalised env wins when both are set.
+    const parallelEnv =
+      process.env.ELIZA_LOCAL_PARALLEL?.trim() ||
+      process.env.ELIZA_DFLASH_PARALLEL?.trim() ||
+      String(optimizations?.parallel ?? 1);
     const args = [
       "--model",
       plan.targetModelPath,
@@ -389,7 +580,7 @@ export class DflashLlamaServer {
       "--draft-max",
       String(plan.draftMax),
       "--parallel",
-      process.env.ELIZA_DFLASH_PARALLEL?.trim() || "1",
+      parallelEnv,
       "--metrics",
       "--jinja",
     ];
@@ -414,6 +605,8 @@ export class DflashLlamaServer {
       args.push("--cache-type-v", cacheTypeV);
     }
 
+    appendOptimizationFlags(args, optimizations ?? null);
+
     const extra = process.env.ELIZA_DFLASH_LLAMA_ARGS?.trim();
     if (extra && isMetalDflashRuntime()) {
       for (const cacheType of METAL_UNSUPPORTED_CACHE_TYPES) {
@@ -464,7 +657,9 @@ export class DflashLlamaServer {
     ]);
   }
 
-  async generate(args: DflashGenerateArgs): Promise<string> {
+  async generate(
+    args: DflashGenerateArgs | BackendGenerateArgs,
+  ): Promise<string> {
     if (!this.baseUrl) {
       throw new Error("[dflash] llama-server is not running");
     }