@@ -13,7 +13,14 @@ import fs from "node:fs";
1313import net from "node:net" ;
1414import os from "node:os" ;
1515import path from "node:path" ;
16+ import type {
17+ GenerateArgs as BackendGenerateArgs ,
18+ BackendPlan ,
19+ LocalInferenceBackend ,
20+ } from "./backend" ;
21+ import { findCatalogModel } from "./catalog" ;
1622import { localInferenceRoot } from "./paths" ;
23+ import type { LocalRuntimeOptimizations } from "./types" ;
1724
1825export interface DflashServerPlan {
1926 targetModelPath : string ;
@@ -329,7 +336,113 @@ async function fetchJson(
329336 }
330337}
331338
332- export class DflashLlamaServer {
/**
 * Parse a boolean-ish environment variable.
 *
 * @param name - Name of the environment variable to read.
 * @returns `true` for `1`/`true`/`yes`/`on`, `false` for `0`/`false`/`no`/`off`
 *   (case-insensitive, surrounding whitespace ignored), and `undefined` when
 *   the variable is unset or holds any other value (including empty).
 */
function readBoolFlag(name: string): boolean | undefined {
  const raw = process.env[name]?.trim().toLowerCase();
  if (raw === undefined) return undefined;
  if (raw === "1" || raw === "true" || raw === "yes" || raw === "on") {
    return true;
  }
  if (raw === "0" || raw === "false" || raw === "no" || raw === "off") {
    return false;
  }
  return undefined;
}

/**
 * Append optimization flags driven by env overrides + catalog metadata to a
 * llama-server arg list. Env wins over the catalog when both supply the
 * same knob — the operator's escape hatch. An env var that is empty or
 * unparseable counts as "not supplied", so the catalog value still applies.
 *
 * Env mapping (per AGENTS.md):
 *
 *   ELIZA_LOCAL_LOOKAHEAD=N          → --lookahead N (N <= 0 suppresses the flag)
 *   ELIZA_LOCAL_NGRAM=on             → enable n-gram drafter (uses
 *                                      optimizations.ngramDraft when set,
 *                                      else conservative defaults);
 *                                      an explicit "off" disables it even
 *                                      when the catalog configures one
 *   ELIZA_LOCAL_PARALLEL=N           → --parallel N (handled at the call
 *                                      site so the existing default order
 *                                      is preserved; not redone here)
 *   ELIZA_LOCAL_MOE_OFFLOAD=cpu      → -ot ".*=CPU"
 *   ELIZA_LOCAL_MLOCK=1              → --mlock
 *   ELIZA_LOCAL_NO_MMAP=1            → --no-mmap
 *   ELIZA_LOCAL_MMPROJ=<path>        → --mmproj <path>
 *   ELIZA_LOCAL_ALIAS=<name>         → --alias <name>
 *   ELIZA_LOCAL_FLASH_ATTENTION=on   → -fa on
 *
 * @param args - Argument list to extend; mutated in place.
 * @param optimizations - Catalog-declared optimizations, or `null` when the
 *   catalog entry has none.
 * @returns The same `args` array, for chaining-style call sites.
 */
export function appendOptimizationFlags(
  args: string[],
  optimizations: LocalRuntimeOptimizations | null,
): string[] {
  // --lookahead N. A non-numeric env value falls back to the catalog instead
  // of silently dropping the knob; a numeric 0/negative env value still wins
  // and suppresses the flag.
  const lookaheadEnv = process.env.ELIZA_LOCAL_LOOKAHEAD?.trim();
  const lookaheadParsed = lookaheadEnv
    ? Number.parseInt(lookaheadEnv, 10)
    : Number.NaN;
  const lookaheadValue = Number.isNaN(lookaheadParsed)
    ? optimizations?.lookahead
    : lookaheadParsed;
  if (
    typeof lookaheadValue === "number" &&
    Number.isFinite(lookaheadValue) &&
    lookaheadValue > 0
  ) {
    args.push("--lookahead", String(lookaheadValue));
  }

  // N-gram drafter — only meaningful when DFlash is NOT in use (mutually
  // exclusive). Caller is responsible for not setting ngramDraft on a
  // DFlash-configured catalog entry.
  const ngramEnvOn = readBoolFlag("ELIZA_LOCAL_NGRAM");
  const ngramConfig = optimizations?.ngramDraft;
  const ngramEffective =
    ngramEnvOn === false
      ? null
      : (ngramConfig ?? (ngramEnvOn ? { min: 4, max: 8, minProb: 0.5 } : null));
  if (ngramEffective) {
    args.push("--draft-min", String(ngramEffective.min));
    args.push("--draft-max", String(ngramEffective.max));
    args.push("--draft-min-prob", String(ngramEffective.minProb));
  }

  // -ot ".*=CPU" — MoE expert offload to CPU. `||` (not `??`) so an empty
  // env var does not mask the catalog value, matching the other knobs here.
  const moeEnv = process.env.ELIZA_LOCAL_MOE_OFFLOAD?.trim().toLowerCase();
  const moeMode = moeEnv || optimizations?.moeOffload;
  if (moeMode === "cpu") {
    args.push("-ot", ".*=CPU");
  }

  // --mlock
  const mlock = readBoolFlag("ELIZA_LOCAL_MLOCK") ?? optimizations?.mlock;
  if (mlock === true) args.push("--mlock");

  // --no-mmap
  const noMmap = readBoolFlag("ELIZA_LOCAL_NO_MMAP") ?? optimizations?.noMmap;
  if (noMmap === true) args.push("--no-mmap");

  // --mmproj <path>
  const mmproj = process.env.ELIZA_LOCAL_MMPROJ?.trim() || optimizations?.mmproj;
  if (mmproj) args.push("--mmproj", mmproj);

  // --alias <name>
  const alias = process.env.ELIZA_LOCAL_ALIAS?.trim() || optimizations?.alias;
  if (alias) args.push("--alias", alias);

  // -fa on — only the opt-in form is ever emitted; a false/unset value adds
  // nothing, so existing DFlash launches are unchanged unless the operator
  // or catalog opts in.
  const fa =
    readBoolFlag("ELIZA_LOCAL_FLASH_ATTENTION") ?? optimizations?.flashAttention;
  if (fa === true) args.push("-fa", "on");

  return args;
}
442+
443+ export class DflashLlamaServer implements LocalInferenceBackend {
444+ readonly id = "llama-server" as const ;
445+
333446 private child : ChildProcess | null = null ;
334447 private baseUrl : string | null = null ;
335448 private stderrTail : string [ ] = [ ] ;
@@ -343,7 +456,77 @@ export class DflashLlamaServer {
343456 return this . loadedPlan ?. targetModelPath ?? null ;
344457 }
345458
346- async start ( plan : DflashServerPlan ) : Promise < void > {
/**
 * Soft probe for the backend contract.
 *
 * @returns The `enabled` field reported by `getDflashRuntimeStatus()`.
 *   NOTE(review): presumably that status also reflects whether the
 *   llama-server binary resolves — confirm against `getDflashRuntimeStatus`.
 */
async available(): Promise<boolean> {
  const { enabled } = getDflashRuntimeStatus();
  return enabled;
}
463+
/**
 * Unified backend contract entry point. Resolves the catalog entry for the
 * plan (the one attached to the plan, else looked up by `plan.modelId`),
 * locates the installed target and drafter models, and delegates to
 * `start()` with the catalog's `runtime.optimizations` block (if any).
 *
 * A catalog `runtime.dflash` block is currently REQUIRED: models without
 * one are rejected here rather than launched, and should be routed through
 * node-llama-cpp instead.
 *
 * @param plan - Backend plan carrying the model path and, optionally, the
 *   model id and/or catalog entry.
 * @throws Error when the catalog entry has no `runtime.dflash` block, when
 *   no installed model matches the plan's path or id, or when the companion
 *   drafter named by `dflash.drafterModelId` is not installed.
 */
async load(plan: BackendPlan): Promise<void> {
  const catalog =
    plan.catalog ??
    (plan.modelId ? findCatalogModel(plan.modelId) : undefined);
  const dflash = catalog?.runtime?.dflash;
  const optimizations = catalog?.runtime?.optimizations ?? null;

  if (!dflash) {
    throw new Error(
      `[dflash] llama-server backend currently requires a catalog 'runtime.dflash' block. Model '${plan.modelId ?? plan.modelPath}' has none — declare DFlash or route this model through node-llama-cpp.`,
    );
  }

  // The engine no longer pre-builds the dflash plan, so the target and
  // drafter are resolved from the registry here. Inline import avoids the
  // engine ↔ dflash-server import cycle.
  const { listInstalledModels } = await import("./registry");
  const installed = await listInstalledModels();

  // Prefer an exact path match; fall back to the id only when the plan
  // actually carries one, so an undefined id can never match an entry.
  const modelId = plan.modelId;
  const target =
    installed.find((m) => m.path === plan.modelPath) ??
    (modelId !== undefined
      ? installed.find((m) => m.id === modelId)
      : undefined);
  if (!target) {
    throw new Error(
      `[dflash] No installed model matched plan path/id (${plan.modelPath}; ${plan.modelId ?? "no id"}).`,
    );
  }

  const drafter = installed.find((m) => m.id === dflash.drafterModelId);
  if (!drafter) {
    throw new Error(
      `[dflash] ${target.displayName} requires companion drafter ${dflash.drafterModelId}; install it first.`,
    );
  }

  await this.start(
    {
      targetModelPath: target.path,
      drafterModelPath: drafter.path,
      contextSize: dflash.contextSize,
      draftContextSize: dflash.draftContextSize,
      draftMin: dflash.draftMin,
      draftMax: dflash.draftMax,
      gpuLayers: dflash.gpuLayers,
      draftGpuLayers: dflash.draftGpuLayers,
      disableThinking: dflash.disableThinking,
    },
    optimizations,
  );
}
520+
/**
 * Backend-contract counterpart to `load()`.
 *
 * Simply awaits `stop()`; it adds no behaviour of its own.
 */
async unload(): Promise<void> {
  await this.stop();
}
525+
526+ async start (
527+ plan : DflashServerPlan ,
528+ optimizations ?: LocalRuntimeOptimizations | null ,
529+ ) : Promise < void > {
347530 if (
348531 this . child &&
349532 this . loadedPlan ?. targetModelPath === plan . targetModelPath &&
@@ -365,6 +548,14 @@ export class DflashLlamaServer {
365548 ) ;
366549 const port = await resolvePort ( ) ;
367550 const host = process . env . ELIZA_DFLASH_HOST ?. trim ( ) || DEFAULT_HOST ;
551+ // Parallel batching default. Backwards compat: ELIZA_DFLASH_PARALLEL
552+ // remains the original DFlash-specific knob; ELIZA_LOCAL_PARALLEL is
553+ // the generalised name shared with the Cache Bridge agent's runtime
554+ // bump. The generalised env wins when both are set.
555+ const parallelEnv =
556+ process . env . ELIZA_LOCAL_PARALLEL ?. trim ( ) ||
557+ process . env . ELIZA_DFLASH_PARALLEL ?. trim ( ) ||
558+ String ( optimizations ?. parallel ?? 1 ) ;
368559 const args = [
369560 "--model" ,
370561 plan . targetModelPath ,
@@ -389,7 +580,7 @@ export class DflashLlamaServer {
389580 "--draft-max" ,
390581 String ( plan . draftMax ) ,
391582 "--parallel" ,
392- process . env . ELIZA_DFLASH_PARALLEL ?. trim ( ) || "1" ,
583+ parallelEnv ,
393584 "--metrics" ,
394585 "--jinja" ,
395586 ] ;
@@ -414,6 +605,8 @@ export class DflashLlamaServer {
414605 args . push ( "--cache-type-v" , cacheTypeV ) ;
415606 }
416607
608+ appendOptimizationFlags ( args , optimizations ?? null ) ;
609+
417610 const extra = process . env . ELIZA_DFLASH_LLAMA_ARGS ?. trim ( ) ;
418611 if ( extra && isMetalDflashRuntime ( ) ) {
419612 for ( const cacheType of METAL_UNSUPPORTED_CACHE_TYPES ) {
@@ -464,7 +657,9 @@ export class DflashLlamaServer {
464657 ] ) ;
465658 }
466659
467- async generate ( args : DflashGenerateArgs ) : Promise < string > {
660+ async generate (
661+ args : DflashGenerateArgs | BackendGenerateArgs ,
662+ ) : Promise < string > {
468663 if ( ! this . baseUrl ) {
469664 throw new Error ( "[dflash] llama-server is not running" ) ;
470665 }
0 commit comments