elizaOS
diff --git a/‎packages/app-core/scripts/build-llama-cpp-dflash.mjs‎
Lines changed: 7 additions & 8 deletions b/‎packages/app-core/scripts/build-llama-cpp-dflash.mjs‎
Lines changed: 7 additions & 8 deletions
diff --git a/‎packages/app-core/scripts/kernel-patches/metal-kernels.mjs‎
Lines changed: 37 additions & 2 deletions b/‎packages/app-core/scripts/kernel-patches/metal-kernels.mjs‎
Lines changed: 37 additions & 2 deletions
diff --git a/‎packages/app-core/src/runtime/ensure-local-inference-handler.ts‎
Lines changed: 133 additions & 2 deletions b/‎packages/app-core/src/runtime/ensure-local-inference-handler.ts‎
Lines changed: 133 additions & 2 deletions
diff --git a/‎packages/app-core/src/services/local-inference/catalog.test.ts‎
Lines changed: 2 additions & 2 deletions b/‎packages/app-core/src/services/local-inference/catalog.test.ts‎
Lines changed: 2 additions & 2 deletions
@@ -1078,14 +1078,13 @@ function ensureCheckout(cacheDir, ref) {
 //
 // What still doesn't fully ship at v0.4.0-milady (deferred dispatch wiring):
 //
-//   * ggml-metal-ops.cpp / ggml-metal-device.m have NO dispatch sites for
-//     the milady quant types (TBQ3_0, TBQ4_0, TBQ3_TCQ, QJL1_256, Q4_POLAR).
-//     CUDA has them; Metal does not. After this patch the kernel symbols
-//     (kernel_turbo3_dot, kernel_attn_score_qjl1_256, kernel_mul_mv_q4_polar_f32,
-//     etc.) are present in default.metallib and `nm`/`strings` will see
-//     them, but the runtime cannot yet select them via GGML_TYPE_*. That
-//     wiring is a separate fork-internals patch and is the next agent's
-//     mission.
+//   * ggml-metal-ops.cpp / ggml-metal-device.m have a dedicated, smoke-tested
+//     dispatch site only for GGML_OP_ATTN_SCORE_QJL -> QJL1_256 attention
+//     scoring, now routed through kernel_attn_score_qjl1_256_multi to
+//     amortize launch overhead. TBQ3_0, TBQ4_0, TBQ3_TCQ, and Q4_POLAR still
+//     ship only as symbols in default.metallib. CUDA has those runtime routes;
+//     Metal does not yet. That wiring is a separate fork-internals patch and
+//     remains publish-blocking.
 //
 //   * The EMBED_LIBRARY=ON branch (used by iOS targets) is also patched:
 //     it compiles ggml-metal.metal + the milady standalones as separate
 
@@ -389,7 +389,12 @@ function patchMetalQjlAttnDeviceCpp(cacheDir, { dryRun }) {
   const cppPath = path.join(cacheDir, "ggml", "src", "ggml-metal", "ggml-metal-device.cpp");
   const original = fs.readFileSync(cppPath, "utf8");
   if (original.includes(SENTINEL_QJL_ATTN)) {
-    return { changed: false, path: cppPath };
+    const upgraded = original.replace(
+      'const char * name = "kernel_attn_score_qjl1_256";',
+      'const char * name = "kernel_attn_score_qjl1_256_multi";',
+    );
+    if (upgraded !== original && !dryRun) fs.writeFileSync(cppPath, upgraded, "utf8");
+    return { changed: upgraded !== original && !dryRun, path: cppPath };
   }
   const anchor = `ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_bin(ggml_metal_library_t lib, const ggml_tensor * op, int32_t n_fuse) {`;
   if (!original.includes(anchor)) {
@@ -447,7 +452,37 @@ function patchMetalQjlAttnOpsCpp(cacheDir, { dryRun }) {
   const opsPath = path.join(cacheDir, "ggml", "src", "ggml-metal", "ggml-metal-ops.cpp");
   const original = fs.readFileSync(opsPath, "utf8");
   if (original.includes(SENTINEL_QJL_ATTN)) {
-    return { changed: false, path: opsPath };
+    let upgraded = original.replace(
+      `struct milady_qjl_score_args {
+    uint32_t n_heads;
+    uint32_t n_kv_heads;
+    uint32_t n_tokens;
+    uint32_t proj_dim;
+};`,
+      `struct milady_qjl_score_args {
+    uint32_t n_heads;
+    uint32_t n_kv_heads;
+    uint32_t n_tokens;
+    uint32_t proj_dim;
+    uint32_t tokens_per_threadgroup;
+};`,
+    );
+    upgraded = upgraded.replace(
+      `        /* n_tokens   = */ n_tokens,
+        /* proj_dim   = */ 256u,
+    };`,
+      `        /* n_tokens   = */ n_tokens,
+        /* proj_dim   = */ 256u,
+        /* tokens_per_threadgroup = */ 32u,
+    };`,
+    );
+    upgraded = upgraded.replace(
+      `            ggml_metal_encoder_dispatch_threadgroups(enc, (int) n_heads, (int) n_tokens, 1, 32, 1, 1);`,
+      `            const int token_groups = (int) ((n_tokens + args.tokens_per_threadgroup - 1u) / args.tokens_per_threadgroup);
+            ggml_metal_encoder_dispatch_threadgroups(enc, (int) n_heads, token_groups, 1, 32, 1, 1);`,
+    );
+    if (upgraded !== original && !dryRun) fs.writeFileSync(opsPath, upgraded, "utf8");
+    return { changed: upgraded !== original && !dryRun, path: opsPath };
   }
 
   const funcAnchor = `static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {`;
 
@@ -26,7 +26,9 @@ import {
   type IAgentRuntime,
   logger,
   ModelType,
+  type TextToSpeechParams,
   type TextEmbeddingParams,
+  type TranscriptionParams,
 } from "@elizaos/core";
 import {
   type LocalInferenceLoader,
@@ -47,6 +49,10 @@ import { handlerRegistry } from "../services/local-inference/handler-registry";
 import { listInstalledModels } from "../services/local-inference/registry";
 import { installRouterHandler } from "../services/local-inference/router-handler";
 import type { AgentModelSlot } from "../services/local-inference/types";
+import {
+  decodeMonoPcm16Wav,
+  type TranscriptionAudio,
+} from "../services/local-inference/voice";
 import { getRuntimeMode } from "./mode/runtime-mode";
 
 type GenerateTextHandler = (
@@ -64,13 +70,36 @@ type EmbeddingHandler = (
   params: TextEmbeddingParams | string | null,
 ) => Promise<number[]>;
 
+/**
+ * Handler signature registered for `ModelType.TEXT_TO_SPEECH`: takes either
+ * the raw text or a params object and resolves to audio bytes.
+ */
+type TextToSpeechHandler = (
+  runtime: IAgentRuntime,
+  params: TextToSpeechParams | string,
+) => Promise<Uint8Array>;
+
+/**
+ * Handler signature registered for `ModelType.TRANSCRIPTION`: takes one of the
+ * supported audio inputs and resolves to the transcript text.
+ */
+type TranscriptionHandler = (
+  runtime: IAgentRuntime,
+  params: TranscriptionParams | Buffer | string | LocalTranscriptionParams,
+) => Promise<string>;
+
+/**
+ * Extra transcription input shapes accepted by the local voice handlers.
+ * `extractTranscriptionAudio` checks `pcm` first and only then `audio`.
+ */
+interface LocalTranscriptionParams {
+  // Float32 samples; must be accompanied by a positive sample rate.
+  pcm?: Float32Array;
+  // Encoded bytes, passed to `decodeMonoPcm16Wav` when `pcm` is not usable.
+  audio?: Uint8Array | ArrayBuffer | Buffer;
+  // Sample rate for `pcm`; preferred over `sampleRate` when both are set.
+  sampleRateHz?: number;
+  // Alternate spelling of the sample rate, used when `sampleRateHz` is nullish.
+  sampleRate?: number;
+}
+
+/** Union of every handler shape this module registers on the runtime. */
+type LocalModelHandler =
+  | GenerateTextHandler
+  | EmbeddingHandler
+  | TextToSpeechHandler
+  | TranscriptionHandler;
+
 type RuntimeWithModelRegistration = AgentRuntime & {
   getModel: (
     modelType: string | number,
-  ) => GenerateTextHandler | EmbeddingHandler | undefined;
+  ) => LocalModelHandler | undefined;
   registerModel: (
     modelType: string | number,
-    handler: GenerateTextHandler | EmbeddingHandler,
+    handler: LocalModelHandler,
     provider: string,
     priority?: number,
   ) => void;
@@ -295,6 +324,85 @@ function makeEmbeddingHandler(): EmbeddingHandler {
   };
 }
 
+/**
+ * Pull the text to synthesize out of a TEXT_TO_SPEECH request.
+ *
+ * @param params Either the text itself or an object carrying a string `text`.
+ * @returns The text exactly as supplied (no trimming or filtering).
+ * @throws Error when no string text can be found on the input.
+ */
+function extractSpeechText(params: TextToSpeechParams | string): string {
+  const candidate = typeof params === "string" ? params : params?.text;
+  if (typeof candidate !== "string") {
+    throw new Error(
+      "[local-inference] TEXT_TO_SPEECH requires a string or { text } input",
+    );
+  }
+  return candidate;
+}
+
+/**
+ * Build the TEXT_TO_SPEECH handler backed by the local inference engine.
+ *
+ * The returned handler rejects empty text and otherwise forwards the text
+ * unchanged to `localInferenceEngine.synthesizeSpeech`.
+ */
+function makeTextToSpeechHandler(): TextToSpeechHandler {
+  const handler: TextToSpeechHandler = async (_runtime, params) => {
+    const text = extractSpeechText(params);
+    if (text === "") {
+      throw new Error("[local-inference] TEXT_TO_SPEECH text must be non-empty");
+    }
+    // Deliberately no filtering of singing, emotion tags, or lyrical
+    // phrasing: the local voice bundle advertises its expressive capability
+    // in the manifest, and runtime safety policy lives above this adapter.
+    return localInferenceEngine.synthesizeSpeech(text);
+  };
+  return handler;
+}
+
+/**
+ * Present byte-like input as a plain `Uint8Array`.
+ *
+ * Typed-array input is re-wrapped as a new view over the same underlying
+ * buffer, offset, and length; anything else is handed straight to the
+ * `Uint8Array` constructor.
+ */
+function toUint8Array(value: Uint8Array | ArrayBuffer | Buffer): Uint8Array {
+  if (!(value instanceof Uint8Array)) {
+    return new Uint8Array(value);
+  }
+  const { buffer, byteOffset, byteLength } = value;
+  return new Uint8Array(buffer, byteOffset, byteLength);
+}
+
+/**
+ * Normalize a TRANSCRIPTION request into the `TranscriptionAudio` shape the
+ * local voice runtime consumes.
+ *
+ * Accepted inputs, checked in this order:
+ *  - raw bytes (`Uint8Array`, which includes Node `Buffer`, or `ArrayBuffer`),
+ *    decoded with `decodeMonoPcm16Wav`;
+ *  - `{ pcm, sampleRateHz }` (or the `sampleRate` alias) with Float32 samples;
+ *  - `{ audio }` holding bytes, decoded with `decodeMonoPcm16Wav`.
+ *
+ * Strings and `{ audioUrl }` are rejected: this provider never fetches audio.
+ *
+ * @throws Error when the input is a string or URL, is not an object, carries
+ *   a sample rate that is not a finite positive number, or matches no shape.
+ */
+function extractTranscriptionAudio(
+  params: TranscriptionParams | Buffer | string | LocalTranscriptionParams,
+): TranscriptionAudio {
+  if (typeof params === "string") {
+    throw new Error(
+      "[local-inference] TRANSCRIPTION via the local voice runtime requires PCM/WAV bytes; URL/path strings are not fetched by this provider",
+    );
+  }
+  if (params instanceof Uint8Array || params instanceof ArrayBuffer) {
+    return decodeMonoPcm16Wav(toUint8Array(params));
+  }
+  if (!params || typeof params !== "object") {
+    throw new Error(
+      "[local-inference] TRANSCRIPTION requires PCM/WAV bytes or { pcm, sampleRateHz }",
+    );
+  }
+  if ("audioUrl" in params && typeof params.audioUrl === "string") {
+    throw new Error(
+      "[local-inference] TRANSCRIPTION audioUrl is not fetched by the local voice runtime; pass mono PCM16 WAV bytes or { pcm, sampleRateHz }",
+    );
+  }
+  if ("pcm" in params && params.pcm instanceof Float32Array) {
+    const sampleRate =
+      ("sampleRateHz" in params ? params.sampleRateHz : undefined) ??
+      ("sampleRate" in params ? params.sampleRate : undefined);
+    // `NaN <= 0` is false, so a bare `<= 0` test would let NaN (and
+    // Infinity) through to the engine; require a finite value explicitly.
+    if (
+      typeof sampleRate !== "number" ||
+      !Number.isFinite(sampleRate) ||
+      sampleRate <= 0
+    ) {
+      throw new Error(
+        "[local-inference] TRANSCRIPTION { pcm } requires a positive sampleRateHz",
+      );
+    }
+    return { pcm: params.pcm, sampleRate };
+  }
+  if (
+    "audio" in params &&
+    (params.audio instanceof Uint8Array ||
+      params.audio instanceof ArrayBuffer)
+  ) {
+    return decodeMonoPcm16Wav(toUint8Array(params.audio));
+  }
+  throw new Error(
+    "[local-inference] TRANSCRIPTION requires mono PCM16 WAV bytes or { pcm, sampleRateHz } for the local voice runtime",
+  );
+}
+
+/**
+ * Build the TRANSCRIPTION handler backed by the local inference engine.
+ *
+ * The returned handler normalizes its input with `extractTranscriptionAudio`
+ * and passes the result to `localInferenceEngine.transcribePcm`.
+ */
+function makeTranscriptionHandler(): TranscriptionHandler {
+  return async (_runtime, params) =>
+    localInferenceEngine.transcribePcm(extractTranscriptionAudio(params));
+}
+
 /**
  * Register the device-bridge loader on the runtime. Accepts load/generate
  * calls whether or not a mobile device is currently connected — parked
@@ -535,6 +643,29 @@ export async function ensureLocalInferenceHandler(
     }
   }
 
+  try {
+    runtimeWithRegistration.registerModel(
+      ModelType.TEXT_TO_SPEECH,
+      makeTextToSpeechHandler(),
+      provider,
+      LOCAL_INFERENCE_PRIORITY,
+    );
+    runtimeWithRegistration.registerModel(
+      ModelType.TRANSCRIPTION,
+      makeTranscriptionHandler(),
+      provider,
+      LOCAL_INFERENCE_PRIORITY,
+    );
+    logger.info(
+      `[local-inference] Registered ${provider} voice handlers for TEXT_TO_SPEECH / TRANSCRIPTION at priority ${LOCAL_INFERENCE_PRIORITY}`,
+    );
+  } catch (err) {
+    logger.warn(
+      "[local-inference] Could not register local voice handlers",
+      err instanceof Error ? err.message : String(err),
+    );
+  }
+
   logger.info(
     `[local-inference] Registered ${provider} llama.cpp handler for TEXT_SMALL / TEXT_LARGE at priority ${LOCAL_INFERENCE_PRIORITY}`,
   );
 
@@ -86,8 +86,8 @@ describe("local inference catalog", () => {
   });
 
   it("sets contextLength on every Eliza-1 tier per the tier matrix", () => {
-    // Per packages/inference/AGENTS.md §2: lite/mobile = 32k, desktop =
-    // 64k, pro = 128k, server = 256k. The catalog records the largest
+    // Size tiers: 0.6B/1.7B = 32k, 9B = 64k, 27B = 128k,
+    // 27B-256k = 256k. The catalog records the largest
     // ctx the bundle's manifest will advertise for each tier.
     const expected: Record<string, number> = {
       "eliza-1-0_6b": 32768,