@@ -26,7 +26,9 @@ import {
2626 type IAgentRuntime ,
2727 logger ,
2828 ModelType ,
29+ type TextToSpeechParams ,
2930 type TextEmbeddingParams ,
31+ type TranscriptionParams ,
3032} from "@elizaos/core" ;
3133import {
3234 type LocalInferenceLoader ,
@@ -47,6 +49,10 @@ import { handlerRegistry } from "../services/local-inference/handler-registry";
4749import { listInstalledModels } from "../services/local-inference/registry" ;
4850import { installRouterHandler } from "../services/local-inference/router-handler" ;
4951import type { AgentModelSlot } from "../services/local-inference/types" ;
52+ import {
53+ decodeMonoPcm16Wav ,
54+ type TranscriptionAudio ,
55+ } from "../services/local-inference/voice" ;
5056import { getRuntimeMode } from "./mode/runtime-mode" ;
5157
5258type GenerateTextHandler = (
@@ -64,13 +70,36 @@ type EmbeddingHandler = (
6470 params : TextEmbeddingParams | string | null ,
6571) => Promise < number [ ] > ;
6672
/**
 * Signature of the handler registered for `ModelType.TEXT_TO_SPEECH`.
 *
 * The input is either the bare text to speak or a `TextToSpeechParams`
 * object; the handler resolves to the synthesized audio as raw bytes.
 */
type TextToSpeechHandler = (
  runtime: IAgentRuntime,
  params: TextToSpeechParams | string,
) => Promise<Uint8Array>;
77+
/**
 * Signature of the handler registered for `ModelType.TRANSCRIPTION`.
 *
 * The parameter union covers the core `TranscriptionParams` shape, raw
 * bytes, a string, and the local `LocalTranscriptionParams` shape; the
 * handler resolves to the transcribed text.
 */
type TranscriptionHandler = (
  runtime: IAgentRuntime,
  params: TranscriptionParams | Buffer | string | LocalTranscriptionParams,
) => Promise<string>;
82+
/**
 * Object-form transcription input accepted by the local voice handler.
 *
 * Callers supply either already-decoded samples (`pcm` plus a sample rate)
 * or encoded bytes (`audio`). All fields are optional at the type level;
 * the handler validates the combination at runtime.
 */
interface LocalTranscriptionParams {
  /** Decoded audio samples; must be paired with a sample rate. */
  pcm?: Float32Array;
  /** Encoded audio bytes, decoded by the handler via `decodeMonoPcm16Wav`. */
  audio?: Uint8Array | ArrayBuffer | Buffer;
  /** Sample rate of `pcm`; takes precedence over `sampleRate`. */
  sampleRateHz?: number;
  /** Fallback sample rate of `pcm`, read only when `sampleRateHz` is absent. */
  sampleRate?: number;
}
89+
/**
 * Any handler shape this module registers on (or reads back from) the
 * runtime: text generation, embeddings, text-to-speech, or transcription.
 */
type LocalModelHandler =
  | GenerateTextHandler
  | EmbeddingHandler
  | TextToSpeechHandler
  | TranscriptionHandler;
95+
6796type RuntimeWithModelRegistration = AgentRuntime & {
6897 getModel : (
6998 modelType : string | number ,
70- ) => GenerateTextHandler | EmbeddingHandler | undefined ;
99+ ) => LocalModelHandler | undefined ;
71100 registerModel : (
72101 modelType : string | number ,
73- handler : GenerateTextHandler | EmbeddingHandler ,
102+ handler : LocalModelHandler ,
74103 provider : string ,
75104 priority ?: number ,
76105 ) => void ;
@@ -295,6 +324,85 @@ function makeEmbeddingHandler(): EmbeddingHandler {
295324 } ;
296325}
297326
/**
 * Pull the text to synthesize out of a TEXT_TO_SPEECH input.
 *
 * @param params - Either the text itself or an object carrying it in `text`.
 * @returns The text exactly as supplied (no trimming or validation of length).
 * @throws Error when `params` is neither a string nor an object whose `text`
 *   property is a string (including `null`/`undefined`).
 */
function extractSpeechText(params: TextToSpeechParams | string): string {
  if (typeof params !== "string") {
    // Optional chaining covers a null/undefined input slipping past the types.
    const candidate = params?.text;
    if (typeof candidate !== "string") {
      throw new Error(
        "[local-inference] TEXT_TO_SPEECH requires a string or { text } input",
      );
    }
    return candidate;
  }
  return params;
}
334+
/**
 * Build the TEXT_TO_SPEECH handler backed by the local inference engine.
 *
 * The returned handler ignores the runtime argument, extracts the text via
 * `extractSpeechText`, rejects an empty string, and forwards the text to
 * `localInferenceEngine.synthesizeSpeech`.
 *
 * @returns A handler resolving to whatever `synthesizeSpeech` yields.
 */
function makeTextToSpeechHandler(): TextToSpeechHandler {
  const handler: TextToSpeechHandler = async (_runtime, params) => {
    const speechText = extractSpeechText(params);
    if (!speechText) {
      throw new Error("[local-inference] TEXT_TO_SPEECH text must be non-empty");
    }
    // The text is passed through untouched on purpose: singing, emotion tags
    // and lyrical phrasing are not filtered at this layer. The local voice
    // bundle declares its expressive capability in its manifest, and runtime
    // safety policy is applied above this model adapter.
    return localInferenceEngine.synthesizeSpeech(speechText);
  };
  return handler;
}
347+
/**
 * View binary input as a plain `Uint8Array` without copying.
 *
 * @param value - A typed byte array (including `Buffer`) or an `ArrayBuffer`.
 * @returns For a `Uint8Array`/`Buffer`, a new view over the same backing
 *   buffer covering the same offset and length; for an `ArrayBuffer`, a view
 *   spanning the whole buffer.
 */
function toUint8Array(value: Uint8Array | ArrayBuffer | Buffer): Uint8Array {
  if (!(value instanceof Uint8Array)) {
    return new Uint8Array(value);
  }
  // Preserve the window: the source may be a slice of a larger buffer.
  const { buffer, byteOffset, byteLength } = value;
  return new Uint8Array(buffer, byteOffset, byteLength);
}
354+
/**
 * Normalize a TRANSCRIPTION input into the `TranscriptionAudio` value that
 * is handed to the local voice runtime.
 *
 * Accepted inputs:
 * - raw bytes (`Uint8Array`, `Buffer`, or `ArrayBuffer`), decoded with
 *   `decodeMonoPcm16Wav`;
 * - `{ pcm, sampleRateHz }` where `pcm` is a `Float32Array`; `sampleRate` is
 *   read as a fallback when `sampleRateHz` is null/undefined;
 * - `{ audio }` holding bytes, decoded the same way as raw bytes.
 *
 * Rejected inputs: plain strings and `{ audioUrl }` objects, because this
 * provider does not fetch URLs or paths.
 *
 * @param params - The model-call input in any of the shapes above.
 * @returns Samples plus sample rate, either passed through or decoded.
 * @throws Error for strings, `audioUrl` inputs, non-object inputs, a `pcm`
 *   input whose sample rate is missing, non-finite, or not positive, and any
 *   object that matches none of the accepted shapes.
 */
function extractTranscriptionAudio(
  params: TranscriptionParams | Buffer | string | LocalTranscriptionParams,
): TranscriptionAudio {
  if (typeof params === "string") {
    throw new Error(
      "[local-inference] TRANSCRIPTION via the local voice runtime requires PCM/WAV bytes; URL/path strings are not fetched by this provider",
    );
  }
  // Buffer is a Uint8Array subclass, so this also covers Node buffers.
  if (params instanceof Uint8Array || params instanceof ArrayBuffer) {
    return decodeMonoPcm16Wav(toUint8Array(params));
  }
  if (!params || typeof params !== "object") {
    throw new Error(
      "[local-inference] TRANSCRIPTION requires PCM/WAV bytes or { pcm, sampleRateHz }",
    );
  }
  if ("audioUrl" in params && typeof params.audioUrl === "string") {
    throw new Error(
      "[local-inference] TRANSCRIPTION audioUrl is not fetched by the local voice runtime; pass mono PCM16 WAV bytes or { pcm, sampleRateHz }",
    );
  }
  if ("pcm" in params && params.pcm instanceof Float32Array) {
    const sampleRate =
      ("sampleRateHz" in params ? params.sampleRateHz : undefined) ??
      ("sampleRate" in params ? params.sampleRate : undefined);
    // `NaN <= 0` is false, so a bare `<= 0` test would let NaN (and Infinity)
    // through; require a finite value explicitly.
    if (
      typeof sampleRate !== "number" ||
      !Number.isFinite(sampleRate) ||
      sampleRate <= 0
    ) {
      throw new Error(
        "[local-inference] TRANSCRIPTION { pcm } requires a positive sampleRateHz",
      );
    }
    return { pcm: params.pcm, sampleRate };
  }
  if (
    "audio" in params &&
    (params.audio instanceof Uint8Array ||
      params.audio instanceof ArrayBuffer)
  ) {
    return decodeMonoPcm16Wav(toUint8Array(params.audio));
  }
  throw new Error(
    "[local-inference] TRANSCRIPTION requires mono PCM16 WAV bytes or { pcm, sampleRateHz } for the local voice runtime",
  );
}
398+
/**
 * Build the TRANSCRIPTION handler backed by the local inference engine.
 *
 * The returned handler ignores the runtime argument, normalizes the input
 * with `extractTranscriptionAudio` (which throws on unsupported shapes), and
 * passes the result to `localInferenceEngine.transcribePcm`.
 *
 * @returns A handler resolving to whatever `transcribePcm` yields.
 */
function makeTranscriptionHandler(): TranscriptionHandler {
  return async (_runtime, params) =>
    localInferenceEngine.transcribePcm(extractTranscriptionAudio(params));
}
405+
298406/**
299407 * Register the device-bridge loader on the runtime. Accepts load/generate
300408 * calls whether or not a mobile device is currently connected — parked
@@ -535,6 +643,29 @@ export async function ensureLocalInferenceHandler(
535643 }
536644 }
537645
646+ try {
647+ runtimeWithRegistration . registerModel (
648+ ModelType . TEXT_TO_SPEECH ,
649+ makeTextToSpeechHandler ( ) ,
650+ provider ,
651+ LOCAL_INFERENCE_PRIORITY ,
652+ ) ;
653+ runtimeWithRegistration . registerModel (
654+ ModelType . TRANSCRIPTION ,
655+ makeTranscriptionHandler ( ) ,
656+ provider ,
657+ LOCAL_INFERENCE_PRIORITY ,
658+ ) ;
659+ logger . info (
660+ `[local-inference] Registered ${ provider } voice handlers for TEXT_TO_SPEECH / TRANSCRIPTION at priority ${ LOCAL_INFERENCE_PRIORITY } ` ,
661+ ) ;
662+ } catch ( err ) {
663+ logger . warn (
664+ "[local-inference] Could not register local voice handlers" ,
665+ err instanceof Error ? err . message : String ( err ) ,
666+ ) ;
667+ }
668+
538669 logger . info (
539670 `[local-inference] Registered ${ provider } llama.cpp handler for TEXT_SMALL / TEXT_LARGE at priority ${ LOCAL_INFERENCE_PRIORITY } ` ,
540671 ) ;
0 commit comments