Skip to content

Commit 9b339fe

Browse files
committed
fix(local-inference): harden desktop local ASR path
1 parent f4c3d0b commit 9b339fe

7 files changed

Lines changed: 177 additions & 23 deletions

File tree

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
import { describe, expect, it } from "vitest";
2+
import { isAllowedOrigin } from "./server-cors";
3+
4+
describe("server CORS origin allowlist", () => {
5+
it("allows the packaged Electrobun views scheme used by the desktop renderer", () => {
6+
expect(isAllowedOrigin("views://")).toBe(true);
7+
});
8+
9+
it("continues to reject untrusted custom browser schemes", () => {
10+
expect(isAllowedOrigin("evil://localhost")).toBe(false);
11+
});
12+
});

packages/app-core/src/api/server-cors.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ const CAPACITOR_WEBVIEW_ORIGINS: ReadonlySet<string> = new Set([
9191
* these origins; they are used by packaged/native app shells.
9292
*/
9393
const NATIVE_WEBVIEW_PROTOCOLS: ReadonlySet<string> = new Set([
94+
"views:",
9495
"capacitor:",
9596
"capacitor-electron:",
9697
"ionic:",
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
import { describe, expect, it } from "vitest";
2+
import {
3+
encodeMonoPcm16Wav,
4+
isSilentPcmAudio,
5+
measurePcmAudio,
6+
} from "./local-asr-capture";
7+
8+
describe("local ASR capture", () => {
9+
it("detects truly silent PCM before sending it to ASR", () => {
10+
const pcm = new Float32Array(16000);
11+
12+
expect(measurePcmAudio(pcm)).toEqual({ rms: 0, peak: 0 });
13+
expect(isSilentPcmAudio(pcm)).toBe(true);
14+
});
15+
16+
it("keeps low but real microphone signal eligible for ASR", () => {
17+
const pcm = new Float32Array(16000);
18+
pcm[1200] = 0.001;
19+
pcm[1201] = -0.001;
20+
21+
expect(measurePcmAudio(pcm).peak).toBeCloseTo(0.001);
22+
expect(isSilentPcmAudio(pcm)).toBe(false);
23+
});
24+
25+
it("encodes mono PCM16 WAV with the requested sample rate", () => {
26+
const wav = encodeMonoPcm16Wav(new Float32Array([0, 1, -1]), 16000);
27+
const view = new DataView(wav.buffer);
28+
29+
expect(String.fromCharCode(...wav.slice(0, 4))).toBe("RIFF");
30+
expect(String.fromCharCode(...wav.slice(8, 12))).toBe("WAVE");
31+
expect(view.getUint32(24, true)).toBe(16000);
32+
expect(view.getUint32(40, true)).toBe(6);
33+
});
34+
});

packages/ui/src/voice/local-asr-capture.ts

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@ export interface LocalAsrRecorder {
33
cancel(): void;
44
}
55

6+
/**
 * Level statistics for a PCM sample buffer, as produced by
 * `measurePcmAudio`.
 */
export interface PcmAudioStats {
  /** Root-mean-square level of the buffer. */
  rms: number;
  /** Largest absolute sample value found in the buffer. */
  peak: number;
}
10+
611
type AudioContextConstructor = typeof AudioContext;
712

813
type WindowWithAudioContext = Window & {
@@ -45,6 +50,27 @@ function clampPcm16(value: number): number {
4550
return Math.max(-1, Math.min(1, value));
4651
}
4752

53+
/**
 * Measures the level of a PCM sample buffer.
 *
 * Non-finite samples (NaN, +/-Infinity) are ignored entirely: they contribute
 * neither to the peak nor to the sum of squares, and they are excluded from
 * the RMS sample count so that corrupt samples do not dilute the level of the
 * real ones.
 *
 * @param pcm - Samples to measure.
 * @returns The RMS and absolute peak of the finite samples. Both are 0 when
 *   the buffer is empty or contains no finite samples.
 */
export function measurePcmAudio(pcm: Float32Array): PcmAudioStats {
  let sumSquares = 0;
  let peak = 0;
  let finiteCount = 0;
  for (const sample of pcm) {
    if (!Number.isFinite(sample)) continue;
    finiteCount += 1;
    peak = Math.max(peak, Math.abs(sample));
    sumSquares += sample * sample;
  }

  return {
    // Average over the samples that were actually summed; guard the empty case
    // so the result is 0 rather than NaN (0 / 0).
    rms: finiteCount === 0 ? 0 : Math.sqrt(sumSquares / finiteCount),
    peak,
  };
}
68+
69+
/**
 * Reports whether a PCM buffer should be treated as silent.
 *
 * A buffer is silent only when BOTH its peak is below 0.0001 and its RMS is
 * below 0.00001, as measured by `measurePcmAudio`.
 *
 * @param pcm - Samples to inspect.
 * @returns `true` when the buffer is below both thresholds.
 */
export function isSilentPcmAudio(pcm: Float32Array): boolean {
  const level = measurePcmAudio(pcm);
  const peakBelowFloor = level.peak < 0.0001;
  const rmsBelowFloor = level.rms < 0.00001;
  return peakBelowFloor && rmsBelowFloor;
}
73+
4874
export function encodeMonoPcm16Wav(
4975
pcm: Float32Array,
5076
sampleRateHz: number,
@@ -154,6 +180,11 @@ export async function startLocalAsrRecorder(): Promise<LocalAsrRecorder> {
154180
if (pcm.length === 0) {
155181
throw new Error("No microphone audio was captured for local ASR");
156182
}
183+
if (isSilentPcmAudio(pcm)) {
184+
throw new Error(
185+
"Microphone audio was silent; check the selected input device and try again",
186+
);
187+
}
157188
return encodeMonoPcm16Wav(pcm, sampleRate);
158189
},
159190
cancel() {

plugins/plugin-local-inference/src/services/voice/engine-bridge.ts

Lines changed: 37 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -93,9 +93,11 @@ import {
9393
SpeakerPresetCache,
9494
} from "./speaker-preset-cache";
9595
import {
96+
type AsrBackendPreference,
9697
ASR_SAMPLE_RATE,
9798
AsrUnavailableError,
9899
createStreamingTranscriber,
100+
readAsrBackendPreferenceFromEnv,
99101
resampleLinear,
100102
} from "./transcriber";
101103
import type {
@@ -1488,6 +1490,8 @@ export class EngineVoiceBridge {
14881490
*/
14891491
createStreamingTranscriber(opts?: {
14901492
vad?: VadEventSource;
1493+
prefer?: AsrBackendPreference;
1494+
allowOpenVinoWhisper?: boolean;
14911495
}): StreamingTranscriber {
14921496
this.assertVoiceOn("create streaming transcriber");
14931497
const contextRef = this.ffiContextRef;
@@ -1496,6 +1500,8 @@ export class EngineVoiceBridge {
14961500
getContext: contextRef ? () => contextRef.ensure() : undefined,
14971501
asrBundlePresent: this.asrAvailable,
14981502
vad: opts?.vad,
1503+
prefer: opts?.prefer,
1504+
allowOpenVinoWhisper: opts?.allowOpenVinoWhisper,
14991505
});
15001506
}
15011507

@@ -1518,6 +1524,36 @@ export class EngineVoiceBridge {
15181524
? signal.reason
15191525
: new DOMException("Aborted", "AbortError");
15201526
}
1527+
const transcribeWithStreaming = async (
1528+
prefer?: AsrBackendPreference,
1529+
): Promise<string> => {
1530+
const transcriber = this.createStreamingTranscriber(
1531+
prefer ? { prefer } : undefined,
1532+
);
1533+
const abort = () => transcriber.dispose();
1534+
try {
1535+
signal?.addEventListener("abort", abort, { once: true });
1536+
transcriber.feed({
1537+
pcm: args.pcm,
1538+
sampleRate: args.sampleRate,
1539+
timestampMs: 0,
1540+
});
1541+
const final = await transcriber.flush();
1542+
if (signal?.aborted) {
1543+
throw signal.reason instanceof Error
1544+
? signal.reason
1545+
: new DOMException("Aborted", "AbortError");
1546+
}
1547+
return final.partial;
1548+
} finally {
1549+
signal?.removeEventListener("abort", abort);
1550+
transcriber.dispose();
1551+
}
1552+
};
1553+
const asrPreference = readAsrBackendPreferenceFromEnv();
1554+
if (asrPreference === "openvino-whisper") {
1555+
return await transcribeWithStreaming("openvino-whisper");
1556+
}
15211557
const backendBatch = this.backend as OmniVoiceBackend & {
15221558
transcribe?: (args: TranscriptionAudio) => Promise<string>;
15231559
};
@@ -1554,26 +1590,7 @@ export class EngineVoiceBridge {
15541590
}
15551591
return transcript;
15561592
}
1557-
const transcriber = this.createStreamingTranscriber();
1558-
const abort = () => transcriber.dispose();
1559-
try {
1560-
signal?.addEventListener("abort", abort, { once: true });
1561-
transcriber.feed({
1562-
pcm: args.pcm,
1563-
sampleRate: args.sampleRate,
1564-
timestampMs: 0,
1565-
});
1566-
const final = await transcriber.flush();
1567-
if (signal?.aborted) {
1568-
throw signal.reason instanceof Error
1569-
? signal.reason
1570-
: new DOMException("Aborted", "AbortError");
1571-
}
1572-
return final.partial;
1573-
} finally {
1574-
signal?.removeEventListener("abort", abort);
1575-
transcriber.dispose();
1576-
}
1593+
return await transcribeWithStreaming(asrPreference ?? undefined);
15771594
}
15781595

15791596
/**

plugins/plugin-local-inference/src/services/voice/openvino-whisper-asr.test.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ beforeEach(() => {
6969

7070
afterEach(() => {
7171
mockState.runtimeFixture = null;
72+
vi.unstubAllEnvs();
73+
delete process.env.ELIZA_LOCAL_ASR_BACKEND;
74+
delete process.env.ELIZA_LOCAL_ASR_ALLOW_OPENVINO;
7275
});
7376

7477
describe("createStreamingTranscriber — OpenVINO Whisper tier", () => {

plugins/plugin-local-inference/src/services/voice/transcriber.ts

Lines changed: 59 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,59 @@ export class AsrUnavailableError extends Error {
6868
}
6969
}
7070

71+
/**
 * ASR backend selection accepted by `createStreamingTranscriber` and by the
 * `ELIZA_LOCAL_ASR_BACKEND` environment variable.
 */
export type AsrBackendPreference =
  /** Let the transcriber factory pick; used when nothing else is specified. */
  | "auto"
  /** Fused path; env aliases: `streaming`, `fused-streaming`. */
  | "fused"
  /** Batch path; env aliases: `batch`, `fused-batch`. */
  | "ffi-batch"
  /** OpenVINO Whisper adapter; env aliases: `openvino`, `whisper-openvino`. */
  | "openvino-whisper";
76+
77+
/**
 * Interprets an environment-variable string as a boolean flag.
 *
 * After trimming and lower-casing, only `1`, `true`, `yes` and `on` count as
 * enabled; everything else (including an unset variable) is disabled.
 *
 * @param value - Raw environment value, or `undefined` when unset.
 * @returns `true` only for one of the accepted truthy spellings.
 */
function normalizeBooleanEnv(value: string | undefined): boolean {
  switch (value?.trim().toLowerCase()) {
    case "1":
    case "true":
    case "yes":
    case "on":
      return true;
    default:
      return false;
  }
}
86+
87+
/**
 * Maps a free-form backend name onto a canonical `AsrBackendPreference`.
 *
 * The input is trimmed, lower-cased, and has underscores replaced by hyphens
 * before matching, so `OPENVINO_WHISPER` and `openvino-whisper` are
 * equivalent.
 *
 * @param value - Raw backend name; may be `null` or `undefined`.
 * @returns The canonical preference, or `null` when the input is empty or
 *   not a recognized name.
 */
export function normalizeAsrBackendPreference(
  value: string | null | undefined,
): AsrBackendPreference | null {
  const key = value?.trim().toLowerCase().replace(/_/g, "-");
  if (!key) return null;

  if (key === "auto") return "auto";

  if (key === "fused" || key === "streaming" || key === "fused-streaming") {
    return "fused";
  }

  if (key === "batch" || key === "ffi-batch" || key === "fused-batch") {
    return "ffi-batch";
  }

  if (
    key === "openvino" ||
    key === "openvino-whisper" ||
    key === "whisper-openvino"
  ) {
    return "openvino-whisper";
  }

  return null;
}
111+
112+
/**
 * Reads the ASR backend preference from `ELIZA_LOCAL_ASR_BACKEND`.
 *
 * @param env - Environment to read from; defaults to `process.env`.
 * @returns The normalized preference, or `null` when the variable is unset,
 *   empty, or not a recognized backend name.
 */
export function readAsrBackendPreferenceFromEnv(
  env: NodeJS.ProcessEnv = process.env,
): AsrBackendPreference | null {
  const rawPreference = env.ELIZA_LOCAL_ASR_BACKEND;
  return normalizeAsrBackendPreference(rawPreference);
}
117+
118+
/**
 * Reports whether `ELIZA_LOCAL_ASR_ALLOW_OPENVINO` is set to a truthy value.
 *
 * @param env - Environment to read from; defaults to `process.env`.
 * @returns `true` when the variable holds one of the spellings accepted by
 *   `normalizeBooleanEnv`.
 */
function allowOpenVinoWhisperFromEnv(
  env: NodeJS.ProcessEnv = process.env,
): boolean {
  const rawFlag = env.ELIZA_LOCAL_ASR_ALLOW_OPENVINO;
  return normalizeBooleanEnv(rawFlag);
}
123+
71124
/* ==================================================================== *
72125
* Shared base — event fan-out, VAD gating, word detection.
73126
* ==================================================================== */
@@ -744,7 +797,7 @@ export interface CreateStreamingTranscriberOptions {
744797
* `"auto"` (default) → fused streaming → fused batch →
745798
* OpenVINO whisper (when enabled) → throw.
746799
*/
747-
prefer?: "auto" | "fused" | "ffi-batch" | "openvino-whisper";
800+
prefer?: AsrBackendPreference;
748801
/**
749802
* Permit the OpenVINO Whisper adapter (NPU→CPU autoprobe). Off by default
750803
* — Eliza-1 voice bridges run only the fused path. Set explicitly to `true`
@@ -771,8 +824,11 @@ export interface CreateStreamingTranscriberOptions {
771824
export function createStreamingTranscriber(
772825
opts: CreateStreamingTranscriberOptions = {},
773826
): StreamingTranscriber {
774-
const prefer = opts.prefer ?? "auto";
775-
const allowOpenVinoWhisper = opts.allowOpenVinoWhisper === true;
827+
const prefer = opts.prefer ?? readAsrBackendPreferenceFromEnv() ?? "auto";
828+
const allowOpenVinoWhisper =
829+
opts.allowOpenVinoWhisper === true ||
830+
prefer === "openvino-whisper" ||
831+
allowOpenVinoWhisperFromEnv();
776832

777833
const tryFusedStreaming = (): StreamingTranscriber | null => {
778834
if (!opts.ffi || !opts.getContext) return null;

0 commit comments

Comments
 (0)