;
};
-const openAiVoices: SelectOption[] = [
- { id: "alloy", label: "Alloy" },
- { id: "echo", label: "Echo" },
- { id: "fable", label: "Fable" },
- { id: "onyx", label: "Onyx" },
- { id: "nova", label: "Nova" },
- { id: "shimmer", label: "Shimmer" },
-];
-
-const openAiTtsModels: SelectOption[] = [
- { id: "tts-1", label: "tts-1" },
- { id: "tts-1-hd", label: "tts-1-hd" },
-];
-
const whisperModels: SelectOption[] = [
{ id: "whisper-1", label: "whisper-1" },
];
@@ -289,32 +275,6 @@ export const chatProviderOptions: ProviderOption[] = [
];
export const speechProviderOptions: ProviderOption[] = [
- {
- id: "openai-audio-speech",
- label: "OpenAI",
- icon: "i-lobe-icons:openai",
- description: "OpenAI speech models.",
- category: "speech",
- engineId: "openai-tts",
- defaultBaseUrl: "https://api.openai.com/v1/",
- requiresApiKey: true,
- requiresBaseUrl: true,
- supportsModels: true,
- supportsVoices: true,
- modelOptions: openAiTtsModels,
- voiceOptions: openAiVoices,
- defaultModel: "tts-1",
- defaultVoice: "alloy",
- },
- {
- id: "openai-compatible-audio-speech",
- label: "OpenAI Compatible",
- icon: "i-lobe-icons:openai",
- description: "OpenAI-compatible speech.",
- category: "speech",
- requiresApiKey: true,
- requiresBaseUrl: true,
- },
{
id: "volcengine-speech",
label: "Volcengine",
@@ -327,14 +287,15 @@ export const speechProviderOptions: ProviderOption[] = [
supportsVoices: true,
modelOptions: [{ id: "v1", label: "v1" }],
defaultModel: "v1",
- defaultBaseUrl: "https://unspeech.hyp3r.link/v1/",
+ defaultBaseUrl: "https://openspeech.bytedance.com/api/v1/tts",
},
{
- id: "alibaba-cloud-model-studio",
+ id: "alibaba-cloud-model-studio-speech",
label: "Alibaba Cloud Model Studio",
icon: "i-lobe-icons:alibabacloud",
description: "bailian.console.aliyun.com",
category: "speech",
+ engineId: "alibaba-cloud-model-studio-speech",
requiresApiKey: true,
requiresBaseUrl: true,
supportsModels: true,
@@ -344,65 +305,7 @@ export const speechProviderOptions: ProviderOption[] = [
{ id: "cosyvoice-v2", label: "cosyvoice-v2" },
],
defaultModel: "cosyvoice-v1",
- defaultBaseUrl: "https://unspeech.hyp3r.link/v1/",
- },
- {
- id: "volcengine",
- label: "Volcengine",
- icon: "i-lobe-icons:volcengine",
- description: "volcengine.com",
- category: "speech",
- requiresApiKey: true,
- requiresBaseUrl: true,
- supportsModels: true,
- supportsVoices: true,
- modelOptions: [{ id: "v1", label: "v1" }],
- defaultModel: "v1",
- defaultBaseUrl: "https://unspeech.hyp3r.link/v1/",
- },
- {
- id: "elevenlabs",
- label: "ElevenLabs",
- icon: "i-simple-icons:elevenlabs",
- description: "Voice synthesis & cloning.",
- category: "speech",
- defaultBaseUrl: "https://api.elevenlabs.io/v1/",
- requiresApiKey: true,
- requiresBaseUrl: true,
- },
- {
- id: "microsoft-speech",
- label: "Microsoft / Azure Speech",
- icon: "i-lobe-icons:microsoft",
- description: "Microsoft speech services.",
- category: "speech",
- requiresApiKey: true,
- },
- {
- id: "index-tts-vllm",
- label: "Bilibili Index TTS",
- icon: "i-lobe-icons:bilibiliindex",
- description: "index-tts.github.io",
- category: "speech",
- defaultBaseUrl: "http://localhost:8000/v1/",
- requiresBaseUrl: true,
- },
- {
- id: "comet-api-speech",
- label: "Comet API",
- icon: "i-lobe-icons:cometapi",
- description: "Comet API speech.",
- category: "speech",
- requiresApiKey: true,
- },
- {
- id: "player2-speech",
- label: "Player2 Speech",
- icon: "i-lobe-icons:player2",
- description: "Local gameplay assistant speech.",
- category: "speech",
- defaultBaseUrl: "http://localhost:4315/v1/",
- requiresBaseUrl: true,
+ defaultBaseUrl: "https://dashscope.aliyuncs.com",
},
{
id: "app-local-audio-speech",
@@ -446,11 +349,11 @@ export const transcriptionProviderOptions: ProviderOption[] = [
},
{
id: "aliyun-nls-transcription",
- label: "Aliyun NLS",
+ label: "Alibaba Cloud Model Studio",
icon: "i-lobe-icons:alibabacloud",
- description: "Aliyun transcription.",
+ description: "Alibaba Bailian ASR transcription.",
category: "transcription",
- requiresApiKey: true,
+ engineId: "aliyun-nls-asr",
},
{
id: "comet-api-transcription",
diff --git a/frontend/packages/app-core/src/services/audio-direct.test.ts b/frontend/packages/app-core/src/services/audio-direct.test.ts
new file mode 100644
index 0000000..963d195
--- /dev/null
+++ b/frontend/packages/app-core/src/services/audio-direct.test.ts
@@ -0,0 +1,164 @@
+import assert from "node:assert/strict";
+
+import {
+ buildLegacyTtsHttpRequest,
+ buildDirectTtsHttpRequest,
+ supportsDirectTts,
+} from "../utils/tts-direct-request.ts";
+
+function run(name: string, fn: () => void) {
+ try {
+ fn();
+ console.info(`PASS ${name}`);
+ } catch (error) {
+ console.error(`FAIL ${name}`);
+ throw error;
+ }
+}
+
+run("supports backend relay tts engines", () => {
+ assert.equal(supportsDirectTts("volcengine-speech"), true);
+ assert.equal(supportsDirectTts("alibaba-cloud-model-studio-speech"), true);
+ assert.equal(supportsDirectTts("openai-tts"), false);
+ assert.equal(supportsDirectTts("unknown-engine"), false);
+});
+
+run("builds backend relay request for volcengine speech engine", () => {
+ const request = buildDirectTtsHttpRequest({
+ text: "hello",
+ engineId: "volcengine-speech",
+ apiBaseUrl: "http://localhost:8090",
+ config: {
+ apiKey: "token-123",
+ model: "v1",
+ voice: "zh_female_test",
+ appId: "appid-xyz",
+ },
+ });
+
+ assert.ok(request);
+ assert.equal(request?.url, "http://localhost:8090/api/tts/engines");
+ assert.equal(request?.headers.Authorization, undefined);
+ assert.deepEqual(request?.body, {
+ engine: "volcengine-speech",
+ data: "hello",
+ config: {
+ apiKey: "token-123",
+ model: "v1",
+ voice: "zh_female_test",
+ appId: "appid-xyz",
+ },
+ });
+});
+
+run("builds backend relay request for alibaba speech engine", () => {
+ const request = buildDirectTtsHttpRequest({
+ text: "hello",
+ engineId: "alibaba-cloud-model-studio-speech",
+ apiBaseUrl: "http://localhost:8090/",
+ config: {
+ apiKey: "token-123",
+ model: "alibaba/cosyvoice-v1",
+ voice: "longxiaochun_v2",
+ rate: 1.2,
+ pitch: 0.9,
+ },
+ });
+
+ assert.ok(request);
+ assert.equal(request?.url, "http://localhost:8090/api/tts/engines");
+ assert.deepEqual(request?.body, {
+ engine: "alibaba-cloud-model-studio-speech",
+ data: "hello",
+ config: {
+ apiKey: "token-123",
+ model: "cosyvoice-v1",
+ voice: "longxiaochun_v2",
+ rate: 1.2,
+ pitch: 0.9,
+ },
+ });
+});
+
+run("does not forward base url for fixed direct providers", () => {
+ const volcRequest = buildDirectTtsHttpRequest({
+ text: "hello",
+ engineId: "volcengine-speech",
+ apiBaseUrl: "http://localhost:8090",
+ config: {
+ apiKey: "token-123",
+ baseUrl: "https://unspeech.hyp3r.link/v1/",
+ model: "v1",
+ voice: "zh_female_test",
+ appId: "appid-xyz",
+ },
+ });
+ assert.ok(volcRequest);
+ assert.equal((volcRequest?.body.config as { baseUrl?: string }).baseUrl, undefined);
+
+ const alibabaRequest = buildDirectTtsHttpRequest({
+ text: "hello",
+ engineId: "alibaba-cloud-model-studio-speech",
+ apiBaseUrl: "http://localhost:8090",
+ config: {
+ apiKey: "token-123",
+ baseUrl: "https://unspeech.hyp3r.link/v1/",
+ model: "cosyvoice-v1",
+ voice: "longwan",
+ },
+ });
+ assert.ok(alibabaRequest);
+ assert.equal((alibabaRequest?.body.config as { baseUrl?: string }).baseUrl, undefined);
+});
+
+run("builds legacy synthesize fallback request from backend relay request", () => {
+ const request = buildDirectTtsHttpRequest({
+ text: "fallback test",
+ engineId: "volcengine-speech",
+ apiBaseUrl: "http://localhost:8090",
+ config: {
+ apiKey: "token-123",
+ baseUrl: "https://unspeech.example/v1",
+ model: "v1",
+ voice: "zh_female_test",
+ appId: "appid-xyz",
+ },
+ });
+
+ assert.ok(request);
+ const legacy = buildLegacyTtsHttpRequest(request!);
+ assert.equal(legacy.url, "http://localhost:8090/api/tts/synthesize");
+ assert.deepEqual(legacy.body, {
+ text: "fallback test",
+ engine: "volcengine-speech",
+ providerId: "volcengine-speech",
+ provider_id: "volcengine-speech",
+ config: {
+ apiKey: "token-123",
+ api_key: "token-123",
+ model: "volcengine/v1",
+ voice: "zh_female_test",
+ appId: "appid-xyz",
+ appid: "appid-xyz",
+ app_id: "appid-xyz",
+ backend: "volcengine",
+ },
+ });
+});
+
+run("keeps alibaba model id without provider prefix in legacy fallback request", () => {
+ const request = buildDirectTtsHttpRequest({
+ text: "fallback alibaba",
+ engineId: "alibaba-cloud-model-studio-speech",
+ apiBaseUrl: "http://localhost:8090",
+ config: {
+ apiKey: "token-123",
+ model: "alibaba/cosyvoice-v1",
+ voice: "longxiaochun_v2",
+ },
+ });
+
+ assert.ok(request);
+ const legacy = buildLegacyTtsHttpRequest(request!);
+ assert.equal((legacy.body.config as { model?: string }).model, "cosyvoice-v1");
+});
diff --git a/frontend/packages/app-core/src/services/audio.ts b/frontend/packages/app-core/src/services/audio.ts
index 35049d4..9de5e7c 100644
--- a/frontend/packages/app-core/src/services/audio.ts
+++ b/frontend/packages/app-core/src/services/audio.ts
@@ -1,4 +1,9 @@
import { appConfig } from "../config";
+import {
+ buildLegacyTtsHttpRequest,
+ buildDirectTtsHttpRequest,
+ supportsDirectTts,
+} from "../utils/tts-direct-request";
type AudioRequestConfig = Record<string, unknown>;

@@ -34,6 +39,61 @@ export type AsrStreamConnection = {
close: () => void;
};
+export { buildDirectTtsHttpRequest, supportsDirectTts };
+
+function createTtsHttpError(status: number, detail: string) {
+ const error = new Error(detail || `Direct TTS request failed: ${status}`) as Error & {
+ status?: number;
+ detail?: string;
+ };
+ error.status = status;
+ error.detail = detail || undefined;
+ return error;
+}
+
+function decodeBase64Audio(base64: string, mimeType: string) {
+ const cleaned = base64.trim().replace(/^data:[^;]+;base64,/, "");
+ const binary = atob(cleaned);
+ const bytes = new Uint8Array(binary.length);
+ for (let index = 0; index < binary.length; index += 1) {
+ bytes[index] = binary.charCodeAt(index);
+ }
+ return new Blob([bytes], { type: mimeType || "audio/mpeg" });
+}
+
+async function resolveTtsBlob(response: Response) {
+ const contentType = (response.headers.get("Content-Type") || "").toLowerCase();
+ if (!contentType.includes("application/json")) {
+ return await response.blob();
+ }
+
+ const payload = (await response.json().catch(() => null)) as
+ | {
+ audioBase64?: unknown;
+ audio_base64?: unknown;
+ audio?: unknown;
+ mimeType?: unknown;
+ mime_type?: unknown;
+ format?: unknown;
+ }
+ | null;
+ const audioBase64 =
+ (typeof payload?.audioBase64 === "string" && payload.audioBase64) ||
+ (typeof payload?.audio_base64 === "string" && payload.audio_base64) ||
+ (typeof payload?.audio === "string" && payload.audio) ||
+ "";
+ if (audioBase64) {
+ const mimeType =
+ (typeof payload?.mimeType === "string" && payload.mimeType) ||
+ (typeof payload?.mime_type === "string" && payload.mime_type) ||
+ (typeof payload?.format === "string" && `audio/${payload.format}`) ||
+ "audio/mpeg";
+ return decodeBase64Audio(audioBase64, mimeType);
+ }
+
+ throw new Error("TTS response JSON does not contain audio payload.");
+}
+
export function resolveAudioApiBaseUrl() {
const proxyUrl = appConfig.providers.proxyUrl?.trim();
const apiBaseUrl = appConfig.providers.apiBaseUrl?.trim();
@@ -51,28 +111,55 @@ function resolveAudioWsBaseUrl(baseUrl?: string) {
}
export async function requestTts(request: TtsRequest): Promise<Blob> {
- const baseUrl = request.baseUrl?.trim() || resolveAudioApiBaseUrl();
- if (!baseUrl) {
+ return await requestTtsDirect(request);
+}
+
+export async function requestTtsDirect(request: TtsRequest): Promise<Blob> {
+ const apiBaseUrl = request.baseUrl?.trim() || resolveAudioApiBaseUrl();
+ if (!apiBaseUrl) {
throw new Error("Audio API base URL is not configured.");
}
- const response = await fetch(`${baseUrl}/api/tts/engines`, {
+ const directRequest = buildDirectTtsHttpRequest({
+ text: request.text,
+ engineId: request.engineId,
+ apiBaseUrl,
+ config: request.config,
+ });
+ if (!directRequest) {
+ throw new Error("Backend relay TTS request is not available for current config.");
+ }
+
+ const response = await fetch(directRequest.url, {
method: "POST",
- headers: { "Content-Type": "application/json" },
- body: JSON.stringify({
- engine: request.engineId || "default",
- data: { text: request.text },
- config: request.config ?? {},
- }),
+ headers: directRequest.headers,
+ body: JSON.stringify(directRequest.body),
signal: request.signal,
});
- if (!response.ok) {
- const detail = await response.text();
- throw new Error(detail || `TTS request failed: ${response.status}`);
+ if (response.ok) {
+ return await resolveTtsBlob(response);
+ }
+
+ if (response.status === 405) {
+ const legacyRequest = buildLegacyTtsHttpRequest(directRequest);
+ const legacyResponse = await fetch(legacyRequest.url, {
+ method: "POST",
+ headers: legacyRequest.headers,
+ body: JSON.stringify(legacyRequest.body),
+ signal: request.signal,
+ });
+
+ if (legacyResponse.ok) {
+ return await resolveTtsBlob(legacyResponse);
+ }
+
+ const detail = await legacyResponse.text();
+ throw createTtsHttpError(legacyResponse.status, detail);
}
- return await response.blob();
+ const detail = await response.text();
+ throw createTtsHttpError(response.status, detail);
}
export async function requestAsr(request: AsrRequest): Promise> {
diff --git a/frontend/packages/app-core/src/services/providers.ts b/frontend/packages/app-core/src/services/providers.ts
index fbbd870..f318f36 100644
--- a/frontend/packages/app-core/src/services/providers.ts
+++ b/frontend/packages/app-core/src/services/providers.ts
@@ -1,8 +1,5 @@
import type { ProviderConfig } from "../stores/providers";
import type { ProviderCatalogEntry, ProviderCategory, SelectOption } from "../data/provider-catalog";
-import type { UnAlibabaCloudOptions, VoiceProviderWithExtraOptions } from "unspeech";
-
-import { createUnAlibabaCloud, listVoices } from "unspeech";
import { appConfig } from "../config";
@@ -115,67 +112,45 @@ export async function listProviderModels(option: ProviderCatalogEntry, config: P
}
export async function listProviderVoices(option: ProviderCatalogEntry, config: ProviderConfig) {
+ const payload = {
+ providerId: option.id,
+ apiKey: config.apiKey ?? "",
+ baseUrl: normalizeBaseUrl(config.baseUrl, resolveDefaultBaseUrl(option)),
+ model: config.model ?? "",
+ extra: config.extra ?? {},
+ };
+
if (!proxyBaseUrl) {
- if (option.id === "alibaba-cloud-model-studio-speech") {
- const apiKey = config.apiKey?.trim();
- const baseUrl = normalizeBaseUrl(config.baseUrl, resolveDefaultBaseUrl(option));
- if (!apiKey || !baseUrl) {
- return [];
- }
- const provider = createUnAlibabaCloud(apiKey, baseUrl) as VoiceProviderWithExtraOptions<UnAlibabaCloudOptions>;
- const voices = await listVoices({
- ...provider.voice(),
- });
- const configuredModel = config.model?.trim();
- const modelCandidates = new Set<string>();
- if (configuredModel) {
- modelCandidates.add(configuredModel);
- if (configuredModel.includes("/")) {
- const shortModel = configuredModel.split("/").pop();
- if (shortModel) {
- modelCandidates.add(shortModel);
- }
- } else {
- modelCandidates.add(`alibaba/${configuredModel}`);
- }
- }
- const filtered = voices.filter((voice) => {
- const compatible = voice.compatible_models;
- if (!Array.isArray(compatible) || compatible.length === 0) {
- return true;
- }
- if (!modelCandidates.size) {
- return true;
- }
- return compatible.some((model) => modelCandidates.has(model));
- });
- const resolved = filtered.length ? filtered : voices;
- return resolved.map((voice) => {
- const descriptions: string[] = [];
- if (voice.languages?.length) {
- descriptions.push(voice.languages.map((lang) => lang.title).join(", "));
- }
- if (Array.isArray(voice.compatible_models) && voice.compatible_models.length) {
- descriptions.push(`Models: ${voice.compatible_models.join(", ")}`);
- }
- return {
- id: voice.id,
- label: voice.name,
- description: descriptions.join(" · "),
- };
- });
+ const baseUrl = resolveApiBaseUrl();
+ if (!baseUrl) {
+ return null;
}
- return null;
+
+ const response = await fetch(`${baseUrl}/api/providers/voices`, {
+ method: "POST",
+ headers: {
+ "Content-Type": "application/json",
+ },
+ body: JSON.stringify(payload),
+ });
+
+ if (!response.ok) {
+ throw new Error(`Provider voices request failed: ${response.status}`);
+ }
+
+ const result = (await response.json()) as {
+ voices?: SelectOption[];
+ data?: { voices?: SelectOption[] };
+ };
+
+ if (result.data?.voices) return result.data.voices;
+ if (result.voices) return result.voices;
+ return [];
}
const result = await requestProxy<{ voices?: SelectOption[]; data?: { voices?: SelectOption[] } }>(
"/providers/voices",
- {
- providerId: option.id,
- apiKey: config.apiKey ?? "",
- baseUrl: normalizeBaseUrl(config.baseUrl, resolveDefaultBaseUrl(option)),
- extra: config.extra ?? {},
- }
+ payload
);
if (result.data?.voices) return result.data.voices;
diff --git a/frontend/packages/app-core/src/stores/providers.ts b/frontend/packages/app-core/src/stores/providers.ts
index d97b16a..da928d6 100644
--- a/frontend/packages/app-core/src/stores/providers.ts
+++ b/frontend/packages/app-core/src/stores/providers.ts
@@ -18,6 +18,8 @@ import {
} from "../services/providers";
import { useI18n } from "../composables/use-i18n";
import { formatHealthError, formatHealthMessage } from "../utils/health";
+import { filterProviderFields } from "../utils/provider-fields";
+import { isVisibleSpeechProviderId } from "../utils/provider-visibility";
import { useSettingsStore } from "./settings";
export type ProviderStatus = "online" | "offline";
@@ -40,6 +42,16 @@ export type ProviderRuntime = {
const configFieldIds = new Set(["apiKey", "baseUrl", "model", "voice"]);
const credentialFieldIds = new Set(["apiKey", "baseUrl"]);
+const ALIYUN_NLS_PROVIDER_ID = "aliyun-nls-transcription";
+const aliyunNlsNormalizedFields: ProviderField[] = [
+ {
+ id: "apiKey",
+ label: "API Key",
+ type: "secret",
+ required: true,
+ scope: "config",
+ },
+];
export const useProvidersStore = defineStore("providers", () => {
const settingsStore = useSettingsStore();
@@ -47,16 +59,46 @@ export const useProvidersStore = defineStore("providers", () => {
"whalewhisper/providers/configs",
{}
);
- const engineHealthSkipProviders = new Set(["alibaba-cloud-model-studio-speech"]);
+ const engineHealthSkipProviders = new Set([
+ "alibaba-cloud-model-studio-speech",
+ "volcengine-speech",
+ "aliyun-nls-transcription",
+ ]);
const providerRuntime = ref<Record<string, ProviderRuntime>>({});
const catalogProviders = ref<ProviderCatalogEntry[]>([]);
const catalogLoading = ref(false);
const catalogError = ref(null);
const { t } = useI18n();
+ function normalizeProviderEntry(option: ProviderCatalogEntry): ProviderCatalogEntry {
+ if (option.id !== ALIYUN_NLS_PROVIDER_ID) {
+ return option;
+ }
+
+ return {
+ ...option,
+ fields: aliyunNlsNormalizedFields.map((field) => ({ ...field })),
+ };
+ }
+
+ function filterRemovedSpeechProviders(providers: ProviderCatalogEntry[]) {
+ return providers.filter((provider) => {
+ if (provider.category !== "speech") return true;
+ return isVisibleSpeechProviderId(provider.id);
+ });
+ }
+
const effectiveProviders = computed(() => {
- if (catalogProviders.value.length) return catalogProviders.value;
- if (catalogError.value) return fallbackProviderCatalog;
+ if (catalogProviders.value.length) {
+ return filterRemovedSpeechProviders(
+ catalogProviders.value.map((option) => normalizeProviderEntry(option))
+ );
+ }
+ if (catalogError.value) {
+ return filterRemovedSpeechProviders(
+ fallbackProviderCatalog.map((option) => normalizeProviderEntry(option))
+ );
+ }
return [];
});
@@ -235,7 +277,7 @@ export const useProvidersStore = defineStore("providers", () => {
function getProviderFields(providerId: string) {
const option = getProviderMetadata(providerId);
- return option?.fields ?? [];
+ return filterProviderFields(option);
}
function getProviderFieldValue(providerId: string, field: ProviderField) {
@@ -536,6 +578,13 @@ export const useProvidersStore = defineStore("providers", () => {
}
const pendingRefreshIds = new Set<string>();
+ function queueProviderRefresh(providerId: string) {
+ if (!providerId) return;
+ ensureProvider(providerId);
+ ensureDefaultConfig(providerId);
+ pendingRefreshIds.add(providerId);
+ }
+
const flushRefreshQueue = useDebounceFn(() => {
pendingRefreshIds.forEach((providerId) => {
void refreshProvider(providerId);
@@ -543,6 +592,15 @@ export const useProvidersStore = defineStore("providers", () => {
pendingRefreshIds.clear();
}, 600);
+ watch(
+ () => [settingsStore.chatProviderId, settingsStore.speechProviderId, settingsStore.transcriptionProviderId],
+ (providerIds) => {
+ providerIds.forEach((providerId) => queueProviderRefresh(providerId));
+ flushRefreshQueue();
+ },
+ { immediate: true }
+ );
+
watch(
providerConfigs,
(next, prev) => {
@@ -553,7 +611,7 @@ export const useProvidersStore = defineStore("providers", () => {
const nextConfig = next?.[key];
const prevConfig = prev?.[key];
if (JSON.stringify(nextConfig) !== JSON.stringify(prevConfig)) {
- pendingRefreshIds.add(key);
+ queueProviderRefresh(key);
}
});
flushRefreshQueue();
diff --git a/frontend/packages/app-core/src/stores/settings.ts b/frontend/packages/app-core/src/stores/settings.ts
index ebbecbe..20a9a85 100644
--- a/frontend/packages/app-core/src/stores/settings.ts
+++ b/frontend/packages/app-core/src/stores/settings.ts
@@ -15,7 +15,7 @@ export const useSettingsStore = defineStore("settings", () => {
} = storeToRefs(stageSettings);
const themeColorsHueDynamic = ref(false);
const chatProviderId = useLocalStorage("whalewhisper/providers/chat", "openrouter-ai");
- const speechProviderId = useLocalStorage("whalewhisper/providers/speech", "openai-audio-speech");
+ const speechProviderId = useLocalStorage("whalewhisper/providers/speech", "browser-local-audio-speech");
const transcriptionProviderId = useLocalStorage(
"whalewhisper/providers/transcription",
"openai-audio-transcription"
diff --git a/frontend/packages/app-core/src/stores/speech-output.ts b/frontend/packages/app-core/src/stores/speech-output.ts
index 358928e..9f0f6b8 100644
--- a/frontend/packages/app-core/src/stores/speech-output.ts
+++ b/frontend/packages/app-core/src/stores/speech-output.ts
@@ -2,7 +2,13 @@ import { useLocalStorage } from "@vueuse/core";
import { defineStore, storeToRefs } from "pinia";
import { computed, onScopeDispose, ref, watch } from "vue";
-import { requestTts, resolveAudioApiBaseUrl } from "../services/audio";
+import {
+ requestTtsDirect,
+ supportsDirectTts,
+} from "../services/audio";
+import { toSpeakableTtsChunks } from "../utils/tts-chunker";
+import { TtsStreamSegmenter } from "../utils/tts-stream-segmenter";
+import { runTtsChunkQueue, TtsChunkQueueError } from "../utils/tts-streaming-runner";
import { useProvidersStore } from "./providers";
import { useSettingsStore } from "./settings";
@@ -14,7 +20,6 @@ type VoiceOption = {
export const useSpeechOutputStore = defineStore("speech-output", () => {
const enabled = useLocalStorage("whalewhisper/audio/tts/enabled", false);
- const voiceId = useLocalStorage("whalewhisper/audio/tts/voice", "");
const rate = useLocalStorage("whalewhisper/audio/tts/rate", 1);
const pitch = useLocalStorage("whalewhisper/audio/tts/pitch", 1);
const volume = useLocalStorage("whalewhisper/audio/tts/volume", 1);
@@ -24,7 +29,6 @@ export const useSpeechOutputStore = defineStore("speech-output", () => {
const settingsStore = useSettingsStore();
const { speechProviderId } = storeToRefs(settingsStore);
const localVoices = ref([]);
- const audioApiBaseUrl = computed(() => resolveAudioApiBaseUrl());
const useBrowserTts = computed(
() => speechProviderId.value === "browser-local-audio-speech"
);
@@ -33,7 +37,7 @@ export const useSpeechOutputStore = defineStore("speech-output", () => {
if (useBrowserTts.value) {
return "speechSynthesis" in window;
}
- return Boolean(audioApiBaseUrl.value);
+ return true;
});
const providerMetadata = computed(() =>
providersStore.getProviderMetadata(speechProviderId.value)
@@ -59,16 +63,149 @@ export const useSpeechOutputStore = defineStore("speech-output", () => {
}
return remoteVoices.value;
});
- const resolvedVoiceId = computed(() => voiceId.value || providerConfig.value?.voice || "");
+ const resolvedVoiceId = computed(() => providerConfig.value?.voice || "");
const audioElement = ref<HTMLAudioElement | null>(null);
const lastError = ref(null);
let remoteController: AbortController | null = null;
let streamController: AbortController | null = null;
+ let requestQueueTail: Promise<void> = Promise.resolve();
let activeObjectUrl: string | null = null;
let audioContext: AudioContext | null = null;
let gainNode: GainNode | null = null;
let activeSources: AudioBufferSourceNode[] = [];
let scheduledTime = 0;
+ const assistantStreamSegmenter = new TtsStreamSegmenter();
+ const incrementalStreamingEnabled = computed(
+ () => streaming.value && !useBrowserTts.value
+ );
+ let assistantStreamActive = false;
+ let assistantStreamTaskTail: Promise<void> = Promise.resolve();
+ let assistantStreamPlaybackTail: Promise<void> = Promise.resolve();
+ let assistantStreamStartedChunks = 0;
+ let assistantStreamQueueVersion = 0;
+ let assistantStreamChunks: string[] = [];
+ let assistantStreamFailedChunkIndex: number | null = null;
+
+ function resolveTtsEngineId() {
+ const metadataEngineId = providerMetadata.value?.engineId;
+ if (typeof metadataEngineId === "string" && metadataEngineId.trim()) {
+ return metadataEngineId.trim();
+ }
+ if (speechProviderId.value === "volcengine-speech" || speechProviderId.value === "volcengine") {
+ return "volcengine-speech";
+ }
+ if (
+ speechProviderId.value === "alibaba-cloud-model-studio-speech" ||
+ speechProviderId.value === "alibaba-cloud-model-studio"
+ ) {
+ return "alibaba-cloud-model-studio-speech";
+ }
+ return "";
+ }
+
+ async function requestTtsDirectSerial(params: Parameters<typeof requestTtsDirect>[0]) {
+ const previous = requestQueueTail;
+ let release: (() => void) | null = null;
+ requestQueueTail = new Promise<void>((resolve) => {
+ release = resolve;
+ });
+
+ await previous;
+ try {
+ if (params.signal?.aborted) {
+ throw new DOMException("Aborted", "AbortError");
+ }
+ return await requestTtsDirect(params);
+ } finally {
+ release?.();
+ }
+ }
+
+ function isRetriableTtsError(error: unknown) {
+ if (error instanceof DOMException && error.name === "AbortError") {
+ return false;
+ }
+ if (!(error instanceof Error)) {
+ return false;
+ }
+ const status = Number((error as Error & { status?: number }).status);
+ if (status === 502 || status === 503 || status === 504 || status === 429) {
+ return true;
+ }
+ const message = error.message.toLowerCase();
+ return (
+ message.includes("bad gateway") ||
+ message.includes("gateway timeout") ||
+ message.includes("failed to fetch") ||
+ message.includes("networkerror") ||
+ message.includes("502")
+ );
+ }
+
+ async function sleep(ms: number, signal?: AbortSignal) {
+ if (!ms) return;
+ await new Promise<void>((resolve, reject) => {
+ const timer = setTimeout(() => {
+ cleanup();
+ resolve();
+ }, ms);
+ const onAbort = () => {
+ cleanup();
+ reject(new DOMException("Aborted", "AbortError"));
+ };
+ const cleanup = () => {
+ clearTimeout(timer);
+ signal?.removeEventListener("abort", onAbort);
+ };
+ if (signal?.aborted) {
+ onAbort();
+ return;
+ }
+ signal?.addEventListener("abort", onAbort, { once: true });
+ });
+ }
+
+ async function requestTtsDirectWithRetry(
+ params: Parameters<typeof requestTtsDirect>[0],
+ options?: { maxAttempts?: number }
+ ) {
+ const maxAttempts = Math.max(1, options?.maxAttempts ?? 1);
+ for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+ try {
+ return await requestTtsDirectSerial(params);
+ } catch (error) {
+ const shouldRetry = attempt < maxAttempts && isRetriableTtsError(error);
+ if (!shouldRetry) {
+ throw error;
+ }
+ await sleep(180 * attempt, params.signal);
+ }
+ }
+ throw new Error("Direct TTS request failed.");
+ }
+
+ async function requestRemoteTtsBlob(
+ text: string,
+ signal: AbortSignal,
+ options?: { maxAttempts?: number }
+ ) {
+ const engineId = resolveTtsEngineId();
+ const config = buildRemoteConfig();
+
+ if (!supportsDirectTts(engineId)) {
+ throw new Error(`Direct TTS is not supported for provider: ${speechProviderId.value}`);
+ }
+
+ return await requestTtsDirectWithRetry(
+ {
+ engineId,
+ text,
+ config,
+ signal,
+ },
+ options
+ );
+ }
function refreshVoices() {
if (!supported.value) return;
@@ -84,12 +221,13 @@ export const useSpeechOutputStore = defineStore("speech-output", () => {
return localVoices.value.find((voice) => voice.voiceURI === resolvedVoiceId.value);
}
- function stopRemotePlayback() {
+ function stopRemotePlayback(options?: { invalidateQueue?: boolean }) {
if (remoteController) {
remoteController.abort();
remoteController = null;
}
stopStreamingPlayback();
+ resetAssistantStreamState({ invalidateQueue: options?.invalidateQueue ?? true });
if (audioElement.value) {
audioElement.value.pause();
audioElement.value.src = "";
@@ -111,6 +249,23 @@ export const useSpeechOutputStore = defineStore("speech-output", () => {
return Math.min(Math.max(value, 0), 1);
}
+ function scheduleDecodedBuffer(ctx: AudioContext, buffer: AudioBuffer) {
+ const startAt = Math.max(ctx.currentTime, scheduledTime);
+ const source = ctx.createBufferSource();
+ source.buffer = buffer;
+ if (gainNode) {
+ source.connect(gainNode);
+ } else {
+ source.connect(ctx.destination);
+ }
+ source.start(startAt);
+ scheduledTime = startAt + buffer.duration;
+ activeSources.push(source);
+ source.onended = () => {
+ activeSources = activeSources.filter((item) => item !== source);
+ };
+ }
+
function ensureAudioContext() {
if (typeof window === "undefined") return null;
if (!audioContext) {
@@ -142,23 +297,202 @@ export const useSpeechOutputStore = defineStore("speech-output", () => {
scheduledTime = 0;
}
+ function resetAssistantStreamState(options?: { invalidateQueue?: boolean }) {
+ assistantStreamActive = false;
+ assistantStreamPlaybackTail = Promise.resolve();
+ assistantStreamStartedChunks = 0;
+ assistantStreamChunks = [];
+ assistantStreamFailedChunkIndex = null;
+ assistantStreamSegmenter.reset();
+ if (options?.invalidateQueue) {
+ assistantStreamQueueVersion += 1;
+ assistantStreamTaskTail = Promise.resolve();
+ }
+ }
+
+ function queueAssistantStreamTask(task: () => Promise<void>) {
+ const version = assistantStreamQueueVersion;
+ const runIfCurrent = async () => {
+ if (version !== assistantStreamQueueVersion) return;
+ await task();
+ };
+ assistantStreamTaskTail = assistantStreamTaskTail.then(runIfCurrent, runIfCurrent);
+ return assistantStreamTaskTail;
+ }
+
+ async function ensureAssistantStreamStarted() {
+ if (!incrementalStreamingEnabled.value) return false;
+ if (!supported.value || !enabled.value) return false;
+ if (assistantStreamActive && streamController && !streamController.signal.aborted) {
+ return true;
+ }
+
+ const ctx = ensureAudioContext();
+ if (!ctx) return false;
+
+ if (ctx.state === "suspended") {
+ await ctx.resume();
+ }
+ if (gainNode) {
+ gainNode.gain.value = clampVolume(volume.value);
+ }
+
+ stopRemotePlayback({ invalidateQueue: false });
+ assistantStreamSegmenter.reset();
+ assistantStreamPlaybackTail = Promise.resolve();
+ assistantStreamStartedChunks = 0;
+ assistantStreamChunks = [];
+ assistantStreamFailedChunkIndex = null;
+ streamController = new AbortController();
+ scheduledTime = ctx.currentTime;
+ assistantStreamActive = true;
+ return true;
+ }
+
+ function scheduleAssistantChunkPlayback(chunk: string, chunkIndex: number) {
+ assistantStreamPlaybackTail = assistantStreamPlaybackTail
+ .catch(() => undefined)
+ .then(async () => {
+ if (!assistantStreamActive || !streamController) return;
+ if (
+ assistantStreamFailedChunkIndex !== null &&
+ chunkIndex > assistantStreamFailedChunkIndex
+ ) {
+ return;
+ }
+ const ctx = ensureAudioContext();
+ if (!ctx || streamController.signal.aborted) return;
+
+ const buffer = await fetchTtsBuffer(chunk, streamController, ctx, {
+ maxAttempts: 1,
+ });
+ if (!buffer || streamController.signal.aborted) return;
+ scheduleDecodedBuffer(ctx, buffer);
+ assistantStreamStartedChunks += 1;
+ })
+ .catch((error) => {
+ if (error instanceof DOMException && error.name === "AbortError") return;
+ if (
+ assistantStreamFailedChunkIndex === null ||
+ chunkIndex < assistantStreamFailedChunkIndex
+ ) {
+ assistantStreamFailedChunkIndex = chunkIndex;
+ }
+ console.warn("[TTS] stream chunk failed, defer to merged fallback:", {
+ index: chunkIndex,
+ chunk,
+ error: error instanceof Error ? error.message : String(error),
+ });
+ });
+ }
+
+ function flushAssistantSegmenter(finalize: boolean) {
+ const chunks = assistantStreamSegmenter.drain(finalize);
+ if (chunks.length === 0) return;
+ const baseIndex = assistantStreamChunks.length;
+ chunks.forEach((chunk, offset) => {
+ const chunkIndex = baseIndex + offset;
+ assistantStreamChunks.push(chunk);
+ scheduleAssistantChunkPlayback(chunk, chunkIndex);
+ });
+ }
+
+ function pushAssistantLiteral(literal: string) {
+ if (!literal) return;
+ if (!incrementalStreamingEnabled.value) return;
+ if (!supported.value || !enabled.value) return;
+ void queueAssistantStreamTask(async () => {
+ const started = await ensureAssistantStreamStarted();
+ if (!started) return;
+ assistantStreamSegmenter.appendLiteral(literal);
+ flushAssistantSegmenter(false);
+ });
+ }
+
+ function pushAssistantSpecial(_special: string) {
+ if (!incrementalStreamingEnabled.value) return;
+ if (!supported.value || !enabled.value) return;
+ void queueAssistantStreamTask(async () => {
+ const started = await ensureAssistantStreamStarted();
+ if (!started) return;
+ assistantStreamSegmenter.appendSpecialMarker();
+ flushAssistantSegmenter(false);
+ });
+ }
+
+ async function endAssistantStream(finalText?: string) {
+ if (!incrementalStreamingEnabled.value) {
+ if (finalText?.trim()) {
+ await speak(finalText);
+ }
+ return;
+ }
+
+ await queueAssistantStreamTask(async () => {
+ if (!assistantStreamActive) {
+ if (finalText?.trim()) {
+ await speak(finalText);
+ }
+ return;
+ }
+ assistantStreamSegmenter.appendFlushMarker();
+ flushAssistantSegmenter(true);
+ try {
+ await assistantStreamPlaybackTail;
+ if (
+ assistantStreamFailedChunkIndex !== null &&
+ streamController &&
+ !streamController.signal.aborted
+ ) {
+ const ctx = ensureAudioContext();
+ const remainingText = assistantStreamChunks
+ .slice(assistantStreamFailedChunkIndex)
+ .join("");
+ if (ctx && remainingText.trim()) {
+ console.warn("[TTS] stream fallback to merged remainder:", {
+ failedIndex: assistantStreamFailedChunkIndex,
+ remainingChunks: assistantStreamChunks.length - assistantStreamFailedChunkIndex,
+ });
+ const fallbackBuffer = await fetchTtsBuffer(
+ remainingText,
+ streamController,
+ ctx,
+ { maxAttempts: 1 }
+ );
+ if (fallbackBuffer && !streamController.signal.aborted) {
+ scheduleDecodedBuffer(ctx, fallbackBuffer);
+ assistantStreamStartedChunks += 1;
+ }
+ }
+ }
+ } finally {
+ if (streamController && streamController.signal.aborted) {
+ // Keep current abort state from explicit stop/interrupt.
+ }
+ assistantStreamActive = false;
+ assistantStreamSegmenter.reset();
+ assistantStreamPlaybackTail = Promise.resolve();
+ assistantStreamStartedChunks = 0;
+ }
+ });
+ }
+
function buildRemoteConfig() {
const config: Record<string, unknown> = {
...(providerConfig.value?.extra ?? {}),
};
const isAlibaba = speechProviderId.value === "alibaba-cloud-model-studio-speech";
+ const isVolcengine = speechProviderId.value === "volcengine-speech";
if (providerConfig.value?.apiKey) {
config.apiKey = providerConfig.value.apiKey;
}
if (providerConfig.value?.baseUrl) {
config.baseUrl = providerConfig.value.baseUrl;
+ config.base_url = providerConfig.value.baseUrl;
}
let model = providerConfig.value?.model;
- const voice = resolvedVoiceId.value || providerConfig.value?.voice;
+ const voice = resolvedVoiceId.value;
if (model) {
- if (isAlibaba && !model.includes("/")) {
- model = `alibaba/${model}`;
- }
config.model = model;
}
if (voice) {
@@ -174,37 +508,23 @@ export const useSpeechOutputStore = defineStore("speech-output", () => {
if (isAlibaba && pitch.value && pitch.value !== 1) {
config.pitch = pitch.value;
}
- return config;
- }
-
- function splitTtsText(text: string) {
- const hardBreaks = new Set([".", "。", "!", "!", "?", "?", "…", "\n"]);
- const softBreaks = new Set([",", ",", ";", ";", ":", ":", "、"]);
- const minChars = 12;
- const maxChars = 80;
- const chunks: string[] = [];
- let buffer = "";
- for (const char of text) {
- buffer += char;
- const isBreak = hardBreaks.has(char) || softBreaks.has(char);
- if (buffer.length >= maxChars || (isBreak && buffer.length >= minChars)) {
- const trimmed = buffer.trim();
- if (trimmed) chunks.push(trimmed);
- buffer = "";
+ if (isVolcengine) {
+ const appId = String(providerConfig.value?.extra?.appId ?? providerConfig.value?.extra?.appid ?? "").trim();
+ if (appId) {
+ config.appId = appId;
}
}
- const trimmed = buffer.trim();
- if (trimmed) chunks.push(trimmed);
- return chunks;
+ return config;
}
- async function fetchTtsBuffer(text: string, controller: AbortController, ctx: AudioContext) {
- const blob = await requestTts({
- baseUrl: audioApiBaseUrl.value,
- engineId: providerMetadata.value?.engineId,
- text,
- config: buildRemoteConfig(),
- signal: controller.signal,
+ async function fetchTtsBuffer(
+ text: string,
+ controller: AbortController,
+ ctx: AudioContext,
+ options?: { maxAttempts?: number }
+ ) {
+ const blob = await requestRemoteTtsBlob(text, controller.signal, {
+ maxAttempts: Math.max(1, options?.maxAttempts ?? 1),
});
if (controller.signal.aborted) {
throw new DOMException("Aborted", "AbortError");
@@ -226,37 +546,56 @@ export const useSpeechOutputStore = defineStore("speech-output", () => {
const controller = new AbortController();
streamController = controller;
- const chunks = splitTtsText(text);
+ const chunks = toSpeakableTtsChunks(text);
if (chunks.length === 0) return;
- const pending: Array<Promise<AudioBuffer | null>> = [];
- let index = 0;
- const maxInFlight = 2;
scheduledTime = ctx.currentTime;
try {
- while (index < chunks.length || pending.length > 0) {
- while (index < chunks.length && pending.length < maxInFlight) {
- pending.push(fetchTtsBuffer(chunks[index], controller, ctx));
- index += 1;
+ await runTtsChunkQueue(
+ chunks,
+ async (chunk) => {
+ if (controller.signal.aborted) return;
+ const buffer = await fetchTtsBuffer(chunk, controller, ctx, {
+ maxAttempts: 1,
+ });
+ if (!buffer || controller.signal.aborted) return;
+ scheduleDecodedBuffer(ctx, buffer);
+ },
+ {
+ stopOnError: true,
+ onChunkError: (error, context) => {
+ console.warn("[TTS] chunk failed:", {
+ index: context.index,
+ total: context.total,
+ chunk: context.chunk,
+ error: error instanceof Error ? error.message : String(error),
+ });
+ },
}
- const buffer = await pending.shift();
- if (!buffer || controller.signal.aborted) return;
- const startAt = Math.max(ctx.currentTime, scheduledTime);
- const source = ctx.createBufferSource();
- source.buffer = buffer;
- if (gainNode) {
- source.connect(gainNode);
- } else {
- source.connect(ctx.destination);
+ );
+ } catch (error) {
+ if (error instanceof DOMException && error.name === "AbortError") {
+ throw error;
+ }
+ if (error instanceof TtsChunkQueueError) {
+ const remainingChunks = chunks.slice(error.context.index);
+ const remainingText = remainingChunks.join("");
+ if (remainingText.trim()) {
+ console.warn("[TTS] fallback to merged remainder after chunk failure:", {
+ failedIndex: error.context.index,
+ remainingChunks: remainingChunks.length,
+ });
+ const fallbackBuffer = await fetchTtsBuffer(remainingText, controller, ctx, {
+ maxAttempts: 1,
+ });
+ if (!controller.signal.aborted && fallbackBuffer) {
+ scheduleDecodedBuffer(ctx, fallbackBuffer);
+ return;
+ }
}
- source.start(startAt);
- scheduledTime = startAt + buffer.duration;
- activeSources.push(source);
- source.onended = () => {
- activeSources = activeSources.filter((item) => item !== source);
- };
}
+ throw error;
} finally {
if (streamController === controller) {
streamController = null;
@@ -313,13 +652,7 @@ export const useSpeechOutputStore = defineStore("speech-output", () => {
const controller = new AbortController();
remoteController = controller;
try {
- const blob = await requestTts({
- baseUrl: audioApiBaseUrl.value,
- engineId: providerMetadata.value?.engineId,
- text,
- config: buildRemoteConfig(),
- signal: controller.signal,
- });
+ const blob = await requestRemoteTtsBlob(text, controller.signal);
if (controller.signal.aborted) return;
const objectUrl = URL.createObjectURL(blob);
activeObjectUrl = objectUrl;
@@ -362,25 +695,6 @@ export const useSpeechOutputStore = defineStore("speech-output", () => {
}
);
- watch(
- () => voices.value,
- (next) => {
- if (useBrowserTts.value) return;
- const voiceIds = new Set(next.map((voice) => voice.voiceURI));
- if (voiceId.value && voiceIds.has(voiceId.value)) return;
- const configuredVoice = providerConfig.value?.voice;
- if (configuredVoice && voiceIds.has(configuredVoice)) {
- voiceId.value = configuredVoice;
- return;
- }
- if (next.length > 0) {
- voiceId.value = next[0].voiceURI;
- } else {
- voiceId.value = "";
- }
- }
- );
-
watch(
() => volume.value,
(next) => {
@@ -399,7 +713,6 @@ export const useSpeechOutputStore = defineStore("speech-output", () => {
return {
enabled,
- voiceId,
rate,
pitch,
volume,
@@ -407,7 +720,11 @@ export const useSpeechOutputStore = defineStore("speech-output", () => {
voices,
supported,
lastError,
+ incrementalStreamingEnabled,
refreshVoices,
+ pushAssistantLiteral,
+ pushAssistantSpecial,
+ endAssistantStream,
speak,
stop,
};
diff --git a/frontend/packages/app-core/src/stores/transcription.ts b/frontend/packages/app-core/src/stores/transcription.ts
index 625dfa7..e47950c 100644
--- a/frontend/packages/app-core/src/stores/transcription.ts
+++ b/frontend/packages/app-core/src/stores/transcription.ts
@@ -8,6 +8,13 @@ import {
createAudioCaptureSession,
pcm16ToWavBlob,
} from "../utils/audio-stream";
+import { shouldAutoRestartBrowserRecognition } from "../utils/browser-recognition-restart";
+import { decideCaptureFallback } from "../utils/capture-startup";
+import {
+ normalizeTranscriptionLanguage,
+ resolveInitialTranscriptionLanguage,
+} from "../utils/transcription-language";
+import { sanitizeTranscript } from "../utils/transcript-filter";
import { useChatStore } from "./chat";
import { useHearingStore } from "./hearing";
import { useProvidersStore } from "./providers";
@@ -21,6 +28,12 @@ type RecordingResult = {
};
type CaptureMode = "worklet" | "media";
+type ListeningSource = "settings-test" | "chat-input";
+type StartListeningOptions = {
+ autoSend?: boolean;
+ source?: ListeningSource;
+};
+const BROWSER_RECOGNITION_RESTART_DELAY_MS = 250;
export const useTranscriptionStore = defineStore("transcription", () => {
const chatStore = useChatStore();
@@ -31,7 +44,14 @@ export const useTranscriptionStore = defineStore("transcription", () => {
const enabled = useLocalStorage("whalewhisper/audio/transcription/enabled", false);
const autoSend = useLocalStorage("whalewhisper/audio/transcription/auto-send", true);
- const language = useLocalStorage("whalewhisper/audio/transcription/language", "en-US");
+ const initialLanguage =
+ typeof navigator !== "undefined"
+ ? resolveInitialTranscriptionLanguage(navigator.language)
+ : resolveInitialTranscriptionLanguage(undefined);
+ const language = useLocalStorage(
+ "whalewhisper/audio/transcription/language",
+ initialLanguage
+ );
const vadMinSpeechMs = useLocalStorage(
"whalewhisper/audio/transcription/vad-min-ms",
300
@@ -87,6 +107,8 @@ export const useTranscriptionStore = defineStore("transcription", () => {
const lastTranscript = ref("");
const error = ref<string | null>(null);
const vadActive = ref(false);
+ const activeAutoSend = ref(Boolean(autoSend.value));
+ const listeningSource = ref<ListeningSource | null>(null);
let recognition: any = null;
let recorder: MediaRecorder | null = null;
@@ -96,6 +118,10 @@ export const useTranscriptionStore = defineStore("transcription", () => {
let recorderStartedAt = 0;
let silenceTimer: number | null = null;
let restoreHearingEnabled: boolean | null = null;
+ let browserRecognitionSessionRequested = false;
+ let manualBrowserRecognitionStop = false;
+ let recognitionRestartTimer: number | null = null;
+ let lastBrowserRecognitionErrorCode: string | null = null;
let captureSession: Awaited<ReturnType<typeof createAudioCaptureSession>> | null = null;
let captureActive = false;
@@ -106,6 +132,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
let streamPending: ArrayBuffer[] = [];
let streamConnection: ReturnType | null = null;
let streamReady = false;
+ let workletCaptureDisabled = false;
const minSpeechMs = computed(() =>
Math.max(100, Number(vadMinSpeechMs.value) || 300)
@@ -114,6 +141,81 @@ export const useTranscriptionStore = defineStore("transcription", () => {
Math.max(200, Number(vadSilenceMs.value) || 700)
);
+ function applyTranscript(raw: string) {
+ const transcript = sanitizeTranscript(raw);
+ if (!transcript) {
+ return;
+ }
+ lastTranscript.value = transcript;
+ interimText.value = "";
+ if (activeAutoSend.value) {
+ chatStore.send(transcript);
+ }
+ }
+
+ function resolveStartAutoSend(options?: StartListeningOptions) {
+ if (typeof options?.autoSend === "boolean") {
+ return options.autoSend;
+ }
+ return Boolean(autoSend.value);
+ }
+
+ function resolveStartSource(options: StartListeningOptions | undefined, nextAutoSend: boolean) {
+ if (options?.source) {
+ return options.source;
+ }
+ return nextAutoSend ? "chat-input" : "settings-test";
+ }
+
+ function shouldRestartBrowserRecognition() {
+ return shouldAutoRestartBrowserRecognition({
+ userRequested: browserRecognitionSessionRequested,
+ manuallyStopped: manualBrowserRecognitionStop,
+ enabled: enabled.value,
+ supported: supported.value,
+ useBrowserRecognition: useBrowserRecognition.value,
+ lastErrorCode: lastBrowserRecognitionErrorCode,
+ });
+ }
+
+ function clearRecognitionRestartTimer() {
+ if (typeof window === "undefined") return;
+ if (recognitionRestartTimer) {
+ window.clearTimeout(recognitionRestartTimer);
+ recognitionRestartTimer = null;
+ }
+ }
+
+ function startBrowserRecognitionSession() {
+ const recognizer = ensureRecognition();
+ if (!recognizer) return;
+ recognizer.lang = normalizeTranscriptionLanguage(language.value);
+ try {
+ recognizer.start();
+ } catch (err) {
+ const name = err instanceof DOMException ? err.name : "";
+ if (name === "InvalidStateError") {
+ listening.value = true;
+ return;
+ }
+ error.value = err instanceof Error ? err.message : "Speech recognition error.";
+ listening.value = false;
+ }
+ }
+
+ function scheduleBrowserRecognitionRestart() {
+ if (typeof window === "undefined") return;
+ clearRecognitionRestartTimer();
+ recognitionRestartTimer = window.setTimeout(() => {
+ recognitionRestartTimer = null;
+ if (!shouldRestartBrowserRecognition()) {
+ listening.value = false;
+ return;
+ }
+ startBrowserRecognitionSession();
+ }, BROWSER_RECOGNITION_RESTART_DELAY_MS);
+ }
+
function getRecognitionCtor(): SpeechRecognitionCtor | null {
if (typeof window === "undefined") return null;
return (window.SpeechRecognition || window.webkitSpeechRecognition) as SpeechRecognitionCtor;
@@ -136,18 +238,26 @@ export const useTranscriptionStore = defineStore("transcription", () => {
recognition = new Ctor();
recognition.continuous = true;
recognition.interimResults = true;
- recognition.lang = language.value;
+ recognition.lang = normalizeTranscriptionLanguage(language.value);
recognition.onstart = () => {
listening.value = true;
error.value = null;
+ lastBrowserRecognitionErrorCode = null;
};
recognition.onend = () => {
+ if (shouldRestartBrowserRecognition()) {
+ listening.value = true;
+ scheduleBrowserRecognitionRestart();
+ return;
+ }
listening.value = false;
};
recognition.onerror = (event) => {
+ lastBrowserRecognitionErrorCode =
+ typeof event.error === "string" ? event.error : null;
error.value = event.error || "Speech recognition error.";
listening.value = false;
};
@@ -166,13 +276,9 @@ export const useTranscriptionStore = defineStore("transcription", () => {
}
}
- interimText.value = interim.trim();
+ interimText.value = sanitizeTranscript(interim);
if (finalText.trim()) {
- lastTranscript.value = finalText.trim();
- interimText.value = "";
- if (autoSend.value) {
- chatStore.send(lastTranscript.value);
- }
+ applyTranscript(finalText);
}
};
@@ -217,7 +323,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
}
}
- async function startListening() {
+ async function startListening(options?: StartListeningOptions) {
if (!canListen.value) {
error.value = useBrowserRecognition.value
? "Speech recognition is not supported in this environment."
@@ -225,6 +331,17 @@ export const useTranscriptionStore = defineStore("transcription", () => {
return;
}
+ if (!enabled.value) {
+ enabled.value = true;
+ }
+ const nextAutoSend = resolveStartAutoSend(options);
+ activeAutoSend.value = nextAutoSend;
+ listeningSource.value = resolveStartSource(options, nextAutoSend);
+
+ if (listening.value) {
+ return;
+ }
+
listening.value = true;
if (!hearingStore.enabled) {
@@ -233,15 +350,12 @@ export const useTranscriptionStore = defineStore("transcription", () => {
}
await hearingStore.start();
- if (!enabled.value) {
- return;
- }
-
if (useBrowserRecognition.value && supported.value) {
- const recognizer = ensureRecognition();
- if (!recognizer) return;
- recognizer.lang = language.value;
- recognizer.start();
+ browserRecognitionSessionRequested = true;
+ manualBrowserRecognitionStop = false;
+ lastBrowserRecognitionErrorCode = null;
+ clearRecognitionRestartTimer();
+ startBrowserRecognitionSession();
return;
}
@@ -249,18 +363,29 @@ export const useTranscriptionStore = defineStore("transcription", () => {
}
async function stopListening() {
+ const wasSettingsTest = listeningSource.value === "settings-test";
+ const wasChatInput = listeningSource.value === "chat-input";
listening.value = false;
if (useBrowserRecognition.value) {
+ browserRecognitionSessionRequested = false;
+ manualBrowserRecognitionStop = true;
+ clearRecognitionRestartTimer();
recognition?.stop();
} else {
await stopVad();
}
- if (restoreHearingEnabled === false) {
+ if (wasSettingsTest || wasChatInput) {
+ hearingStore.stopSpeechDetection();
+ hearingStore.stop();
+ hearingStore.enabled = false;
+ } else if (restoreHearingEnabled === false) {
hearingStore.enabled = false;
}
restoreHearingEnabled = null;
+ activeAutoSend.value = Boolean(autoSend.value);
+ listeningSource.value = null;
}
function resolveRecorderMimeType() {
@@ -279,9 +404,9 @@ export const useTranscriptionStore = defineStore("transcription", () => {
error.value = null;
if (!navigator.mediaDevices?.getUserMedia) {
error.value = "Microphone is not supported in this environment.";
- return;
+ return false;
}
- if (recorder) return;
+ if (recorder) return true;
const constraints: MediaStreamConstraints = {
audio: hearingStore.selectedDeviceId
@@ -292,7 +417,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
recorderStream = await navigator.mediaDevices.getUserMedia(constraints);
} catch (err) {
error.value = err instanceof Error ? err.message : "Failed to access microphone.";
- return;
+ return false;
}
const mimeType = resolveRecorderMimeType();
@@ -305,7 +430,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
} catch (err) {
error.value = err instanceof Error ? err.message : "Failed to start recorder.";
cleanupRecorder();
- return;
+ return false;
}
recorder.ondataavailable = (event) => {
@@ -323,6 +448,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
error.value = detail?.message || "Recorder error.";
};
recorder.start();
+ return true;
}
async function stopRecording() {
@@ -383,11 +509,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
});
const transcript = extractTranscript(payload);
if (transcript) {
- lastTranscript.value = transcript;
- interimText.value = "";
- if (autoSend.value) {
- chatStore.send(transcript);
- }
+ applyTranscript(transcript);
}
} catch (err) {
error.value = err instanceof Error ? err.message : "Transcription failed.";
@@ -402,9 +524,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
if (model) {
config.model = model;
}
- if (language.value) {
- config.language = language.value;
- }
+ config.language = normalizeTranscriptionLanguage(language.value);
const extension = resolveExtension(mimeType);
config.filename = extension ? `audio.${extension}` : "audio.wav";
config.content_type = mimeType || "application/octet-stream";
@@ -422,9 +542,9 @@ export const useTranscriptionStore = defineStore("transcription", () => {
}
function extractTranscript(payload: Record<string, any>) {
- if (typeof payload.text === "string") return payload.text.trim();
+ if (typeof payload.text === "string") return sanitizeTranscript(payload.text);
const data = payload.data;
- if (data && typeof data.text === "string") return data.text.trim();
+ if (data && typeof data.text === "string") return sanitizeTranscript(data.text);
return "";
}
@@ -444,16 +564,33 @@ export const useTranscriptionStore = defineStore("transcription", () => {
if (vadActive.value) return;
vadActive.value = true;
listening.value = true;
+ error.value = null;
try {
- await hearingStore.startSpeechDetection({
- minSpeechMs: minSpeechMs.value,
- });
- if (workletAvailable.value) {
- await ensureCaptureSession();
+ // Always use local volume-threshold detection to avoid external VAD runtime/CDN dependency.
+ hearingStore.stopSpeechDetection();
+ await hearingStore.start();
+ if (workletAvailable.value && !workletCaptureDisabled) {
+ try {
+ await ensureCaptureSession();
+ } catch (err) {
+ const fallback = decideCaptureFallback({
+ workletError: err,
+ mediaRecorderSupported: recordingAvailable.value,
+ });
+ if (fallback.mode === "none") {
+ throw new Error(fallback.error || "Failed to start microphone listening.");
+ }
+ workletCaptureDisabled = true;
+ }
}
} catch (err) {
- error.value = err instanceof Error ? err.message : "Failed to start VAD.";
+ hearingStore.stopSpeechDetection();
+ error.value =
+ err instanceof Error
+ ? err.message
+ : "Failed to start microphone listening.";
+ listening.value = false;
}
}
@@ -468,6 +605,17 @@ export const useTranscriptionStore = defineStore("transcription", () => {
await stopCapture();
}
+ async function startMediaCapture() {
+ const started = await startRecording();
+ if (!started) {
+ captureActive = false;
+ captureMode = null;
+ return false;
+ }
+ captureMode = "media";
+ return true;
+ }
+
async function startCapture() {
if (captureActive) return;
captureStartedAt = Date.now();
@@ -477,8 +625,24 @@ export const useTranscriptionStore = defineStore("transcription", () => {
streamReady = false;
captureActive = true;
- if (workletAvailable.value) {
- await ensureCaptureSession();
+ if (workletAvailable.value && !workletCaptureDisabled) {
+ try {
+ await ensureCaptureSession();
+ } catch (err) {
+ const fallback = decideCaptureFallback({
+ workletError: err,
+ mediaRecorderSupported: recordingAvailable.value,
+ });
+ if (fallback.mode === "none") {
+ error.value = fallback.error || "Recording is not supported in this environment.";
+ captureActive = false;
+ captureMode = null;
+ return;
+ }
+ workletCaptureDisabled = true;
+ await startMediaCapture();
+ return;
+ }
captureMode = "worklet";
if (useStreamingTransport.value) {
try {
@@ -506,15 +670,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
}
return;
}
-
- if (!recordingAvailable.value) {
- error.value = "Recording is not supported in this environment.";
- captureActive = false;
- return;
- }
-
- captureMode = "media";
- await startRecording();
+ await startMediaCapture();
}
async function stopCapture() {
@@ -538,11 +694,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
const payload = await streamConnection.result;
const transcript = extractTranscript(payload);
if (transcript) {
- lastTranscript.value = transcript;
- interimText.value = "";
- if (autoSend.value) {
- chatStore.send(transcript);
- }
+ applyTranscript(transcript);
}
streamSucceeded = true;
} catch (err) {
@@ -566,11 +718,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
});
const transcript = extractTranscript(payload);
if (transcript) {
- lastTranscript.value = transcript;
- interimText.value = "";
- if (autoSend.value) {
- chatStore.send(transcript);
- }
+ applyTranscript(transcript);
}
} catch (err) {
error.value = err instanceof Error ? err.message : "Transcription failed.";
@@ -607,7 +755,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
watch(language, (next) => {
if (recognition) {
- recognition.lang = next;
+ recognition.lang = normalizeTranscriptionLanguage(next);
}
});
@@ -634,6 +782,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
);
onScopeDispose(() => {
+ clearRecognitionRestartTimer();
void stopListening();
recognition = null;
cleanupRecorder();
@@ -677,6 +826,7 @@ export const useTranscriptionStore = defineStore("transcription", () => {
listening,
supported,
canListen,
+ listeningSource,
interimText,
lastTranscript,
error,
diff --git a/frontend/packages/app-core/src/utils/browser-recognition-restart.test.ts b/frontend/packages/app-core/src/utils/browser-recognition-restart.test.ts
new file mode 100644
index 0000000..ec2b29b
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/browser-recognition-restart.test.ts
@@ -0,0 +1,56 @@
+import assert from "node:assert/strict";
+
+import { shouldAutoRestartBrowserRecognition } from "./browser-recognition-restart.ts";
+
+function run(name: string, fn: () => void) {
+ try {
+ fn();
+ console.info(`PASS ${name}`);
+ } catch (error) {
+ console.error(`FAIL ${name}`);
+ throw error;
+ }
+}
+
+run("restarts when user session is active and no fatal error", () => {
+ assert.equal(
+ shouldAutoRestartBrowserRecognition({
+ userRequested: true,
+ manuallyStopped: false,
+ enabled: true,
+ supported: true,
+ useBrowserRecognition: true,
+ lastErrorCode: null,
+ }),
+ true
+ );
+});
+
+run("does not restart after manual stop", () => {
+ assert.equal(
+ shouldAutoRestartBrowserRecognition({
+ userRequested: true,
+ manuallyStopped: true,
+ enabled: true,
+ supported: true,
+ useBrowserRecognition: true,
+ lastErrorCode: null,
+ }),
+ false
+ );
+});
+
+run("does not restart on microphone permission denial", () => {
+ assert.equal(
+ shouldAutoRestartBrowserRecognition({
+ userRequested: true,
+ manuallyStopped: false,
+ enabled: true,
+ supported: true,
+ useBrowserRecognition: true,
+ lastErrorCode: "not-allowed",
+ }),
+ false
+ );
+});
+
diff --git a/frontend/packages/app-core/src/utils/browser-recognition-restart.ts b/frontend/packages/app-core/src/utils/browser-recognition-restart.ts
new file mode 100644
index 0000000..644f176
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/browser-recognition-restart.ts
@@ -0,0 +1,23 @@
+const NON_RESTARTABLE_ERRORS = new Set([
+ "not-allowed",
+ "service-not-allowed",
+ "audio-capture",
+]);
+
+type AutoRestartDecision = {
+ userRequested: boolean;
+ manuallyStopped: boolean;
+ enabled: boolean;
+ supported: boolean;
+ useBrowserRecognition: boolean;
+ lastErrorCode?: string | null;
+};
+
+export function shouldAutoRestartBrowserRecognition(options: AutoRestartDecision) {
+ if (!options.userRequested) return false;
+ if (options.manuallyStopped) return false;
+ if (!options.enabled || !options.supported || !options.useBrowserRecognition) return false;
+ if (!options.lastErrorCode) return true;
+ return !NON_RESTARTABLE_ERRORS.has(options.lastErrorCode);
+}
+
diff --git a/frontend/packages/app-core/src/utils/capture-startup.test.ts b/frontend/packages/app-core/src/utils/capture-startup.test.ts
new file mode 100644
index 0000000..5ccea7a
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/capture-startup.test.ts
@@ -0,0 +1,63 @@
+import {
+ decideCaptureFallback,
+ type CaptureFallbackDecision,
+} from "./capture-startup.ts";
+
+function run(name: string, fn: () => void) {
+ try {
+ fn();
+ console.info(`PASS ${name}`);
+ } catch (error) {
+ console.error(`FAIL ${name}`);
+ throw error;
+ }
+}
+
+function expectDecision(
+ actual: CaptureFallbackDecision,
+ expected: CaptureFallbackDecision
+) {
+ const actualText = JSON.stringify(actual);
+ const expectedText = JSON.stringify(expected);
+ if (actualText !== expectedText) {
+ throw new Error(`Expected ${expectedText} but received ${actualText}`);
+ }
+}
+
+function expectEqual<T>(actual: T, expected: T) {
+ if (actual !== expected) {
+ throw new Error(`Expected ${String(expected)} but received ${String(actual)}`);
+ }
+}
+
+run("falls back to media recorder when worklet init fails and media is supported", () => {
+ const decision = decideCaptureFallback({
+ workletError: new Error("worklet addModule failed"),
+ mediaRecorderSupported: true,
+ });
+
+ expectDecision(decision, {
+ mode: "media",
+ error: null,
+ });
+});
+
+run("returns actionable error when no fallback transport is available", () => {
+ const decision = decideCaptureFallback({
+ workletError: new Error("worklet addModule failed"),
+ mediaRecorderSupported: false,
+ });
+
+ expectEqual(decision.mode, "none");
+ expectEqual(decision.error, "worklet addModule failed");
+});
+
+run("normalizes non-error throw values", () => {
+ const decision = decideCaptureFallback({
+ workletError: "AudioWorklet is unavailable",
+ mediaRecorderSupported: false,
+ });
+
+ expectEqual(decision.mode, "none");
+ expectEqual(decision.error, "AudioWorklet is unavailable");
+});
diff --git a/frontend/packages/app-core/src/utils/capture-startup.ts b/frontend/packages/app-core/src/utils/capture-startup.ts
new file mode 100644
index 0000000..add6ff2
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/capture-startup.ts
@@ -0,0 +1,35 @@
+export type CaptureFallbackDecision = {
+ mode: "media" | "none";
+ error: string | null;
+};
+
+type CaptureFallbackInput = {
+ workletError: unknown;
+ mediaRecorderSupported: boolean;
+};
+
+function normalizeError(error: unknown) {
+ if (error instanceof Error && error.message.trim()) {
+ return error.message.trim();
+ }
+ if (typeof error === "string" && error.trim()) {
+ return error.trim();
+ }
+ return "Audio capture initialization failed.";
+}
+
+export function decideCaptureFallback(
+ input: CaptureFallbackInput
+): CaptureFallbackDecision {
+ if (input.mediaRecorderSupported) {
+ return {
+ mode: "media",
+ error: null,
+ };
+ }
+
+ return {
+ mode: "none",
+ error: normalizeError(input.workletError),
+ };
+}
diff --git a/frontend/packages/app-core/src/utils/provider-fields.test.ts b/frontend/packages/app-core/src/utils/provider-fields.test.ts
new file mode 100644
index 0000000..49c71db
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/provider-fields.test.ts
@@ -0,0 +1,61 @@
+import assert from "node:assert/strict";
+
+import type { ProviderCatalogEntry } from "../data/provider-catalog.ts";
+import { filterProviderFields } from "./provider-fields.ts";
+
+function run(name: string, fn: () => void) {
+ try {
+ fn();
+ console.info(`PASS ${name}`);
+ } catch (error) {
+ console.error(`FAIL ${name}`);
+ throw error;
+ }
+}
+
+function fieldIds(option: ProviderCatalogEntry) {
+ return filterProviderFields(option).map((field) => field.id);
+}
+
+run("hides baseUrl when provider has default baseUrl in defaults", () => {
+ const option: ProviderCatalogEntry = {
+ id: "openai-compatible",
+ label: "OpenAI Compatible",
+ category: "chat",
+ defaults: {
+ baseUrl: "https://api.example.com/v1/",
+ },
+ fields: [
+ { id: "apiKey", label: "API Key", type: "secret" },
+ { id: "baseUrl", label: "Base URL", type: "text" },
+ { id: "model", label: "Model", type: "select" },
+ ],
+ };
+ assert.deepEqual(fieldIds(option), ["apiKey", "model"]);
+});
+
+run("hides baseUrl when baseUrl field itself has default", () => {
+ const option: ProviderCatalogEntry = {
+ id: "custom-provider",
+ label: "Custom",
+ category: "speech",
+ fields: [
+ { id: "apiKey", label: "API Key", type: "secret" },
+ { id: "baseUrl", label: "Base URL", type: "text", default: "https://tts.example.com/" },
+ ],
+ };
+ assert.deepEqual(fieldIds(option), ["apiKey"]);
+});
+
+run("keeps baseUrl when provider has no default baseUrl", () => {
+ const option: ProviderCatalogEntry = {
+ id: "manual-base-url",
+ label: "Manual Base URL",
+ category: "transcription",
+ fields: [
+ { id: "apiKey", label: "API Key", type: "secret" },
+ { id: "baseUrl", label: "Base URL", type: "text" },
+ ],
+ };
+ assert.deepEqual(fieldIds(option), ["apiKey", "baseUrl"]);
+});
diff --git a/frontend/packages/app-core/src/utils/provider-fields.ts b/frontend/packages/app-core/src/utils/provider-fields.ts
new file mode 100644
index 0000000..0ebf0db
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/provider-fields.ts
@@ -0,0 +1,23 @@
+import type { ProviderCatalogEntry, ProviderField } from "../data/provider-catalog";
+
+function resolveFieldDefault(field?: ProviderField) {
+ if (!field || field.default === undefined || field.default === null) return "";
+ return String(field.default).trim();
+}
+
+function hasDefaultBaseUrl(option?: ProviderCatalogEntry) {
+ const defaultBaseUrl = option?.defaults?.baseUrl?.trim();
+ if (defaultBaseUrl) {
+ return true;
+ }
+ const baseUrlField = option?.fields?.find((field) => field.id === "baseUrl");
+ return Boolean(resolveFieldDefault(baseUrlField));
+}
+
+export function filterProviderFields(option?: ProviderCatalogEntry): ProviderField[] {
+ const fields = option?.fields ?? [];
+ if (!hasDefaultBaseUrl(option)) {
+ return fields;
+ }
+ return fields.filter((field) => field.id !== "baseUrl");
+}
diff --git a/frontend/packages/app-core/src/utils/provider-visibility.test.ts b/frontend/packages/app-core/src/utils/provider-visibility.test.ts
new file mode 100644
index 0000000..e110419
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/provider-visibility.test.ts
@@ -0,0 +1,42 @@
+import assert from "node:assert/strict";
+
+import {
+ filterVisibleSpeechProviders,
+ isVisibleSpeechProviderId,
+} from "./provider-visibility.ts";
+
+function run(name: string, fn: () => void) {
+ try {
+ fn();
+ console.info(`PASS ${name}`);
+ } catch (error) {
+ console.error(`FAIL ${name}`);
+ throw error;
+ }
+}
+
+run("only configured speech providers are visible", () => {
+ assert.equal(isVisibleSpeechProviderId("volcengine-speech"), true);
+ assert.equal(isVisibleSpeechProviderId("alibaba-cloud-model-studio-speech"), true);
+ assert.equal(isVisibleSpeechProviderId("browser-local-audio-speech"), true);
+ assert.equal(isVisibleSpeechProviderId("app-local-audio-speech"), true);
+ assert.equal(isVisibleSpeechProviderId("openai-audio-speech"), false);
+ assert.equal(isVisibleSpeechProviderId("elevenlabs"), false);
+});
+
+run("filters unsupported speech provider ids", () => {
+ assert.deepEqual(
+ filterVisibleSpeechProviders([
+ "openai-audio-speech",
+ "volcengine-speech",
+ "alibaba-cloud-model-studio-speech",
+ "elevenlabs",
+ "browser-local-audio-speech",
+ ]),
+ [
+ "volcengine-speech",
+ "alibaba-cloud-model-studio-speech",
+ "browser-local-audio-speech",
+ ]
+ );
+});
diff --git a/frontend/packages/app-core/src/utils/provider-visibility.ts b/frontend/packages/app-core/src/utils/provider-visibility.ts
new file mode 100644
index 0000000..9399749
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/provider-visibility.ts
@@ -0,0 +1,14 @@
+const visibleSpeechProviderIds = new Set([
+ "volcengine-speech",
+ "alibaba-cloud-model-studio-speech",
+ "browser-local-audio-speech",
+ "app-local-audio-speech",
+]);
+
+export function isVisibleSpeechProviderId(providerId: string) {
+ return visibleSpeechProviderIds.has(providerId);
+}
+
+export function filterVisibleSpeechProviders(providerIds: string[]) {
+ return providerIds.filter((providerId) => isVisibleSpeechProviderId(providerId));
+}
diff --git a/frontend/packages/app-core/src/utils/transcript-filter.test.ts b/frontend/packages/app-core/src/utils/transcript-filter.test.ts
new file mode 100644
index 0000000..0fc07e6
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/transcript-filter.test.ts
@@ -0,0 +1,23 @@
+import assert from "node:assert/strict";
+
+import { sanitizeTranscript } from "./transcript-filter.ts";
+
+function run(name: string, fn: () => void) {
+ try {
+ fn();
+ console.info(`PASS ${name}`);
+ } catch (error) {
+ console.error(`FAIL ${name}`);
+ throw error;
+ }
+}
+
+run("drops windows absolute image path transcript", () => {
+ const transcript = String.raw`C:\Users\ADMIN\Documents\WeChat Files\wxid_b2orpigekka622\FileStorage\Temp\1772262785183.jpg`;
+ assert.equal(sanitizeTranscript(transcript), "");
+});
+
+run("keeps normal natural language transcript", () => {
+ assert.equal(sanitizeTranscript("你好,这是语音测试。"), "你好,这是语音测试。");
+});
+
diff --git a/frontend/packages/app-core/src/utils/transcript-filter.ts b/frontend/packages/app-core/src/utils/transcript-filter.ts
new file mode 100644
index 0000000..8080c64
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/transcript-filter.ts
@@ -0,0 +1,17 @@
+const WINDOWS_ABSOLUTE_PATH_RE =
+ /^[a-zA-Z]:\\(?:[^\\/:*?"<>|\r\n]+\\)*[^\\/:*?"<>|\r\n]+$/;
+
+export function sanitizeTranscript(value: string) {
+ const trimmed = value.trim();
+ if (!trimmed) {
+ return "";
+ }
+
+ const normalizedPathCandidate = trimmed.replace(/\//g, "\\");
+ if (WINDOWS_ABSOLUTE_PATH_RE.test(normalizedPathCandidate)) {
+ return "";
+ }
+
+ return trimmed;
+}
+
diff --git a/frontend/packages/app-core/src/utils/transcription-language.test.ts b/frontend/packages/app-core/src/utils/transcription-language.test.ts
new file mode 100644
index 0000000..fd7e4ab
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/transcription-language.test.ts
@@ -0,0 +1,40 @@
+import {
+ normalizeTranscriptionLanguage,
+ resolveInitialTranscriptionLanguage,
+} from "./transcription-language.ts";
+
+function run(name: string, fn: () => void) {
+ try {
+ fn();
+ console.info(`PASS ${name}`);
+ } catch (error) {
+ console.error(`FAIL ${name}`);
+ throw error;
+ }
+}
+
+function expectEqual<T>(actual: T, expected: T) {
+ if (actual !== expected) {
+ throw new Error(`Expected ${String(expected)} but received ${String(actual)}`);
+ }
+}
+
+run("normalizes short zh language token", () => {
+ expectEqual(normalizeTranscriptionLanguage("zh"), "zh-CN");
+});
+
+run("normalizes short en language token", () => {
+ expectEqual(normalizeTranscriptionLanguage("en"), "en-US");
+});
+
+run("keeps specific locale value", () => {
+ expectEqual(normalizeTranscriptionLanguage("ja-JP"), "ja-JP");
+});
+
+run("falls back to english when language is missing", () => {
+ expectEqual(resolveInitialTranscriptionLanguage(undefined), "en-US");
+});
+
+run("uses navigator language when available", () => {
+ expectEqual(resolveInitialTranscriptionLanguage("zh-CN"), "zh-CN");
+});
diff --git a/frontend/packages/app-core/src/utils/transcription-language.ts b/frontend/packages/app-core/src/utils/transcription-language.ts
new file mode 100644
index 0000000..3acd1ae
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/transcription-language.ts
@@ -0,0 +1,25 @@
+const DEFAULT_TRANSCRIPTION_LANGUAGE = "en-US";
+
+function normalizeLocaleToken(value: string) {
+ return value.trim().replace("_", "-");
+}
+
+export function normalizeTranscriptionLanguage(language: unknown) {
+ if (typeof language !== "string") {
+ return DEFAULT_TRANSCRIPTION_LANGUAGE;
+ }
+ const normalized = normalizeLocaleToken(language);
+ if (!normalized) {
+ return DEFAULT_TRANSCRIPTION_LANGUAGE;
+ }
+ const lower = normalized.toLowerCase();
+ if (lower === "zh") return "zh-CN";
+ if (lower === "en") return "en-US";
+ return normalized;
+}
+
+export function resolveInitialTranscriptionLanguage(
+ navigatorLanguage?: string | null
+) {
+ return normalizeTranscriptionLanguage(navigatorLanguage);
+}
diff --git a/frontend/packages/app-core/src/utils/tts-chunker.test.ts b/frontend/packages/app-core/src/utils/tts-chunker.test.ts
new file mode 100644
index 0000000..64f610a
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/tts-chunker.test.ts
@@ -0,0 +1,51 @@
+import assert from "node:assert/strict";
+
+import {
+ chunkTtsInput,
+ toSpeakableTtsChunks,
+ TTS_FLUSH_INSTRUCTION,
+ TTS_SPECIAL_TOKEN,
+} from "./tts-chunker.ts";
+
+function run(name: string, fn: () => void) {
+ try {
+ fn();
+ console.info(`PASS ${name}`);
+ } catch (error) {
+ console.error(`FAIL ${name}`);
+ throw error;
+ }
+}
+
+run("splits on hard punctuation", () => {
+ const chunks = toSpeakableTtsChunks("你好。世界。");
+ assert.deepEqual(chunks, ["你好。", "世界。"]);
+});
+
+run("keeps decimal punctuation in numbers", () => {
+ const chunks = toSpeakableTtsChunks("价格是2.5,不是25。");
+ assert.equal(chunks.join(""), "价格是2.5,不是25。");
+});
+
+run("normalizes three dots into ellipsis", () => {
+ const chunks = toSpeakableTtsChunks("等等...快点。");
+ assert.ok(chunks.join("").includes("…"));
+ assert.equal(chunks.join("").includes("..."), false);
+});
+
+run("emits special reason when special token appears", () => {
+ const chunks = chunkTtsInput(`前缀${TTS_SPECIAL_TOKEN}后缀。`);
+ assert.equal(chunks[0]?.reason, "special");
+ assert.equal(chunks[0]?.text, "前缀");
+});
+
+run("emits standalone special chunk when buffer is empty", () => {
+ const chunks = chunkTtsInput(`${TTS_SPECIAL_TOKEN}你好。`);
+ assert.equal(chunks[0]?.reason, "special");
+ assert.equal(chunks[0]?.text, "");
+});
+
+run("flush instruction forces chunk boundary and is stripped for TTS text", () => {
+ const chunks = toSpeakableTtsChunks(`第一句${TTS_FLUSH_INSTRUCTION}第二句。`);
+ assert.deepEqual(chunks, ["第一句", "第二句。"]);
+});
diff --git a/frontend/packages/app-core/src/utils/tts-chunker.ts b/frontend/packages/app-core/src/utils/tts-chunker.ts
new file mode 100644
index 0000000..f1b793b
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/tts-chunker.ts
@@ -0,0 +1,243 @@
+export const TTS_FLUSH_INSTRUCTION = "\u200B";
+export const TTS_SPECIAL_TOKEN = "\u2063";
+
+const keptPunctuations = new Set(["?", "?", "!", "!"]);
+const hardPunctuations = new Set([
+ ".",
+ "。",
+ "?",
+ "?",
+ "!",
+ "!",
+ "…",
+ "⋯",
+ "~",
+ "~",
+ "\n",
+ "\t",
+ "\r",
+]);
+const softPunctuations = new Set([
+ ",",
+ ",",
+ "、",
+ "–",
+ "—",
+ ":",
+ ":",
+ ";",
+ ";",
+ "《",
+ "》",
+ "「",
+ "」",
+]);
+
+export type TtsChunkReason = "boost" | "limit" | "hard" | "flush" | "special";
+
+export interface TtsInputChunk {
+ text: string;
+ words: number;
+ reason: TtsChunkReason;
+}
+
+export interface TtsInputChunkOptions {
+ boost?: number;
+ minimumWords?: number;
+ maximumWords?: number;
+}
+
+type SegmentLike = { segment?: string; isWordLike?: boolean };
+type SegmenterLike = { segment: (input: string) => Iterable<SegmentLike> };
+
+function createSegmenter(granularity: "word" | "grapheme"): SegmenterLike | null {
+ const SegmenterCtor = (Intl as any)?.Segmenter as
+ | (new (locales?: string | string[], options?: { granularity: string }) => SegmenterLike)
+ | undefined;
+ if (!SegmenterCtor) return null;
+ try {
+ return new SegmenterCtor(undefined, { granularity });
+ } catch {
+ return null;
+ }
+}
+
+function splitGraphemes(text: string, segmenter: SegmenterLike | null) {
+ if (!text) return [];
+ if (!segmenter) {
+ return Array.from(text);
+ }
+ const units: string[] = [];
+ for (const token of segmenter.segment(text)) {
+ if (typeof token?.segment === "string" && token.segment.length > 0) {
+ units.push(token.segment);
+ }
+ }
+ return units.length > 0 ? units : Array.from(text);
+}
+
+function countWordLike(text: string, segmenter: SegmenterLike | null) {
+ if (!text) return 0;
+ if (!segmenter) {
+ const matched = text.match(/[A-Za-z0-9\u4e00-\u9fff]+/g);
+ return matched?.length ?? 0;
+ }
+ let count = 0;
+ for (const token of segmenter.segment(text)) {
+ if (token?.isWordLike) {
+ count += 1;
+ }
+ }
+ return count;
+}
+
+export function sanitizeTtsChunk(text: string) {
+ return text
+ .replaceAll(TTS_SPECIAL_TOKEN, "")
+ .replaceAll(TTS_FLUSH_INSTRUCTION, "")
+ .trim();
+}
+
+export function chunkTtsInput(
+ inputText: string,
+ options?: TtsInputChunkOptions
+): TtsInputChunk[] {
+ const { boost = 2, minimumWords = 4, maximumWords = 12 } = options ?? {};
+ const source = inputText.trim();
+ if (!source) return [];
+
+ const graphemeSegmenter = createSegmenter("grapheme");
+ const wordSegmenter = createSegmenter("word");
+ const input = splitGraphemes(source, graphemeSegmenter);
+
+ const chunks: TtsInputChunk[] = [];
+ let yieldCount = 0;
+ let buffer = "";
+ let chunk = "";
+ let chunkWordsCount = 0;
+ let previousValue: string | undefined;
+ let index = 0;
+
+ while (index < input.length) {
+ let value = input[index];
+
+ if (value.length > 1) {
+ previousValue = value;
+ index += 1;
+ continue;
+ }
+
+ const flush = value === TTS_FLUSH_INSTRUCTION;
+ const special = value === TTS_SPECIAL_TOKEN;
+ const hard = hardPunctuations.has(value);
+ const soft = softPunctuations.has(value);
+ const kept = keptPunctuations.has(value);
+ let consumed = 1;
+
+ if (flush || special || hard || soft) {
+ switch (value) {
+ case ".":
+ case ",": {
+ if (previousValue !== undefined && /\d/.test(previousValue)) {
+ const nextValue = input[index + 1];
+ if (nextValue && /\d/.test(nextValue)) {
+ buffer += value;
+ previousValue = value;
+ index += consumed;
+ continue;
+ }
+ } else if (value === ".") {
+ const nextValue = input[index + 1];
+ const afterNextValue = input[index + 2];
+ if (nextValue === "." && afterNextValue === ".") {
+ value = "…";
+ consumed = 3;
+ }
+ }
+ break;
+ }
+ }
+
+ if (buffer.length === 0) {
+ if (special) {
+ chunks.push({
+ text: "",
+ words: 0,
+ reason: "special",
+ });
+ yieldCount += 1;
+ chunkWordsCount = 0;
+ }
+
+ previousValue = value;
+ index += consumed;
+ continue;
+ }
+
+ const words = countWordLike(buffer, wordSegmenter);
+
+ if (chunkWordsCount > minimumWords && chunkWordsCount + words > maximumWords) {
+ const text = kept ? `${chunk.trim()}${value}` : chunk.trim();
+ chunks.push({
+ text,
+ words: chunkWordsCount,
+ reason: "limit",
+ });
+ yieldCount += 1;
+ chunk = "";
+ chunkWordsCount = 0;
+ }
+
+ chunk += buffer + value;
+ chunkWordsCount += words;
+ buffer = "";
+
+ if (special) {
+ chunks.push({
+ text: chunk.slice(0, -1).trim(),
+ words: chunkWordsCount,
+ reason: "special",
+ });
+ yieldCount += 1;
+ chunk = "";
+ chunkWordsCount = 0;
+ } else if (flush || hard || chunkWordsCount > maximumWords || yieldCount < boost) {
+ chunks.push({
+ text: chunk.trim(),
+ words: chunkWordsCount,
+ reason: flush ? "flush" : hard ? "hard" : chunkWordsCount > maximumWords ? "limit" : "boost",
+ });
+ yieldCount += 1;
+ chunk = "";
+ chunkWordsCount = 0;
+ }
+
+ previousValue = value;
+ index += consumed;
+ continue;
+ }
+
+ buffer += value;
+ previousValue = value;
+ index += 1;
+ }
+
+ if (chunk.length > 0 || buffer.length > 0) {
+ chunks.push({
+ text: (chunk + buffer).trim(),
+ words: chunkWordsCount + countWordLike(buffer, wordSegmenter),
+ reason: "flush",
+ });
+ }
+
+ return chunks;
+}
+
+export function toSpeakableTtsChunks(
+ inputText: string,
+ options?: TtsInputChunkOptions
+) {
+ return chunkTtsInput(inputText, options)
+ .map((item) => sanitizeTtsChunk(item.text))
+ .filter((text) => text.length > 0);
+}
diff --git a/frontend/packages/app-core/src/utils/tts-direct-request.ts b/frontend/packages/app-core/src/utils/tts-direct-request.ts
new file mode 100644
index 0000000..0c476b6
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/tts-direct-request.ts
@@ -0,0 +1,231 @@
+type AudioRequestConfig = Record<string, unknown>;
+
+type BackendTtsPayload = {
+ engine: string;
+ data: string;
+ config: Record<string, unknown>;
+};
+
+type LegacyTtsPayload = {
+ text: string;
+ engine: string;
+ providerId?: string;
+ provider_id?: string;
+ config: Record<string, unknown>;
+};
+
+export type DirectTtsHttpRequest = {
+ url: string;
+ headers: Record<string, string>;
+ body: BackendTtsPayload;
+};
+
+export type LegacyTtsHttpRequest = {
+ url: string;
+ headers: Record<string, string>;
+ body: LegacyTtsPayload;
+};
+
+const allowedBackendTtsEngineIds = new Set([
+ "volcengine-speech",
+ "alibaba-cloud-model-studio-speech",
+]);
+
+function asRecord(value: unknown): Record<string, unknown> {
+ return typeof value === "object" && value !== null && !Array.isArray(value)
+ ? (value as Record<string, unknown>)
+ : {};
+}
+
+function readString(config: Record<string, unknown>, keys: string[]) {
+ for (const key of keys) {
+ const value = config[key];
+ if (typeof value === "string" && value.trim()) {
+ return value.trim();
+ }
+ }
+ return "";
+}
+
+function normalizeBackendTtsUrl(apiBaseUrl: string) {
+ const trimmed = apiBaseUrl.trim().replace(/\/+$/, "");
+ return `${trimmed}/api/tts/engines`;
+}
+
+function normalizeLegacyTtsUrl(url: string) {
+ const trimmed = url.trim().replace(/\/+$/, "");
+ if (trimmed.endsWith("/api/tts/engines")) {
+ return `${trimmed.slice(0, -"/api/tts/engines".length)}/api/tts/synthesize`;
+ }
+ return `${trimmed}/api/tts/synthesize`;
+}
+
+function resolveLegacyBackend(engineId: string) {
+ if (engineId === "volcengine-speech") return "volcengine";
+ if (engineId === "alibaba-cloud-model-studio-speech") return "alibaba";
+ return "";
+}
+
+function normalizeAlibabaModelId(model: string, engineId: string) {
+ if (engineId !== "alibaba-cloud-model-studio-speech") {
+ return model;
+ }
+ return model.replace(/^alibaba\//i, "").trim();
+}
+
+function resolveVolcengineAppId(config: Record<string, unknown>) {
+ const topLevel = readString(config, ["appId", "appid", "app_id"]);
+ if (topLevel) return topLevel;
+ const app = asRecord(config.app);
+ return readString(app, ["appId", "appid", "app_id"]);
+}
+
+function copyKnownExtras(
+ source: Record<string, unknown>,
+ target: Record<string, unknown>,
+ keys: string[]
+) {
+ keys.forEach((key) => {
+ if (Object.prototype.hasOwnProperty.call(source, key) && source[key] !== undefined) {
+ target[key] = source[key];
+ }
+ });
+}
+
+export function supportsDirectTts(engineId: string | null | undefined) {
+ if (!engineId) return false;
+ return allowedBackendTtsEngineIds.has(engineId);
+}
+
+export function buildDirectTtsHttpRequest(input: {
+ text: string;
+ engineId?: string;
+ apiBaseUrl?: string;
+ config?: AudioRequestConfig;
+}): DirectTtsHttpRequest | null {
+ const engineId = (input.engineId || "").trim();
+ if (!supportsDirectTts(engineId)) return null;
+
+ const config = asRecord(input.config);
+ const apiBaseUrl = (input.apiBaseUrl || "").trim();
+ const apiKey = readString(config, ["apiKey", "api_key"]);
+ const model = normalizeAlibabaModelId(readString(config, ["model"]), engineId);
+ const voice = readString(config, ["voice"]);
+ const text = (input.text || "").trim();
+
+ if (!apiBaseUrl || !apiKey || !model || !voice || !text) {
+ return null;
+ }
+
+ const backendConfig: Record<string, unknown> = {
+ apiKey,
+ model,
+ voice,
+ };
+
+ const responseFormat = readString(config, ["response_format", "responseFormat", "format"]);
+ if (responseFormat) {
+ backendConfig.response_format = responseFormat;
+ }
+
+ if (typeof config.speed === "number") {
+ backendConfig.speed = config.speed;
+ }
+
+ if (engineId === "volcengine-speech") {
+ const appId = resolveVolcengineAppId(config);
+ if (!appId) return null;
+ backendConfig.appId = appId;
+ copyKnownExtras(config, backendConfig, [
+ "app",
+ "audio",
+ "request",
+ "user",
+ "extra_body",
+ "extraBody",
+ ]);
+ }
+
+ if (engineId === "alibaba-cloud-model-studio-speech") {
+ copyKnownExtras(config, backendConfig, [
+ "rate",
+ "pitch",
+ "volume",
+ "sample_rate",
+ "sampleRate",
+ "extra_body",
+ "extraBody",
+ ]);
+ }
+
+ return {
+ url: normalizeBackendTtsUrl(apiBaseUrl),
+ headers: {
+ "Content-Type": "application/json",
+ },
+ body: {
+ engine: engineId,
+ data: text,
+ config: backendConfig,
+ },
+ };
+}
+
+export function buildLegacyTtsHttpRequest(input: DirectTtsHttpRequest): LegacyTtsHttpRequest {
+ const config: Record<string, unknown> = {
+ ...input.body.config,
+ };
+ const apiKey = readString(config, ["apiKey", "api_key"]);
+ if (apiKey && !readString(config, ["api_key"])) {
+ config.api_key = apiKey;
+ }
+
+ const baseUrl = readString(config, ["baseUrl", "base_url"]);
+ if (baseUrl) {
+ if (!readString(config, ["baseUrl"])) {
+ config.baseUrl = baseUrl;
+ }
+ if (!readString(config, ["base_url"])) {
+ config.base_url = baseUrl;
+ }
+ }
+
+ const backend = resolveLegacyBackend(input.body.engine);
+ if (backend && !readString(config, ["backend"])) {
+ config.backend = backend;
+ }
+
+ const model = readString(config, ["model"]);
+ if (model && !model.includes("/")) {
+ if (backend === "volcengine") {
+ config.model = `volcengine/${model}`;
+ }
+ }
+ if (backend === "alibaba") {
+ config.model = normalizeAlibabaModelId(readString(config, ["model"]), input.body.engine);
+ }
+
+ const appId = readString(config, ["appId", "appid", "app_id"]);
+ if (appId) {
+ if (!readString(config, ["appid"])) {
+ config.appid = appId;
+ }
+ if (!readString(config, ["app_id"])) {
+ config.app_id = appId;
+ }
+ }
+
+ return {
+ url: normalizeLegacyTtsUrl(input.url),
+ headers: {
+ ...input.headers,
+ },
+ body: {
+ text: input.body.data,
+ engine: input.body.engine,
+ providerId: input.body.engine,
+ provider_id: input.body.engine,
+ config,
+ },
+ };
+}
diff --git a/frontend/packages/app-core/src/utils/tts-stream-segmenter.test.ts b/frontend/packages/app-core/src/utils/tts-stream-segmenter.test.ts
new file mode 100644
index 0000000..07fbe3e
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/tts-stream-segmenter.test.ts
@@ -0,0 +1,39 @@
+import assert from "node:assert/strict";
+
+import { TtsStreamSegmenter } from "./tts-stream-segmenter.ts";
+
+function run(name: string, fn: () => void) {
+ try {
+ fn();
+ console.info(`PASS ${name}`);
+ } catch (error) {
+ console.error(`FAIL ${name}`);
+ throw error;
+ }
+}
+
+run("emits finished sentence while keeping trailing tail", () => {
+ const segmenter = new TtsStreamSegmenter();
+ segmenter.appendLiteral("你好");
+ assert.deepEqual(segmenter.drain(false), []);
+
+ segmenter.appendLiteral("。世界");
+ assert.deepEqual(segmenter.drain(false), ["你好。"]);
+
+ segmenter.appendLiteral("。");
+ assert.deepEqual(segmenter.drain(false), ["世界。"]);
+});
+
+run("special marker flushes previous literal chunk", () => {
+ const segmenter = new TtsStreamSegmenter();
+ segmenter.appendLiteral("前缀");
+ segmenter.appendSpecialMarker();
+ assert.deepEqual(segmenter.drain(false), ["前缀"]);
+});
+
+run("final drain emits tail chunk", () => {
+ const segmenter = new TtsStreamSegmenter();
+ segmenter.appendLiteral("还没结束");
+ assert.deepEqual(segmenter.drain(false), []);
+ assert.deepEqual(segmenter.drain(true), ["还没结束"]);
+});
diff --git a/frontend/packages/app-core/src/utils/tts-stream-segmenter.ts b/frontend/packages/app-core/src/utils/tts-stream-segmenter.ts
new file mode 100644
index 0000000..2c34f20
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/tts-stream-segmenter.ts
@@ -0,0 +1,64 @@
+import {
+ chunkTtsInput,
+ sanitizeTtsChunk,
+ TTS_FLUSH_INSTRUCTION,
+ TTS_SPECIAL_TOKEN,
+} from "./tts-chunker.ts";
+
+function endsWithControlMarker(text: string) {
+ return (
+ text.endsWith(TTS_FLUSH_INSTRUCTION) ||
+ text.endsWith(TTS_SPECIAL_TOKEN)
+ );
+}
+
+export class TtsStreamSegmenter {
+ private input = "";
+ private emittedCount = 0;
+
+ appendLiteral(text: string) {
+ if (!text) return;
+ this.input += text;
+ }
+
+ appendSpecialMarker() {
+ this.input += TTS_SPECIAL_TOKEN;
+ }
+
+ appendFlushMarker() {
+ this.input += TTS_FLUSH_INSTRUCTION;
+ }
+
+ reset() {
+ this.input = "";
+ this.emittedCount = 0;
+ }
+
+ drain(finalize: boolean) {
+ const chunks = chunkTtsInput(this.input);
+ if (chunks.length === 0) return [];
+
+ let emitUntil = chunks.length;
+ if (!finalize) {
+ const last = chunks[chunks.length - 1];
+ if (last?.reason === "flush" && !endsWithControlMarker(this.input)) {
+ emitUntil -= 1;
+ }
+ }
+
+ if (emitUntil <= this.emittedCount) {
+ return [];
+ }
+
+ const emitted: string[] = [];
+ for (let index = this.emittedCount; index < emitUntil; index++) {
+ const text = sanitizeTtsChunk(chunks[index]?.text ?? "");
+ if (text) {
+ emitted.push(text);
+ }
+ }
+
+ this.emittedCount = emitUntil;
+ return emitted;
+ }
+}
diff --git a/frontend/packages/app-core/src/utils/tts-streaming-runner.test.ts b/frontend/packages/app-core/src/utils/tts-streaming-runner.test.ts
new file mode 100644
index 0000000..4d65cd7
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/tts-streaming-runner.test.ts
@@ -0,0 +1,79 @@
+import assert from "node:assert/strict";
+
+import { runTtsChunkQueue, TtsChunkQueueError } from "./tts-streaming-runner.ts";
+
+async function run(name: string, fn: () => Promise<void> | void) {
+ try {
+ await fn();
+ console.info(`PASS ${name}`);
+ } catch (error) {
+ console.error(`FAIL ${name}`);
+ throw error;
+ }
+}
+
+await run("continues when one chunk fails", async () => {
+ const processed: string[] = [];
+ const result = await runTtsChunkQueue(["A", "B", "C"], async (chunk) => {
+ if (chunk === "B") {
+ const error = new Error("Bad gateway") as Error & { status?: number };
+ error.status = 502;
+ throw error;
+ }
+ processed.push(chunk);
+ });
+
+ assert.deepEqual(processed, ["A", "C"]);
+ assert.equal(result.succeeded, 2);
+ assert.equal(result.failed, 1);
+});
+
+await run("throws when every chunk fails", async () => {
+ await assert.rejects(
+ async () => {
+ await runTtsChunkQueue(["A", "B"], async () => {
+ const error = new Error("Always fail");
+ throw error;
+ });
+ },
+ /Always fail/
+ );
+});
+
+await run("does not swallow AbortError", async () => {
+ await assert.rejects(
+ async () => {
+ await runTtsChunkQueue(["A"], async () => {
+ throw new DOMException("Aborted", "AbortError");
+ });
+ },
+ (error: unknown) =>
+ error instanceof DOMException &&
+ error.name === "AbortError"
+ );
+});
+
+await run("stops on first chunk failure when stopOnError is enabled", async () => {
+ const processed: string[] = [];
+ await assert.rejects(
+ async () => {
+ await runTtsChunkQueue(
+ ["A", "B", "C"],
+ async (chunk) => {
+ if (chunk === "B") {
+ throw new Error("B failed");
+ }
+ processed.push(chunk);
+ },
+ { stopOnError: true }
+ );
+ },
+ (error: unknown) => {
+ if (!(error instanceof TtsChunkQueueError)) return false;
+ assert.equal(error.context.index, 1);
+ assert.equal(error.context.chunk, "B");
+ return true;
+ }
+ );
+ assert.deepEqual(processed, ["A"]);
+});
diff --git a/frontend/packages/app-core/src/utils/tts-streaming-runner.ts b/frontend/packages/app-core/src/utils/tts-streaming-runner.ts
new file mode 100644
index 0000000..f1b933a
--- /dev/null
+++ b/frontend/packages/app-core/src/utils/tts-streaming-runner.ts
@@ -0,0 +1,72 @@
+export interface TtsChunkQueueResult {
+ succeeded: number;
+ failed: number;
+ lastError: unknown | null;
+}
+
+export interface TtsChunkQueueContext {
+ chunk: string;
+ index: number;
+ total: number;
+}
+
+export class TtsChunkQueueError extends Error {
+ context: TtsChunkQueueContext;
+ originalError: unknown;
+
+ constructor(error: unknown, context: TtsChunkQueueContext) {
+ const message = error instanceof Error ? error.message : String(error);
+ super(`TTS chunk failed at ${context.index + 1}/${context.total}: ${message}`);
+ this.name = "TtsChunkQueueError";
+ this.context = context;
+ this.originalError = error;
+ }
+}
+
+export interface RunTtsChunkQueueOptions {
+ onChunkError?: (error: unknown, context: TtsChunkQueueContext) => void;
+ stopOnError?: boolean;
+}
+
+function isAbortError(error: unknown) {
+ return error instanceof DOMException && error.name === "AbortError";
+}
+
+export async function runTtsChunkQueue(
+ chunks: string[],
+ runChunk: (chunk: string, index: number, total: number) => Promise<void>,
+ options?: RunTtsChunkQueueOptions
+): Promise<TtsChunkQueueResult> {
+ let succeeded = 0;
+ let failed = 0;
+ let lastError: unknown | null = null;
+
+ for (let index = 0; index < chunks.length; index++) {
+ const chunk = chunks[index];
+ try {
+ await runChunk(chunk, index, chunks.length);
+ succeeded += 1;
+ } catch (error) {
+ if (isAbortError(error)) {
+ throw error;
+ }
+ failed += 1;
+ lastError = error;
+ const context = {
+ chunk,
+ index,
+ total: chunks.length,
+ };
+ options?.onChunkError?.(error, context);
+ if (options?.stopOnError) {
+ throw new TtsChunkQueueError(error, context);
+ }
+ }
+ }
+
+ if (succeeded === 0 && lastError) {
+ throw lastError;
+ }
+
+ return { succeeded, failed, lastError };
+}
diff --git a/frontend/packages/app-settings/src/sections/AudioSection.vue b/frontend/packages/app-settings/src/sections/AudioSection.vue
index bb2299d..ea47a3a 100644
--- a/frontend/packages/app-settings/src/sections/AudioSection.vue
+++ b/frontend/packages/app-settings/src/sections/AudioSection.vue
@@ -1,5 +1,5 @@
diff --git a/frontend/packages/app-settings/src/sections/ModelSection.vue b/frontend/packages/app-settings/src/sections/ModelSection.vue
index 85e458f..b22b291 100644
--- a/frontend/packages/app-settings/src/sections/ModelSection.vue
+++ b/frontend/packages/app-settings/src/sections/ModelSection.vue
@@ -122,11 +122,17 @@ function buildPanel(category: "chat" | "speech" | "transcription", providerId: s
: category === "speech"
? speechProviderId.value
: transcriptionProviderId.value;
- const targetField = providersStore
- .getProviderFields(updatedProviderId)
- .find((field) => field.id === fieldId);
+ const providerFields = providersStore.getProviderFields(updatedProviderId);
+ const targetField = providerFields.find((field) => field.id === fieldId);
if (!targetField) return;
providersStore.setProviderFieldValue(updatedProviderId, targetField, value);
+ if (category === "speech" && fieldId === "model") {
+ const voiceField = providerFields.find((field) => field.id === "voice");
+ if (voiceField) {
+ providersStore.setProviderFieldValue(updatedProviderId, voiceField, "");
+ }
+ void providersStore.refreshProvider(updatedProviderId);
+ }
},
};
}
diff --git a/frontend/packages/stage-settings-ui/src/components/AudioSection.vue b/frontend/packages/stage-settings-ui/src/components/AudioSection.vue
index 2024237..4137805 100644
--- a/frontend/packages/stage-settings-ui/src/components/AudioSection.vue
+++ b/frontend/packages/stage-settings-ui/src/components/AudioSection.vue
@@ -1,4 +1,4 @@
-