Skip to content

Commit 8612a4a

Browse files
committed
fix(tts): harden provider URL handling and local voice loading
1 parent 7af80a4 commit 8612a4a

File tree

6 files changed

+192
-60
lines changed

6 files changed

+192
-60
lines changed

backend/app/api/tts.py

Lines changed: 1 addition & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -454,23 +454,15 @@ def _resolve_tts_media_type(format_name: str) -> str:
454454

455455

456456
def _resolve_volcengine_tts_url(runtime_config, overrides: Dict[str, Any]) -> str:
457-
explicit = _read_string(overrides, "volcengine_url", "volcengineUrl", "provider_url", "providerUrl")
458-
if explicit:
459-
return explicit
460457
base_url = str(runtime_config.base_url or "").strip().lower()
461458
if "openspeech.bytedance.com" in base_url:
462459
return runtime_config.base_url.rstrip("/")
463460
return VOLCENGINE_TTS_URL
464461

465462

466463
def _resolve_alibaba_tts_ws_url(runtime_config, overrides: Dict[str, Any]) -> str:
467-
explicit = _read_string(overrides, "dashscope_ws_url", "dashscopeWsUrl", "ws_url", "wsUrl")
468-
if explicit:
469-
return explicit
470-
471-
explicit_base = _read_string(overrides, "base_url", "baseUrl", "dashscope_base_url", "dashscopeBaseUrl")
472464
runtime_base = str(runtime_config.base_url or "").strip()
473-
normalized_base = (explicit_base or runtime_base).lower()
465+
normalized_base = runtime_base.lower()
474466

475467
region = _read_string(overrides, "region").lower()
476468
if region in {"intl", "sg", "singapore", "intl-singapore", "ap-southeast-1"}:

backend/app/services/providers/registry.py

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import os
22
import json
3+
import logging
34
from functools import lru_cache
45
from pathlib import Path
56
from typing import Any, Dict, List, Optional, Set
@@ -56,6 +57,8 @@
5657
"volcengine-speech": LOCAL_TTS_VOICES_DIR / "volcengine.json",
5758
"alibaba-cloud-model-studio-speech": LOCAL_TTS_VOICES_DIR / "alibaba.json",
5859
}
60+
_LAST_GOOD_LOCAL_TTS_VOICES: Dict[str, List[dict]] = {}
61+
logger = logging.getLogger(__name__)
5962

6063

6164
class ProviderRegistry:
@@ -190,19 +193,47 @@ async def _load_local_tts_voices(provider_id: str) -> List[dict]:
190193
path = LOCAL_TTS_VOICE_FILES.get(provider_id)
191194
if not path:
192195
return []
193-
return _load_local_tts_voices_cached(provider_id, str(path))
194196

195-
196-
@lru_cache(maxsize=8)
197-
def _load_local_tts_voices_cached(provider_id: str, path: str) -> List[dict]:
198197
source = Path(path)
199-
if not source.exists():
200-
return []
201-
202198
try:
203-
raw = json.loads(source.read_text(encoding="utf-8"))
204-
except Exception:
199+
mtime_ns = source.stat().st_mtime_ns
200+
voices = _load_local_tts_voices_cached(provider_id, str(source), mtime_ns)
201+
except FileNotFoundError:
202+
logger.warning("Local TTS voices file not found for provider=%s path=%s", provider_id, source)
203+
return _load_last_good_local_tts_voices(provider_id)
204+
except (OSError, UnicodeDecodeError, json.JSONDecodeError) as exc:
205+
logger.exception(
206+
"Failed to load local TTS voices for provider=%s path=%s",
207+
provider_id,
208+
source,
209+
exc_info=exc,
210+
)
211+
return _load_last_good_local_tts_voices(provider_id)
212+
except Exception as exc:
213+
logger.exception(
214+
"Unexpected error while loading local TTS voices for provider=%s path=%s",
215+
provider_id,
216+
source,
217+
exc_info=exc,
218+
)
219+
return _load_last_good_local_tts_voices(provider_id)
220+
221+
_LAST_GOOD_LOCAL_TTS_VOICES[provider_id] = list(voices)
222+
return list(voices)
223+
224+
225+
def _load_last_good_local_tts_voices(provider_id: str) -> List[dict]:
226+
voices = _LAST_GOOD_LOCAL_TTS_VOICES.get(provider_id)
227+
if not voices:
205228
return []
229+
return list(voices)
230+
231+
232+
@lru_cache(maxsize=16)
233+
def _load_local_tts_voices_cached(provider_id: str, path: str, mtime_ns: int) -> List[dict]:
234+
source = Path(path)
235+
_ = mtime_ns
236+
raw = json.loads(source.read_text(encoding="utf-8"))
206237

207238
if provider_id == "alibaba-cloud-model-studio-speech":
208239
return _parse_alibaba_voices(raw)

backend/tests/test_provider_voices_tts.py

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import asyncio
22
import sys
3+
import uuid
34
from pathlib import Path
45

56
ROOT = Path(__file__).resolve().parents[1]
@@ -104,7 +105,112 @@ def test_list_voices_unsupported_provider_returns_empty():
104105
assert result == []
105106

106107

108+
def _volcengine_voices_payload() -> str:
109+
return """
110+
{
111+
"status": "success",
112+
"error": null,
113+
"data": {
114+
"resource_packs": [
115+
{
116+
"code": "zh_female_test",
117+
"resource_display": "Test Voice",
118+
"details": {
119+
"language": "Chinese",
120+
"voice_type": "zh_female_test",
121+
"tone_number": "zh_female_test",
122+
"recommended_scenario": "General"
123+
}
124+
}
125+
]
126+
}
127+
}
128+
""".strip()
129+
130+
131+
def _make_local_temp_file() -> Path:
132+
root = ROOT / "tests_tmp"
133+
root.mkdir(parents=True, exist_ok=True)
134+
path = root / f"tmp-{uuid.uuid4().hex}.json"
135+
return path
136+
137+
138+
def test_load_local_tts_voices_recovers_after_transient_parse_error():
139+
temp_path = _make_local_temp_file()
140+
try:
141+
temp_path.write_text("{ invalid json", encoding="utf-8")
142+
143+
original_map = provider_registry_module.LOCAL_TTS_VOICE_FILES
144+
provider_registry_module.LOCAL_TTS_VOICE_FILES = {
145+
**original_map,
146+
"volcengine-speech": temp_path,
147+
}
148+
provider_registry_module._load_local_tts_voices_cached.cache_clear()
149+
provider_registry_module._LAST_GOOD_LOCAL_TTS_VOICES.pop("volcengine-speech", None)
150+
try:
151+
broken = asyncio.run(
152+
provider_registry_module._load_local_tts_voices("volcengine-speech")
153+
)
154+
assert broken == []
155+
156+
temp_path.write_text(_volcengine_voices_payload(), encoding="utf-8")
157+
recovered = asyncio.run(
158+
provider_registry_module._load_local_tts_voices("volcengine-speech")
159+
)
160+
assert len(recovered) == 1
161+
assert recovered[0]["id"] == "zh_female_test"
162+
finally:
163+
provider_registry_module.LOCAL_TTS_VOICE_FILES = original_map
164+
provider_registry_module._load_local_tts_voices_cached.cache_clear()
165+
provider_registry_module._LAST_GOOD_LOCAL_TTS_VOICES.pop("volcengine-speech", None)
166+
finally:
167+
if temp_path.exists():
168+
temp_path.unlink()
169+
170+
171+
def test_load_local_tts_voices_uses_last_good_on_parse_error():
172+
temp_path = _make_local_temp_file()
173+
try:
174+
temp_path.write_text(_volcengine_voices_payload(), encoding="utf-8")
175+
176+
original_map = provider_registry_module.LOCAL_TTS_VOICE_FILES
177+
provider_registry_module.LOCAL_TTS_VOICE_FILES = {
178+
**original_map,
179+
"volcengine-speech": temp_path,
180+
}
181+
provider_registry_module._load_local_tts_voices_cached.cache_clear()
182+
provider_registry_module._LAST_GOOD_LOCAL_TTS_VOICES.pop("volcengine-speech", None)
183+
try:
184+
first = asyncio.run(
185+
provider_registry_module._load_local_tts_voices("volcengine-speech")
186+
)
187+
assert len(first) == 1
188+
189+
temp_path.write_text("{ invalid json", encoding="utf-8")
190+
provider_registry_module._load_local_tts_voices_cached.cache_clear()
191+
fallback = asyncio.run(
192+
provider_registry_module._load_local_tts_voices("volcengine-speech")
193+
)
194+
assert len(fallback) == 1
195+
assert fallback[0]["id"] == "zh_female_test"
196+
finally:
197+
provider_registry_module.LOCAL_TTS_VOICE_FILES = original_map
198+
provider_registry_module._load_local_tts_voices_cached.cache_clear()
199+
provider_registry_module._LAST_GOOD_LOCAL_TTS_VOICES.pop("volcengine-speech", None)
200+
finally:
201+
if temp_path.exists():
202+
temp_path.unlink()
203+
204+
107205
if __name__ == "__main__":
108206
run("list voices volcengine from local catalog", test_list_voices_volcengine_from_local_catalog)
109207
run("list voices alibaba filters by model", test_list_voices_alibaba_filters_by_model)
110208
run("list voices unsupported provider returns empty", test_list_voices_unsupported_provider_returns_empty)
209+
run(
210+
"load local tts voices recovers after transient parse error",
211+
test_load_local_tts_voices_recovers_after_transient_parse_error,
212+
)
213+
run(
214+
"load local tts voices uses last good on parse error",
215+
test_load_local_tts_voices_uses_last_good_on_parse_error,
216+
)

backend/tests/test_tts_engine_relay.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
_extract_json_error_message,
1313
_extract_tts_input,
1414
_normalize_alibaba_provider_model,
15+
_resolve_volcengine_tts_url,
1516
_resolve_alibaba_tts_ws_url,
1617
_resolve_tts_api_key,
1718
)
@@ -139,6 +140,22 @@ def test_build_volcengine_provider_payload_direct():
139140
assert payload["request"]["text"] == "hello volcengine direct"
140141

141142

143+
def test_resolve_volcengine_tts_url_ignores_client_url_override():
144+
runtime = EngineRuntimeConfig(
145+
id="volcengine-speech",
146+
base_url="https://openspeech.bytedance.com/api/v1/tts",
147+
model="v1",
148+
)
149+
url = _resolve_volcengine_tts_url(
150+
runtime,
151+
{
152+
"volcengine_url": "https://attacker.example/tts",
153+
"provider_url": "https://attacker-2.example/tts",
154+
},
155+
)
156+
assert url == "https://openspeech.bytedance.com/api/v1/tts"
157+
158+
142159
def test_resolve_alibaba_tts_ws_url_prefers_intl_region():
143160
runtime = EngineRuntimeConfig(
144161
id="alibaba-cloud-model-studio-speech",
@@ -149,6 +166,23 @@ def test_resolve_alibaba_tts_ws_url_prefers_intl_region():
149166
assert ws_url == "wss://dashscope-intl.aliyuncs.com/api-ws/v1/inference"
150167

151168

169+
def test_resolve_alibaba_tts_ws_url_ignores_client_url_override():
170+
runtime = EngineRuntimeConfig(
171+
id="alibaba-cloud-model-studio-speech",
172+
base_url="https://dashscope.aliyuncs.com",
173+
model="cosyvoice-v1",
174+
)
175+
ws_url = _resolve_alibaba_tts_ws_url(
176+
runtime,
177+
{
178+
"ws_url": "wss://attacker.example/ws",
179+
"dashscope_ws_url": "wss://attacker-2.example/ws",
180+
"baseUrl": "https://dashscope-intl.aliyuncs.com",
181+
},
182+
)
183+
assert ws_url == "wss://dashscope.aliyuncs.com/api-ws/v1/inference"
184+
185+
152186
def test_normalize_alibaba_provider_model_strips_provider_prefix():
153187
assert _normalize_alibaba_provider_model("alibaba/cosyvoice-v1") == "cosyvoice-v1"
154188
assert _normalize_alibaba_provider_model("cosyvoice-v1") == "cosyvoice-v1"
@@ -182,7 +216,15 @@ def test_decorate_tts_error_for_volcengine_grant_issue():
182216
run("build unspeech payload for volcengine", test_build_unspeech_payload_for_volcengine)
183217
run("build unspeech payload for alibaba", test_build_unspeech_payload_for_alibaba)
184218
run("build volcengine provider payload direct", test_build_volcengine_provider_payload_direct)
219+
run(
220+
"resolve volcengine tts url ignores client url override",
221+
test_resolve_volcengine_tts_url_ignores_client_url_override,
222+
)
185223
run("resolve alibaba tts ws url prefers intl region", test_resolve_alibaba_tts_ws_url_prefers_intl_region)
224+
run(
225+
"resolve alibaba tts ws url ignores client url override",
226+
test_resolve_alibaba_tts_ws_url_ignores_client_url_override,
227+
)
186228
run("normalize alibaba provider model strips provider prefix", test_normalize_alibaba_provider_model_strips_provider_prefix)
187229
run("extract json error message from errors array", test_extract_json_error_message_from_errors_array)
188230
run("decorate tts error for volcengine grant issue", test_decorate_tts_error_for_volcengine_grant_issue)

frontend/packages/app-core/src/services/audio-direct.test.ts

Lines changed: 3 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -80,7 +80,7 @@ run("builds backend relay request for alibaba speech engine", () => {
8080
});
8181
});
8282

83-
run("normalizes legacy unspeech base url to provider official endpoint", () => {
83+
run("does not forward base url for fixed direct providers", () => {
8484
const volcRequest = buildDirectTtsHttpRequest({
8585
text: "hello",
8686
engineId: "volcengine-speech",
@@ -94,10 +94,7 @@ run("normalizes legacy unspeech base url to provider official endpoint", () => {
9494
},
9595
});
9696
assert.ok(volcRequest);
97-
assert.equal(
98-
(volcRequest?.body.config as { baseUrl?: string }).baseUrl,
99-
"https://openspeech.bytedance.com/api/v1/tts"
100-
);
97+
assert.equal((volcRequest?.body.config as { baseUrl?: string }).baseUrl, undefined);
10198

10299
const alibabaRequest = buildDirectTtsHttpRequest({
103100
text: "hello",
@@ -111,10 +108,7 @@ run("normalizes legacy unspeech base url to provider official endpoint", () => {
111108
},
112109
});
113110
assert.ok(alibabaRequest);
114-
assert.equal(
115-
(alibabaRequest?.body.config as { baseUrl?: string }).baseUrl,
116-
"https://dashscope.aliyuncs.com"
117-
);
111+
assert.equal((alibabaRequest?.body.config as { baseUrl?: string }).baseUrl, undefined);
118112
});
119113

120114
run("builds legacy synthesize fallback request from backend relay request", () => {
@@ -142,8 +136,6 @@ run("builds legacy synthesize fallback request from backend relay request", () =
142136
config: {
143137
apiKey: "token-123",
144138
api_key: "token-123",
145-
baseUrl: "https://unspeech.example/v1",
146-
base_url: "https://unspeech.example/v1",
147139
model: "volcengine/v1",
148140
voice: "zh_female_test",
149141
appId: "appid-xyz",

frontend/packages/app-core/src/utils/tts-direct-request.ts

Lines changed: 0 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ const allowedBackendTtsEngineIds = new Set([
3030
"volcengine-speech",
3131
"alibaba-cloud-model-studio-speech",
3232
]);
33-
const legacyUnspeechHost = "unspeech.hyp3r.link";
3433

3534
function asRecord(value: unknown): Record<string, unknown> {
3635
return typeof value === "object" && value !== null && !Array.isArray(value)
@@ -74,28 +73,6 @@ function normalizeAlibabaModelId(model: string, engineId: string) {
7473
return model.replace(/^alibaba\//i, "").trim();
7574
}
7675

77-
function normalizeLegacyUnspeechBaseUrl(engineId: string, baseUrl: string) {
78-
const normalized = baseUrl.trim();
79-
if (!normalized) return normalized;
80-
81-
try {
82-
const parsed = new URL(normalized);
83-
if (parsed.hostname.toLowerCase() !== legacyUnspeechHost) {
84-
return normalized;
85-
}
86-
} catch {
87-
return normalized;
88-
}
89-
90-
if (engineId === "volcengine-speech") {
91-
return "https://openspeech.bytedance.com/api/v1/tts";
92-
}
93-
if (engineId === "alibaba-cloud-model-studio-speech") {
94-
return "https://dashscope.aliyuncs.com";
95-
}
96-
return normalized;
97-
}
98-
9976
function resolveVolcengineAppId(config: Record<string, unknown>) {
10077
const topLevel = readString(config, ["appId", "appid", "app_id"]);
10178
if (topLevel) return topLevel;
@@ -132,10 +109,6 @@ export function buildDirectTtsHttpRequest(input: {
132109
const config = asRecord(input.config);
133110
const apiBaseUrl = (input.apiBaseUrl || "").trim();
134111
const apiKey = readString(config, ["apiKey", "api_key"]);
135-
const baseUrl = normalizeLegacyUnspeechBaseUrl(
136-
engineId,
137-
readString(config, ["baseUrl", "base_url"])
138-
);
139112
const model = normalizeAlibabaModelId(readString(config, ["model"]), engineId);
140113
const voice = readString(config, ["voice"]);
141114
const text = (input.text || "").trim();
@@ -150,10 +123,6 @@ export function buildDirectTtsHttpRequest(input: {
150123
voice,
151124
};
152125

153-
if (baseUrl) {
154-
backendConfig.baseUrl = baseUrl;
155-
}
156-
157126
const responseFormat = readString(config, ["response_format", "responseFormat", "format"]);
158127
if (responseFormat) {
159128
backendConfig.response_format = responseFormat;

0 commit comments

Comments
 (0)