Skip to content

Commit 4fd031a

Browse files
committed
修正TTS有些音色参数过多的问题
1 parent 6dd7efb commit 4fd031a

10 files changed

Lines changed: 58 additions & 144 deletions

File tree

.claude/settings.local.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,8 @@
3232
"mcp__open-websearch__search",
3333
"Bash(powershell:*)",
3434
"Bash(ls:*)",
35-
"mcp__serena__think_about_task_adherence"
35+
"mcp__serena__think_about_task_adherence",
36+
"mcp__serena__search_for_pattern"
3637
],
3738
"deny": [],
3839
"ask": []

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
11
backup/
22
.spec-workflow
33
.claude/
4+
/*/*/__pycache__/

custom_components/ai_hub/config_flow.py

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -51,9 +51,6 @@
5151
CONF_TOP_P,
5252
CONF_TTS_VOICE,
5353
CONF_TTS_LANG,
54-
CONF_TTS_RATE,
55-
CONF_TTS_VOLUME,
56-
CONF_TTS_PITCH,
5754
DEFAULT_AI_TASK_NAME,
5855
DEFAULT_CONVERSATION_NAME,
5956
DEFAULT_TITLE,
@@ -76,9 +73,6 @@
7673
RECOMMENDED_TOP_P,
7774
TTS_DEFAULT_VOICE,
7875
TTS_DEFAULT_LANG,
79-
TTS_DEFAULT_RATE,
80-
TTS_DEFAULT_VOLUME,
81-
TTS_DEFAULT_PITCH,
8276
AI_HUB_CHAT_MODELS,
8377
AI_HUB_CHAT_URL,
8478
AI_HUB_IMAGE_MODELS,

custom_components/ai_hub/const.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -522,16 +522,10 @@ def _build_edge_tts_languages() -> dict:
522522
# Edge TTS Configuration Keys
523523
CONF_TTS_VOICE: Final = "voice"
524524
CONF_TTS_LANG: Final = "lang"
525-
CONF_TTS_RATE: Final = "rate"
526-
CONF_TTS_VOLUME: Final = "volume"
527-
CONF_TTS_PITCH: Final = "pitch"
528525

529526
# Edge TTS Default Parameters
530527
TTS_DEFAULT_VOICE: Final = "zh-CN-XiaoxiaoNeural" # 默认使用晓晓女声
531528
TTS_DEFAULT_LANG: Final = "zh-CN"
532-
TTS_DEFAULT_RATE: Final = "+0%"
533-
TTS_DEFAULT_VOLUME: Final = "+0%"
534-
TTS_DEFAULT_PITCH: Final = "+0%"
535529

536530
# Silicon Flow STT Configuration
537531
# STT Configuration Keys
@@ -668,9 +662,6 @@ def _build_edge_tts_languages() -> dict:
668662
CONF_RECOMMENDED: True,
669663
CONF_TTS_VOICE: TTS_DEFAULT_VOICE,
670664
CONF_TTS_LANG: TTS_DEFAULT_LANG,
671-
CONF_TTS_RATE: TTS_DEFAULT_RATE,
672-
CONF_TTS_VOLUME: TTS_DEFAULT_VOLUME,
673-
CONF_TTS_PITCH: TTS_DEFAULT_PITCH,
674665
}
675666

676667

custom_components/ai_hub/manifest.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,5 +9,5 @@
99
"iot_class": "cloud_polling",
1010
"issue_tracker": "https://github.com/ha-china/ai_hub/issues",
1111
"requirements": ["edge-tts", "aiofiles", "aiohttp"],
12-
"version": "v2025.12.2"
12+
"version": "v2025.12.3"
1313
}

custom_components/ai_hub/services.py

Lines changed: 8 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@
2929
AI_HUB_IMAGE_GEN_URL,
3030
AI_HUB_STT_AUDIO_FORMATS,
3131
AI_HUB_STT_MODELS,
32+
AI_HUB_TTS_URL,
3233
BEMFA_API_URL,
3334
CONF_API_KEY,
3435
CONF_BEMFA_UID,
@@ -59,10 +60,7 @@
5960
SERVICE_TRANSLATE_BLUEPRINTS,
6061
SILICONFLOW_ASR_URL,
6162
STT_MAX_FILE_SIZE_MB,
62-
TTS_DEFAULT_PITCH,
63-
TTS_DEFAULT_RATE,
6463
TTS_DEFAULT_VOICE,
65-
TTS_DEFAULT_VOLUME,
6664
)
6765

6866
_LOGGER = logging.getLogger(__name__)
@@ -89,9 +87,6 @@
8987
TTS_SCHEMA = {
9088
vol.Required("text"): cv.string,
9189
vol.Optional("voice", default=TTS_DEFAULT_VOICE): vol.In(list(EDGE_TTS_VOICES.keys())),
92-
vol.Optional("rate", default=TTS_DEFAULT_RATE): cv.string,
93-
vol.Optional("volume", default=TTS_DEFAULT_VOLUME): cv.string,
94-
vol.Optional("pitch", default=TTS_DEFAULT_PITCH): cv.string,
9590
vol.Optional("media_player_entity"): cv.entity_id,
9691
}
9792

@@ -316,11 +311,6 @@ async def handle_tts_speech(call: ServiceCall) -> dict:
316311
}
317312
text = call.data["text"]
318313
voice = call.data.get("voice", TTS_DEFAULT_VOICE)
319-
speed = float(call.data.get("speed", TTS_DEFAULT_VOLUME))
320-
volume = float(call.data.get("volume", TTS_DEFAULT_VOLUME))
321-
response_format = call.data.get("response_format", TTS_DEFAULT_RATE)
322-
encode_format = call.data.get("encode_format", TTS_DEFAULT_VOICE)
323-
stream = call.data.get("stream", TTS_DEFAULT_PITCH)
324314
media_player_entity = call.data.get("media_player_entity")
325315

326316
# 验证参数
@@ -330,18 +320,6 @@ async def handle_tts_speech(call: ServiceCall) -> dict:
330320
if voice not in EDGE_TTS_VOICES:
331321
raise ServiceValidationError(f"不支持的语音类型: {voice}")
332322

333-
if response_format not in TTS_DEFAULT_RATE:
334-
raise ServiceValidationError(f"不支持的响应格式: {response_format}")
335-
336-
if encode_format not in TTS_DEFAULT_VOICE:
337-
raise ServiceValidationError(f"不支持的编码格式: {encode_format}")
338-
339-
if not 0.25 <= speed <= 4.0:
340-
raise ServiceValidationError("语速必须在 0.25 到 4.0 之间")
341-
342-
if not 0.1 <= volume <= 2.0:
343-
raise ServiceValidationError("音量必须在 0.1 到 2.0 之间")
344-
345323
# 构建 TTS API 请求
346324
headers = {
347325
"Authorization": f"Bearer {api_key}",
@@ -352,18 +330,14 @@ async def handle_tts_speech(call: ServiceCall) -> dict:
352330
"model": "cogtts",
353331
"input": text,
354332
"voice": voice,
355-
"response_format": response_format,
356-
"encode_format": encode_format,
357-
"stream": stream,
358-
"speed": speed,
359-
"volume": volume,
333+
"response_format": "wav",
360334
}
361335

362336
timeout = aiohttp.ClientTimeout(total=DEFAULT_REQUEST_TIMEOUT / 1000)
363337

364338
async with aiohttp.ClientSession(timeout=timeout) as session:
365339
async with session.post(
366-
EDGE_TTS_VOICES,
340+
AI_HUB_TTS_URL,
367341
headers=headers,
368342
json=payload
369343
) as response:
@@ -379,46 +353,15 @@ async def handle_tts_speech(call: ServiceCall) -> dict:
379353
"error": f"TTS API 请求失败: {response.status}"
380354
}
381355

382-
if stream:
383-
# 处理流式响应
384-
response_text = await response.text()
385-
from .helpers import parse_streaming_response, combine_audio_chunks
386-
387-
audio_chunks = parse_streaming_response(response_text)
388-
389-
if not audio_chunks:
390-
return {"success": False, "error": "未从流式响应中获取到音频数据"}
391-
392-
# 合并音频块
393-
combined_audio = audio_chunks[0] # 对于 TTS,通常第一个块就包含完整数据
356+
# 处理响应
357+
response_data = await response.json()
394358

395-
# 如果有多个块,尝试合并
396-
if len(audio_chunks) > 1:
397-
try:
398-
combined_audio = combine_audio_chunks(audio_chunks)
399-
except Exception as exc:
400-
_LOGGER.warning("音频合并失败,使用第一个音频块: %s", exc)
401-
402-
audio_base64 = combined_audio
403-
else:
404-
# 处理非流式响应
405-
response_data = await response.json()
406-
407-
if "choices" not in response_data or not response_data["choices"]:
408-
return {"success": False, "error": "API 响应格式错误"}
409-
410-
# 从非流式响应中提取音频数据
411-
choice = response_data["choices"][0]
412-
if "audio" in choice:
413-
audio_base64 = choice["audio"]["content"]
414-
elif "message" in choice and "content" in choice["message"]:
415-
audio_base64 = choice["message"]["content"]
416-
else:
417-
return {"success": False, "error": "无法从响应中提取音频数据"}
359+
if not response_data:
360+
return {"success": False, "error": "API 响应为空"}
418361

419362
# 解码音频为 WAV 格式
420363
from .helpers import decode_base64_audio
421-
wav_audio_data = decode_base64_audio(audio_base64)
364+
wav_audio_data = decode_base64_audio(response_data)
422365

423366
# 如果指定了媒体播放器实体,直接播放
424367
if media_player_entity:
@@ -467,8 +410,6 @@ async def handle_tts_speech(call: ServiceCall) -> dict:
467410
"audio_data": audio_base64,
468411
"audio_format": "wav",
469412
"voice": voice,
470-
"speed": speed,
471-
"volume": volume,
472413
}
473414

474415
except ServiceValidationError as exc:

custom_components/ai_hub/services.yaml

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -256,3 +256,43 @@ translate_blueprints:
256256
default: false
257257
selector:
258258
boolean:
259+
260+
tts_speech:
261+
name: 文字转语音
262+
description: 使用Edge TTS生成语音
263+
fields:
264+
text:
265+
name: 文本内容
266+
description: 要转换为语音的文本内容
267+
required: true
268+
example: "你好,欢迎使用文字转语音服务"
269+
selector:
270+
text:
271+
multiline: true
272+
voice:
273+
name: 语音
274+
description: 选择语音类型
275+
required: false
276+
default: "zh-CN-XiaoxiaoNeural"
277+
selector:
278+
select:
279+
options:
280+
- "zh-CN-XiaoxiaoNeural"
281+
- "zh-CN-XiaoyiNeural"
282+
- "zh-CN-YunjianNeural"
283+
- "zh-CN-YunxiNeural"
284+
- "zh-CN-YunxiaNeural"
285+
- "zh-CN-YunyangNeural"
286+
- "zh-HK-HiuGaaiNeural"
287+
- "zh-HK-HiuMaanNeural"
288+
- "zh-HK-WanLungNeural"
289+
- "zh-TW-HsiaoChenNeural"
290+
- "zh-TW-YunJheNeural"
291+
- "zh-TW-HsiaoYuNeural"
292+
media_player_entity:
293+
name: 媒体播放器
294+
description: 指定用于播放生成的语音的媒体播放器实体ID(可选)
295+
required: false
296+
selector:
297+
entity:
298+
domain: media_player

custom_components/ai_hub/strings.json

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -107,18 +107,12 @@
107107
"name": "Name",
108108
"recommended": "Recommended Mode",
109109
"voice": "Voice",
110-
"lang": "Language",
111-
"rate": "Rate",
112-
"volume": "Volume",
113-
"pitch": "Pitch"
110+
"lang": "Language"
114111
},
115112
"data_description": {
116113
"recommended": "Use recommended settings",
117114
"voice": "Select Edge TTS voice, like Xiaoxiao, Yunxi, etc.",
118-
"lang": "Select voice language, like zh-CN for Chinese",
119-
"rate": "Speech rate, like +10% faster, -10% slower",
120-
"volume": "Speech volume, like +10% louder, -10% quieter",
121-
"pitch": "Speech pitch, like +10Hz higher, -10Hz lower"
115+
"lang": "Select voice language, like zh-CN for Chinese"
122116
}
123117
}
124118
}
@@ -279,18 +273,6 @@
279273
"name": "Voice",
280274
"description": "Select Edge TTS voice, like zh-CN-XiaoxiaoNeural"
281275
},
282-
"rate": {
283-
"name": "Rate",
284-
"description": "Speech rate adjustment, like +10% faster, -10% slower"
285-
},
286-
"volume": {
287-
"name": "Volume",
288-
"description": "Speech volume adjustment, like +10% louder, -10% quieter"
289-
},
290-
"pitch": {
291-
"name": "Pitch",
292-
"description": "Speech pitch adjustment, like +10Hz higher, -10Hz lower"
293-
},
294276
"media_player_entity": {
295277
"name": "Media Player",
296278
"description": "Media player entity to play speech"

custom_components/ai_hub/translations/en.json

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -107,18 +107,12 @@
107107
"name": "Name",
108108
"recommended": "Recommended Mode",
109109
"voice": "Voice",
110-
"lang": "Language",
111-
"rate": "Rate",
112-
"volume": "Volume",
113-
"pitch": "Pitch"
110+
"lang": "Language"
114111
},
115112
"data_description": {
116113
"recommended": "Use recommended settings",
117114
"voice": "Select Edge TTS voice, like Xiaoxiao, Yunxi, etc.",
118-
"lang": "Select voice language, like zh-CN for Chinese",
119-
"rate": "Speech rate, like +10% faster, -10% slower",
120-
"volume": "Speech volume, like +10% louder, -10% quieter",
121-
"pitch": "Speech pitch, like +10Hz higher, -10Hz lower"
115+
"lang": "Select voice language, like zh-CN for Chinese"
122116
}
123117
}
124118
}
@@ -281,18 +275,6 @@
281275
"name": "Voice",
282276
"description": "Select Edge TTS voice, like zh-CN-XiaoxiaoNeural"
283277
},
284-
"rate": {
285-
"name": "Rate",
286-
"description": "Speech rate adjustment, like +10% faster, -10% slower"
287-
},
288-
"volume": {
289-
"name": "Volume",
290-
"description": "Speech volume adjustment, like +10% louder, -10% quieter"
291-
},
292-
"pitch": {
293-
"name": "Pitch",
294-
"description": "Speech pitch adjustment, like +10Hz higher, -10Hz lower"
295-
},
296278
"media_player_entity": {
297279
"name": "Media Player",
298280
"description": "Media player entity to play speech"

custom_components/ai_hub/translations/zh-Hans.json

Lines changed: 2 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -107,18 +107,12 @@
107107
"name": "名称",
108108
"recommended": "推荐模式",
109109
"voice": "语音",
110-
"lang": "语言",
111-
"rate": "语速",
112-
"volume": "音量",
113-
"pitch": "音调"
110+
"lang": "语言"
114111
},
115112
"data_description": {
116113
"recommended": "使用推荐设置",
117114
"voice": "选择Edge TTS语音,如晓晓、云熙等",
118-
"lang": "选择语音语言,如zh-CN代表中文",
119-
"rate": "语音速度,如+10%更快,-10%更慢",
120-
"volume": "语音音量,如+10%更响,-10%更轻",
121-
"pitch": "语音音调,如+10Hz更高,-10Hz更低"
115+
"lang": "选择语音语言,如zh-CN代表中文"
122116
}
123117
}
124118
}
@@ -274,18 +268,6 @@
274268
"name": "语音",
275269
"description": "选择Edge TTS语音,如zh-CN-XiaoxiaoNeural"
276270
},
277-
"rate": {
278-
"name": "语速",
279-
"description": "语音速度调整,如+10%更快,-10%更慢"
280-
},
281-
"volume": {
282-
"name": "音量",
283-
"description": "语音音量调整,如+10%更响,-10%更轻"
284-
},
285-
"pitch": {
286-
"name": "音调",
287-
"description": "语音音调调整,如+10Hz更高,-10Hz更低"
288-
},
289271
"media_player_entity": {
290272
"name": "媒体播放器",
291273
"description": "用于播放语音的媒体播放器实体"

0 commit comments

Comments
 (0)