2929 AI_HUB_IMAGE_GEN_URL ,
3030 AI_HUB_STT_AUDIO_FORMATS ,
3131 AI_HUB_STT_MODELS ,
32+ AI_HUB_TTS_URL ,
3233 BEMFA_API_URL ,
3334 CONF_API_KEY ,
3435 CONF_BEMFA_UID ,
5960 SERVICE_TRANSLATE_BLUEPRINTS ,
6061 SILICONFLOW_ASR_URL ,
6162 STT_MAX_FILE_SIZE_MB ,
62- TTS_DEFAULT_PITCH ,
63- TTS_DEFAULT_RATE ,
6463 TTS_DEFAULT_VOICE ,
65- TTS_DEFAULT_VOLUME ,
6664)
6765
6866_LOGGER = logging .getLogger (__name__ )
8987TTS_SCHEMA = {
9088 vol .Required ("text" ): cv .string ,
9189 vol .Optional ("voice" , default = TTS_DEFAULT_VOICE ): vol .In (list (EDGE_TTS_VOICES .keys ())),
92- vol .Optional ("rate" , default = TTS_DEFAULT_RATE ): cv .string ,
93- vol .Optional ("volume" , default = TTS_DEFAULT_VOLUME ): cv .string ,
94- vol .Optional ("pitch" , default = TTS_DEFAULT_PITCH ): cv .string ,
9590 vol .Optional ("media_player_entity" ): cv .entity_id ,
9691}
9792
@@ -316,11 +311,6 @@ async def handle_tts_speech(call: ServiceCall) -> dict:
316311 }
317312 text = call .data ["text" ]
318313 voice = call .data .get ("voice" , TTS_DEFAULT_VOICE )
319- speed = float (call .data .get ("speed" , TTS_DEFAULT_VOLUME ))
320- volume = float (call .data .get ("volume" , TTS_DEFAULT_VOLUME ))
321- response_format = call .data .get ("response_format" , TTS_DEFAULT_RATE )
322- encode_format = call .data .get ("encode_format" , TTS_DEFAULT_VOICE )
323- stream = call .data .get ("stream" , TTS_DEFAULT_PITCH )
324314 media_player_entity = call .data .get ("media_player_entity" )
325315
326316 # 验证参数
@@ -330,18 +320,6 @@ async def handle_tts_speech(call: ServiceCall) -> dict:
330320 if voice not in EDGE_TTS_VOICES :
331321 raise ServiceValidationError (f"不支持的语音类型: { voice } " )
332322
333- if response_format not in TTS_DEFAULT_RATE :
334- raise ServiceValidationError (f"不支持的响应格式: { response_format } " )
335-
336- if encode_format not in TTS_DEFAULT_VOICE :
337- raise ServiceValidationError (f"不支持的编码格式: { encode_format } " )
338-
339- if not 0.25 <= speed <= 4.0 :
340- raise ServiceValidationError ("语速必须在 0.25 到 4.0 之间" )
341-
342- if not 0.1 <= volume <= 2.0 :
343- raise ServiceValidationError ("音量必须在 0.1 到 2.0 之间" )
344-
345323 # 构建 TTS API 请求
346324 headers = {
347325 "Authorization" : f"Bearer { api_key } " ,
@@ -352,18 +330,14 @@ async def handle_tts_speech(call: ServiceCall) -> dict:
352330 "model" : "cogtts" ,
353331 "input" : text ,
354332 "voice" : voice ,
355- "response_format" : response_format ,
356- "encode_format" : encode_format ,
357- "stream" : stream ,
358- "speed" : speed ,
359- "volume" : volume ,
333+ "response_format" : "wav" ,
360334 }
361335
362336 timeout = aiohttp .ClientTimeout (total = DEFAULT_REQUEST_TIMEOUT / 1000 )
363337
364338 async with aiohttp .ClientSession (timeout = timeout ) as session :
365339 async with session .post (
366- EDGE_TTS_VOICES ,
340+ AI_HUB_TTS_URL ,
367341 headers = headers ,
368342 json = payload
369343 ) as response :
@@ -379,46 +353,15 @@ async def handle_tts_speech(call: ServiceCall) -> dict:
379353 "error" : f"TTS API 请求失败: { response .status } "
380354 }
381355
382- if stream :
383- # 处理流式响应
384- response_text = await response .text ()
385- from .helpers import parse_streaming_response , combine_audio_chunks
386-
387- audio_chunks = parse_streaming_response (response_text )
388-
389- if not audio_chunks :
390- return {"success" : False , "error" : "未从流式响应中获取到音频数据" }
391-
392- # 合并音频块
393- combined_audio = audio_chunks [0 ] # 对于 TTS,通常第一个块就包含完整数据
356+ # 处理响应
357+ response_data = await response .json ()
394358
395- # 如果有多个块,尝试合并
396- if len (audio_chunks ) > 1 :
397- try :
398- combined_audio = combine_audio_chunks (audio_chunks )
399- except Exception as exc :
400- _LOGGER .warning ("音频合并失败,使用第一个音频块: %s" , exc )
401-
402- audio_base64 = combined_audio
403- else :
404- # 处理非流式响应
405- response_data = await response .json ()
406-
407- if "choices" not in response_data or not response_data ["choices" ]:
408- return {"success" : False , "error" : "API 响应格式错误" }
409-
410- # 从非流式响应中提取音频数据
411- choice = response_data ["choices" ][0 ]
412- if "audio" in choice :
413- audio_base64 = choice ["audio" ]["content" ]
414- elif "message" in choice and "content" in choice ["message" ]:
415- audio_base64 = choice ["message" ]["content" ]
416- else :
417- return {"success" : False , "error" : "无法从响应中提取音频数据" }
359+ if not response_data :
360+ return {"success" : False , "error" : "API 响应为空" }
418361
419362 # 解码音频为 WAV 格式
420363 from .helpers import decode_base64_audio
421- wav_audio_data = decode_base64_audio (audio_base64 )
364+ wav_audio_data = decode_base64_audio (response_data )
422365
423366 # 如果指定了媒体播放器实体,直接播放
424367 if media_player_entity :
@@ -467,8 +410,6 @@ async def handle_tts_speech(call: ServiceCall) -> dict:
467410 "audio_data" : audio_base64 ,
468411 "audio_format" : "wav" ,
469412 "voice" : voice ,
470- "speed" : speed ,
471- "volume" : volume ,
472413 }
473414
474415 except ServiceValidationError as exc :
0 commit comments