Skip to content

Commit 766d269

Browse files
committed
feat: support MiniMax audio provider
1 parent 9af61c4 commit 766d269

3 files changed

Lines changed: 221 additions & 3 deletions

File tree

app/agent/llm/capability.py

Lines changed: 157 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@
99
from abc import ABC
1010
from io import BytesIO
1111
from pathlib import Path
12-
from typing import Dict, Optional
12+
from typing import Any, Dict, Optional
1313
from uuid import uuid4
1414

1515
from app.core.config import settings
1616
from app.log import logger
17+
from app.utils.http import RequestUtils
1718

1819

1920
class AgentCapabilityProvider(ABC):
@@ -411,6 +412,160 @@ def _normalize_tts_model(self) -> str:
411412
return model
412413

413414

415+
class MiniMaxAudioProvider(OpenAIChatAudioProvider):
    """MiniMax audio provider.

    Speech synthesis is sent to MiniMax's synchronous T2A HTTP endpoint
    (``<base>/t2a_v2``). Everything that is not overridden here is inherited
    from ``OpenAIChatAudioProvider``.
    """

    name = "minimax"
    DISPLAY_NAME = "MiniMax"
    DEFAULT_BASE_URL = "https://api.minimaxi.com/v1"
    DEFAULT_STT_MODEL = "MiniMax-M2.7"
    DEFAULT_TTS_MODEL = "speech-2.8-turbo"
    DEFAULT_VOICE = "Chinese (Mandarin)_Lyrical_Voice"
    # NOTE(review): consumed by the parent class, which is not visible here;
    # presumably switches audio input to a data-URL encoding -- confirm.
    AUDIO_INPUT_DATA_URL = True
    # TTS model names that are passed through unchanged; any other configured
    # value falls back to DEFAULT_TTS_MODEL.
    SUPPORTED_TTS_MODELS = frozenset(
        {
            "speech-2.8-hd",
            "speech-2.8-turbo",
            "speech-2.6-hd",
            "speech-2.6-turbo",
            "speech-02-hd",
            "speech-02-turbo",
            "speech-01-hd",
            "speech-01-turbo",
        }
    )

    def _build_client(self, api_key: str, base_url: Optional[str]):
        """Build the OpenAI-compatible client for MiniMax.

        The base URL is normalized first, so a mistakenly configured
        ``/anthropic/v1`` endpoint is rewritten to the ``/v1`` root.

        :param api_key: API key passed to the OpenAI client.
        :param base_url: configured base URL, or None for the default.
        :return: an ``openai.OpenAI`` client with ``max_retries=3``.
        """
        from openai import OpenAI

        return OpenAI(
            api_key=api_key,
            base_url=self._normalize_api_base_url(base_url),
            max_retries=3,
        )

    @classmethod
    def _normalize_api_base_url(cls, base_url: Optional[str]) -> str:
        """Normalize the MiniMax base URL so OpenAI/T2A paths can be appended.

        Rules, applied in order:

        * empty/None falls back to ``DEFAULT_BASE_URL``;
        * surrounding whitespace and trailing slashes are removed;
        * a trailing ``/t2a_v2`` is dropped;
        * a trailing ``/anthropic/v1`` or ``/openai/v1`` becomes ``/v1``;
        * otherwise ``/v1`` is appended when it is missing.

        :param base_url: configured base URL, may be None or empty.
        :return: the normalized URL, always ending in ``/v1``.
        """
        normalized = (base_url or cls.DEFAULT_BASE_URL).strip().rstrip("/")
        if normalized.endswith("/t2a_v2"):
            normalized = normalized[: -len("/t2a_v2")]
        for suffix in ("/anthropic/v1", "/openai/v1"):
            if normalized.endswith(suffix):
                return normalized[: -len(suffix)] + "/v1"
        if not normalized.endswith("/v1"):
            normalized = f"{normalized}/v1"
        return normalized

    @classmethod
    def _build_t2a_url(cls, base_url: Optional[str]) -> str:
        """Return the URL of the synchronous T2A endpoint for ``base_url``."""
        return f"{cls._normalize_api_base_url(base_url)}/t2a_v2"

    def _normalize_stt_model(self) -> str:
        """Return the speech-to-text model to use.

        An empty setting, or one starting with ``gpt-`` / ``mimo-``
        (case-insensitive), falls back to ``DEFAULT_STT_MODEL``.
        """
        model = (settings.AUDIO_INPUT_MODEL or "").strip()
        if not model or model.lower().startswith(("gpt-", "mimo-")):
            return self.DEFAULT_STT_MODEL
        return model

    def _normalize_tts_model(self) -> str:
        """Return the text-to-speech model to use.

        The configured name is lower-cased; anything outside
        ``SUPPORTED_TTS_MODELS`` falls back to ``DEFAULT_TTS_MODEL``.
        """
        model = (settings.AUDIO_OUTPUT_MODEL or "").strip().lower()
        if model in self.SUPPORTED_TTS_MODELS:
            return model
        return self.DEFAULT_TTS_MODEL

    def _normalize_voice_id(self) -> str:
        """Return the voice id to use.

        An empty setting, ``alloy`` or ``mimo_default`` falls back to
        ``DEFAULT_VOICE``.
        """
        voice_id = (settings.AUDIO_OUTPUT_VOICE or "").strip()
        if not voice_id or voice_id in {"alloy", "mimo_default"}:
            return self.DEFAULT_VOICE
        return voice_id

    @staticmethod
    def _decode_audio_payload(audio_data: str) -> bytes:
        """Decode the audio string of a T2A response into raw bytes.

        Hex is tried first, after removing all whitespace. When the string is
        not valid hex it is decoded as base64 instead.

        :param audio_data: hex or base64 encoded audio.
        :return: the decoded bytes.
        :raises ValueError: when the base64 fallback also fails
            (``binascii.Error`` is a ``ValueError`` subclass).
        """
        normalized = "".join((audio_data or "").split())
        try:
            return bytes.fromhex(normalized)
        except ValueError:
            return base64.b64decode(audio_data)

    @staticmethod
    def _extract_minimax_error(data: Dict[str, Any]) -> Optional[str]:
        """Extract the error carried in ``base_resp`` of a MiniMax response.

        :param data: parsed JSON body.
        :return: ``"<status_code>: <status_msg>"`` for a failure, or None when
            ``status_code`` is missing, ``0`` or ``"0"``.
        """
        base_resp = data.get("base_resp") or {}
        status_code = base_resp.get("status_code")
        if status_code in (None, 0, "0"):
            return None
        status_msg = base_resp.get("status_msg") or "unknown error"
        return f"{status_code}: {status_msg}"

    def _build_t2a_payload(self, text: str) -> Dict[str, Any]:
        """Build the JSON body of a non-streaming T2A request for ``text``."""
        return {
            "model": self._normalize_tts_model(),
            "text": text,
            "stream": False,
            "language_boost": "auto",
            "output_format": "hex",
            "voice_setting": {
                "voice_id": self._normalize_voice_id(),
                "speed": 1,
                "vol": 1,
                "pitch": 0,
            },
            "audio_setting": {
                "sample_rate": 32000,
                "bitrate": 128000,
                "format": "opus",
                "channel": 1,
            },
        }

    def synthesize_speech(self, text: str) -> Optional[Path]:
        """Synthesize ``text`` through the T2A HTTP endpoint.

        :param text: text to speak; empty or whitespace-only text yields None
            without issuing a request.
        :return: path of the ``.opus`` file written under
            ``settings.TEMP_PATH / "voice"``, or None on any failure (the
            failure is logged, never raised).
        """
        if not text or not text.strip():
            return None

        try:
            api_key, base_url = self._output_credentials()
            if not api_key:
                raise ValueError("音频输出 provider 未配置 API Key")
            response = RequestUtils(
                headers={
                    "Authorization": f"Bearer {api_key}",
                    "Content-Type": "application/json",
                    "Accept": "application/json",
                },
                proxies=settings.PROXY or {},
                timeout=60,
            ).post_res(
                url=self._build_t2a_url(base_url),
                json=self._build_t2a_payload(text),
            )
            # Compare against None explicitly: a requests.Response is falsy
            # for 4xx/5xx statuses, so a truthiness check would report HTTP
            # errors as "no response" and hide the status code.
            if response is None:
                raise ValueError("MiniMax T2A 请求无响应")
            if response.status_code >= 400:
                raise ValueError(f"MiniMax T2A HTTP {response.status_code}")

            result = response.json()
            minimax_error = self._extract_minimax_error(result)
            if minimax_error:
                raise ValueError(f"MiniMax T2A 返回错误: {minimax_error}")

            audio_data = ((result.get("data") or {}).get("audio") or "").strip()
            if not audio_data:
                raise ValueError("MiniMax T2A 响应中没有音频数据")

            voice_dir = settings.TEMP_PATH / "voice"
            voice_dir.mkdir(parents=True, exist_ok=True)
            output_path = voice_dir / f"{uuid4().hex}.opus"
            output_path.write_bytes(self._decode_audio_payload(audio_data))
            return output_path
        except Exception as err:
            # Best-effort boundary: callers get None instead of an exception.
            logger.error(f"音频输出合成失败: provider={self.name}, error={err}")
            return None
567+
568+
414569
class AgentCapabilityManager:
415570
"""Agent 能力统一入口。"""
416571

@@ -420,6 +575,7 @@ class AgentCapabilityManager:
420575
OpenAIAudioProvider.name: OpenAIAudioProvider(),
421576
OpenAIChatAudioProvider.name: OpenAIChatAudioProvider(),
422577
MiMoAudioProvider.name: MiMoAudioProvider(),
578+
MiniMaxAudioProvider.name: MiniMaxAudioProvider(),
423579
}
424580

425581
@classmethod

app/core/config.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,7 @@ class ConfigModel(BaseModel):
600600
# AI智能体自动重试整理失败记录开关
601601
AI_AGENT_RETRY_TRANSFER: bool = False
602602

603-
# 音频输入提供商:openai/openai_chat_audio/mimo
603+
# 音频输入提供商:openai/openai_chat_audio/mimo/minimax
604604
AUDIO_INPUT_PROVIDER: str = "openai"
605605
# 音频输入 API 密钥
606606
AUDIO_INPUT_API_KEY: Optional[str] = None
@@ -610,7 +610,7 @@ class ConfigModel(BaseModel):
610610
AUDIO_INPUT_MODEL: str = "gpt-4o-mini-transcribe"
611611
# 音频输入识别语言
612612
AUDIO_INPUT_LANGUAGE: str = "zh"
613-
# 音频输出提供商:openai/openai_chat_audio/mimo
613+
# 音频输出提供商:openai/openai_chat_audio/mimo/minimax
614614
AUDIO_OUTPUT_PROVIDER: str = "openai"
615615
# 音频输出 API 密钥
616616
AUDIO_OUTPUT_API_KEY: Optional[str] = None

tests/test_agent_llm_capability.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121

2222
AgentCapabilityManager = capability_module.AgentCapabilityManager
2323
MiMoAudioProvider = capability_module.MiMoAudioProvider
24+
MiniMaxAudioProvider = capability_module.MiniMaxAudioProvider
2425
OpenAIChatAudioProvider = capability_module.OpenAIChatAudioProvider
2526
OpenAIAudioProvider = capability_module.OpenAIAudioProvider
2627

@@ -32,6 +33,7 @@ def test_registered_audio_providers_contains_builtin_providers(self):
3233
"openai_chat_audio", AgentCapabilityManager.get_registered_audio_providers()
3334
)
3435
self.assertIn("mimo", AgentCapabilityManager.get_registered_audio_providers())
36+
self.assertIn("minimax", AgentCapabilityManager.get_registered_audio_providers())
3537

3638
def test_get_audio_provider_uses_separate_input_and_output_settings(self):
3739
with patch.object(settings, "AUDIO_INPUT_PROVIDER", "openai"), patch.object(
@@ -230,6 +232,66 @@ def test_mimo_stt_uses_base64_audio_input(self):
230232
)
231233
self.assertIn("只输出转写结果", content[1]["text"])
232234

235+
def test_minimax_stt_normalizes_openai_default_model(self):
    """The OpenAI default STT model name must fall back to the MiniMax model."""
    minimax_provider = MiniMaxAudioProvider()
    openai_default_model = "gpt-4o-mini-transcribe"

    with patch.object(settings, "AUDIO_INPUT_MODEL", openai_default_model):
        resolved_model = minimax_provider._normalize_stt_model()

    self.assertEqual(resolved_model, "MiniMax-M2.7")
241+
242+
def test_minimax_tts_uses_t2a_http_payload(self):
    """Speech output must call the T2A HTTP endpoint and write the audio file."""
    minimax_provider = MiniMaxAudioProvider()

    # Canned successful T2A reply carrying hex-encoded audio.
    t2a_reply = {
        "data": {"audio": b"opus-bytes".hex(), "status": 2},
        "base_resp": {"status_code": 0, "status_msg": "success"},
    }
    http_client = Mock()
    http_client.post_res.return_value = SimpleNamespace(
        status_code=200,
        json=Mock(return_value=t2a_reply),
    )

    with TemporaryDirectory() as workspace:
        # Settings mimic a user who kept OpenAI defaults and entered the
        # Anthropic-style endpoint by mistake.
        fake_settings = SimpleNamespace(
            TEMP_PATH=Path(workspace),
            PROXY={},
            AUDIO_OUTPUT_MODEL="gpt-4o-mini-tts",
            AUDIO_OUTPUT_VOICE="alloy",
            AUDIO_OUTPUT_API_KEY="sk-test",
            AUDIO_OUTPUT_BASE_URL="https://api.minimaxi.com/anthropic/v1",
        )
        client_patch = patch.object(
            capability_module, "RequestUtils", return_value=http_client
        )
        settings_patch = patch.object(capability_module, "settings", fake_settings)
        with client_patch as client_factory, settings_patch:
            synthesized_path = minimax_provider.synthesize_speech("你好")
            # Read inside the context: the temp directory vanishes afterwards.
            written_bytes = (
                synthesized_path.read_bytes() if synthesized_path else None
            )

    self.assertIsNotNone(synthesized_path)
    self.assertEqual(written_bytes, b"opus-bytes")
    client_factory.assert_called_once()

    sent = http_client.post_res.call_args.kwargs
    sent_json = sent["json"]
    self.assertEqual(sent["url"], "https://api.minimaxi.com/v1/t2a_v2")
    self.assertEqual(sent_json["model"], "speech-2.8-turbo")
    self.assertEqual(
        sent_json["voice_setting"]["voice_id"],
        "Chinese (Mandarin)_Lyrical_Voice",
    )
    self.assertEqual(sent_json["audio_setting"]["format"], "opus")
285+
286+
def test_minimax_tts_accepts_base64_audio_payload(self):
    """Audio decoding must also accept base64 payloads (e.g. from a proxy)."""
    minimax_provider = MiniMaxAudioProvider()
    base64_payload = b64encode(b"opus-bytes").decode("utf-8")

    decoded_audio = minimax_provider._decode_audio_payload(base64_payload)

    self.assertEqual(decoded_audio, b"opus-bytes")
294+
233295

234296
if __name__ == "__main__":
235297
unittest.main()

0 commit comments

Comments
 (0)