AstrBotDevs · lingyun14beta · Jul 1, 2026 · sourcery-ai · Jul 1, 2026
diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
@@ -1638,6 +1638,7 @@
                         "api_base": "https://api.xiaomimimo.com/v1",
                         "model": "mimo-v2-tts",
                         "mimo-tts-voice": "mimo_default",
+                        "mimo-tts-voiceclone-audio": "",
                         "mimo-tts-format": "wav",
                         "mimo-tts-style-prompt": "",
                         "mimo-tts-dialect": "",
@@ -2604,7 +2605,12 @@
                     "mimo-tts-voice": {
                         "description": "音色",
                         "type": "string",
-                        "hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。",
+                        "hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。仅用于 mimo-v2.5-tts / mimo-v2-tts 等预置音色模型；使用 mimo-v2.5-tts-voiceclone 模型时本字段会被忽略，请改填下方的'音色复刻参考音频'。",
+                    },
+                    "mimo-tts-voiceclone-audio": {
+                        "description": "音色复刻参考音频",
+                        "type": "string",
+                        "hint": "仅在模型为 mimo-v2.5-tts-voiceclone（音色复刻）时需要填写。支持本地文件路径、http(s) 链接、或 base64/data URI，音频格式仅支持 mp3、wav，转换后大小不超过 10 MB。留空且选用该模型时会报错。",
                     },
                     "mimo-tts-format": {
                         "description": "输出格式",

diff --git a/astrbot/core/provider/sources/mimo_api_common.py b/astrbot/core/provider/sources/mimo_api_common.py
@@ -56,14 +56,20 @@ def build_api_url(api_base: str) -> str:
     return normalized_api_base + "/chat/completions"
 
 
-async def prepare_audio_input(audio_source: str) -> tuple[str, list[Path]]:
+async def prepare_audio_input(
+    audio_source: str,
+    *,
+    target_format: str | None = "wav",
+    preserve_mp3: bool = False,
+) -> tuple[str, list[Path]]:
     audio_data = await MediaResolver(
         audio_source,
         media_type="audio",
         default_suffix=".wav",
     ).to_base64_data(
         strict=True,
-        target_format="wav",
+        target_format=target_format,
+        preserve_mp3=preserve_mp3,
     )
     if audio_data is None:
         raise ValueError(f"Invalid audio data: {describe_media_ref(audio_source)}")

diff --git a/astrbot/core/provider/sources/mimo_tts_api_source.py b/astrbot/core/provider/sources/mimo_tts_api_source.py
@@ -1,5 +1,7 @@
+import asyncio
 import base64
 import uuid
+from pathlib import Path
 
 from ..entities import ProviderType
 from ..provider import TTSProvider
@@ -12,9 +14,11 @@
     MiMoAPIError,
     build_api_url,
     build_headers,
+    cleanup_files,
     create_http_client,
     get_temp_dir,
     normalize_timeout,
+    prepare_audio_input,
 )
 
 
@@ -35,6 +39,9 @@ def __init__(
         self.proxy = provider_config.get("proxy", "")
         self.timeout = normalize_timeout(provider_config.get("timeout", 20))
         self.voice = provider_config.get("mimo-tts-voice", DEFAULT_MIMO_TTS_VOICE)
+        self.voiceclone_audio_source = provider_config.get(
+            "mimo-tts-voiceclone-audio", ""
+        ).strip()
         self.audio_format = provider_config.get("mimo-tts-format", "wav")
         self.style_prompt = provider_config.get("mimo-tts-style-prompt", "")
         self.dialect = provider_config.get("mimo-tts-dialect", "")
@@ -43,6 +50,13 @@ def __init__(
         )
         self.set_model(provider_config.get("model", DEFAULT_MIMO_TTS_MODEL))
         self.client = create_http_client(self.timeout, self.proxy)
+        # 音色复刻(voiceclone)参考音频转换结果缓存，避免每次合成都重新读取/转码同一份样本
+        self._voiceclone_cache_source: str | None = None
+        self._voiceclone_cache_data_url: str | None = None
+        self._voiceclone_cleanup_paths: list[Path] = []
+        # TTS provider 实例在管线中是长期共享的，可能被并发请求同时调用；
+        # 用锁序列化"检查缓存 -> 转换 -> 写入缓存"这一过程，避免重复转换和临时文件泄漏。
+        self._voiceclone_lock = asyncio.Lock()
 
     def _build_user_prompt(self) -> str | None:
         seed_text = self.seed_text.strip()
@@ -69,7 +83,52 @@ def _build_style_prefix(self) -> str:
     def _build_assistant_content(self, text: str) -> str:
         return f"{self._build_style_prefix()}{text}"
 
-    def _build_payload(self, text: str) -> dict:
+    def _is_voiceclone_model(self) -> bool:
+        return "voiceclone" in self.model_name  # substring match on the model name
+
+    async def _resolve_voiceclone_voice(self) -> str:
+        """Return the configured reference sample as a voiceclone data URL.
+
+        The result is cached per audio source; the lock makes concurrent calls
+        convert only once. Raises MiMoAPIError if the sample is unset or unusable.
+        """
+        source = self.voiceclone_audio_source
+        if not source:
+            raise MiMoAPIError(
+                "MiMo TTS voiceclone model (mimo-v2.5-tts-voiceclone) requires a "
+                "reference audio sample. Please set 'mimo-tts-voiceclone-audio' to "
+                "a local path, URL, or base64/data URI."
+            )
+
+        async with self._voiceclone_lock:
+            cached = self._voiceclone_cache_data_url
+            if cached is not None and self._voiceclone_cache_source == source:
+                return cached
+
+            try:
+                data_url, cleanup_paths = await prepare_audio_input(
+                    source,
+                    # Intended effect: mp3 stays mp3 (avoids PCM size blow-up vs. the
+                    # 10 MB cap); other formats are expected to be converted to wav.
+                    target_format=None,
+                    preserve_mp3=True,
+                )
+            except Exception as exc:
+                # The source may be a multi-megabyte base64/data URI; cap the echo.
+                shown = source if len(source) <= 120 else f"{source[:120]}..."
+                raise MiMoAPIError(
+                    f"Failed to prepare MiMo TTS voiceclone reference audio "
+                    f"'{shown}': {exc}"
+                ) from exc
+
+            # Temp files of the previous cache entry are obsolete; remove them first.
+            cleanup_files(self._voiceclone_cleanup_paths)
+            self._voiceclone_cleanup_paths = cleanup_paths
+            self._voiceclone_cache_source = source
+            self._voiceclone_cache_data_url = data_url
+            return data_url
+
+    def _build_payload(self, text: str, voice_value: str | None = None) -> dict:
         messages: list[dict[str, str]] = []
 
         user_prompt = self._build_user_prompt()
@@ -91,7 +150,9 @@ def _build_payload(self, text: str) -> dict:
         audio_params = {"format": self.audio_format}
         # voice design 模型不支持 audio.voice 参数
         if "voicedesign" not in self.model_name:
-            audio_params["voice"] = self.voice
+            audio_params["voice"] = (
+                voice_value if voice_value is not None else self.voice
+            )
 
         return {
             "model": self.model_name,
@@ -100,10 +161,14 @@ def _build_payload(self, text: str) -> dict:
         }
 
     async def get_audio(self, text: str) -> str:
+        voice_value = None
+        if self._is_voiceclone_model():
+            voice_value = await self._resolve_voiceclone_voice()
+
         response = await self.client.post(
             build_api_url(self.api_base),
             headers=build_headers(self.chosen_api_key),
-            json=self._build_payload(text),
+            json=self._build_payload(text, voice_value),
         )
 
         try:
@@ -129,5 +194,7 @@ async def get_audio(self, text: str) -> str:
         return str(output_path)
 
     async def terminate(self):
+        cleanup_files(self._voiceclone_cleanup_paths)  # remove voiceclone temp files
+        self._voiceclone_cleanup_paths = []
         if self.client:
             await self.client.aclose()
diff --git a/dashboard/src/i18n/locales/en-US/features/config-metadata.json b/dashboard/src/i18n/locales/en-US/features/config-metadata.json
@@ -1616,7 +1616,11 @@
       },
       "mimo-tts-voice": {
         "description": "Voice",
-        "hint": "MiMo TTS voice name. Supported values include 'mimo_default', 'default_en', and 'default_zh'."
+        "hint": "MiMo TTS voice name. Supported values include 'mimo_default', 'default_en', and 'default_zh'. Only used by preset-voice models; ignored when the model is mimo-v2.5-tts-voiceclone (use 'Voice Clone Reference Audio' instead)."
+      },
+      "mimo-tts-voiceclone-audio": {
+        "description": "Voice Clone Reference Audio",
+        "hint": "Required only when the model is mimo-v2.5-tts-voiceclone. Accepts a local file path, an http(s) URL, or a base64/data URI. Only mp3 and wav are supported, and the encoded size must not exceed 10 MB."
       },
       "mimo-tts-format": {
         "description": "Output format",

diff --git a/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json b/dashboard/src/i18n/locales/ru-RU/features/config-metadata.json
@@ -1613,7 +1613,11 @@
             },
             "mimo-tts-voice": {
                 "description": "Голос",
-                "hint": "Имя голоса MiMo TTS. Поддерживаются значения 'mimo_default', 'default_en' и 'default_zh'."
+                "hint": "Имя голоса MiMo TTS. Поддерживаются значения 'mimo_default', 'default_en' и 'default_zh'. Используется только для моделей с предустановленными голосами; игнорируется для модели mimo-v2.5-tts-voiceclone (используйте 'Референсное аудио для клонирования голоса')."
+            },
+            "mimo-tts-voiceclone-audio": {
+                "description": "Референсное аудио для клонирования голоса",
+                "hint": "Требуется только для модели mimo-v2.5-tts-voiceclone. Принимает локальный путь к файлу, ссылку http(s) или base64/data URI. Поддерживаются только mp3 и wav, размер после кодирования не должен превышать 10 МБ."
             },
             "mimo-tts-format": {
                 "description": "Формат вывода",

diff --git a/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json b/dashboard/src/i18n/locales/zh-CN/features/config-metadata.json
@@ -1618,7 +1618,11 @@
       },
       "mimo-tts-voice": {
         "description": "音色",
-        "hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。"
+        "hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。仅用于预置音色模型；使用 mimo-v2.5-tts-voiceclone 模型时本字段会被忽略，请改填'音色复刻参考音频'。"
+      },
+      "mimo-tts-voiceclone-audio": {
+        "description": "音色复刻参考音频",
+        "hint": "仅在模型为 mimo-v2.5-tts-voiceclone（音色复刻）时需要填写。支持本地文件路径、http(s) 链接、或 base64/data URI，音频格式仅支持 mp3、wav，转换后大小不超过 10 MB。"
       },
       "mimo-tts-format": {
         "description": "输出格式",