Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion astrbot/core/config/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -1638,6 +1638,7 @@
"api_base": "https://api.xiaomimimo.com/v1",
"model": "mimo-v2-tts",
"mimo-tts-voice": "mimo_default",
"mimo-tts-voiceclone-audio": "",
"mimo-tts-format": "wav",
"mimo-tts-style-prompt": "",
"mimo-tts-dialect": "",
Expand Down Expand Up @@ -2604,7 +2605,12 @@
"mimo-tts-voice": {
"description": "音色",
"type": "string",
"hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。",
"hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。仅用于 mimo-v2.5-tts / mimo-v2-tts 等预置音色模型;使用 mimo-v2.5-tts-voiceclone 模型时本字段会被忽略,请改填下方的'音色复刻参考音频'。",
},
"mimo-tts-voiceclone-audio": {
"description": "音色复刻参考音频",
"type": "string",
"hint": "仅在模型为 mimo-v2.5-tts-voiceclone(音色复刻)时需要填写。支持本地文件路径、http(s) 链接、或 base64/data URI,音频格式仅支持 mp3、wav,转换后大小不超过 10 MB。留空且选用该模型时会报错。",
},
"mimo-tts-format": {
"description": "输出格式",
Expand Down
10 changes: 8 additions & 2 deletions astrbot/core/provider/sources/mimo_api_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,14 +56,20 @@ def build_api_url(api_base: str) -> str:
return normalized_api_base + "/chat/completions"


async def prepare_audio_input(audio_source: str) -> tuple[str, list[Path]]:
async def prepare_audio_input(
audio_source: str,
*,
target_format: str | None = "wav",
preserve_mp3: bool = False,
) -> tuple[str, list[Path]]:
audio_data = await MediaResolver(
audio_source,
media_type="audio",
default_suffix=".wav",
).to_base64_data(
strict=True,
target_format="wav",
target_format=target_format,
preserve_mp3=preserve_mp3,
)
if audio_data is None:
raise ValueError(f"Invalid audio data: {describe_media_ref(audio_source)}")
Expand Down
73 changes: 70 additions & 3 deletions astrbot/core/provider/sources/mimo_tts_api_source.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import asyncio
import base64
import uuid
from pathlib import Path

from ..entities import ProviderType
from ..provider import TTSProvider
Expand All @@ -12,9 +14,11 @@
MiMoAPIError,
build_api_url,
build_headers,
cleanup_files,
create_http_client,
get_temp_dir,
normalize_timeout,
prepare_audio_input,
)


Expand All @@ -35,6 +39,9 @@ def __init__(
self.proxy = provider_config.get("proxy", "")
self.timeout = normalize_timeout(provider_config.get("timeout", 20))
self.voice = provider_config.get("mimo-tts-voice", DEFAULT_MIMO_TTS_VOICE)
self.voiceclone_audio_source = provider_config.get(
"mimo-tts-voiceclone-audio", ""
).strip()
self.audio_format = provider_config.get("mimo-tts-format", "wav")
self.style_prompt = provider_config.get("mimo-tts-style-prompt", "")
self.dialect = provider_config.get("mimo-tts-dialect", "")
Expand All @@ -43,6 +50,13 @@ def __init__(
)
self.set_model(provider_config.get("model", DEFAULT_MIMO_TTS_MODEL))
self.client = create_http_client(self.timeout, self.proxy)
# 音色复刻(voiceclone)参考音频转换结果缓存,避免每次合成都重新读取/转码同一份样本
self._voiceclone_cache_source: str | None = None
self._voiceclone_cache_data_url: str | None = None
self._voiceclone_cleanup_paths: list[Path] = []
# TTS provider 实例在管线中是长期共享的,可能被并发请求同时调用;
# 用锁序列化"检查缓存 -> 转换 -> 写入缓存"这一过程,避免重复转换和临时文件泄漏。
self._voiceclone_lock = asyncio.Lock()

def _build_user_prompt(self) -> str | None:
seed_text = self.seed_text.strip()
Expand All @@ -69,7 +83,52 @@ def _build_style_prefix(self) -> str:
def _build_assistant_content(self, text: str) -> str:
return f"{self._build_style_prefix()}{text}"

def _build_payload(self, text: str) -> dict:
def _is_voiceclone_model(self) -> bool:
return "voiceclone" in self.model_name

async def _resolve_voiceclone_voice(self) -> str:
"""将配置的参考音频样本转换为 voiceclone 所需的 data URL。

结果会按音频来源缓存,避免每次合成请求都重新读取/转码同一份样本。
加锁是为了在并发请求下避免重复转换、避免临时文件泄漏或被错误清理。
"""
if not self.voiceclone_audio_source:
raise MiMoAPIError(
"MiMo TTS voiceclone model (mimo-v2.5-tts-voiceclone) requires a "
"reference audio sample. Please set 'mimo-tts-voiceclone-audio' to "
"a local path, URL, or base64/data URI."
)

async with self._voiceclone_lock:
if (
self._voiceclone_cache_data_url is not None
and self._voiceclone_cache_source == self.voiceclone_audio_source
):
return self._voiceclone_cache_data_url

try:
data_url, cleanup_paths = await prepare_audio_input(
self.voiceclone_audio_source,
Comment on lines +102 to +111

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (bug_risk): Consider reusing the lock in terminate() to avoid potential races with voiceclone cleanup.

terminate() calls cleanup_files(self._voiceclone_cleanup_paths) without holding _voiceclone_lock, while _resolve_voiceclone_voice() accesses and mutates _voiceclone_cleanup_paths under that lock. If terminate() runs while voiceclone resolution is in progress, this can cause races or inconsistent cleanup. Please guard terminate()’s access to _voiceclone_cleanup_paths with the same lock (or otherwise prevent concurrent access).

Suggested implementation:

        async with self._voiceclone_lock:
            cleanup_files(self._voiceclone_cleanup_paths)

To safely reuse _voiceclone_lock in terminate():

  1. Ensure terminate() is an async def so that async with self._voiceclone_lock: is valid. If terminate() must remain synchronous, instead move the cleanup into an async helper (e.g. _async_terminate_cleanup) that uses the lock, and have terminate() schedule/await that helper where appropriate.
  2. Verify that all other accesses and mutations of self._voiceclone_cleanup_paths (if any) are also guarded by _voiceclone_lock to fully prevent races.

# MiMo voiceclone 接受 mp3 或 wav;保留原始 mp3 而不强制转 wav,
# 可避免未压缩 PCM 带来的体积膨胀(更容易撞到官方 10 MB 上限)。
# 其他格式(ogg/flac/silk 等)仍会兜底转换为 wav。
target_format=None,
preserve_mp3=True,
)
except Exception as exc:
raise MiMoAPIError(
f"Failed to prepare MiMo TTS voiceclone reference audio "
f"'{self.voiceclone_audio_source}': {exc}"
) from exc

# 旧缓存的临时文件不再需要,先清理掉再写入新结果
cleanup_files(self._voiceclone_cleanup_paths)
self._voiceclone_cleanup_paths = cleanup_paths
self._voiceclone_cache_source = self.voiceclone_audio_source
self._voiceclone_cache_data_url = data_url
return data_url

def _build_payload(self, text: str, voice_value: str | None = None) -> dict:
messages: list[dict[str, str]] = []

user_prompt = self._build_user_prompt()
Expand All @@ -91,7 +150,9 @@ def _build_payload(self, text: str) -> dict:
audio_params = {"format": self.audio_format}
# voice design 模型不支持 audio.voice 参数
if "voicedesign" not in self.model_name:
audio_params["voice"] = self.voice
audio_params["voice"] = (
voice_value if voice_value is not None else self.voice
)

return {
"model": self.model_name,
Expand All @@ -100,10 +161,14 @@ def _build_payload(self, text: str) -> dict:
}

async def get_audio(self, text: str) -> str:
voice_value = None
if self._is_voiceclone_model():
voice_value = await self._resolve_voiceclone_voice()

response = await self.client.post(
build_api_url(self.api_base),
headers=build_headers(self.chosen_api_key),
json=self._build_payload(text),
json=self._build_payload(text, voice_value),
)

try:
Expand All @@ -129,5 +194,7 @@ async def get_audio(self, text: str) -> str:
return str(output_path)

async def terminate(self):
    """Release the resources held by this provider.

    Removes the files recorded in ``_voiceclone_cleanup_paths`` and then
    closes the HTTP client, if one is set.

    The cleanup list is read and reset while holding ``_voiceclone_lock``,
    the same lock ``_resolve_voiceclone_voice()`` holds when it replaces
    that list. Without it, a shutdown that overlaps an in-flight reference
    audio conversion could clean up a list that is about to be swapped out,
    leaving the newly recorded paths untracked.
    """
    try:
        # Wait for any in-progress voiceclone resolution to finish so the
        # list we clean is the final one, not one about to be replaced.
        async with self._voiceclone_lock:
            cleanup_files(self._voiceclone_cleanup_paths)
            self._voiceclone_cleanup_paths = []
    finally:
        # Close the client even if the file cleanup above raised, so the
        # connection pool is not left open on a failed shutdown.
        if self.client:
            await self.client.aclose()
Original file line number Diff line number Diff line change
Expand Up @@ -1616,7 +1616,11 @@
},
"mimo-tts-voice": {
"description": "Voice",
"hint": "MiMo TTS voice name. Supported values include 'mimo_default', 'default_en', and 'default_zh'."
"hint": "MiMo TTS voice name. Supported values include 'mimo_default', 'default_en', and 'default_zh'. Only used by preset-voice models; ignored when the model is mimo-v2.5-tts-voiceclone (use 'Voice Clone Reference Audio' instead)."
},
"mimo-tts-voiceclone-audio": {
"description": "Voice Clone Reference Audio",
"hint": "Required only when the model is mimo-v2.5-tts-voiceclone. Accepts a local file path, an http(s) URL, or a base64/data URI. Only mp3 and wav are supported, and the encoded size must not exceed 10 MB."
},
"mimo-tts-format": {
"description": "Output format",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1613,7 +1613,11 @@
},
"mimo-tts-voice": {
"description": "Голос",
"hint": "Имя голоса MiMo TTS. Поддерживаются значения 'mimo_default', 'default_en' и 'default_zh'."
"hint": "Имя голоса MiMo TTS. Поддерживаются значения 'mimo_default', 'default_en' и 'default_zh'. Используется только для моделей с предустановленными голосами; игнорируется для модели mimo-v2.5-tts-voiceclone (используйте 'Референсное аудио для клонирования голоса')."
},
"mimo-tts-voiceclone-audio": {
"description": "Референсное аудио для клонирования голоса",
"hint": "Требуется только для модели mimo-v2.5-tts-voiceclone. Принимает локальный путь к файлу, ссылку http(s) или base64/data URI. Поддерживаются только mp3 и wav, размер после кодирования не должен превышать 10 МБ."
},
"mimo-tts-format": {
"description": "Формат вывода",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1618,7 +1618,11 @@
},
"mimo-tts-voice": {
"description": "音色",
"hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。"
"hint": "MiMo TTS 的音色名称。可选值包括 'mimo_default'、'default_en'、'default_zh'。仅用于预置音色模型;使用 mimo-v2.5-tts-voiceclone 模型时本字段会被忽略,请改填'音色复刻参考音频'。"
},
"mimo-tts-voiceclone-audio": {
"description": "音色复刻参考音频",
"hint": "仅在模型为 mimo-v2.5-tts-voiceclone(音色复刻)时需要填写。支持本地文件路径、http(s) 链接、或 base64/data URI,音频格式仅支持 mp3、wav,转换后大小不超过 10 MB。"
},
"mimo-tts-format": {
"description": "输出格式",
Expand Down
Loading
Loading