Skip to content

Commit 430bab8

Browse files
Set voice cloning Qwen3-TTS as the default with dynamic voice creation
1 parent f170e2e commit 430bab8

3 files changed

Lines changed: 7 additions & 7 deletions

File tree

src/sdialog/audio/dialog.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,7 @@ def persona_to_voice(
440440
:type seed: int
441441
"""
442442
if voices is None and voice_database is None:
443+
logger.info("No voices provided, generating them dynamically based on the persona definition of each speaker.")
443444
reference_prompts = generate_reference_voices(dialog=self, voice_clone_model=tts_engine)
444445
voices = {role: reference_prompts.get(speaker) for speaker, role in self.speakers_roles.items()}
445446

src/sdialog/audio/pipeline.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,9 @@
5858
from sdialog.audio.utils import logger
5959
from sdialog.audio.dialog import AudioDialog
6060
from sdialog.audio.processing import AudioProcessor
61-
from sdialog.audio.tts import BaseTTS, Qwen3TTS
6261
from sdialog.audio.jsalt import MedicalRoomGenerator, RoomRole
6362
from sdialog.audio.room import Room, RoomPosition, DirectivityType
63+
from sdialog.audio.tts import BaseTTS, Qwen3TTS, Qwen3TTSVoiceClone
6464
from sdialog.audio.voice_database import Voice, BaseVoiceDatabase, HuggingfaceVoiceDatabase
6565
from sdialog.audio import generate_utterances_audios, generate_audio_room_accoustic
6666
from sdialog.audio.impulse_response_database import ImpulseResponseDatabase, RecordingDevice
@@ -359,13 +359,12 @@ def __init__(
359359

360360
self.tts_engine = tts_engine
361361
if self.tts_engine is None:
362-
logger.warning("No TTS provided, using Qwen3-TTS as the default TTS model: Qwen/Qwen3-TTS-12Hz-1.7B-Base")
363-
self.tts_engine = Qwen3TTS()
362+
logger.warning("No TTS provided, using voice cloning Qwen3-TTS as the default TTS model (Qwen3-TTS-12Hz-1.7B-Base)")
363+
self.tts_engine = Qwen3TTSVoiceClone()
364364

365365
self.voice_database = voice_database
366366
if self.voice_database is None and isinstance(self.tts_engine, BaseTTS):
367-
logger.warning("No voice database provided, make sure the TTS engine supports voice design or voice "
368-
"cloning if you want to use the voice assignment features of the audio pipeline.")
367+
logger.warning("No voice database provided, using default voice database for the TTS engine.")
369368
# TODO: default voice database SHOULD be part of the TTS engine!
370369
# since each engine supports a predefined voice database we should get the default as:
371370
# self.voice_database = self.tts_engine.voice_database

src/sdialog/audio/tts/base.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ class BaseVoiceCloneTTS(ABC):
106106
def generate(
107107
self,
108108
text: str,
109-
speaker_voice: str | Any = None,
109+
speaker_voice: Any = None,
110110
tts_pipeline_kwargs: dict = {}) -> tuple[np.ndarray, int]:
111111
"""
112112
Generates audio from text using voice cloning.
@@ -117,7 +117,7 @@ def generate(
117117
:param text: The text to be converted to speech.
118118
:type text: str
119119
:param speaker_voice: Reference audio path, prompt object, or None.
120-
:type speaker_voice: str | Any
120+
:type speaker_voice: Any
121121
:param tts_pipeline_kwargs: Additional keyword arguments passed to the TTS pipeline.
122122
:type tts_pipeline_kwargs: dict
123123
:return: A tuple containing the audio data as a numpy array and the sampling rate.

0 commit comments

Comments
 (0)