Set voice cloning Qwen3-TTS as the default with dynamic voice creation

sergioburdisso · sergioburdisso · commit 430bab8b1476 · 2026-02-11T11:20:23.000+01:00
diff --git a/src/sdialog/audio/dialog.py b/src/sdialog/audio/dialog.py
@@ -440,6 +440,7 @@ def persona_to_voice(
         :type seed: int
         """
         if voices is None and voice_database is None:
+            logger.info("No voices provided, generating them dynamically based on the persona definition of each speaker.")
             reference_prompts = generate_reference_voices(dialog=self, voice_clone_model=tts_engine)
             voices = {role: reference_prompts.get(speaker) for speaker, role in self.speakers_roles.items()}
 
diff --git a/src/sdialog/audio/pipeline.py b/src/sdialog/audio/pipeline.py
@@ -58,9 +58,9 @@
 from sdialog.audio.utils import logger
 from sdialog.audio.dialog import AudioDialog
 from sdialog.audio.processing import AudioProcessor
-from sdialog.audio.tts import BaseTTS, Qwen3TTS
 from sdialog.audio.jsalt import MedicalRoomGenerator, RoomRole
 from sdialog.audio.room import Room, RoomPosition, DirectivityType
+from sdialog.audio.tts import BaseTTS, Qwen3TTS, Qwen3TTSVoiceClone
 from sdialog.audio.voice_database import Voice, BaseVoiceDatabase, HuggingfaceVoiceDatabase
 from sdialog.audio import generate_utterances_audios, generate_audio_room_accoustic
 from sdialog.audio.impulse_response_database import ImpulseResponseDatabase, RecordingDevice
@@ -359,13 +359,12 @@ def __init__(
 
         self.tts_engine = tts_engine
         if self.tts_engine is None:
-            logger.warning("No TTS provided, using Qwen3-TTS as the default TTS model: Qwen/Qwen3-TTS-12Hz-1.7B-Base")
-            self.tts_engine = Qwen3TTS()
+            logger.warning("No TTS provided, using voice cloning Qwen3-TTS as the default TTS model (Qwen3-TTS-12Hz-1.7B-Base)")
+            self.tts_engine = Qwen3TTSVoiceClone()
 
         self.voice_database = voice_database
         if self.voice_database is None and isinstance(self.tts_engine, BaseTTS):
-            logger.warning("No voice database provided, make sure the TTS engine supports voice design or voice "
-                           "cloning if you want to use the voice assignment features of the audio pipeline.")
+            logger.warning("No voice database provided, using default voice database for the TTS engine.")
             # TODO: the default voice database SHOULD be part of the TTS engine!
             #       Since each engine supports a predefined voice database, we should get the default as:
             #       self.voice_database = self.tts_engine.voice_database
diff --git a/src/sdialog/audio/tts/base.py b/src/sdialog/audio/tts/base.py
@@ -106,7 +106,7 @@ class BaseVoiceCloneTTS(ABC):
     def generate(
             self,
             text: str,
-            speaker_voice: str | Any = None,
+            speaker_voice: Any = None,
             tts_pipeline_kwargs: dict = {}) -> tuple[np.ndarray, int]:
         """
         Generates audio from text using voice cloning.
@@ -117,7 +117,7 @@ def generate(
         :param text: The text to be converted to speech.
         :type text: str
         :param speaker_voice: Reference audio path, prompt object, or None.
-        :type speaker_voice: str | Any
+        :type speaker_voice: Any
         :param tts_pipeline_kwargs: Additional keyword arguments passed to the TTS pipeline.
         :type tts_pipeline_kwargs: dict
         :return: A tuple containing the audio data as a numpy array and the sampling rate.