diff --git a/qwen_tts/inference/qwen3_tts_model.py b/qwen_tts/inference/qwen3_tts_model.py index f4d33bf3..21041fa3 100644 --- a/qwen_tts/inference/qwen3_tts_model.py +++ b/qwen_tts/inference/qwen3_tts_model.py @@ -259,8 +259,8 @@ def _normalize_audio_inputs(self, audios: Union[AudioLike, List[AudioLike]]) -> raise TypeError(f"Unsupported audio input type: {type(a)}") for i, a in enumerate(out): if a[0].ndim > 1: - a[0] = np.mean(a[0], axis=-1).astype(np.float32) - out[i] = (a[0], a[1]) + a_s = np.mean(a[0], axis=-1).astype(np.float32) + out[i] = (a_s, a[1]) return out def _ensure_list(self, x: MaybeList) -> List[Any]: