54 changes: 54 additions & 0 deletions misc/Voice-to-Text-to-Voice/README.md
@@ -0,0 +1,54 @@
# Voice-to-Text-to-Voice Pipeline

This project implements a complete voice transformation pipeline that:

1. **Voice → Text**: Uses OpenAI's Whisper model (via Transformers) to transcribe speech to text
2. **Text → Voice**: Uses ChatterboxTTS to generate speech with voice cloning based on an audio prompt


The pipeline can run on an audio file, or live via system audio devices (microphone and speaker).
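
At its core, the pipeline is two model calls. A condensed sketch of the file-based path (mirroring [`main.py`](./main.py); paths are placeholders, and mono input is assumed):

```python
import torch
import torchaudio as ta
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from chatterbox.tts import ChatterboxTTS

# Voice -> Text: transcribe with Whisper
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
whisper = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

audio, sr = ta.load("input.wav")  # placeholder path
if sr != 16000:  # Whisper expects 16 kHz input
    audio = ta.transforms.Resample(sr, 16000)(audio)
features = processor(audio.squeeze().numpy(), sampling_rate=16000,
                     return_tensors="pt").input_features
text = processor.batch_decode(whisper.generate(features),
                              skip_special_tokens=True)[0]

# Text -> Voice: re-synthesize in the cloned voice
tts = ChatterboxTTS.from_pretrained(device="cuda" if torch.cuda.is_available() else "cpu")
wav = tts.generate(text, audio_prompt_path="male_petergriffin.wav")
ta.save("output.wav", wav, tts.sr)
```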

## Usage

Install the dependencies.
```sh
pip install -r requirements.txt
```
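
The scripts in this folder import at least the following packages (listed here for reference; `requirements.txt` is authoritative):

```
torch
torchaudio
transformers
chatterbox-tts
soundfile
sounddevice
numpy
```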

Make sure to uninstall and reinstall a PyTorch build that matches your CUDA version.
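
For example, on a CUDA 12.1 system (see [pytorch.org](https://pytorch.org/get-started/locally/) for the exact command matching your setup):

```sh
pip uninstall torch torchaudio
pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
```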

Once ready, run the pipeline:

```sh
python voice_pipeline.py
```

### Platform-Specific Setup

**Windows:**
1. Install [VB-Cable](https://vb-audio.com/Cable/) (free)
2. Set output device to "CABLE Input"
3. In Discord/Zoom, select "CABLE Output" as microphone

**macOS:**
1. Install [BlackHole](https://github.com/ExistentialAudio/BlackHole) (free)
2. Set output device to "BlackHole 2ch"
3. In Discord/Zoom, select "BlackHole 2ch" as microphone

**Linux:**
1. Create virtual device: `pactl load-module module-null-sink sink_name=virtual_mic`
2. Set output device to the virtual sink
3. In Discord/Zoom, select the virtual source as microphone (see the note below)
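
Note on step 3: `module-null-sink` creates only a sink; applications record from its monitor (`virtual_mic.monitor`). If the monitor is not selectable as a microphone, remapping it into a dedicated source usually helps (standard PulseAudio modules; assumes a PulseAudio or PipeWire-Pulse setup):

```sh
# Expose the sink's monitor as a regular capture source
pactl load-module module-remap-source master=virtual_mic.monitor source_name=virtual_source
```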

### Performance Tips

- **Lower latency**: Use shorter chunk durations (1-2 seconds)
- **Better quality**: Use longer chunk durations (3-5 seconds); see the capture sketch after this list
- **GPU acceleration**: Ensure CUDA is available for faster processing
- **Audio quality**: Use high-quality voice prompt files
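
As a rough illustration of the latency/quality trade-off, here is a minimal sketch of chunked microphone capture. `CHUNK_SECONDS` is a hypothetical knob for illustration, not an actual option of `voice_pipeline.py`:

```python
import sounddevice as sd

SAMPLE_RATE = 16000   # Whisper expects 16 kHz input
CHUNK_SECONDS = 2.0   # shorter = lower latency, longer = more transcription context

def capture_chunk():
    """Record one chunk from the default microphone as mono float32."""
    frames = int(CHUNK_SECONDS * SAMPLE_RATE)
    chunk = sd.rec(frames, samplerate=SAMPLE_RATE, channels=1, dtype="float32")
    sd.wait()  # block until the chunk is fully recorded
    return chunk.squeeze()
```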

## Files

- [`main.py`](./main.py) - Main pipeline implementation with demos
- [`voice_pipeline.py`](./voice_pipeline.py) - Command-line utility script
- [`debug_audio_test.py`](./debug_audio_test.py) - Standalone script to debug TTS audio generation
- [`male_petergriffin.wav`](./male_petergriffin.wav) - Sample audio prompt for voice cloning
80 changes: 80 additions & 0 deletions misc/Voice-to-Text-to-Voice/debug_audio_test.py
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Debug Audio Test Script

This script tests the TTS audio generation separately from real-time processing
to help diagnose audio output issues.
"""

import torch
import numpy as np
import soundfile as sf
from pathlib import Path
from chatterbox.tts import ChatterboxTTS
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def test_tts_generation():
"""Test TTS audio generation with debug output."""

# Test configuration
audio_prompt_path = "male_petergriffin.wav" # Update this path as needed
test_text = "Hello, this is a test of the text-to-speech system."

if not Path(audio_prompt_path).exists():
logger.error(f"Audio prompt file not found: {audio_prompt_path}")
return

try:
# Load TTS model
logger.info("Loading ChatterboxTTS model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
tts_model = ChatterboxTTS.from_pretrained(device=device)
logger.info(f"TTS model loaded on {device}")

# Generate audio
logger.info(f"Generating audio for text: '{test_text}'")
wav = tts_model.generate(test_text, audio_prompt_path=audio_prompt_path)

# Process audio
if isinstance(wav, torch.Tensor):
wav = wav.squeeze().cpu().numpy()

if wav.ndim > 1:
wav = wav.flatten()

# Save original audio
original_path = "debug_tts_original.wav"
sf.write(original_path, wav, tts_model.sr)
logger.info(f"Original TTS audio saved to: {original_path}")
logger.info(f"Original audio: length={len(wav)} samples, sr={tts_model.sr}Hz, duration={len(wav)/tts_model.sr:.2f}s")
logger.info(f"Original audio: min={wav.min():.4f}, max={wav.max():.4f}, mean={wav.mean():.4f}")

# Test resampling to 16kHz (Whisper sample rate)
target_sr = 16000
if tts_model.sr != target_sr:
import torchaudio as ta
logger.info(f"Resampling from {tts_model.sr}Hz to {target_sr}Hz")
wav_tensor = torch.tensor(wav, dtype=torch.float32).unsqueeze(0)
resampler = ta.transforms.Resample(tts_model.sr, target_sr)
wav_resampled = resampler(wav_tensor).squeeze().numpy()

# Save resampled audio
resampled_path = "debug_tts_resampled.wav"
sf.write(resampled_path, wav_resampled, target_sr)
logger.info(f"Resampled audio saved to: {resampled_path}")
logger.info(f"Resampled audio: length={len(wav_resampled)} samples, sr={target_sr}Hz, duration={len(wav_resampled)/target_sr:.2f}s")
logger.info(f"Resampled audio: min={wav_resampled.min():.4f}, max={wav_resampled.max():.4f}, mean={wav_resampled.mean():.4f}")

logger.info("TTS test completed successfully!")
logger.info("Check the generated audio files to verify they sound correct.")

except Exception as e:
logger.error(f"TTS test failed: {e}")
import traceback
logger.error(traceback.format_exc())

if __name__ == "__main__":
test_tts_generation()
132 changes: 132 additions & 0 deletions misc/Voice-to-Text-to-Voice/main.py
@@ -0,0 +1,132 @@
import torchaudio as ta
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from chatterbox.tts import ChatterboxTTS

def voice_to_text_to_voice_pipeline(input_audio_path, audio_prompt_path, output_path):
"""
Complete pipeline: voice -> text -> voice (transformed)

Args:
input_audio_path: Path to input audio file to transcribe
audio_prompt_path: Path to audio prompt for voice transformation
output_path: Path to save the transformed output audio
"""

# Step 1: Voice -> Text using Whisper from transformers
print("Loading Whisper model...")
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

print(f"Transcribing audio from {input_audio_path}...")
# Load audio file
audio_input, sample_rate = ta.load(input_audio_path)

# Resample to 16kHz if needed (Whisper expects 16kHz)
if sample_rate != 16000:
resampler = ta.transforms.Resample(sample_rate, 16000)
audio_input = resampler(audio_input)

# Convert to mono if stereo
if audio_input.shape[0] > 1:
audio_input = torch.mean(audio_input, dim=0, keepdim=True)

# Process audio
input_features = processor(audio_input.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features

# Generate transcription
with torch.no_grad():
predicted_ids = model.generate(input_features)

transcribed_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(f"Transcribed text: {transcribed_text}")

# Step 2: Text -> Voice using ChatterboxTTS with audio prompt
print("Loading ChatterboxTTS model...")
assert torch.cuda.is_available(), "CUDA is not available. Please install CUDA and PyTorch with GPU support."
tts_model = ChatterboxTTS.from_pretrained(device="cuda")

print(f"Generating transformed voice using prompt from {audio_prompt_path}...")
wav = tts_model.generate(transcribed_text, audio_prompt_path=audio_prompt_path)

# Step 3: Save the transformed audio
ta.save(output_path, wav, tts_model.sr)
print(f"Transformed audio saved to {output_path}")

return transcribed_text, output_path

# Example usage with your existing setup
if __name__ == "__main__":
    # Original text-to-speech example (kept for reference)
    print("=== Original Text-to-Speech Example ===")
    text = "Ezreal and Jinx teamed up with Ahri, Yasuo, and Teemo to take down the enemy's Nexus in an epic late-game pentakill."
    model = ChatterboxTTS.from_pretrained(device="cuda" if torch.cuda.is_available() else "cpu")
    AUDIO_PROMPT_PATH = "male_petergriffin.wav"
    wav = model.generate(text, audio_prompt_path=AUDIO_PROMPT_PATH)
    ta.save("output_original.wav", wav, model.sr)
    print("Original output saved to output_original.wav")

    # New voice-to-text-to-voice pipeline with file input
    print("\n=== Voice-to-Text-to-Voice Pipeline (File Input) ===")
    # You'll need to provide an input audio file to transcribe.
    # For demo purposes, the audio prompt doubles as the input (change as needed).
    INPUT_AUDIO_PATH = "male_petergriffin.wav"  # Change this to your input audio file
    AUDIO_PROMPT_PATH = "male_petergriffin.wav"  # This transforms the voice style
    OUTPUT_PATH = "output_transformed.wav"

    try:
        transcribed_text, output_file = voice_to_text_to_voice_pipeline(
            INPUT_AUDIO_PATH,
            AUDIO_PROMPT_PATH,
            OUTPUT_PATH
        )
        print("\nFile pipeline completed successfully!")
        print(f"Transcribed: '{transcribed_text}'")
        print(f"Transformed audio saved to: {output_file}")
    except Exception as e:
        print(f"Error in file pipeline: {e}")
        print("Make sure you have an input audio file and the required models are available.")

    # Live microphone recording demo
    print("\n=== Live Microphone Recording Demo ===")
    try:
        from microphone_recorder import MicrophoneRecorder

        response = input("Would you like to try live microphone recording? (y/n): ").lower().strip()
        if response in ['y', 'yes']:
            recorder = MicrophoneRecorder()

            print("\n🎤 Available audio devices:")
            recorder.list_audio_devices()

            print("\n🔴 Ready to record! Speak into your microphone...")
            print("Press ENTER when you're done speaking.")

            temp_recording_path = "temp_recording.wav"
            success = recorder.record_and_save(temp_recording_path)

            if success:
                print("\n🎯 Processing your recorded audio...")
                transcribed_text, output_file = voice_to_text_to_voice_pipeline(
                    temp_recording_path,
                    AUDIO_PROMPT_PATH,
                    "output_live_recording.wav"
                )

                print("\n✅ Live recording pipeline completed!")
                print(f"📝 You said: '{transcribed_text}'")
                print("🔊 Transformed audio saved to: output_live_recording.wav")

                # Clean up temporary file
                import os
                os.remove(temp_recording_path)
                print("🗑️ Temporary recording file cleaned up.")
            else:
                print("❌ Recording failed.")
        else:
            print("Skipping live recording demo.")

    except ImportError:
        print("Microphone recording not available. Install the sounddevice and soundfile packages.")
    except Exception as e:
        print(f"Error in live recording demo: {e}")
Binary file not shown.