54 changes: 54 additions & 0 deletions misc/Voice-to-Text-to-Voice/README.md
@@ -0,0 +1,54 @@
# Voice-to-Text-to-Voice Pipeline

This project implements a complete voice transformation pipeline that:

1. **Voice → Text**: Uses OpenAI's Whisper model (via Transformers) to transcribe speech to text
2. **Text → Voice**: Uses ChatterboxTTS to generate speech with voice cloning based on an audio prompt


The pipeline can run on an audio file, or live via system audio devices (microphone and speaker).
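
At its core, the pipeline is two model calls. A condensed sketch of the file-based path (mirroring [`main.py`](./main.py); paths are placeholders, and mono input is assumed):

```python
import torch
import torchaudio as ta
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from chatterbox.tts import ChatterboxTTS

# Voice -> Text: transcribe with Whisper
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
whisper = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

audio, sr = ta.load("input.wav")  # placeholder path
if sr != 16000:  # Whisper expects 16 kHz input
    audio = ta.transforms.Resample(sr, 16000)(audio)
features = processor(audio.squeeze().numpy(), sampling_rate=16000,
                     return_tensors="pt").input_features
text = processor.batch_decode(whisper.generate(features),
                              skip_special_tokens=True)[0]

# Text -> Voice: re-synthesize in the cloned voice
tts = ChatterboxTTS.from_pretrained(device="cuda" if torch.cuda.is_available() else "cpu")
wav = tts.generate(text, audio_prompt_path="male_petergriffin.wav")
ta.save("output.wav", wav, tts.sr)
```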

## Usage

Install the dependencies.
```sh
pip install -r requirements.txt
```
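
The scripts in this folder import at least the following packages (listed here for reference; `requirements.txt` is authoritative):

```
torch
torchaudio
transformers
chatterbox-tts
soundfile
sounddevice
numpy
```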

Make sure to uninstall and reinstall a PyTorch build that matches your CUDA version.
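
For example, on a CUDA 12.1 system (see [pytorch.org](https://pytorch.org/get-started/locally/) for the exact command matching your setup):

```sh
pip uninstall torch torchaudio
pip install torch torchaudio --index-url https://download.pytorch.org/whl/cu121
```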

Once ready, run the pipeline:

```sh
python voice_pipeline.py
```

### Platform-Specific Setup

**Windows:**
1. Install [VB-Cable](https://vb-audio.com/Cable/) (free)
2. Set output device to "CABLE Input"
3. In Discord/Zoom, select "CABLE Output" as microphone

**macOS:**
1. Install [BlackHole](https://github.com/ExistentialAudio/BlackHole) (free)
2. Set output device to "BlackHole 2ch"
3. In Discord/Zoom, select "BlackHole 2ch" as microphone

**Linux:**
1. Create virtual device: `pactl load-module module-null-sink sink_name=virtual_mic`
2. Set output device to the virtual sink
3. In Discord/Zoom, select the virtual source as microphone (see the note below)
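
Note on step 3: `module-null-sink` creates only a sink; applications record from its monitor (`virtual_mic.monitor`). If the monitor is not selectable as a microphone, remapping it into a dedicated source usually helps (standard PulseAudio modules; assumes a PulseAudio or PipeWire-Pulse setup):

```sh
# Expose the sink's monitor as a regular capture source
pactl load-module module-remap-source master=virtual_mic.monitor source_name=virtual_source
```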

### Performance Tips

- **Lower latency**: Use shorter chunk durations (1-2 seconds)
- **Better quality**: Use longer chunk durations (3-5 seconds); see the capture sketch after this list
- **GPU acceleration**: Ensure CUDA is available for faster processing
- **Audio quality**: Use high-quality voice prompt files
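
As a rough illustration of the latency/quality trade-off, here is a minimal sketch of chunked microphone capture. `CHUNK_SECONDS` is a hypothetical knob for illustration, not an actual option of `voice_pipeline.py`:

```python
import sounddevice as sd

SAMPLE_RATE = 16000   # Whisper expects 16 kHz input
CHUNK_SECONDS = 2.0   # shorter = lower latency, longer = more transcription context

def capture_chunk():
    """Record one chunk from the default microphone as mono float32."""
    frames = int(CHUNK_SECONDS * SAMPLE_RATE)
    chunk = sd.rec(frames, samplerate=SAMPLE_RATE, channels=1, dtype="float32")
    sd.wait()  # block until the chunk is fully recorded
    return chunk.squeeze()
```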

## Files

- [`main.py`](./main.py) - Main pipeline implementation with demos
- [`voice_pipeline.py`](./voice_pipeline.py) - Command-line utility script
- [`debug_audio_test.py`](./debug_audio_test.py) - Standalone script to debug TTS audio generation
- [`male_petergriffin.wav`](./male_petergriffin.wav) - Sample audio prompt for voice cloning
80 changes: 80 additions & 0 deletions misc/Voice-to-Text-to-Voice/debug_audio_test.py
@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Debug Audio Test Script

This script tests the TTS audio generation separately from real-time processing
to help diagnose audio output issues.
"""

import torch
import numpy as np
import soundfile as sf
from pathlib import Path
from chatterbox.tts import ChatterboxTTS
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def test_tts_generation():
"""Test TTS audio generation with debug output."""

# Test configuration
audio_prompt_path = "male_petergriffin.wav" # Update this path as needed
test_text = "Hello, this is a test of the text-to-speech system."

if not Path(audio_prompt_path).exists():
logger.error(f"Audio prompt file not found: {audio_prompt_path}")
return

try:
# Load TTS model
logger.info("Loading ChatterboxTTS model...")
device = "cuda" if torch.cuda.is_available() else "cpu"
tts_model = ChatterboxTTS.from_pretrained(device=device)
logger.info(f"TTS model loaded on {device}")

# Generate audio
logger.info(f"Generating audio for text: '{test_text}'")
wav = tts_model.generate(test_text, audio_prompt_path=audio_prompt_path)

# Process audio
if isinstance(wav, torch.Tensor):
wav = wav.squeeze().cpu().numpy()

if wav.ndim > 1:
wav = wav.flatten()

# Save original audio
original_path = "debug_tts_original.wav"
sf.write(original_path, wav, tts_model.sr)
logger.info(f"Original TTS audio saved to: {original_path}")
logger.info(f"Original audio: length={len(wav)} samples, sr={tts_model.sr}Hz, duration={len(wav)/tts_model.sr:.2f}s")
logger.info(f"Original audio: min={wav.min():.4f}, max={wav.max():.4f}, mean={wav.mean():.4f}")

# Test resampling to 16kHz (Whisper sample rate)
target_sr = 16000
if tts_model.sr != target_sr:
import torchaudio as ta
logger.info(f"Resampling from {tts_model.sr}Hz to {target_sr}Hz")
wav_tensor = torch.tensor(wav, dtype=torch.float32).unsqueeze(0)
resampler = ta.transforms.Resample(tts_model.sr, target_sr)
wav_resampled = resampler(wav_tensor).squeeze().numpy()

# Save resampled audio
resampled_path = "debug_tts_resampled.wav"
sf.write(resampled_path, wav_resampled, target_sr)
logger.info(f"Resampled audio saved to: {resampled_path}")
logger.info(f"Resampled audio: length={len(wav_resampled)} samples, sr={target_sr}Hz, duration={len(wav_resampled)/target_sr:.2f}s")
logger.info(f"Resampled audio: min={wav_resampled.min():.4f}, max={wav_resampled.max():.4f}, mean={wav_resampled.mean():.4f}")

logger.info("TTS test completed successfully!")
logger.info("Check the generated audio files to verify they sound correct.")

except Exception as e:
logger.error(f"TTS test failed: {e}")
import traceback
logger.error(traceback.format_exc())

if __name__ == "__main__":
test_tts_generation()
132 changes: 132 additions & 0 deletions misc/Voice-to-Text-to-Voice/main.py
@@ -0,0 +1,132 @@
import torchaudio as ta
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from chatterbox.tts import ChatterboxTTS

def voice_to_text_to_voice_pipeline(input_audio_path, audio_prompt_path, output_path):
"""
Complete pipeline: voice -> text -> voice (transformed)

Args:
input_audio_path: Path to input audio file to transcribe
audio_prompt_path: Path to audio prompt for voice transformation
output_path: Path to save the transformed output audio
"""

# Step 1: Voice -> Text using Whisper from transformers
print("Loading Whisper model...")
processor = WhisperProcessor.from_pretrained("openai/whisper-base")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-base")

print(f"Transcribing audio from {input_audio_path}...")
# Load audio file
audio_input, sample_rate = ta.load(input_audio_path)

# Resample to 16kHz if needed (Whisper expects 16kHz)
if sample_rate != 16000:
resampler = ta.transforms.Resample(sample_rate, 16000)
audio_input = resampler(audio_input)

# Convert to mono if stereo
if audio_input.shape[0] > 1:
audio_input = torch.mean(audio_input, dim=0, keepdim=True)

# Process audio
input_features = processor(audio_input.squeeze().numpy(), sampling_rate=16000, return_tensors="pt").input_features

# Generate transcription
with torch.no_grad():
predicted_ids = model.generate(input_features)

transcribed_text = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
print(f"Transcribed text: {transcribed_text}")

# Step 2: Text -> Voice using ChatterboxTTS with audio prompt
print("Loading ChatterboxTTS model...")
assert torch.cuda.is_available(), "CUDA is not available. Please install CUDA and PyTorch with GPU support."
tts_model = ChatterboxTTS.from_pretrained(device="cuda")

print(f"Generating transformed voice using prompt from {audio_prompt_path}...")
wav = tts_model.generate(transcribed_text, audio_prompt_path=audio_prompt_path)

# Step 3: Save the transformed audio
ta.save(output_path, wav, tts_model.sr)
print(f"Transformed audio saved to {output_path}")

return transcribed_text, output_path

# Example usage with your existing setup
if __name__ == "__main__":
    # Original text-to-speech example (kept for reference)
    print("=== Original Text-to-Speech Example ===")
    text = "Ezreal and Jinx teamed up with Ahri, Yasuo, and Teemo to take down the enemy's Nexus in an epic late-game pentakill."
    model = ChatterboxTTS.from_pretrained(device="cuda" if torch.cuda.is_available() else "cpu")
    AUDIO_PROMPT_PATH = "male_petergriffin.wav"
    wav = model.generate(text, audio_prompt_path=AUDIO_PROMPT_PATH)
    ta.save("output_original.wav", wav, model.sr)
    print("Original output saved to output_original.wav")

    # New voice-to-text-to-voice pipeline with file input
    print("\n=== Voice-to-Text-to-Voice Pipeline (File Input) ===")
    # You'll need to provide an input audio file to transcribe.
    # For demo purposes, the audio prompt doubles as the input (change as needed).
    INPUT_AUDIO_PATH = "male_petergriffin.wav"  # Change this to your input audio file
    AUDIO_PROMPT_PATH = "male_petergriffin.wav"  # This transforms the voice style
    OUTPUT_PATH = "output_transformed.wav"

    try:
        transcribed_text, output_file = voice_to_text_to_voice_pipeline(
            INPUT_AUDIO_PATH,
            AUDIO_PROMPT_PATH,
            OUTPUT_PATH
        )
        print("\nFile pipeline completed successfully!")
        print(f"Transcribed: '{transcribed_text}'")
        print(f"Transformed audio saved to: {output_file}")
    except Exception as e:
        print(f"Error in file pipeline: {e}")
        print("Make sure you have an input audio file and the required models are available.")

    # Live microphone recording demo
    print("\n=== Live Microphone Recording Demo ===")
    try:
        from microphone_recorder import MicrophoneRecorder

        response = input("Would you like to try live microphone recording? (y/n): ").lower().strip()
        if response in ['y', 'yes']:
            recorder = MicrophoneRecorder()

            print("\n🎤 Available audio devices:")
            recorder.list_audio_devices()

            print("\n🔴 Ready to record! Speak into your microphone...")
            print("Press ENTER when you're done speaking.")

            temp_recording_path = "temp_recording.wav"
            success = recorder.record_and_save(temp_recording_path)

            if success:
                print("\n🎯 Processing your recorded audio...")
                transcribed_text, output_file = voice_to_text_to_voice_pipeline(
                    temp_recording_path,
                    AUDIO_PROMPT_PATH,
                    "output_live_recording.wav"
                )

                print("\n✅ Live recording pipeline completed!")
                print(f"📝 You said: '{transcribed_text}'")
                print("🔊 Transformed audio saved to: output_live_recording.wav")

                # Clean up temporary file
                import os
                os.remove(temp_recording_path)
                print("🗑️ Temporary recording file cleaned up.")
            else:
                print("❌ Recording failed.")
        else:
            print("Skipping live recording demo.")

    except ImportError:
        print("Microphone recording not available. Install the sounddevice and soundfile packages.")
    except Exception as e:
        print(f"Error in live recording demo: {e}")
Binary file not shown.