Description
Hey, I've introduced the following two modifications for my own use and figured you may want to take a look and see if it's something you'd like to implement. This is pretty crude and needs some refinement for sure but works. The following code is a drop-in replacement (you will probably want to add relevant config.py settings). The first snippet is for whisperX, the second one adds AllTalk TTS support. AllTalk TTS is a little bit more demanding than piper but offers way better voice quality. WhisperX lets you run this app 100% offline. With 12GB VRAM I'm running the tiny whisper model, a 7B/8B LLM (currently testing wizardlm2 and llama3 via Ollama) and my custom AllTalk model.
import whisperx as wx
from pydub import AudioSegment
import os
from dotenv import load_dotenv
from config import AUDIO_FILE_DIR
import gc
# Load .env file if present
load_dotenv()
# whisperX runtime configuration (module-level so the model is loaded once at import).
device = "cuda"  # run inference on the GPU
batch_size = 16  # batch size passed to model.transcribe
compute_type = "int8"  # quantized weights — presumably chosen to reduce VRAM use; confirm against whisperX docs
model_dir = "C:\\test"  # download/cache directory for the model weights (Windows path)
language = "en"  # fixed transcription language
# Load the "tiny" whisper model once; reused by transcribe_audio below.
model = wx.load_model("tiny", device, language=language, compute_type=compute_type, download_root=model_dir)
def transcribe_audio(file_path):
    """Transcribe an audio file with the module-level whisperX model.

    Files of 24 MiB or less are transcribed in a single pass. Larger files
    are decoded with pydub, split into 10-minute chunks, exported as
    temporary MP3s, and transcribed chunk by chunk. The source file is
    deleted after a successful transcription.

    Args:
        file_path: File name relative to AUDIO_FILE_DIR.

    Returns:
        The concatenated transcript (each segment followed by a space),
        or "" if no segments were produced.

    Raises:
        FileNotFoundError: If the audio file does not exist.
        Exception: Wraps any other error raised during transcription.
    """
    full_path = os.path.join(AUDIO_FILE_DIR, file_path)
    try:
        segments = []
        if os.path.getsize(full_path) <= 24 * 1024 * 1024:
            # Small file: transcribe directly, no need to decode it with pydub.
            result = model.transcribe(full_path, batch_size=batch_size)
            segments.extend(seg['text'] for seg in result['segments'])
        else:
            # Large file: decode once, then process in 10-minute chunks so
            # the model never sees more than one chunk at a time.
            audio = AudioSegment.from_file(full_path)
            chunk_size = 10 * 60 * 1000  # pydub slices are in milliseconds
            temp_chunk_path = os.path.join(AUDIO_FILE_DIR, "temp_chunk.mp3")
            for start in range(0, len(audio), chunk_size):
                audio[start:start + chunk_size].export(temp_chunk_path, format="mp3")
                try:
                    result = model.transcribe(temp_chunk_path, batch_size=batch_size)
                    segments.extend(seg['text'] for seg in result['segments'])
                finally:
                    # Always remove the temp chunk, even if transcription fails.
                    os.remove(temp_chunk_path)
        # Original behavior: the source audio is deleted after transcription.
        os.remove(full_path)
        # Each segment is followed by a single space, matching the original
        # `transcript += segment['text'] + " "` accumulation.
        return " ".join(segments) + " " if segments else ""
    except FileNotFoundError as e:
        raise FileNotFoundError(f"The audio file {file_path} was not found.") from e
    except Exception as e:
        raise Exception(f"An error occurred during the transcription process: {e}") from e
def cleanup_model():
    """Release the module-level whisperX model and reclaim its memory.

    Safe to call more than once: the original `del model` raised NameError
    on a second call because the global no longer existed; this version
    only deletes (and collects) when the model is still present.
    """
    global model
    if "model" in globals():
        del model
        # Force a collection so the (potentially large) model object is
        # freed promptly rather than at the GC's leisure.
        gc.collect()
import os
import soundfile as sf
import sounddevice as sd
from openai import OpenAI
from dotenv import load_dotenv
import subprocess
import threading
import queue
import config
import tempfile
import utils
import requests
import shutil
...
...
def TTS_Alltalk(self, text_to_speak, output_file):
    """Generate speech through a locally running AllTalk TTS server.

    Sanitizes the text, POSTs it to the AllTalk HTTP API, and copies the
    generated WAV file to `output_file`.

    Args:
        text_to_speak: Raw text to synthesize.
        output_file: Destination path for the generated audio file.

    Returns:
        "success" if audio was generated and copied, "failed" otherwise.
        (Bug fix: the original implicitly returned None when the API
        replied with a status other than "generate-success".)
    """
    # Sanitize the input text by removing unsuitable characters.
    text_to_speak = utils.sanitize_text(text_to_speak)
    # If there is no text left after sanitization, return "failed".
    if not text_to_speak.strip():
        return "failed"
    # Local AllTalk API endpoint (default port).
    api_url = "http://127.0.0.1:7851/api/tts-generate"
    # Payload for the POST request; field semantics per the AllTalk API.
    data = {
        "text_input": text_to_speak,
        "text_filtering": "none",
        "character_voice_gen": "female_03.wav",
        "narrator_enabled": "false",
        "narrator_voice_gen": "arnold.wav",
        "text_not_inside": "character",
        "language": "en",
        "output_file_name": "output",
        "output_file_timestamp": "true",
        "autoplay": "false",
        "autoplay_volume": "0.8"
    }
    try:
        response = requests.post(api_url, data=data)
        response.raise_for_status()
        # ValueError covers a non-JSON body (requests' JSONDecodeError
        # subclasses it), which the original let propagate.
        response_data = response.json()
    except (requests.RequestException, ValueError) as e:
        print(f"Error calling TTS API: {e}")
        return "failed"
    if response_data.get("status") != "generate-success":
        # Original fell through here and returned None.
        return "failed"
    try:
        # Copy the generated file from AllTalk's output path to ours.
        shutil.copyfile(response_data["output_file_path"], output_file)
    except (OSError, KeyError) as e:
        # The original let a failed copy crash the caller.
        print(f"Error copying TTS output: {e}")
        return "failed"
    return "success"
The latter snippet is not an efficient solution, as there is no need to copy the AllTalk-generated WAV files over to the AlwaysReddy audio_files directory. It would make more sense to change AUDIO_FILE_DIR in config.py to point to the AllTalk output folder, or to change the output directory in AllTalk to point to AUDIO_FILE_DIR. If you think this may come in handy in any way, please feel free to use this code as you see fit.
Activity