128 changes: 128 additions & 0 deletions .gitignore
@@ -0,0 +1,128 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# IDE specific files
.idea/
.vscode/
*.swp
*.swo

# Project specific
model_weights/
*.pt
*.pth

# Docker
.docker/
docker-volumes/
models/
51 changes: 51 additions & 0 deletions Dockerfile.project
@@ -0,0 +1,51 @@
FROM nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04
WORKDIR /root

# Set DEBIAN_FRONTEND to noninteractive to avoid prompts during apt-get install
ENV DEBIAN_FRONTEND=noninteractive

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
python3-pip \
libsndfile1 \
ffmpeg \
wget \
&& rm -rf /var/lib/apt/lists/*

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Install PyTorch wheels built against CUDA 11.8; the pip wheels bundle their
# own CUDA runtime, so they also run on this CUDA 12.3 base image
RUN python3 -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Set working directory
WORKDIR /app

# Copy only the requirements file first
COPY services/WhisperLive/requirements/server.txt /tmp/requirements.txt

# Remove openai-whisper and onnxruntime lines from the copied requirements file
RUN sed -i '/openai-whisper/d' /tmp/requirements.txt || true \
&& sed -i '/onnxruntime==/d' /tmp/requirements.txt || true

# Install remaining Python dependencies from the modified requirements file
RUN python3 -m pip install --no-cache-dir -r /tmp/requirements.txt

# Now copy the application code
COPY services/WhisperLive/ /app/

# Copy our entrypoint script
COPY services/WhisperLive/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

# Set it as the entrypoint
ENTRYPOINT ["/entrypoint.sh"]

# Default command to run the server with faster_whisper backend
CMD ["--port", "9090", "--backend", "faster_whisper"]
5 changes: 5 additions & 0 deletions README.md
@@ -182,3 +182,8 @@ We are available to help you with both Open Source and proprietary AI projects.
howpublished = {\url{https://github.com/snakers4/silero-vad}},
email = {[email protected]}
}



(whisperlive) dima@bbb:~/test1/vexa/vexa/WhisperLive_fresh$ export LD_LIBRARY_PATH=$CONDA_PREFIX/lib/python3.10/site-packages/nvidia/cudnn/lib:$LD_LIBRARY_PATH
(whisperlive) dima@bbb:~/test1/vexa/vexa/WhisperLive_fresh$ python3 run_server.py --port 9090 --backend faster_whisper -m
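
The export above points ctranslate2 at the cuDNN libraries bundled with the pip-installed NVIDIA wheels inside the conda env. A hedged Python one-liner to compute the same paths, assuming the `nvidia-cublas-cu*` and `nvidia-cudnn-cu*` wheels are installed (the trick faster-whisper's own docs use):

```python
# Print the lib dirs of the pip-installed NVIDIA cuBLAS/cuDNN wheels so
# they can be appended to LD_LIBRARY_PATH before starting the server.
import os
import nvidia.cublas.lib
import nvidia.cudnn.lib

print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" +
      os.path.dirname(nvidia.cudnn.lib.__file__))
```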
13 changes: 13 additions & 0 deletions TensorRT_whisper.md
@@ -18,11 +18,18 @@ docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it g
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en # float16
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples medium

# convert small multilingual model
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small
```

We have committed a Docker image with the medium model already built; reuse it:
```
REPOSITORY TAG IMAGE ID CREATED SIZE
whisperlive-trt-medium-ready latest 8596e0157dbf 2 seconds ago 19.1GB
```

## Run WhisperLive Server with TensorRT Backend
```bash
# Run English only model
@@ -36,3 +43,9 @@ python3 run_server.py --port 9090 \
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_float16" \
--trt_multilingual
```


```bash
# Run medium multilingual model
python3 run_server.py --port 9090 \
                      --backend tensorrt \
                      --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_medium_float16" \
                      --trt_multilingual
```
3 changes: 3 additions & 0 deletions entrypoint.sh
@@ -0,0 +1,3 @@
#!/bin/sh
# Execute the WhisperLive server script with any provided arguments
exec python3 run_server.py "$@"
1 change: 1 addition & 0 deletions requirements/server.txt
@@ -1,5 +1,6 @@
faster-whisper==1.1.0
websockets
websocket-client
onnxruntime==1.17.0
numba
kaldialign
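
The added websocket-client dependency is what `whisper_live/client.py` uses for its WebSocket connection. A minimal sketch of the initial JSON handshake the updated client sends on connect, assuming a server on localhost:9090; all field values are placeholders mirroring the defaults in the client diff below:

```python
# Hedged sketch of the initial handshake the updated client sends,
# using the newly added websocket-client dependency.
import json
import uuid

import websocket  # from the websocket-client package

ws = websocket.create_connection("ws://localhost:9090")
ws.send(json.dumps({
    "uid": str(uuid.uuid4()),
    "language": "en",
    "task": "transcribe",
    "model": "small",
    "use_vad": True,
    "max_clients": 4,
    "max_connection_time": 600,
    "platform": "test_platform",
    "meeting_url": "test_url",
    "token": "test_token",
}))
print(ws.recv())  # first server status message
ws.close()
```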
48 changes: 33 additions & 15 deletions whisper_live/client.py
@@ -33,6 +33,9 @@ def __init__(
log_transcription=True,
max_clients=4,
max_connection_time=600,
platform="test_platform",
meeting_url="test_url",
token="test_token"
):
"""
Initializes a Client instance for audio recording and streaming to a server.
@@ -52,6 +55,9 @@
log_transcription (bool, optional): Whether to log transcription output to the console. Default is True.
max_clients (int, optional): Maximum number of client connections allowed. Default is 4.
max_connection_time (int, optional): Maximum allowed connection time in seconds. Default is 600.
platform (str, optional): Platform identifier sent to the server. Defaults to "test_platform".
meeting_url (str, optional): Meeting URL identifier sent to the server. Defaults to "test_url".
token (str, optional): Token identifier sent to the server. Defaults to "test_token".
"""
self.recording = False
self.task = "transcribe"
@@ -69,6 +75,9 @@
self.log_transcription = log_transcription
self.max_clients = max_clients
self.max_connection_time = max_connection_time
self.platform = platform
self.meeting_url = meeting_url
self.token = token

if translate:
self.task = "translate"
@@ -195,26 +204,26 @@ def on_open(self, ws):
Callback function called when the WebSocket connection is successfully opened.

Sends an initial configuration message to the server, including client UID,
language selection, and task type.
language selection, task type, and the platform, meeting_url, and token identifiers.

Args:
ws (websocket.WebSocketApp): The WebSocket client instance.

"""
print("[INFO]: Opened connection")
ws.send(
json.dumps(
{
"uid": self.uid,
"language": self.language,
"task": self.task,
"model": self.model,
"use_vad": self.use_vad,
"max_clients": self.max_clients,
"max_connection_time": self.max_connection_time,
}
)
)
initial_payload = {
"uid": self.uid,
"language": self.language,
"task": self.task,
"model": self.model,
"use_vad": self.use_vad,
"max_clients": self.max_clients,
"max_connection_time": self.max_connection_time,
"platform": self.platform,
"meeting_url": self.meeting_url,
"token": self.token,
}
ws.send(json.dumps(initial_payload))

def send_packet_to_server(self, message):
"""
@@ -682,6 +691,9 @@ class TranscriptionClient(TranscriptionTeeClient):
max_clients (int, optional): Maximum number of client connections allowed. Default is 4.
max_connection_time (int, optional): Maximum allowed connection time in seconds. Default is 600.
mute_audio_playback (bool, optional): If True, mutes audio playback during file playback. Default is False.
platform (str, optional): Platform identifier sent to the server. Defaults to "test_platform".
meeting_url (str, optional): Meeting URL identifier sent to the server. Defaults to "test_url".
token (str, optional): Token identifier sent to the server. Defaults to "test_token".

Attributes:
client (Client): An instance of the underlying Client class responsible for handling the WebSocket connection.
@@ -708,11 +720,17 @@ def __init__(
max_clients=4,
max_connection_time=600,
mute_audio_playback=False,
platform="test_platform",
meeting_url="test_url",
token="test_token"
):
self.client = Client(
host, port, lang, translate, model, srt_file_path=output_transcription_path,
use_vad=use_vad, log_transcription=log_transcription, max_clients=max_clients,
max_connection_time=max_connection_time
max_connection_time=max_connection_time,
platform=platform,
meeting_url=meeting_url,
token=token
)

if save_output_recording and not output_recording_filename.endswith(".wav"):
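
For reference, a minimal sketch of how the new keyword arguments surface in `TranscriptionClient`. Host, port, and all metadata values are placeholders, and the callable-client usage follows the existing WhisperLive API:

```python
# Hedged usage sketch: stream microphone audio with the new metadata
# fields riding along in the initial handshake. All values are placeholders.
from whisper_live.client import TranscriptionClient

client = TranscriptionClient(
    "localhost", 9090,
    lang="en",
    model="small",
    use_vad=True,
    platform="google_meet",                      # placeholder identifier
    meeting_url="https://meet.example.com/abc",  # placeholder URL
    token="my-session-token",                    # placeholder token
)

client()  # or client("path/to/audio.wav") to transcribe a file
```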