128 changes: 128 additions & 0 deletions .gitignore
@@ -0,0 +1,128 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# IDE specific files
.idea/
.vscode/
*.swp
*.swo

# Project specific
model_weights/
*.pt
*.pth

# Docker
.docker/
docker-volumes/
models/
51 changes: 51 additions & 0 deletions Dockerfile.project
@@ -0,0 +1,51 @@
FROM nvidia/cuda:12.3.2-cudnn9-runtime-ubuntu22.04
WORKDIR /root

# Set DEBIAN_FRONTEND to noninteractive to avoid prompts during apt-get install
ENV DEBIAN_FRONTEND=noninteractive

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
python3 \
python3-pip \
libsndfile1 \
ffmpeg \
wget \
&& rm -rf /var/lib/apt/lists/*

# Set environment variables
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Install PyTorch wheels built against CUDA 11.8; the pip wheels bundle their
# own CUDA runtime, so they also run on this CUDA 12.3 base image
RUN python3 -m pip install --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118

# Set working directory
WORKDIR /app

# Copy only the requirements file first
COPY services/WhisperLive/requirements/server.txt /tmp/requirements.txt

# Remove openai-whisper and onnxruntime lines from the copied requirements file
RUN sed -i '/openai-whisper/d' /tmp/requirements.txt || true \
&& sed -i '/onnxruntime==/d' /tmp/requirements.txt || true

# Install remaining Python dependencies from the modified requirements file
RUN python3 -m pip install --no-cache-dir -r /tmp/requirements.txt

# Now copy the application code
COPY services/WhisperLive/ /app/

# Copy our entrypoint script
COPY services/WhisperLive/entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

# Set it as the entrypoint
ENTRYPOINT ["/entrypoint.sh"]

# Default command to run the server with faster_whisper backend
CMD ["--port", "9090", "--backend", "faster_whisper"]
5 changes: 5 additions & 0 deletions README.md
@@ -182,3 +182,8 @@ We are available to help you with both Open Source and proprietary AI projects.
howpublished = {\url{https://github.com/snakers4/silero-vad}},
email = {[email protected]}
}



(whisperlive) dima@bbb:~/test1/vexa/vexa/WhisperLive_fresh$ export LD_LIBRARY_PATH=$CONDA_PREFIX/lib/python3.10/site-packages/nvidia/cudnn/lib:$LD_LIBRARY_PATH
(whisperlive) dima@bbb:~/test1/vexa/vexa/WhisperLive_fresh$ python3 run_server.py --port 9090 --backend faster_whisper -m
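
The export above points ctranslate2 at the cuDNN libraries bundled with the pip-installed NVIDIA wheels inside the conda env. A hedged Python one-liner to compute the same paths, assuming the `nvidia-cublas-cu*` and `nvidia-cudnn-cu*` wheels are installed (the trick faster-whisper's own docs use):

```python
# Print the lib dirs of the pip-installed NVIDIA cuBLAS/cuDNN wheels so
# they can be appended to LD_LIBRARY_PATH before starting the server.
import os
import nvidia.cublas.lib
import nvidia.cudnn.lib

print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" +
      os.path.dirname(nvidia.cudnn.lib.__file__))
```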
13 changes: 13 additions & 0 deletions TensorRT_whisper.md
@@ -18,11 +18,18 @@ docker run -p 9090:9090 --runtime=nvidia --gpus all --entrypoint /bin/bash -it g
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en # float16
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples medium

# convert small multilingual model
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small
```

We have committed a Docker image with the medium model already built; reuse it:
```
REPOSITORY TAG IMAGE ID CREATED SIZE
whisperlive-trt-medium-ready latest 8596e0157dbf 2 seconds ago 19.1GB
```

## Run WhisperLive Server with TensorRT Backend
```bash
# Run English only model
@@ -36,3 +43,9 @@ python3 run_server.py --port 9090 \
--trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_float16" \
--trt_multilingual
```


```bash
# Run medium multilingual model
python3 run_server.py --port 9090 \
                      --backend tensorrt \
                      --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_medium_float16" \
                      --trt_multilingual
```
3 changes: 3 additions & 0 deletions entrypoint.sh
@@ -0,0 +1,3 @@
#!/bin/sh
# Execute the WhisperLive server script with any provided arguments
exec python3 run_server.py "$@"
1 change: 1 addition & 0 deletions requirements/server.txt
@@ -1,5 +1,6 @@
faster-whisper==1.1.0
websockets
websocket-client
onnxruntime==1.17.0
numba
kaldialign
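
The added websocket-client dependency is what `whisper_live/client.py` uses for its WebSocket connection. A minimal sketch of the initial JSON handshake the updated client sends on connect, assuming a server on localhost:9090; all field values are placeholders mirroring the defaults in the client diff below:

```python
# Hedged sketch of the initial handshake the updated client sends,
# using the newly added websocket-client dependency.
import json
import uuid

import websocket  # from the websocket-client package

ws = websocket.create_connection("ws://localhost:9090")
ws.send(json.dumps({
    "uid": str(uuid.uuid4()),
    "language": "en",
    "task": "transcribe",
    "model": "small",
    "use_vad": True,
    "max_clients": 4,
    "max_connection_time": 600,
    "platform": "test_platform",
    "meeting_url": "test_url",
    "token": "test_token",
}))
print(ws.recv())  # first server status message
ws.close()
```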
48 changes: 33 additions & 15 deletions whisper_live/client.py
@@ -33,6 +33,9 @@ def __init__(
log_transcription=True,
max_clients=4,
max_connection_time=600,
platform="test_platform",
meeting_url="test_url",
token="test_token"
):
"""
Initializes a Client instance for audio recording and streaming to a server.
@@ -52,6 +55,9 @@
log_transcription (bool, optional): Whether to log transcription output to the console. Default is True.
max_clients (int, optional): Maximum number of client connections allowed. Default is 4.
max_connection_time (int, optional): Maximum allowed connection time in seconds. Default is 600.
platform (str, optional): Platform identifier sent to the server. Defaults to "test_platform".
meeting_url (str, optional): Meeting URL identifier sent to the server. Defaults to "test_url".
token (str, optional): Token identifier sent to the server. Defaults to "test_token".
"""
self.recording = False
self.task = "transcribe"
@@ -69,6 +75,9 @@
self.log_transcription = log_transcription
self.max_clients = max_clients
self.max_connection_time = max_connection_time
self.platform = platform
self.meeting_url = meeting_url
self.token = token

if translate:
self.task = "translate"
@@ -195,26 +204,26 @@ def on_open(self, ws):
Callback function called when the WebSocket connection is successfully opened.

Sends an initial configuration message to the server, including client UID,
language selection, and task type.
language selection, task type, and the platform, meeting_url, and token identifiers.

Args:
ws (websocket.WebSocketApp): The WebSocket client instance.

"""
print("[INFO]: Opened connection")
ws.send(
json.dumps(
{
"uid": self.uid,
"language": self.language,
"task": self.task,
"model": self.model,
"use_vad": self.use_vad,
"max_clients": self.max_clients,
"max_connection_time": self.max_connection_time,
}
)
)
initial_payload = {
"uid": self.uid,
"language": self.language,
"task": self.task,
"model": self.model,
"use_vad": self.use_vad,
"max_clients": self.max_clients,
"max_connection_time": self.max_connection_time,
"platform": self.platform,
"meeting_url": self.meeting_url,
"token": self.token,
}
ws.send(json.dumps(initial_payload))

def send_packet_to_server(self, message):
"""
@@ -682,6 +691,9 @@ class TranscriptionClient(TranscriptionTeeClient):
max_clients (int, optional): Maximum number of client connections allowed. Default is 4.
max_connection_time (int, optional): Maximum allowed connection time in seconds. Default is 600.
mute_audio_playback (bool, optional): If True, mutes audio playback during file playback. Default is False.
platform (str, optional): Platform identifier sent to the server. Defaults to "test_platform".
meeting_url (str, optional): Meeting URL identifier sent to the server. Defaults to "test_url".
token (str, optional): Token identifier sent to the server. Defaults to "test_token".

Attributes:
client (Client): An instance of the underlying Client class responsible for handling the WebSocket connection.
@@ -708,11 +720,17 @@ def __init__(
max_clients=4,
max_connection_time=600,
mute_audio_playback=False,
platform="test_platform",
meeting_url="test_url",
token="test_token"
):
self.client = Client(
host, port, lang, translate, model, srt_file_path=output_transcription_path,
use_vad=use_vad, log_transcription=log_transcription, max_clients=max_clients,
max_connection_time=max_connection_time
max_connection_time=max_connection_time,
platform=platform,
meeting_url=meeting_url,
token=token
)

if save_output_recording and not output_recording_filename.endswith(".wav"):
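
For reference, a minimal sketch of how the new keyword arguments surface in `TranscriptionClient`. Host, port, and all metadata values are placeholders, and the callable-client usage follows the existing WhisperLive API:

```python
# Hedged usage sketch: stream microphone audio with the new metadata
# fields riding along in the initial handshake. All values are placeholders.
from whisper_live.client import TranscriptionClient

client = TranscriptionClient(
    "localhost", 9090,
    lang="en",
    model="small",
    use_vad=True,
    platform="google_meet",                      # placeholder identifier
    meeting_url="https://meet.example.com/abc",  # placeholder URL
    token="my-session-token",                    # placeholder token
)

client()  # or client("path/to/audio.wav") to transcribe a file
```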