Use large-v3-turbo as default model, simplify tests, fix Dockerfile and docs

Cyril LAY · Cyril LAY · commit 0824fb9085da · 2026-04-23T17:54:51.000+02:00
diff --git a/README.md b/README.md
@@ -33,6 +33,36 @@ export LOGGING_CONFIG=logging-config.yaml
 python app/main.py
 ```
 
+## Deployment
+
+### Build the image
+
+```bash
+docker build -f app/Dockerfile -t whisperx-openai-api .
+```
+
+### Run
+
+```bash
+docker run -d \
+  --gpus all \
+  -p 8000:8000 \
+  -e API_KEY=your-api-key \
+  -e HF_TOKEN=your-hf-token \
+  -v /path/to/models:/data/models \
+  whisperx-openai-api
+```
+
+Models are downloaded on first startup and cached in `/data/models`. Mount a persistent volume to avoid re-downloading on restart.
+
+**1 worker is recommended.** GPU inference is serialized internally: multiple workers each load a full model copy in VRAM, and adding workers doesn't improve throughput unless you have multiple GPUs.
+
+To scale workers (each worker loads its own model in VRAM):
+
+```bash
+docker run -d --gpus all ... -e WORKERS=2 whisperx-openai-api
+```
+
 ## Testing
 
 Tests mock actual inference and can be run locally:
@@ -52,7 +82,7 @@ Check the [documentation to run integration tests](docs/testing_with_gpu.md) on
 | -------- | ----------- | ------- |
 | API_KEY | API key for API access | Required |
 | HF_TOKEN | Hugging Face token (required for diarization) | Required |
-| TRANSCRIBE_MODEL | WhisperX model to load | `large-v2` |
+| TRANSCRIBE_MODEL | WhisperX model to load | `large-v3-turbo` |
 | BATCH_SIZE | Transcription batch size | `16` |
 | DIARIZE_MODEL | Pyannote diarization model | `pyannote/speaker-diarization-community-1` |
 | PRELOADED_ALIGN_MODEL_LANGUAGES | Languages to pre-load alignment models for | `["en", "fr", "nl", "de"]` |
@@ -64,6 +94,6 @@ Check the [documentation to run integration tests](docs/testing_with_gpu.md) on
 | WORKERS | Number of uvicorn workers (each loads its own model in VRAM) | `1` |
 | RELOAD | Enable auto-reload | `false` |
 | ROOT_PATH | API root path | `None` |
-| LOGGING_CONFIG | Path to logging config file | `None` |
+| LOGGING_CONFIG | Path to logging config file | `logging-config.yaml` |
 | DEBUG | Enable debug logging | `false` |
 
diff --git a/app/Dockerfile b/app/Dockerfile
@@ -75,8 +75,4 @@ COPY --chown=whisperuser:whisperuser ./logging-config.yaml /app/logging-config.y
 
 USER whisperuser
 
-CMD gunicorn main:app \
-    --workers ${WORKERS:-1} \
-    --worker-class uvicorn.workers.UvicornWorker \
-    --timeout 120 \
-    --bind 0.0.0.0:${PORT:-8000}
+ENTRYPOINT ["sh", "-c", "gunicorn main:app --workers ${WORKERS:-1} --worker-class uvicorn.workers.UvicornWorker --timeout 120 --bind 0.0.0.0:${PORT:-8000}"]
diff --git a/app/tests/conftest.py b/app/tests/conftest.py
@@ -2,115 +2,58 @@
 import sys
 from unittest.mock import MagicMock, patch
 
-# ---------------------------------------------------------------------------
-# 1. Mock heavy third-party libraries in sys.modules BEFORE any app import.
-#    app modules import whisperx, torch, numpy at the top level.
-# ---------------------------------------------------------------------------
-_mock_np = MagicMock()
+import pytest
+
+# Mock libs before import
 _mock_torch = MagicMock()
 _mock_torch.cuda.is_available.return_value = False
 _mock_torch.float32 = "float32"
 
 _mock_whisperx = MagicMock()
-FAKE_LANGUAGES = {"en": "english", "fr": "french", "cz": "czech"}
-FAKE_ALIGN_MODELS_HF = {"en": "WAV2VEC2_ASR_BASE_960H", "fr": "some-fr-model"}
-FAKE_ALIGN_MODELS_TORCH = {"en": "WAV2VEC2_ASR_BASE_960H"}
-
-_mock_whisperx.utils.LANGUAGES = FAKE_LANGUAGES
-_mock_whisperx.alignment.DEFAULT_ALIGN_MODELS_HF = FAKE_ALIGN_MODELS_HF
-_mock_whisperx.alignment.DEFAULT_ALIGN_MODELS_TORCH = FAKE_ALIGN_MODELS_TORCH
+_mock_whisperx.utils.LANGUAGES = {"en": "english", "fr": "french", "cz": "czech"}
+_mock_whisperx.alignment.DEFAULT_ALIGN_MODELS_HF = {"en": "...", "fr": "..."}
+_mock_whisperx.alignment.DEFAULT_ALIGN_MODELS_TORCH = {}
 
-for mod_name, mock_obj in {
-    "numpy": _mock_np,
-    "np": _mock_np,
+for name, mock in {
+    "numpy": MagicMock(),
     "torch": _mock_torch,
     "whisperx": _mock_whisperx,
     "whisperx.utils": _mock_whisperx.utils,
     "whisperx.alignment": _mock_whisperx.alignment,
     "whisperx.asr": _mock_whisperx.asr,
     "whisperx.diarize": _mock_whisperx.diarize,
 }.items():
-    sys.modules.setdefault(mod_name, mock_obj)
+    sys.modules.setdefault(name, mock)
 
-# ---------------------------------------------------------------------------
-# 2. Set required env vars BEFORE any app module is imported.
-#    config.py and security.py evaluate settings at module level.
-# ---------------------------------------------------------------------------
-TEST_API_KEY = "test-api-key"
-TEST_HF_TOKEN = "test-hf-token"
+os.environ.setdefault("API_KEY", "test-key")
+os.environ.setdefault("HF_TOKEN", "test-token")
 
-os.environ.setdefault("API_KEY", TEST_API_KEY)
-os.environ.setdefault("HF_TOKEN", TEST_HF_TOKEN)
+# App imports after mocking
 
-# ---------------------------------------------------------------------------
-# 3. Now it is safe to import app modules.
-# ---------------------------------------------------------------------------
 from fastapi import FastAPI  # noqa: E402
 from fastapi.testclient import TestClient  # noqa: E402
-import pytest  # noqa: E402
 
 from endpoints import audio  # noqa: E402
 from utils.config import Settings, get_settings  # noqa: E402
 from utils.security import check_api_key  # noqa: E402
 
-_FAKE_WORDS = [
-    {"word": "Hello", "start": 0.0, "end": 0.7, "score": 0.95},
-    {"word": "world.", "start": 0.8, "end": 1.5, "score": 0.92},
-]
-
-MOCK_TRANSCRIPTION_RESULT = {
-    "segments": [
-        {"start": 0.0, "end": 1.5, "text": "Hello world.", "words": _FAKE_WORDS}
-    ],
-    "word_segments": _FAKE_WORDS,
+FAKE_TRANSCRIPTION = {
+    "segments": [{"start": 0.0, "end": 1.5, "text": "Hello world.", "speaker": "SPEAKER_00"}],
 }
 
-FAKE_AUDIO = MagicMock(name="fake_audio_array")
-
-
-def _test_settings() -> Settings:
-    return Settings(
-        api_key=TEST_API_KEY,
-        hf_token=TEST_HF_TOKEN,
-        transcribe_model="large-v2",
-        batch_size=4,
-    )
-
 
 @pytest.fixture()
 def client():
-    """TestClient with dependency overrides and lifespan disabled."""
     app = FastAPI()
     app.include_router(audio.router, prefix="/v1")
+    app.dependency_overrides[get_settings] = lambda: Settings(
+        api_key="test-key", hf_token="test-token", transcribe_model="large-v3-turbo"
+    )
+    app.dependency_overrides[check_api_key] = lambda: "test-key"
 
-    app.dependency_overrides[get_settings] = _test_settings
-    app.dependency_overrides[check_api_key] = lambda: TEST_API_KEY
-
-    with TestClient(app) as c:
+    with (
+        patch.object(audio.whisperx, "load_audio", return_value=MagicMock()),
+        patch.object(audio, "transcribe", return_value=FAKE_TRANSCRIPTION),
+        TestClient(app) as c,
+    ):
         yield c
-
-
-@pytest.fixture()
-def mock_whisperx():
-    """Patch whisperx symbols used directly in the endpoint module."""
-    with patch.object(
-        audio.whisperx, "load_audio", return_value=FAKE_AUDIO
-    ) as load_audio:
-        yield {"load_audio": load_audio, "fake_audio": FAKE_AUDIO}
-
-
-@pytest.fixture()
-def mock_transcribe():
-    """Patch the transcribe service function as imported in the endpoint module."""
-    with patch.object(
-        audio,
-        "transcribe",
-        return_value=MOCK_TRANSCRIPTION_RESULT,
-    ) as mock:
-        yield mock
-
-
-@pytest.fixture()
-def sample_audio_bytes() -> bytes:
-    """Minimal bytes to simulate an uploaded audio file."""
-    return b"\x00" * 1024
diff --git a/app/tests/test_audio_transcriptions.py b/app/tests/test_audio_transcriptions.py
@@ -1,153 +1,43 @@
-"""Tests for the POST /v1/audio/transcriptions endpoint."""
-
-import os
-
-from utils.config import Settings
+"""Tests for POST /v1/audio/transcriptions."""
 
 ENDPOINT = "/v1/audio/transcriptions"
+AUDIO = b"\x00" * 512
 
 
-def _post_to_transcribe_endpoint(client, audio_bytes, **form_fields):
-    """Helper: POST a file upload to the transcription endpoint."""
-    return client.post(
-        ENDPOINT,
-        files={"file": ("test.wav", audio_bytes, "audio/wav")},
-        data=form_fields,
-    )
-
-
-# Test Success
-
-
-class TestTranscribeSuccess:
-    def test_default_params(
-        self, client, mock_whisperx, mock_transcribe, sample_audio_bytes
-    ):
-        """Successful transcription with no explicit model or language."""
-        response = _post_to_transcribe_endpoint(client, sample_audio_bytes)
-
-        assert response.status_code == 200
-        body = response.json()
-        assert "segments" in body
-        assert body["segments"][0]["words"][0]["word"] == "Hello"
-
-    def test_with_language(
-        self, client, mock_whisperx, mock_transcribe, sample_audio_bytes
-    ):
-        """Explicit language is forwarded to the transcribe service."""
-        response = _post_to_transcribe_endpoint(
-            client, sample_audio_bytes, language="en"
-        )
-
-        assert response.status_code == 200
-        mock_transcribe.assert_called_once()
-        assert mock_transcribe.call_args.args[2] == "en"
-
-    def test_with_matching_model(
-        self, client, mock_whisperx, mock_transcribe, sample_audio_bytes
-    ):
-        """Explicit model that matches the configured model succeeds."""
-        response = _post_to_transcribe_endpoint(
-            client, sample_audio_bytes, model="large-v2"
-        )
-
-        assert response.status_code == 200
-
-
-# Test Validation Errors
-
-
-class TestTranscribeValidation:
-    def test_unsupported_transcribe_language(
-        self, client, mock_whisperx, mock_transcribe, sample_audio_bytes
-    ):
-        """Language not in whisperx.utils.LANGUAGES returns 400."""
-        response = _post_to_transcribe_endpoint(
-            client, sample_audio_bytes, language="xx"
-        )
-
-        assert response.status_code == 400
-        assert "Unsupported language" in response.json()["detail"]
-        assert "for transcription" in response.json()["detail"]
-
-    def test_unsupported_align_language(
-        self, client, mock_whisperx, mock_transcribe, sample_audio_bytes
-    ):
-        """Language in LANGUAGES but missing from alignment dicts returns 400."""
-
-        # NB: "cz" is supported for transcribe but not align
-        response = _post_to_transcribe_endpoint(
-            client, sample_audio_bytes, language="cz"
-        )
-
-        assert response.status_code == 400
-        assert "Unsupported language" in response.json()["detail"]
-        assert "for alignment" in response.json()["detail"]
-
-    def test_wrong_model(
-        self, client, mock_whisperx, mock_transcribe, sample_audio_bytes
-    ):
-        """Model that differs from configured model returns 404."""
-        response = _post_to_transcribe_endpoint(
-            client, sample_audio_bytes, model="tiny"
-        )
+def post(client, model=None, language=None, response_format=None):
+    data = {}
+    if model:
+        data["model"] = model
+    if language:
+        data["language"] = language
+    if response_format:
+        data["response_format"] = response_format
+    return client.post(ENDPOINT, files={"file": ("test.wav", AUDIO, "audio/wav")}, data=data)
 
-        assert response.status_code == 404
-        assert "Model not found" in response.json()["detail"]
 
-    def test_missing_file(self, client, mock_whisperx, mock_transcribe):
-        """Request without a file upload returns 422."""
-        response = client.post(ENDPOINT)
+def test_transcription(client):
+    body = post(client).json()
+    assert body["text"] == "Hello world."
+    assert body["segments"] is None
 
-        assert response.status_code == 422
 
+def test_diarized(client):
+    body = post(client, response_format="diarized_json").json()
+    assert body["segments"][0]["text"] == "Hello world."
+    assert body["segments"][0]["speaker"] == "SPEAKER_00"
 
-# Behavior tests
 
+def test_wrong_model_returns_404(client):
+    assert post(client, model="tiny").status_code == 404
 
-class TestTranscribeBehaviour:
-    def test_load_audio_called_with_temp_path(
-        self, client, mock_whisperx, mock_transcribe, sample_audio_bytes
-    ):
-        """whisperx.load_audio is called with a temp file path that is cleaned up."""
-        response = _post_to_transcribe_endpoint(client, sample_audio_bytes)
 
-        assert response.status_code == 200
-        mock_whisperx["load_audio"].assert_called_once()
-        temp_path = mock_whisperx["load_audio"].call_args.args[0]
-        assert isinstance(temp_path, str)
-        # Temp file should have been deleted by the endpoint
-        assert not os.path.exists(temp_path)
+def test_english(client):
+    assert post(client, language="en").status_code == 200
 
-    def test_temp_file_extension_preserved(
-        self, client, mock_whisperx, mock_transcribe
-    ):
-        """Temp file preserves the original upload extension."""
-        response = client.post(
-            ENDPOINT,
-            files={"file": ("interview.ogg", b"\x00" * 512, "audio/mpeg")},
-        )
 
-        assert response.status_code == 200
-        temp_path = mock_whisperx["load_audio"].call_args.args[0]
-        assert temp_path.endswith(".ogg")
+def test_french(client):
+    assert post(client, language="fr").status_code == 200
 
-    def test_transcribe_called_with_correct_args(
-        self, client, mock_whisperx, mock_transcribe, sample_audio_bytes
-    ):
-        """The transcribe service receives (audio_array, settings, language)."""
-        response = _post_to_transcribe_endpoint(
-            client, sample_audio_bytes, language="fr"
-        )
 
-        assert response.status_code == 200
-        mock_transcribe.assert_called_once()
-        args = mock_transcribe.call_args.args
-        # 0: Numpy audio array returned by load_audio
-        assert args[0] is mock_whisperx["fake_audio"]
-        # 1: Settings instance with expected values
-        assert isinstance(args[1], Settings)
-        assert args[1].transcribe_model == "large-v2"
-        assert args[1].batch_size == 4
-        # 2: language
-        assert args[2] == "fr"
+def test_missing_file_returns_422(client):
+    assert client.post(ENDPOINT).status_code == 422
diff --git a/app/tests_with_gpu/test_transcribe.py b/app/tests_with_gpu/test_transcribe.py
@@ -37,9 +37,11 @@ def test_real_audio_output(self, integration_client, sample_ogg, expected_output
         response = integration_client.post(
             ENDPOINT,
             files={"file": ("sample_en_1.ogg", sample_ogg, "audio/ogg")},
+            data={"response_format": "diarized_json"},
         )
 
         assert response.status_code == 200
         body = response.json()
 
+        assert body["text"] == expected_output["text"]
         assert body["segments"] == expected_output["segments"]
diff --git a/app/utils/config.py b/app/utils/config.py
diff --git a/app/utils/lifespan.py b/app/utils/lifespan.py