Skip to content

Commit a00cba6

Browse files
authored
Refactor STT validation boundaries (#153)
This PR is:
- To validate `SpeechToTextConfig` inputs at construction time
- To normalize and validate Whisper decode options up front
- To fail fast on invalid STT model loading inputs
- To add focused tests for the new validation paths

Notes:
- This keeps the change narrow and behavioral: no broad file moves, no larger refactor mixed in
- I also removed a couple of defensive fallbacks so the code stays strict about real model contracts

Validation:
- `pytest tests/test_stt.py tests/test_transcribe.py tests/test_qwen3_asr.py tests/test_v1_stt_integration.py -q`
- Result: `135 passed, 3 skipped`

Skipped tests:
- `tests/test_qwen3_asr.py::TestModelLoad::test_load_model`
- `tests/test_qwen3_asr.py::TestModelLoad::test_encode_dummy_mel`
- `tests/test_qwen3_asr.py::TestModelLoad::test_greedy_decode`
- Reason: `QWEN3_ASR_MODEL_PATH not set`

Next:
- Continue the STT cleanup in small slices, likely around transcriber/loader boundaries.

---------

Signed-off-by: Yuan Lik Xun <lxyuan0420@gmail.com>
1 parent fac064f commit a00cba6

4 files changed

Lines changed: 149 additions & 8 deletions

File tree

tests/test_stt.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,6 +47,26 @@ def test_custom_values(self) -> None:
4747
assert cfg.overlap_chunk_second == 0.5
4848
assert cfg.min_energy_split_window_size == 800
4949

50+
@pytest.mark.parametrize(
51+
("kwargs", "message"),
52+
[
53+
({"max_audio_clip_s": 0.0}, "max_audio_clip_s"),
54+
({"overlap_chunk_second": -0.1}, "overlap_chunk_second"),
55+
(
56+
{"max_audio_clip_s": 10.0, "overlap_chunk_second": 10.0},
57+
"overlap_chunk_second",
58+
),
59+
({"min_energy_split_window_size": 0}, "min_energy_split_window_size"),
60+
],
61+
)
62+
def test_invalid_values_raise(
63+
self,
64+
kwargs: dict[str, float | int],
65+
message: str,
66+
) -> None:
67+
with pytest.raises(ValueError, match=message):
68+
SpeechToTextConfig(**kwargs)
69+
5070

5171
# ===========================================================================
5272
# validate_language

tests/test_transcribe.py

Lines changed: 63 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -3,14 +3,18 @@
33

44
from __future__ import annotations
55

6+
import json
67
from pathlib import Path
8+
from types import SimpleNamespace
79

10+
import mlx.core as mx
811
import pytest
912

1013
from vllm_metal.stt.transcribe import (
1114
_MAX_PROMPT_TOKENS,
1215
TranscriptionResult,
1316
WhisperTranscriber,
17+
load_model,
1418
)
1519

1620
# ===========================================================================
@@ -166,17 +170,11 @@ class TestLoadModel:
166170

167171
def test_missing_config_json(self, tmp_path: Path) -> None:
168172
"""Should raise FileNotFoundError when config.json is absent."""
169-
from vllm_metal.stt.transcribe import load_model
170-
171173
with pytest.raises(FileNotFoundError, match="config.json not found"):
172174
load_model(tmp_path)
173175

174176
def test_missing_weight_files(self, tmp_path: Path) -> None:
175177
"""Should raise FileNotFoundError when no weight files exist."""
176-
import json
177-
178-
from vllm_metal.stt.transcribe import load_model
179-
180178
config = {
181179
"n_mels": 80,
182180
"n_audio_ctx": 10,
@@ -194,6 +192,65 @@ def test_missing_weight_files(self, tmp_path: Path) -> None:
194192
with pytest.raises(FileNotFoundError, match="No weight files"):
195193
load_model(tmp_path)
196194

195+
def test_empty_model_path_raises(self) -> None:
196+
"""Whitespace-only model paths should fail fast."""
197+
with pytest.raises(ValueError, match="model_path"):
198+
load_model(" ")
199+
200+
def test_invalid_dtype_raises(self, tmp_path: Path) -> None:
201+
"""Non-floating dtypes are rejected before any file I/O."""
202+
with pytest.raises(TypeError, match="Unsupported STT model dtype"):
203+
load_model(tmp_path, dtype=mx.int32)
204+
205+
def test_unknown_model_type_raises(self, tmp_path: Path) -> None:
206+
"""Unknown model_type values should not fall through to Whisper."""
207+
(tmp_path / "config.json").write_text(json.dumps({"model_type": "mystery_stt"}))
208+
209+
with pytest.raises(ValueError, match="Unsupported STT model_type"):
210+
load_model(tmp_path)
211+
212+
213+
class TestResolveDecodeOptions:
214+
"""Tests for WhisperTranscriber task/language validation."""
215+
216+
def test_multilingual_model_normalizes_inputs(self) -> None:
217+
transcriber = WhisperTranscriber(
218+
model=SimpleNamespace(is_multilingual=True),
219+
model_path=None,
220+
)
221+
222+
language, task = transcriber._resolve_decode_options(" EN ", "Transcribe")
223+
224+
assert language == "en"
225+
assert task == "transcribe"
226+
227+
def test_invalid_task_raises(self) -> None:
228+
transcriber = WhisperTranscriber(
229+
model=SimpleNamespace(is_multilingual=True),
230+
model_path=None,
231+
)
232+
233+
with pytest.raises(ValueError, match="Unsupported STT task"):
234+
transcriber._resolve_decode_options("en", "summarize")
235+
236+
def test_english_only_model_rejects_translation(self) -> None:
237+
transcriber = WhisperTranscriber(
238+
model=SimpleNamespace(is_multilingual=False),
239+
model_path=None,
240+
)
241+
242+
with pytest.raises(ValueError, match="do not support translation"):
243+
transcriber._resolve_decode_options(None, "translate")
244+
245+
def test_english_only_model_rejects_non_english_language(self) -> None:
246+
transcriber = WhisperTranscriber(
247+
model=SimpleNamespace(is_multilingual=False),
248+
model_path=None,
249+
)
250+
251+
with pytest.raises(ValueError, match="only support English transcription"):
252+
transcriber._resolve_decode_options("fr", "transcribe")
253+
197254

198255
# ===========================================================================
199256
# Greedy decode and encode chunk (require tiny model)

vllm_metal/stt/config.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,17 @@ class SpeechToTextConfig:
7474
# Deprecated: Whisper requires 16kHz; this field is ignored.
7575
sample_rate: int = 16000
7676

77+
def __post_init__(self) -> None:
78+
"""Validate runtime chunking parameters."""
79+
if self.max_audio_clip_s <= 0:
80+
raise ValueError("max_audio_clip_s must be > 0")
81+
if self.overlap_chunk_second < 0:
82+
raise ValueError("overlap_chunk_second must be >= 0")
83+
if self.overlap_chunk_second >= self.max_audio_clip_s:
84+
raise ValueError("overlap_chunk_second must be < max_audio_clip_s")
85+
if self.min_energy_split_window_size <= 0:
86+
raise ValueError("min_energy_split_window_size must be > 0")
87+
7788

7889
def is_stt_model(model_path: str) -> bool:
7990
"""Return ``True`` if *model_path* points to a Speech-to-Text model.

vllm_metal/stt/transcribe.py

Lines changed: 55 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -31,6 +31,7 @@
3131
QWEN3_ASR_MAX_DECODE_TOKENS,
3232
WHISPER_MAX_DECODE_TOKENS,
3333
SpeechToTextConfig,
34+
validate_language,
3435
)
3536
from vllm_metal.stt.protocol import TranscriptionSegment
3637
from vllm_metal.stt.whisper import WhisperConfig, WhisperModel
@@ -57,6 +58,12 @@
5758
# Regex to detect Whisper timestamp tokens like ``<|0.00|>``.
5859
_TIMESTAMP_RE = re.compile(r"<\|(\d+\.\d+)\|>")
5960

61+
# Supported tasks for Whisper transcription requests.
62+
_WHISPER_TASKS = frozenset({"transcribe", "translate"})
63+
64+
# Supported floating-point dtypes for STT model loading.
65+
_SUPPORTED_LOAD_DTYPES = frozenset({mx.float16, mx.float32, mx.bfloat16})
66+
6067

6168
# ===========================================================================
6269
# Data types
@@ -135,6 +142,8 @@ def transcribe(
135142
Returns:
136143
:class:`TranscriptionResult` with text and optional segments.
137144
"""
145+
language, task = self._resolve_decode_options(language, task)
146+
138147
if isinstance(audio, str):
139148
audio = load_audio(audio, sample_rate=SAMPLE_RATE)
140149
elif isinstance(audio, np.ndarray):
@@ -251,6 +260,31 @@ def _get_token_id(self, token: str) -> int:
251260
"""Resolve a special token string to its integer ID."""
252261
return self.tokenizer.convert_tokens_to_ids(token)
253262

263+
def _resolve_decode_options(
264+
self,
265+
language: str | None,
266+
task: str,
267+
) -> tuple[str | None, str]:
268+
"""Validate and normalize task/language options for Whisper."""
269+
task = task.strip().lower()
270+
if task not in _WHISPER_TASKS:
271+
supported = ", ".join(sorted(_WHISPER_TASKS))
272+
raise ValueError(
273+
f"Unsupported STT task: {task!r}. Must be one of {supported}."
274+
)
275+
276+
if self.model.is_multilingual:
277+
return validate_language(language, default=None), task
278+
279+
resolved_language = validate_language(language, default=None)
280+
if task == "translate":
281+
raise ValueError("English-only Whisper models do not support translation.")
282+
if resolved_language not in (None, "en"):
283+
raise ValueError(
284+
"English-only Whisper models only support English transcription."
285+
)
286+
return resolved_language, task
287+
254288
def _encode_prompt(self, prompt: str | None) -> list[int]:
255289
"""Encode a user prompt into ``<|startofprev|>`` prefix tokens.
256290
@@ -580,6 +614,15 @@ def _resolve_model_path(model_path: str | Path) -> Path:
580614
return model_path
581615

582616

617+
def _validate_load_dtype(dtype: mx.Dtype) -> None:
618+
"""Validate the floating-point dtype used for model loading."""
619+
if dtype not in _SUPPORTED_LOAD_DTYPES:
620+
names = ", ".join(sorted(str(d) for d in _SUPPORTED_LOAD_DTYPES))
621+
raise TypeError(
622+
f"Unsupported STT model dtype: {dtype!r}. Must be one of {names}."
623+
)
624+
625+
583626
def load_model(model_path: str | Path, dtype: mx.Dtype = mx.float16):
584627
"""Load an STT model from a local directory or HuggingFace repo.
585628
@@ -597,14 +640,24 @@ def load_model(model_path: str | Path, dtype: mx.Dtype = mx.float16):
597640
ValueError: If the model type is unsupported or download fails.
598641
FileNotFoundError: If config.json or weight files are missing.
599642
"""
643+
if isinstance(model_path, str) and not model_path.strip():
644+
raise ValueError(
645+
"model_path must be a non-empty local path or HuggingFace repo ID."
646+
)
647+
_validate_load_dtype(dtype)
600648
model_path = _resolve_model_path(model_path)
601649
config_dict = _read_config(model_path)
602650
model_type = config_dict.get("model_type", "").lower()
603651

604652
if model_type == "qwen3_asr":
605653
return _load_qwen3_asr_model(model_path, config_dict, dtype)
606-
# Default to Whisper for backward compatibility
607-
return _load_whisper_model(model_path, config_dict, dtype)
654+
if model_type in ("", "whisper"):
655+
# Default to Whisper for backward compatibility
656+
return _load_whisper_model(model_path, config_dict, dtype)
657+
raise ValueError(
658+
f"Unsupported STT model_type: {model_type!r}. "
659+
"Expected 'whisper' or 'qwen3_asr'."
660+
)
608661

609662

610663
def _load_and_init_model(model, model_path: Path, config_dict: dict):

0 commit comments

Comments (0)