Refactor STT model loading (#168)

LxYuan0420 · web-flow · commit 95ad433c4014 · 2026-03-17T03:30:16.000-05:00
This PR:
- Moves STT model loading out of `vllm_metal/stt/transcribe.py` into
`vllm_metal/stt/loader.py`.
- Centralizes the `model_type -> constructor` mapping in
`vllm_metal/stt/registry.py`.
- Keeps `vllm_metal/stt/transcribe.py` orchestration-only while
preserving the public `load_model()` API.

Next:
- Extract STT runtime glue out of `vllm_metal/v1/model_runner.py` into
`vllm_metal/stt/runtime.py`.
- Move model-specific request/mm payload interpretation into model-owned
adapters under `stt/<model>/`.

---------

Signed-off-by: Yuan Lik Xun <lxyuan0420@gmail.com>
diff --git a/vllm_metal/stt/loader.py b/vllm_metal/stt/loader.py
@@ -0,0 +1,104 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Speech-to-Text model loading."""
+
+from __future__ import annotations
+
+import json
+import logging
+from pathlib import Path
+
+import mlx.core as mx
+import mlx.nn as nn
+
+from vllm_metal.stt.registry import get_stt_model_constructor
+
+logger = logging.getLogger(__name__)
+
+# Supported floating-point dtypes for STT model loading.
+_SUPPORTED_LOAD_DTYPES = frozenset({mx.float16, mx.float32, mx.bfloat16})
+
+
+def load_model(model_path: str | Path, dtype: mx.Dtype = mx.float16):
+    """Load an STT model from a local directory or HuggingFace repo."""
+    if isinstance(model_path, str) and not model_path.strip():
+        raise ValueError(
+            "model_path must be a non-empty local path or HuggingFace repo ID."
+        )
+    _validate_load_dtype(dtype)
+
+    resolved_model_path = _resolve_model_path(model_path)
+    config_dict = _read_config(resolved_model_path)
+    model_type = config_dict.get("model_type", "").lower()
+
+    model_constructor = get_stt_model_constructor(model_type)
+    model = model_constructor(config_dict, dtype)
+    return _load_and_init_model(model, resolved_model_path, config_dict)
+
+
+def _validate_load_dtype(dtype: mx.Dtype) -> None:
+    """Validate the floating-point dtype used for model loading."""
+    if dtype not in _SUPPORTED_LOAD_DTYPES:
+        names = ", ".join(sorted(str(d) for d in _SUPPORTED_LOAD_DTYPES))
+        raise TypeError(
+            f"Unsupported STT model dtype: {dtype!r}. Must be one of {names}."
+        )
+
+
+def _read_config(model_path: Path) -> dict:
+    """Read and return config.json from a model directory."""
+    config_path = model_path / "config.json"
+    if not config_path.exists():
+        raise FileNotFoundError(f"config.json not found in {model_path}")
+    with open(config_path) as f:
+        return json.load(f)
+
+
+def _load_weights(model_path: Path) -> dict[str, mx.array]:
+    """Load model weights from safetensors or npz files."""
+    weight_files = sorted(model_path.glob("*.safetensors"))
+    if not weight_files:
+        weight_files = sorted(model_path.glob("*.npz"))
+    if not weight_files:
+        raise FileNotFoundError(f"No weight files in {model_path}")
+
+    weights: dict[str, mx.array] = {}
+    for wf in weight_files:
+        weights.update(mx.load(str(wf)))
+    return weights
+
+
+def _resolve_model_path(model_path: str | Path) -> Path:
+    """Resolve model path, downloading from HF if needed."""
+    model_path = Path(model_path)
+    if model_path.exists():
+        return model_path
+
+    try:
+        from huggingface_hub import snapshot_download
+    except ImportError as e:  # pragma: no cover
+        raise ValueError(
+            f"Could not download model {model_path}: huggingface_hub is not installed"
+        ) from e
+
+    try:
+        return Path(snapshot_download(repo_id=str(model_path)))
+    except OSError as e:
+        raise ValueError(f"Could not download model: {model_path}") from e
+
+
+def _load_and_init_model(model, model_path: Path, config_dict: dict):
+    """Shared loader: quantize, sanitize, load weights, and eval."""
+    weights = _load_weights(model_path)
+
+    quantization = config_dict.get("quantization")
+    if quantization is not None:
+
+        def class_predicate(p, m):
+            return isinstance(m, (nn.Linear, nn.Embedding)) and f"{p}.scales" in weights
+
+        nn.quantize(model, **quantization, class_predicate=class_predicate)
+
+    weights = model.sanitize(weights)
+    model.load_weights(list(weights.items()), strict=False)
+    mx.eval(model.parameters())
+    return model
diff --git a/vllm_metal/stt/registry.py b/vllm_metal/stt/registry.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+"""Speech-to-Text model constructor registry."""
+
+from __future__ import annotations
+
+from collections.abc import Callable
+
+import mlx.core as mx
+
+from vllm_metal.stt.qwen3_asr.config import Qwen3ASRConfig
+from vllm_metal.stt.qwen3_asr.model import Qwen3ASRModel
+from vllm_metal.stt.whisper.config import WhisperConfig
+from vllm_metal.stt.whisper.model import WhisperModel
+
+STTModel = WhisperModel | Qwen3ASRModel
+STTModelConstructor = Callable[[dict, mx.Dtype], STTModel]
+
+
+def get_stt_model_constructor(model_type: str) -> STTModelConstructor:
+    """Return the model constructor for an STT ``model_type``."""
+    model_type = model_type.lower()
+    try:
+        return _STT_MODEL_CONSTRUCTORS[model_type]
+    except KeyError:
+        raise ValueError(
+            f"Unsupported STT model_type: {model_type!r}. "
+            "Expected 'whisper' or 'qwen3_asr'."
+        ) from None
+
+
+def _construct_whisper_model(config_dict: dict, dtype: mx.Dtype) -> WhisperModel:
+    config = WhisperConfig.from_dict(config_dict)
+    return WhisperModel(config, dtype)
+
+
+def _construct_qwen3_asr_model(config_dict: dict, dtype: mx.Dtype) -> Qwen3ASRModel:
+    config = Qwen3ASRConfig.from_dict(config_dict)
+    return Qwen3ASRModel(config, dtype)
+
+
+_STT_MODEL_CONSTRUCTORS: dict[str, STTModelConstructor] = {
+    # Default to Whisper for backward compatibility.
+    "": _construct_whisper_model,
+    "whisper": _construct_whisper_model,
+    "qwen3_asr": _construct_qwen3_asr_model,
+}
diff --git a/vllm_metal/stt/transcribe.py b/vllm_metal/stt/transcribe.py
@@ -1,164 +1,26 @@
 # SPDX-License-Identifier: Apache-2.0
-"""Speech-to-Text model loading and orchestration."""
+"""Speech-to-Text model orchestration."""
 
 from __future__ import annotations
 
-import json
 import logging
 from pathlib import Path
 
 import mlx.core as mx
-import mlx.nn as nn
 import numpy as np
 
 from vllm_metal.stt.config import SpeechToTextConfig
+from vllm_metal.stt.loader import load_model as _load_model
 from vllm_metal.stt.protocol import TranscriptionResult
-from vllm_metal.stt.qwen3_asr.config import Qwen3ASRConfig
-from vllm_metal.stt.qwen3_asr.model import Qwen3ASRModel
 from vllm_metal.stt.qwen3_asr.transcriber import Qwen3ASRTranscriber  # noqa: F401
-from vllm_metal.stt.whisper import WhisperConfig, WhisperModel, WhisperTranscriber
+from vllm_metal.stt.whisper import WhisperModel, WhisperTranscriber
 
 logger = logging.getLogger(__name__)
 
-try:
-    from huggingface_hub import snapshot_download
-except ImportError:  # pragma: no cover
-    snapshot_download = None  # type: ignore[assignment]
-
-# Supported floating-point dtypes for STT model loading.
-_SUPPORTED_LOAD_DTYPES = frozenset({mx.float16, mx.float32, mx.bfloat16})
-
-
-# ===========================================================================
-# Model loading
-# ===========================================================================
-
-
-def _read_config(model_path: Path) -> dict:
-    """Read and return config.json from a model directory."""
-    config_path = model_path / "config.json"
-    if not config_path.exists():
-        raise FileNotFoundError(f"config.json not found in {model_path}")
-    with open(config_path) as f:
-        return json.load(f)
-
-
-def _load_weights(model_path: Path) -> dict[str, mx.array]:
-    """Load model weights from safetensors or npz files."""
-    weight_files = sorted(model_path.glob("*.safetensors"))
-    if not weight_files:
-        weight_files = sorted(model_path.glob("*.npz"))
-    if not weight_files:
-        raise FileNotFoundError(f"No weight files in {model_path}")
-
-    weights: dict[str, mx.array] = {}
-    for wf in weight_files:
-        weights.update(mx.load(str(wf)))
-    return weights
-
-
-def _resolve_model_path(model_path: str | Path) -> Path:
-    """Resolve model path, downloading from HF if needed."""
-    model_path = Path(model_path)
-    if not model_path.exists():
-        if snapshot_download is None:
-            raise ValueError(
-                f"Could not download model {model_path}: huggingface_hub is not installed"
-            )
-        try:
-            model_path = Path(snapshot_download(repo_id=str(model_path)))
-        except OSError as e:
-            raise ValueError(f"Could not download model: {model_path}") from e
-    return model_path
-
-
-def _validate_load_dtype(dtype: mx.Dtype) -> None:
-    """Validate the floating-point dtype used for model loading."""
-    if dtype not in _SUPPORTED_LOAD_DTYPES:
-        names = ", ".join(sorted(str(d) for d in _SUPPORTED_LOAD_DTYPES))
-        raise TypeError(
-            f"Unsupported STT model dtype: {dtype!r}. Must be one of {names}."
-        )
-
 
 def load_model(model_path: str | Path, dtype: mx.Dtype = mx.float16):
-    """Load an STT model from a local directory or HuggingFace repo.
-
-    Auto-detects model type from config.json and dispatches to the
-    appropriate loader (Whisper or Qwen3-ASR).
-
-    Args:
-        model_path: Local path or HuggingFace repo ID.
-        dtype: Model dtype (default: float16).
-
-    Returns:
-        Loaded model ready for inference.
-
-    Raises:
-        ValueError: If the model type is unsupported or download fails.
-        FileNotFoundError: If config.json or weight files are missing.
-    """
-    if isinstance(model_path, str) and not model_path.strip():
-        raise ValueError(
-            "model_path must be a non-empty local path or HuggingFace repo ID."
-        )
-    _validate_load_dtype(dtype)
-    model_path = _resolve_model_path(model_path)
-    config_dict = _read_config(model_path)
-    model_type = config_dict.get("model_type", "").lower()
-
-    if model_type == "qwen3_asr":
-        return _load_qwen3_asr_model(model_path, config_dict, dtype)
-    if model_type in ("", "whisper"):
-        # Default to Whisper for backward compatibility
-        return _load_whisper_model(model_path, config_dict, dtype)
-    raise ValueError(
-        f"Unsupported STT model_type: {model_type!r}. "
-        "Expected 'whisper' or 'qwen3_asr'."
-    )
-
-
-def _load_and_init_model(model, model_path: Path, config_dict: dict):
-    """Shared loader: quantize, sanitize, load weights, and eval.
-
-    Args:
-        model: Instantiated model with a ``sanitize`` method.
-        model_path: Path to weight files.
-        config_dict: Raw config.json dict (checked for ``quantization``).
-
-    Returns:
-        The model with weights loaded and evaluated.
-    """
-    weights = _load_weights(model_path)
-
-    quantization = config_dict.get("quantization")
-    if quantization is not None:
-
-        def class_predicate(p, m):
-            return isinstance(m, (nn.Linear, nn.Embedding)) and f"{p}.scales" in weights
-
-        nn.quantize(model, **quantization, class_predicate=class_predicate)
-
-    weights = model.sanitize(weights)
-    model.load_weights(list(weights.items()), strict=False)
-    mx.eval(model.parameters())
-    return model
-
-
-def _load_whisper_model(
-    model_path: Path, config_dict: dict, dtype: mx.Dtype
-) -> WhisperModel:
-    """Load a Whisper model from config and weights."""
-    config = WhisperConfig.from_dict(config_dict)
-    model = WhisperModel(config, dtype)
-    return _load_and_init_model(model, model_path, config_dict)
-
-
-def _load_qwen3_asr_model(model_path: Path, config_dict: dict, dtype: mx.Dtype):
-    """Load a Qwen3-ASR model from config and weights."""
-    config = Qwen3ASRConfig.from_dict(config_dict)
-    model = Qwen3ASRModel(config, dtype)
-    return _load_and_init_model(model, model_path, config_dict)
+    """Load an STT model from a local directory or HuggingFace repo."""
+    return _load_model(model_path, dtype)
 
 
 # ===========================================================================