Skip to content

Commit 3b6c257

Browse files
authored
Refactor STT: model-owned packages (#164)
This PR is: - To move Whisper and Qwen3-ASR implementations into model-owned packages (`stt/whisper/*`, `stt/qwen3_asr/*`) and keep `stt/transcribe.py` focused on orchestration. - To keep `stt/hf_config.py` importable without pulling in MLX by splitting Qwen3-ASR configs into an MLX-free module (with feature-length math on the config). - To avoid tokenizer side effects at construction time (Whisper tokenizer lazy-load; tokenizer loading is transcriber-owned). Verification: - `source .venv-vllm-metal/bin/activate && ruff check .` - `source .venv-vllm-metal/bin/activate && ruff format --check .` - `source .venv-vllm-metal/bin/activate && pytest -m "not slow"` Next: - Refactor STT: add `stt/registry.py` + `stt/loader.py` so model dispatch doesn’t grow shared branching. - Refactor STT: extract STT runtime glue from `v1/model_runner.py` into `stt/runtime.py` to reduce upstream-diff surface. --------- Signed-off-by: Yuan Lik Xun <lxyuan0420@gmail.com>
1 parent ecc11ff commit 3b6c257

13 files changed

Lines changed: 716 additions & 867 deletions

File tree

tests/test_qwen3_asr.py

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,16 @@
1212
import pytest
1313

1414
from vllm_metal.stt.config import is_stt_model
15-
from vllm_metal.stt.qwen3_asr import (
16-
AudioEncoder,
15+
from vllm_metal.stt.qwen3_asr.config import (
1716
Qwen3ASRAudioConfig,
1817
Qwen3ASRConfig,
19-
Qwen3ASRModel,
2018
Qwen3ASRTextConfig,
19+
)
20+
from vllm_metal.stt.qwen3_asr.model import (
21+
AudioEncoder,
22+
Qwen3ASRModel,
2123
Qwen3Attention,
2224
Qwen3LM,
23-
_get_cnn_output_lengths,
24-
_get_feat_extract_output_lengths,
2525
)
2626
from vllm_metal.stt.transcribe import Qwen3ASRTranscriber, load_model
2727

@@ -94,34 +94,36 @@ def test_defaults(self) -> None:
9494

9595

9696
class TestCNNOutputLengths:
97-
"""Tests for _get_cnn_output_lengths and _get_feat_extract_output_lengths."""
97+
"""Tests for Qwen3ASRAudioConfig shape helpers."""
9898

9999
def test_single_conv_stride(self) -> None:
100100
"""3x Conv2d stride-2 on 100 frames → 13 output frames."""
101-
assert _get_cnn_output_lengths(100) == 13
101+
assert Qwen3ASRAudioConfig.cnn_output_length(100) == 13
102102

103103
def test_small_inputs(self) -> None:
104104
"""Edge cases for small input lengths."""
105-
assert _get_cnn_output_lengths(1) == 1
106-
assert _get_cnn_output_lengths(2) == 1
105+
assert Qwen3ASRAudioConfig.cnn_output_length(1) == 1
106+
assert Qwen3ASRAudioConfig.cnn_output_length(2) == 1
107107
# 3 -> 2 -> 1 -> 1 after 3x stride-2
108-
assert _get_cnn_output_lengths(3) == 1
108+
assert Qwen3ASRAudioConfig.cnn_output_length(3) == 1
109109

110110
def test_feat_extract_full_chunks(self) -> None:
111111
"""Full chunks of 100 frames each produce 13 frames per chunk."""
112-
assert _get_feat_extract_output_lengths(100) == 13
113-
assert _get_feat_extract_output_lengths(200) == 26
114-
assert _get_feat_extract_output_lengths(300) == 39
112+
cfg = Qwen3ASRAudioConfig()
113+
assert cfg.feat_extract_output_length(100) == 13
114+
assert cfg.feat_extract_output_length(200) == 26
115+
assert cfg.feat_extract_output_length(300) == 39
115116

116117
def test_feat_extract_with_remainder(self) -> None:
117118
"""Partial chunk adds its CNN output to full chunks."""
118119
# 150 = 1 full chunk (13) + 50 remainder
119-
remainder_out = _get_cnn_output_lengths(50)
120-
assert _get_feat_extract_output_lengths(150) == 13 + remainder_out
120+
cfg = Qwen3ASRAudioConfig()
121+
remainder_out = Qwen3ASRAudioConfig.cnn_output_length(50)
122+
assert cfg.feat_extract_output_length(150) == 13 + remainder_out
121123

122124
def test_feat_extract_3000_frames(self) -> None:
123125
"""30 seconds at 16kHz/hop160 = 3000 frames → 30 * 13 = 390."""
124-
assert _get_feat_extract_output_lengths(3000) == 390
126+
assert Qwen3ASRAudioConfig().feat_extract_output_length(3000) == 390
125127

126128

127129
# ===========================================================================
@@ -154,7 +156,7 @@ def test_single_chunk(self, tiny_encoder) -> None:
154156
mel = mx.random.normal((16, 80)) # 80 < 100 frames
155157
out = tiny_encoder(mel)
156158
mx.eval(out)
157-
expected_frames = _get_cnn_output_lengths(80)
159+
expected_frames = Qwen3ASRAudioConfig.cnn_output_length(80)
158160
assert out.shape == (expected_frames, 48)
159161

160162
def test_exact_chunk(self, tiny_encoder) -> None:
@@ -529,7 +531,6 @@ def transcriber(self, tmp_path):
529531
)
530532
model = MagicMock()
531533
model.config = config
532-
t = Qwen3ASRTranscriber(model, model_path=str(tmp_path))
533534

534535
# Inject mock tokenizer with deterministic encode
535536
mock_tok = MagicMock()
@@ -543,7 +544,7 @@ def transcriber(self, tmp_path):
543544
mock_tok.encode = MagicMock(
544545
side_effect=lambda s, add_special_tokens=False: _encode_map.get(s, [0])
545546
)
546-
t._tokenizer = mock_tok
547+
t = Qwen3ASRTranscriber(model, tokenizer=mock_tok)
547548
return t
548549

549550
def test_audio_pad_count_matches_frames(self, transcriber) -> None:
@@ -635,7 +636,7 @@ def test_encode_dummy_mel(self) -> None:
635636
mel = mx.random.normal((128, 300))
636637
embeddings = model.encode(mel)
637638
mx.eval(embeddings)
638-
expected = _get_feat_extract_output_lengths(300)
639+
expected = model.config.audio_config.feat_extract_output_length(300)
639640
assert embeddings.shape == (expected, 1024)
640641

641642
def test_greedy_decode(self) -> None:

tests/test_transcribe.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,11 @@
1111
import pytest
1212

1313
from vllm_metal.stt.transcribe import (
14-
_MAX_PROMPT_TOKENS,
1514
TranscriptionResult,
1615
WhisperTranscriber,
1716
load_model,
1817
)
18+
from vllm_metal.stt.whisper.transcriber import MAX_PROMPT_TOKENS
1919

2020
# ===========================================================================
2121
# Fixtures
@@ -33,8 +33,8 @@ def transcriber():
3333
except ImportError:
3434
pytest.skip("transformers not available")
3535

36-
t = WhisperTranscriber(model=None, model_path=None) # type: ignore[arg-type]
37-
t._tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small")
36+
tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-small")
37+
t = WhisperTranscriber(model=None, tokenizer=tokenizer) # type: ignore[arg-type]
3838
return t
3939

4040

@@ -153,11 +153,11 @@ def test_prompt_contains_text_tokens(self, transcriber: WhisperTranscriber) -> N
153153
assert len(result) >= 2
154154

155155
def test_long_prompt_truncated(self, transcriber: WhisperTranscriber) -> None:
156-
"""Very long prompt should be truncated to _MAX_PROMPT_TOKENS + 1."""
156+
"""Very long prompt should be truncated to MAX_PROMPT_TOKENS + 1."""
157157
long_text = "word " * 500
158158
result = transcriber._encode_prompt(long_text)
159-
# startofprev (1) + at most _MAX_PROMPT_TOKENS text tokens
160-
assert len(result) <= _MAX_PROMPT_TOKENS + 1
159+
# startofprev (1) + at most MAX_PROMPT_TOKENS text tokens
160+
assert len(result) <= MAX_PROMPT_TOKENS + 1
161161

162162

163163
# ===========================================================================

vllm_metal/stt/hf_config.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from transformers.configuration_utils import PretrainedConfig
1818

1919
from vllm_metal.stt.config import get_whisper_languages
20+
from vllm_metal.stt.qwen3_asr.config import Qwen3ASRAudioConfig
2021

2122
logger = logging.getLogger(__name__)
2223

@@ -449,8 +450,6 @@ def get_num_audio_tokens(
449450
stt_config: SpeechToTextConfig,
450451
model_config: ModelConfig,
451452
) -> int | None:
452-
from vllm_metal.stt.qwen3_asr import _get_feat_extract_output_lengths
453-
454453
# Derive hop_length from WhisperFeatureExtractor defaults
455454
hop_length = WhisperFeatureExtractor().hop_length
456455

@@ -460,7 +459,9 @@ def get_num_audio_tokens(
460459
mel_frames = math.ceil(
461460
audio_duration_s * stt_config.sample_rate / hop_length
462461
)
463-
return _get_feat_extract_output_lengths(mel_frames, n_window=n_window)
462+
return Qwen3ASRAudioConfig(n_window=n_window).feat_extract_output_length(
463+
mel_frames
464+
)
464465

465466
# Attach multimodal processor factory to the stub class
466467
Qwen3ASRStub._processor_factory = _ProcessorFactories(

vllm_metal/stt/protocol.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
# SPDX-License-Identifier: Apache-2.0
22
"""Response types for Speech-to-Text."""
33

4+
from dataclasses import dataclass, field
5+
46
from pydantic import BaseModel
57

68

@@ -16,3 +18,13 @@ class TranscriptionSegment(BaseModel):
1618
avg_logprob: float = 0.0
1719
compression_ratio: float = 0.0
1820
no_speech_prob: float = 0.0
21+
22+
23+
@dataclass
24+
class TranscriptionResult:
25+
"""Result of a transcription operation."""
26+
27+
text: str
28+
language: str | None = None
29+
segments: list[TranscriptionSegment] = field(default_factory=list)
30+
duration: float = 0.0
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
"""Qwen3-ASR configuration and entrypoints.
3+
4+
Keep this module MLX-free so ``vllm_metal.stt.hf_config`` can import
5+
``vllm_metal.stt.qwen3_asr.config`` without pulling in the model stack.
6+
"""
7+
8+
from .config import Qwen3ASRAudioConfig, Qwen3ASRConfig, Qwen3ASRTextConfig
9+
10+
__all__ = [
11+
"Qwen3ASRAudioConfig",
12+
"Qwen3ASRConfig",
13+
"Qwen3ASRTextConfig",
14+
]

vllm_metal/stt/qwen3_asr/config.py

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
"""Qwen3-ASR configuration (MLX-free).
3+
4+
Keep this module free of MLX imports so vLLM compat code can import config and
5+
shape helpers during planning/registration without pulling in the model stack.
6+
"""
7+
8+
from __future__ import annotations
9+
10+
from dataclasses import dataclass, field
11+
12+
13+
@dataclass
class Qwen3ASRAudioConfig:
    """Audio encoder configuration."""

    num_mel_bins: int = 128
    d_model: int = 896
    encoder_layers: int = 18
    encoder_attention_heads: int = 14
    encoder_ffn_dim: int = 3584
    downsample_hidden_size: int = 480
    output_dim: int = 1024
    max_source_positions: int = 1500
    n_window: int = 50
    n_window_infer: int = 800
    activation_function: str = "gelu"

    @staticmethod
    def cnn_output_length(num_frames: int) -> int:
        """Return the time length after 3x Conv2d(stride=2) downsampling.

        Each stride-2 convolution halves the time axis, rounding up.
        """
        out = num_frames
        for _ in range(3):
            # Ceil-divide by 2; equivalent to (out - 1) // 2 + 1.
            out = (out + 1) // 2
        return int(out)

    def feat_extract_output_length(self, num_mel_frames: int) -> int:
        """Return the number of audio tokens for a mel with N time frames.

        The mel is processed in windows of ``2 * n_window`` frames; a trailing
        partial window contributes its own CNN output length.
        """
        window = 2 * self.n_window
        per_full_window = self.cnn_output_length(window)
        n_full, tail = divmod(num_mel_frames, window)
        total = n_full * per_full_window
        if tail:
            total += self.cnn_output_length(tail)
        return int(total)
47+
48+
49+
@dataclass
class Qwen3ASRTextConfig:
    """Text decoder (Qwen3 LM) configuration."""

    hidden_size: int = 1024
    num_hidden_layers: int = 28
    num_attention_heads: int = 16
    # NOTE(review): fewer KV heads than attention heads — presumably
    # grouped-query attention; confirm against the model implementation.
    num_key_value_heads: int = 8
    head_dim: int = 128
    intermediate_size: int = 3072
    vocab_size: int = 151936
    rms_norm_eps: float = 1e-6
    rope_theta: float = 1000000.0
    # Presumably ties the LM head to the input embedding (HF convention);
    # consumed by the model loader, not here.
    tie_word_embeddings: bool = True
63+
64+
65+
@dataclass
class Qwen3ASRConfig:
    """Top-level Qwen3-ASR model configuration."""

    audio_config: Qwen3ASRAudioConfig = field(default_factory=Qwen3ASRAudioConfig)
    text_config: Qwen3ASRTextConfig = field(default_factory=Qwen3ASRTextConfig)
    audio_token_id: int = 151676
    audio_start_token_id: int = 151669
    audio_end_token_id: int = 151670
    eos_token_id: int = 151643
    # Compatibility with Whisper interface for load_model dispatching
    n_mels: int = 128
    n_audio_ctx: int = 1500

    @classmethod
    def from_dict(cls, d: dict) -> Qwen3ASRConfig:
        """Create config from a ``config.json`` dictionary.

        Accepts either a flat dict or one nested under ``thinker_config``;
        any missing key falls back to the dataclass default.
        """
        thinker = d.get("thinker_config", d)

        audio_dict = thinker.get("audio_config", {})
        audio_cfg = Qwen3ASRAudioConfig(
            num_mel_bins=audio_dict.get("num_mel_bins", 128),
            d_model=audio_dict.get("d_model", 896),
            encoder_layers=audio_dict.get("encoder_layers", 18),
            encoder_attention_heads=audio_dict.get("encoder_attention_heads", 14),
            encoder_ffn_dim=audio_dict.get("encoder_ffn_dim", 3584),
            downsample_hidden_size=audio_dict.get("downsample_hidden_size", 480),
            output_dim=audio_dict.get("output_dim", 1024),
            max_source_positions=audio_dict.get("max_source_positions", 1500),
            n_window=audio_dict.get("n_window", 50),
            n_window_infer=audio_dict.get("n_window_infer", 800),
            activation_function=audio_dict.get("activation_function", "gelu"),
        )

        text_dict = thinker.get("text_config", {})
        text_cfg = Qwen3ASRTextConfig(
            hidden_size=text_dict.get("hidden_size", 1024),
            num_hidden_layers=text_dict.get("num_hidden_layers", 28),
            num_attention_heads=text_dict.get("num_attention_heads", 16),
            num_key_value_heads=text_dict.get("num_key_value_heads", 8),
            head_dim=text_dict.get("head_dim", 128),
            intermediate_size=text_dict.get("intermediate_size", 3072),
            vocab_size=text_dict.get("vocab_size", 151936),
            rms_norm_eps=text_dict.get("rms_norm_eps", 1e-6),
            rope_theta=text_dict.get("rope_theta", 1000000.0),
            tie_word_embeddings=text_dict.get("tie_word_embeddings", True),
        )

        return cls(
            audio_config=audio_cfg,
            text_config=text_cfg,
            audio_token_id=thinker.get("audio_token_id", 151676),
            audio_start_token_id=thinker.get("audio_start_token_id", 151669),
            audio_end_token_id=thinker.get("audio_end_token_id", 151670),
            # Fix: honor eos_token_id from config.json like the other token
            # ids above, instead of always keeping the dataclass default.
            # NOTE(review): some HF configs store eos_token_id as a list —
            # confirm Qwen3-ASR checkpoints always use a single int here.
            eos_token_id=thinker.get("eos_token_id", 151643),
            # Mirror audio_cfg so Whisper-style dispatch sees the same shapes.
            n_mels=audio_cfg.num_mel_bins,
            n_audio_ctx=audio_cfg.max_source_positions,
        )

0 commit comments

Comments
 (0)