@@ -490,3 +490,223 @@ def test_decode_silence_produces_tokens(self) -> None:
490490 # Must end with EOT
491491 eot = tokenizer .convert_tokens_to_ids ("<|endoftext|>" )
492492 assert result [- 1 ] == eot
493+
494+
495+ # ===========================================================================
496+ # Qwen3-ASR STTExecutor dispatch & integration
497+ # ===========================================================================
498+
499+
def _make_qwen3_executor() -> STTExecutor:
    """Build an STTExecutor wired up for the ``qwen3_asr`` model type.

    The model is a ``MagicMock`` whose ``model_type`` is ``"qwen3_asr"``,
    whose ``config.eos_token_id`` is 151643 and whose ``encode`` returns a
    ``(50, 1024)`` array of ones.  A mock transcriber is injected into
    ``executor._transcriber`` so the tests never load a real one.

    Returns:
        The configured executor, ready for dispatch tests.
    """
    fake_model = MagicMock(
        model_type="qwen3_asr",
        config=SimpleNamespace(eos_token_id=151643),
        encode=MagicMock(return_value=mx.ones((50, 1024))),
    )

    executor = STTExecutor(fake_model, "/fake/qwen3-asr")

    # Special tokens map to distinct ids; any other string encodes to [0].
    special_token_ids = {
        "<asr_text>": [151674],
        "<|im_end|>": [151645],
    }

    def _fake_encode(s, add_special_tokens=False):
        return special_token_ids.get(s, [0])

    fake_tokenizer = MagicMock()
    fake_tokenizer.encode = MagicMock(side_effect=_fake_encode)

    # Stand-in for the Qwen3-ASR transcriber.
    fake_transcriber = MagicMock(
        tokenizer=fake_tokenizer,
        # Simplified prompt.
        build_prompt_tokens=MagicMock(return_value=[1, 2, 3]),
        # Token stream: <lang> <asr_text> hello world <|im_end|>
        greedy_decode_tokens=MagicMock(
            return_value=[100, 151674, 200, 300, 151645]
        ),
    )
    executor._transcriber = fake_transcriber

    return executor
535+
536+
class TestSTTExecutorQwen3ASRDispatch:
    """Tests for Qwen3-ASR-specific dispatch paths in STTExecutor."""

    def test_eot_token_from_config(self) -> None:
        """Qwen3-ASR eot_token should come from model.config.eos_token_id."""
        executor = _make_qwen3_executor()
        assert executor.eot_token == 151643

    def test_transcriber_dispatches_to_qwen3(self) -> None:
        """model_type='qwen3_asr' should create Qwen3ASRTranscriber."""
        model = MagicMock()
        model.model_type = "qwen3_asr"
        executor = STTExecutor(model, "/fake/path")

        with patch("vllm_metal.stt.transcribe.Qwen3ASRTranscriber") as mock_cls:
            mock_cls.return_value = MagicMock()
            # Accessing the property is what triggers construction.
            _ = executor.transcriber
            mock_cls.assert_called_once_with(model, model_path="/fake/path")

    def test_extract_audio_features_2d_passthrough(self) -> None:
        """Qwen3-ASR: 2D mel (n_mels, time) should pass through without transpose."""
        executor = _make_qwen3_executor()
        seen_shapes = []

        def capture_encode(mel_input):
            # Record instead of asserting here, so that a code path which
            # never calls encode cannot make the shape check pass vacuously.
            seen_shapes.append(tuple(mel_input.shape))
            return mx.ones((50, 1024))

        executor.model.encode = capture_encode

        mel = np.zeros((128, 500), dtype=np.float32)
        result = executor.extract_audio_features(mel)

        assert seen_shapes, "model.encode was never called"
        # Qwen3-ASR receives (n_mels, time) directly — no transpose.
        assert all(shape == (128, 500) for shape in seen_shapes)
        assert result is not None
        assert result.shape == (50, 1024)

    def test_extract_audio_features_3d_drops_batch(self) -> None:
        """Qwen3-ASR: 3D mel (1, n_mels, time) should drop batch dim."""
        executor = _make_qwen3_executor()
        seen_shapes = []

        def capture_encode(mel_input):
            # Record instead of asserting here, so that a code path which
            # never calls encode cannot make the shape check pass vacuously.
            seen_shapes.append(tuple(mel_input.shape))
            return mx.ones((50, 1024))

        executor.model.encode = capture_encode

        mel = np.zeros((1, 128, 500), dtype=np.float32)
        result = executor.extract_audio_features(mel)

        assert seen_shapes, "model.encode was never called"
        # Batch dim stripped → (n_mels, time).
        assert all(shape == (128, 500) for shape in seen_shapes)
        assert result is not None

    def test_extract_audio_features_1d_raises(self) -> None:
        """Qwen3-ASR: 1D mel should raise ValueError (expects 2D or 3D)."""
        executor = _make_qwen3_executor()

        mel = np.zeros((500,), dtype=np.float32)
        with pytest.raises(ValueError, match="rank"):
            executor.extract_audio_features(mel)

    def test_decode_rebuilds_prompt_with_audio_frames(self) -> None:
        """Qwen3-ASR decode should rebuild prompt using build_prompt_tokens."""
        executor = _make_qwen3_executor()

        audio = mx.ones((50, 1024))  # 50 audio frames
        executor.decode(audio, [1, 2, 3])

        # build_prompt_tokens should be called with n_audio_frames=50
        executor.transcriber.build_prompt_tokens.assert_called_once_with(50)

    def test_decode_extracts_asr_text_tokens(self) -> None:
        """Qwen3-ASR decode should extract tokens between <asr_text> and <|im_end|>."""
        executor = _make_qwen3_executor()

        audio = mx.ones((50, 1024))
        result = executor.decode(audio, [1, 2, 3])

        # The mocked greedy_decode_tokens returns
        # [100, 151674, 200, 300, 151645]; the content between 151674
        # (<asr_text>) and 151645 (<|im_end|>) is [200, 300], and the
        # eot token (151643) is expected to be appended.
        assert result == [200, 300, 151643]

    def test_decode_empty_prompt_rebuilds(self) -> None:
        """Qwen3-ASR rebuilds prompt even when caller passes empty list."""
        executor = _make_qwen3_executor()
        result = executor.decode(mx.ones((50, 1024)), [])
        # Should rebuild prompt and decode normally, not early-return EOT
        assert len(result) > 1
        assert result[-1] == 151643  # ends with EOT
625+
626+
627+ # ===========================================================================
628+ # TestExtractASRTextTokens
629+ # ===========================================================================
630+
631+
class TestExtractASRTextTokens:
    """Tests for STTExecutor._extract_asr_text_tokens.

    The method under test pulls out the content tokens that sit between
    <asr_text> and <|im_end|> in a decoded Qwen3-ASR token stream.
    """

    # Ids the mocked tokenizer in _make_qwen3_executor assigns to the tags.
    ASR_TEXT = 151674
    IM_END = 151645

    def test_basic_extraction(self) -> None:
        """Content between <asr_text> and <|im_end|> is returned."""
        stream = [100, self.ASR_TEXT, 200, 300, self.IM_END]
        extracted = _make_qwen3_executor()._extract_asr_text_tokens(stream)
        assert extracted == [200, 300]

    def test_no_asr_text_tag_returns_original(self) -> None:
        """A stream without <asr_text> comes back unchanged."""
        stream = [100, 200, 300]
        extracted = _make_qwen3_executor()._extract_asr_text_tokens(stream)
        assert extracted == [100, 200, 300]

    def test_no_im_end_returns_to_end(self) -> None:
        """With no <|im_end|>, everything after <asr_text> is returned."""
        stream = [100, self.ASR_TEXT, 200, 300]
        extracted = _make_qwen3_executor()._extract_asr_text_tokens(stream)
        assert extracted == [200, 300]

    def test_multiple_asr_text_uses_last(self) -> None:
        """When <asr_text> appears twice, the last occurrence wins."""
        stream = [self.ASR_TEXT, 999, self.ASR_TEXT, 200, 300, self.IM_END]
        extracted = _make_qwen3_executor()._extract_asr_text_tokens(stream)
        assert extracted == [200, 300]

    def test_empty_content_between_tags(self) -> None:
        """<asr_text> directly followed by <|im_end|> yields an empty list."""
        stream = [100, self.ASR_TEXT, self.IM_END]
        extracted = _make_qwen3_executor()._extract_asr_text_tokens(stream)
        assert extracted == []

    def test_asr_text_at_end(self) -> None:
        """<asr_text> as the final token has no content; stream is returned as-is."""
        stream = [100, 200, self.ASR_TEXT]
        extracted = _make_qwen3_executor()._extract_asr_text_tokens(stream)
        # Nothing follows the tag, so the original tokens are expected back.
        assert extracted == [100, 200, 151674]
684+
685+
686+ # ===========================================================================
687+ # TestQwen3ASRStubRejectsTranslate
688+ # ===========================================================================
689+
690+
class TestQwen3ASRStubRejectsTranslate:
    """The Qwen3-ASR stub must refuse translation requests explicitly."""

    def test_translate_raises_valueerror(self) -> None:
        """task_type='translate' raises ValueError rather than transcribing."""
        from vllm_metal.stt.hf_config import _make_stub_class

        stub_cls = _make_stub_class()

        # One second of silence at 16 kHz plus mocked configs; only
        # task_type matters for this test.
        prompt_kwargs = {
            "audio": np.zeros(16000, dtype=np.float32),
            "stt_config": MagicMock(),
            "model_config": MagicMock(tokenizer="Qwen/Qwen3-ASR-0.6B"),
            "language": "en",
            "task_type": "translate",
            "request_prompt": "",
            "to_language": None,
        }

        with pytest.raises(ValueError, match="does not support translation"):
            stub_cls.get_generation_prompt(**prompt_kwargs)
0 commit comments