[Perf] Qwen3-Omni performance optimization (vllm-project#3878)

amy-why-3459 · hsliuustc0106 · web-flow · commit 269675100386 · 2026-05-29T22:15:23.000+08:00
Signed-off-by: amy-why-3459 <wuhaiyan17@huawei.com>
Co-authored-by: Hongsheng Liu <liuhongsheng4@huawei.com>
diff --git a/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py b/vllm_omni/model_executor/models/qwen3_omni/qwen3_omni.py
@@ -20,7 +20,6 @@
 from vllm.config import ModelConfig, VllmConfig
 from vllm.inputs import PromptType, TokensPrompt
 from vllm.logger import init_logger
-from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding
 from vllm.model_executor.models.interfaces import SupportsMRoPE, SupportsMultiModal, SupportsPP, SupportsRealtime
 from vllm.model_executor.models.qwen3_asr_realtime import Qwen3ASRRealtimeBuffer
 from vllm.model_executor.models.qwen3_omni_moe_thinker import (
@@ -180,6 +179,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
                 ("hidden_states", "last"),
                 ("hidden_states", "trailing_text"),
                 ("embed", "tts_pad_projected"),
+                # talker MTP codec codes must stay on GPU to avoid a per-step D2H
+                # sync stall; build_mm_cpu handles the eventual D2H at payload time.
+                ("codes", "audio"),
             }
             # Keys that need to be accumulated across streaming inputs
             self.streaming_accumulated_keys: set[tuple[str, str]] = {
@@ -323,7 +325,12 @@ def get_mrope_input_positions(
                 msg = "Qwen3 Omni thinker get_mrope_input_positions requires mm_features"
                 raise ValueError(msg)
             return self.thinker.get_mrope_input_positions(input_tokens, mm_features)
-        return MRotaryEmbedding.get_input_positions_tensor(input_tokens, **kwargs)
+        # Talker/code2wav stages are text/codec-only and do not need
+        # multimodal M-RoPE position computation. Return a cheap linear
+        # position tensor to avoid unnecessary per-request M-RoPE work.
+        seq_len = len(input_tokens)
+        linear = torch.arange(seq_len, dtype=torch.long).unsqueeze(0).expand(3, seq_len)
+        return linear, 0
 
     def forward(
         self,
@@ -1253,7 +1260,7 @@ def compute_logits(
         if (
             getattr(self, "model_stage", None) == "talker"
             and sampling_metadata is not None
-            and (sampling_metadata.temperature is None or (sampling_metadata.temperature <= 0).any())
+            and (sampling_metadata.temperature is None)
         ):
             self._warn_talker_sampling_temperature(sampling_metadata)
 
diff --git a/vllm_omni/worker/gpu_model_runner.py b/vllm_omni/worker/gpu_model_runner.py
@@ -1765,7 +1765,12 @@ def _store_value(self, dest: dict, key: str, value: Any, gpu_keys: set) -> None:
             if key in gpu_keys:
                 dest[key] = value.detach().clone()
             else:
-                dest[key] = value.detach().to("cpu").contiguous()
+                t = value.detach()
+                if t.is_cuda:
+                    dest[key] = t.to("cpu").contiguous()
+                else:
+                    # Non-CUDA tensor (assumed to already live on the CPU): skip the device transfer.
+                    dest[key] = t.contiguous()
         elif isinstance(value, list):
             dest[key] = [
                 (item.detach().to("cpu").contiguous() if isinstance(item, torch.Tensor) else item) for item in value