[model] Address Qwen3-Omni packseq review comments

hbhflw2000 · hbhflw2000 · commit d179d09afbad · 2026-06-16T15:35:36.000+08:00
Signed-off-by: hbhflw2000 <417911774@qq.com>
diff --git a/src/megatron/bridge/models/qwen_omni/modeling_qwen3_omni/thinker_model.py b/src/megatron/bridge/models/qwen_omni/modeling_qwen3_omni/thinker_model.py
@@ -65,12 +65,23 @@ def _build_text_only_mrope_position_ids(input_ids: torch.Tensor) -> torch.Tensor
     return torch.stack([base, base, base], dim=0)
 
 
+_AUDIO_ENCODER_CHUNK_SIZE = 100
+_AUDIO_ENCODER_TOKENS_PER_FULL_CHUNK = 13
+
+
 def _get_qwen3_omni_audio_output_lengths(input_lengths: torch.LongTensor) -> torch.LongTensor:
     """Match HF Qwen3-Omni audio encoder forward output lengths."""
 
-    input_lengths_leave = input_lengths % 100
+    # HF Qwen3-Omni audio encoder handles full 100-frame chunks separately.
+    # Each full chunk contributes 13 output tokens; the remaining frames are
+    # halved (rounding up) three times in a row, one halving per stride-2 stage.
+    input_lengths_leave = input_lengths % _AUDIO_ENCODER_CHUNK_SIZE
     feat_lengths = (input_lengths_leave - 1) // 2 + 1
-    return ((feat_lengths - 1) // 2 + 1 - 1) // 2 + 1 + (input_lengths // 100) * 13
+    return (
+        ((feat_lengths - 1) // 2 + 1 - 1) // 2
+        + 1
+        + (input_lengths // _AUDIO_ENCODER_CHUNK_SIZE) * _AUDIO_ENCODER_TOKENS_PER_FULL_CHUNK
+    )
 
 
 def _configure_multimodal_attn_impl(config: object, attn_impl: str | None) -> None:
@@ -616,26 +627,19 @@ def forward(
             position_ids = torch.nn.functional.pad(position_ids, (0, sp_pad_len), mode="replicate")
 
         if self.config.sequence_parallel or cp_size > 1:
-            if packed_seq_params is None:
-                visual_pos_masks, deepstack_visual_embeds = split_deepstack_embs(
-                    visual_pos_masks,
-                    deepstack_visual_embeds,
-                    tp_size=tp_size,
-                    tp_rank=tp_rank,
-                    cp_size=cp_size,
-                    cp_rank=cp_rank,
-                    sequence_parallel=self.config.sequence_parallel,
-                )
-            elif self.config.sequence_parallel:
-                visual_pos_masks, deepstack_visual_embeds = split_deepstack_embs(
-                    visual_pos_masks,
-                    deepstack_visual_embeds,
-                    tp_size=tp_size,
-                    tp_rank=tp_rank,
-                    cp_size=1,
-                    cp_rank=0,
-                    sequence_parallel=self.config.sequence_parallel,
-                )
+            # Packed THD tensors are already CP-aware after preprocess_packed_seqs;
+            # only the SP split remains for deepstack embeddings.
+            split_cp_size = 1 if packed_seq_params is not None else cp_size
+            split_cp_rank = 0 if packed_seq_params is not None else cp_rank
+            visual_pos_masks, deepstack_visual_embeds = split_deepstack_embs(
+                visual_pos_masks,
+                deepstack_visual_embeds,
+                tp_size=tp_size,
+                tp_rank=tp_rank,
+                cp_size=split_cp_size,
+                cp_rank=split_cp_rank,
+                sequence_parallel=self.config.sequence_parallel,
+            )
 
         if packed_seq_params is not None and position_ids is not None:
             position_ids = (
@@ -649,16 +653,20 @@ def forward(
                 .contiguous()
             )
             attention_mask = None
-            self.language_model.rotary_pos_emb.is_thd_format = True
+
+        rotary_pos_emb = getattr(self.language_model, "rotary_pos_emb", None)
+        if rotary_pos_emb is not None:
+            rotary_pos_emb.is_thd_format = packed_seq_params is not None
+
+        if packed_seq_params is not None:
+            language_model_input_ids = lm_input_ids
+        elif combined_embeddings is not None:
+            language_model_input_ids = None
+        else:
+            language_model_input_ids = input_ids
 
         return self.language_model(
-            input_ids=(
-                lm_input_ids
-                if packed_seq_params is not None
-                else None
-                if combined_embeddings is not None
-                else input_ids
-            ),
+            input_ids=language_model_input_ids,
             position_ids=position_ids,
             attention_mask=attention_mask,
             decoder_input=combined_embeddings,
diff --git a/src/megatron/bridge/models/qwen_omni/qwen3_omni_step.py b/src/megatron/bridge/models/qwen_omni/qwen3_omni_step.py
@@ -284,6 +284,7 @@ def forward_step(
             attention_mask,
             position_ids,
             pg_collection,
+            # Keep packed THD lengths TE-friendly even when the recipe toggles FP8 later.
             use_fp8_padding=True,
             force_to_seq_length=_parallel_size(pg_collection, "pp") > 1 or _parallel_size(pg_collection, "ep") > 1,
             seq_length=getattr(config, "seq_length", getattr(state.cfg.model, "seq_length", None)),
diff --git a/tests/unit_tests/models/qwen_omni/modeling_qwen3_omni/test_omni_model.py b/tests/unit_tests/models/qwen_omni/modeling_qwen3_omni/test_omni_model.py
@@ -455,6 +455,54 @@ def _fake_get_rope_index(*args, **kwargs):  # noqa: ARG001
         assert rope_calls["attention_mask"] is None
         assert fake_language_model.forward_kwargs["attention_mask"] is attention_mask
 
+    def test_packed_forward_resets_rotary_thd_state(self):
+        class _MockProcessGroup:
+            def size(self):
+                return 1
+
+            def rank(self):
+                return 0
+
+        class _FakeLanguageModel(torch.nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.rotary_pos_emb = SimpleNamespace(is_thd_format=False)
+                self.forward_kwargs = []
+
+            def forward(self, **kwargs):
+                self.forward_kwargs.append(kwargs)
+                return torch.tensor(0.0)
+
+        fake_language_model = _FakeLanguageModel()
+        thinker = SimpleNamespace(
+            pg_collection=SimpleNamespace(cp=_MockProcessGroup(), tp=_MockProcessGroup()),
+            config=SimpleNamespace(sequence_parallel=False),
+            pre_process=False,
+            language_model=fake_language_model,
+        )
+
+        input_ids = torch.tensor([[1, 2, 3, 4]])
+        position_ids = torch.arange(input_ids.size(1)).view(1, 1, -1).expand(3, input_ids.size(0), -1)
+        attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
+
+        Qwen3OmniThinkerModel.forward(
+            thinker,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=attention_mask,
+            packed_seq_params=object(),
+        )
+        assert fake_language_model.rotary_pos_emb.is_thd_format is True
+
+        Qwen3OmniThinkerModel.forward(
+            thinker,
+            input_ids=input_ids,
+            position_ids=position_ids,
+            attention_mask=None,
+            packed_seq_params=None,
+        )
+        assert fake_language_model.rotary_pos_emb.is_thd_format is False
+
     def test_audio_forward(self, thinker_config):
         model = self._build_model(thinker_config)
         if torch.cuda.is_available():