luyiyun1021
diff --git a/cpp/tensorrt_llm/kernels/fusedDiTQKNormRopeKernel.cu b/cpp/tensorrt_llm/kernels/fusedDiTQKNormRopeKernel.cu
Lines changed: 10 additions & 3 deletions
diff --git a/cpp/tensorrt_llm/kernels/fusedDiTSplitQKNormRopeKernel.cu b/cpp/tensorrt_llm/kernels/fusedDiTSplitQKNormRopeKernel.cu
Lines changed: 5 additions & 2 deletions
diff --git a/cpp/tensorrt_llm/thop/fusedDiTSplitQKNormRopeOp.cpp b/cpp/tensorrt_llm/thop/fusedDiTSplitQKNormRopeOp.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/tensorrt_llm/_torch/visual_gen/models/ltx2/ltx2_core/transformer_args.py b/tensorrt_llm/_torch/visual_gen/models/ltx2/ltx2_core/transformer_args.py
Lines changed: 1 addition & 16 deletions
diff --git a/tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2.py b/tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2.py
Lines changed: 0 additions & 8 deletions
diff --git a/tensorrt_llm/_torch/visual_gen/models/ltx2/text_cache.py b/tensorrt_llm/_torch/visual_gen/models/ltx2/text_cache.py
Lines changed: 15 additions & 14 deletions
@@ -217,7 +217,7 @@ __global__ void fusedDiTQKNormRopeKernel(__nv_bfloat16* qkv, // [num_tokens, tot
 //                     each head h reads its own slice (LTX-2 INTERLEAVED RoPE).
 // Note: when PER_HEAD_COS=true, Q and K share the same cos/sin buffer (same
 //       num_heads_q == num_heads_k for LTX-2 self-attn).
-// CosT: float (fp32 cos) or __nv_bfloat16 (B-2: kernel upcasts in registers).
+// CosT: float (fp32 cos) or __nv_bfloat16 (kernel upcasts bf16 to fp32 in registers, lossless).
 template <int HEAD_DIM, bool INTERLEAVE, bool PER_HEAD_COS, typename CosT>
 __global__ void fusedDiTQKNormFullDimRopeKernel(__nv_bfloat16* qkv, int const num_heads_q, int const num_heads_k,
     int const num_heads_v, float const eps, __nv_bfloat16 const* q_weight, __nv_bfloat16 const* k_weight,
@@ -274,6 +274,10 @@ __global__ void fusedDiTQKNormFullDimRopeKernel(__nv_bfloat16* qkv, int const nu
 #pragma unroll
         for (int w = 0; w < N_WARPS; w++)
             total += warp_sums[w];
+        // Trailing barrier: this lambda is called twice (Q then K) and both calls
+        // reuse warp_sums; without it, warp X's lane-0 write in the second call can
+        // race warp Y's still-pending read of warp_sums from the first call.
+        __syncthreads();
         return total;
     };
 
@@ -405,13 +409,16 @@ __global__ void fusedDiTQKNormFullDimRopeKernel(__nv_bfloat16* qkv, int const nu
             else
             {
                 // rotate-half (LTX-2 SPLIT): partner element at +HEAD_DIM/2 within head.
-                // Inline partner exchange to avoid 8-reg partner array (Step 1 reg-pressure opt).
+                // Inline partner exchange to avoid 8-reg partner array (reg-pressure opt).
+                // Use __activemask(): lanes that took the chunk loop's `continue` (partial warps, small N)
+                // are not converged here. NOTE(review): if a lane's xor partner is absent, p is undefined -- confirm.
                 constexpr int xor_mask = HEAD_DIM / 16;
                 bool const negate = ((laneId & xor_mask) == 0);
+                unsigned const activeMask = __activemask();
 #pragma unroll
                 for (int i = 0; i < CHUNK_ELEMS; i++)
                 {
-                    float p = __shfl_xor_sync(0xffffffff, elements[i], xor_mask);
+                    float p = __shfl_xor_sync(activeMask, elements[i], xor_mask);
                     if (negate)
                     {
                         p = -p;
 
@@ -37,7 +37,7 @@ namespace kernels
 //
 // PER_HEAD_COS=false: cos/sin shape [num_tokens, HEAD_DIM] (FLUX-style, head broadcast).
 // PER_HEAD_COS=true:  cos/sin shape [num_tokens, num_heads * HEAD_DIM] (LTX-2 3D RoPE).
-// CosT: float (fp32 cos) or __nv_bfloat16 (B-2: kernel upcasts in registers).
+// CosT: float (fp32 cos) or __nv_bfloat16 (kernel upcasts bf16 to fp32 in registers, lossless).
 template <int HEAD_DIM, bool INTERLEAVE, bool PER_HEAD_COS, typename CosT>
 __global__ void fusedDiTSplitNormFullDimRopeKernel(__nv_bfloat16* __restrict__ tensor, int const num_tokens,
     int const num_heads, float const eps, __nv_bfloat16 const* __restrict__ weight, CosT const* __restrict__ cos_emb,
@@ -188,12 +188,15 @@ __global__ void fusedDiTSplitNormFullDimRopeKernel(__nv_bfloat16* __restrict__ t
         {
             // rotate-half: partner element at +HEAD_DIM/2 within the same head.
             // Inline partner exchange (single reg `p` per iter, no array).
+            // Use __activemask(): lanes that took the chunk loop's `continue` (partial warps, small
+            // num_heads*HEAD_DIM) are not converged here. NOTE(review): if a lane's xor partner is absent, p is undefined -- confirm.
             constexpr int xor_mask = HEAD_DIM / 16;
             bool const negate = ((laneId & xor_mask) == 0);
+            unsigned const activeMask = __activemask();
 #pragma unroll
             for (int i = 0; i < CHUNK_ELEMS; i++)
             {
-                float p = __shfl_xor_sync(0xffffffff, elements[i], xor_mask);
+                float p = __shfl_xor_sync(activeMask, elements[i], xor_mask);
                 if (negate)
                 {
                     p = -p;
 
@@ -39,7 +39,7 @@ void fused_dit_split_norm_rope(torch::Tensor& tensor, int64_t num_heads, int64_t
 
     CHECK_INPUT(tensor, torch::kBFloat16);
     CHECK_INPUT(weight, torch::kBFloat16);
-    // Cos/sin may be fp32 (legacy) or bf16 (B-2: kernel upcasts in registers).
+    // Cos/sin may be fp32 or bf16 (kernel upcasts bf16 to fp32 in registers, lossless).
     auto const cos_dtype = cos_emb.scalar_type();
     TORCH_CHECK(cos_dtype == torch::kFloat32 || cos_dtype == torch::kBFloat16,
         "cos_emb dtype must be float32 or bfloat16, got ", cos_dtype);
 
@@ -32,12 +32,6 @@ class TransformerArgs:
     cross_scale_shift_timestep: torch.Tensor | None
     cross_gate_timestep: torch.Tensor | None
     enabled: bool
-    # Sharded-local 2D contiguous [T_local, H*D] forms of *positional_embeddings*
-    # and *cross_positional_embeddings*, computed once in prepare_text_cache
-    # (loop-external) and threaded through to fused norm+rope kernels. The 4D
-    # forms above stay around for the unfused fallback (apply_rotary_emb).
-    positional_embeddings_2d: tuple[torch.Tensor, torch.Tensor] | None = None
-    cross_positional_embeddings_2d: tuple[torch.Tensor, torch.Tensor] | None = None
 
 
 class TransformerArgsPreprocessor:
@@ -161,14 +155,11 @@ def prepare(
         static_mask: torch.Tensor | None,
         static_pe: tuple[torch.Tensor, torch.Tensor],
         static_cross_pe: tuple[torch.Tensor, torch.Tensor] | None = None,
-        static_pe_2d: tuple[torch.Tensor, torch.Tensor] | None = None,
-        static_cross_pe_2d: tuple[torch.Tensor, torch.Tensor] | None = None,
     ) -> TransformerArgs:
         """Build TransformerArgs for one denoise step.
 
         Step-invariant static args are always required.  *static_cross_pe*
-        and *static_pe_2d* / *static_cross_pe_2d* are only meaningful when
-        provided by the caller; ignored in this base class for *_cross_pe.
+        is only used by the MultiModal subclass; ignored here.
         """
         x = self.patchify_proj(modality.latent.contiguous())
         timestep, embedded_timestep = self._prepare_timestep(
@@ -185,8 +176,6 @@ def prepare(
             cross_scale_shift_timestep=None,
             cross_gate_timestep=None,
             enabled=modality.enabled,
-            positional_embeddings_2d=static_pe_2d,
-            cross_positional_embeddings_2d=None,
         )
 
 
@@ -266,16 +255,13 @@ def prepare(
         static_mask: torch.Tensor | None,
         static_pe: tuple[torch.Tensor, torch.Tensor],
         static_cross_pe: tuple[torch.Tensor, torch.Tensor],
-        static_pe_2d: tuple[torch.Tensor, torch.Tensor] | None = None,
-        static_cross_pe_2d: tuple[torch.Tensor, torch.Tensor] | None = None,
     ) -> TransformerArgs:
         """Build TransformerArgs for one denoise step with pre-computed static outputs."""
         transformer_args = self.simple_preprocessor.prepare(
             modality,
             static_context=static_context,
             static_mask=static_mask,
             static_pe=static_pe,
-            static_pe_2d=static_pe_2d,
         )
         cross_scale_shift_timestep, cross_gate_timestep = self._prepare_cross_attention_timestep(
             timestep=modality.timesteps,
@@ -288,7 +274,6 @@ def prepare(
             cross_positional_embeddings=static_cross_pe,
             cross_scale_shift_timestep=cross_scale_shift_timestep,
             cross_gate_timestep=cross_gate_timestep,
-            cross_positional_embeddings_2d=static_cross_pe_2d,
         )
 
     def _prepare_cross_attention_timestep(
 
@@ -379,16 +379,12 @@ def _clone_value(v):
                 video_context=v.video_context.clone() if v.video_context is not None else None,
                 video_mask=v.video_mask.clone() if v.video_mask is not None else None,
                 video_pe=clone_pair(v.video_pe),
-                video_pe_2d=clone_pair(v.video_pe_2d),
                 video_cross_pe=clone_pair(v.video_cross_pe),
-                video_cross_pe_2d=clone_pair(v.video_cross_pe_2d),
                 video_kv=[clone_pair(kv) for kv in v.video_kv] if v.video_kv is not None else None,
                 audio_context=v.audio_context.clone() if v.audio_context is not None else None,
                 audio_mask=v.audio_mask.clone() if v.audio_mask is not None else None,
                 audio_pe=clone_pair(v.audio_pe),
-                audio_pe_2d=clone_pair(v.audio_pe_2d),
                 audio_cross_pe=clone_pair(v.audio_cross_pe),
-                audio_cross_pe_2d=clone_pair(v.audio_cross_pe_2d),
                 audio_kv=[clone_pair(kv) for kv in v.audio_kv] if v.audio_kv is not None else None,
             )
         if isinstance(v, torch.Tensor):
@@ -417,9 +413,7 @@ def _copy_value(dst, src):
             if dst.video_mask is not None and src.video_mask is not None:
                 dst.video_mask.copy_(src.video_mask)
             copy_pair(dst.video_pe, src.video_pe)
-            copy_pair(dst.video_pe_2d, src.video_pe_2d)
             copy_pair(dst.video_cross_pe, src.video_cross_pe)
-            copy_pair(dst.video_cross_pe_2d, src.video_cross_pe_2d)
             if dst.video_kv is not None and src.video_kv is not None:
                 for d, s in zip(dst.video_kv, src.video_kv):
                     copy_pair(d, s)
@@ -428,9 +422,7 @@ def _copy_value(dst, src):
             if dst.audio_mask is not None and src.audio_mask is not None:
                 dst.audio_mask.copy_(src.audio_mask)
             copy_pair(dst.audio_pe, src.audio_pe)
-            copy_pair(dst.audio_pe_2d, src.audio_pe_2d)
             copy_pair(dst.audio_cross_pe, src.audio_cross_pe)
-            copy_pair(dst.audio_cross_pe_2d, src.audio_cross_pe_2d)
             if dst.audio_kv is not None and src.audio_kv is not None:
                 for d, s in zip(dst.audio_kv, src.audio_kv):
                     copy_pair(d, s)
 
@@ -20,36 +20,37 @@
 class TextCache:
     """Pre-computed text-derived tensors that are constant across denoise steps.
 
+    The ``*_pe`` fields hold sharded-local positional embeddings in the form
+    the consumer wants:
+
+      - ``fuse_qk_norm_rope=True`` (LTX-2 default): 2D ``[T_local, H*D]``
+        contiguous, fed directly to the fused norm+rope kernel.
+      - ``fuse_qk_norm_rope=False``: 4D ``[B, T_local, H, D]`` sharded but
+        otherwise unchanged, for the naive ``apply_rotary_emb`` path.
+
+    The form is chosen once at cache-build time (``LTXModel.prepare_text_cache``),
+    so no per-step reshape, ``.contiguous()``, or shard slicing is needed.
+
     Attributes:
         video_context: Projected text embedding for video cross-attention.
         video_mask: Attention mask for video text cross-attention.
-        video_pe: RoPE (cos, sin) for video.  4D form [1, T, H, D], un-sharded.
-        video_pe_2d: Sharded-local 2D contiguous form [T_local, H*D] of video_pe,
-            fed directly to fused norm+rope kernels — skips per-step reshape +
-            ``.contiguous()`` in the hot helper.
+        video_pe: Sharded-local RoPE (cos, sin) for video self-attn.
+        video_cross_pe: Sharded-local RoPE for video AV cross-attn (audio-video model only).
         audio_context: Projected text embedding for audio cross-attention.
         audio_mask: Attention mask for audio text cross-attention.
-        audio_pe: RoPE (cos, sin) for audio.  4D form, un-sharded.
-        audio_pe_2d: Sharded-local 2D contiguous form of audio_pe.
-        video_cross_pe: Cross-modal RoPE for video (audio-video model only).
-        video_cross_pe_2d: Sharded-local 2D contiguous form of video_cross_pe.
-        audio_cross_pe: Cross-modal RoPE for audio (audio-video model only).
-        audio_cross_pe_2d: Sharded-local 2D contiguous form of audio_cross_pe.
+        audio_pe: Sharded-local RoPE (cos, sin) for audio self-attn.
+        audio_cross_pe: Sharded-local RoPE for audio AV cross-attn (audio-video model only).
         video_kv: Per-layer pre-projected text K/V for video cross-attention.
         audio_kv: Per-layer pre-projected text K/V for audio cross-attention.
     """
 
     video_context: Optional[torch.Tensor] = None
     video_mask: Optional[torch.Tensor] = None
     video_pe: Optional[tuple[torch.Tensor, torch.Tensor]] = None
-    video_pe_2d: Optional[tuple[torch.Tensor, torch.Tensor]] = None
     video_cross_pe: Optional[tuple[torch.Tensor, torch.Tensor]] = None
-    video_cross_pe_2d: Optional[tuple[torch.Tensor, torch.Tensor]] = None
     video_kv: Optional[list[tuple[torch.Tensor, torch.Tensor]]] = None
     audio_context: Optional[torch.Tensor] = None
     audio_mask: Optional[torch.Tensor] = None
     audio_pe: Optional[tuple[torch.Tensor, torch.Tensor]] = None
-    audio_pe_2d: Optional[tuple[torch.Tensor, torch.Tensor]] = None
     audio_cross_pe: Optional[tuple[torch.Tensor, torch.Tensor]] = None
-    audio_cross_pe_2d: Optional[tuple[torch.Tensor, torch.Tensor]] = None
     audio_kv: Optional[list[tuple[torch.Tensor, torch.Tensor]]] = None