Skip to content

Commit e81069d

Browse files
committed
Clean up _decode_latents, fix LTX2 compat, add LTX2 batch support
- Remove unused batch_size param from _decode_latents in all pipelines (FLUX1, FLUX2, WAN T2V, WAN I2V)
- Fix outdated docstrings to reflect always-batched output
- Fix LTX2 postprocess_video_tensor call (remove_batch_dim removed)
- Add batch generation support to LTX2 pipeline (batch=batch_size)
- Add lightweight LTX2 batch unit tests (no model loading required)

Signed-off-by: Kanghwan Jang <861393+karljang@users.noreply.github.com>
1 parent d3243b9 commit e81069d

File tree

6 files changed

+80
-19
lines changed

6 files changed

+80
-19
lines changed

tensorrt_llm/_torch/visual_gen/models/flux/pipeline_flux.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -333,9 +333,7 @@ def forward_fn(
333333
# Decode
334334
logger.info("Decoding image...")
335335
decode_start = time.time()
336-
image = self.decode_latents(
337-
latents, lambda lat: self._decode_latents(lat, height, width, batch_size)
338-
)
336+
image = self.decode_latents(latents, lambda lat: self._decode_latents(lat, height, width))
339337

340338
if self.rank == 0:
341339
logger.info(f"Image decoded in {time.time() - decode_start:.2f}s")
@@ -524,19 +522,16 @@ def _prepare_latents(
524522

525523
return latents, latent_ids
526524

527-
def _decode_latents(
528-
self, latents: torch.Tensor, height: int, width: int, batch_size: int = 1
529-
) -> torch.Tensor:
525+
def _decode_latents(self, latents: torch.Tensor, height: int, width: int) -> torch.Tensor:
530526
"""Decode latents to image tensor.
531527
532528
Args:
533529
latents: Packed latents [B, seq, 64].
534530
height: Output image height.
535531
width: Output image width.
536-
batch_size: Number of images in batch.
537532
538533
Returns:
539-
Image tensor (H, W, C) for single image, (B, H, W, C) for batch.
534+
Image tensor (B, H, W, C).
540535
"""
541536
# Unpack latents: (batch, seq_len, channels) -> (batch, channels, h, w)
542537
latents = self._unpack_latents(latents, height, width)

tensorrt_llm/_torch/visual_gen/models/flux/pipeline_flux2.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -424,9 +424,7 @@ def forward_fn(
424424
# Decode
425425
logger.info("Decoding image...")
426426
decode_start = time.time()
427-
image = self.decode_latents(
428-
latents, lambda lat: self._decode_latents(lat, latent_ids, batch_size)
429-
)
427+
image = self.decode_latents(latents, lambda lat: self._decode_latents(lat, latent_ids))
430428

431429
if self.rank == 0:
432430
logger.info(f"Image decoded in {time.time() - decode_start:.2f}s")
@@ -659,17 +657,15 @@ def _decode_latents(
659657
self,
660658
latents: torch.Tensor,
661659
latent_ids: torch.Tensor,
662-
batch_size: int = 1,
663660
) -> torch.Tensor:
664661
"""Decode latents to image tensor.
665662
666663
Args:
667664
latents: Packed latents [B, seq, C].
668665
latent_ids: Position IDs [seq, 4].
669-
batch_size: Number of images in batch.
670666
671667
Returns:
672-
Image tensor (H, W, C) for single image, (B, H, W, C) for batch.
668+
Image tensor (B, H, W, C).
673669
"""
674670
# Unpack latents using position IDs
675671
latents = self._unpack_latents_with_ids(latents, latent_ids)

tensorrt_llm/_torch/visual_gen/models/ltx2/pipeline_ltx2.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1402,7 +1402,7 @@ def decode_video_fn(vid_latents):
14021402
)
14031403
)
14041404
video = torch.cat(chunks, dim=2)
1405-
video = postprocess_video_tensor(video, remove_batch_dim=True)
1405+
video = postprocess_video_tensor(video)
14061406
return video
14071407

14081408
def decode_audio_fn(aud_latents):

tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -483,7 +483,7 @@ def forward_fn(
483483
# Decode
484484
logger.info("Decoding video...")
485485
decode_start = time.time()
486-
video = self.decode_latents(latents, lambda lat: self._decode_latents(lat, batch_size))
486+
video = self.decode_latents(latents, self._decode_latents)
487487

488488
if self.rank == 0:
489489
logger.info(f"Video decoded in {time.time() - decode_start:.2f}s")
@@ -566,7 +566,7 @@ def _prepare_latents(
566566
return randn_tensor(shape, generator=generator, device=self.device, dtype=self.dtype)
567567

568568
@nvtx_range("_decode_latents", color="blue")
569-
def _decode_latents(self, latents: torch.Tensor, batch_size: int = 1) -> torch.Tensor:
569+
def _decode_latents(self, latents: torch.Tensor) -> torch.Tensor:
570570
"""Decode latents to video tensor."""
571571
latents = latents.to(self.vae.dtype)
572572

tensorrt_llm/_torch/visual_gen/models/wan/pipeline_wan_i2v.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -647,7 +647,7 @@ def forward_fn(
647647
# Decode
648648
logger.info("Decoding video...")
649649
decode_start = time.time()
650-
video = self.decode_latents(latents, lambda lat: self._decode_latents(lat, batch_size))
650+
video = self.decode_latents(latents, self._decode_latents)
651651

652652
if self.rank == 0:
653653
logger.info(f"Video decoded in {time.time() - decode_start:.2f}s")
@@ -828,7 +828,7 @@ def _prepare_latents(
828828

829829
return latents, condition
830830

831-
def _decode_latents(self, latents, batch_size=1):
831+
def _decode_latents(self, latents):
832832
"""Decode latents to video."""
833833
latents = latents.to(self.vae.dtype)
834834

tests/unittest/_torch/visual_gen/test_ltx2_pipeline.py

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -413,5 +413,75 @@ def test_attention_backend_comparison(self, ltx2_bf16_checkpoint_exists):
413413
torch.cuda.empty_cache()
414414

415415

416+
# ============================================================================
417+
# Batch Support Unit Tests (no model loading required)
418+
# ============================================================================
419+
420+
421+
class TestLTX2BatchSupport:
422+
"""Test batch support logic without loading the full pipeline."""
423+
424+
def test_video_pixel_shape_batch_propagation(self):
425+
"""VideoPixelShape(batch=N) propagates through VideoLatentShape."""
426+
from tensorrt_llm._torch.visual_gen.models.ltx2.ltx2_core.types import (
427+
VideoLatentShape,
428+
VideoPixelShape,
429+
)
430+
431+
for batch_size in [1, 2, 4]:
432+
pixel_shape = VideoPixelShape(
433+
batch=batch_size, frames=9, height=512, width=768, fps=24.0
434+
)
435+
video_shape = VideoLatentShape.from_pixel_shape(pixel_shape, latent_channels=128)
436+
assert video_shape.batch == batch_size
437+
torch_shape = video_shape.to_torch_shape()
438+
assert torch_shape[0] == batch_size
439+
440+
def test_prompt_normalization(self):
441+
"""forward() normalizes str prompt to List[str] and computes batch_size."""
442+
# Simulate the normalization logic from forward()
443+
for prompt_input, expected_batch in [
444+
("a cat", 1),
445+
(["a cat"], 1),
446+
(["a cat", "a dog"], 2),
447+
]:
448+
prompt = prompt_input
449+
if isinstance(prompt, str):
450+
prompt = [prompt]
451+
assert len(prompt) == expected_batch
452+
453+
def test_negative_prompt_expansion(self):
454+
"""Negative prompt is expanded to match batch_size."""
455+
# Simulate the negative prompt expansion logic from forward()
456+
for neg_input, batch_size, expected_len in [
457+
("bad quality", 1, 1),
458+
("bad quality", 3, 3),
459+
(["bad quality"], 3, 3),
460+
(["bad 1", "bad 2", "bad 3"], 3, 3),
461+
]:
462+
negative_prompt = neg_input
463+
if isinstance(negative_prompt, str):
464+
neg_prompt_list = [negative_prompt] * batch_size
465+
else:
466+
neg_prompt_list = list(negative_prompt)
467+
if len(neg_prompt_list) == 1 and batch_size > 1:
468+
neg_prompt_list = neg_prompt_list * batch_size
469+
assert len(neg_prompt_list) == expected_len
470+
471+
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
472+
def test_latent_shape_matches_batch(self):
473+
"""Latents created from VideoLatentShape have correct batch dim."""
474+
from tensorrt_llm._torch.visual_gen.models.ltx2.ltx2_core.types import (
475+
VideoLatentShape,
476+
VideoPixelShape,
477+
)
478+
479+
batch_size = 2
480+
pixel_shape = VideoPixelShape(batch=batch_size, frames=9, height=512, width=768, fps=24.0)
481+
video_shape = VideoLatentShape.from_pixel_shape(pixel_shape, latent_channels=128)
482+
latents = torch.randn(video_shape.to_torch_shape(), device="cuda", dtype=torch.float32)
483+
assert latents.shape[0] == batch_size
484+
485+
416486
if __name__ == "__main__":
417487
pytest.main([__file__, "-v"])

0 commit comments

Comments (0)