[TRTLLM-12127][fix] VisualGen metadata updates (NVIDIA#12862)

o-stoner · web-flow · commit 6e5a3392b4c9 · 2026-04-21T13:38:33.000-07:00
Signed-off-by: Olivia Stoner <245287810+o-stoner@users.noreply.github.com>
diff --git a/tensorrt_llm/_torch/visual_gen/attention_backend/trtllm.py b/tensorrt_llm/_torch/visual_gen/attention_backend/trtllm.py
@@ -45,40 +45,51 @@ class TrtllmAttentionMetadata:
         max_batch_size: Initial batch size hint. Will grow automatically if exceeded.
         max_seq_len: Initial sequence length hint. Will grow automatically if exceeded.
         device: Target device for tensors.
+        attention_metadata_state: Mutable model-scoped state shared by all
+            attention layers in one model instance.
     """
 
     def __init__(
         self,
         max_batch_size: int = 16,
         max_seq_len: int = 4096,
         device: Optional[torch.device] = None,
+        attention_metadata_state: Optional[dict] = None,
     ):
         # These are initial hints, not hard limits - capacity grows as needed
         self.max_batch_size = max_batch_size
         self.max_seq_len = max_seq_len
         self.device = device or torch.device("cuda")
+        if attention_metadata_state is None:
+            raise ValueError(
+                "TRTLLM attention requires `attention_metadata_state` to be provided "
+                "by visual-gen config for model-scoped metadata sharing."
+            )
+        self._metadata_state = attention_metadata_state
 
         # Lazily created BaseTrtllmAttentionMetadata
-        self._metadata: Optional[BaseTrtllmAttentionMetadata] = None
-
-        # Track allocated capacity
-        self._allocated_batch_size = 0
-        self._allocated_max_seq_len = 0
+        self._metadata: Optional[BaseTrtllmAttentionMetadata] = self._metadata_state["metadata"]
 
         # Track prepared state
         self._cached_seq_lens: Optional[torch.Tensor] = None
         self._prepared = False
 
     def _needs_new_metadata(self, batch_size: int, max_seq_len: int) -> bool:
         """Check if we need to create new metadata (capacity change)."""
+        metadata = self._metadata_state["metadata"]
+        allocated_batch_size, allocated_max_seq_len = self._metadata_state["capacity"]
         return (
-            self._metadata is None
-            or batch_size > self._allocated_batch_size
-            or max_seq_len > self._allocated_max_seq_len
+            metadata is None
+            or batch_size > allocated_batch_size
+            or max_seq_len > allocated_max_seq_len
         )
 
     def _needs_prepare(self, batch_size: int, seq_lens: torch.Tensor) -> bool:
-        """Check if we need to call prepare() (seq_lens changed)."""
+        """Check if we need to call prepare() (seq_lens changed).
+
+        Assumes uniform sequence length per batch; if per-sample lengths vary,
+        we may need to check seq_lens tensor instead.
+        """
         if not self._prepared:
             return True
         if self._cached_seq_lens is None:
@@ -89,9 +100,9 @@ def _needs_prepare(self, batch_size: int, seq_lens: torch.Tensor) -> bool:
 
     def _create_metadata(self, batch_size: int, max_seq_len: int) -> None:
         """Create new metadata with given capacity."""
-        # Allocate with some headroom to avoid frequent reallocation
-        alloc_batch = max(batch_size, self._allocated_batch_size)
-        alloc_seq_len = max(max_seq_len, self._allocated_max_seq_len)
+        prev_batch, prev_seq = self._metadata_state["capacity"]
+        alloc_batch = max(batch_size, prev_batch)
+        alloc_seq_len = max(max_seq_len, prev_seq)
 
         self._metadata = BaseTrtllmAttentionMetadata(
             max_num_requests=alloc_batch,
@@ -102,8 +113,8 @@ def _create_metadata(self, batch_size: int, max_seq_len: int) -> None:
             runtime_features=AttentionRuntimeFeatures(),
         )
 
-        self._allocated_batch_size = alloc_batch
-        self._allocated_max_seq_len = alloc_seq_len
+        self._metadata_state["metadata"] = self._metadata
+        self._metadata_state["capacity"] = (alloc_batch, alloc_seq_len)
         self._prepared = False  # Reset prepare state on new metadata
 
     def prepare(
@@ -116,7 +127,7 @@ def prepare(
 
         Lazy behavior:
         - Creates metadata only when capacity needs increase
-        - Calls prepare() only when seq_lens actually change
+        - Calls prepare() only when (batch_size, max_seq_len) actually change
         """
         if isinstance(seq_lens, int):
             seq_lens_tensor = torch.full((batch_size,), seq_lens, dtype=torch.int32)
@@ -127,6 +138,8 @@ def prepare(
 
         if self._needs_new_metadata(batch_size, max_seq_len):
             self._create_metadata(batch_size, max_seq_len)
+        else:
+            self._metadata = self._metadata_state["metadata"]
 
         if self._needs_prepare(batch_size, seq_lens_tensor):
             self._metadata.seq_lens = seq_lens_tensor
@@ -165,6 +178,7 @@ def __init__(
         dtype: Optional[torch.dtype] = None,
         max_batch_size: int = 16,
         max_seq_len: int = 4096,
+        attention_metadata_state: Optional[dict] = None,
     ):
         num_kv_heads = num_kv_heads or num_heads
 
@@ -183,6 +197,7 @@ def __init__(
         self.metadata = TrtllmAttentionMetadata(
             max_batch_size=max_batch_size,
             max_seq_len=max_seq_len,
+            attention_metadata_state=attention_metadata_state,
         )
 
     # Needed to work with torch compile cause of attention metadata
diff --git a/tensorrt_llm/_torch/visual_gen/attention_backend/utils.py b/tensorrt_llm/_torch/visual_gen/attention_backend/utils.py
@@ -26,6 +26,7 @@
 
 from tensorrt_llm.models.modeling_utils import QuantConfig
 
+from ..config import AttentionConfig
 from .interface import AttentionBackend
 
 
@@ -77,6 +78,8 @@ def create_attention(
     dtype: Optional[torch.dtype] = None,
     max_batch_size: int = 16,
     max_seq_len: int = 4096,
+    attention_config: Optional[AttentionConfig] = None,
+    attention_metadata_state: Optional[dict] = None,
     **kwargs,
 ) -> AttentionBackend:
     """
@@ -97,13 +100,24 @@ def create_attention(
             will automatically reallocate if larger batches are encountered.
         max_seq_len: Initial sequence length for metadata pre-allocation. The backend
             will automatically reallocate if longer sequences are encountered.
+        attention_config: Optional AttentionConfig
+        attention_metadata_state: Optional model-scoped metadata state from
+            visual-gen config. Required for TRTLLM backend.
         **kwargs: Additional backend-specific arguments
 
     Returns:
         AttentionBackend instance
     """
     attn_cls = get_visual_gen_attention_backend(backend)
 
+    if backend.upper() == "TRTLLM":
+        if attention_metadata_state is None:
+            raise ValueError(
+                "TRTLLM backend requires `attention_metadata_state` from "
+                "DiffusionModelConfig; creation path must not allocate metadata implicitly."
+            )
+        kwargs["attention_metadata_state"] = attention_metadata_state
+
     return attn_cls(
         layer_idx=layer_idx,
         num_heads=num_heads,
diff --git a/tensorrt_llm/_torch/visual_gen/config.py b/tensorrt_llm/_torch/visual_gen/config.py
@@ -536,6 +536,11 @@ def discover_pipeline_components(checkpoint_path: Path) -> Dict[str, Path]:
     return components
 
 
+def create_attention_metadata_state() -> Dict[str, Any]:
+    """Create model-scoped attention metadata state for TRTLLM visual-gen backend."""
+    return {"metadata": None, "capacity": (0, 0)}
+
+
 # =============================================================================
 # DiffusionModelConfig - Internal configuration (merged/parsed)
 # =============================================================================
@@ -579,6 +584,7 @@ class DiffusionModelConfig(BaseModel):
     cuda_graph: CudaGraphConfig = PydanticField(default_factory=CudaGraphConfig)
     pipeline: PipelineConfig = PydanticField(default_factory=PipelineConfig)
     attention: AttentionConfig = PydanticField(default_factory=AttentionConfig)
+    attention_metadata_state: Optional[Dict[str, Any]] = None
     parallel: ParallelConfig = PydanticField(default_factory=ParallelConfig)
     cache: Optional[CacheConfig] = None
 
@@ -935,6 +941,10 @@ def from_pretrained(
 
             NVFP4LinearMethod.use_tunable_quantize = True
 
+        attention_metadata_state = (
+            create_attention_metadata_state() if attention_cfg.backend == "TRTLLM" else None
+        )
+
         return cls(
             pretrained_config=pretrained_config,
             quant_config=quant_config,
@@ -947,6 +957,7 @@ def from_pretrained(
             cuda_graph=cuda_graph_cfg,
             pipeline=pipeline_cfg,
             attention=attention_cfg,
+            attention_metadata_state=attention_metadata_state,
             parallel=parallel_cfg,
             cache=cache_cfg,
             skip_create_weights_in_init=True,
diff --git a/tensorrt_llm/_torch/visual_gen/models/ltx2/transformer_ltx2.py b/tensorrt_llm/_torch/visual_gen/models/ltx2/transformer_ltx2.py
@@ -131,6 +131,8 @@ def __init__(
                 num_kv_heads=self.num_key_value_heads,
                 quant_config=self.quant_config,
                 dtype=self.dtype,
+                attention_config=config.attention,
+                attention_metadata_state=config.attention_metadata_state,
             )
             self._has_dual_attn = True
 
diff --git a/tensorrt_llm/_torch/visual_gen/modules/attention.py b/tensorrt_llm/_torch/visual_gen/modules/attention.py
@@ -95,6 +95,8 @@ def __init__(
 
         self._init_qkv_proj()
 
+        attention_metadata_state = getattr(config, "attention_metadata_state", None)
+
         if self.qk_norm:
             # "full": norm over all heads combined (e.g. WAN, dim=q_dim)
             # "per_head": norm over each head independently (e.g. FLUX, dim=head_dim)
@@ -141,6 +143,8 @@ def __init__(
             num_kv_heads=backend_num_kv_heads,
             quant_config=self.quant_config,
             dtype=self.dtype,
+            attention_config=config.attention,
+            attention_metadata_state=attention_metadata_state,
         )
 
         # Wrap with parallelism strategies (orthogonal to backend choice)
diff --git a/tests/unittest/_torch/visual_gen/multi_gpu/test_flux_ulysses.py b/tests/unittest/_torch/visual_gen/multi_gpu/test_flux_ulysses.py
@@ -27,6 +27,7 @@
         AttentionConfig,
         DiffusionModelConfig,
         TorchCompileConfig,
+        create_attention_metadata_state,
     )
     from tensorrt_llm._torch.visual_gen.mapping import VisualGenMapping
     from tensorrt_llm._utils import get_free_port
@@ -152,6 +153,9 @@ def _make_model_config(pretrained_dict, ulysses_size=1, backend="VANILLA"):
         attention=AttentionConfig(backend=backend),
         visual_gen_mapping=vgm,
         cache=None,
+        attention_metadata_state=(
+            create_attention_metadata_state() if backend.upper() == "TRTLLM" else None
+        ),
         skip_create_weights_in_init=False,
     )
     config.mapping = vgm.to_llm_mapping()
diff --git a/tests/unittest/_torch/visual_gen/test_attention_integration.py b/tests/unittest/_torch/visual_gen/test_attention_integration.py
@@ -19,7 +19,11 @@
 # Flash Attention 4 availability
 # ============================================================================
 from tensorrt_llm._torch.visual_gen.attention_backend.flash_attn4 import _flash_attn_fwd as _fa4_fwd
-from tensorrt_llm._torch.visual_gen.config import AttentionConfig, DiffusionModelConfig
+from tensorrt_llm._torch.visual_gen.config import (
+    AttentionConfig,
+    DiffusionModelConfig,
+    create_attention_metadata_state,
+)
 
 # Import new integrated versions
 from tensorrt_llm._torch.visual_gen.modules.attention import Attention, QKVMode, apply_rotary_emb
@@ -128,6 +132,9 @@ def create_model_config(
         attention=AttentionConfig(backend=attn_backend),
         skip_create_weights_in_init=False,
     )
+    config.attention_metadata_state = (
+        create_attention_metadata_state() if attn_backend == "TRTLLM" else None
+    )
     return config
 
 
diff --git a/tests/unittest/_torch/visual_gen/test_attention_perf.py b/tests/unittest/_torch/visual_gen/test_attention_perf.py
@@ -43,7 +43,11 @@
 from tensorrt_llm._torch.visual_gen.attention_backend.flash_attn4 import (
     _flash_attn_fwd_import_error as _fa4_import_error,
 )
-from tensorrt_llm._torch.visual_gen.config import AttentionConfig, DiffusionModelConfig
+from tensorrt_llm._torch.visual_gen.config import (
+    AttentionConfig,
+    DiffusionModelConfig,
+    create_attention_metadata_state,
+)
 from tensorrt_llm._torch.visual_gen.modules.attention import Attention, QKVMode
 
 _flash_attn4_available = _fa4_fwd is not None
@@ -155,6 +159,9 @@ def create_model_config(
         attention=AttentionConfig(backend=attn_backend),
         skip_create_weights_in_init=False,
     )
+    config.attention_metadata_state = (
+        create_attention_metadata_state() if attn_backend == "TRTLLM" else None
+    )
     return config
 
 
diff --git a/tests/unittest/_torch/visual_gen/test_flux_attention.py b/tests/unittest/_torch/visual_gen/test_flux_attention.py
@@ -20,7 +20,11 @@
 import torch
 import torch.nn.functional as F
 
-from tensorrt_llm._torch.visual_gen.config import AttentionConfig, DiffusionModelConfig
+from tensorrt_llm._torch.visual_gen.config import (
+    AttentionConfig,
+    DiffusionModelConfig,
+    create_attention_metadata_state,
+)
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.modeling_utils import QuantConfig
 
@@ -103,6 +107,7 @@ def test_trtllm_backend_sanity(self):
 
         torch.manual_seed(42)
         config = self._create_config("TRTLLM")
+        config.attention_metadata_state = create_attention_metadata_state()
 
         attn = (
             FluxJointAttention(
@@ -175,6 +180,7 @@ def test_backend_equivalence(self):
                 p.normal_(0, 0.02)
 
         config = self._create_config("TRTLLM")
+        config.attention_metadata_state = create_attention_metadata_state()
         trtllm_attn = (
             FluxJointAttention(
                 hidden_size=dim,
diff --git a/tests/unittest/_torch/visual_gen/test_ltx2_attention.py b/tests/unittest/_torch/visual_gen/test_ltx2_attention.py
@@ -16,7 +16,11 @@
 import torch
 import torch.nn.functional as F
 
-from tensorrt_llm._torch.visual_gen.config import AttentionConfig, DiffusionModelConfig
+from tensorrt_llm._torch.visual_gen.config import (
+    AttentionConfig,
+    DiffusionModelConfig,
+    create_attention_metadata_state,
+)
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.modeling_utils import QuantConfig
 
@@ -102,6 +106,7 @@ def test_trtllm_self_attention_sanity(self):
 
         torch.manual_seed(42)
         config = _create_config("TRTLLM")
+        config.attention_metadata_state = create_attention_metadata_state()
 
         attn = (
             LTX2Attention(
@@ -287,6 +292,7 @@ def test_backend_equivalence(self):
 
         # Create TRTLLM attention and copy weights
         config_trtllm = _create_config("TRTLLM")
+        config_trtllm.attention_metadata_state = create_attention_metadata_state()
         trtllm_attn = (
             LTX2Attention(
                 query_dim=query_dim,

Original file line number	Diff line number	Diff line change
`@@ -131,6 +131,8 @@ def __init__(`
`131`	`131`	`num_kv_heads=self.num_key_value_heads,`
`132`	`132`	`quant_config=self.quant_config,`
`133`	`133`	`dtype=self.dtype,`
	`134`	`+ attention_config=config.attention,`
	`135`	`+ attention_metadata_state=config.attention_metadata_state,`
`134`	`136`	`)`
`135`	`137`	`self._has_dual_attn = True`
`136`	`138`