Skip to content

Commit 72dc7ec

Browse files
authored
[None][fix] Fix multi_stream_moe accuracy with MLIR and piecewise cudagraphs (NVIDIA#12847)
Signed-off-by: Suyog Gupta <41447211+suyoggupta@users.noreply.github.com>
1 parent c4889d9 commit 72dc7ec

File tree

7 files changed

+449
-9
lines changed

7 files changed

+449
-9
lines changed

examples/auto_deploy/model_registry/configs/gemma4_moe.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,5 @@ transforms:
2626
enabled: true
2727
fuse_gemms:
2828
enabled: true
29+
multi_stream_moe:
30+
enabled: true

tensorrt_llm/_torch/auto_deploy/compile/backends/torch_cudagraph.py

Lines changed: 38 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,22 @@ def prepare(self) -> None:
345345
gm = GraphModule(model, copy.deepcopy(model.graph))
346346

347347
self.split_info = split_graph_at_dynamic_ops(gm)
348+
349+
# When multi-stream transforms reclassify ALL static partitions as
350+
# dynamic (e.g. multi_stream_moe + multi_stream_mla_attn on every
351+
# layer), there are zero capturable static segments. Piecewise CUDA
352+
# graphs are impossible — fall back to eager execution for
353+
# prefill/mixed batches (monolithic CG still handles decode).
354+
if not self.split_info.static_submod_indices:
355+
ad_logger.warning(
356+
"PiecewiseCapturedGraph: no static partitions after splitting "
357+
"(%d dynamic). Piecewise CUDA graphs disabled — prefill/mixed "
358+
"batches will run eagerly.",
359+
len(self.split_info.dynamic_submod_indices),
360+
)
361+
self._is_prepared = True
362+
return
363+
348364
self.split_gm = self.split_info.split_gm
349365

350366
graph_pool = torch.cuda.graph_pool_handle()
@@ -408,6 +424,17 @@ def prepare(self) -> None:
408424
self.split_info.static_submod_indices + self.split_info.dynamic_submod_indices
409425
)
410426
current_static_runner: Optional[ADPiecewiseRunner] = None
427+
# Fallback runner: the first available static runner. When
428+
# multi-stream transforms reclassify the initial static partition(s)
429+
# as dynamic (e.g. record_event_passthrough from multi_stream_mla_attn)
430+
# AND the static partitions between metadata-prep and attention have
431+
# no CUDA ops (skipped), there is no *preceding* static runner for the
432+
# first attention op. In that case we fall back to the nearest
433+
# *following* static runner — any runner in the shared graph pool can
434+
# host the pre-allocated output buffer.
435+
fallback_runner: Optional[ADPiecewiseRunner] = None
436+
if runner_by_idx:
437+
fallback_runner = runner_by_idx[min(runner_by_idx)]
411438
num_metadata_wrapped = 0
412439
for idx in all_submod_indices:
413440
if idx in runner_by_idx:
@@ -430,15 +457,23 @@ def prepare(self) -> None:
430457
)
431458
continue
432459

433-
assert current_static_runner is not None, (
434-
f"Dynamic {submod_name} has no preceding static runner — "
460+
effective_runner = current_static_runner or fallback_runner
461+
assert effective_runner is not None, (
462+
f"Dynamic {submod_name} has no static runner available — "
435463
f"cannot allocate out= buffer for stable output addresses"
436464
)
465+
if current_static_runner is None:
466+
ad_logger.info(
467+
"PiecewiseCapturedGraph: %s has no preceding static "
468+
"runner, using fallback runner (submod_%d)",
469+
submod_name,
470+
min(runner_by_idx),
471+
)
437472

438473
_inject_out_param(submod)
439474
wrapper = DynamicOpWrapper(
440475
submod,
441-
preceding_runner=current_static_runner,
476+
preceding_runner=effective_runner,
442477
dynamic_submod_id=idx,
443478
)
444479
setattr(self.split_gm, submod_name, wrapper)

tensorrt_llm/_torch/auto_deploy/compile/piecewise_utils.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,20 @@
8686
"auto_deploy::cuda_cached_causal_conv1d",
8787
]
8888

89+
# Multi-stream passthrough functions that switch the CUDA current stream.
90+
# Static partitions containing these functions cannot be captured as CUDA
91+
# graphs because the host-side stream synchronization required for
92+
correctness (caller_stream.synchronize()) is not capturable. Such
93+
# partitions are reclassified as dynamic so they run eagerly.
94+
_STREAM_SWITCH_FUNCTION_NAMES = frozenset(
95+
{
96+
"begin_aux_stream_passthrough",
97+
"end_aux_stream_passthrough",
98+
"wait_aux_stream_passthrough",
99+
"record_event_passthrough",
100+
}
101+
)
102+
89103

90104
def _get_all_dynamic_op_names() -> Set[str]:
91105
"""Return the full set of dynamic op qualified names."""
@@ -193,11 +207,17 @@ def needs_out_buffer(submod: nn.Module) -> bool:
193207
194208
Inplace ops (mutate input, return None) don't produce new tensors.
195209
Metadata prep ops are handled by MetadataWrapper (stable output addresses).
196-
Both are skipped — only attention/SSM/delta/logits ops need out= buffers.
210+
Multi-stream partitions reclassified as dynamic run eagerly and manage
211+
their own output tensors — they do not need out= buffers.
212+
All of these are skipped — only attention/SSM/delta/logits ops need out= buffers.
197213
"""
198214
if not isinstance(submod, GraphModule):
199215
return True
200216

217+
# Multi-stream partitions (reclassified from static) do not need out= buffers.
218+
if _submod_has_stream_switch(submod):
219+
return False
220+
201221
for node in submod.graph.nodes:
202222
if node.op == "call_function" and is_dynamic_cached_op(node):
203223
op_name = node.target.name() if hasattr(node.target, "name") else str(node.target)
@@ -232,6 +252,16 @@ def is_metadata_prep(submod: nn.Module) -> bool:
232252
# ---------------------------------------------------------------------------
233253

234254

255+
def _submod_has_stream_switch(submod: GraphModule) -> bool:
256+
"""Return True if *submod* contains a multi-stream passthrough function."""
257+
for node in submod.graph.nodes:
258+
if node.op == "call_function":
259+
func_name = getattr(node.target, "__name__", "")
260+
if func_name in _STREAM_SWITCH_FUNCTION_NAMES:
261+
return True
262+
return False
263+
264+
235265
@dataclass
236266
class SplitInfo:
237267
"""Metadata about a split GraphModule."""
@@ -318,6 +348,26 @@ def partition_fn(node: Node) -> int:
318348

319349
submod_names.sort(key=lambda n: int(n.split("_")[1]))
320350

351+
# Reclassify static partitions that contain multi-stream passthrough
352+
# functions as dynamic. These partitions switch the CUDA current stream
353+
# at runtime, which requires a host-side caller_stream.synchronize() for
354+
# correctness with MLIR-fused Triton kernels. Since synchronize() cannot
355+
# be called during CUDA graph capture, such partitions must run eagerly.
356+
num_reclassified = 0
357+
for name in submod_names:
358+
pid = int(name.split("_")[1])
359+
if pid in dynamic_partitions:
360+
continue
361+
submod = getattr(split_gm, name)
362+
if isinstance(submod, GraphModule) and _submod_has_stream_switch(submod):
363+
dynamic_partitions.add(pid)
364+
num_reclassified += 1
365+
if num_reclassified:
366+
ad_logger.info(
367+
f"Piecewise split: reclassified {num_reclassified} static partition(s) "
368+
"as dynamic (contain multi-stream passthrough ops)"
369+
)
370+
321371
dynamic_indices = []
322372
static_indices = []
323373
for name in submod_names:

tensorrt_llm/_torch/auto_deploy/transform/library/gather_logits_before_lm_head.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,12 +62,21 @@ def _apply(
6262
node_to_gather = lm_head_node.all_input_nodes[0]
6363
self._log_info(f"Found LM head node: {lm_head_node.name}")
6464
else:
65-
# Walk backward through elementwise/unary ops (e.g. softcapping: div, tanh, mul)
66-
# to find the actual lm_head linear node.
65+
# Walk backward through SINGLE-INPUT elementwise/unary ops
66+
# (e.g. Gemma4 softcapping: linear → div → tanh → mul) to find the
67+
# actual lm_head linear node. Only follow nodes that have exactly
68+
# one tensor input to avoid branching into the model body (e.g.
69+
# residual adds, fused allreduce+norm ops).
6770
current = lm_head_node
6871
while current is not None and not is_linear_op(current):
69-
inputs = current.all_input_nodes
70-
current = inputs[0] if len(inputs) >= 1 else None
72+
tensor_inputs = [n for n in current.all_input_nodes if n.op != "get_attr"]
73+
if len(tensor_inputs) != 1:
74+
# Multi-input or no-input node — stop walking; the lm_head
75+
# is not in this graph (common for VLMs where only the text
76+
# backbone is exported and the lm_head is applied externally).
77+
current = None
78+
break
79+
current = tensor_inputs[0]
7180

7281
if current is not None and is_linear_op(current):
7382
node_to_gather = current.all_input_nodes[0]
@@ -76,7 +85,10 @@ def _apply(
7685
)
7786
else:
7887
node_to_gather = lm_head_node
79-
self._log_info("lm_head node is not linear, using it as the node to gather")
88+
self._log_info(
89+
f"lm_head linear not in graph; inserting gather before "
90+
f"output node ({lm_head_node.name})"
91+
)
8092

8193
# Add logits_gather_mask as input in the graph and the sequence info interface
8294
logits_gather_indices_node = self._add_or_retrieve_input(gm, cm, "token_gather_indices")

tensorrt_llm/_torch/auto_deploy/utils/multi_stream_utils.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,19 @@ def begin_aux_stream_passthrough(
156156
# which is NOT ``torch.cuda.default_stream()``.
157157
caller_stream = torch.cuda.current_stream(device)
158158
cuda_stream_manager._caller_streams[device] = caller_stream
159+
# Synchronize the caller stream before switching to aux. The GPU-side
160+
# event wait (aux_stream.wait_event) alone is NOT sufficient when
161+
# MLIR-generated Triton kernels precede this point: their interaction
162+
# with PyTorch's CUDA caching allocator can cause the allocator to
163+
# recycle memory that the aux stream still needs, leading to illegal
164+
# memory accesses or silent data corruption. A CPU-side synchronize
165+
# ensures all caller-stream GPU work has retired before aux-stream
166+
# allocations begin.
167+
# NOTE: this cannot be called during CUDA graph capture. The cudagraph
168+
# path must rely on event-based sync only; a separate fix is needed
169+
# there (see TRTLLM multi_stream_moe + MLIR tracking).
170+
if not torch.cuda.is_current_stream_capturing():
171+
caller_stream.synchronize()
159172
# Record where the caller's stream has reached so aux knows when data is ready.
160173
main_event = cuda_stream_manager.get_event(device, cuda_stream_manager.MAIN_STREAM_NAME)
161174
main_event.record(caller_stream)

0 commit comments

Comments
 (0)