Commit 121d636

Recapture fake tensor metadata for inserted subgraphs (#4052)

Co-authored-by: cehongwang <wangcehong@gmail.com>

1 parent 98cf6f0 · commit 121d636

File tree

3 files changed: +212 -13 lines

py/torch_tensorrt/dynamo/lowering/passes/_FakeTensorUpdater.py

Lines changed: 206 additions & 0 deletions
@@ -0,0 +1,206 @@
+import contextlib
+import operator
+from collections import defaultdict
+from typing import Any, Optional
+
+import sympy
+import torch
+import torch.fx
+from torch._dispatch.python import enable_python_dispatcher
+from torch._inductor.fx_utils import get_fake_args_kwargs, get_node_storage, get_storage
+from torch._subclasses.fake_tensor import FakeTensorMode
+from torch.fx.experimental.symbolic_shapes import (
+    compute_unbacked_bindings,
+    rebind_unbacked,
+    statically_known_true,
+    sym_eq,
+)
+from torch.utils._ordered_set import OrderedSet
+
+
+# Adapted from torch._inductor.fx_utils.FakeTensorUpdater
+class FakeTensorUpdater:
+    """
+    The main idea here is that it's difficult to maintain accurate fake
+    tensors (our primary form of metadata) for each node in our graph as we
+    transform it.
+
+    The most reliable way to obtain this information is by rerunning
+    faketensor propagation. However, in general, faketensor propagation is
+    fairly expensive. So, instead we'd like to only rerun faketensor
+    propagation on nodes that have changed.
+
+    In order to detect which nodes have changed, we first hash each node
+    together with its target and argument lists (which are immutable in FX).
+
+    Then, whenever we call incremental_update, we check which FX nodes have a
+    new hash, and recompute the faketensor metadata for those nodes. Then, we
+    continue to recursively compute the faketensors for all users until the
+    fake tensors stop changing.
+    """
+
+    def __init__(self, graph: torch.fx.Graph) -> None:
+        self.processed_hashes = OrderedSet[Any]()
+        self.graph = graph
+
+        for node in self.graph.nodes:
+            self.processed_hashes.add(self.hash_node(node))
+
+    def hash_node(self, node: torch.fx.Node) -> tuple[torch.fx.Node, Any, Any, Any]:
+        return (node, node.target, id(node.args), id(node.kwargs))
+
+    def incremental_update(self, fake_mode: FakeTensorMode) -> None:
+        """Update FakeTensors on self.graph. We will try to do the minimum amount of work."""
+        existing_storages: defaultdict[Optional[int], int] = defaultdict(int)
+        for node in self.graph.nodes:
+            existing_storages[get_node_storage(node)] += 1
+
+        def is_intlist_same(new: Any, old: Any) -> Any:
+            return statically_known_true(sym_eq(new, old))
+
+        def is_fake_tensor_same(new: Any, old: Any, *, node: torch.fx.Node) -> Any:
+            if type(new) is not type(old):
+                return False
+            if isinstance(new, (list, tuple)):
+                if len(new) != len(old):
+                    return False
+                return all(
+                    is_fake_tensor_same(new_i, old_i, node=node)
+                    for new_i, old_i in zip(new, old)
+                )
+            if new is None:
+                return old is None
+            if not isinstance(new, torch.Tensor):
+                assert isinstance(
+                    new, (torch.SymInt, torch.SymBool, torch.SymFloat)
+                ), f"Unknown type {type(new)} in {self.graph}"
+                return (
+                    new.node.shape_env._maybe_evaluate_static(
+                        sympy.Eq(new.node.expr, old.node.expr)
+                    )
+                    == sympy.true
+                )
+            if not is_intlist_same(new.shape, old.shape) or new.layout != old.layout:
+                return False
+            if new.layout == torch.strided and (
+                not is_intlist_same(new.stride(), old.stride())
+                or not statically_known_true(
+                    new.storage_offset() == old.storage_offset()
+                )
+            ):
+                return False
+
+            if new.device != old.device:
+                return False
+
+            if get_storage(new) == get_storage(old):
+                return True
+
+            def any_user_may_alias(node: torch.fx.Node) -> bool:
+                if not isinstance(node.meta["val"], torch.Tensor):
+                    # analysis too complicated on lists, can support in the future
+                    return True
+                for user in node.users:
+                    if not (
+                        isinstance(
+                            user.target,
+                            (torch._ops.OpOverload, torch._ops.HigherOrderOperator),
+                        )
+                    ):
+                        return True
+                    if isinstance(user.target, torch._ops.HigherOrderOperator):
+                        # HOPs that survive until inductor are all non-aliasing HOPs.
+                        # We will likely never support HOPs that are aliasing.
+                        continue
+                    # Strategy: do a FakeTensor prop and see if the storage aliases.
+                    # If Inductor ever gets tighter invariants on OpOverloads
+                    # (that is, we ban things like torch.ops.aten.reshape calls in the
+                    # graph), then this could just be a fast schema lookup.
+                    is_valid, args, kwargs = get_fake_args_kwargs(user)
+                    if not is_valid:
+                        return True
+                    with (
+                        fake_mode,
+                        enable_python_dispatcher(),
+                        contextlib.ExitStack() as stack,
+                    ):
+                        # Ignore unbacked symbols (if they exist): we're making
+                        # this FakeTensor and then throwing it away.
+                        if fake_mode.shape_env is not None:
+                            stack.enter_context(
+                                fake_mode.shape_env.ignore_fresh_unbacked_symbols()
+                            )
+                        new_fake_tensor = user.target(*args, **kwargs)
+                    if not isinstance(new_fake_tensor, torch.Tensor):
+                        # analysis too complicated on lists, can support in the future
+                        return True
+                    if get_storage(new_fake_tensor) == get_storage(node.meta["val"]):
+                        return True
+                return False
+
+            # This is the case where it returns a completely fresh storage that's used nowhere else.
+            # If the FakeTensor's storage is fresh and none of the node's users can alias it, then
+            # we don't need to update this node.
+            if (
+                existing_storages[get_storage(old)] == 1
+                and get_storage(new) not in existing_storages
+                and not any_user_may_alias(node)
+            ):
+                return True
+
+            return False

+        def should_process_node(node: torch.fx.Node) -> bool:
+            # node.target for nodes returning true from this function is
+            # called under fake mode, which does not work for inductor
+            # lowerings. We check whether node.target is an aten operator
+            # or operator.getitem, which is used when returning multiple
+            # tensors from an op.
+            return node.op == "call_function" and (
+                isinstance(node.target, torch._ops.OpOverload)
+                or node.target is operator.getitem
+                or node.target
+                is torch._inductor.fx_passes.reinplace._generalized_scatter
+            )
+
+        to_process = OrderedSet[int]()
+        for node in self.graph.nodes:
+            # NB: Be very careful about skipping nodes (via continues) here
+            # and ask for a careful review when changing this code. The
+            # consequence of incorrect FakeTensor metadata is difficult-to-debug
+            # silent incorrectness.
+            if (
+                self.hash_node(node) in self.processed_hashes
+                and id(node) not in to_process
+            ):
+                continue
+
+            if not should_process_node(node):
+                continue
+
+            is_valid, args, kwargs = get_fake_args_kwargs(node)
+            if not is_valid:
+                continue
+            with fake_mode, enable_python_dispatcher():
+                new_fake_tensor = node.target(*args, **kwargs)
+
+            if "val" in node.meta and is_fake_tensor_same(
+                new_fake_tensor, node.meta["val"], node=node
+            ):
+                continue
+
+            rebind_unbacked(fake_mode.shape_env, node, new_fake_tensor)
+
+            node.meta["val"] = new_fake_tensor
+            if (shape_env := fake_mode.shape_env) and (
+                symbol_to_path := compute_unbacked_bindings(shape_env, new_fake_tensor)
+            ):
+                # Refresh the bindings to the new symbols
+                node.meta["unbacked_bindings"] = symbol_to_path
+
+            existing_storages[get_node_storage(node)] += 1
+
+            to_process.update([id(user) for user in node.users])
+
+            self.processed_hashes.add(self.hash_node(node))
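
To make the intended call pattern concrete, a minimal usage sketch (not part of this commit) follows. It assumes the module path given above; FakeTensorProp seeds the initial meta["val"] entries, and the relu-to-sigmoid retarget is a hypothetical stand-in for a lowering pass that changes a node's hash:

import torch
from torch._subclasses.fake_tensor import FakeTensorMode
from torch.fx.passes.fake_tensor_prop import FakeTensorProp

from torch_tensorrt.dynamo.lowering.passes._FakeTensorUpdater import FakeTensorUpdater


class M(torch.nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return torch.ops.aten.relu.default(x)


gm = torch.fx.symbolic_trace(M())
fake_mode = FakeTensorMode()
# Seed meta["val"] on every node by running one full fake propagation.
FakeTensorProp(gm, fake_mode).propagate(torch.empty(2, 3))

# Snapshot node hashes before transforming the graph.
updater = FakeTensorUpdater(gm.graph)

# A stand-in lowering pass: retargeting the node changes its hash, so
# incremental_update re-propagates only this node and its users.
for node in gm.graph.nodes:
    if node.target is torch.ops.aten.relu.default:
        node.target = torch.ops.aten.sigmoid.default

updater.incremental_update(fake_mode)

Because only the retargeted node's hash changes, incremental_update recomputes metadata for that node and its users rather than re-propagating the whole graph.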

py/torch_tensorrt/dynamo/lowering/passes/_aten_lowering_pass.py

Lines changed: 6 additions & 0 deletions
@@ -3,8 +3,11 @@
 from typing import Any, Callable, Optional, Sequence, Union

 import torch
+from torch._inductor.fx_utils import get_node_storage
+
 from torch_tensorrt._utils import is_tegra_platform
 from torch_tensorrt.dynamo._settings import CompilationSettings
+from torch_tensorrt.dynamo.lowering.passes._FakeTensorUpdater import FakeTensorUpdater
 from torch_tensorrt.dynamo.lowering.passes.pass_utils import (
     trace_intermediate_node_outputs,
 )
@@ -131,7 +134,10 @@ def post_lowering(
     logging.debug(
         f"Invoking DynamoPassManager and applying lowering passes: {ATEN_POST_LOWERING_PASSES}"
     )
+    fake_tensor_updater = FakeTensorUpdater(gm.graph)
     gm = ATEN_POST_LOWERING_PASSES(gm, settings)
+    if (fake_mode := torch._export.utils._detect_fake_mode_from_gm(gm)) is not None:
+        fake_tensor_updater.incremental_update(fake_mode)

     return gm

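For reference, _detect_fake_mode_from_gm is a private torch._export utility that recovers the FakeTensorMode a graph's metadata was produced under, or None if there is none. A hedged sketch of an equivalent guard using torch._guards.detect_fake_mode, assuming the graph already carries FakeTensor meta["val"] entries:

# Alternative to the private helper above: collect the FakeTensors the graph
# already carries in meta["val"] and let detect_fake_mode find their mode.
from torch._guards import detect_fake_mode

vals = [n.meta["val"] for n in gm.graph.nodes if "val" in n.meta]
if (fake_mode := detect_fake_mode(vals)) is not None:
    fake_tensor_updater.incremental_update(fake_mode)
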
py/torch_tensorrt/dynamo/lowering/passes/replace_fused_rms_norm.py

Lines changed: 0 additions & 13 deletions
@@ -37,7 +37,6 @@ def process_fused_rms_norm_node(
     # Calculate dimensions to normalize over (similar to layer_norm)
     # normalized_shape specifies the last N dimensions
     x_dim = len(node.meta["val"][0].shape)
-    x_fake = node.meta["val"][0]
     dims_to_reduce = []
     for i in range(len(shape)):
         dims_to_reduce.append(x_dim - i - 1)
@@ -48,44 +47,32 @@
         torch.ops.aten.mul.Tensor,
         args=(x, x),
     )
-    x_squared_fake = x_fake * x_fake
-    x_squared.meta["val"] = x_squared_fake

     x_squared_sum = gm.graph.call_function(
         torch.ops.aten.mean.dim,
         args=(x_squared, dims_to_reduce, True),
     )
-    x_squared_sum_fake = x_squared_fake.mean(dims_to_reduce, keepdim=True)
-    x_squared_sum.meta["val"] = x_squared_sum_fake

     x_squared_sum_eps = gm.graph.call_function(
         torch.ops.aten.add.Tensor,
         args=(x_squared_sum, eps),
     )
-    x_squared_sum_eps_fake = x_squared_sum_fake + eps
-    x_squared_sum_eps.meta["val"] = x_squared_sum_eps_fake

     x_squared_sum_eps_rsqrt = gm.graph.call_function(
         torch.ops.aten.rsqrt.default,
         args=(x_squared_sum_eps,),
     )
-    x_squared_sum_eps_rsqrt_fake = x_squared_sum_eps_fake.rsqrt()
-    x_squared_sum_eps_rsqrt.meta["val"] = x_squared_sum_eps_rsqrt_fake

     x_normalized = gm.graph.call_function(
         torch.ops.aten.mul.Tensor,
         args=(x, x_squared_sum_eps_rsqrt),
     )
-    x_normalized_fake = x_fake * x_squared_sum_eps_rsqrt_fake
-    x_normalized.meta["val"] = x_normalized_fake

     if weight is not None:
         x_normalized = gm.graph.call_function(
             torch.ops.aten.mul.Tensor,
             args=(x_normalized, weight),
         )
-        x_normalized_fake = x_normalized_fake * weight.meta["val"]
-        x_normalized.meta["val"] = x_normalized_fake

     for i, user in enumerate(list(node.users)):
         if user.op == "call_function" and user.target == operator.getitem:

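The deleted lines above hand-computed and assigned a fake tensor for every node this pass inserts. With FakeTensorUpdater now wired into post_lowering, that bookkeeping is redundant: for each inserted or changed node, the updater recomputes the metadata by re-running the node's target under fake mode, roughly as follows (a simplification of incremental_update above):

# Simplified from FakeTensorUpdater.incremental_update: a changed node's fake
# metadata is recaptured by re-running its target on its args' fake tensors.
is_valid, args, kwargs = get_fake_args_kwargs(node)
if is_valid:
    with fake_mode, enable_python_dispatcher():
        node.meta["val"] = node.target(*args, **kwargs)
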