Skip to content

Commit 3ae46b4

Browse files
committed
[compile] Invoke split FX graph by codegen.
Summary: This PR reduces inference loop runtime overhead by codegen-ing slightly faster Python code instead of invoking the FX graph directly after compilation. Context: Today VllmBackend returns a callable as a FX GraphModule with multiple submodules with the following code: ``` def forward(self, ...): self.submod_0(...) self.submod_1(...) ... ``` FX graph execution has some overhead due to: 1. getattr() calls to fetch submodules. 2. submodule calls will push multiple levels of CPython stack frame before getting to the real kernels. We address this by introducing a new codegen layer after all compiler passes and right before inference runtime. In this codegen layer we get full customizability over how the graph is executed. Sample generated code: ``` submod_0 = _submods[0](l_input_ids_, s72, l_self_modules_embed_tokens_parameters_weight_, l_self_modules_layers_modules_0_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_0_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_) getitem = submod_0[0] getitem_1 = submod_0[1] getitem_2 = submod_0[2] getitem_3 = submod_0[3] getitem_4 = submod_0[4] submod_1 = _submods[1](getitem, s72, getitem_1, getitem_2, getitem_3) submod_2 = _submods[2](getitem_3, s72, l_self_modules_layers_modules_0_modules_self_attn_modules_o_proj_parameters_weight_, l_self_modules_layers_modules_0_modules_post_attention_layernorm_parameters_weight_, getitem_4, l_self_modules_layers_modules_0_modules_mlp_modules_gate_up_proj_parameters_weight_, l_self_modules_layers_modules_0_modules_mlp_modules_down_proj_parameters_weight_, l_self_modules_layers_modules_1_modules_input_layernorm_parameters_weight_, l_self_modules_layers_modules_1_modules_self_attn_modules_qkv_proj_parameters_weight_, l_positions_, l_self_modules_layers_modules_0_modules_self_attn_modules_rotary_emb_buffers_cos_sin_cache_) getitem_5 = submod_2[0] 
getitem_6 = submod_2[1] getitem_7 = submod_2[2] getitem_8 = submod_2[3] getitem_9 = submod_2[4] submod_3 = _submods[3](getitem_5, s72, getitem_6, getitem_7, getitem_8) ``` This PR will reduce runtime overhead no matter whether VLLM_USE_AOT_COMPILE, VLLM_DISABLE_COMPILE_CACHE or VLLM_USE_MEGA_AOT_ARTIFACT is enabled or disabled. It will always be used in all paths. In terms of caching, this PR will store 2 extra pieces of data on disk: 1. Python execution code. 2. FQN of each submodule. When VLLM_USE_AOT_COMPILE=1, these will be loaded and optionally used depending on whether VLLM_USE_MEGA_AOT_ARTIFACT is enabled. Based on the current change, it's possible to further reduce warm start time by skipping graph module serialization. However, to make the code review easier, we will do it in a separate PR and this PR still helps with the runtime overhead in a self-contained way. Benchmark script: https://github.com/zhxchen17/scripts/blob/main/vllm/overhead_bench.py Test Plan: <TODO Images> Reviewers: Subscribers: Tasks: Tags: Signed-off-by: zhxchen17 <zhxchen17@fb.com>
1 parent 6183cae commit 3ae46b4

File tree

3 files changed

+182
-9
lines changed

3 files changed

+182
-9
lines changed

vllm/compilation/backends.py

Lines changed: 23 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1234,6 +1234,23 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
12341234
original_split_gm if envs.VLLM_USE_MEGA_AOT_ARTIFACT else self.graph
12351235
)
12361236

1237+
from vllm.compilation.codegen import (
1238+
compile_execution_fn,
1239+
generate_execution_code,
1240+
)
1241+
1242+
execution_code, submod_names = generate_execution_code(self.split_gm)
1243+
# Use getattr to get correct callables: __dict__ has PiecewiseBackend
1244+
# instances (from PiecewiseCompileInterpreter), _modules has originals.
1245+
# getattr checks __dict__ first, then falls back to _modules.
1246+
submod_callables = {
1247+
name: getattr(self.split_gm, name)
1248+
for name, _ in self.split_gm.named_children()
1249+
}
1250+
runtime_callable = compile_execution_fn(
1251+
execution_code, submod_callables, submod_names
1252+
)
1253+
12371254
if (
12381255
self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE
12391256
or not self.compilation_config.cudagraph_copy_inputs
@@ -1242,9 +1259,11 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
12421259
graph_to_serialize,
12431260
example_inputs,
12441261
self.prefix,
1245-
self.split_gm,
1262+
runtime_callable,
12461263
is_encoder=self.is_encoder,
12471264
vllm_backend=self,
1265+
execution_code=execution_code,
1266+
submod_names=submod_names,
12481267
)
12491268

12501269
# index of tensors that have symbolic shapes (batch size)
@@ -1265,7 +1284,7 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
12651284
copy_and_call = make_copy_and_call(
12661285
sym_tensor_indices,
12671286
[example_inputs[x].clone() for x in sym_tensor_indices],
1268-
self.split_gm,
1287+
runtime_callable,
12691288
)
12701289

12711290
return VllmSerializableFunction(
@@ -1276,4 +1295,6 @@ def __call__(self, graph: fx.GraphModule, example_inputs: Sequence[Any]) -> Any:
12761295
is_encoder=self.is_encoder,
12771296
vllm_backend=self,
12781297
sym_tensor_indices=sym_tensor_indices,
1298+
execution_code=execution_code,
1299+
submod_names=submod_names,
12791300
)

vllm/compilation/caching.py

Lines changed: 28 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,8 @@ def __init__(
184184
vllm_backend: Any | None = None,
185185
sym_tensor_indices: list[int] | None = None,
186186
aot_autograd_config: dict[str, Any] | None = None,
187+
execution_code: str | None = None,
188+
submod_names: list[str] | None = None,
187189
) -> None:
188190
assert isinstance(graph_module, torch.fx.GraphModule)
189191
self.graph_module = graph_module
@@ -194,6 +196,8 @@ def __init__(
194196
self.shape_env = None
195197
self.vllm_backend = vllm_backend
196198
self.sym_tensor_indices = sym_tensor_indices
199+
self.execution_code = execution_code
200+
self.submod_names = submod_names
197201
self._fake_mode: Any | None = None
198202

199203
import torch._functorch.config as functorch_config
@@ -453,7 +457,7 @@ def reconstruct_serializable_fn_from_mega_artifact(
453457

454458
standalone_compile_artifacts.load_all()
455459

456-
submod_names = standalone_compile_artifacts.submodule_names()
460+
piecewise_submod_names = standalone_compile_artifacts.submodule_names()
457461
compiled_callables: dict[str, dict[str, Callable[..., Any]]] = {}
458462

459463
for cache_key in standalone_compile_artifacts.submodule_bytes:
@@ -473,13 +477,13 @@ def reconstruct_serializable_fn_from_mega_artifact(
473477

474478
# spot check that cached submodules exist in the graph structure
475479
graph_children = {name for name, _ in split_gm.named_children()}
476-
missing = set(submod_names) - graph_children
480+
missing = set(piecewise_submod_names) - graph_children
477481
assert not missing, (
478482
f"artifacts reference submodules not in graph: {missing}. "
479483
f"graph has: {sorted(graph_children)}"
480484
)
481485

482-
for i, submod_name in enumerate(submod_names):
486+
for i, submod_name in enumerate(piecewise_submod_names):
483487
assert submod_name in sym_shape_indices_map and submod_name in returns_tuple_map
484488

485489
sym_shape_indices = sym_shape_indices_map[submod_name]
@@ -490,15 +494,15 @@ def reconstruct_serializable_fn_from_mega_artifact(
490494
graph=None, # not needed for cached artifacts
491495
vllm_config=vllm_config,
492496
piecewise_compile_index=i,
493-
total_piecewise_compiles=len(submod_names),
497+
total_piecewise_compiles=len(piecewise_submod_names),
494498
sym_shape_indices=sym_shape_indices,
495499
vllm_backend=vllm_backend,
496500
returns_tuple=returns_tuple,
497501
compiled_runnables=runnables,
498502
)
499503

500504
is_first = i == 0
501-
is_last = i == len(submod_names) - 1
505+
is_last = i == len(piecewise_submod_names) - 1
502506
wrapped_backend = wrap_with_cudagraph_if_needed(
503507
piecewise_backend,
504508
vllm_config,
@@ -513,6 +517,21 @@ def reconstruct_serializable_fn_from_mega_artifact(
513517
submod_name,
514518
)
515519

520+
# Use codegen'd execution code if available, fall back to split_gm
521+
execution_code = state.get("execution_code")
522+
submod_names = state.get("submod_names")
523+
if execution_code is not None and submod_names is not None:
524+
from vllm.compilation.codegen import compile_execution_fn
525+
526+
submod_callables = {
527+
name: getattr(split_gm, name) for name, _ in split_gm.named_children()
528+
}
529+
runtime_callable = compile_execution_fn(
530+
execution_code, submod_callables, submod_names
531+
)
532+
else:
533+
runtime_callable = split_gm
534+
516535
if compilation_config.cudagraph_copy_inputs:
517536
sym_tensor_indices = state["sym_tensor_indices"]
518537
input_buffers = [
@@ -521,9 +540,11 @@ def reconstruct_serializable_fn_from_mega_artifact(
521540
)
522541
for idx in sym_tensor_indices
523542
]
524-
optimized_call = make_copy_and_call(sym_tensor_indices, input_buffers, split_gm)
543+
optimized_call = make_copy_and_call(
544+
sym_tensor_indices, input_buffers, runtime_callable
545+
)
525546
else:
526-
optimized_call = split_gm
547+
optimized_call = runtime_callable
527548

528549
fn = VllmSerializableFunction(
529550
**state,

vllm/compilation/codegen.py

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
# SPDX-License-Identifier: Apache-2.0
2+
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
3+
"""Code generation for split_gm stitching graph execution.
4+
5+
Generates a plain Python function that replaces the FX GraphModule's
6+
interpreter-based execution of the stitching graph, eliminating
7+
nn.Module.__call__ overhead and __getattr__ dispatch.
8+
"""
9+
10+
import operator
11+
from collections.abc import Callable
12+
from functools import partial
13+
from typing import Any
14+
15+
import torch.fx
16+
from torch._dynamo.utils import dynamo_timed
17+
from torch._logging import trace_structured
18+
19+
20+
@dynamo_timed("vllm.generate_execution_code")
def generate_execution_code(
    split_gm: torch.fx.GraphModule,
) -> tuple[str, list[str]]:
    """Generate Python source code from a split_gm's stitching graph.

    Walks ``split_gm.graph.nodes`` and emits a flat function that invokes
    submodules through a ``__vllm_submods__`` list, sidestepping FX
    GraphModule interpretation and per-call attribute/dict lookups.

    Args:
        split_gm: The split graph module produced by split_graph().

    Returns:
        A tuple of (code, submod_names) where code is the Python source
        and submod_names is the ordered list of submodule target names
        corresponding to list indices used in the generated code.
    """
    stmts: list[str] = []
    placeholders: list[str] = []
    submod_names: list[str] = []
    submod_index: dict[str, int] = {}

    for node in split_gm.graph.nodes:
        op = node.op

        if op == "placeholder":
            # Placeholders become the generated function's parameters.
            placeholders.append(node.name)
            continue

        if op == "call_module":
            slot = submod_index.get(node.target)
            if slot is None:
                # First reference to this submodule: assign the next index.
                slot = len(submod_names)
                submod_index[node.target] = slot
                submod_names.append(node.target)
            pieces = [_node_ref(a) for a in node.args]
            pieces.extend(f"{k}={_node_ref(v)}" for k, v in node.kwargs.items())
            call_args = ", ".join(pieces)
            stmts.append(f"    {node.name} = __vllm_submods__[{slot}]({call_args})")
            continue

        if op == "call_function" and node.target is operator.getitem:
            # Tuple unpacking of a submodule's multi-output result.
            seq = _node_ref(node.args[0])
            pos = node.args[1]
            assert isinstance(pos, int)
            stmts.append(f"    {node.name} = {seq}[{pos}]")
            continue

        if op == "output":
            assert len(node.args) == 1
            stmts.append(f"    return {_node_ref(node.args[0])}")
            continue

        # The stitching graph should only contain the node kinds above.
        raise RuntimeError(f"Unsupported node from codegen: {node.format_node()}")

    assert placeholders
    header = f"def execution_fn({', '.join(placeholders)}, *, __vllm_submods__):"
    source = "\n".join([header, *stmts])
    # `import torch` keeps repr'd torch constants (e.g. dtypes) resolvable.
    return f"import torch\n{source}\n", submod_names
78+
79+
80+
@dynamo_timed("vllm.compile_execution_fn")
81+
def compile_execution_fn(
82+
code: str,
83+
submod_callables: dict[str, Callable[..., Any]],
84+
submod_names: list[str],
85+
) -> Callable[..., Any]:
86+
"""Compile execution code and bind submodule callables.
87+
88+
Args:
89+
code: Python source from generate_execution_code().
90+
submod_callables: Mapping of submodule names to their callables.
91+
submod_names: Ordered list of submodule names matching the indices
92+
used in the generated code.
93+
94+
Returns:
95+
A callable that executes the stitching logic.
96+
"""
97+
trace_structured(
98+
"artifact",
99+
metadata_fn=lambda: {
100+
"name": "vllm_execution_code",
101+
"encoding": "string",
102+
},
103+
payload_fn=lambda: code,
104+
)
105+
namespace: dict[str, Any] = {}
106+
exec(code, namespace) # noqa: S102
107+
fn = namespace["execution_fn"]
108+
# Use .forward() directly to avoid nn.Module.__call__ overhead.
109+
submods_list = [
110+
c.forward if isinstance(c, torch.fx.GraphModule) else c
111+
for c in (submod_callables[name] for name in submod_names)
112+
]
113+
return partial(fn, __vllm_submods__=submods_list)
114+
115+
116+
def _node_ref(arg: Any) -> str:
117+
"""Convert an FX node argument to a source code reference recursively."""
118+
if isinstance(arg, torch.fx.Node):
119+
return arg.name
120+
if isinstance(arg, list):
121+
return f"[{', '.join(_node_ref(x) for x in arg)}]"
122+
if isinstance(arg, tuple):
123+
items = ", ".join(_node_ref(x) for x in arg)
124+
return f"({items},)" if len(arg) == 1 else f"({items})"
125+
if isinstance(arg, dict):
126+
return (
127+
"{"
128+
+ ", ".join(f"{_node_ref(k)}: {_node_ref(v)}" for k, v in arg.items())
129+
+ "}"
130+
)
131+
return repr(arg)

0 commit comments

Comments
 (0)