Commit bed5bc9

[None][chore] Wrap the swiglu into custom op to avoid redundant device copy. (NVIDIA#7021)
A redundant D2D (device-to-device) copy is observed when torch.compile is enabled for the Llama model, caused by the swiglu Triton kernel, and it adds performance overhead. Wrap the swiglu op in a custom op to avoid this overhead.

Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
1 parent 82bd187 commit bed5bc9

2 files changed: 53 additions & 34 deletions
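For context, the change follows the standard torch.library custom-op recipe: the eager function launches the Triton kernel, while a register_fake (meta) function reports only the output shape and dtype, so torch.compile can trace the op as a single functional node and, per the commit message, no longer inserts a redundant device-to-device copy around the raw kernel launch. Below is a minimal, self-contained sketch of that recipe; the op name demo::scale and its trivial body are illustrative stand-ins, not part of this commit.

import torch


# Eager implementation: in the real commit this launches the silu_and_mul
# Triton kernel; a trivial elementwise op stands in for it here.
@torch.library.custom_op("demo::scale", mutates_args=())
def scale(x: torch.Tensor, factor: float) -> torch.Tensor:
    return x * factor


# Fake (meta) implementation: reports only the output shape/dtype, so tracing
# never has to execute (or copy the result of) the real kernel.
@scale.register_fake
def _(x: torch.Tensor, factor: float) -> torch.Tensor:
    return torch.empty_like(x)


@torch.compile
def f(x: torch.Tensor) -> torch.Tensor:
    # The custom op is traced as one opaque node with a known output layout,
    # which is what avoids the extra copy described in the commit message.
    return torch.ops.demo.scale(x, 2.0)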

tensorrt_llm/_torch/custom_ops/torch_custom_ops.py

Lines changed: 47 additions & 1 deletion
@@ -1,7 +1,8 @@
 from functools import lru_cache
-from typing import List, Optional, Tuple
+from typing import List, Mapping, Optional, Tuple

 import torch
+import triton  # type: ignore[import]

 import tensorrt_llm.quantization.utils.fp4_utils as fp4_utils
 import tensorrt_llm.quantization.utils.fp8_utils as fp8_utils
@@ -11,6 +12,7 @@
 from ..autotuner import (AutoTuner, ConstraintSpec, DynamicTensorSpec,
                          OptimizationProfile, TunableRunner, TuningConfig)
 from ..modules.multi_stream_utils import do_multi_stream
+from ..modules.swiglu import silu_and_mul_kernel
 from ..utils import (fp4_scale_infer_shape,
                      get_last_power_of_2_num_tokens_buckets,
                      last_positive_power_of_2)
@@ -989,6 +991,50 @@ def _(
     return input.new_empty((input.size(0), weight.size(0)), dtype=output_dtype)


+@torch.library.custom_op("trtllm::silu_and_mul", mutates_args=())
+def silu_and_mul(x: torch.Tensor,
+                 scale: Optional[torch.Tensor] = None,
+                 dtype: Optional[torch.dtype] = None) -> torch.Tensor:
+    b, n = x.shape
+
+    assert n % 2 == 0
+    d = n // 2
+
+    o_dtype = dtype or x.dtype
+    o = torch.empty((b, d), dtype=o_dtype, device=x.device)
+
+    def grid(meta: Mapping[str, int]) -> tuple[int, int]:
+        return (b, triton.cdiv(d, meta["BLOCK_SIZE"]))
+
+    silu_and_mul_kernel[grid](
+        o_ptr=o,
+        o_stride=o.stride(0),
+        o_scale_ptr=scale,
+        x_ptr=x,
+        x_stride=x.stride(0),
+        d=d,
+        BLOCK_SIZE=1024,
+        HAS_O_SCALE=scale is not None,
+    )
+
+    return o
+
+
+@silu_and_mul.register_fake
+def _(
+    x: torch.Tensor,
+    scale: Optional[torch.Tensor] = None,
+    dtype: Optional[torch.dtype] = None,
+) -> torch.Tensor:
+    b, n = x.shape
+
+    assert n % 2 == 0
+    d = n // 2
+
+    o_dtype = dtype or x.dtype
+    return x.new_empty((b, d), dtype=o_dtype)
+
+
 def get_event(event_idx: int):
     from ..utils import get_model_extra_attrs
     extra_attrs = get_model_extra_attrs()
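The register_fake entry above is what lets the new op participate in fake-tensor tracing: the output shape is computed without ever launching the Triton kernel. A quick illustrative check, assuming a recent PyTorch (FakeTensorMode at this import path), a visible CUDA device, and that this custom-ops module has been imported so trtllm::silu_and_mul is registered:

import torch
from torch._subclasses.fake_tensor import FakeTensorMode

# Importing the module registers the op (assumed import path from this repo).
import tensorrt_llm._torch.custom_ops.torch_custom_ops  # noqa: F401

with FakeTensorMode():
    x = torch.empty(8, 4096, dtype=torch.bfloat16, device="cuda")
    # Only the fake implementation runs: the (8, 2048) output shape and dtype
    # are inferred without touching the GPU.
    y = torch.ops.trtllm.silu_and_mul(x)
    assert y.shape == (8, 2048) and y.dtype == torch.bfloat16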

tensorrt_llm/_torch/modules/swiglu.py

Lines changed: 6 additions & 33 deletions
@@ -1,6 +1,3 @@
-from collections.abc import Mapping
-from typing import Optional
-
 import torch
 import triton  # type: ignore[import]
 import triton.language as tl  # type: ignore[import]
@@ -51,37 +48,13 @@ def silu_and_mul_kernel(o_ptr, o_stride, o_scale_ptr, x_ptr, x_stride, d,
     tl.store(o_row_ptr + offsets, result, mask=mask)


-def silu_and_mul(x: torch.Tensor,
-                 scale: Optional[torch.Tensor] = None,
-                 dtype: Optional[torch.dtype] = None) -> torch.Tensor:
-    b, n = x.shape
-
-    assert n % 2 == 0
-    d = n // 2
-
-    o_dtype = dtype or x.dtype
-    o = torch.empty((b, d), dtype=o_dtype, device=x.device)
-
-    def grid(meta: Mapping[str, int]) -> tuple[int, int]:
-        return (b, triton.cdiv(d, meta["BLOCK_SIZE"]))
-
-    silu_and_mul_kernel[grid](
-        o_ptr=o,
-        o_stride=o.stride(0),
-        o_scale_ptr=scale,
-        x_ptr=x,
-        x_stride=x.stride(0),
-        d=d,
-        BLOCK_SIZE=1024,
-        HAS_O_SCALE=scale is not None,
-    )
-
-    return o
-
-
 def swiglu(x, quant_scale: torch.Tensor = None, quant_type=None):
     if quant_scale is not None:
         assert quant_type is not None
-        return silu_and_mul(x, scale=quant_scale, dtype=quant_type)
+        return torch.ops.trtllm.silu_and_mul(
+            x,
+            scale=quant_scale,
+            dtype=quant_type,
+        )

-    return silu_and_mul(x)
+    return torch.ops.trtllm.silu_and_mul(x)
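Call sites are unchanged: swiglu still takes the fused gate/up projection of shape [tokens, 2 * intermediate_size] and returns [tokens, intermediate_size]; only its internals now route through the registered custom op. An illustrative usage sketch, where the tensor sizes and the fp8 output dtype are assumptions rather than values taken from the diff:

import torch

from tensorrt_llm._torch.modules.swiglu import swiglu

# x packs the two halves of the fused projection along the last dim: [b, 2 * d].
x = torch.randn(16, 8192, dtype=torch.bfloat16, device="cuda")

# Unquantized path: dispatches to torch.ops.trtllm.silu_and_mul(x).
y = swiglu(x)  # shape [16, 4096], dtype bfloat16

# Quantized path: the scale is passed to the kernel and the output is written
# in the requested dtype (fp8 here is a hypothetical choice).
scale = torch.tensor(1.0, dtype=torch.float32, device="cuda")
y_q = swiglu(x, quant_scale=scale, quant_type=torch.float8_e4m3fn)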
