[Perf] Optimize CutlassFP8ScaledMMLinearKernel when padding needed by pre-weight processing, 13.5% TTFT improvement (vllm-project#42651)

yewentao256 · MatthewBonanni · web-flow · commit 53ff50fcd3d2 · 2026-05-20T11:57:42.000-07:00
Signed-off-by: yewentao256 <zhyanwentao@126.com>
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: Matthew Bonanni <mbonanni@redhat.com>
diff --git a/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py b/vllm/model_executor/kernels/linear/scaled_mm/cutlass.py
@@ -2,6 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
 
+from collections.abc import Sequence
+
 import torch
 
 from vllm import _custom_ops as ops
@@ -150,6 +152,12 @@ def apply_weights(
 
 
 class CutlassFP8ScaledMMLinearKernel(FP8ScaledMMLinearKernel):
+    def __init__(
+        self, c: FP8ScaledMMLinearLayerConfig, layer_param_names: Sequence[str]
+    ) -> None:
+        self.logical_output_size: int | None = None  # unpadded N, set after loading
+        super().__init__(c, layer_param_names)
+
     @classmethod
     def is_supported(
         cls, compute_capability: int | None = None
@@ -176,6 +184,33 @@ def _pad_to_alignment(
         pad_spec[-(2 * dim + 1)] = pad_size
         return torch.nn.functional.pad(x, pad_spec, value=value)
 
+    def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
+        weight_name, weight_scale_name, _, _ = self.layer_param_names
+        weight = getattr(layer, weight_name)
+
+        # Record the unpadded N here; apply_scaled_mm slices its output back to it.
+        self.logical_output_size = weight.shape[1]
+
+        pad_k = (16 - weight.shape[0] % 16) % 16  # 0 when already 16-aligned
+        pad_n = (16 - weight.shape[1] % 16) % 16
+        if pad_k == 0 and pad_n == 0:
+            return  # nothing to pad; weight and scale stay as loaded
+
+        # B is column-major [K, N]; pad a contiguous [N, K] copy and transpose back.
+        padded_weight = torch.nn.functional.pad(
+            weight.t().contiguous(),
+            (0, pad_k, 0, pad_n),  # pads K (last dim), then N, at the end
+        ).t()
+        replace_parameter(layer, weight_name, padded_weight.data)
+
+        weight_scale = getattr(layer, weight_scale_name, None)
+        if weight_scale is not None and pad_n > 0 and weight_scale.numel() > 1:
+            flat_scale = weight_scale.reshape(-1)
+            padded_scale = self._pad_to_alignment(
+                flat_scale, dim=0, alignment=16, value=1.0  # pad entries are 1.0
+            ).view(-1, *weight_scale.shape[1:])  # restore the original trailing dims
+            replace_parameter(layer, weight_scale_name, padded_scale.data)
+
     def apply_scaled_mm(
         self,
         *,
@@ -187,39 +222,25 @@ def apply_scaled_mm(
         bias: torch.Tensor | None,
         output_shape: list,
     ) -> torch.Tensor:
-        # Per-tensor/Per-channel padding to use Cutlass instead of Triton.
-        K, N = B.shape
-        pad_k = (16 - K % 16) % 16
-        pad_n = (16 - N % 16) % 16
-
-        if pad_k > 0 or pad_n > 0:
-            # B is column-major [K, N].  Transpose to row-major [N, K],
-            # pad both dims in one call, then transpose back so the
-            # result keeps column-major layout with stride (1, K_padded).
-            B = torch.nn.functional.pad(B.t().contiguous(), (0, pad_k, 0, pad_n)).t()
-
-            if pad_k > 0:
-                A = self._pad_to_alignment(A, dim=1, alignment=16)
-            if pad_n > 0:
-                if bias is not None:
-                    bias = self._pad_to_alignment(bias, dim=0, alignment=16)
-                # Bs is per-tensor (numel==1) or per-channel (numel==N)
-                # in this kernel class — never 2D block-wise.
-                if Bs.numel() > 1:
-                    Bs = self._pad_to_alignment(
-                        Bs.view(-1), dim=0, alignment=16, value=1.0
-                    )
-                    if Bs.dim() == 1 and B.shape[1] > 1:
-                        Bs = Bs.view(-1, 1)
+        padded_k, padded_n = B.shape
+        output_size = self.logical_output_size
+        assert output_size is not None
+        pad_k = padded_k - A.shape[1]
+        pad_n = padded_n - output_size
+
+        if pad_k > 0:
+            A = self._pad_to_alignment(A, dim=1, alignment=16)
+        if pad_n > 0 and bias is not None:
+            bias = self._pad_to_alignment(bias, dim=0, alignment=16)
 
         output = ops.cutlass_scaled_mm(
             A, B, out_dtype=out_dtype, scale_a=As, scale_b=Bs, bias=bias
         )
 
         if pad_n > 0:
-            output = output[..., :N].contiguous()
+            output = output[..., :output_size].contiguous()
 
-        return output.view(*output_shape)
+        return output.view(*output_shape[:-1], output_size)
 
 
 class CutlassFp8BlockScaledMMKernel(Fp8BlockScaledMMLinearKernel):