Skip to content

Commit 12793cf

Browse files
committed
more
1 parent 54e21bb commit 12793cf

6 files changed

Lines changed: 76 additions & 12 deletions

File tree

docs_new/docs/advanced_features/server_arguments.mdx

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1196,9 +1196,9 @@ Please consult the documentation below and [server_args.py](https://github.com/s
11961196
</tr>
11971197
<tr>
11981198
<td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`--fp4-gemm-backend`</td>
1199-
<td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Choose the runner backend for NVFP4 GEMM operations. Options: 'flashinfer_cutlass' (default), 'auto' (auto-selects between flashinfer_cudnn/flashinfer_cutlass based on CUDA/cuDNN version), 'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), 'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). All backends are from FlashInfer; when FlashInfer is unavailable, sgl-kernel CUTLASS is used as an automatic fallback.</td>
1200-
<td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><code>flashinfer_cutlass</code></td>
1201-
<td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}><code>auto</code>, <code>flashinfer_cudnn</code>, <code>flashinfer_cutlass</code>, <code>flashinfer_trtllm</code></td>
1199+
<td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}>Choose the runner backend for NVFP4 GEMM operations. Options: 'auto' (default; selects <code>flashinfer_cudnn</code> on SM120, <code>flashinfer_cutedsl</code> on SM100, <code>flashinfer_cutlass</code> otherwise), 'cutlass' (SGLang CUTLASS kernel), 'flashinfer_cutlass' (FlashInfer CUTLASS backend), 'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), 'flashinfer_cutedsl' (FlashInfer CuTe DSL backend), 'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). All FlashInfer backends fall back to sgl-kernel CUTLASS when FlashInfer is unavailable.</td>
1200+
<td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.02)"}}><code>auto</code></td>
1201+
<td style={{padding: "9px 12px", backgroundColor: "rgba(255,255,255,0.05)"}}><code>auto</code>, <code>cutlass</code>, <code>flashinfer_cudnn</code>, <code>flashinfer_cutedsl</code>, <code>flashinfer_cutlass</code>, <code>flashinfer_trtllm</code></td>
12021202
</tr>
12031203
<tr>
12041204
<td style={{padding: "9px 12px", fontWeight: 500, backgroundColor: "rgba(255,255,255,0.02)"}}>`--disable-flashinfer-autotune`</td>

python/sglang/srt/layers/quantization/fp4_utils.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from enum import Enum
55
from typing import TYPE_CHECKING
66

7-
from sglang.srt.utils.common import is_sm120_supported
7+
from sglang.srt.utils.common import is_sm100_supported, is_sm120_supported
88

99
if TYPE_CHECKING:
1010
from sglang.srt.server_args import ServerArgs
@@ -18,6 +18,7 @@ class Fp4GemmRunnerBackend(Enum):
1818
AUTO = "auto"
1919
CUTLASS = "cutlass"
2020
FLASHINFER_CUDNN = "flashinfer_cudnn"
21+
FLASHINFER_CUTEDSL = "flashinfer_cutedsl"
2122
FLASHINFER_CUTLASS = "flashinfer_cutlass"
2223
FLASHINFER_TRTLLM = "flashinfer_trtllm"
2324

@@ -36,6 +37,9 @@ def is_flashinfer_cutlass(self) -> bool:
3637
def is_flashinfer_trtllm(self) -> bool:
3738
return self == Fp4GemmRunnerBackend.FLASHINFER_TRTLLM
3839

40+
def is_flashinfer_cutedsl(self) -> bool:
41+
return self == Fp4GemmRunnerBackend.FLASHINFER_CUTEDSL
42+
3943
def is_flashinfer(self) -> bool:
4044
return self.value.startswith("flashinfer_")
4145

@@ -47,7 +51,10 @@ def get_flashinfer_backend(self) -> str:
4751
'flashinfer_trtllm' -> 'trtllm'
4852
'flashinfer_cutlass' -> 'cutlass'
4953
'flashinfer_cudnn' -> 'cudnn'
54+
'flashinfer_cutedsl' -> 'cute-dsl'
5055
"""
56+
if self == Fp4GemmRunnerBackend.FLASHINFER_CUTEDSL:
57+
return "cute-dsl"
5158
if self.value.startswith("flashinfer_"):
5259
return self.value.removeprefix("flashinfer_")
5360
else:
@@ -68,10 +75,8 @@ def initialize_fp4_gemm_config(server_args: ServerArgs) -> None:
6875
# heterogeneous batches on SM120 (Blackwell). cudnn is stable.
6976
# See: https://github.com/sgl-project/sglang/issues/20043
7077
backend = "flashinfer_cudnn"
71-
logger.info(
72-
"SM120 (Blackwell) detected: auto-selecting "
73-
"fp4-gemm-backend=flashinfer_cudnn"
74-
)
78+
elif is_sm100_supported():
79+
backend = "flashinfer_cutedsl"
7580
else:
7681
backend = "flashinfer_cutlass"
7782

python/sglang/srt/server_args.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -205,6 +205,7 @@
205205
"auto",
206206
"cutlass",
207207
"flashinfer_cudnn",
208+
"flashinfer_cutedsl",
208209
"flashinfer_cutlass",
209210
"flashinfer_trtllm",
210211
]
@@ -5196,10 +5197,11 @@ def add_cli_args(parser: argparse.ArgumentParser):
51965197
default=ServerArgs.fp4_gemm_runner_backend,
51975198
dest="fp4_gemm_runner_backend",
51985199
help="Choose the runner backend for NVFP4 GEMM operations. "
5199-
"Options: 'auto' (default; selects flashinfer_cudnn on SM120, flashinfer_cutlass otherwise), "
5200+
"Options: 'auto' (default; selects flashinfer_cudnn on SM120, flashinfer_cutedsl on SM100, flashinfer_cutlass otherwise), "
52005201
"'cutlass' (SGLang CUTLASS kernel), "
52015202
"'flashinfer_cutlass' (FlashInfer CUTLASS backend), "
52025203
"'flashinfer_cudnn' (FlashInfer cuDNN backend, optimal on CUDA 13+ with cuDNN 9.15+), "
5204+
"'flashinfer_cutedsl' (FlashInfer CuTe DSL backend), "
52035205
"'flashinfer_trtllm' (FlashInfer TensorRT-LLM backend, requires different weight preparation with shuffling). ",
52045206
)
52055207
parser.add_argument(

python/sglang/srt/utils/common.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1206,6 +1206,11 @@ def configure_logger(server_args, prefix: str = ""):
12061206
for name in ("httpx", "httpcore"):
12071207
logging.getLogger(name).setLevel(logging.WARNING)
12081208

1209+
if is_flashinfer_available():
1210+
from flashinfer.jit.core import logger as flashinfer_logger
1211+
1212+
flashinfer_logger.setLevel(logging.ERROR)
1213+
12091214

12101215
# source: https://github.com/vllm-project/vllm/blob/93b38bea5dd03e1b140ca997dfaadef86f8f1855/vllm/lora/utils.py#L9
12111216
def replace_submodule(

sgl-kernel/benchmark/bench_fp4_gemm.py

Lines changed: 50 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,18 @@
11
import argparse
22
import csv
3-
import os
3+
import logging
44
from functools import partial
55
from typing import List, Tuple
66

77
import torch
88
import triton
99
from flashinfer import mm_fp4
10+
from flashinfer.autotuner import autotune
11+
from flashinfer.jit.core import logger as flashinfer_logger
1012
from flashinfer.testing import bench_gpu_time
1113

14+
flashinfer_logger.setLevel(logging.ERROR)
15+
1216
from sglang.jit_kernel.nvfp4 import cutlass_scaled_fp4_mm, scaled_fp4_quant
1317
from sglang.srt.utils import (
1418
get_device_capability,
@@ -150,23 +154,25 @@ def _run_mm_fp4(a_fp4, b_fp4_T, a_sf, b_sf_T, alpha, dtype, res_fi, backend):
150154
x_log=False,
151155
line_arg="provider",
152156
line_vals=(
153-
["sglang_cutlass", "cutlass", "cudnn", "trtllm", "auto"]
157+
["sglang_cutlass", "cutlass", "cudnn", "trtllm", "cute-dsl", "auto"]
154158
if is_sm100_supported()
155-
else ["sglang_cutlass", "cutlass", "cudnn", "auto"]
159+
else ["sglang_cutlass", "cutlass", "cudnn", "cute-dsl", "auto"]
156160
),
157161
line_names=(
158162
[
159163
"sglang cutlass fp4",
160164
"flashinfer cutlass fp4",
161165
"cudnn fp4",
162166
"trtllm fp4",
167+
"cute-dsl fp4",
163168
"auto fp4 (cudnn/cutlass)",
164169
]
165170
if is_sm100_supported()
166171
else [
167172
"sglang cutlass fp4",
168173
"flashinfer cutlass fp4",
169174
"cudnn fp4",
175+
"cute-dsl fp4",
170176
"auto fp4",
171177
]
172178
),
@@ -176,13 +182,15 @@ def _run_mm_fp4(a_fp4, b_fp4_T, a_sf, b_sf_T, alpha, dtype, res_fi, backend):
176182
("orange", "solid"),
177183
("blue", "solid"),
178184
("green", "solid"),
185+
("brown", "solid"),
179186
("purple", "solid"),
180187
]
181188
if is_sm100_supported()
182189
else [
183190
("red", "solid"),
184191
("orange", "solid"),
185192
("blue", "solid"),
193+
("brown", "solid"),
186194
("purple", "solid"),
187195
]
188196
),
@@ -224,6 +232,11 @@ def benchmark(batch_size, provider, N, K, dtype, correctness, csv_file):
224232
use_cuda_graph=True,
225233
)
226234
elif provider == "cutlass":
235+
with autotune():
236+
_run_mm_fp4(
237+
a_fp4, b_fp4_T, a_scale_interleaved, b_sf_T,
238+
alpha, dtype, res_fi, backend="cutlass",
239+
)
227240
times_ms = bench_gpu_time(
228241
fn=partial(_run_mm_fp4, backend="cutlass"),
229242
input_args=(
@@ -238,6 +251,11 @@ def benchmark(batch_size, provider, N, K, dtype, correctness, csv_file):
238251
use_cuda_graph=True,
239252
)
240253
elif provider == "cudnn":
254+
with autotune():
255+
_run_mm_fp4(
256+
a_fp4, b_fp4_T, a_scale_interleaved, b_sf_T,
257+
alpha, dtype, res_fi, backend="cudnn",
258+
)
241259
times_ms = bench_gpu_time(
242260
fn=partial(_run_mm_fp4, backend="cudnn"),
243261
input_args=(
@@ -254,12 +272,41 @@ def benchmark(batch_size, provider, N, K, dtype, correctness, csv_file):
254272
elif provider == "trtllm":
255273
a_sf_u8 = a_scale_interleaved.to(torch.uint8)
256274
b_sf_u8_T = b_sf_T.to(torch.uint8)
275+
with autotune():
276+
_run_mm_fp4(
277+
a_fp4, b_fp4_T, a_sf_u8, b_sf_u8_T,
278+
alpha, dtype, res_fi, backend="trtllm",
279+
)
257280
times_ms = bench_gpu_time(
258281
fn=partial(_run_mm_fp4, backend="trtllm"),
259282
input_args=(a_fp4, b_fp4_T, a_sf_u8, b_sf_u8_T, alpha, dtype, res_fi),
260283
use_cuda_graph=True,
261284
)
285+
elif provider == "cute-dsl":
286+
with autotune():
287+
_run_mm_fp4(
288+
a_fp4, b_fp4_T, a_scale_interleaved, b_sf_T,
289+
alpha, dtype, res_fi, backend="cute-dsl",
290+
)
291+
times_ms = bench_gpu_time(
292+
fn=partial(_run_mm_fp4, backend="cute-dsl"),
293+
input_args=(
294+
a_fp4,
295+
b_fp4_T,
296+
a_scale_interleaved,
297+
b_sf_T,
298+
alpha,
299+
dtype,
300+
res_fi,
301+
),
302+
use_cuda_graph=True,
303+
)
262304
elif provider == "auto":
305+
with autotune():
306+
_run_mm_fp4(
307+
a_fp4, b_fp4_T, a_scale_interleaved, b_sf_T,
308+
alpha, dtype, res_fi, backend="auto",
309+
)
263310
times_ms = bench_gpu_time(
264311
fn=partial(_run_mm_fp4, backend="auto"),
265312
input_args=(

test/registered/quant/test_nvfp4_gemm.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,5 +81,10 @@ class TestFP4GemmFlashinferTrtllm(FP4GemmBase, unittest.TestCase):
8181
backend = "flashinfer_trtllm"
8282

8383

84+
@unittest.skipIf(get_device_sm() < 100, "Test requires CUDA SM 100 or higher")
85+
class TestFP4GemmFlashinferCutedsl(FP4GemmBase, unittest.TestCase):
86+
backend = "flashinfer_cutedsl"
87+
88+
8489
if __name__ == "__main__":
8590
unittest.main()

0 commit comments

Comments (0)