Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions benchmarks/routines/flashinfer_benchmark_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,8 +368,8 @@ def dtype_str_to_torch_dtype(dtype_str):
"8.6": [],
"8.9": [],
"9.0": [],
"10.0": ["cutlass"],
"10.3": ["cutlass"],
"10.0": ["cutlass", "cute-dsl"],
"10.3": ["cutlass", "cute-dsl"],
"11.0": ["cutlass"],
"12.0": [],
},
Expand Down
17 changes: 11 additions & 6 deletions benchmarks/routines/gemm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1071,6 +1071,13 @@ def testMmFp4(args):
print(f"[VVERBOSE] {mat2_fp4.dtype = }")

alpha = 1.0 / (global_sf_input * global_sf_mat2) if use_nvfp4 else None
# TODO: for MXFP4, we don't need a global scale, we should change the compile interface to make
# alpha optional.
alpha_for_cute_dsl_mxfp4 = (
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why the mxfp4 cutedsl backend has to use a device tensor with value 1.0?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

When I removed it, I encountered some compilation errors relating to make_fake_compact_tensor(cutlass.Float32, (1,)), as I believe they still share the exact same code path. To avoid touching too much of cute-dsl, which I am not super familiar with, I left it passing 1.0 for now. But I also think it can be removed.

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add a comment, something like a TODO, so that people can later understand why you added this and know that it should eventually be removed?

torch.tensor([1.0], dtype=torch.float32, device=device)
if not use_nvfp4
else None
)
# Completed preparing inputs. Now programmatically filter backends
block_size = 16 if use_nvfp4 else 32
backends_to_remove = []
Expand All @@ -1091,7 +1098,7 @@ def testMmFp4(args):
b=mat2_fp4.T if backend != "trtllm" else mat2_fp4_trtllm.T,
a_descale=input_inv_s,
b_descale=mat2_inv_s.T if backend != "trtllm" else mat2_inv_s_trtllm.T,
alpha=alpha,
alpha=(alpha_for_cute_dsl_mxfp4 if (backend == "cute-dsl") else alpha),
out_dtype=res_dtype,
block_size=16
if use_nvfp4
Expand Down Expand Up @@ -1129,7 +1136,7 @@ def run_backend(
b=mat2_fp4.T if backend != "trtllm" else mat2_fp4_trtllm.T,
a_descale=input_inv_s,
b_descale=mat2_inv_s.T if backend != "trtllm" else mat2_inv_s_trtllm.T,
alpha=alpha,
alpha=(alpha_for_cute_dsl_mxfp4 if (backend == "cute-dsl") else alpha),
out_dtype=res_dtype,
block_size=block_size,
use_8x4_sf_layout=not use_128x4_sf_layout,
Expand Down Expand Up @@ -1289,9 +1296,7 @@ def testMmMxfp8(args):
res_dtype = args.out_dtype
is_cuda_graph_compatible = not args.no_cuda_graph
run_refcheck = args.refcheck
autotune_supported_backends = [
"cutlass",
]
autotune_supported_backends = ["cutlass", "cute-dsl", "auto"]
res = []

backends = filter_backends_by_compute_capability(backends, args.routine, device)
Expand Down Expand Up @@ -1344,7 +1349,7 @@ def testMmMxfp8(args):
print(f"[VVERBOSE] {mat2_scale.dtype = }")

def run_backend(backend, input_mxfp8, mat2_mxfp8, input_scale, mat2_scale):
if backend == "cutlass":
if backend in ["cutlass", "cute-dsl", "auto"]:
return flashinfer.gemm.mm_mxfp8(
a=input_mxfp8,
b=mat2_mxfp8.t(), # mm_mxfp8 expects b.t()
Expand Down
Loading
Loading