Skip to content
Open
Show file tree
Hide file tree
Changes from 7 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
3010812
feat: Integrate CuTe DSL FMHA cubin kernels into prefill backend
limin2021 Apr 12, 2026
6fd8c0b
feat: Add FP8 support and fix TMA padding for DSL FMHA prefill
limin2021 Apr 12, 2026
19107f6
feat: Add TVM-FFI support for DSL FMHA prefill kernels
limin2021 Apr 12, 2026
70a6186
refactor: Consolidate DSL FMHA prefill tests, add ragged shape coverage
limin2021 Apr 12, 2026
9147885
fix: Use front-padding for ragged varlen tensors (match DSL example)
limin2021 Apr 12, 2026
27cd0aa
feat: Add cute-dsl to benchmark framework, fix CUDA graph compatibility
limin2021 Apr 12, 2026
7162590
fix: Preload DSL kernel in plan(), fix output dtype and CUDA graph co…
limin2021 Apr 13, 2026
f574404
feat: Route cute-dsl through trtllm_ragged_attention_deepseek, add LS…
limin2021 Apr 13, 2026
5ddbd31
fix: Set is_persistent=not is_causal in cute_dsl_fmha_prefill to matc…
limin2021 Apr 13, 2026
8a49143
refactor: Remove cute-dsl from single_prefill and BatchPrefillWithRag…
limin2021 Apr 13, 2026
05fdbdf
refactor: Remove enable_tvm_ffi=False tests, fix to tvm_ffi=True only
limin2021 Apr 13, 2026
7deedef
feat: Support skip-softmax sparsity in cute-dsl FMHA backend
limin2021 Apr 13, 2026
df72fc9
fix: Misc cleanups — revert unnecessary prefill.py changes, fix copyr…
limin2021 Apr 13, 2026
f55b71b
fix: Use os.path.join for artifact path to avoid absolute path when p…
limin2021 Apr 13, 2026
718586e
feat: Wire up DSL FMHA cubin loading from artifactory
limin2021 Apr 15, 2026
dcebe3c
update: DSL FMHA artifact path and checksums for multi-arch CI build
limin2021 Apr 15, 2026
44db80c
update: Add aarch64 checksums and arch-aware artifact paths for DSL FMHA
limin2021 Apr 16, 2026
df6f13e
refactor: Move attention.py into attention/ package, rename attention…
limin2021 Apr 16, 2026
61a8665
update: Simplify DSL FMHA test — FP8 only, update docstring
limin2021 Apr 16, 2026
4f78691
fix: Review feedback — dedup _get_cpu_arch, document front-padding re…
limin2021 Apr 16, 2026
6191a7a
fix: Add dtype validation for cute-dsl backend, remove duplicate head…
limin2021 Apr 16, 2026
16e8253
feat: Add FP8 prefill test, consolidate DSL FMHA tests
limin2021 Apr 16, 2026
14af985
fix: Rename backend "trtllm-native" to "trtllm-gen" in trtllm_ragged_…
limin2021 Apr 16, 2026
4a671bc
Merge origin/main into integrate_dsl_cubin_fmha
limin2021 Apr 20, 2026
ae77982
update: DSL FMHA artifact path and checksums to latest cubin release
limin2021 Apr 20, 2026
2207c68
test: Mock DSL_FMHA checksums + dir index in test_get_subdir_file_list
limin2021 Apr 20, 2026
fdfefa3
Merge branch 'main' into integrate_dsl_cubin_fmha
yzh119 Apr 21, 2026
e452cec
fix: Include gpu_arch in get_cute_dsl_fmha_kernel cache key
limin2021 Apr 21, 2026
0de6f44
Merge branch 'main' into integrate_dsl_cubin_fmha
yzh119 Apr 22, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 33 additions & 9 deletions benchmarks/routines/attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -1646,18 +1646,40 @@ def testBatchPrefillWithRaggedKVCacheWrapper(args):

cumsum_s_qo = torch.sum(actual_seq_lens_q)
cumsum_s_kv = torch.sum(actual_seq_lens_kv)
q = torch.randn(
cumsum_s_qo, num_qo_heads, head_dim_qk, device=device, dtype=q_init_dtype

# Front-padding for cute-dsl varlen kernel: the persistent varlen kernel
# applies a negative pointer offset (-max_s * H * D), so there must be
# valid GPU memory before the data start.
front_pad_q = s_qo if "cute-dsl" in backends else 0
front_pad_kv = s_kv if "cute-dsl" in backends else 0

q_full = torch.randn(
front_pad_q + cumsum_s_qo,
num_qo_heads,
head_dim_qk,
device=device,
dtype=q_init_dtype,
)
q = q_full[front_pad_q:]
if args.verbose >= 2:
print(f"[VVERBOSE] {q.shape = }")

k = torch.randn(
cumsum_s_kv, num_kv_heads, head_dim_qk, device=device, dtype=kv_init_dtype
k_full = torch.randn(
front_pad_kv + cumsum_s_kv,
num_kv_heads,
head_dim_qk,
device=device,
dtype=kv_init_dtype,
)
v = torch.randn(
cumsum_s_kv, num_kv_heads, head_dim_vo, device=device, dtype=kv_init_dtype
k = k_full[front_pad_kv:]
v_full = torch.randn(
front_pad_kv + cumsum_s_kv,
num_kv_heads,
head_dim_vo,
device=device,
dtype=kv_init_dtype,
)
v = v_full[front_pad_kv:]

block_tables = None

Expand Down Expand Up @@ -1751,13 +1773,13 @@ def testBatchPrefillWithRaggedKVCacheWrapper(args):
# Prepare wrappers
backend_wrappers = {}
for backend in backends:
if backend in ["cutlass", "fa2", "fa3", "trtllm-gen"]:
if backend in ["cutlass", "fa2", "fa3", "trtllm-gen", "cute-dsl"]:
backend_wrappers[backend] = (
flashinfer.prefill.BatchPrefillWithRaggedKVCacheWrapper(
workspace_buffer,
"NHD",
use_cuda_graph=is_cuda_graph_compatible
if backend != "fa2"
if backend not in ["fa2"]
else False,
qo_indptr_buf=qo_indptr,
kv_indptr_buf=kv_indptr,
Expand Down Expand Up @@ -1843,6 +1865,8 @@ def run_backend_wrapper(
):
if backend in ["cutlass", "fa2", "fa3", "trtllm-gen"]:
return backend_wrappers[backend].run_return_lse(q, k, v)[0]
elif backend == "cute-dsl":
return backend_wrappers[backend].run(q, k, v)
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated
elif backend == "cudnn":
# cuDNN uses wrapper API
return backend_wrappers[backend].run(q, k, v)
Expand Down Expand Up @@ -1933,7 +1957,7 @@ def run_backend_wrapper(
repeat_iters=args.num_iters,
sleep_after_run=True,
enable_cupti=args.use_cupti,
use_cuda_graph=(is_cuda_graph_compatible and cur_backend != "fa2"),
use_cuda_graph=(is_cuda_graph_compatible and cur_backend not in ["fa2"]),
cold_l2_cache=True,
input_args=(
cur_backend,
Expand Down
20 changes: 18 additions & 2 deletions benchmarks/routines/flashinfer_benchmark_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -322,8 +322,24 @@ def dtype_str_to_torch_dtype(dtype_str):
"8.6": ["fa2", "cudnn", "cudnn-native"],
"8.9": ["fa2", "cudnn", "cudnn-native"],
"9.0": ["fa2", "fa3", "cudnn", "cudnn-native"],
"10.0": ["fa2", "cudnn", "cudnn-native", "cutlass", "trtllm-native"],
"10.3": ["fa2", "cudnn", "cudnn-native", "cutlass", "trtllm-native"],
"10.0": [
"fa2",
"fa3",
"cudnn",
"cudnn-native",
"cutlass",
"cute-dsl",
"trtllm-native",
],
"10.3": [
"fa2",
"fa3",
"cudnn",
"cudnn-native",
"cutlass",
"cute-dsl",
"trtllm-native",
],
"12.0": ["fa2", "cudnn", "cudnn-native"],
"12.1": ["fa2", "cudnn", "cudnn-native"],
},
Expand Down
19 changes: 19 additions & 0 deletions flashinfer/attention_dsl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# Copyright (c) 2025 by FlashInfer team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
FlashInfer Attention DSL Module
================================

CuTe DSL attention kernel implementations (cubin distribution).

This package holds attention kernels authored in the CuTe DSL and shipped
as pre-compiled cubin artifacts; the FMHA entry points live in the
``cute_dsl`` subpackage.
"""
39 changes: 39 additions & 0 deletions flashinfer/attention_dsl/cute_dsl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright (c) 2025 by FlashInfer team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
CuTe DSL Attention Kernels (Cubin Distribution)
================================================

Pre-compiled FMHA kernels loaded via ExternalBinaryModule.
"""

from flashinfer.cute_dsl.utils import is_cute_dsl_available

# The availability probe is always exported; the FMHA entry points are
# exported only when the CuTe DSL runtime is installed, so that importing
# this package never hard-fails on the optional dependency.
__all__ = ["is_cute_dsl_available"]

if is_cute_dsl_available():
    from .fmha import (
        get_cute_dsl_fmha_kernel,
        cute_dsl_fmha_prefill,
        cute_dsl_fmha_ragged_prefill,
    )

    __all__ = __all__ + [
        "get_cute_dsl_fmha_kernel",
        "cute_dsl_fmha_prefill",
        "cute_dsl_fmha_ragged_prefill",
    ]
Loading
Loading