
Commit bb873d2

limin2021, claude, and yzh119 authored
feat: Integrate CuTe DSL FMHA prefill kernels by loading cubin (#3039)
## 📌 Description

feat: Integrate CuTe DSL FMHA cubin kernels into FlashInfer prefill backend

**Summary**

- Integrate pre-compiled CuTe DSL FMHA kernels (Blackwell SM100/SM103/SM110) into FlashInfer's prefill attention backend.
- Load AOT-compiled .so cubins from the NVIDIA artifactory at runtime; no JIT compilation is needed.
- Route through the trtllm_ragged_attention_deepseek() API with backend="cute-dsl".

**Key features**

- Dtype support: FP16, BF16, and FP8 (E4M3) input, with mixed-precision output (E4M3→BF16).
- Head dimensions: 32, 64, 128, 192 (192 for FP8 only).
- Varlen ragged prefill: variable-length sequences via cumulative-seqlen tensors.
- TVM-FFI ABI: all variants use TVM-FFI for kernel invocation.
- Skip-softmax sparsity: optional skip-softmax optimization for sparse attention.
- LSE output: optional log-sum-exp output for numerically stable multi-pass attention.
- Causal & non-causal masking: both modes supported (all varlen variants use non-persistent scheduling).
- Multi-arch cubin loading: per-CPU-arch (x86_64/aarch64) and per-SM-arch artifact paths.
- Checksum verification: SHA256 integrity check on downloaded .so files.

**Files changed**

- flashinfer/attention_dsl/cute_dsl/fmha.py — kernel loading, variant selection, ragged prefill entry point
- flashinfer/artifacts.py — artifact paths and checksums for DSL FMHA (x86_64 + aarch64 layout)
- flashinfer/prefill.py — trtllm_ragged_attention_deepseek() cute-dsl backend integration

**Test plan**

- `test_trtllm_gen_attention.py::test_trtllm_gen_prefill -k "cute-dsl"` passes.
- Benchmark via `bench_cute_dsl_ragged.sh` on target hardware.
- Verify cubin download and checksum verification on a clean install.

**Performance**

Setup: B200 (sm_100a), causal, H_q=H_k=128, measured with the FlashInfer benchmark harness (CUDA Graph, CUPTI).

FP8 e4m3 (D=192):

| Shape (B×S_q×S_kv) | cute-dsl (ms) | trtllm-native (ms) | TFLOPS (dsl/native) | Speedup |
|--------------------|---------------|--------------------|---------------------|---------|
| 1×8K×8K | 1.521 | 1.619 | 1808 / 1698 | **+6.4%** |
| 1×8K×32K | 8.466 | 9.451 | 2273 / 2036 | **+11.6%** |
| 1×8K×64K | 17.796 | 19.869 | 2317 / 2075 | **+11.7%** |
| 4×512×82K | 6.397 | 7.286 | 2142 / 1880 | **+13.9%** |
| 4×1K×82K | 12.285 | 13.834 | 2224 / 1975 | **+12.6%** |

FP8 e4m3 (D=128):

| Shape (B×S_q×S_kv) | cute-dsl (ms) | trtllm-native (ms) | TFLOPS (dsl/native) | Speedup |
|--------------------|---------------|--------------------|---------------------|---------|
| 1×8K×8K | 1.484 | 1.560 | 1481 / 1410 | **+5.1%** |
| 1×8K×32K | 7.666 | 8.998 | 2008 / 1711 | **+17.4%** |
| 1×8K×64K | 16.074 | 18.606 | 2052 / 1773 | **+15.8%** |
| 4×512×82K | 5.735 | 6.460 | 1911 / 1697 | **+12.6%** |
| 4×1K×82K | 11.066 | 12.451 | 1975 / 1755 | **+12.5%** |

BF16 (D=128):

| Shape (B×S_q×S_kv) | cute-dsl (ms) | trtllm-native (ms) | TFLOPS (dsl/native) | Speedup |
|--------------------|---------------|--------------------|---------------------|---------|
| 1×8K×8K | 1.737 | 1.764 | 1266 / 1247 | **+1.6%** |
| 1×8K×32K | 10.094 | 10.992 | 1525 / 1400 | **+8.9%** |
| 1×8K×64K | 21.745 | 23.000 | 1517 / 1434 | **+5.8%** |
| 4×512×82K | 8.457 | 8.513 | 1296 / 1288 | **+0.7%** |
| 4×1K×82K | 15.773 | 16.052 | 1385 / 1361 | **+1.8%** |

**TODO**

1. Support scalar as tensor dtype.
2. Support PDL (programmatic dependent launch).
3. Remove front-padding for q/k/v/o tensors.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes

---

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Zihao Ye <expye@outlook.com>
1 parent a265b4e commit bb873d2

10 files changed

Lines changed: 1096 additions & 66 deletions
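Before the per-file diffs, here is a minimal end-to-end sketch of driving the new backend through `trtllm_ragged_attention_deepseek`. The argument list mirrors the benchmark integration in `benchmarks/routines/attention.py` below; the concrete shapes, the workspace size, and the `result[0]` unpacking are illustrative assumptions, not a documented contract.

```python
import torch

import flashinfer

device = "cuda"  # requires a Blackwell GPU (SM100/SM103/SM110)
dtype = torch.bfloat16

# Hypothetical sizes for illustration; supported head dims are 32/64/128/192.
batch_size, max_q_len, max_kv_len = 2, 512, 2048
num_heads, head_dim = 16, 128

# Per-request lengths and the cumulative-seqlen (indptr) tensors that
# delimit each request's rows in the ragged (packed) layout.
seq_lens_q = torch.full((batch_size,), max_q_len, dtype=torch.int32, device=device)
seq_lens_kv = torch.full((batch_size,), max_kv_len, dtype=torch.int32, device=device)
zero = torch.zeros(1, dtype=torch.int32, device=device)
cum_seq_lens_q = torch.cat([zero, torch.cumsum(seq_lens_q, 0).int()])
cum_seq_lens_kv = torch.cat([zero, torch.cumsum(seq_lens_kv, 0).int()])
total_q, total_kv = int(cum_seq_lens_q[-1]), int(cum_seq_lens_kv[-1])

# Front-pad Q/K/V/O by max_s rows: the varlen kernel applies a negative
# pointer offset (-max_s * H * D), so valid memory must exist before the
# data start (TODO item 3 above aims to remove this requirement).
q = torch.randn(max_q_len + total_q, num_heads, head_dim, dtype=dtype, device=device)[max_q_len:]
k = torch.randn(max_kv_len + total_kv, num_heads, head_dim, dtype=dtype, device=device)[max_kv_len:]
v = torch.randn(max_kv_len + total_kv, num_heads, head_dim, dtype=dtype, device=device)[max_kv_len:]
out = torch.empty(max_q_len + total_q, num_heads, head_dim, dtype=dtype, device=device)[max_q_len:]

workspace = torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device=device)  # size is a guess

result = flashinfer.prefill.trtllm_ragged_attention_deepseek(
    query=q,
    key=k,
    value=v,
    workspace_buffer=workspace,
    seq_lens=seq_lens_kv,
    max_q_len=max_q_len,
    max_kv_len=max_kv_len,
    bmm1_scale=head_dim**-0.5,  # q_scale * k_scale * softmax scale
    bmm2_scale=1.0,  # v_scale
    o_sf_scale=-1,
    batch_size=batch_size,
    window_left=-1,
    cum_seq_lens_q=cum_seq_lens_q,
    cum_seq_lens_kv=cum_seq_lens_kv,
    enable_pdl=False,
    is_causal=True,
    return_lse=True,
    out=out,
    backend="cute-dsl",
)
attn_out = result[0]  # the benchmark also indexes [0] for the attention output
```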


benchmarks/routines/attention.py

Lines changed: 60 additions & 9 deletions
```diff
@@ -1646,18 +1646,40 @@ def testBatchPrefillWithRaggedKVCacheWrapper(args):

     cumsum_s_qo = torch.sum(actual_seq_lens_q)
     cumsum_s_kv = torch.sum(actual_seq_lens_kv)
-    q = torch.randn(
-        cumsum_s_qo, num_qo_heads, head_dim_qk, device=device, dtype=q_init_dtype
+
+    # Front-padding for cute-dsl varlen kernel: the persistent varlen kernel
+    # applies a negative pointer offset (-max_s * H * D), so there must be
+    # valid GPU memory before the data start.
+    front_pad_q = s_qo if "cute-dsl" in backends else 0
+    front_pad_kv = s_kv if "cute-dsl" in backends else 0
+
+    q_full = torch.randn(
+        front_pad_q + cumsum_s_qo,
+        num_qo_heads,
+        head_dim_qk,
+        device=device,
+        dtype=q_init_dtype,
     )
+    q = q_full[front_pad_q:]
     if args.verbose >= 2:
         print(f"[VVERBOSE] {q.shape = }")

-    k = torch.randn(
-        cumsum_s_kv, num_kv_heads, head_dim_qk, device=device, dtype=kv_init_dtype
+    k_full = torch.randn(
+        front_pad_kv + cumsum_s_kv,
+        num_kv_heads,
+        head_dim_qk,
+        device=device,
+        dtype=kv_init_dtype,
     )
-    v = torch.randn(
-        cumsum_s_kv, num_kv_heads, head_dim_vo, device=device, dtype=kv_init_dtype
+    k = k_full[front_pad_kv:]
+    v_full = torch.randn(
+        front_pad_kv + cumsum_s_kv,
+        num_kv_heads,
+        head_dim_vo,
+        device=device,
+        dtype=kv_init_dtype,
     )
+    v = v_full[front_pad_kv:]

     block_tables = None

@@ -1815,14 +1837,18 @@ def testBatchPrefillWithRaggedKVCacheWrapper(args):
         v = (v / v_scale).to(kv_dtype)

     trtllm_out = None
-    if "trtllm-native" in backends:
-        trtllm_out = torch.empty(
-            q.shape[0],
+    if "trtllm-native" in backends or "cute-dsl" in backends:
+        # cute-dsl varlen kernel uses negative pointer offsets on output,
+        # so front-pad like Q/K/V.
+        out_pad = front_pad_q if "cute-dsl" in backends else 0
+        trtllm_out_full = torch.empty(
+            out_pad + q.shape[0],
             q.shape[1],
             v.shape[2],
             device=q.device,
             dtype=out_dtype,
         )
+        trtllm_out = trtllm_out_full[out_pad:]

    def run_backend_wrapper(
        backend,
@@ -1843,6 +1869,31 @@ def run_backend_wrapper(
    ):
        if backend in ["cutlass", "fa2", "fa3", "trtllm-gen"]:
            return backend_wrappers[backend].run_return_lse(q, k, v)[0]
+        elif backend == "cute-dsl":
+            _q_scale = q_scale if q_scale is not None else 1.0
+            _k_scale = k_scale if k_scale is not None else 1.0
+            _v_scale = v_scale if v_scale is not None else 1.0
+            return flashinfer.prefill.trtllm_ragged_attention_deepseek(
+                query=q,
+                key=k,
+                value=v,
+                workspace_buffer=workspace_buffer,
+                seq_lens=actual_seq_lens_kv_device,
+                max_q_len=s_qo,
+                max_kv_len=s_kv,
+                bmm1_scale=_q_scale * _k_scale * scale,
+                bmm2_scale=_v_scale,
+                o_sf_scale=-1,
+                batch_size=batch_size,
+                window_left=-1,
+                cum_seq_lens_q=qo_indptr,
+                cum_seq_lens_kv=kv_indptr,
+                enable_pdl=False,
+                is_causal=causal,
+                return_lse=True,
+                out=trtllm_out,
+                backend="cute-dsl",
+            )[0]
        elif backend == "cudnn":
            # cuDNN uses wrapper API
            return backend_wrappers[backend].run(q, k, v)
```
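The `q_full[front_pad_q:]` slices above work because a PyTorch slice is a view: it shares the parent's storage, so the padding rows stay allocated even though only the tail is handed to the kernel. A small CPU-only sketch of the invariant (names are illustrative):

```python
import torch

max_s, num_heads, head_dim = 16, 8, 64
full = torch.randn(max_s + 100, num_heads, head_dim)
view = full[max_s:]  # a view, not a copy: rows 0..max_s-1 remain valid memory

offset = view.data_ptr() - full.data_ptr()
assert offset == max_s * num_heads * head_dim * full.element_size()
# A kernel can therefore subtract up to max_s * H * D elements from the data
# pointer without reading unallocated memory.
```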

benchmarks/routines/flashinfer_benchmark_utils.py

Lines changed: 16 additions & 2 deletions
```diff
@@ -335,8 +335,22 @@ def dtype_str_to_torch_dtype(dtype_str):
         "8.6": ["fa2", "cudnn", "cudnn-native"],
         "8.9": ["fa2", "cudnn", "cudnn-native"],
         "9.0": ["fa2", "fa3", "cudnn", "cudnn-native"],
-        "10.0": ["fa2", "cudnn", "cudnn-native", "cutlass", "trtllm-native"],
-        "10.3": ["fa2", "cudnn", "cudnn-native", "cutlass", "trtllm-native"],
+        "10.0": [
+            "fa2",
+            "cudnn",
+            "cudnn-native",
+            "cutlass",
+            "cute-dsl",
+            "trtllm-native",
+        ],
+        "10.3": [
+            "fa2",
+            "cudnn",
+            "cudnn-native",
+            "cutlass",
+            "cute-dsl",
+            "trtllm-native",
+        ],
         "12.0": ["fa2", "cudnn", "cudnn-native"],
         "12.1": ["fa2", "cudnn", "cudnn-native"],
     },
```
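The table above keys supported-backend lists by compute capability strings such as "10.0". Below is a sketch of how such a table can be consulted at runtime; the helper name is hypothetical and the harness's actual lookup may differ:

```python
import torch

def supported_backends(table: dict[str, list[str]]) -> list[str]:
    # torch.cuda.get_device_capability() returns e.g. (10, 0) on B200,
    # matching the "10.0" key in the table above.
    major, minor = torch.cuda.get_device_capability()
    return table.get(f"{major}.{minor}", [])
```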

flashinfer/artifacts.py

Lines changed: 39 additions & 0 deletions
```diff
@@ -145,6 +145,8 @@ class ArtifactPath:
     CUDNN_SDPA: str = "a72d85b019dc125b9f711300cb989430f762f5a6/fmha/cudnn/"
     # For DEEPGEMM, we also need to update KernelMap.KERNEL_MAP_HASH in flashinfer/deep_gemm.py
     DEEPGEMM: str = "a72d85b019dc125b9f711300cb989430f762f5a6/deep-gemm/"
+    DSL_FMHA: str = "c770c91cb0d991b7828fc85d2253a62f0d356b6c/fmha/cute-dsl/"
+    DSL_FMHA_ARCHS: tuple[str, ...] = ("sm_100a", "sm_103a", "sm_110a")


 class CheckSumHash:
@@ -164,11 +166,32 @@ class CheckSumHash:
     TRTLLM_GEN_GEMM: str = (
         "64b7114a429ea153528dd4d4b0299363d7320964789eb5efaefec66f301523c7"
     )
+    # SHA256 of the checksums.txt manifest file per cpu-arch/sm-arch,
+    # NOT hashes of individual kernel .so files.
+    DSL_FMHA_CHECKSUMS: dict[str, dict[str, str]] = {
+        "x86_64": {
+            "sm_100a": "9533536698cdc256d897fffb3114de317076654ff8630ff283d850cc3dc96d86",
+            "sm_103a": "927e1954f1d45b0ee876f139084e4facdfcc87e86f4d30cb92d5c33698d4c2d6",
+            "sm_110a": "277b1dceaab2081e3def37cf997280a3f2c3ac515d22b80be141253c0278b8b5",
+        },
+        "aarch64": {
+            "sm_100a": "b48ed0bcc9bad4afd33e0784c8c9eb9e13e782afe197816b1d0747b11759493e",
+            "sm_103a": "bace619a560f3ce52ad6ba105fffb8ea8629fe57885a90892c9e15a7122467e1",
+            "sm_110a": "d8369bcfa443bfd791cd014e3b030d378f00a975db8278eebd5b2fb529e3257d",
+        },
+    }
     map_checksums: dict[str, str] = {
         safe_urljoin(ArtifactPath.TRTLLM_GEN_FMHA, "checksums.txt"): TRTLLM_GEN_FMHA,
         safe_urljoin(ArtifactPath.TRTLLM_GEN_BMM, "checksums.txt"): TRTLLM_GEN_BMM,
         safe_urljoin(ArtifactPath.DEEPGEMM, "checksums.txt"): DEEPGEMM,
         safe_urljoin(ArtifactPath.TRTLLM_GEN_GEMM, "checksums.txt"): TRTLLM_GEN_GEMM,
+        **{
+            safe_urljoin(
+                ArtifactPath.DSL_FMHA, f"{cpu_arch}/{sm_arch}/checksums.txt"
+            ): sha
+            for cpu_arch, sm_checksums in DSL_FMHA_CHECKSUMS.items()
+            for sm_arch, sha in sm_checksums.items()
+        },
     }


@@ -191,14 +214,30 @@ def get_checksums(subdirs):
     return checksums


+def _get_host_cpu_arch() -> str:
+    """Return CPU architecture string matching artifactory layout."""
+    import platform
+
+    machine = platform.machine()
+    if machine in ("aarch64", "arm64"):
+        return "aarch64"
+    return "x86_64"
+
+
 def get_subdir_file_list() -> Generator[tuple[str, str], None, None]:
     base = FLASHINFER_CUBINS_REPOSITORY
+    cpu_arch = _get_host_cpu_arch()

     cubin_dirs = [
         ArtifactPath.TRTLLM_GEN_FMHA,
         ArtifactPath.TRTLLM_GEN_BMM,
         ArtifactPath.TRTLLM_GEN_GEMM,
         ArtifactPath.DEEPGEMM,
+        # DSL FMHA: per cpu-arch and sm-arch subdirectories
+        *(
+            safe_urljoin(ArtifactPath.DSL_FMHA, f"{cpu_arch}/{arch}/")
+            for arch in ArtifactPath.DSL_FMHA_ARCHS
+        ),
     ]

     # Get checksums of all files
```
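Note that the pinned hashes cover the per-arch `checksums.txt` manifests, not the individual kernel `.so` files. Here is a minimal sketch of the SHA256 check these hashes enable; the download logic itself is outside this diff, and the local `path` here is assumed to exist:

```python
import hashlib

def sha256_of(path: str) -> str:
    """Stream a file through SHA256 in 1 MiB chunks."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    return h.hexdigest()

expected = CheckSumHash.DSL_FMHA_CHECKSUMS["x86_64"]["sm_100a"]
if sha256_of("checksums.txt") != expected:
    raise RuntimeError("DSL FMHA manifest failed integrity check")
```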

flashinfer/attention/__init__.py

Lines changed: 23 additions & 0 deletions
```diff
@@ -0,0 +1,23 @@
+"""
+Copyright (c) 2025 by FlashInfer team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from ._core import *  # noqa: F401,F403
+from ._core import BatchAttention, BatchAttentionWithAttentionSinkWrapper
+
+__all__ = [
+    "BatchAttention",
+    "BatchAttentionWithAttentionSinkWrapper",
+]
```
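Re-exporting the two wrappers explicitly alongside the wildcard keeps the public import path stable after the move into a package; consumer code is unchanged:

```python
from flashinfer.attention import (
    BatchAttention,
    BatchAttentionWithAttentionSinkWrapper,
)
```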
Lines changed: 6 additions & 6 deletions
```diff
@@ -20,19 +20,19 @@

 import torch

-from .api_logging import flashinfer_api
-from .jit import gen_batch_attention_module
-from .utils import (
+from ..api_logging import flashinfer_api
+from ..jit import gen_batch_attention_module
+from ..utils import (
     MaskMode,
     PosEncodingMode,
     TensorLayout,
     _check_kv_layout,
     _unpack_paged_kv_cache,
     determine_attention_backend,
 )
-from .prefill import BatchPrefillWithPagedKVCacheWrapper
-from .jit.attention.variants import attention_sink_decl
-from .jit.utils import filename_safe_dtype_map
+from ..prefill import BatchPrefillWithPagedKVCacheWrapper
+from ..jit.attention.variants import attention_sink_decl
+from ..jit.utils import filename_safe_dtype_map


 @functools.cache
```
Lines changed: 37 additions & 0 deletions
```diff
@@ -0,0 +1,37 @@
+# Copyright (c) 2026 by FlashInfer team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CuTe DSL Attention Kernels (Cubin Distribution)
+================================================
+
+Pre-compiled FMHA kernels loaded via ExternalBinaryModule.
+"""
+
+from flashinfer.cute_dsl.utils import is_cute_dsl_available
+
+if is_cute_dsl_available():
+    from .fmha import (
+        get_cute_dsl_fmha_kernel,
+        cute_dsl_fmha_ragged_prefill,
+    )
+
+    __all__ = [
+        "is_cute_dsl_available",
+        "get_cute_dsl_fmha_kernel",
+        "cute_dsl_fmha_ragged_prefill",
+    ]
+else:
+    __all__ = [
+        "is_cute_dsl_available",
+    ]
```
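Since `__all__` only exposes the kernel entry points when CuTe DSL is importable, consumers should guard on `is_cute_dsl_available()` before touching them. A sketch, assuming the module path `flashinfer.attention_dsl.cute_dsl` given in the Files changed list above:

```python
from flashinfer.attention_dsl import cute_dsl

if cute_dsl.is_cute_dsl_available():
    kernel_fn = cute_dsl.cute_dsl_fmha_ragged_prefill  # defined only when CuTe DSL is present
else:
    kernel_fn = None  # fall back to another prefill backend
```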
