Dao-AILab
diff --git a/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 0 deletions b/‎.pre-commit-config.yaml‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎benchmarks/benchmark_attn.py‎
Lines changed: 146 additions & 29 deletions b/‎benchmarks/benchmark_attn.py‎
Lines changed: 146 additions & 29 deletions
diff --git a/‎flash_attn/cute/bench_utils.py‎
Lines changed: 40 additions & 2 deletions b/‎flash_attn/cute/bench_utils.py‎
Lines changed: 40 additions & 2 deletions
diff --git a/‎flash_attn/cute/cute_dsl_utils.py‎
Lines changed: 35 additions & 0 deletions b/‎flash_attn/cute/cute_dsl_utils.py‎
Lines changed: 35 additions & 0 deletions
@@ -10,6 +10,7 @@ repos:
             flash_bwd|
             flash_fwd|
             flash_fwd_sm100|
+            flash_fwd_mla_sm100|
             interface|
           )\.py$
       - id: ruff-format
 
@@ -13,7 +13,15 @@
 
 
 def flops(
-    batch, nheads, seqlen_q, seqlen_k, headdim, headdim_v, causal=False, window_size=(None, None)
+    batch,
+    nheads,
+    seqlen_q,
+    seqlen_k,
+    headdim,
+    headdim_v,
+    causal=False,
+    window_size=(None, None),
+    has_qv=False,
 ):
     if causal:
         avg_seqlen = (max(0, seqlen_k - seqlen_q) + seqlen_k) / 2
@@ -35,7 +43,37 @@ def flops(
                 else torch.full_like(row_idx, seqlen_k - 1)
             )
             avg_seqlen = (col_right - col_left + 1).float().mean().item()
-    return batch * nheads * 2 * seqlen_q * avg_seqlen * (headdim + headdim_v)
+    eff_headdim = headdim + headdim_v if has_qv else headdim
+    return batch * nheads * 2 * seqlen_q * avg_seqlen * (eff_headdim + headdim_v)
+
+
+# ── Bandwidth calculation ────────────────────────────────────────────────────
+
+
+def bandwidth_fwd_bytes(
+    batch, nheads, nheads_kv, seqlen_q, seqlen_k, headdim, headdim_v, dtype_bytes=2, has_qv=False
+):
+    """Estimate the bytes moved by one forward attention pass.
+
+    Adds up the element counts of Q, K and V (read) and O (written), plus an extra
+    ``headdim_v``-wide Qv operand when ``has_qv`` is set, then scales the total by
+    ``dtype_bytes``. Q, Qv and O are counted per query head (``nheads``); K and V are
+    counted per KV head (``nheads_kv``).
+    """
+    query_rows = batch * nheads * seqlen_q
+    kv_rows = batch * nheads_kv * seqlen_k
+
+    q_elems = query_rows * headdim
+    if has_qv:
+        qv_elems = query_rows * headdim_v
+    else:
+        qv_elems = 0
+    k_elems = kv_rows * headdim
+    v_elems = kv_rows * headdim_v
+    out_elems = query_rows * headdim_v
+
+    total_elems = q_elems + qv_elems + k_elems + v_elems + out_elems
+    return total_elems * dtype_bytes
+
+
+def bandwidth_bwd_bytes(
+    batch, nheads, nheads_kv, seqlen_q, seqlen_k, headdim, headdim_v, dtype_bytes=2
+):
+    """Estimate the bytes moved by one backward attention pass.
+
+    Counts the elements of Q, K, V and dO (read) and of dQ, dK and dV (written), where
+    each gradient has the same element count as the tensor it belongs to, then scales
+    the total by ``dtype_bytes``. Q and dO are counted per query head (``nheads``);
+    K and V are counted per KV head (``nheads_kv``).
+    """
+    query_rows = batch * nheads * seqlen_q
+    kv_rows = batch * nheads_kv * seqlen_k
+
+    q_elems = query_rows * headdim
+    k_elems = kv_rows * headdim
+    v_elems = kv_rows * headdim_v
+    dout_elems = query_rows * headdim_v
+
+    read_elems = q_elems + k_elems + v_elems + dout_elems
+    # dQ, dK and dV are written with the same sizes as Q, K and V.
+    total_elems = read_elems + q_elems + k_elems + v_elems
+    return total_elems * dtype_bytes
 
 
 # ── Reference attention ─────────────────────────────────────────────────────
 
@@ -104,3 +104,38 @@ def get_broadcast_dims(tensor: torch.Tensor) -> Tuple[bool, ...]:
     patterns are not interchangeable.
     """
     return tuple(s == 0 for s in tensor.stride())
+
+
+# credit: monellz (https://github.com/NVIDIA/cutlass/issues/2658#issuecomment-3630564264)
+def dump_kernel_attributes(compiled_kernel):
+    """Print the local-memory size and register count of a compiled kernel.
+
+    Loads the CUBIN stored on ``compiled_kernel.artifacts.CUBIN`` as a CUDA library,
+    queries the first kernel that library enumerates, and prints the two attributes.
+    The library is unloaded again before returning so repeated calls do not accumulate
+    loaded modules.
+
+    Raises:
+        AssertionError: if the compiled kernel carries no CUBIN.
+    """
+    from cuda.bindings import driver
+    from cutlass.utils import HardwareInfo
+    import torch
+
+    device_id = torch.cuda.current_device()
+    hardware_info = HardwareInfo(device_id=device_id)
+    cubin_data = compiled_kernel.artifacts.CUBIN
+    assert cubin_data is not None, "cubin_data is None, need '--keep-cubin' option when compiling"
+    cuda_library = hardware_info._checkCudaErrors(
+        driver.cuLibraryLoadData(cubin_data, None, None, 0, None, None, 0)
+    )
+    try:
+        # Only the first enumerated kernel is inspected.
+        kernels = hardware_info._checkCudaErrors(
+            driver.cuLibraryEnumerateKernels(1, cuda_library)
+        )
+        kernel = hardware_info._checkCudaErrors(driver.cuKernelGetFunction(kernels[0]))
+        # more metrics: https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__EXEC.html#group__CUDA__EXEC_1g5e92a1b0d8d1b82cb00dcfb2de15961b
+        local_size_bytes = hardware_info._checkCudaErrors(
+            driver.cuFuncGetAttribute(
+                driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_LOCAL_SIZE_BYTES,
+                kernel,
+            )
+        )
+        num_regs = hardware_info._checkCudaErrors(
+            driver.cuFuncGetAttribute(
+                driver.CUfunction_attribute.CU_FUNC_ATTRIBUTE_NUM_REGS,
+                kernel,
+            )
+        )
+    finally:
+        # Release the library on every path; the kernel handles obtained from it are
+        # not used past this point.
+        hardware_info._checkCudaErrors(driver.cuLibraryUnload(cuda_library))
+
+    print("--- Kernel Info ---")
+    print(f"local_size_bytes: {local_size_bytes}")
+    print(f"num_regs: {num_regs}")
+    print("--- End Kernel Info ---")