Commit f84ac1c

fix: Fix memory bandwidth calculation in MLA benchmarks (#2479)
## 📌 Description

Summary

* Fixed incorrect memory bandwidth calculation in `testBatchMLAPagedAttentionWrapper` that was using full tensor allocations instead of the actual bytes accessed based on sequence lengths
* Updated `bench_trtllm_gen_mla.py` to use the unified `bench_gpu_time()` utility with CUPTI for consistent timing with the benchmark framework

cc @hypdeb

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes

## Summary by CodeRabbit

* **Chores**
  * Improved benchmarking: switched to CUDA/CUPTI-based timing with refined iteration controls (dry-run and repeat by iterations) and optional CUDA graph support.
  * Updated performance reporting to use explicit memory accounting from actual token usage (query, KV, output), and adjusted bandwidth and FLOPs printouts for clearer, more accurate throughput metrics.
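To make the fix concrete outside the diff context, here is a minimal sketch of the accounting change with made-up shapes (the sizes and `seq_lens` below are illustrative assumptions, not the benchmark's configuration): with a padded, paged KV cache, counting allocated bytes overstates traffic, because only the tokens covered by each request's sequence length are actually read.

```python
import torch

# Illustrative sizes only -- assumptions, not the benchmark's real configuration.
batch_size, max_seq_len, head_dim = 4, 8192, 576
seq_lens = [1024, 2048, 512, 4096]  # tokens actually attended per request
kv_cache = torch.zeros(batch_size, max_seq_len, head_dim, dtype=torch.bfloat16)

# Old-style accounting: every allocated byte, padding included.
alloc_bytes = kv_cache.numel() * kv_cache.element_size()

# Fixed accounting: only the tokens the kernel actually touches.
accessed_bytes = sum(seq_lens) * head_dim * kv_cache.element_size()

print(f"allocated: {alloc_bytes / 1e6:.1f} MB, accessed: {accessed_bytes / 1e6:.1f} MB")
```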
1 parent 6ae5bfe commit f84ac1c

2 files changed

Lines changed: 43 additions & 24 deletions

benchmarks/bench_trtllm_gen_mla.py

Lines changed: 25 additions & 12 deletions
```diff
@@ -2,7 +2,7 @@
 import torch
 
 import flashinfer
-from flashinfer.testing.utils import bench_gpu_time_with_cudagraph
+from flashinfer.testing.utils import bench_gpu_time
 
 num_q_heads = 128
 qk_nope_head_dim = 128
@@ -83,7 +83,7 @@ def bench_trtllm_mla(batch_size, q_len_per_request, seq_len, page_size, dtype):
         bmm2_scale=1.0,
     )
     # benchmark
-    measurements = bench_gpu_time_with_cudagraph(
+    measurements = bench_gpu_time(
         lambda: flashinfer.decode.trtllm_batch_decode_with_kv_cache_mla(
             query=query,
             kv_cache=kv_cache.unsqueeze(1),
@@ -97,27 +97,40 @@ def bench_trtllm_mla(batch_size, q_len_per_request, seq_len, page_size, dtype):
             bmm1_scale=1.0 / ((128 + 64) ** 0.5),
             bmm2_scale=1.0,
         ),
-        dry_run_time_ms=100,
-        repeat_time_ms=1000,
-    )
-    io = (
-        query.numel() * query.element_size()
-        + kv_cache.numel() * kv_cache.element_size()
+        dry_run_iters=5,
+        repeat_iters=30,
+        enable_cupti=False,
+        use_cuda_graph=True,
+        cold_l2_cache=True,
     )
     ms = np.median(measurements)
+
+    # Memory bandwidth calculation based on actual bytes accessed
+    elem_size = query.element_size()
+    # Query bytes: batch_size * q_len_per_request * num_heads * head_dim
+    q_mem_bytes = query.numel() * elem_size
+    # KV cache bytes: actual tokens accessed (sum of seq_lens), not full allocation
+    actual_kv_tokens = sum(seq_lens)
+    kv_mem_bytes = actual_kv_tokens * (kv_lora_rank + qk_rope_head_dim) * elem_size
+    # Output bytes: batch_size * q_len_per_request * num_heads * kv_lora_rank
+    o_mem_bytes = (
+        batch_size * q_len_per_request * num_q_heads * kv_lora_rank * elem_size
+    )
+    total_mem_bytes = q_mem_bytes + kv_mem_bytes + o_mem_bytes
+
     flops = (
         2
         * num_q_heads
         * (2 * kv_lora_rank + qk_rope_head_dim)
-        * sum(seq_lens)
+        * actual_kv_tokens
         * q_len_per_request
     )
     print(
         f"batch_size={batch_size}, q_len_per_request={q_len_per_request}, seq_len={seq_len}, num_q_heads={num_q_heads}, qk_nope_head_dim={qk_nope_head_dim}, qk_rope_head_dim={qk_rope_head_dim}, kv_lora_rank={kv_lora_rank}, page_size={page_size}"
     )
-    print(f"execution time: {ms} ms")
-    print(f"memory bandwidth: {io / ms / 1024 / 1024:.2f} GB/s")
-    print(f"FLOPs: {flops * 1e-9 / ms:.2f} TFLOPs/s")
+    print(f"execution time: {ms:.4f} ms")
+    print(f"memory bandwidth: {total_mem_bytes / ms / 1e6:.2f} GB/s")
+    print(f"FLOPs: {flops / ms / 1e9:.2f} TFLOPs/s")
 
 
 if __name__ == "__main__":
```
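As a sanity check on the updated printouts, the snippet below reruns the same arithmetic standalone with hypothetical inputs (the values of `ms`, `seq_lens`, and the dimensions are assumptions, not measurements), assuming the query head dimension is `kv_lora_rank + qk_rope_head_dim`:

```python
# Hypothetical inputs -- assumptions for illustration, not measured data.
num_q_heads, kv_lora_rank, qk_rope_head_dim = 128, 512, 64
batch_size, q_len_per_request = 16, 1
seq_lens = [4096] * batch_size
elem_size = 2  # bytes per element, e.g. fp16/bf16
ms = 0.25      # assumed median kernel time in milliseconds

q_mem_bytes = (
    batch_size
    * q_len_per_request
    * num_q_heads
    * (kv_lora_rank + qk_rope_head_dim)
    * elem_size
)
actual_kv_tokens = sum(seq_lens)
kv_mem_bytes = actual_kv_tokens * (kv_lora_rank + qk_rope_head_dim) * elem_size
o_mem_bytes = batch_size * q_len_per_request * num_q_heads * kv_lora_rank * elem_size
total_mem_bytes = q_mem_bytes + kv_mem_bytes + o_mem_bytes

flops = (
    2
    * num_q_heads
    * (2 * kv_lora_rank + qk_rope_head_dim)
    * actual_kv_tokens
    * q_len_per_request
)

# bytes / ms / 1e6 gives GB/s; flops / ms / 1e9 gives TFLOPs/s (same conversions as above).
print(f"memory bandwidth: {total_mem_bytes / ms / 1e6:.2f} GB/s")
print(f"FLOPs: {flops / ms / 1e9:.2f} TFLOPs/s")
```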

benchmarks/routines/attention.py

Lines changed: 18 additions & 12 deletions
```diff
@@ -2244,20 +2244,26 @@ def run_backend_wrapper(
         actual_seq_lens_q_flat = torch.ones_like(
             actual_seq_lens_kv.flatten().to("cpu")
         )
-        o_mem_bytes = (
-            actual_seq_lens_q_flat.numel()
-            * num_qo_heads
-            * head_dim_ckv
-            * q_dtype.itemsize
+
+        # Query bytes (q_nope + q_pe): batch_size * num_heads * head_dim
+        q_mem_bytes = (
+            q_nope.numel() * q_nope.element_size()
+            + q_pe.numel() * q_pe.element_size()
         )
-        qkv_mem_bytes = sum(
-            [
-                _.numel() * _.element_size()
-                for _ in [q_nope, q_pe, ckv_cache, kpe_cache]
-            ]
+
+        # KV cache bytes: based on actual sequence lengths accessed, not full allocation
+        actual_kv_tokens = actual_seq_lens_kv_flat.sum().item()
+        kv_elem_size = ckv_cache.element_size()  # Same dtype for ckv and kpe
+        kv_mem_bytes = (
+            actual_kv_tokens * (head_dim_ckv + head_dim_kpe) * kv_elem_size
         )
-        total_mem_bytes = o_mem_bytes + qkv_mem_bytes
-        tb_per_sec = (total_mem_bytes / (median_time * 1e9)).item()
+
+        # Output bytes: batch_size * num_heads * head_dim_ckv
+        o_elem_size = q_nope.element_size()  # Output has same dtype as query
+        o_mem_bytes = batch_size * num_qo_heads * head_dim_ckv * o_elem_size
+
+        total_mem_bytes = q_mem_bytes + kv_mem_bytes + o_mem_bytes
+        tb_per_sec = total_mem_bytes / (median_time * 1e9)
         tflops_total = (
             2
             * torch.dot(
```
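The routine-level change follows the same three-term pattern. The sketch below exercises it as a self-contained calculation; the shapes, dtypes, and `median_time` value (assumed to be in milliseconds, so dividing by `median_time * 1e9` yields TB/s) are illustrative assumptions, not the harness's actual inputs:

```python
import torch

# Assumed shapes and dtypes -- illustrative only, not the routine's real inputs.
batch_size, num_qo_heads = 8, 128
head_dim_ckv, head_dim_kpe = 512, 64
q_nope = torch.zeros(batch_size, num_qo_heads, head_dim_ckv, dtype=torch.bfloat16)
q_pe = torch.zeros(batch_size, num_qo_heads, head_dim_kpe, dtype=torch.bfloat16)
actual_seq_lens_kv_flat = torch.tensor([1024, 2048, 512, 4096, 256, 8192, 1024, 2048])
kv_elem_size = 2     # assumed bf16 cache entries, same dtype for ckv and kpe
median_time = 0.15   # assumed median latency in milliseconds

# Query bytes: q_nope and q_pe are both read once.
q_mem_bytes = q_nope.numel() * q_nope.element_size() + q_pe.numel() * q_pe.element_size()

# KV cache bytes: only the tokens actually accessed, not the full allocation.
actual_kv_tokens = actual_seq_lens_kv_flat.sum().item()
kv_mem_bytes = actual_kv_tokens * (head_dim_ckv + head_dim_kpe) * kv_elem_size

# Output bytes: one head_dim_ckv-wide vector per query head per request.
o_mem_bytes = batch_size * num_qo_heads * head_dim_ckv * q_nope.element_size()

total_mem_bytes = q_mem_bytes + kv_mem_bytes + o_mem_bytes
tb_per_sec = total_mem_bytes / (median_time * 1e9)  # bytes / (ms * 1e9) == TB/s
print(f"{tb_per_sec:.3f} TB/s")
```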
