add sdpa_vjp_bench

Brooooooklyn · Brooooooklyn · commit 5bba7f2f3be3 · 2026-01-20T23:47:55.000+08:00
diff --git a/benchmarks/python/sdpa_vjp_bench.py b/benchmarks/python/sdpa_vjp_bench.py
@@ -0,0 +1,271 @@
+# Copyright © 2024-25 Apple Inc.
+"""
+Benchmark SDPA VJP: Fused Flash Attention vs Unfused Fallback
+
+This benchmark measures the performance improvement from the fused VJP
+implementation for the scaled dot product attention backward pass.
+"""
+
+import argparse
+import time
+import mlx.core as mx
+
+# Number of untimed calls bench() makes before it starts the timer.
+N_warmup = 10
+# Number of timed calls bench() averages over.
+N_iter = 50
+
+
+def bench(f, *args):
+    """Warmup then time the function"""
+    for _ in range(N_warmup):
+        result = f(*args)
+        mx.eval(result)
+
+    mx.synchronize()
+    start = time.perf_counter()
+    for _ in range(N_iter):
+        result = f(*args)
+        mx.eval(result)
+    mx.synchronize()
+    return (time.perf_counter() - start) / N_iter * 1000  # ms
+
+
+def mlx_ref_attn(q, k, v, scale):
+    """Reference unfused attention implementation"""
+    n_q_heads = q.shape[-3]
+    n_kv_heads = k.shape[-3]
+    n_repeats = n_q_heads // n_kv_heads
+
+    B = q.shape[0]
+    L = q.shape[2]
+
+    if n_repeats > 1:
+        q = mx.reshape(q, [B, n_kv_heads, n_repeats, L, -1])
+        k = mx.expand_dims(k, 2)
+        v = mx.expand_dims(v, 2)
+
+    scores = (q * scale) @ mx.swapaxes(k, -1, -2)
+    weights = mx.softmax(scores, axis=-1)
+    out = weights @ v
+
+    if n_repeats > 1:
+        out = mx.reshape(out, [B, n_q_heads, L, -1])
+
+    return out
+
+
+def run_forward_benchmark(B, H_q, H_kv, L, D, dtype=mx.float16):
+    """Benchmark forward pass only"""
+    scale = D**-0.5
+
+    q = mx.random.normal((B, H_q, L, D), dtype=dtype)
+    k = mx.random.normal((B, H_kv, L, D), dtype=dtype)
+    v = mx.random.normal((B, H_kv, L, D), dtype=dtype)
+    mx.eval(q, k, v)
+
+    def unfused_fwd():
+        return mlx_ref_attn(q, k, v, scale)
+
+    def fused_fwd():
+        return mx.fast.scaled_dot_product_attention(q, k, v, scale=scale)
+
+    t_unfused = bench(unfused_fwd)
+    t_fused = bench(fused_fwd)
+
+    return t_unfused, t_fused
+
+
+def run_vjp_benchmark(B, H_q, H_kv, L, D, dtype=mx.float16):
+    """Benchmark forward + backward (VJP) pass"""
+    scale = D**-0.5
+
+    q = mx.random.normal((B, H_q, L, D), dtype=dtype)
+    k = mx.random.normal((B, H_kv, L, D), dtype=dtype)
+    v = mx.random.normal((B, H_kv, L, D), dtype=dtype)
+    mx.eval(q, k, v)
+
+    # Unfused forward+backward
+    def unfused_fwd_bwd():
+        def loss(q, k, v):
+            return mlx_ref_attn(q, k, v, scale).sum()
+
+        return mx.grad(loss)(q, k, v)
+
+    # Fused forward+backward
+    def fused_fwd_bwd():
+        def loss(q, k, v):
+            return mx.fast.scaled_dot_product_attention(q, k, v, scale=scale).sum()
+
+        return mx.grad(loss)(q, k, v)
+
+    t_unfused = bench(unfused_fwd_bwd)
+    t_fused = bench(fused_fwd_bwd)
+
+    return t_unfused, t_fused
+
+
+def run_backward_only_benchmark(B, H_q, H_kv, L, D, dtype=mx.float16):
+    """Benchmark backward pass only (isolate VJP performance)"""
+    scale = D**-0.5
+
+    q = mx.random.normal((B, H_q, L, D), dtype=dtype)
+    k = mx.random.normal((B, H_kv, L, D), dtype=dtype)
+    v = mx.random.normal((B, H_kv, L, D), dtype=dtype)
+    cotan = mx.ones((B, H_q, L, D), dtype=dtype)
+    mx.eval(q, k, v, cotan)
+
+    # Unfused backward
+    def unfused_bwd():
+        _, grads = mx.vjp(lambda q, k, v: mlx_ref_attn(q, k, v, scale), [q, k, v], [cotan])
+        return grads
+
+    # Fused backward
+    def fused_bwd():
+        _, grads = mx.vjp(
+            lambda q, k, v: mx.fast.scaled_dot_product_attention(q, k, v, scale=scale),
+            [q, k, v],
+            [cotan],
+        )
+        return grads
+
+    t_unfused = bench(unfused_bwd)
+    t_fused = bench(fused_bwd)
+
+    return t_unfused, t_fused
+
+
+def verify_correctness(B, H_q, H_kv, L, D, dtype=mx.float16):
+    """Verify that fused and unfused produce matching gradients"""
+    scale = D**-0.5
+
+    q = mx.random.normal((B, H_q, L, D), dtype=dtype)
+    k = mx.random.normal((B, H_kv, L, D), dtype=dtype)
+    v = mx.random.normal((B, H_kv, L, D), dtype=dtype)
+    cotan = mx.ones((B, H_q, L, D), dtype=dtype)
+
+    _, ref_grads = mx.vjp(lambda q, k, v: mlx_ref_attn(q, k, v, scale), [q, k, v], [cotan])
+    _, fused_grads = mx.vjp(
+        lambda q, k, v: mx.fast.scaled_dot_product_attention(q, k, v, scale=scale),
+        [q, k, v],
+        [cotan],
+    )
+
+    rtol, atol = (1e-2, 1e-2) if dtype != mx.float32 else (1e-4, 1e-4)
+    all_match = True
+    for i, (r, f) in enumerate(zip(ref_grads, fused_grads)):
+        if not mx.allclose(r, f, rtol=rtol, atol=atol):
+            max_diff = mx.max(mx.abs(r - f)).item()
+            print(f"  WARNING: Gradient {['dQ', 'dK', 'dV'][i]} mismatch, max_diff={max_diff:.2e}")
+            all_match = False
+
+    return all_match
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Benchmark SDPA VJP performance")
+    parser.add_argument(
+        "--mode",
+        choices=["vjp", "forward", "backward", "all"],
+        default="vjp",
+        help="Benchmark mode: vjp (fwd+bwd), forward only, backward only, or all",
+    )
+    parser.add_argument("--verify", action="store_true", help="Verify correctness before benchmarking")
+    parser.add_argument("--dtype", choices=["float16", "bfloat16", "float32"], default="float16")
+    parser.add_argument("--quick", action="store_true", help="Run quick subset of benchmarks")
+    args = parser.parse_args()
+
+    dtype = getattr(mx, args.dtype)
+    dtype_str = args.dtype[:4] if len(args.dtype) > 4 else args.dtype
+
+    # Configurations to benchmark
+    # (B, H_q, H_kv, L, D)
+    if args.quick:
+        configs = [
+            # Vector path (L <= 8)
+            (2, 8, 8, 1, 64),
+            (2, 8, 8, 8, 128),
+            # STEEL path (L > 8)
+            (2, 8, 8, 128, 64),
+            (2, 8, 8, 512, 128),
+            (1, 32, 8, 1024, 128),
+        ]
+    else:
+        configs = [
+            # Vector path (L <= 8) - short sequences
+            (2, 8, 8, 1, 64),
+            (2, 8, 8, 4, 64),
+            (2, 8, 8, 8, 64),
+            (2, 8, 8, 8, 128),
+            # STEEL path - medium sequences
+            (2, 8, 8, 32, 64),
+            (2, 8, 8, 64, 64),
+            (2, 8, 8, 128, 64),
+            (2, 8, 8, 128, 128),
+            (2, 8, 8, 256, 128),
+            # STEEL path - long sequences
+            (1, 32, 8, 512, 64),
+            (1, 32, 8, 512, 128),
+            (1, 32, 8, 1024, 64),
+            (1, 32, 8, 1024, 128),
+            (1, 32, 8, 2048, 128),
+            # GQA configurations
+            (2, 32, 8, 256, 64),   # 4:1 GQA
+            (2, 32, 4, 256, 64),   # 8:1 GQA
+        ]
+
+    print(f"SDPA VJP Benchmark - dtype={args.dtype}")
+    print("=" * 85)
+
+    if args.mode in ["vjp", "all"]:
+        print("\n[Forward + Backward (VJP)]")
+        print(f"{'B':>3} {'H_q':>4} {'H_kv':>5} {'L':>6} {'D':>4} | {'unfused':>10} {'fused':>10} {'speedup':>8} {'path':>8}")
+        print("-" * 85)
+
+        for B, H_q, H_kv, L, D in configs:
+            if args.verify:
+                correct = verify_correctness(B, H_q, H_kv, L, D, dtype)
+                if not correct:
+                    continue
+
+            t_unfused, t_fused = run_vjp_benchmark(B, H_q, H_kv, L, D, dtype)
+            speedup = t_unfused / t_fused
+            path = "vector" if L <= 8 else "STEEL"
+            print(
+                f"{B:3d} {H_q:4d} {H_kv:5d} {L:6d} {D:4d} | {t_unfused:9.2f}ms {t_fused:9.2f}ms {speedup:7.2f}x {path:>8}"
+            )
+
+    if args.mode in ["forward", "all"]:
+        print("\n[Forward Only]")
+        print(f"{'B':>3} {'H_q':>4} {'H_kv':>5} {'L':>6} {'D':>4} | {'unfused':>10} {'fused':>10} {'speedup':>8} {'path':>8}")
+        print("-" * 85)
+
+        for B, H_q, H_kv, L, D in configs:
+            t_unfused, t_fused = run_forward_benchmark(B, H_q, H_kv, L, D, dtype)
+            speedup = t_unfused / t_fused
+            path = "vector" if L <= 8 else "STEEL"
+            print(
+                f"{B:3d} {H_q:4d} {H_kv:5d} {L:6d} {D:4d} | {t_unfused:9.2f}ms {t_fused:9.2f}ms {speedup:7.2f}x {path:>8}"
+            )
+
+    if args.mode in ["backward", "all"]:
+        print("\n[Backward Only]")
+        print(f"{'B':>3} {'H_q':>4} {'H_kv':>5} {'L':>6} {'D':>4} | {'unfused':>10} {'fused':>10} {'speedup':>8} {'path':>8}")
+        print("-" * 85)
+
+        for B, H_q, H_kv, L, D in configs:
+            t_unfused, t_fused = run_backward_only_benchmark(B, H_q, H_kv, L, D, dtype)
+            speedup = t_unfused / t_fused
+            path = "vector" if L <= 8 else "STEEL"
+            print(
+                f"{B:3d} {H_q:4d} {H_kv:5d} {L:6d} {D:4d} | {t_unfused:9.2f}ms {t_fused:9.2f}ms {speedup:7.2f}x {path:>8}"
+            )
+
+    print("\n" + "=" * 85)
+    print("Legend:")
+    print("  - unfused: Reference implementation using separate matmul + softmax + matmul")
+    print("  - fused: mx.fast.scaled_dot_product_attention with Flash Attention VJP")
+    print("  - path: 'vector' for L<=8 (vector kernel), 'STEEL' for L>8 (tiled kernel)")
+    print("  - speedup > 1.0 means fused is faster")
+
+
+# Run the benchmark CLI only when this file is executed as a script.
+if __name__ == "__main__":
+    main()