Commit 21caa99

Brooooooklyn and claude committed
style: apply clang-format and black formatting

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>

1 parent 105d051 · commit 21caa99

8 files changed, +187 -139 lines

benchmarks/python/sdpa_vjp_bench.py (33 additions, 12 deletions)

@@ -8,6 +8,7 @@
 
 import argparse
 import time
+
 import mlx.core as mx
 
 N_warmup = 10
@@ -115,7 +116,9 @@ def run_backward_only_benchmark(B, H_q, H_kv, L, D, dtype=mx.float16):
 
     # Unfused backward
     def unfused_bwd():
-        _, grads = mx.vjp(lambda q, k, v: mlx_ref_attn(q, k, v, scale), [q, k, v], [cotan])
+        _, grads = mx.vjp(
+            lambda q, k, v: mlx_ref_attn(q, k, v, scale), [q, k, v], [cotan]
+        )
         return grads
 
     # Fused backward
@@ -142,7 +145,9 @@ def verify_correctness(B, H_q, H_kv, L, D, dtype=mx.float16):
     v = mx.random.normal((B, H_kv, L, D), dtype=dtype)
     cotan = mx.ones((B, H_q, L, D), dtype=dtype)
 
-    _, ref_grads = mx.vjp(lambda q, k, v: mlx_ref_attn(q, k, v, scale), [q, k, v], [cotan])
+    _, ref_grads = mx.vjp(
+        lambda q, k, v: mlx_ref_attn(q, k, v, scale), [q, k, v], [cotan]
+    )
     _, fused_grads = mx.vjp(
         lambda q, k, v: mx.fast.scaled_dot_product_attention(q, k, v, scale=scale),
         [q, k, v],
@@ -154,7 +159,9 @@ def verify_correctness(B, H_q, H_kv, L, D, dtype=mx.float16):
     for i, (r, f) in enumerate(zip(ref_grads, fused_grads)):
         if not mx.allclose(r, f, rtol=rtol, atol=atol):
             max_diff = mx.max(mx.abs(r - f)).item()
-            print(f" WARNING: Gradient {['dQ', 'dK', 'dV'][i]} mismatch, max_diff={max_diff:.2e}")
+            print(
+                f" WARNING: Gradient {['dQ', 'dK', 'dV'][i]} mismatch, max_diff={max_diff:.2e}"
+            )
             all_match = False
 
     return all_match
@@ -168,9 +175,15 @@ def main():
         default="vjp",
         help="Benchmark mode: vjp (fwd+bwd), forward only, backward only, or all",
     )
-    parser.add_argument("--verify", action="store_true", help="Verify correctness before benchmarking")
-    parser.add_argument("--dtype", choices=["float16", "bfloat16", "float32"], default="float16")
-    parser.add_argument("--quick", action="store_true", help="Run quick subset of benchmarks")
+    parser.add_argument(
+        "--verify", action="store_true", help="Verify correctness before benchmarking"
+    )
+    parser.add_argument(
+        "--dtype", choices=["float16", "bfloat16", "float32"], default="float16"
+    )
+    parser.add_argument(
+        "--quick", action="store_true", help="Run quick subset of benchmarks"
+    )
     args = parser.parse_args()
 
     dtype = getattr(mx, args.dtype)
@@ -208,16 +221,18 @@ def main():
         (1, 32, 8, 1024, 128),
         (1, 32, 8, 2048, 128),
         # GQA configurations
-        (2, 32, 8, 256, 64), # 4:1 GQA
-        (2, 32, 4, 256, 64), # 8:1 GQA
+        (2, 32, 8, 256, 64),  # 4:1 GQA
+        (2, 32, 4, 256, 64),  # 8:1 GQA
     ]
 
     print(f"SDPA VJP Benchmark - dtype={args.dtype}")
     print("=" * 85)
 
     if args.mode in ["vjp", "all"]:
         print("\n[Forward + Backward (VJP)]")
-        print(f"{'B':>3} {'H_q':>4} {'H_kv':>5} {'L':>6} {'D':>4} | {'unfused':>10} {'fused':>10} {'speedup':>8} {'path':>8}")
+        print(
+            f"{'B':>3} {'H_q':>4} {'H_kv':>5} {'L':>6} {'D':>4} | {'unfused':>10} {'fused':>10} {'speedup':>8} {'path':>8}"
+        )
         print("-" * 85)
 
         for B, H_q, H_kv, L, D in configs:
@@ -235,7 +250,9 @@ def main():
 
     if args.mode in ["forward", "all"]:
         print("\n[Forward Only]")
-        print(f"{'B':>3} {'H_q':>4} {'H_kv':>5} {'L':>6} {'D':>4} | {'unfused':>10} {'fused':>10} {'speedup':>8} {'path':>8}")
+        print(
+            f"{'B':>3} {'H_q':>4} {'H_kv':>5} {'L':>6} {'D':>4} | {'unfused':>10} {'fused':>10} {'speedup':>8} {'path':>8}"
+        )
         print("-" * 85)
 
         for B, H_q, H_kv, L, D in configs:
@@ -248,7 +265,9 @@ def main():
 
     if args.mode in ["backward", "all"]:
         print("\n[Backward Only]")
-        print(f"{'B':>3} {'H_q':>4} {'H_kv':>5} {'L':>6} {'D':>4} | {'unfused':>10} {'fused':>10} {'speedup':>8} {'path':>8}")
+        print(
+            f"{'B':>3} {'H_q':>4} {'H_kv':>5} {'L':>6} {'D':>4} | {'unfused':>10} {'fused':>10} {'speedup':>8} {'path':>8}"
+        )
         print("-" * 85)
 
         for B, H_q, H_kv, L, D in configs:
@@ -261,7 +280,9 @@ def main():
 
     print("\n" + "=" * 85)
    print("Legend:")
-    print(" - unfused: Reference implementation using separate matmul + softmax + matmul")
+    print(
+        " - unfused: Reference implementation using separate matmul + softmax + matmul"
+    )
     print(" - fused: mx.fast.scaled_dot_product_attention with Flash Attention VJP")
     print(" - path: 'vector' for L<=8 (vector kernel), 'STEEL' for L>8 (tiled kernel)")
     print(" - speedup > 1.0 means fused is faster")
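For reference, the verification pattern this benchmark uses can be reproduced standalone. The sketch below is illustrative and not part of the commit: ref_attn is a hypothetical stand-in for the benchmark's mlx_ref_attn (which also handles GQA), while mx.vjp, mx.fast.scaled_dot_product_attention, and mx.allclose are the same MLX calls that appear in the diff above.

import mlx.core as mx


def ref_attn(q, k, v, scale):
    # Unfused path: separate matmul + softmax + matmul, all differentiable mx ops.
    scores = (q * scale) @ mx.swapaxes(k, -1, -2)
    return mx.softmax(scores, axis=-1) @ v


B, H, L, D = 1, 8, 256, 64
scale = D**-0.5
q = mx.random.normal((B, H, L, D))
k = mx.random.normal((B, H, L, D))
v = mx.random.normal((B, H, L, D))
cotan = mx.ones((B, H, L, D))

# Pull gradients through both paths with the same cotangent and compare.
_, ref_grads = mx.vjp(lambda q, k, v: ref_attn(q, k, v, scale), [q, k, v], [cotan])
_, fused_grads = mx.vjp(
    lambda q, k, v: mx.fast.scaled_dot_product_attention(q, k, v, scale=scale),
    [q, k, v],
    [cotan],
)
for name, r, f in zip(["dQ", "dK", "dV"], ref_grads, fused_grads):
    print(name, mx.allclose(r, f, rtol=1e-2, atol=1e-2).item())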

mlx/backend/metal/kernels/sdpa_vector.h (2 additions, 1 deletion)

@@ -93,7 +93,8 @@ template <typename T, int D, int V = D>
   U sum_exp_score = 0;
   if (has_sinks && simd_gid == 0) {
     // Scale sink by M_LOG2E_F to match log2 domain
-    max_score = static_cast<U>(M_LOG2E_F) * static_cast<U>(sinks[q_batch_head_idx % num_q_heads]);
+    max_score = static_cast<U>(M_LOG2E_F) *
+        static_cast<U>(sinks[q_batch_head_idx % num_q_heads]);
     sum_exp_score = 1;
   }
 
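The M_LOG2E_F scaling touched in this hunk follows the kernel's log2-domain convention: exp(x) equals 2 raised to x * log2(e), so sink and mask values added to scores that will later go through exp2 must be pre-multiplied by log2(e). A minimal numeric check of that identity (plain Python, not kernel code):

import math

M_LOG2E_F = math.log2(math.e)  # the constant the kernel multiplies by
x = 0.7
# exp(x) and 2**(log2(e) * x) are the same quantity, so a softmax computed
# with exp2 on pre-scaled scores matches a softmax computed with exp.
assert math.isclose(math.exp(x), 2.0 ** (M_LOG2E_F * x))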

mlx/backend/metal/kernels/sdpa_vector_vjp.h (15 additions, 9 deletions)

@@ -62,8 +62,9 @@ template <typename T, int D, int V = D>
     const constant size_t& v_head_stride [[buffer(13)]],
     const constant size_t& v_seq_stride [[buffer(14)]],
     const constant float& scale [[buffer(15)]],
-    // Output (O/dO) stride parameters - STEEL forward may produce non-row-major layout
-    // Physical layout can be BLHV (strides [L*H*V, V, H*V, 1]) vs logical BHLV
+    // Output (O/dO) stride parameters - STEEL forward may produce non-row-major
+    // layout Physical layout can be BLHV (strides [L*H*V, V, H*V, 1]) vs
+    // logical BHLV
     const constant int& num_q_heads [[buffer(16)]],
     const constant size_t& o_batch_stride [[buffer(17)]],
     const constant size_t& o_head_stride [[buffer(18)]],
@@ -138,7 +139,8 @@ template <typename T, int D, int V = D>
 
   // Set up output/gradient pointers
   // Use explicit strides for O/dO to handle BLHV physical layout from STEEL
-  // For BLHV strides: o_batch_stride = L*H*V, o_head_stride = V, o_seq_stride = H*V
+  // For BLHV strides: o_batch_stride = L*H*V, o_head_stride = V, o_seq_stride =
+  // H*V
   out += batch_idx * o_batch_stride + head_idx * o_head_stride +
       q_seq_idx * o_seq_stride + simd_lid * v_per_thread;
   d_out += batch_idx * o_batch_stride + head_idx * o_head_stride +
@@ -232,7 +234,8 @@ template <typename T, int D, int V = D>
     }
 
     // Reconstruct attention probability: P = exp2(S - logsumexp)
-    // Using exp2 to match STEEL attention domain (logsumexp is in log2 domain)
+    // Using exp2 to match STEEL attention domain (logsumexp is in log2
+    // domain)
     U prob = fast::exp2(score - lse);
 
     // Compute dP = dO @ V^T for this KV position
@@ -247,8 +250,8 @@ template <typename T, int D, int V = D>
 
     // Accumulate dQ += scale * dS @ K
     // Note: Although Q was scaled by M_LOG2E_F internally, the softmax
-    // gradient dS compensates for this because the overall softmax(S') = softmax(S).
-    // The gradient dQ = scale * dS @ K matches the reference.
+    // gradient dS compensates for this because the overall softmax(S') =
+    // softmax(S). The gradient dQ = scale * dS @ K matches the reference.
     for (int j = 0; j < qk_per_thread; j++) {
       dq[j] += static_cast<U>(scale) * dS * k[j];
     }
@@ -347,7 +350,8 @@ template <typename T, int D, int V = D>
     const constant size_t& v_head_stride [[buffer(14)]],
     const constant size_t& v_seq_stride [[buffer(15)]],
     const constant float& scale [[buffer(16)]],
-    // Output (O/dO) stride parameters - STEEL forward may produce non-row-major layout
+    // Output (O/dO) stride parameters - STEEL forward may produce non-row-major
+    // layout
     const constant int& num_q_heads [[buffer(17)]],
     const constant size_t& o_batch_stride [[buffer(18)]],
     const constant size_t& o_head_stride [[buffer(19)]],
@@ -489,11 +493,13 @@ template <typename T, int D, int V = D>
 
     if (float_mask) {
      // Scale float mask by M_LOG2E_F to match log2 domain
-      score += static_cast<U>(M_LOG2E_F) * static_cast<U>(fm_ptr[mask_offset]);
+      score +=
+          static_cast<U>(M_LOG2E_F) * static_cast<U>(fm_ptr[mask_offset]);
     }
 
     // Reconstruct probability: P = exp2(S - logsumexp)
-    // Using exp2 to match STEEL attention domain (logsumexp is in log2 domain)
+    // Using exp2 to match STEEL attention domain (logsumexp is in log2
+    // domain)
     U prob = fast::exp2(score - lse);
 
     // Compute dP
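The comments in this file describe the backward math the kernel implements: reconstruct P = exp2(S' - logsumexp) from the log2-domain logsumexp saved by the forward pass, form dP = dO @ V^T, apply the softmax VJP to get dS, then dQ = scale * dS @ K. A NumPy sketch of that math for a single head (illustrative only; shapes, names, and the single-head setup are assumptions, not the kernel):

import numpy as np

rng = np.random.default_rng(0)
L, D = 16, 8
scale = D**-0.5
Q, K, V = (rng.standard_normal((L, D)) for _ in range(3))
dO = rng.standard_normal((L, D))

M_LOG2E = np.log2(np.e)
S = scale * (Q @ K.T)            # raw scores
S2 = M_LOG2E * S                 # scores in the log2 domain
m = S2.max(-1, keepdims=True)
lse = m + np.log2(np.exp2(S2 - m).sum(-1, keepdims=True))
P = np.exp2(S2 - lse)            # reconstructed probabilities

dP = dO @ V.T                                     # dP = dO @ V^T
dS = P * (dP - (dP * P).sum(-1, keepdims=True))   # softmax VJP
dQ = scale * dS @ K                               # matches "dQ += scale * dS @ K"
dK = scale * dS.T @ Q
dV = P.T @ dO

# softmax(S') == softmax(S), so the log2-domain reconstruction changes nothing.
P_ref = np.exp(S - S.max(-1, keepdims=True))
P_ref /= P_ref.sum(-1, keepdims=True)
assert np.allclose(P, P_ref)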

mlx/backend/metal/kernels/steel/attn/kernels/steel_attention.h (9 additions, 8 deletions)

@@ -464,19 +464,20 @@ template <
 
   // Output logsumexp if requested for VJP backward pass
   // LSE = max_score + log2(sum_score) in log2 domain (matches STEEL convention)
-  // Physical storage shape: [B*H, qL], laid out as linear array indexed by (B*H + head)*qL + query_pos
-  // LSE_strides[0] = qL (stride between (batch, head) rows)
-  // LSE_strides[1] = 1 (stride between query positions within a row)
+  // Physical storage shape: [B*H, qL], laid out as linear array indexed by (B*H
+  // + head)*qL + query_pos LSE_strides[0] = qL (stride between (batch, head)
+  // rows) LSE_strides[1] = 1 (stride between query positions within a row)
   if (output_logsumexp) {
     // Compute linear index for (batch, head) combination
-    // This matches the VJP kernel's indexing: (tidl.z * H + tidl.y) * LSE_strides[0]
-    device float* lse_out = LSE +
-        (tidl.z * params->H + tidl.y) * params->LSE_strides[0];
+    // This matches the VJP kernel's indexing: (tidl.z * H + tidl.y) *
+    // LSE_strides[0]
+    device float* lse_out =
+        LSE + (tidl.z * params->H + tidl.y) * params->LSE_strides[0];
 
     // Write one logsumexp per query position in this tile
     // Each thread handles kRowsPT query positions
-    // align_Q=true means query length is aligned (all blocks full), so always write
-    // align_Q=false means last block is partial, so check bounds
+    // align_Q=true means query length is aligned (all blocks full), so always
+    // write align_Q=false means last block is partial, so check bounds
     STEEL_PRAGMA_UNROLL
     for (short i = 0; i < kRowsPT; ++i) {
       int row_pos = tid.x * BQ + tm + sm + (i * decltype(Stile)::kFragRows);
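The reflowed comment above describes how logsumexp values are laid out for the backward pass: a flat [B*H, qL] buffer with LSE_strides = [qL, 1], so the (tidl.z * H + tidl.y) row index times qL plus the query position gives the linear offset. A tiny sketch of that indexing (hypothetical helper, not kernel code):

def lse_index(batch, head, query_pos, H, qL):
    # LSE_strides[0] = qL (row stride per (batch, head)), LSE_strides[1] = 1.
    lse_strides = (qL, 1)
    return (batch * H + head) * lse_strides[0] + query_pos * lse_strides[1]


# (batch=1, head=2, query=3) with H=4 heads and qL=8 queries per row:
assert lse_index(1, 2, 3, H=4, qL=8) == (1 * 4 + 2) * 8 + 3 == 51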

mlx/backend/metal/kernels/steel/attn/kernels/steel_attention_vjp_dkv.h (4 additions, 4 deletions)

@@ -284,8 +284,8 @@ void attention_vjp_dkv(
   const device T* O_base =
       O + tidl.z * params->O_strides[0] + q_head_idx * params->O_strides[1];
 
-  const device T* dO_base =
-      dO + tidl.z * params->O_strides[0] + q_head_idx * params->O_strides[1];
+  const device T* dO_base = dO + tidl.z * params->O_strides[0] +
+      q_head_idx * params->O_strides[1];
 
   const device float* LSE_base =
       LSE + (tidl.z * params->H + q_head_idx) * params->LSE_strides[0];
@@ -835,8 +835,8 @@ void attention_vjp_dkv(
 // tname is the string name used in kernel lookup (e.g., "float32", "float16")
 // dtype is the actual C++ type (e.g., float, half, bfloat16_t)
 #define instantiate_attention_vjp_dkv_kernel(tname, dtype, bq, bk, bd, wm, wn) \
-  template [[host_name( \
-      "attention_vjp_dkv_" #tname "_" #bq "_" #bk "_" #bd)]] [[kernel]] void \
+  template [[host_name("attention_vjp_dkv_" #tname "_" #bq "_" #bk \
+                       "_" #bd)]] [[kernel]] void \
   attention_vjp_dkv<dtype, bq, bk, bd, wm, wn>( \
       const device dtype*, \
       const device dtype*, \
