
Commit 9008d2e

Apply cr review
1 parent 0062590 commit 9008d2e

3 files changed: 5 additions & 26 deletions


mlx/backend/cuda/scaled_dot_product_attention.cpp

Lines changed: 5 additions & 6 deletions
@@ -462,13 +462,12 @@ void ScaledDotProductAttention::eval_gpu(
 bool ScaledDotProductAttentionVJP::use_fallback(
     const array& q,
     Stream s,
-    bool has_mask,
-    bool has_sinks,
+    bool /* has_mask */,
+    bool /* has_sinks */,
     int /* n_kv_heads */) {
-  // Force unfused attention when masks/sinks present
-  if (has_mask || has_sinks) {
-    return true;
-  }
+  // Note: cuDNN SDPA backward correctly handles masks/sinks,
+  // so we don't need to force fallback based on their presence.
+
   // The frontend adds a padding mask when sequence length is not a multiple of
   // tile size.
   if (q.shape(2) % 128 != 0) {
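
After this change, the only fallback condition left in the visible hunk is the tile-alignment check on the sequence length. The following is a minimal self-contained sketch of that remaining decision logic; the function name, the plain-int parameter standing in for q.shape(2), and the final return are assumptions for illustration, not repository code.

#include <cstdio>

// Illustration only: stand-in for the fallback predicate's logic after this
// commit. The real function takes mlx arrays and a Stream; here the sequence
// length stands in for q.shape(2).
bool use_fallback_sketch(int seq_len) {
  // cuDNN SDPA backward handles masks and sinks, so their presence no longer
  // forces the unfused path.

  // The frontend adds a padding mask when the sequence length is not a
  // multiple of the tile size, so fall back to unfused attention then.
  if (seq_len % 128 != 0) {
    return true;
  }
  return false; // assumed: otherwise take the fused cuDNN backward path
}

int main() {
  // 256 is tile-aligned (fused path), 300 is not (fallback).
  std::printf("%d %d\n", use_fallback_sketch(256), use_fallback_sketch(300));
}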

mlx/backend/metal/scaled_dot_product_attention.cpp

Lines changed: 0 additions & 5 deletions
@@ -777,11 +777,6 @@ void ScaledDotProductAttention::eval_gpu(
         output_logsumexp_,
         lse_out);
 
-    // Cache logsumexp for VJP access (handles both cases: in outputs[1] or
-    // separate array)
-    if (output_logsumexp_ && lse_out != nullptr) {
-      set_cached_logsumexp(*lse_out);
-    }
   }
 
   d.add_temporaries(std::move(copies), s.index);
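
The removed block is the eval-time half of the logsumexp caching machinery: with it gone, the forward kernel still writes logsumexp to lse_out, but nothing retains it on the primitive. The declaration side (the mutable member and its accessors) is deleted from mlx/fast_primitives.h below; a small sketch of that pattern follows that hunk.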

mlx/fast_primitives.h

Lines changed: 0 additions & 15 deletions
@@ -257,21 +257,6 @@ class ScaledDotProductAttention : public Custom {
   bool do_causal_;
   bool has_sinks_;
   bool output_logsumexp_;
-  // Cache logsumexp for VJP backward pass
-  // This enables Flash Attention VJP to access logsumexp even when
-  // the forward pass returns only the attention output to the user.
-  // Size is small: batch * heads * seq * 1 * sizeof(float) = ~512KB per layer
-  mutable std::optional<array> cached_logsumexp_;
-
- public:
-  // Getter for VJP to access cached logsumexp
-  const std::optional<array>& get_cached_logsumexp() const {
-    return cached_logsumexp_;
-  }
-  // Setter called during eval_gpu
-  void set_cached_logsumexp(array logsumexp) const {
-    cached_logsumexp_ = std::move(logsumexp);
-  }
 };
 
 class ScaledDotProductAttentionVJP : public Custom {
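
For context, the members removed above implemented a "mutable cache on a const primitive" pattern: the forward eval stashed logsumexp on the primitive so the VJP could read it even when it was not a user-visible output. Below is a minimal self-contained sketch of that pattern; the stand-in Array struct, the class name, and main() are illustrative only, not repository code.

#include <iostream>
#include <optional>
#include <utility>

// Stand-in for mlx::core::array, for illustration only.
struct Array {
  float value;
};

class SdpaLikePrimitive {
 public:
  // Setter called from the (const) eval path; legal because the member is
  // declared mutable.
  void set_cached_logsumexp(Array lse) const {
    cached_logsumexp_ = std::move(lse);
  }

  // Getter the backward pass would use to retrieve the cached value.
  const std::optional<Array>& get_cached_logsumexp() const {
    return cached_logsumexp_;
  }

 private:
  // This mirrors the member removed by the commit.
  mutable std::optional<Array> cached_logsumexp_;
};

int main() {
  SdpaLikePrimitive p;
  p.set_cached_logsumexp(Array{0.5f});  // forward eval stashes logsumexp
  if (p.get_cached_logsumexp()) {       // backward pass reads it back
    std::cout << p.get_cached_logsumexp()->value << "\n";
  }
  return 0;
}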
