Skip to content

Commit 484148c

Browse files
Add FA4 RoPE fusion path for low-precision attention
ghstack-source-id: 56eda45 Pull-Request: #3947
1 parent d2cbad4 commit 484148c

File tree

1 file changed

+14
-5
lines changed

1 file changed

+14
-5
lines changed

benchmarks/prototype/attention/benchmark_sdpa.py

Lines changed: 14 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,12 @@
 # LICENSE file in the root directory of this source tree.
 
 """
-Benchmark two attention backends against each other for a single layer,
-sweeping sequence lengths and measuring runtime and SQNR.
+Benchmark two attention backends against each other for a single layer.
 
-Usage: python benchmarks/prototype/attention/benchmark_sdpa.py --baseline fa2 --test fa3_fp8
+Sweeps over sequence lengths from 1K to 128K, measuring runtime and SQNR.
+
+Usage:
+    python benchmarks/prototype/attention/benchmark_sdpa.py --baseline fa3 --test fa3_fp8
 """
 
 import argparse
@@ -25,14 +27,17 @@
 )
 
 from torchao.prototype.attention.fp8_fa3.attention import fp8_fa3_sdpa
+from torchao.prototype.attention.fp8_fa4.attention import fp8_fa4_sdpa
 from torchao.quantization.utils import compute_error as compute_sqnr
 
-BACKENDS = ["fa2", "fa3", "fa3_fp8"]
+BACKENDS = ["fa2", "fa3", "fa3_fp8", "fa4", "fa4_fp8"]
 
 BACKEND_LABELS = {
     "fa2": "FA2 BF16",
     "fa3": "FA3 BF16",
     "fa3_fp8": "FA3 FP8",
+    "fa4": "FA4 BF16",
+    "fa4_fp8": "FA4 FP8",
 }
 
 
@@ -41,20 +46,24 @@ def _activate_backend(backend: str):
     """Context manager that activates the appropriate flash attention impl."""
     if backend in ("fa3", "fa3_fp8"):
         activate_flash_attention_impl("FA3")
+    elif backend in ("fa4", "fa4_fp8"):
+        activate_flash_attention_impl("FA4")
     else:
         # fa2 is the default, no activation needed
         pass
     try:
         yield
     finally:
-        if backend in ("fa3", "fa3_fp8"):
+        if backend in ("fa3", "fa3_fp8", "fa4", "fa4_fp8"):
             restore_flash_attention_impl()
 
 
 def _run_attention(backend: str, q, k, v, is_causal: bool):
     """Run a single attention call for the given backend."""
     if backend == "fa3_fp8":
         return fp8_fa3_sdpa(q, k, v, is_causal=is_causal)
+    elif backend == "fa4_fp8":
+        return fp8_fa4_sdpa(q, k, v, is_causal=is_causal)
     else:
         with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
             return F.scaled_dot_product_attention(q, k, v, is_causal=is_causal)

0 commit comments

Comments (0)