Skip to content

Commit 28add5e

Browse files
Add FA4 RoPE fusion path for low-precision attention

Adds the compile path (fuse_rope=True) for the FA4 backend, mirroring the FA3 fusion pass structure via the shared custom op and fusion pass factories.

Key additions:
- fp8_fa4/fusion_pass.py: FA4-specific custom ops and compile helper
- fp8_fa4_rope_sdpa entry point in attention.py
- Replace placeholder compile_fn with real fusion pass in setup.py
- Wire up FA4 rope_sdpa_fn in test backend config

ghstack-source-id: 2e69d43
Pull-Request: #3947
1 parent 7c180fe commit 28add5e

File tree

9 files changed

+191
-17
lines changed

9 files changed

+191
-17
lines changed

benchmarks/prototype/attention/benchmark_sdpa.py

Lines changed: 15 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
fa2 - BF16 SDPA with FlashAttention 2 (PyTorch default)
1515
fa3 - BF16 SDPA with FlashAttention 3
1616
fa3_fp8 - FP8 SDPA with FlashAttention 3 (includes quantization kernels)
17+
fa4 - BF16 SDPA with FlashAttention 4
18+
fa4_fp8 - FP8 SDPA with FlashAttention 4 (includes quantization kernels)
1719
1820
Usage:
1921
# Default: FA2 vs FA3+FP8
@@ -22,8 +24,11 @@
2224
# FA3 bf16 vs FA3 fp8
2325
python benchmarks/prototype/attention/benchmark_sdpa.py --baseline fa3 --test fa3_fp8
2426
27+
# FA2 vs FA4
28+
python benchmarks/prototype/attention/benchmark_sdpa.py --baseline fa2 --test fa4
29+
2530
# With causal masking
26-
python benchmarks/prototype/attention/benchmark_sdpa.py --baseline fa3 --test fa3_fp8 --causal
31+
python benchmarks/prototype/attention/benchmark_sdpa.py --baseline fa3 --test fa4 --causal
2732
"""
2833

2934
import argparse
@@ -40,13 +45,16 @@
4045
)
4146

4247
from torchao.prototype.attention.fp8_fa3.attention import fp8_fa3_sdpa
48+
from torchao.prototype.attention.fp8_fa4.attention import fp8_fa4_sdpa
4349

44-
BACKENDS = ["fa2", "fa3", "fa3_fp8"]
50+
BACKENDS = ["fa2", "fa3", "fa3_fp8", "fa4", "fa4_fp8"]
4551

4652
BACKEND_LABELS = {
4753
"fa2": "FA2 BF16",
4854
"fa3": "FA3 BF16",
4955
"fa3_fp8": "FA3 FP8",
56+
"fa4": "FA4 BF16",
57+
"fa4_fp8": "FA4 FP8",
5058
}
5159

5260

@@ -55,20 +63,24 @@ def _activate_backend(backend: str):
5563
"""Context manager that activates the appropriate flash attention impl."""
5664
if backend in ("fa3", "fa3_fp8"):
5765
activate_flash_attention_impl("FA3")
66+
elif backend in ("fa4", "fa4_fp8"):
67+
activate_flash_attention_impl("FA4")
5868
else:
5969
# fa2 is the default, no activation needed
6070
pass
6171
try:
6272
yield
6373
finally:
64-
if backend in ("fa3", "fa3_fp8"):
74+
if backend in ("fa3", "fa3_fp8", "fa4", "fa4_fp8"):
6575
restore_flash_attention_impl()
6676

6777

6878
def _run_attention(backend: str, q, k, v, is_causal: bool):
6979
"""Run a single attention call for the given backend."""
7080
if backend == "fa3_fp8":
7181
return fp8_fa3_sdpa(q, k, v, is_causal=is_causal)
82+
elif backend == "fa4_fp8":
83+
return fp8_fa4_sdpa(q, k, v, is_causal=is_causal)
7284
else:
7385
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
7486
return F.scaled_dot_product_attention(q, k, v, is_causal=is_causal)

benchmarks/prototype/attention/eval_flux_model.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,8 @@
1414
fa2 - Flash Attention 2 (default SDPA)
1515
fa3 - Flash Attention 3
1616
fa3_fp8 - Flash Attention 3 with FP8 quantization (fused RoPE + FP8 SDPA)
17+
fa4 - Flash Attention 4
18+
fa4_fp8 - Flash Attention 4 with FP8 quantization (fused RoPE + FP8 SDPA)
1719
1820
Usage:
1921
# Compare FA3 vs FA3 FP8 (default)
@@ -22,6 +24,9 @@
2224
# Compare FA2 vs FA3
2325
python eval_flux_model.py --baseline fa2 --test fa3
2426
27+
# Compare FA3 vs FA4
28+
python eval_flux_model.py --baseline fa3 --test fa4
29+
2530
# Full benchmark with 200 prompts
2631
python eval_flux_model.py --num_prompts 200
2732
@@ -64,6 +69,12 @@
6469
"fp8": True,
6570
"fp8_backend": AttentionBackend.FP8_FA3,
6671
},
72+
"fa4": {"flash_impl": "FA4", "fp8": False},
73+
"fa4_fp8": {
74+
"flash_impl": "FA4",
75+
"fp8": True,
76+
"fp8_backend": AttentionBackend.FP8_FA4,
77+
},
6778
}
6879

6980
IMAGE_SIZE = (512, 512) # (width, height) - resize for consistent LPIPS

benchmarks/prototype/attention/eval_llama3_model.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
fa2 - Flash Attention 2 (default SDPA)
1818
fa3 - Flash Attention 3
1919
fa3_fp8 - Flash Attention 3 with FP8 quantization (fused RoPE + FP8 SDPA)
20+
fa4 - Flash Attention 4
21+
fa4_fp8 - Flash Attention 4 with FP8 quantization (fused RoPE + FP8 SDPA)
2022
2123
Usage:
2224
# Default: FA3 vs FA3 FP8
@@ -25,6 +27,9 @@
2527
# FA2 vs FA3
2628
python eval_llama3_model.py --baseline fa2 --test fa3
2729
30+
# FA3 vs FA4
31+
python eval_llama3_model.py --baseline fa3 --test fa4
32+
2833
# With torch.compile (applies to non-FP8 backends)
2934
python eval_llama3_model.py --compile
3035
"""
@@ -77,6 +82,17 @@
7782
"fp8_backend": AttentionBackend.FP8_FA3,
7883
"label": "FA3 FP8",
7984
},
85+
"fa4": {
86+
"flash_impl": "FA4",
87+
"fp8": False,
88+
"label": "FA4 BF16",
89+
},
90+
"fa4_fp8": {
91+
"flash_impl": "FA4",
92+
"fp8": True,
93+
"fp8_backend": AttentionBackend.FP8_FA4,
94+
"label": "FA4 FP8",
95+
},
8096
}
8197

8298
RANDOM_SEED = 42
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#!/bin/bash
2+
# Copyright (c) Meta Platforms, Inc. and affiliates.
3+
# All rights reserved.
4+
#
5+
# This source code is licensed under the BSD 3-Clause license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
# Run all low-precision attention benchmarks (FA4 baseline vs FA4 FP8 test).
9+
# Usage: bash benchmarks/prototype/attention/run_all_benchmarks_fa4.sh
10+
11+
set -euo pipefail
12+
13+
BENCH_DIR="benchmarks/prototype/attention"
14+
BASELINE="fa4"
15+
TEST="fa4_fp8"
16+
17+
echo "================================================================"
18+
echo " Low-Precision Attention Benchmarks ($BASELINE vs $TEST)"
19+
echo "================================================================"
20+
21+
# --------------------------------------------------------------------------
22+
# 1. Single attention layer benchmark
23+
# --------------------------------------------------------------------------
24+
echo ""
25+
echo "================================================================"
26+
echo " [1/9] benchmark_sdpa.py — Single Attention Layer"
27+
echo "================================================================"
28+
python "$BENCH_DIR/benchmark_sdpa.py" --baseline "$BASELINE" --test "$TEST"
29+
30+
# --------------------------------------------------------------------------
31+
# 2. LLaMA 3 model benchmarks (4 configurations)
32+
# --------------------------------------------------------------------------
33+
echo ""
34+
echo "================================================================"
35+
echo " [2/9] eval_llama3_model.py — No compile, no fuse_rope_using_torch_compile"
36+
echo "================================================================"
37+
python "$BENCH_DIR/eval_llama3_model.py" --baseline "$BASELINE" --test "$TEST"
38+
39+
echo ""
40+
echo "================================================================"
41+
echo " [3/9] eval_llama3_model.py — Compile, no fuse_rope_using_torch_compile"
42+
echo "================================================================"
43+
python "$BENCH_DIR/eval_llama3_model.py" --baseline "$BASELINE" --test "$TEST" --compile
44+
45+
echo ""
46+
echo "================================================================"
47+
echo " [4/9] eval_llama3_model.py — No compile, fuse_rope_using_torch_compile"
48+
echo "================================================================"
49+
python "$BENCH_DIR/eval_llama3_model.py" --baseline "$BASELINE" --test "$TEST" --fuse_rope_using_torch_compile
50+
51+
echo ""
52+
echo "================================================================"
53+
echo " [5/9] eval_llama3_model.py — Compile, fuse_rope_using_torch_compile"
54+
echo "================================================================"
55+
python "$BENCH_DIR/eval_llama3_model.py" --baseline "$BASELINE" --test "$TEST" --compile --fuse_rope_using_torch_compile
56+
57+
# --------------------------------------------------------------------------
58+
# 3. FLUX model benchmarks (4 configurations)
59+
# --------------------------------------------------------------------------
60+
echo ""
61+
echo "================================================================"
62+
echo " [6/9] eval_flux_model.py — No compile, no fuse_rope_using_torch_compile"
63+
echo "================================================================"
64+
python "$BENCH_DIR/eval_flux_model.py" --baseline "$BASELINE" --test "$TEST"
65+
66+
echo ""
67+
echo "================================================================"
68+
echo " [7/9] eval_flux_model.py — Compile, no fuse_rope_using_torch_compile"
69+
echo "================================================================"
70+
python "$BENCH_DIR/eval_flux_model.py" --baseline "$BASELINE" --test "$TEST" --compile
71+
72+
echo ""
73+
echo "================================================================"
74+
echo " [8/9] eval_flux_model.py — No compile, fuse_rope_using_torch_compile"
75+
echo "================================================================"
76+
python "$BENCH_DIR/eval_flux_model.py" --baseline "$BASELINE" --test "$TEST" --fuse_rope_using_torch_compile
77+
78+
echo ""
79+
echo "================================================================"
80+
echo " [9/9] eval_flux_model.py — Compile, fuse_rope_using_torch_compile"
81+
echo "================================================================"
82+
python "$BENCH_DIR/eval_flux_model.py" --baseline "$BASELINE" --test "$TEST" --compile --fuse_rope_using_torch_compile
83+
84+
echo ""
85+
echo "================================================================"
86+
echo " All benchmarks complete."
87+
echo "================================================================"

test/prototype/attention/test_fp8_attention.py

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -131,12 +131,15 @@ def _build_backend_configs() -> List[BackendConfig]:
131131
and _is_fa4_available()
132132
)
133133
if fa4_available:
134-
from torchao.prototype.attention.fp8_fa4.attention import fp8_fa4_sdpa
134+
from torchao.prototype.attention.fp8_fa4.attention import (
135+
fp8_fa4_rope_sdpa,
136+
fp8_fa4_sdpa,
137+
)
135138

136-
sdpa_fn = fp8_fa4_sdpa
139+
sdpa_fn, rope_sdpa_fn = fp8_fa4_sdpa, fp8_fa4_rope_sdpa
137140
eager_ok = _probe_eager_quantized_sdpa(sdpa_fn, "FA4")
138141
else:
139-
sdpa_fn = None
142+
sdpa_fn = rope_sdpa_fn = None
140143
eager_ok = False
141144

142145
configs.append(
@@ -145,7 +148,7 @@ def _build_backend_configs() -> List[BackendConfig]:
145148
flash_impl="FA4",
146149
attention_backend=AttentionBackend.FP8_FA4,
147150
sdpa_fn=sdpa_fn,
148-
rope_sdpa_fn=None, # FA4 rope not yet available
151+
rope_sdpa_fn=rope_sdpa_fn,
149152
available_eager=eager_ok,
150153
available_compiled=eager_ok,
151154
skip_msg=(

torchao/prototype/attention/fp8_fa4/__init__.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,14 @@
1111
For lower-level access, use fp8_fa4_sdpa() directly.
1212
"""
1313

14-
from torchao.prototype.attention.fp8_fa4.attention import fp8_fa4_sdpa
14+
from torchao.prototype.attention.fp8_fa4.attention import (
15+
fp8_fa4_rope_sdpa,
16+
fp8_fa4_sdpa,
17+
)
1518
from torchao.prototype.attention.quantization import _fp8_sdpa_quantize
1619

1720
__all__ = [
1821
"fp8_fa4_sdpa",
22+
"fp8_fa4_rope_sdpa",
1923
"_fp8_sdpa_quantize",
2024
]

torchao/prototype/attention/fp8_fa4/attention.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,10 +35,16 @@
3535
from functools import partial
3636

3737
from torchao.prototype.attention.shared_utils.attention import (
38+
_fp8_rope_sdpa,
3839
_fp8_sdpa,
3940
)
4041

4142
fp8_fa4_sdpa = partial(_fp8_sdpa, backend_name="FA4")
4243
fp8_fa4_sdpa.__doc__ = _fp8_sdpa.__doc__
4344
fp8_fa4_sdpa.__name__ = "fp8_fa4_sdpa"
4445
fp8_fa4_sdpa.__qualname__ = "fp8_fa4_sdpa"
46+
47+
fp8_fa4_rope_sdpa = partial(_fp8_rope_sdpa, backend_name="FA4")
48+
fp8_fa4_rope_sdpa.__doc__ = _fp8_rope_sdpa.__doc__
49+
fp8_fa4_rope_sdpa.__name__ = "fp8_fa4_rope_sdpa"
50+
fp8_fa4_rope_sdpa.__qualname__ = "fp8_fa4_rope_sdpa"
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
#
4+
# This source code is licensed under the BSD 3-Clause license found in the
5+
# LICENSE file in the root directory of this source tree.
6+
7+
"""
8+
FA4-specific FX graph fusion pass and compile helper.
9+
10+
Registers FA4 custom ops (torchao::fp8_fa4_rope_sdpa, torchao::fp8_fa4_sdpa)
11+
via the shared factory, and exposes ``rope_sdpa_fusion_pass`` and
12+
``compile_with_fp8_fusion`` for use by ``fp8_fa4/setup.py``.
13+
14+
Pattern detection, graph surgery, and the main fusion loop are in
15+
torchao.prototype.attention.shared_utils.fusion_utils.
16+
"""
17+
18+
from torchao.prototype.attention.fp8_fa4.attention import (
19+
fp8_fa4_rope_sdpa,
20+
fp8_fa4_sdpa,
21+
)
22+
from torchao.prototype.attention.shared_utils.custom_ops import (
23+
make_compile_fn,
24+
make_fusion_pass,
25+
register_fp8_attention_ops,
26+
)
27+
28+
# Register FA4 custom ops at import time.
29+
_ops = register_fp8_attention_ops(
30+
backend_name="fa4",
31+
rope_sdpa_fn=fp8_fa4_rope_sdpa,
32+
sdpa_fn=fp8_fa4_sdpa,
33+
)
34+
35+
# FA4-specific fusion pass entry point.
36+
rope_sdpa_fusion_pass = make_fusion_pass(_ops, backend_name="FA4", max_head_dim=256)
37+
38+
# FA4-specific compile helper.
39+
compile_with_fp8_fusion = make_compile_fn(rope_sdpa_fusion_pass, flash_impl_name="FA4")

torchao/prototype/attention/fp8_fa4/setup.py

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
FP8 FA4 backend setup.
99
1010
Thin wrapper around the shared ``setup_fp8_backend``, binding the FA4
11-
attention function.
11+
attention function and FA4 compile helper.
1212
"""
1313

1414
import torch.nn as nn
@@ -17,24 +17,20 @@
1717
from torchao.prototype.attention.shared_utils.setup import setup_fp8_backend
1818

1919

20-
def _compile_not_available(model, config):
21-
raise NotImplementedError(
22-
"FA4 RoPE fusion (fuse_rope=True) is not yet available. "
23-
"Use fuse_rope=False (default) for the monkey-patch path."
24-
)
25-
26-
2720
def setup_fp8_fa4(
2821
model: nn.Module,
2922
config: LowPrecisionAttentionConfig,
3023
) -> nn.Module:
3124
"""Set up FP8 FA4 attention on *model* and wrap it."""
3225
from torchao.prototype.attention.fp8_fa4.attention import fp8_fa4_sdpa
26+
from torchao.prototype.attention.fp8_fa4.fusion_pass import (
27+
compile_with_fp8_fusion,
28+
)
3329

3430
return setup_fp8_backend(
3531
model,
3632
config,
3733
flash_impl_name="FA4",
3834
sdpa_fn=fp8_fa4_sdpa,
39-
compile_fn=_compile_not_available,
35+
compile_fn=compile_with_fp8_fusion,
4036
)

0 commit comments

Comments (0)