Skip to content

Commit 66bc20d

Browse files
Add FA4 monkey-patch path for low-precision attention
ghstack-source-id: 6a451ce
Pull-Request: #3960
Summary:
Test Plan:
Reviewers:
Subscribers:
Tasks:
Tags:
1 parent d3111be commit 66bc20d

File tree

4 files changed

+91
-1
lines changed

4 files changed

+91
-1
lines changed

torchao/prototype/attention/api.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,12 @@
1313
import torch._dynamo
1414
import torch.nn as nn
1515

16-
from torchao.prototype.attention.utils import _is_fa3_available, _is_hopper
16+
from torchao.prototype.attention.utils import (
17+
_is_blackwell,
18+
_is_fa3_available,
19+
_is_fa4_available,
20+
_is_hopper,
21+
)
1722
from torchao.utils import torch_version_at_least
1823

1924
_TORCH_VERSION_AT_LEAST_2_11 = torch_version_at_least("2.11.0")
@@ -29,14 +34,19 @@ class AttentionBackend(str, Enum):
2934
"""Backend kernel for computing attention."""
3035

3136
FP8_FA3 = "FP8_FA3" # Requires SM90+ (Hopper)
37+
FP8_FA4 = "FP8_FA4" # Requires SM90+ (Hopper) or SM100+ (Blackwell)
3238

3339

3440
def _get_available_backend() -> AttentionBackend:
    """Select the best low-precision attention backend for the current GPU.

    Preference order: FA4 on Blackwell, FA3 on Hopper, then FA4 on Hopper
    as a fallback.

    Returns:
        The first :class:`AttentionBackend` whose hardware and kernel-package
        requirements are met.

    Raises:
        RuntimeError: if CUDA is unavailable, or no backend is compatible
            with the device's compute capability.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("Low-precision attention requires CUDA.")
    capability = torch.cuda.get_device_capability()
    # Availability checks stay lazy: a backend's package probe only runs
    # once the matching architecture check has passed.
    if _is_blackwell() and _is_fa4_available():
        return AttentionBackend.FP8_FA4
    if _is_hopper():
        if _is_fa3_available():
            return AttentionBackend.FP8_FA3
        if _is_fa4_available():
            return AttentionBackend.FP8_FA4
    raise RuntimeError(f"No compatible backend for SM{capability[0]}{capability[1]}.")
4151

4252

@@ -53,6 +63,16 @@ def _check_backend_available(backend: AttentionBackend) -> None:
5363
raise RuntimeError(
5464
"FP8_FA3 requires the flash-attn package with FA3 support."
5565
)
66+
elif backend == AttentionBackend.FP8_FA4:
67+
if not (_is_hopper() or _is_blackwell()):
68+
raise RuntimeError(
69+
f"FP8_FA4 requires Hopper or Blackwell, got SM{capability[0]}{capability[1]}."
70+
)
71+
if not _is_fa4_available():
72+
raise RuntimeError(
73+
"FP8_FA4 requires the flash-attn package with FA4 support "
74+
"(flash_attn.cute.interface)."
75+
)
5676
else:
5777
raise ValueError(f"Unknown backend: {backend}")
5878

@@ -95,4 +115,7 @@ def apply_low_precision_attention(
95115
if backend == AttentionBackend.FP8_FA3:
96116
return setup_fp8_backend(model, "FA3")
97117

118+
if backend == AttentionBackend.FP8_FA4:
119+
return setup_fp8_backend(model, "FA4")
120+
98121
raise ValueError(f"Unknown backend: {backend}")
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

"""FP8 attention built on the FA4 flash-attention backend.

Re-exports the FA4-bound SDPA entry points plus the shared quantization
helper so callers can import everything from this package.
"""

from torchao.prototype.attention.fp8_fa4.attention import (
    fp8_fa4_rope_sdpa,
    fp8_fa4_sdpa,
)
from torchao.prototype.attention.quantization import _fp8_sdpa_quantize

# NOTE(review): _fp8_sdpa_quantize is underscore-prefixed yet exported —
# presumably mirroring the fp8_fa3 package's public surface; confirm.
__all__ = [
    "fp8_fa4_sdpa",
    "fp8_fa4_rope_sdpa",
    "_fp8_sdpa_quantize",
]
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD 3-Clause license found in the
# LICENSE file in the root directory of this source tree.

"""FP8 SDPA using FA4 backend.

When using these functions directly (not through apply_low_precision_attention),
you must activate FA4 yourself::

    activate_flash_attention_impl("FA4")
    try:
        out = fp8_fa4_sdpa(q, k, v, is_causal=True)
    finally:
        restore_flash_attention_impl()
"""

from functools import partial

from torchao.prototype.attention.shared_utils.attention import (
    _fp8_rope_sdpa,
    _fp8_sdpa,
)
from torchao.prototype.attention.shared_utils.custom_ops import (
    register_fp8_attention_ops,
)


def _fa4_variant(generic_fn, public_name):
    """Bind *generic_fn* to the FA4 backend and relabel it for introspection.

    partial objects carry a writable __dict__, so __name__/__qualname__ can
    be attached to make tracebacks and reprs read like a plain function.
    """
    bound = partial(generic_fn, backend_name="FA4")
    bound.__doc__ = generic_fn.__doc__
    bound.__name__ = public_name
    bound.__qualname__ = public_name
    return bound


# Public FA4-specialized SDPA entry points.
fp8_fa4_sdpa = _fa4_variant(_fp8_sdpa, "fp8_fa4_sdpa")
fp8_fa4_rope_sdpa = _fa4_variant(_fp8_rope_sdpa, "fp8_fa4_rope_sdpa")

# Register the custom ops for this backend at import time; the handle is
# kept so the registrations stay alive for the module's lifetime.
_ops = register_fp8_attention_ops(
    backend_name="fa4",
    rope_sdpa_fn=fp8_fa4_rope_sdpa,
    sdpa_fn=fp8_fa4_sdpa,
)

torchao/prototype/attention/shared_utils/setup.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,8 @@ def setup_fp8_backend(
2525
) -> nn.Module:
2626
if flash_impl_name == "FA3":
2727
from torchao.prototype.attention.fp8_fa3.attention import _ops
28+
elif flash_impl_name == "FA4":
29+
from torchao.prototype.attention.fp8_fa4.attention import _ops
2830
else:
2931
raise ValueError(f"Unknown flash_impl_name: {flash_impl_name}")
3032

0 commit comments

Comments
 (0)