
Commit d3111be

Added prototype low precision attention API to the docs

ghstack-source-id: f8e9811
Pull-Request: #4056

1 parent 36745b0 commit d3111be

File tree: 6 files changed, +85 −3 lines changed

Lines changed: 29 additions & 0 deletions

@@ -0,0 +1,29 @@
+.. _api_attention:
+
+=======================================
+torchao.prototype.attention (prototype)
+=======================================
+
+.. currentmodule:: torchao.prototype.attention
+
+High-Level API
+--------------
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    apply_low_precision_attention
+    AttentionBackend
+
+.. currentmodule:: torchao.prototype.attention.fp8_fa3.attention
+
+Direct Usage (FA3)
+------------------
+
+.. autosummary::
+    :toctree: generated/
+    :nosignatures:
+
+    fp8_fa3_sdpa
+    fp8_fa3_rope_sdpa
docs/source/api_reference/index.rst

Lines changed: 1 addition & 0 deletions

@@ -12,3 +12,4 @@ Comprehensive API documentation for torchao.
     api_ref_float8
     api_ref_utils
     api_ref_prototype_quant_logger
+    api_ref_prototype_attention
Lines changed: 37 additions & 0 deletions

@@ -0,0 +1,37 @@
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from torchao.prototype.attention import AttentionBackend, apply_low_precision_attention
+
+
+# Simple model with attention
+class MyModel(nn.Module):
+    def __init__(self, embed_dim=512, num_heads=8):
+        super().__init__()
+        self.num_heads = num_heads
+        self.head_dim = embed_dim // num_heads
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=False)
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=False)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=False)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)
+
+    def forward(self, x):
+        B, S, _ = x.shape
+        q = self.q_proj(x).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
+        k = self.k_proj(x).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
+        v = self.v_proj(x).view(B, S, self.num_heads, self.head_dim).transpose(1, 2)
+        attn_out = F.scaled_dot_product_attention(q, k, v, is_causal=True)
+        return self.out_proj(attn_out.transpose(1, 2).contiguous().view(B, S, -1))
+
+
+model = MyModel().to(device="cuda", dtype=torch.bfloat16).eval()
+
+# Auto-detect best backend
+model = apply_low_precision_attention(model)
+
+# Or specify a backend explicitly
+# model = apply_low_precision_attention(model, backend=AttentionBackend.FP8_FA3)
+
+# Optional: torch.compile for RoPE fusion
+model = torch.compile(model)
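The `forward` method in the example folds `(B, S, E)` activations into per-head `(B, H, S, D)` tensors before calling SDPA. To make that reshape arithmetic explicit, here is a small pure-Python sketch of the shape bookkeeping; the helper name `attention_shapes` is illustrative only and not part of the torchao API:

```python
def attention_shapes(batch, seq_len, embed_dim, num_heads):
    """Shape bookkeeping for the reshape in MyModel.forward:

    (B, S, E) --view--> (B, S, H, D) --transpose(1, 2)--> (B, H, S, D),
    where D = E // H must divide evenly.
    """
    if embed_dim % num_heads != 0:
        raise ValueError("embed_dim must be divisible by num_heads")
    head_dim = embed_dim // num_heads
    after_view = (batch, seq_len, num_heads, head_dim)
    after_transpose = (batch, num_heads, seq_len, head_dim)
    return after_view, after_transpose


# With the example's defaults (embed_dim=512, num_heads=8), head_dim is 64.
print(attention_shapes(2, 16, 512, 8))
```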
docs/source/workflows/index.md

Lines changed: 1 addition & 0 deletions

@@ -11,6 +11,7 @@ This page provides an overview of the various workflows available in torchao.
 * QAT: the [QAT documentation](qat.md) for details on how to use quantization-aware training to improve model accuracy after quantization.
 * Inference: See the [inference quantization documentation](inference.md) for an overview of quantization for inference workflows.
 
+
 ## Workflows status by dtype + hardware
 
 🟢 = stable, 🟡 = prototype, 🟠 = planned, ⚪ = not supported

docs/source/workflows/inference.md

Lines changed: 12 additions & 0 deletions

@@ -202,3 +202,15 @@ The benchmarks below were run on a single NVIDIA-A6000 GPU.
 | | codebook-4-64 | 10.095 | 1.73 | 8.63 | 23.11 | 4.98 |
 
 You can try out these APIs with the `quantize_` API as above alongside the `CodebookWeightOnlyConfig` config; an example can be found in `torchao/_models/llama/generate.py`.
+
+### Low-Precision FP8 Attention (Prototype)
+
+FP8 low-precision attention for inference, built on Flash Attention backends. Currently supports FA3 on Hopper (SM90) and FA4 on Blackwell (SM100).
+
+**Requirements:** PyTorch >= 2.11, Hopper or Blackwell GPU, Flash Attention 3 (`pip install flash-attn-3 --index-url=https://download.pytorch.org/whl/{cuda_version}`).
+
+```{literalinclude} ../examples/prototype/low_precision_attention.py
+:language: python
+```
+
+`apply_low_precision_attention` replaces all `F.scaled_dot_product_attention` calls with FP8 attention for eager execution. When combined with `torch.compile`, RoPE patterns are automatically detected and fused into a single kernel. For best results with `torch.compile`, disable KV caching before calling it. See the {ref}`API reference <api_attention>` for details.
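Since backend support is tied to compute capability (FA3 on SM90, FA4 on SM100), a program can check the GPU before opting in. The sketch below is a hedged illustration of that mapping, not torchao's actual selection logic; only `torch.cuda.get_device_capability()` (a real PyTorch API) is assumed on the commented line:

```python
def pick_fp8_backend(capability):
    """Map a CUDA compute capability (major, minor) to the FP8 attention
    backend described above. Illustrative only: the real auto-detection
    happens inside apply_low_precision_attention.
    """
    if capability == (9, 0):   # Hopper (SM90)
        return "FA3"
    if capability == (10, 0):  # Blackwell (SM100)
        return "FA4"
    return None  # no FP8 attention backend for this GPU

# On a CUDA machine you would pass the live capability, e.g.:
# pick_fp8_backend(torch.cuda.get_device_capability())
print(pick_fp8_backend((9, 0)))
```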

torchao/prototype/attention/api.py

Lines changed: 5 additions & 3 deletions

@@ -69,10 +69,12 @@ def apply_low_precision_attention(
 
     This replaces ``F.scaled_dot_product_attention`` with an FP8 SDPA
     for eager execution and sets a global pre-grad pass so that
-    ``torch.compile`` will automatically fuse RoPE where detected::
+    ``torch.compile`` will automatically fuse RoPE where detected.
 
-        model = apply_low_precision_attention(model)
-        model = torch.compile(model)  # RoPE fusion happens automatically
+    Example:
+
+    .. literalinclude:: ../../examples/prototype/low_precision_attention.py
+       :language: python
     """
     if not _TORCH_VERSION_AT_LEAST_2_11:
         raise RuntimeError("Low-precision attention requires PyTorch 2.11+.")
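The guard above gates the API on a flag like `_TORCH_VERSION_AT_LEAST_2_11`. As a rough sketch of what such a check involves, here is a minimal version-string comparison; the function name and parsing rules are assumptions for illustration, and torchao's actual helper may differ (e.g. by using `packaging.version`):

```python
def version_at_least(version_str, required):
    """Compare a torch-style version string against a (major, minor) tuple.

    Tolerates local build suffixes ("2.11.0+cu121") and simple pre-release
    tags ("2.11.0a0"). Hypothetical helper, not torchao's implementation.
    """
    base = version_str.split("+")[0]            # drop local suffix, e.g. "+cu121"
    base = base.replace("a", ".").replace("rc", ".")  # split off pre-release tags
    parts = base.split(".")
    major, minor = int(parts[0]), int(parts[1])
    return (major, minor) >= required

# A caller would feed in torch.__version__, e.g.:
# version_at_least(torch.__version__, (2, 11))
print(version_at_least("2.11.0+cu121", (2, 11)))
```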
