
Commit ce02358

Add parallel attention (#2630)
## 📌 Description

Add a `parallel_attention` module to FlashInfer that enables distributed attention computation using **Ulysses** (all-to-all head parallelism) and **Ring** (P2P KV exchange with online softmax merging) strategies, or a combination of both.

### New files

- **`parallel_attention.py`** — `ParallelAttention` class: the main entry point that wraps any registered attention backend and applies Ulysses/Ring parallelism transparently via decorators.
- **`parallel_config.py`** — Configuration classes:
  - `AttnParallelConfig`: singleton that manages `ulysses_size`, `ring_size`, device mesh creation, and process group accessors.
  - `UnevenCPConfig`: handles uneven context parallelism where the total sequence length is not divisible by `world_size`.
  - `VarlenCPConfig`: handles variable-length (ragged) batching where multiple sequences of different lengths are packed together.
- **`parallel_wrapper.py`** — Decorator implementations:
  - `ulysses_wrapper`: performs all-to-all communication to split heads across ranks, calls the inner function, then reverses the all-to-all.
  - `ring_wrapper`: implements ring attention with P2P KV exchange and online softmax correction across ring steps.
  - Helper functions: `all_to_all`, `ulysses_a2a_in/out`, `ring_fwd_out_correction`, `ring_fwd_softmax_lse_correction`, `ring_attn_p2p_communicate` (the merge math behind the two correction helpers is sketched right after this description).
- **`attention_ops.py`** — `AttentionOpManager` registry with decorator-based backend registration. Includes `FlashAttn3` as the first registered backend.
- **`utils.py`** — Utility functions: `convert_qkv_layout`, `convert_output_layout`, `split_varlen_input`.
- **`__init__.py`** — Package API re-exports.

### Tests

- **`tests/attention/test_parallel_attention.py`** — Pytest-based test suite covering:
  - Combined Ulysses + Ring attention (`test_attn_parallel`)
  - Uneven context parallelism (`test_uneven_attn_parallel`)
  - Ulysses-only varlen attention (`test_ulysses_varlen_attn_parallel`)
  - Ring-only varlen attention (`test_ring_varlen_attn_parallel`)
  - Parametrized over `tensor_layout` (`"HND"` / `"NHD"`)

### Key design decisions

- **Backend-agnostic**: any attention function can be registered via `@AttentionOpManager.register_attn("name")` and used with the parallel wrappers.
- **Decorator-based parallelism**: `@ulysses_wrapper` and `@ring_wrapper` are composable decorators — they can be stacked or used independently.
- **No causal support yet**: `is_causal=True` raises `NotImplementedError`.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Summary by CodeRabbit

* **New Features**
  * Parallel attention framework enabling distributed inference across multiple GPUs
  * Ulysses and Ring parallelism strategies to optimize throughput and reduce latency
  * Multiple pluggable attention backends with automatic kernel selection based on hardware
  * Variable-length sequence handling for flexible batch processing in distributed settings
  * Comprehensive utilities for tensor layout conversion and distributed sequence management
* **Tests**
  * Added comprehensive distributed test suite for parallel attention scenarios

---------

Co-authored-by: Sam (Kesen Li) <lsam@nvidia.com>
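
For reviewers unfamiliar with ring attention's merge step, here is a minimal sketch of the standard online-softmax correction that `ring_fwd_out_correction` and `ring_fwd_softmax_lse_correction` implement conceptually. It is the textbook formulation under assumed `[S, H, D]` output and `[S, H]` LSE shapes, not code taken from this diff:

```python
import torch


def merge_ring_step(out_acc, lse_acc, out_new, lse_new):
    """Fold one ring step's partial attention into the running result.

    out_*: [S, H, D] partial outputs for the same queries over different
    KV blocks; lse_*: [S, H] log-sum-exp of the softmax denominators.
    """
    # Stable log(e^a + e^b): the merged softmax denominator in log space.
    lse_merged = torch.logaddexp(lse_acc, lse_new)
    # Rescale each partial output by its share of the merged denominator.
    w_acc = torch.exp(lse_acc - lse_merged).unsqueeze(-1)
    w_new = torch.exp(lse_new - lse_merged).unsqueeze(-1)
    return out_acc * w_acc + out_new * w_new, lse_merged
```

Because LSE merging is associative, each rank can fold in KV blocks as they arrive over the ring, one P2P step at a time, and still match single-device attention.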
1 parent 06cb1b7 commit ce02358

8 files changed

Lines changed: 1937 additions & 0 deletions


`__init__.py` (Lines changed: 19 additions & 0 deletions)

from .parallel_attention import ParallelAttention as ParallelAttention
from .parallel_config import UnevenCPConfig as UnevenCPConfig
from .parallel_config import VarlenCPConfig as VarlenCPConfig
from .utils import split_varlen_input as split_varlen_input
from .utils import ulysses_varlen_config as ulysses_varlen_config
from .utils import ring_varlen_config as ring_varlen_config
from .utils import uneven_cp_config as uneven_cp_config
from .utils import get_parallel_groups as get_parallel_groups

__all__ = [
    "ParallelAttention",
    "UnevenCPConfig",
    "VarlenCPConfig",
    "split_varlen_input",
    "ulysses_varlen_config",
    "ring_varlen_config",
    "uneven_cp_config",
    "get_parallel_groups",
]
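
For orientation, a minimal usage sketch of the re-exported API. The import path and the `get_parallel_groups()` call shape are assumptions for illustration only; this diff does not show where the package is mounted inside `flashinfer`:

```python
# Illustrative sketch: the import path and the get_parallel_groups()
# return shape are assumed, not shown in this diff.
import torch
import torch.distributed as dist

from flashinfer.parallel_attention import ParallelAttention, get_parallel_groups

dist.init_process_group(backend="nccl")
ulysses_group, ring_group = get_parallel_groups()  # assumed to return both groups

# HND layout: [num_heads, local_seq_len, head_dim] on each rank.
q = torch.randn(8, 1024, 128, device="cuda", dtype=torch.bfloat16)
k = torch.randn(8, 1024, 128, device="cuda", dtype=torch.bfloat16)
v = torch.randn(8, 1024, 128, device="cuda", dtype=torch.bfloat16)

attn = ParallelAttention(
    attn_type="flash-attn3",
    ulysses_group=ulysses_group,
    ring_group=ring_group,
)
out = attn.run(q, k, v, tensor_layout="HND")
```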
`attention_ops.py` (Lines changed: 253 additions & 0 deletions)

import logging
import math

import torch

from .utils import (
    convert_output_layout,
    convert_qkv_layout,
)

logger = logging.getLogger(__name__)

try:
    import flash_attn_interface
except ImportError:
    flash_attn_interface = None

from flashinfer.prefill import fmha_varlen


class AttentionOpManager:
    _attn_registry: dict[str, type] = {}

    @classmethod
    def op_type(cls):
        return "attention"

    @classmethod
    def set_attn_config(cls, **kwargs):
        for key, value in kwargs.items():
            if hasattr(cls, key):
                setattr(cls, key, value)
            else:
                raise AttributeError(f"'{cls.__name__}' has no attribute '{key}'")

    @classmethod
    def register_attn(cls, attn_type):
        def decorator(attn_class):
            # Register the attention class
            cls._attn_registry[attn_type] = attn_class
            return attn_class

        return decorator

    @classmethod
    def get_impl(cls, name=None):
        if name is None:
            name = cls.attn_type
        attn_class = cls._attn_registry.get(name)
        if attn_class is None:
            raise ValueError(f"Attention function {name} not found in registry")
        return attn_class()  # Create and return an instance

    @classmethod
    def get_registered_types(cls):
        return list(cls._attn_registry.keys())


@AttentionOpManager.register_attn("flash-attn3")
class FlashAttn3:
    def __call__(
        self,
        query,
        key,
        value,
        attn_mask=None,
        is_causal=False,
        return_lse=False,
        tensor_layout="HND",
        cur_rank_cu_seqlens_q=None,
        cur_rank_cu_seqlens_k=None,
        cur_rank_max_seqlen_q=0,
        cur_rank_max_seqlen_k=0,
        **kwargs,
    ):
        if flash_attn_interface is None:
            raise ImportError("FlashAttn3 is not installed")

        if tensor_layout not in ["HND", "NHD"]:
            raise NotImplementedError("Tensor layout not supported for FlashAttn3")

        if tensor_layout == "HND":
            query, key, value = convert_qkv_layout(
                query, key, value, src_layout="HND", dst_layout="NHD"
            )

        if attn_mask is not None:
            raise NotImplementedError("FlashAttn3 does not support attn_mask yet")

        # FA3 only supports float16 and bfloat16
        origin_dtype = query.dtype
        if query.dtype not in [torch.float16, torch.bfloat16]:
            query = query.to(torch.float16)
            key = key.to(torch.float16)
            value = value.to(torch.float16)

        if cur_rank_cu_seqlens_q is None:
            query = torch.unsqueeze(query, dim=0)
            key = torch.unsqueeze(key, dim=0)
            value = torch.unsqueeze(value, dim=0)
            output = flash_attn_interface.flash_attn_func(
                q=query,
                k=key,
                v=value,
                softmax_scale=None,
                causal=is_causal,
                qv=None,
                q_descale=None,
                k_descale=None,
                v_descale=None,
                window_size=(-1, -1),
                attention_chunk=0,
                softcap=0.0,
                num_splits=1,
                pack_gqa=None,
                deterministic=False,
                sm_margin=0,
                return_attn_probs=return_lse,
            )

            if isinstance(output, tuple):
                lse = torch.squeeze(output[1], dim=0)
                output = torch.squeeze(output[0], dim=0)
                output = (output, lse)
            else:
                output = torch.squeeze(output, dim=0)

        else:
            output = flash_attn_interface.flash_attn_varlen_func(
                q=query,
                k=key,
                v=value,
                cu_seqlens_q=cur_rank_cu_seqlens_q,
                cu_seqlens_k=cur_rank_cu_seqlens_k,
                max_seqlen_q=cur_rank_max_seqlen_q,
                max_seqlen_k=cur_rank_max_seqlen_k,
                seqused_q=None,
                seqused_k=None,
                softmax_scale=None,
                causal=is_causal,
                qv=None,
                q_descale=None,
                k_descale=None,
                v_descale=None,
                window_size=(-1, -1),
                attention_chunk=0,
                softcap=0.0,
                num_splits=1,
                pack_gqa=None,
                deterministic=False,
                sm_margin=0,
                return_attn_probs=return_lse,
            )

        lse = None
        if isinstance(output, tuple):
            lse = output[1]
            output = output[0]

        if tensor_layout == "HND":
            output = convert_output_layout(output, src_layout="NHD", dst_layout="HND")

        if tensor_layout == "NHD" and lse is not None:
            lse = lse.permute(1, 0)

        if output.dtype != origin_dtype:
            output = output.to(origin_dtype)

        if return_lse:
            assert lse is not None, "lse is not returned by FlashAttn3"
            return output, lse
        else:
            return output


@AttentionOpManager.register_attn("cutlass")
class CutlassFmha:
    def __call__(
        self,
        query,
        key,
        value,
        attn_mask=None,
        is_causal=False,
        return_lse=False,
        tensor_layout="HND",
        cur_rank_cu_seqlens_q=None,
        cur_rank_cu_seqlens_k=None,
        cur_rank_max_seqlen_q=0,
        cur_rank_max_seqlen_k=0,
        **kwargs,
    ):
        if tensor_layout not in ["HND", "NHD"]:
            raise NotImplementedError("Tensor layout not supported for CutlassFmha")

        if tensor_layout == "HND":
            query, key, value = convert_qkv_layout(
                query, key, value, src_layout="HND", dst_layout="NHD"
            )

        if attn_mask is not None:
            raise NotImplementedError("CutlassFmha does not support attn_mask yet")

        # CutlassFmha only supports float16 and bfloat16
        origin_dtype = query.dtype
        if query.dtype not in [torch.float16, torch.bfloat16]:
            query = query.to(torch.float16)
            key = key.to(torch.float16)
            value = value.to(torch.float16)

        if cur_rank_cu_seqlens_q is None:
            qo_segment_offsets = torch.tensor(
                [0, query.shape[0]], device=query.device, dtype=torch.int32
            )
            kv_segment_offsets = torch.tensor(
                [0, key.shape[0]], device=key.device, dtype=torch.int32
            )
            max_qo_len = query.shape[0]
        else:
            qo_segment_offsets = cur_rank_cu_seqlens_q
            kv_segment_offsets = cur_rank_cu_seqlens_k
            max_qo_len = cur_rank_max_seqlen_q

        output = fmha_varlen(
            query,
            key,
            value,
            qo_segment_offsets=qo_segment_offsets,
            kv_segment_offsets=kv_segment_offsets,
            max_qo_len=max_qo_len,
            causal=is_causal,
            sm_scale=1.0 / math.sqrt(query.size(-1)),
            return_lse=return_lse,
        )

        lse = None
        if isinstance(output, tuple):
            lse = output[1]
            output = output[0]

        if tensor_layout == "HND":
            output = convert_output_layout(output, src_layout="NHD", dst_layout="HND")
            if lse is not None:
                lse = lse.permute(1, 0)

        if output.dtype != origin_dtype:
            output = output.to(origin_dtype)

        if return_lse:
            assert lse is not None, "lse is not returned by cutlass fmha"
            return output, lse
        else:
            return output
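
The registry above is backend-agnostic, so a third backend can be added without touching the parallel wrappers. Below is a hypothetical sketch (not part of this diff) built on `torch.nn.functional.scaled_dot_product_attention`; the `torch-sdpa` name and the class are invented for illustration, and only the dense path is handled. A real backend would also need `return_lse` support to work under the ring wrapper, which merges partial results by LSE:

```python
import torch
import torch.nn.functional as F


@AttentionOpManager.register_attn("torch-sdpa")
class TorchSdpa:
    def __call__(
        self,
        query,
        key,
        value,
        attn_mask=None,
        is_causal=False,
        return_lse=False,
        tensor_layout="HND",
        **kwargs,
    ):
        if return_lse:
            # SDPA does not expose the softmax LSE, so this sketch cannot
            # serve the ring path, which needs it for online merging.
            raise NotImplementedError("torch-sdpa sketch does not return LSE")
        # SDPA batches over leading dims with shape [..., S, D]; HND input
        # ([H, S, D]) already matches, NHD ([S, H, D]) needs a transpose.
        if tensor_layout == "NHD":
            query, key, value = (t.transpose(0, 1) for t in (query, key, value))
        out = F.scaled_dot_product_attention(
            query, key, value, attn_mask=attn_mask, is_causal=is_causal
        )
        return out.transpose(0, 1) if tensor_layout == "NHD" else out


# The new backend is then selectable by name, like the built-in ones:
impl = AttentionOpManager.get_impl("torch-sdpa")
```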
`parallel_attention.py` (Lines changed: 117 additions & 0 deletions)

import logging

import torch

from .attention_ops import AttentionOpManager
from .parallel_config import UnevenCPConfig, VarlenCPConfig
from .parallel_wrapper import ring_wrapper, ulysses_wrapper

logger = logging.getLogger(__name__)


class ParallelAttention:
    """Runs an attention backend with Ulysses and/or Ring parallelism.

    Wraps any registered attention implementation (see :class:`AttentionOpManager`)
    and transparently applies Ulysses (all-to-all head splitting) and Ring
    (P2P KV exchange with online softmax merging) parallelism via decorators.

    Args:
        attn_type: Name of the registered attention backend (e.g. ``"flash-attn3"``).
        ulysses_group: Ulysses process group.
        ring_group: Ring process group.
        uneven_cp_config: Configuration for uneven context parallelism where
            sequence lengths are not evenly divisible across ranks.
        varlen_cp_config: Configuration for variable-length context parallelism
            where multiple sequences of different lengths are packed together.
        fuse_qkv: If ``True``, fuse Q/K/V into a single all-to-all communication
            in Ulysses parallelism (reduces 3 NCCL calls to 1).

    Example::

        config = AttnParallelConfig()
        config.set_config(ulysses_size=2, ring_size=2)
        attn = ParallelAttention(
            attn_type="flash-attn3",
            ulysses_group=ulysses_group,
            ring_group=ring_group,
        )
        output = attn.run(query, key, value, tensor_layout="HND")
    """

    def __init__(
        self,
        attn_type: str,
        ulysses_group: torch.distributed.ProcessGroup,
        ring_group: torch.distributed.ProcessGroup,
        uneven_cp_config: UnevenCPConfig = None,
        varlen_cp_config: VarlenCPConfig = None,
        fuse_qkv: bool = False,
    ):
        self.attn_type = attn_type
        self.attn_impl = AttentionOpManager.get_impl(attn_type)
        self.ulysses_group = ulysses_group
        self.ring_group = ring_group
        self.uneven_cp_config = uneven_cp_config
        self.varlen_cp_config = varlen_cp_config
        self.fuse_qkv = fuse_qkv

    @ulysses_wrapper
    @ring_wrapper
    def run(
        self,
        query,
        key,
        value,
        tensor_layout,
        attn_mask=None,
        is_causal=False,
        return_lse=False,
        cur_rank_cu_seqlens_q=None,
        cur_rank_cu_seqlens_k=None,
        cur_rank_max_seqlen_q=0,
        cur_rank_max_seqlen_k=0,
        **kwargs,
    ):
        """Run parallel attention on the local rank's portion of Q/K/V.

        The Ulysses and Ring wrappers transparently handle communication
        before and after this method is called.

        Args:
            query: Query tensor, shape ``[H, S, D]`` (HND) or ``[S, H, D]`` (NHD).
            key: Key tensor, same layout as query.
            value: Value tensor, same layout as query.
            tensor_layout: ``"HND"`` or ``"NHD"``.
            attn_mask: Optional attention mask (not yet supported).
            is_causal: Whether to apply causal masking (not yet supported).
            return_lse: Must be ``False``; internally managed by the ring wrapper.
            cur_rank_cu_seqlens_q / cur_rank_cu_seqlens_k /
            cur_rank_max_seqlen_q / cur_rank_max_seqlen_k:
                Do not set these manually; the parallel wrappers populate them
                from ``uneven_cp_config`` or ``varlen_cp_config``.
            **kwargs: Additional arguments forwarded to the attention backend.

        Returns:
            torch.Tensor: Attention output for the local rank, same layout as input.
        """
        if is_causal:
            raise NotImplementedError(
                "parallel attention does not support causal attention right now"
            )

        attn_inputs = {
            "query": query,
            "key": key,
            "value": value,
            "tensor_layout": tensor_layout,
            "attn_mask": attn_mask,
            "is_causal": is_causal,
            "return_lse": return_lse,
            "cur_rank_cu_seqlens_q": cur_rank_cu_seqlens_q,
            "cur_rank_cu_seqlens_k": cur_rank_cu_seqlens_k,
            "cur_rank_max_seqlen_q": cur_rank_max_seqlen_q,
            "cur_rank_max_seqlen_k": cur_rank_max_seqlen_k,
        }

        return self.attn_impl(**attn_inputs, **kwargs)
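
One subtlety worth calling out: decorator order on `run` means `run = ulysses_wrapper(ring_wrapper(run))`, so at call time the Ulysses all-to-all fires first and the ring KV exchange happens inside each Ulysses head shard. For readers adding their own wrapper, here is a minimal skeleton of the communicate/delegate/communicate-back shape such a wrapper takes; it is illustrative, not the PR's implementation:

```python
import functools


def my_comm_wrapper(fn):
    """Skeleton of a composable parallel wrapper (illustrative only)."""

    @functools.wraps(fn)
    def wrapped(self, query, key, value, *args, **kwargs):
        # Pre-communication: e.g. an all-to-all that trades sequence shards
        # for head shards, as ulysses_wrapper does (omitted here).
        out = fn(self, query, key, value, *args, **kwargs)
        # Post-communication: reverse the exchange so the caller sees the
        # original sharding again.
        return out

    return wrapped
```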
