
Commit a2e0a70

rohan-varma authored and facebook-github-bot committed
Batch matmul fast path in MHAWithCache (facebookresearch#449)
Summary:
Pull Request resolved: facebookresearch#449

When doing self attention, an optimization is to combine the Q, K, and V input projection matrices and do a single matmul instead of three. This diff adds that optimization to MHAWithCache.

Differential Revision: D48418780

fbshipit-source-id: 0501341832910bf90a7ea1cc902b98f0760548ab
1 parent 951a452 commit a2e0a70
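
For context, here is a minimal, self-contained sketch of the fast path described in the summary: the Q, K, and V projection weights are packed into one linear layer, so a single matmul followed by a chunk replaces three separate matmuls. The names and shapes below (fused, q_proj, ...) are illustrative only, not the TorchMultimodal API.

# Illustrative sketch of the fused QKV projection; not the MHAWithCache code itself.
import torch
from torch import nn

torch.manual_seed(0)
dim = 16
x = torch.randn(2, 5, dim)  # (batch, seq_len, dim)

# Baseline: three separate projections -> three matmuls.
q_proj = nn.Linear(dim, dim)
k_proj = nn.Linear(dim, dim)
v_proj = nn.Linear(dim, dim)
q, k, v = q_proj(x), k_proj(x), v_proj(x)

# Fast path: pack the three weight matrices into one layer -> a single matmul,
# then split the output back into query, key, value.
fused = nn.Linear(dim, 3 * dim)
with torch.no_grad():
    fused.weight.copy_(torch.cat([q_proj.weight, k_proj.weight, v_proj.weight], dim=0))
    fused.bias.copy_(torch.cat([q_proj.bias, k_proj.bias, v_proj.bias], dim=0))
q2, k2, v2 = fused(x).chunk(3, dim=-1)

# Both paths produce the same projections (up to floating-point tolerance).
assert torch.allclose(q, q2, atol=1e-6)
assert torch.allclose(k, k2, atol=1e-6)
assert torch.allclose(v, v2, atol=1e-6)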


torchmultimodal/modules/layers/multi_head_attention.py

Lines changed: 22 additions & 4 deletions

@@ -7,16 +7,24 @@
 from typing import NamedTuple, Optional, Tuple, Union
 
 import torch
-
 import torch.nn.functional as F
 from torch import nn, Tensor
+from torch.nn import Module
 
 
 class MHAWithCacheOutput(NamedTuple):
     attn_output: Tensor
     past_key_value: Tuple[Tensor, Tensor]
 
 
+# def _batched_input_proj(
+#     query: Tensor, input_proj: Module
+# ) -> Tuple[Tensor, Tensor, Tensor]:
+#     projected_query = input_proj(query)
+#     query, key, value = projected_query.chunk(3, dim=-1)
+#     return query, key, value
+
+
 class MultiHeadSelfAttention(nn.Module):
     """
     Multihead self attention.

@@ -93,6 +101,7 @@ class MultiHeadAttentionWithCache(nn.Module):
         dropout (float): dropout rate
         add_bias (bool): if true, adds a learnable bias to query, key, value.
            Defaults to True.
+        is_self_attention
     """
 
     def __init__(

@@ -102,12 +111,17 @@ def __init__(
         num_heads: int,
         dropout: float = 0.0,
         add_bias: bool = True,
+        is_self_attention: bool = False,
    ) -> None:
        super().__init__()
        self.num_heads = num_heads
+        self.is_self_attention = is_self_attention
+        # Note: defining qkv and input_proj regardless of is_self_attention
+        # due to TorchScript compatibility.
        self.q_proj = nn.Linear(dim_q, dim_q, bias=add_bias)
        self.k_proj = nn.Linear(dim_kv, dim_q, bias=add_bias)
        self.v_proj = nn.Linear(dim_kv, dim_q, bias=add_bias)
+        self.input_proj_self_attn = nn.Linear(dim_q, 3 * dim_q, bias=add_bias)
        self.output_proj = nn.Linear(dim_q, dim_q)
        self.dropout = dropout
 

@@ -144,9 +158,13 @@ def forward(
        bsz = query.size(0)
        embed_dim = query.size(-1)
        head_dim = embed_dim // self.num_heads
-        query = self.q_proj(query)
-        key = self.k_proj(key)
-        value = self.v_proj(value)
+        if self.is_self_attention:
+            projected_query = self.input_proj_self_attn(query)
+            query, key, value = projected_query.chunk(3, dim=-1)
+        else:
+            query = self.q_proj(query)
+            key = self.k_proj(key)
+            value = self.v_proj(value)
 
        # bsz x seq_len x embed_dim => bsz x num_heads x seq_len x head_dim
        query = query.view(bsz, -1, self.num_heads, head_dim).transpose(1, 2)
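
Assuming the rest of the MultiHeadAttentionWithCache interface is unchanged by this diff, usage of the new fast path would look roughly like the sketch below. Only constructor arguments visible in the hunks above are used; the full forward signature and return type are not shown here, so calling the module with just (query, key, value) is an assumption.

# Hypothetical usage sketch; dimensions are arbitrary and the forward call is
# assumed to accept (query, key, value) positionally with defaults for other args.
import torch
from torchmultimodal.modules.layers.multi_head_attention import MultiHeadAttentionWithCache

mha = MultiHeadAttentionWithCache(
    dim_q=512,
    dim_kv=512,  # must equal dim_q for the fused self-attention projection
    num_heads=8,
    is_self_attention=True,  # take the single fused input-projection matmul path
)
x = torch.randn(2, 10, 512)  # (batch, seq_len, dim_q)
out = mha(x, x, x)  # self attention: query, key, and value are the same tensor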
