
Commit de732e8

complete sparse attention integration with transformer, user can now define which types of sparse attention to cycle between down the layers
1 parent: 30a13eb


5 files changed: +87 -15 lines


README.md  (+29 -1)
````diff
@@ -152,6 +152,34 @@ dalle = DALLE(
 
 ## Sparse Attention
 
+The blogpost alluded to a mixture of different types of sparse attention, used mainly on the image (while the text presumably had full causal attention). I have done my best to replicate these types of sparse attention, based on the scant details released. Primarily, it seems as though they are doing causal axial row / column attention, combined with a causal convolution-like attention.
+
+By default `DALLE` will use full attention for all layers, but you can specify the attention types as follows.
+
+- `full` full attention
+
+- `axial_row` axial attention, along the rows of the image feature map
+
+- `axial_col` axial attention, along the columns of the image feature map
+
+- `conv_like` convolution-like attention, for the image feature map
+
+
+```python
+d = DALLE(
+    dim = 1024,
+    vae = vae,
+    num_text_tokens = 10000,
+    text_seq_len = 256,
+    depth = 64,
+    heads = 16,
+    reversible = True,
+    attn_types = ['full', 'axial_row', 'axial_col', 'conv_like'] # cycles between these four types of attention
+)
+```
+
+## Deepspeed Sparse Attention
+
 You can also train with Microsoft Deepspeed's <a href="https://www.deepspeed.ai/news/2020/09/08/sparse-attention.html">Sparse Attention</a>, with any combination of dense and sparse attention that you'd like. However, you will have to endure the installation process.
 
 First, you need to install Deepspeed with Sparse Attention
@@ -176,7 +204,7 @@ dalle = DALLE(
     text_seq_len = 256,
     depth = 64,
     heads = 8,
-    sparse_attn = (True, False) * 32 # interleave sparse and dense attention for 64 layers
+    attn_types = ('full', 'sparse') # interleave sparse and dense attention for 64 layers
 )
 ```
````
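
To make the cycling concrete, here is a minimal sketch (not part of the repository) of how a list of attention types maps onto layer depth, mirroring the `islice(cycle(...))` logic added in `dalle_pytorch/transformer.py`:

```python
from itertools import cycle, islice

# settings taken from the README example above
depth = 64
attn_types = ['full', 'axial_row', 'axial_col', 'conv_like']

# layer i is assigned attn_types[i % len(attn_types)]
layer_types = list(islice(cycle(attn_types), depth))

print(layer_types[:8])
# ['full', 'axial_row', 'axial_col', 'conv_like', 'full', 'axial_row', 'axial_col', 'conv_like']
```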

dalle_pytorch/attention.py  (+24 -6)

```diff
@@ -78,6 +78,7 @@ def __init__(self, dim, seq_len, image_size = 32, kernel_size = 5, dilation = 1,
         assert kernel_size % 2 == 1, 'kernel size must be odd'
 
         inner_dim = dim_head * heads
+        self.seq_len = seq_len
         self.heads = heads
         self.scale = dim_head ** -0.5
         self.image_size = image_size
@@ -92,14 +93,21 @@ def __init__(self, dim, seq_len, image_size = 32, kernel_size = 5, dilation = 1,
         )
 
     def forward(self, x, mask = None):
-        b, n, _, h, img_size, kernel_size, dilation, device = *x.shape, self.heads, self.image_size, self.kernel_size, self.dilation, x.device
+        b, n, _, h, img_size, kernel_size, dilation, seq_len, device = *x.shape, self.heads, self.image_size, self.kernel_size, self.dilation, self.seq_len, x.device
+
+        if n < seq_len:
+            padding = seq_len - n
+            x = F.pad(x, (0, 0, 0, padding), value = 0)
+            if exists(mask):
+                mask = F.pad(mask, (0, padding), value = False)
+
         qkv = self.to_qkv(x).chunk(3, dim = -1)
         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h = h), qkv)
 
         q *= self.scale
 
         img_seq_len = img_size ** 2
-        text_len = n - img_seq_len
+        text_len = seq_len - img_seq_len
         ((q_text, q_img), (k_text, k_img), (v_text, v_img)) = map(lambda t: (t[:, img_seq_len:], t[:, -img_seq_len:]), (q, k, v))
 
         # text attention
@@ -160,7 +168,7 @@ def forward(self, x, mask = None):
 
         out = rearrange(out, '(b h) n d -> b n (h d)', h = h)
         out = self.to_out(out)
-        return out
+        return out[:, :n]
 
 # sparse axial causal attention
 
@@ -171,6 +179,7 @@ def __init__(self, dim, seq_len, image_size = 32, axis = 0, heads = 8, dim_head
         self.axis = axis
 
         inner_dim = dim_head * heads
+        self.seq_len = seq_len
         self.heads = heads
         self.scale = dim_head ** -0.5
         self.image_size = image_size
@@ -183,14 +192,23 @@ def __init__(self, dim, seq_len, image_size = 32, axis = 0, heads = 8, dim_head
         )
 
     def forward(self, x, mask = None):
-        b, n, _, h, img_size, axis, device = *x.shape, self.heads, self.image_size, self.axis, x.device
+        b, n, _, h, img_size, axis, seq_len, device = *x.shape, self.heads, self.image_size, self.axis, self.seq_len, x.device
+
+        if n < seq_len:
+            padding = seq_len - n
+            x = F.pad(x, (0, 0, 0, padding), value = 0)
+
+            if exists(mask):
+                mask = F.pad(mask, (0, padding), value = False)
+
         qkv = self.to_qkv(x).chunk(3, dim = -1)
         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h = h), qkv)
 
         q *= self.scale
 
         img_seq_len = img_size ** 2
-        text_len = n - img_seq_len
+        text_len = seq_len - img_seq_len
+
         ((q_text, q_img), (k_text, k_img), (v_text, v_img)) = map(lambda t: (t[:, img_seq_len:], t[:, -img_seq_len:]), (q, k, v))
 
         # text attention
@@ -245,7 +263,7 @@ def forward(self, x, mask = None):
 
         out = rearrange(out, '(b h) n d -> b n (h d)', h = h)
         out = self.to_out(out)
-        return out
+        return out[:, :n]
 
 # microsoft sparse attention CUDA kernel
```
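
Both sparse attention classes gain the same pattern here: pad the input (and any mask) up to the fixed `seq_len` before computing attention, then slice the output back to the original length, which is what the new `return out[:, :n]` does. A minimal standalone sketch of that padding logic (the helper name is illustrative, not part of the library):

```python
import torch
import torch.nn.functional as F

def pad_to_seq_len(x, mask, seq_len):
    # x: (batch, n, dim) token embeddings, mask: optional (batch, n) boolean mask
    b, n, _ = x.shape
    if n >= seq_len:
        return x, mask, n

    padding = seq_len - n
    # F.pad pads trailing dimensions first: (0, 0) leaves the feature dim alone,
    # (0, padding) pads the sequence dimension on the right
    x = F.pad(x, (0, 0, 0, padding), value = 0.)

    if mask is not None:
        mask = F.pad(mask, (0, padding), value = False)
    return x, mask, n

x = torch.randn(2, 100, 512)
mask = torch.ones(2, 100).bool()

x, mask, n = pad_to_seq_len(x, mask, seq_len = 320)   # x is now (2, 320, 512)
# ... attention over the fixed-length sequence would go here ...
out = x[:, :n]                                        # slice back to the original 100 tokens
```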

dalle_pytorch/dalle_pytorch.py  (+6 -2)

```diff
@@ -257,14 +257,16 @@ def __init__(
         sparse_attn = False,
         noncausal_attn_len = 0,
         ignore_index = -100,
-        tie_codebook_image_emb = False
+        attn_types = None,
+        tie_codebook_image_emb = False,
     ):
         super().__init__()
         assert isinstance(vae, DiscreteVAE), 'vae must be an instance of DiscreteVAE'
 
         image_size = vae.image_size
         num_image_tokens = vae.num_tokens
-        image_seq_len = (vae.image_size // (2 ** vae.num_layers)) ** 2
+        image_fmap_size = (vae.image_size // (2 ** vae.num_layers))
+        image_seq_len = image_fmap_size ** 2
 
         self.text_emb = nn.Embedding(num_text_tokens, dim)
         self.image_emb = nn.Embedding(num_image_tokens, dim)
@@ -304,6 +306,8 @@ def __init__(
             attn_dropout = attn_dropout,
             ff_dropout = ff_dropout,
             noncausal_attn_len = (noncausal_attn_len + 1),
+            attn_types = attn_types,
+            image_fmap_size = image_fmap_size,
             sparse_attn = sparse_attn,
             sparse_attn_global_indices = range(text_seq_len)
         )
```
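
The refactor above splits the old one-liner so that the feature-map side length is available on its own; the new sparse attention classes need `image_fmap_size`, while the sequence logic still needs `image_seq_len`. A quick worked example with an assumed VAE configuration (256-pixel images, 3 downsampling layers):

```python
# assumed VAE configuration, for illustration only
image_size = 256        # vae.image_size
num_layers = 3          # vae.num_layers

image_fmap_size = image_size // (2 ** num_layers)   # 256 // 8 = 32
image_seq_len = image_fmap_size ** 2                # 32 * 32 = 1024 image tokens
```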

dalle_pytorch/transformer.py  (+27 -5)

```diff
@@ -1,15 +1,22 @@
 from functools import partial
+from itertools import islice, cycle
 
 import torch
 from torch import nn, einsum
 import torch.nn.functional as F
-from einops import rearrange, repeat
+from einops import rearrange
 
 from dalle_pytorch.reversible import ReversibleSequence, SequentialSequence
 from dalle_pytorch.attention import Attention, SparseAttention, SparseConvCausalAttention, SparseAxialCausalAttention
 
 # helpers
 
+def exists(val):
+    return val is not None
+
+def default(val, d):
+    return val if exists(val) else d
+
 def cast_tuple(val, depth):
     return val if isinstance(val, tuple) else (val,) * depth
 
@@ -57,18 +64,33 @@ def __init__(
         attn_dropout = 0.,
         ff_dropout = 0.,
         noncausal_attn_len = 0,
+        attn_types = None,
+        image_fmap_size = None,
         sparse_attn = False,
         sparse_attn_global_indices = []
     ):
         super().__init__()
         layers = nn.ModuleList([])
         sparse_layer = cast_tuple(sparse_attn, depth)
-
-        for _, sparse_attn in zip(range(depth), sparse_layer):
-            attn_class = Attention if not sparse_attn else partial(SparseAttention, sparse_attn_global_indices = sparse_attn_global_indices)
+        attn_types = default(attn_types, ('full',))
+        attn_type_layer = islice(cycle(attn_types), depth)
+
+        for _, sparse_attn, attn_type in zip(range(depth), sparse_layer, attn_type_layer):
+            if attn_type == 'full':
+                attn_class = partial(Attention, noncausal_attn_len = noncausal_attn_len)
+            elif attn_type == 'sparse':
+                attn_class = partial(SparseAttention, sparse_attn_global_indices = sparse_attn_global_indices)
+            elif attn_type == 'axial_row':
+                attn_class = partial(SparseAxialCausalAttention, seq_len = seq_len, axis = 0, image_size = image_fmap_size)
+            elif attn_type == 'axial_col':
+                attn_class = partial(SparseAxialCausalAttention, seq_len = seq_len, axis = 1, image_size = image_fmap_size)
+            elif attn_type == 'conv_like':
+                attn_class = partial(SparseConvCausalAttention, seq_len = seq_len, image_size = image_fmap_size)
+            else:
+                raise ValueError(f'attention type "{attn_type}" is not valid')
 
             layers.append(nn.ModuleList([
-                PreNorm(dim, attn_class(dim, causal = causal, seq_len = seq_len, heads = heads, dim_head = dim_head, dropout = attn_dropout, noncausal_attn_len = noncausal_attn_len)),
+                PreNorm(dim, attn_class(dim, causal = causal, seq_len = seq_len, heads = heads, dim_head = dim_head, dropout = attn_dropout)),
                 PreNorm(dim, FeedForward(dim, mult = ff_mult, dropout = ff_dropout))
             ]))
```
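
A detail worth noting in the layer construction above: each branch pre-binds the type-specific constructor arguments with `functools.partial`, and the shared arguments are then supplied in one uniform call inside `layers.append(...)`. Keyword arguments bound in a `partial` can be overridden (or harmlessly repeated) at call time, which is why `seq_len` appearing in both places works. A small illustration with a stand-in class (not from the library):

```python
from functools import partial

class StubAttention:
    def __init__(self, dim, *, seq_len, axis = 0, heads = 8):
        self.config = dict(dim = dim, seq_len = seq_len, axis = axis, heads = heads)

# pre-bind the type-specific arguments, as the elif branches do
attn_class = partial(StubAttention, seq_len = 1280, axis = 1)

# the shared call repeats seq_len; call-time keywords simply take precedence
layer = attn_class(1024, seq_len = 1280, heads = 16)
print(layer.config)   # {'dim': 1024, 'seq_len': 1280, 'axis': 1, 'heads': 16}
```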

setup.py  (+1 -1)

```diff
@@ -3,7 +3,7 @@
 setup(
   name = 'dalle-pytorch',
   packages = find_packages(),
-  version = '0.0.54',
+  version = '0.0.55',
   license='MIT',
   description = 'DALL-E - Pytorch',
   author = 'Phil Wang',
```
