@@ -1,5 +1,7 @@
-import torch
+from functools import partial
 from inspect import isfunction
+
+import torch
 from torch import nn, einsum
 import torch.nn.functional as F
 from einops import rearrange, repeat
@@ -11,6 +13,9 @@
 def exists(val):
     return val is not None
 
+def uniq(arr):
+    return{el: True for el in arr}.keys()
+
 def default(val, d):
     if exists(val):
         return val
@@ -89,15 +94,18 @@ def forward(self, x, mask = None):
         return out
 
 class SparseAttention(Attention):
-    def __init__(self, *args, **kwargs):
+    def __init__(self, *args, sparse_attn_global_indices = [], block_size = 16, **kwargs):
         super().__init__(*args, **kwargs)
         from deepspeed.ops.sparse_attention import SparseSelfAttention, VariableSparsityConfig
-        self.block_size = 16
+
+        self.block_size = block_size
+        global_blocks = uniq(map(lambda t: t // self.block_size, sparse_attn_global_indices))
 
         self.attn_fn = SparseSelfAttention(
             sparsity_config = VariableSparsityConfig(
                 num_heads = self.heads,
                 block = self.block_size,
+                global_block_indices = global_blocks,
                 attention = 'unidirectional' if self.causal else 'bidirectional'
             ),
             max_seq_length = self.seq_len,
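A note on the new global-index plumbing above: each entry of sparse_attn_global_indices is a token position, which is mapped to its block id by integer division with block_size, and uniq (added in the first hunk) deduplicates the block ids while preserving first-seen order. A minimal, dependency-free sketch of that mapping (the example positions are made up for illustration):

def uniq(arr):
    # dict keys preserve insertion order (Python 3.7+) and drop duplicates
    return {el: True for el in arr}.keys()

block_size = 16
sparse_attn_global_indices = [0, 1, 2, 512]  # hypothetical token positions

# token position -> block id via integer division, then deduplicate
global_blocks = uniq(map(lambda t: t // block_size, sparse_attn_global_indices))
print(list(global_blocks))  # [0, 32]: blocks 0 and 32 get full (global) attention

DeepSpeed's VariableSparsityConfig then receives these ids via global_block_indices, so tokens inside those blocks attend across the full sequence (and are attended to by it).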
@@ -148,14 +156,15 @@ def __init__(
         ff_mult = 4,
         attn_dropout = 0.,
         ff_dropout = 0.,
-        sparse_attn = True
+        sparse_attn = True,
+        sparse_attn_global_indices = []
     ):
         super().__init__()
         layers = nn.ModuleList([])
         sparse_layer = cast_tuple(sparse_attn, depth)
 
         for _, sparse_attn in zip(range(depth), sparse_layer):
-            attn_class = Attention if not sparse_attn else SparseAttention
+            attn_class = Attention if not sparse_attn else partial(SparseAttention, sparse_attn_global_indices = sparse_attn_global_indices)
 
             layers.append(nn.ModuleList([
                 PreNorm(dim, attn_class(dim, causal = causal, seq_len = seq_len, heads = heads, dim_head = dim_head, dropout = attn_dropout)),
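In the last hunk, functools.partial pre-binds sparse_attn_global_indices onto SparseAttention, so the per-layer construction call stays identical for both attention classes. Assuming the constructor shown there is the Transformer from dalle_pytorch.transformer (the class name is not visible in the hunk header) and that DeepSpeed's sparse attention ops are installed, the new keyword could be exercised roughly as below; parameter names are inferred from the visible hunks, so treat this as a sketch rather than the repository's documented API:

import torch
from dalle_pytorch.transformer import Transformer  # assumed module path

model = Transformer(
    dim = 512,
    depth = 2,
    seq_len = 1024,              # should be a multiple of the sparse block size (16)
    heads = 8,
    dim_head = 64,
    causal = True,
    sparse_attn = True,
    sparse_attn_global_indices = range(256)  # e.g. let the first 256 positions attend globally
)

x = torch.randn(1, 1024, 512)
out = model(x)  # (1, 1024, 512); the sparse kernels need a GPU and deepspeed[sparse_attn]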