
Commit aaf9a0a

give knn attention layer one more way to tune out local if need be
1 parent fabbe14

2 files changed: +3 −1 lines changed


memorizing_transformers_pytorch/memorizing_transformers_pytorch.py

Lines changed: 2 additions & 0 deletions
@@ -191,6 +191,7 @@ def __init__(
         super().__init__()
         self.heads = heads
         self.scale = nn.Parameter(torch.ones(heads, 1, 1) * math.log(attn_scale_init))
+        self.local_attn_bias = nn.Parameter(torch.zeros(heads, 1, 1))
         self.knn_attn_bias = nn.Parameter(torch.zeros(heads, 1, 1))

         inner_dim = heads * dim_head

@@ -241,6 +242,7 @@ def forward(
         if exists(rel_pos_bias):
             sim = rel_pos_bias[..., -i:, -j:] + sim

+        sim = sim + self.local_attn_bias
         mask_value = -torch.finfo(sim.dtype).max

         causal_mask = torch.ones((i, j), dtype = torch.bool, device = device).triu(j - i + 1)
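
The commit adds a learnable per-head bias on the local attention logits, complementing the existing knn_attn_bias, so an individual head can learn to down-weight ("tune out") local attention relative to the kNN memory attention. Below is a minimal sketch of how such a per-head scalar bias folds into attention logits; the module name LocalBiasedAttention and its hyperparameters are hypothetical, the kNN-memory branch is omitted, and this is not the repository's actual class:

import torch
from torch import nn, einsum

class LocalBiasedAttention(nn.Module):
    # hypothetical simplified module: standard softmax attention plus a
    # per-head learnable scalar bias on the local attention logits,
    # mirroring the local_attn_bias parameter added in this commit
    def __init__(self, dim, heads = 8, dim_head = 64):
        super().__init__()
        self.heads = heads
        self.scale = dim_head ** -0.5
        inner_dim = heads * dim_head

        # one scalar per head, broadcast over all (query, key) positions
        self.local_attn_bias = nn.Parameter(torch.zeros(heads, 1, 1))

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Linear(inner_dim, dim)

    def forward(self, x):
        b, n, _ = x.shape
        h = self.heads

        q, k, v = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: t.reshape(b, n, h, -1).transpose(1, 2), (q, k, v))

        sim = einsum('b h i d, b h j d -> b h i j', q, k) * self.scale

        # the bias shifts each head's local logits up or down; training it to a
        # strongly negative value lets that head effectively tune out local attention
        sim = sim + self.local_attn_bias

        attn = sim.softmax(dim = -1)
        out = einsum('b h i j, b h j d -> b h i d', attn, v)
        out = out.transpose(1, 2).reshape(b, n, -1)
        return self.to_out(out)

# usage sketch
attn = LocalBiasedAttention(dim = 512, heads = 8)
out = attn(torch.randn(2, 1024, 512))   # -> (2, 1024, 512)

With local_attn_bias initialized at zero this behaves like plain softmax attention. In the full KNNAttention layer the analogous knn_attn_bias sits on the memory logits, so the two biases together let each head rebalance local versus memory attention during training.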

setup.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 setup(
   name = 'memorizing-transformers-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.3.5',
+  version = '0.3.6',
   license='MIT',
   description = 'Memorizing Transformer - Pytorch',
   author = 'Phil Wang',
