Commit fabbe14

allow the network to pay more attention to memory later into training, if need be
1 parent c43995b commit fabbe14

3 files changed: +4 -2 lines changed

README.md
Lines changed: 1 addition & 1 deletion

@@ -4,7 +4,7 @@
  Implementation of <a href="https://arxiv.org/abs/2203.08913">Memorizing Transformers</a> (ICLR 2022), attention net augmented with indexing and retrieval of memories using approximate nearest neighbors, in Pytorch

- This repository deviates from the paper slightly, using a hybrid attention across attention logits local and distant (knn). It also uses cosine similarity attention (with learned temperature) for the KNN attention layer.
+ This repository deviates from the paper slightly, using a hybrid attention across attention logits local and distant (rather than the sigmoid gate setup). It also uses cosine similarity attention (with learned temperature) for the KNN attention layer.

  ## Install
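The README sentence above describes two deviations from the paper: a single softmax over the concatenated local and retrieved (KNN) logits instead of the paper's learned sigmoid gate, and cosine-similarity attention with a learned temperature for the KNN branch. A minimal sketch of that idea, assuming illustrative tensor shapes and omitting masking, is below; the function name, the separate `local_scale` / `mem_scale` arguments, and the shapes are assumptions for clarity, not the repository's actual forward pass.

```python
import torch
import torch.nn.functional as F
from torch import einsum

def hybrid_attention(q, k, v, mem_k, mem_v, local_scale, mem_scale):
    # q, k, v: (batch, heads, seq, dim); mem_k, mem_v: (batch, heads, seq, knn, dim)
    # causal / padding masks are omitted for brevity

    # local attention logits, the usual scaled dot product
    sim_local = einsum('b h i d, b h j d -> b h i j', q, k) * local_scale

    # memory (KNN) logits: l2-normalized queries and keys give a cosine
    # similarity, scaled by a learned temperature (mem_scale)
    q_unit, mem_k_unit = map(lambda t: F.normalize(t, dim = -1), (q, mem_k))
    sim_mem = einsum('b h i d, b h i j d -> b h i j', q_unit, mem_k_unit) * mem_scale

    # hybrid attention: one softmax across the concatenated memory + local
    # logits, instead of the paper's sigmoid gate between two attention outputs
    attn = torch.cat((sim_mem, sim_local), dim = -1).softmax(dim = -1)

    num_mem = sim_mem.shape[-1]
    attn_mem, attn_local = attn[..., :num_mem], attn[..., num_mem:]

    out = einsum('b h i j, b h i j d -> b h i d', attn_mem, mem_v)
    out = out + einsum('b h i j, b h j d -> b h i d', attn_local, v)
    return out
```

Because both sets of logits compete inside one softmax, any per-head shift applied to the memory logits directly trades attention mass between the local and memory branches, which is what the change in the next file exploits.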

memorizing_transformers_pytorch/memorizing_transformers_pytorch.py
Lines changed: 2 additions & 0 deletions

@@ -191,6 +191,7 @@ def __init__(
          super().__init__()
          self.heads = heads
          self.scale = nn.Parameter(torch.ones(heads, 1, 1) * math.log(attn_scale_init))
+         self.knn_attn_bias = nn.Parameter(torch.zeros(heads, 1, 1))

          inner_dim = heads * dim_head
          self.xl_max_memories = xl_max_memories

@@ -251,6 +252,7 @@ def forward(
          mem_k, mem_v = mem_kv.unbind(dim = -2)

          sim_mem = einsum('b h i d, b h i j d -> b h i j', q, mem_k) * scale
+         sim_mem = sim_mem + self.knn_attn_bias
          sim_mem = sim_mem.masked_fill(~mem_mask, mask_value)

          # calculate new XL memories, as well as memories to be discarded
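The change here is just the two added lines: a per-head `knn_attn_bias` parameter, initialized to zero, added to the memory logits before masking. A zero init leaves the attention unchanged at the start of training, and since the memory and local logits share one softmax, the bias can grow over training to push more attention mass toward retrieved memories, matching the commit message. A standalone sketch of how the bias broadcasts over the memory logits follows; the batch size, sequence length, knn count, and the `scale.exp()` temperature usage are illustrative assumptions.

```python
import math
import torch
from torch import nn, einsum

heads, dim_head, attn_scale_init = 8, 64, 20

# learned temperature for the cosine-sim KNN attention (log-parameterized, as above)
scale = nn.Parameter(torch.ones(heads, 1, 1) * math.log(attn_scale_init))

# new per-head bias on the memory logits, starting at zero so nothing changes
# at initialization; the optimizer can raise it later if memories prove useful
knn_attn_bias = nn.Parameter(torch.zeros(heads, 1, 1))

# toy shapes: batch 2, sequence 1024, 32 retrieved memories per query
b, n, k = 2, 1024, 32
q = torch.randn(b, heads, n, dim_head)
mem_k = torch.randn(b, heads, n, k, dim_head)

# memory logits, scaled by the exponentiated learned temperature,
# then shifted by the per-head bias (broadcasts over batch, seq, knn)
sim_mem = einsum('b h i d, b h i j d -> b h i j', q, mem_k) * scale.exp()
sim_mem = sim_mem + knn_attn_bias
print(sim_mem.shape)  # torch.Size([2, 8, 1024, 32])
```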

setup.py
Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
  setup(
      name = 'memorizing-transformers-pytorch',
      packages = find_packages(exclude=[]),
-     version = '0.3.4',
+     version = '0.3.5',
      license='MIT',
      description = 'Memorizing Transformer - Pytorch',
      author = 'Phil Wang',
