add pre-norm layernorms and a residual connection to the attention aggregation fine-tuning adapter

lucidrains · lucidrains · commit 04735c5c3c32 · 2021-12-30T11:21:45.000-08:00
diff --git a/enformer_pytorch/finetune.py b/enformer_pytorch/finetune.py
@@ -122,20 +122,25 @@ def __init__(
         super().__init__()
         assert isinstance(enformer, Enformer)
         self.enformer = enformer
+        enformer_hidden_dim = enformer.dim * 2
+
+        self.query_norm = nn.LayerNorm(enformer_hidden_dim)
+        self.key_values_norm = nn.LayerNorm(context_dim)
 
         self.scale = dim_head ** -0.5
         self.heads = heads
         inner_dim = heads * dim_head
-        self.to_queries = nn.Linear(enformer.dim * 2, inner_dim)
+        self.to_queries = nn.Linear(enformer_hidden_dim, inner_dim)
 
         self.null_key = nn.Parameter(torch.randn(inner_dim))
         self.null_value = nn.Parameter(torch.randn(inner_dim))
 
         self.to_key_values = nn.Linear(context_dim, inner_dim * 2, bias = False)
+        self.to_out = nn.Linear(inner_dim, enformer_hidden_dim)
 
-        self.to_out  = nn.Sequential(
-            nn.Linear(inner_dim, 1),
-            Rearrange('c ... 1 -> ... c'),
+        self.to_pred  = nn.Sequential(
+            nn.Linear(enformer_hidden_dim, 1),
+            Rearrange('b c ... 1 -> b ... c'),
             nn.Softplus()
         )
 
@@ -155,8 +160,8 @@ def forward(
         if context.ndim == 2:
             context = rearrange(context, 'b d -> b 1 d')
 
-        q = self.to_queries(embeddings)
-        k, v = self.to_key_values(context).chunk(2, dim = -1)
+        q = self.to_queries(self.query_norm(embeddings))
+        k, v = self.to_key_values(self.key_values_norm(context)).chunk(2, dim = -1)
 
         null_k, null_v = map(lambda t: repeat(t, 'd -> b 1 d', b = context.shape[0]), (self.null_key, self.null_value))
 
@@ -174,13 +179,21 @@ def forward(
 
         # aggregate
 
-        out = einsum('b c h i j, c h j d -> c h i d', attn, v)
+        out = einsum('b c h i j, c h j d -> b c h i d', attn, v)
+
+        out = rearrange(out, 'b c h n d -> b c n (h d)', h = h)
+
+        # combine heads
+
+        branch_out = self.to_out(out)
+
+        # residual
 
-        out = rearrange(out, 'c h n d -> c n (h d)', h = h)
+        embeddings = embeddings + branch_out
 
-        # combine heads and project / softplus
+        # to prediction
 
-        pred = self.to_out(out)
+        pred = self.to_pred(embeddings)
 
         if not exists(target):
             return pred
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
   name = 'enformer-pytorch',
   packages = find_packages(exclude=[]),
   include_package_data = True,
-  version = '0.1.14',
+  version = '0.1.15',
   license='MIT',
   description = 'Enformer - Pytorch',
   author = 'Phil Wang',