fix handling of N in genetic sequence

lucidrains · lucidrains · commit 9440265c9668 · 2022-01-05T16:54:19.000-08:00
diff --git a/README.md b/README.md
@@ -24,7 +24,7 @@ model = Enformer(
     target_length = 896,
 )
 
-seq = torch.randint(0, 4, (1, 196_608)) # for ACGT, in that order
+seq = torch.randint(0, 5, (1, 196_608)) # for ACGTN, in that order
 output = model(seq)
 
 output['human'] # (1, 896, 5313)
@@ -36,7 +36,7 @@ You can also directly pass in the sequence as one-hot encodings, which must be f
 ```python
 import torch
 import torch.nn.functional as F
-from enformer_pytorch import Enformer
+from enformer_pytorch import Enformer, seq_indices_to_one_hot
 
 model = Enformer(
     dim = 1536,
@@ -46,8 +46,8 @@ model = Enformer(
     target_length = 896,
 )
 
-seq = torch.randint(0, 4, (1, 196_608))
-one_hot = F.one_hot(seq, num_classes = 4).float()
+seq = torch.randint(0, 5, (1, 196_608))
+one_hot = seq_indices_to_one_hot(seq)
 
 output = model(one_hot)
 
@@ -60,7 +60,7 @@ Finally, one can fetch the embeddings, for fine-tuning and otherwise, by setting
 ```python
 import torch
 import torch.nn.functional as F
-from enformer_pytorch import Enformer
+from enformer_pytorch import Enformer, seq_indices_to_one_hot
 
 model = Enformer(
     dim = 1536,
@@ -70,8 +70,8 @@ model = Enformer(
     target_length = 896,
 )
 
-seq = torch.randint(0, 4, (1, 196_608))
-one_hot = F.one_hot(seq, num_classes = 4).float()
+seq = torch.randint(0, 5, (1, 196_608))
+one_hot = seq_indices_to_one_hot(seq)
 
 output, embeddings = model(one_hot, return_embeddings = True)
 
@@ -82,7 +82,7 @@ For training, you can directly pass the head and target in to get the poisson lo
 
 ```python
 import torch
-from enformer_pytorch import Enformer
+from enformer_pytorch import Enformer, seq_indices_to_one_hot
 
 model = Enformer(
     dim = 1536,
@@ -92,7 +92,7 @@ model = Enformer(
     target_length = 200,
 ).cuda()
 
-seq = torch.randint(0, 4, (196_608 // 2,)).cuda()
+seq = torch.randint(0, 5, (196_608 // 2,)).cuda()
 target = torch.randn(200, 5313).cuda()
 
 loss = model(
@@ -188,7 +188,7 @@ model = HeadAdapterWrapper(
     num_tracks = 128
 ).cuda()
 
-seq = torch.randint(0, 4, (1, 196_608 // 2,)).cuda()
+seq = torch.randint(0, 5, (1, 196_608 // 2,)).cuda()
 target = torch.randn(1, 200, 128).cuda()  # 128 tracks
 
 loss = model(seq, target = target)
@@ -214,7 +214,7 @@ model = ContextAdapterWrapper(
     context_dim = 1024
 ).cuda()
 
-seq = torch.randint(0, 4, (1, 196_608 // 2,)).cuda()
+seq = torch.randint(0, 5, (1, 196_608 // 2,)).cuda()
 
 target = torch.randn(1, 200, 4).cuda()  # 4 tracks
 context = torch.randn(4, 1024).cuda()   # 4 contexts for the different 'tracks'
@@ -249,7 +249,7 @@ model = ContextAttentionAdapterWrapper(
     dim_head = 64           # dimension per head
 ).cuda()
 
-seq = torch.randint(0, 4, (1, 196_608 // 2,)).cuda()
+seq = torch.randint(0, 5, (1, 196_608 // 2,)).cuda()
 
 target = torch.randn(1, 200, 4).cuda()      # 4 tracks
 context = torch.randn(4, 16, 1024).cuda()   # 4 contexts for the different 'tracks', each with 16 tokens
diff --git a/enformer_pytorch/__init__.py b/enformer_pytorch/__init__.py
@@ -1,2 +1,2 @@
-from enformer_pytorch.enformer_pytorch import Enformer, SEQUENCE_LENGTH, AttentionPool
+from enformer_pytorch.enformer_pytorch import Enformer, SEQUENCE_LENGTH, AttentionPool, seq_indices_to_one_hot
 from enformer_pytorch.model_loader import load_pretrained_model
diff --git a/enformer_pytorch/enformer_pytorch.py b/enformer_pytorch/enformer_pytorch.py
@@ -33,6 +33,13 @@ def _round(x):
 def log(t, eps = 1e-20):
     return torch.log(t.clamp(min = eps))
 
+def seq_indices_to_one_hot(t):
+    """One-hot encode nucleotide indices into 4 float channels; index 4 (N) yields an all-zero row."""
+    # indices are clamped to 3 first so F.one_hot accepts the N index; those rows are zeroed afterwards
+    is_n = (t == 4).unsqueeze(-1)
+    encoded = F.one_hot(t.clamp(max = 3), num_classes = 4).float()
+    return encoded.masked_fill(is_n, 0.)
+
 # losses and metrics
 
 def poisson_loss(pred, target):
@@ -259,7 +266,6 @@ def __init__(
         heads = 8,
         output_heads = dict(human = 5313, mouse= 1643),
         target_length = TARGET_LENGTH,
-        num_alphabet = 4,
         attn_dim_key = 64,
         dropout_rate = 0.4,
         attn_dropout = 0.05,
@@ -268,15 +274,14 @@ def __init__(
     ):
         super().__init__()
         self.dim = dim
-        self.num_alphabet = num_alphabet
         half_dim = dim // 2
         twice_dim = dim * 2
 
         # create stem
 
         self.stem = nn.Sequential(
             Rearrange('b n d -> b d n'),
-            nn.Conv1d(num_alphabet, half_dim, 15, padding = 7),
+            nn.Conv1d(4, half_dim, 15, padding = 7),
             Residual(ConvBlock(half_dim)),
             AttentionPool(half_dim, pool_size = 2)
         )
@@ -402,7 +407,7 @@ def forward(
         dtype = x.dtype
 
         if x.dtype == torch.long:
-            x = F.one_hot(x, num_classes = self.num_alphabet).float()
+            x = seq_indices_to_one_hot(x)
 
         no_batch = x.ndim == 2
 
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
   name = 'enformer-pytorch',
   packages = find_packages(exclude=[]),
   include_package_data = True,
-  version = '0.1.24',
+  version = '0.1.25',
   license='MIT',
   description = 'Enformer - Pytorch',
   author = 'Phil Wang',

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,2 +1,2 @@` |
| `1` | | `-from enformer_pytorch.enformer_pytorch import Enformer, SEQUENCE_LENGTH, AttentionPool` |
| | `1` | `+from enformer_pytorch.enformer_pytorch import Enformer, SEQUENCE_LENGTH, AttentionPool, seq_indices_to_one_hot` |
| `2` | `2` | `from enformer_pytorch.model_loader import load_pretrained_model` |