add util function for converting genetic sequence string representations to index tensors, ready for one-hot encoding

lucidrains · lucidrains · commit 9756c22413cb · 2022-01-05T21:27:16.000-08:00
diff --git a/enformer_pytorch/enformer_pytorch.py b/enformer_pytorch/enformer_pytorch.py
@@ -1,12 +1,12 @@
 import math
 import torch
 from torch import nn, einsum
+import torch.nn.functional as F
 from torch.utils.checkpoint import checkpoint_sequential
+
 from einops import rearrange, reduce
 from einops.layers.torch import Rearrange
 
-import torch.nn.functional as F
-
 # constants
 
 SEQUENCE_LENGTH = 196_608
@@ -33,12 +33,17 @@ def _round(x):
 def log(t, eps = 1e-20):
     return torch.log(t.clamp(min = eps))
 
+# sequence helpers
+
+def str_to_seq_indices(seq_strs):
+    # lowercase every sequence string and look up the integer index of each character
+    base_to_index = {'a': 0, 'c': 1, 'g': 2, 't': 3, 'n': 4}
+    rows = [torch.Tensor([base_to_index[base] for base in seq.lower()]) for seq in seq_strs]
+    return torch.stack(rows).long()
+
 def seq_indices_to_one_hot(t):
-    wildcard = t == 4 # the Ns in the sequence
-    t = t.clamp(max = 3)
-    one_hot = F.one_hot(t, num_classes = 4)
-    one_hot = one_hot.masked_fill(wildcard[..., None], 0.)
-    return one_hot.float()
+    one_hot = F.one_hot(t, num_classes = 5) # index 4 (the Ns) gets its own fifth class
+    return one_hot[..., :4].float() # drop the fifth class so index 4 becomes an all-zero row
 
 # losses and metrics
 
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
   name = 'enformer-pytorch',
   packages = find_packages(exclude=[]),
   include_package_data = True,
-  version = '0.1.25',
+  version = '0.1.26',
   license='MIT',
   description = 'Enformer - Pytorch',
   author = 'Phil Wang',