speed up genomic string to one-hot / seq-indices conversion by using byte-indexed embedding lookup tables

lucidrains · lucidrains · commit d73f17d55592 · 2022-01-13T16:47:50.000-08:00
diff --git a/README.md b/README.md
@@ -322,7 +322,7 @@ Special thanks goes out to <a href="https://www.eleuther.ai/">EleutherAI</a> for
 - [x] allow for fine tuning with only unfrozen layernorms (technique from fine tuning transformers)
 - [x] fix handling of 'N' in sequence, figure out representation of N in basenji barnyard
 - [x] take care of shift augmentation in `GenomicIntervalDataset`
-- [ ] speed up `str_to_seq_indices` using https://github.com/lucidrains/enformer-tensorflow-sonnet-training-script/blob/main/sequence.py#L12-L27
+- [x] speed up `str_to_seq_indices`
 - [ ] offer some basic training utils, as gradient accumulation will be needed for fine tuning
 - [ ] add to EleutherAI huggingface
 
diff --git a/enformer_pytorch/data.py b/enformer_pytorch/data.py
@@ -1,5 +1,14 @@
 import torch
 import torch.nn.functional as F
+from torch.utils.data import Dataset
+
+import polars as pl
+import numpy as np
+from random import randrange
+from pathlib import Path
+from pyfaidx import Fasta
+
+# helper functions
 
 def exists(val):
     return val is not None
@@ -10,12 +19,47 @@ def identity(t):
 def cast_list(t):
     return t if isinstance(t, list) else [t]
 
-def str_to_seq_indices(seq_strs, padding = '.'):
+# genomic function transforms
+
+seq_indices_embed = torch.zeros(256).long()
+seq_indices_embed[ord('a')] = 0
+seq_indices_embed[ord('c')] = 1
+seq_indices_embed[ord('g')] = 2
+seq_indices_embed[ord('t')] = 3
+seq_indices_embed[ord('n')] = 4
+seq_indices_embed[ord('A')] = 0
+seq_indices_embed[ord('C')] = 1
+seq_indices_embed[ord('G')] = 2
+seq_indices_embed[ord('T')] = 3
+seq_indices_embed[ord('N')] = 4
+seq_indices_embed[ord('.')] = -1
+
+one_hot_embed = torch.zeros(256, 4)
+one_hot_embed[ord('a')] = torch.Tensor([1., 0., 0., 0.])
+one_hot_embed[ord('c')] = torch.Tensor([0., 1., 0., 0.])
+one_hot_embed[ord('g')] = torch.Tensor([0., 0., 1., 0.])
+one_hot_embed[ord('t')] = torch.Tensor([0., 0., 0., 1.])
+one_hot_embed[ord('n')] = torch.Tensor([0., 0., 0., 0.])
+one_hot_embed[ord('A')] = torch.Tensor([1., 0., 0., 0.])
+one_hot_embed[ord('C')] = torch.Tensor([0., 1., 0., 0.])
+one_hot_embed[ord('G')] = torch.Tensor([0., 0., 1., 0.])
+one_hot_embed[ord('T')] = torch.Tensor([0., 0., 0., 1.])
+one_hot_embed[ord('N')] = torch.Tensor([0., 0., 0., 0.])
+one_hot_embed[ord('.')] = torch.Tensor([0.25, 0.25, 0.25, 0.25])
+
+def torch_fromstring(seq_strs):
     seq_strs = cast_list(seq_strs)
-    char_to_index_map = {'a': 0, 'c': 1, 'g': 2, 't': 3, 'n': 4, padding: -1}
-    seq_strs = map(lambda x: x.lower(), seq_strs)
-    seq_indices = list(map(lambda seq_str: torch.Tensor(list(map(lambda char: char_to_index_map[char], seq_str))), seq_strs))
-    return torch.stack(seq_indices).long()
+    np_seq_chrs = list(map(lambda t: np.fromstring(t, dtype = np.uint8), seq_strs))
+    seq_chrs = list(map(torch.from_numpy, np_seq_chrs))
+    return torch.stack(seq_chrs)
+
+def str_to_seq_indices(seq_strs):
+    seq_chrs = torch_fromstring(seq_strs)
+    return seq_indices_embed[seq_chrs.long()]
+
+def str_to_one_hot(seq_strs):
+    seq_chrs = torch_fromstring(seq_strs)
+    return one_hot_embed[seq_chrs.long()]
 
 def seq_indices_to_one_hot(t, padding = -1):
     is_padding = t == padding
@@ -27,12 +71,6 @@ def seq_indices_to_one_hot(t, padding = -1):
 
 # processing bed files
 
-import polars as pl
-from random import randrange
-from pathlib import Path
-from pyfaidx import Fasta
-from torch.utils.data import Dataset
-
 class GenomeIntervalDataset(Dataset):
     def __init__(
         self,
@@ -112,10 +150,8 @@ def __getitem__(self, ind):
                 end = chromosome_length
 
         seq = ('.' * left_padding) + str(chromosome[start:end]) + ('.' * right_padding)
-        seq_indices = str_to_seq_indices(seq)
 
         if self.return_seq_indices:
-            return seq_indices.squeeze(0)
+            return str_to_seq_indices(seq).squeeze(0)
 
-        seq_onehot = seq_indices_to_one_hot(seq_indices)
-        return seq_onehot.squeeze(0)
+        return str_to_one_hot(seq).squeeze(0)
diff --git a/enformer_pytorch/enformer_pytorch.py b/enformer_pytorch/enformer_pytorch.py
@@ -7,7 +7,7 @@
 from einops import rearrange, reduce
 from einops.layers.torch import Rearrange
 
-from enformer_pytorch.data import str_to_seq_indices, seq_indices_to_one_hot
+from enformer_pytorch.data import str_to_one_hot, seq_indices_to_one_hot
 
 # constants
 
@@ -400,11 +400,9 @@ def forward(
         head = None
     ):
         if isinstance(x, list):
-            x = str_to_seq_indices(x)
+            x = str_to_one_hot(x)
 
-        dtype = x.dtype
-
-        if x.dtype == torch.long:
+        elif x.dtype == torch.long:
             x = seq_indices_to_one_hot(x)
 
         no_batch = x.ndim == 2
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
   name = 'enformer-pytorch',
   packages = find_packages(exclude=[]),
   include_package_data = True,
-  version = '0.2.7',
+  version = '0.2.8',
   license='MIT',
   description = 'Enformer - Pytorch',
   author = 'Phil Wang',
@@ -17,6 +17,7 @@
   ],
   install_requires=[
     'einops>=0.3',
+    'numpy',
     'torch>=1.6',
     'polars',
     'pyfaidx',