add GenomicIntervalDataset, for easy data fetching from .bed and .fasta files

lucidrains · lucidrains · commit 73244fc18f82 · 2022-01-06T13:51:04.000-08:00
diff --git a/README.md b/README.md
@@ -35,7 +35,6 @@ You can also directly pass in the sequence as one-hot encodings, which must be f
 
 ```python
 import torch
-import torch.nn.functional as F
 from enformer_pytorch import Enformer, seq_indices_to_one_hot
 
 model = Enformer(
@@ -59,7 +58,6 @@ Finally, one can fetch the embeddings, for fine-tuning and otherwise, by setting
 
 ```python
 import torch
-import torch.nn.functional as F
 from enformer_pytorch import Enformer, seq_indices_to_one_hot
 
 model = Enformer(
@@ -266,6 +264,34 @@ loss = model(
 loss.backward()
 ```
 
+## Data
+
+You can use the `GenomeIntervalDataset` to easily fetch sequences of any length from a `.bed` file, with a longer context dynamically computed around each interval if `context_length` is specified
+
+```python
+import torch
+from enformer_pytorch import Enformer, GenomeIntervalDataset
+
+ds = GenomeIntervalDataset(
+    bed_file = './sequences.bed',  # bed file
+    fasta_file = './hg38.fa',      # path to fasta file
+    context_length = 196_608
+    # this can be longer than the interval designated in the .bed file,
+    # in which case it will take care of lengthening the interval on both sides,
+    # as well as proper padding if the interval runs past either end of the chromosome
+)
+
+model = Enformer(
+    dim = 1536,
+    depth = 11,
+    heads = 8,
+    output_heads = dict(human = 5313, mouse = 1643),
+    target_length = 896,
+)
+
+pred = model(ds[0], head = 'human') # (896, 5313)
+```
+
 ## Appreciation
 
 Special thanks goes out to <a href="https://www.eleuther.ai/">EleutherAI</a> for providing the resources to retrain the model in an acceptable amount of time
diff --git a/enformer_pytorch/__init__.py b/enformer_pytorch/__init__.py
@@ -1,2 +1,3 @@
-from enformer_pytorch.enformer_pytorch import Enformer, SEQUENCE_LENGTH, AttentionPool, seq_indices_to_one_hot
+from enformer_pytorch.enformer_pytorch import Enformer, SEQUENCE_LENGTH, AttentionPool
 from enformer_pytorch.model_loader import load_pretrained_model
+from enformer_pytorch.data import seq_indices_to_one_hot, GenomeIntervalDataset
diff --git a/enformer_pytorch/data.py b/enformer_pytorch/data.py
@@ -0,0 +1,90 @@
+import torch
+import torch.nn.functional as F
+
+def exists(val):
+    """Return True when `val` is anything other than None."""
+    if val is None:
+        return False
+    return True
+
+def identity(t):
+    """Return `t` unchanged; used as the default no-op `filter_df_fn`."""
+    return t
+
+def cast_list(t):
+    """Wrap `t` in a single-element list unless it already is a list."""
+    if isinstance(t, list):
+        return t
+    return [t]
+
+def str_to_seq_indices(seq_strs, padding = '.'):
+    """Convert nucleotide strings into a long tensor of base indices.
+
+    Args:
+        seq_strs: a single string or a list of strings. Characters are
+            lower-cased before lookup, so 'A' and 'a' are equivalent. All
+            strings must have the same length, since they are stacked.
+        padding: the character that marks padded positions; mapped to -1.
+
+    Returns:
+        A long tensor of shape (number of strings, string length) holding
+        a = 0, c = 1, g = 2, t = 3, n = 4 and padding = -1.
+
+    Raises:
+        KeyError: if a string contains a character outside the mapping.
+    """
+    char_to_index_map = {'a': 0, 'c': 1, 'g': 2, 't': 3, 'n': 4, padding: -1}
+
+    # build integer tensors directly instead of creating float32 tensors with the
+    # legacy torch.Tensor constructor and casting the stacked result afterwards
+    seq_indices = [
+        torch.tensor([char_to_index_map[char] for char in seq_str.lower()], dtype = torch.long)
+        for seq_str in cast_list(seq_strs)
+    ]
+
+    return torch.stack(seq_indices)
+
+def seq_indices_to_one_hot(t, padding = -1):
+    """One-hot encode a tensor of base indices into four float channels.
+
+    Indices 0-3 become the matching one-hot row, index 4 becomes an all-zero
+    row, and positions equal to `padding` are filled with 0.25 in every channel.
+    The result has the shape of `t` with a trailing dimension of 4.
+    """
+    pad_mask = (t == padding).unsqueeze(-1)
+
+    # the default padding index is negative, which F.one_hot does not accept,
+    # so clamp first; the clamped positions are overwritten by the mask below
+    encoded = F.one_hot(t.clamp(min = 0), num_classes = 5)
+
+    # dropping the fifth channel is what turns index 4 into an all-zero row
+    bases = encoded[..., :4].float()
+    return bases.masked_fill(pad_mask, 0.25)
+
+# processing bed files
+
+import pandas as pd
+from pathlib import Path
+from pyfaidx import Fasta
+from torch.utils.data import Dataset
+
+class GenomeIntervalDataset(Dataset):
+    """Dataset yielding one-hot encoded sequences for the intervals of a .bed file.
+
+    Each row of the tab-separated .bed file is read as (chr, start, end, type) and
+    the matching bases are fetched from the fasta file through pyfaidx.
+
+    Args:
+        bed_file: path to the tab-separated .bed file; it is read without a header row.
+        fasta_file: path to the fasta file holding the chromosomes named in the .bed file.
+        context_length: optional target length. An interval shorter than this is
+            widened on both sides (the right side receives the extra base when the
+            difference is odd) and padded wherever the widened interval runs past
+            either end of the chromosome. Longer intervals are returned as they are.
+        filter_df_fn: callable that receives the parsed DataFrame and returns the
+            DataFrame to index into; defaults to `identity`.
+    """
+
+    def __init__(
+        self,
+        bed_file,
+        fasta_file,
+        context_length = None,
+        filter_df_fn = identity
+    ):
+        super().__init__()
+        bed_path = Path(bed_file)
+        fasta_file = Path(fasta_file)
+
+        assert bed_path.exists(), 'path to .bed file must exist'
+        assert fasta_file.exists(), 'path to fasta file must exist'
+
+        # index_col = False stops pandas from promoting the leading columns to the
+        # index when a row has more fields than `names` (e.g. BED6 files), which
+        # would otherwise put every name on the wrong column. The surplus trailing
+        # columns are dropped instead.
+        # NOTE(review): recent pandas versions are expected to emit a ParserWarning
+        # in that case - confirm against the pinned pandas version.
+        df = pd.read_csv(
+            str(bed_path),
+            sep = '\t',
+            header = None,
+            names = ['chr', 'start', 'end', 'type'],
+            index_col = False
+        )
+        df = filter_df_fn(df)
+
+        self.df = df
+        self.seqs = Fasta(str(fasta_file))
+        self.context_length = context_length
+
+    def __len__(self):
+        """Return the number of rows in the (filtered) interval table."""
+        return len(self.df)
+
+    def __getitem__(self, ind):
+        """Return the one-hot encoded sequence of the interval at position `ind`.
+
+        The result is a float tensor of shape (sequence length, 4). Positions added
+        as padding are encoded as 0.25 in every channel.
+        """
+        interval = self.df.iloc[ind]
+        chr_name, start, end = (interval.chr, interval.start, interval.end)
+        interval_length = end - start
+
+        chromosome = self.seqs[chr_name]
+        chromosome_length = len(chromosome)
+
+        left_padding = right_padding = 0
+
+        if exists(self.context_length) and interval_length < self.context_length:
+            # split the missing length between both sides of the interval
+            extra_seq = self.context_length - interval_length
+
+            extra_left_seq = extra_seq // 2
+            extra_right_seq = extra_seq - extra_left_seq
+
+            start -= extra_left_seq
+            end += extra_right_seq
+
+            # whatever falls outside of the chromosome is replaced by padding,
+            # so the returned sequence still has `context_length` positions
+            if start < 0:
+                left_padding = -start
+                start = 0
+
+            if end > chromosome_length:
+                right_padding = end - chromosome_length
+                end = chromosome_length
+
+        # '.' is the default padding character of str_to_seq_indices
+        seq = ('.' * left_padding) + str(chromosome[start:end]) + ('.' * right_padding)
+        seq_indices = str_to_seq_indices(seq)
+        seq_onehot = seq_indices_to_one_hot(seq_indices)
+
+        # a single string yields a leading dimension of size one, removed here
+        return seq_onehot.squeeze(0)
diff --git a/enformer_pytorch/enformer_pytorch.py b/enformer_pytorch/enformer_pytorch.py
@@ -7,6 +7,8 @@
 from einops import rearrange, reduce
 from einops.layers.torch import Rearrange
 
+from enformer_pytorch.data import str_to_seq_indices, seq_indices_to_one_hot
+
 # constants
 
 SEQUENCE_LENGTH = 196_608
@@ -33,22 +35,6 @@ def _round(x):
 def log(t, eps = 1e-20):
     return torch.log(t.clamp(min = eps))
 
-# sequence helpers
-
-def str_to_seq_indices(seq_strs, padding = '.'):
-    char_to_index_map = {'a': 0, 'c': 1, 'g': 2, 't': 3, 'n': 4, padding: -1}
-    seq_strs = map(lambda x: x.lower(), seq_strs)
-    seq_indices = list(map(lambda seq_str: torch.Tensor(list(map(lambda char: char_to_index_map[char], seq_str))), seq_strs))
-    return torch.stack(seq_indices).long()
-
-def seq_indices_to_one_hot(t, padding = -1):
-    is_padding = t == padding
-    t = t.clamp(min = 0)
-    one_hot = F.one_hot(t, num_classes = 5)
-    out = one_hot[..., :4].float()
-    out = out.masked_fill(is_padding[..., None], 0.25)
-    return out
-
 # losses and metrics
 
 def poisson_loss(pred, target):
diff --git a/setup.py b/setup.py
@@ -4,7 +4,7 @@
   name = 'enformer-pytorch',
   packages = find_packages(exclude=[]),
   include_package_data = True,
-  version = '0.1.29',
+  version = '0.2.0',
   license='MIT',
   description = 'Enformer - Pytorch',
   author = 'Phil Wang',
@@ -18,6 +18,8 @@
   install_requires=[
     'einops>=0.3',
     'torch>=1.6',
+    'pandas',
+    'pyfaidx',
     'pyyaml'
   ],
   classifiers=[