Add residue-count and Ca-completeness filtering to MmcifStructureDataset; preserve pdb_ids ordering

jomitchellnv · jomitchellnv · commit c4e88bc26d61 · 2026-03-31T17:39:15.000-07:00
Signed-off-by: Jonathan Mitchell <jomitchell@nvidia.com>
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/dataset.py b/bionemo-recipes/recipes/esm2_minifold_te/dataset.py
@@ -181,19 +181,32 @@ class MmcifStructureDataset(Dataset):
         "MSE": "M",
     }
 
-    def __init__(self, cif_dir: str, tokenizer, max_seq_length: int = 256, pdb_ids: list[str] | None = None):
+    def __init__(
+        self,
+        cif_dir: str,
+        tokenizer,
+        max_seq_length: int = 256,
+        pdb_ids: list[str] | None = None,
+        min_residues: int = 50,
+        max_residues: int = 300,
+        min_ca_completeness: float = 0.9,
+    ):
         from Bio.PDB.MMCIFParser import MMCIFParser
 
         self.tokenizer = tokenizer
         self.max_seq_length = max_seq_length
+        self.min_residues = min_residues
+        self.max_residues = max_residues
+        self.min_ca_completeness = min_ca_completeness
         self.parser = MMCIFParser(QUIET=True)
 
         cif_path = Path(cif_dir)
         all_files = sorted(cif_path.glob("*.cif"))
 
         if pdb_ids is not None:
-            pdb_set = {pid.upper() for pid in pdb_ids}
-            self.files = [f for f in all_files if f.stem.upper() in pdb_set]
+            # Preserve caller's ordering (e.g., to match parquet row order); IDs with no matching .cif file are skipped
+            file_by_id = {f.stem.upper(): f for f in all_files}
+            self.files = [file_by_id[pid.upper()] for pid in pdb_ids if pid.upper() in file_by_id]
         else:
             self.files = all_files
 
@@ -208,24 +221,35 @@ def __len__(self):
     def _parse_cif(self, cif_path):
         """Parse mmCIF file and extract sequence + Ca coordinates.
 
+        Uses the same filtering as prepare_pdb_dataset.py: min/max residues,
+        Ca completeness threshold, and truncation to max_residues.
+
         Returns (sequence, ca_coords, ca_mask) or raises on failure.
         """
         pdb_id = cif_path.stem
         structure = self.parser.get_structure(pdb_id, str(cif_path))
         model = structure[0]
 
         for chain in model:
-            sequence = []
-            coords = []
-            ca_mask = []
-
+            residues = []
             for res in chain.get_residues():
                 if res.id[0] != " ":
                     continue
                 resname = res.get_resname().strip()
                 if resname not in self.AA_3TO1:
                     continue
+                residues.append(res)
+
+            if len(residues) < self.min_residues:
+                continue
+            if len(residues) > self.max_residues:
+                residues = residues[: self.max_residues]
 
+            sequence = []
+            coords = []
+            ca_mask = []
+            for res in residues:
+                resname = res.get_resname().strip()
                 sequence.append(self.AA_3TO1[resname])
                 if "CA" in res:
                     ca = res["CA"].get_vector()
@@ -235,8 +259,11 @@ def _parse_cif(self, cif_path):
                     coords.append([0.0, 0.0, 0.0])
                     ca_mask.append(0)
 
-            if len(sequence) >= 20:
-                return "".join(sequence), coords, ca_mask
+            completeness = sum(ca_mask) / len(ca_mask)
+            if completeness < self.min_ca_completeness:
+                continue
+
+            return "".join(sequence), coords, ca_mask
 
         raise ValueError(f"No valid protein chain in {pdb_id}")
 
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/run_100_real.yaml b/bionemo-recipes/recipes/esm2_minifold_te/hydra_config/run_100_real.yaml
@@ -1,6 +1,8 @@
 # ESM2-MiniFold TE: 100-step run with REAL PDB data
 # 2x RTX 5090, frozen ESM-2 650M
-# Usage: torchrun --nproc_per_node=2 train_fsdp2.py --config-name run_100_real
+# Usage:
+#   Parquet (default, faster): torchrun --nproc_per_node=2 train_fsdp2.py --config-name run_100_real
+#   MmCIF (on-the-fly parsing): torchrun --nproc_per_node=2 train_fsdp2.py --config-name run_100_real dataset.dataset_type=mmcif
 
 esm_model_name: facebook/esm2_t33_650M_UR50D
 
@@ -15,8 +17,9 @@ model:
   num_recycling: 0
 
 dataset:
-  dataset_type: parquet
+  dataset_type: parquet  # "parquet" (fast, pre-processed) or "mmcif" (on-the-fly BioPython parsing)
   parquet_path: data/pdb_structures.parquet
+  cif_dir: data/cif_files
   tokenizer_name: ${esm_model_name}
   micro_batch_size: 2
   max_seq_length: 256
diff --git a/bionemo-recipes/recipes/esm2_minifold_te/tests/test_data_pipeline.py b/bionemo-recipes/recipes/esm2_minifold_te/tests/test_data_pipeline.py
@@ -179,36 +179,36 @@ def test_first_residue_is_threonine(self, parsed_data):
 
 class TestMmcifStructureDataset:
     def test_batch_keys(self, cif_dir, tokenizer):
-        ds = MmcifStructureDataset(cif_dir, tokenizer, max_seq_length=MAX_SEQ_LENGTH)
+        ds = MmcifStructureDataset(cif_dir, tokenizer, max_seq_length=MAX_SEQ_LENGTH, min_residues=20)
         sample = ds[0]
         assert set(sample.keys()) == {"input_ids", "attention_mask", "mask", "coords"}
 
     def test_batch_shapes(self, cif_dir, tokenizer):
-        ds = MmcifStructureDataset(cif_dir, tokenizer, max_seq_length=MAX_SEQ_LENGTH)
+        ds = MmcifStructureDataset(cif_dir, tokenizer, max_seq_length=MAX_SEQ_LENGTH, min_residues=20)
         sample = ds[0]
         assert sample["input_ids"].shape == (MAX_SEQ_LENGTH,)
         assert sample["attention_mask"].shape == (MAX_SEQ_LENGTH,)
         assert sample["mask"].shape == (MAX_SEQ_LENGTH,)
         assert sample["coords"].shape == (MAX_SEQ_LENGTH, 3)
 
     def test_batch_dtypes(self, cif_dir, tokenizer):
-        ds = MmcifStructureDataset(cif_dir, tokenizer, max_seq_length=MAX_SEQ_LENGTH)
+        ds = MmcifStructureDataset(cif_dir, tokenizer, max_seq_length=MAX_SEQ_LENGTH, min_residues=20)
         sample = ds[0]
         assert sample["input_ids"].dtype == torch.long
         assert sample["attention_mask"].dtype == torch.long
         assert sample["mask"].dtype == torch.float32
         assert sample["coords"].dtype == torch.float32
 
     def test_cls_eos_tokens(self, cif_dir, tokenizer):
-        ds = MmcifStructureDataset(cif_dir, tokenizer, max_seq_length=MAX_SEQ_LENGTH)
+        ds = MmcifStructureDataset(cif_dir, tokenizer, max_seq_length=MAX_SEQ_LENGTH, min_residues=20)
         sample = ds[0]
         assert sample["input_ids"][0].item() == 0, "First token should be CLS (0)"
         # Find EOS position
         real_len = sample["attention_mask"].sum().item()
         assert sample["input_ids"][int(real_len) - 1].item() == 2, "Last real token should be EOS (2)"
 
     def test_padding_is_zero(self, cif_dir, tokenizer):
-        ds = MmcifStructureDataset(cif_dir, tokenizer, max_seq_length=MAX_SEQ_LENGTH)
+        ds = MmcifStructureDataset(cif_dir, tokenizer, max_seq_length=MAX_SEQ_LENGTH, min_residues=20)
         sample = ds[0]
         real_len = sample["attention_mask"].sum().item()
         assert (sample["attention_mask"][int(real_len) :] == 0).all()
@@ -248,7 +248,7 @@ class TestDatasetEquivalence:
     """Both datasets must produce matching outputs for the same protein."""
 
     def _get_samples(self, cif_dir, parquet_path, tokenizer):
-        ds_cif = MmcifStructureDataset(cif_dir, tokenizer, max_seq_length=MAX_SEQ_LENGTH)
+        ds_cif = MmcifStructureDataset(cif_dir, tokenizer, max_seq_length=MAX_SEQ_LENGTH, min_residues=20)
         ds_pq = ParquetStructureDataset(parquet_path, tokenizer, max_seq_length=MAX_SEQ_LENGTH)
         return ds_cif[0], ds_pq[0]