Subsample via mask

wukevin · wukevin · commit efb7b9b21388 · 2025-02-12T16:38:18.000Z
diff --git a/chai_lab/chai1.py b/chai_lab/chai1.py
@@ -32,6 +32,7 @@
 from chai_lab.data.dataset.msas.colabfold import generate_colabfold_msas
 from chai_lab.data.dataset.msas.load import get_msa_contexts
 from chai_lab.data.dataset.msas.msa_context import MSAContext
+from chai_lab.data.dataset.msas.utils import subsample_msa_rows
 from chai_lab.data.dataset.structure.all_atom_structure_context import (
     AllAtomStructureContext,
 )
@@ -441,6 +442,7 @@ def run_inference(
     msa_directory: Path | None = None,
     constraint_path: Path | None = None,
     # expose some params for easy tweaking
+    recycle_msa_subsample: int = 0,
     num_trunk_recycles: int = 3,
     num_diffn_timesteps: int = 200,
     num_diffn_samples: int = 5,
@@ -472,6 +474,7 @@ def run_inference(
         num_trunk_recycles=num_trunk_recycles,
         num_diffn_timesteps=num_diffn_timesteps,
         num_diffn_samples=num_diffn_samples,
+        recycle_msa_subsample=recycle_msa_subsample,
         seed=seed,
         device=torch_device,
         low_memory=low_memory,
@@ -488,6 +491,7 @@ def run_folding_on_context(
     *,
     output_dir: Path,
     # expose some params for easy tweaking
+    recycle_msa_subsample: int = 0,
     num_trunk_recycles: int = 3,
     num_diffn_timesteps: int = 200,
     # all diffusion samples come from the same trunk
@@ -647,7 +651,7 @@ def run_folding_on_context(
             token_single_trunk_repr=token_single_trunk_repr,  # recycled
             token_pair_trunk_repr=token_pair_trunk_repr,  # recycled
             msa_input_feats=msa_input_feats,
-            msa_mask=msa_mask,
+            msa_mask=subsample_msa_rows(msa_mask, select_n_rows=recycle_msa_subsample),
             template_input_feats=template_input_feats,
             template_input_masks=template_input_masks,
             token_single_mask=token_single_mask,
diff --git a/chai_lab/data/dataset/msas/utils.py b/chai_lab/data/dataset/msas/utils.py
@@ -0,0 +1,36 @@
+# Copyright (c) 2024 Chai Discovery, Inc.
+# Licensed under the Apache License, Version 2.0.
+# See the LICENSE file for details.
+
+import torch
+from einops import rearrange, repeat
+from torch import Tensor
+
+from chai_lab.utils.typing import Bool
+
+
+def subsample_msa_rows(
+    mask: Bool[Tensor, "1 depth tokens"],
+    select_n_rows: int = 4096,
+    generator: torch.Generator | None = None,
+) -> Bool[Tensor, "1 depth tokens"]:
+    """Adjust masking to look at a random subset of msas.
+
+    Only rows with at least one unmasked token count toward depth or get selected.
+    Returns input mask as-is if select_n_rows <= 0 or depth <= select_n_rows;
+    otherwise returns a new mask in which only select_n_rows of those rows survive."""
+    nonnull_rows_mask = rearrange(mask.any(dim=-1), "1 d -> d")
+    # Candidates are the rows of the MSA that are not fully masked out
+    (nonnull_row_indices,) = torch.where(nonnull_rows_mask)
+    # Plain assignment: a walrus inside `assert` is never bound under `python -O`
+    n = nonnull_row_indices.numel()
+    if select_n_rows <= 0 or n <= select_n_rows:
+        return mask
+
+    permuted = torch.randperm(n, device=mask.device, generator=generator)
+    selected_row_indices = nonnull_row_indices[permuted[:select_n_rows]]
+
+    # Row-level selection, expanded to (1, depth, 1) so it broadcasts over tokens
+    selection_mask = torch.zeros_like(nonnull_rows_mask)
+    selection_mask[selected_row_indices] = True
+    return mask & repeat(selection_mask, "d -> 1 d 1")