Skip to content

Commit 9b3ef20

Browse files
authored
V1.0.1 release (#69)
* fix: Resolve "AttributeError: 'SpectrumDataFrame' object has no attribute 'df'"
* feat: update notebooks to v1.0.0
* feat: Automatic model download and improve residues
  Co-Authored-By: Kevin Eloff <k.eloff@instadeep.com>
* feat: update tests for v1.0.1 release
  Co-Authored-By: Rachel Catzel <r.catzel@instadeep.com>
* feat: update packages
1 parent dca4423 commit 9b3ef20

29 files changed

+1070
-448
lines changed

.github/workflows/python-publish.yml

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,12 +34,12 @@ jobs:
3434
- name: Build package
3535
run: python -m build
3636
- name: Publish package
37-
uses: pypa/gh-action-pypi-publish@fb13cb306901256ace3dab689990e13a5550ffaa
37+
uses: pypa/gh-action-pypi-publish@v1.12
3838
with:
3939
user: __token__
4040
password: ${{ secrets.PYPI_API_TOKEN }}
4141
- name: refresh PyPI badge
42-
uses: fjogeleit/http-request-action@v1
42+
uses: fjogeleit/http-request-action@v1.16
4343
with:
4444
url: https://camo.githubusercontent.com/a22fbcbadf81751212d5367cce341631bc28d7749b9cd5c317fbf0706a30c9ae/68747470733a2f2f62616467652e667572792e696f2f70792f696e7374616e6f766f2e737667
4545
method: PURGE

instanovo/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,3 @@
11
from __future__ import annotations
22

3-
__version__ = "1.0.0"
3+
__version__ = "1.0.1"

instanovo/configs/inference/default.yaml

Lines changed: 6 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
# Data paths and output location
2-
data_path: # type: .mgf, .mzml or any other filetype supported by SpectruMataFrame
3-
model_path: # type: .ckpt
2+
data_path: # type: .mgf, .mzml or any other filetype supported by SpectrumDataFrame
3+
model_path: instanovo-extended # type: .ckpt or model id
44
output_path: # type: .csv
55
knapsack_path: # type: directory
66

@@ -17,9 +17,11 @@ use_knapsack: False
1717
save_beams: False
1818
subset: 1.0 # Subset of dataset to perform inference on, useful for debugging
1919

20+
# These two only work in greedy search
2021
# Residues whose log probability will be set to -inf
21-
# Only works in greedy search
22-
# suppressed_residues: TODO
22+
suppressed_residues:
23+
# Stop model from predicting n-terminal modifications anywhere along the sequence
24+
disable_terminal_residues_anywhere: True
2325

2426
# Run config
2527
num_workers: 16

instanovo/configs/inference/unit_test.yaml

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,13 +4,14 @@ defaults:
44

55
# Data paths and output location
66
data_path: ./tests/instanovo_test_resources/example_data/test_sample.mgf # type: .ipc
7-
model_path: ./tests/instanovo_test_resources/train_test/epoch=4-step=2420.ckpt # type: .ckpt
8-
output_path: ./tests/instanovo_test_resources/train_test/test_sample_preds.csv # type: .csv
7+
model_path: ./tests/instanovo_test_resources/model.ckpt # type: .ckpt
8+
output_path: ./tests/instanovo_test_resources/test_sample_preds.csv # type: .csv
99
knapsack_path: ./tests/instanovo_test_resources/example_knapsack # type: directory
1010
use_knapsack: False
1111

1212
num_beams: 5
1313
max_length: 30
14+
max_charge: 3
1415

1516
subset: 1
1617

instanovo/configs/instanovo.yaml

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -36,8 +36,11 @@ train_subset: 1.0
3636
valid_subset: 0.01
3737
val_check_interval: 1.0 # 1.0 This doesn't work
3838
lazy_loading: True # Use lazy loading mode
39-
max_shard_size: 100_000 # Max data shard size for lazy loading, may influence shuffling mechanics
39+
max_shard_size: 1_000_000 # Max data shard size for lazy loading, may influence shuffling mechanics
40+
preshuffle_shards: True # Perform a preshuffle across shards to ensure shards are homogeneous in lazy mode
4041
perform_data_checks: True # Check residues, check precursor masses, etc.
42+
validate_precursor_mass: False # Slow for large datasets
43+
verbose_loading: True # Verbose SDF logs when loading the dataset
4144

4245
# Checkpointing parameters
4346
save_model: True

instanovo/configs/instanovo_unit_test.yaml

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -9,15 +9,15 @@ defaults:
99
tb_summarywriter: "./logs/instanovo/instanovo-unit-test"
1010

1111
# Training parameters
12-
warmup_iters: 1000
12+
warmup_iters: 480
1313
max_iters: 3_000_000
1414
learning_rate: 1e-3
1515
train_batch_size: 32
1616
grad_accumulation: 1
1717

1818
# Logging parameters
1919
logger:
20-
epochs: 5
20+
epochs: 1
2121
num_sanity_val_steps: 10
2222
console_logging_steps: 50
2323
tensorboard_logging_steps: 500
@@ -29,4 +29,4 @@ valid_subset: 1.0
2929

3030
# Checkpointing parameters
3131
model_save_folder_path: ./tests/instanovo_test_resources/train_test
32-
ckpt_interval: 2420
32+
ckpt_interval: 480

instanovo/inference/greedy_search.py

Lines changed: 77 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -21,9 +21,18 @@ class GreedyDecoder(Decoder):
2121
models that conform to the `Decodable` interface.
2222
"""
2323

24-
def __init__(self, model: Decodable, mass_scale: int = MASS_SCALE):
24+
def __init__(
25+
self,
26+
model: Decodable,
27+
suppressed_residues: list[str] | None = None,
28+
mass_scale: int = MASS_SCALE,
29+
disable_terminal_residues_anywhere: bool = True,
30+
):
2531
super().__init__(model=model)
2632
self.mass_scale = mass_scale
33+
self.disable_terminal_residues_anywhere = disable_terminal_residues_anywhere
34+
35+
suppressed_residues = suppressed_residues or []
2736

2837
# NOTE: Greedy search requires `residue_set` class in the model, update all methods accordingly.
2938
if not hasattr(model, "residue_set"):
@@ -37,10 +46,32 @@ def __init__(self, model: Decodable, mass_scale: int = MASS_SCALE):
3746
self.residue_masses = torch.zeros(
3847
(len(self.model.residue_set),), dtype=torch.float64
3948
)
49+
terminal_residues_idx: list[int] = []
50+
suppressed_residues_idx: list[int] = []
4051
for i, residue in enumerate(model.residue_set.vocab):
4152
if residue in self.model.residue_set.special_tokens:
4253
continue
4354
self.residue_masses[i] = self.model.residue_set.get_mass(residue)
55+
# If no residue is attached, assume it is a n-terminal residue
56+
if not residue[0].isalpha():
57+
terminal_residues_idx.append(i)
58+
59+
# Check if residue is suppressed
60+
if residue in suppressed_residues:
61+
suppressed_residues_idx.append(i)
62+
suppressed_residues.remove(residue)
63+
64+
if len(suppressed_residues) > 0:
65+
raise ValueError(
66+
f"Suppressed residues not found in vocabulary: {suppressed_residues}"
67+
)
68+
69+
self.terminal_residue_indices = torch.tensor(
70+
terminal_residues_idx, dtype=torch.long
71+
)
72+
self.suppressed_residue_indices = torch.tensor(
73+
suppressed_residues_idx, dtype=torch.long
74+
)
4475

4576
self.vocab_size = len(self.model.residue_set)
4677

@@ -270,10 +301,53 @@ def decode( # type:ignore
270301
next_token_probabilities_filtered[
271302
:, self.model.residue_set.EOS_INDEX
272303
] = -float("inf")
304+
# Allow the model to predict PAD when all residues are -inf
305+
# next_token_probabilities_filtered[
306+
# :, self.model.residue_set.PAD_INDEX
307+
# ] = -float("inf")
273308
next_token_probabilities_filtered[
274309
:, self.model.residue_set.SOS_INDEX
275310
] = -float("inf")
276-
# TODO set probability of n-terminal modifications to 0 when i > 0, requires n-terms to be specified in residue_set
311+
next_token_probabilities_filtered[
312+
:, self.suppressed_residue_indices
313+
] = -float("inf")
314+
# Set probability of n-terminal modifications to -inf when i > 0
315+
if self.disable_terminal_residues_anywhere:
316+
# Check if adding terminal residues would result in a complete sequence
317+
# First generate remaining mass matrix with isotopes
318+
remaining_mass_incomplete_isotope = remaining_mass_incomplete[
319+
:, None
320+
].expand(sub_batch_size, max_isotope + 1) - CARBON_MASS_DELTA * (
321+
torch.arange(max_isotope + 1, device=device)
322+
)
323+
# Expand with terminal residues and subtract
324+
remaining_mass_incomplete_isotope_delta = (
325+
remaining_mass_incomplete_isotope[:, :, None].expand(
326+
sub_batch_size,
327+
max_isotope + 1,
328+
self.terminal_residue_indices.shape[0],
329+
)
330+
- self.residue_masses[self.terminal_residue_indices]
331+
)
332+
333+
# If within target delta, allow these residues to be predicted, otherwise set probability to -inf
334+
allow_terminal = (
335+
remaining_mass_incomplete_isotope_delta.abs()
336+
< mass_target_incomplete[:, None, None]
337+
).any(dim=1)
338+
allow_terminal_full = torch.ones(
339+
(sub_batch_size, self.vocab_size),
340+
device=spectra.device,
341+
dtype=bool,
342+
)
343+
allow_terminal_full[:, self.terminal_residue_indices] = (
344+
allow_terminal
345+
)
346+
347+
# Set to -inf
348+
next_token_probabilities_filtered[~allow_terminal_full] = -float(
349+
"inf"
350+
)
277351

278352
# Step 5: Select next token:
279353
next_token = next_token_probabilities_filtered.argmax(-1).unsqueeze(
@@ -362,7 +436,7 @@ def decode( # type:ignore
362436
token_log_probabilities=[
363437
x.cpu().item()
364438
for x in all_log_probabilities[i, : len(sequence)]
365-
], # list[float] (sequence_length) excludes EOS
439+
][::-1], # list[float] (sequence_length) excludes EOS
366440
)
367441
)
368442

instanovo/models.json

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,8 @@
1+
{
2+
"transformer": {
3+
"instanovo-extended": {
4+
"url": "https://github.com/instadeepai/InstaNovo/releases/download/1.0.0/instanovo_extended.ckpt"
5+
}
6+
},
7+
"diffusion": {}
8+
}

instanovo/scripts/convert_to_sdf.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010

1111

1212
def main() -> None:
13-
"""Convert data to ipc."""
13+
"""Convert data to spectrum data frame and save as parquet."""
1414
logging.basicConfig(level=logging.INFO)
1515
parser = argparse.ArgumentParser()
1616

instanovo/scripts/get_zenodo_record.py

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -68,15 +68,13 @@ def main(
6868
extract_path + "/instanovo_test_resources"
6969
):
7070
print(
71-
f"Skipping download and extraction. Path '{extract_path}'/instanovo_test_resources already exists and is non-empty."
71+
f"Skipping download and extraction. Path '{extract_path}/instanovo_test_resources' already exists and is non-empty."
7272
)
7373
return
7474

7575
get_zenodo(zenodo_url, zip_path)
7676
unzip_zenodo(zip_path, extract_path)
7777

78-
os.makedirs("./tests/instanovo_test_resources/train_test", exist_ok=True)
79-
8078

8179
if __name__ == "__main__":
8280
main()

0 commit comments

Comments (0)