Release v1.6.1 (#391)

gordonkoehn · Gordon J. Köhn · commit 49cdf5b38b84 · 2025-12-08T10:07:34.000+01:00
* Optimizes Diamond database creation by extracting it into a separate function that can be called once and reused, rather than recreating the database for every batch of reads.
* Various mundane dependency chores
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
@@ -14,7 +14,7 @@ jobs:
   deploy:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v5
+      - uses: actions/checkout@v6
       - name: Set up Python
         uses: actions/setup-python@v6
         with:
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -15,7 +15,7 @@ jobs:
 
     steps:
       - name: Checkout code
-        uses: actions/checkout@v5
+        uses: actions/checkout@v6
 
       - name: Set up Python
         uses: actions/setup-python@v6
diff --git a/conda-recipe/meta.yaml b/conda-recipe/meta.yaml
@@ -1,6 +1,6 @@
 # conda recipe
 {% set name = "sr2silo" %}
-{% set version = "1.6.0" %}
+{% set version = "1.6.1" %}
 
 package:
   name: {{ name|lower }}
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "sr2silo"
-version = "1.6.0"
+version = "1.6.1"
 description = "ETL tool for importing short-read sequencing data into SILO database (v0.8.0+), powering Loculus."
 authors = ["Gordon Julian Koehn <gordon.koehn@dbsse.ethz.ch>"]
 readme = "README.md"
diff --git a/src/sr2silo/process/translate_align.py b/src/sr2silo/process/translate_align.py
@@ -26,17 +26,56 @@
 )
 
 
+def _make_diamond_db(in_aa_reference_fp: Path, out_db_fp: Path) -> None:
+    """
+    Creates a Diamond-formatted database file that can be used for protein
+    sequence alignment with Diamond blastx.
+
+    Args:
+        in_aa_reference_fp (Path): Path to the input amino acid reference FASTA file
+                                    whose record headers are the gene names.
+        out_db_fp (Path): Path to the output Diamond database file.
+    """
+    try:
+        logging.info("Diamond makedb")
+        logging.info("== Making Sequence DB ==")
+        result = subprocess.run(
+            [
+                "diamond",
+                "makedb",
+                "--in",
+                str(in_aa_reference_fp),
+                "-d",
+                str(out_db_fp),
+            ],
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.DEVNULL,
+            check=False,
+        )
+        if result.returncode != 0:
+            raise RuntimeError(
+                f"Error occurred while making sequence DB with diamond makedb "
+                f"- Error Code: {result.returncode}"
+            )
+    except Exception as e:
+        if not isinstance(e, RuntimeError):
+            logging.error(
+                f"An error occurred while making sequence DB - Error Code: {e}"
+            )
+        raise
+
+
 def nuc_to_aa_alignment(
     in_nuc_alignment_fp: Path,
-    in_aa_reference_fp: Path,
+    in_aa_db_fp: Path,
     out_aa_alignment_fp: Path,
 ) -> None:
     """
     Function to convert files and translate and align with Diamond / blastx.
 
     Args:
         in_nuc_alignment_fp (Path): Path to the input nucleotide alignment file.
-        in_aa_reference_fp (Path): Path to the input amino acid reference file.
+        in_aa_db_fp (Path): Path to the input amino acid diamond database file.
         out_aa_alignment_fp (Path): Path to the output amino acid alignment file.
 
     Returns:
@@ -76,36 +115,6 @@ def nuc_to_aa_alignment(
 
         logging.info(f"Using temporary directory for diamond: {temp_dir_path}")
 
-        # temporary file file for amino acid reference DB
-        db_ref_fp = temp_dir_path / Path(in_aa_reference_fp.stem + ".temp.db")
-        try:
-            # ==== Make Sequence DB ====
-            logging.info("Diamond makedb")
-            logging.info("== Making Sequence DB ==")
-            result = subprocess.run(
-                [
-                    "diamond",
-                    "makedb",
-                    "--in",
-                    str(in_aa_reference_fp),
-                    "-d",
-                    str(db_ref_fp),
-                ],
-                stdout=subprocess.DEVNULL,
-                stderr=subprocess.DEVNULL,
-                check=False,
-            )
-            if result.returncode != 0:
-                raise RuntimeError(
-                    f"Error occurred while making sequence DB with diamond makedb "
-                    f"- Error Code: {result}"
-                )
-        except Exception as e:
-            logging.error(
-                f"An error occurred while making sequence DB - Error Code: {e}"
-            )
-            raise
-
         try:
             # ==== Alignment ====
             logging.info("Diamond blastx alignment")
@@ -114,7 +123,7 @@ def nuc_to_aa_alignment(
                     "diamond",
                     "blastx",
                     "-d",
-                    str(db_ref_fp),
+                    str(in_aa_db_fp),
                     "-q",
                     str(fasta_nuc_for_aa_alignment),
                     "-o",
@@ -287,7 +296,10 @@ def enrich_read_with_aa_seq(
 
 
 def parse_translate_align(
-    nuc_reference_fp: Path, aa_reference_fp: Path, nuc_alignment_fp: Path
+    nuc_reference_fp: Path,
+    aa_reference_fp: Path,
+    nuc_alignment_fp: Path,
+    aa_db_fp: Path,
 ) -> Dict[str, AlignedRead]:
     """Parse nucleotides, translate and align amino acids the input files."""
     with tempfile.TemporaryDirectory() as temp_dir:
@@ -299,7 +311,7 @@ def parse_translate_align(
 
         missing_files = [
             str(f)
-            for f in [nuc_reference_fp, aa_reference_fp, nuc_alignment_fp]
+            for f in [nuc_reference_fp, aa_reference_fp, nuc_alignment_fp, aa_db_fp]
             if not f.exists()
         ]
         if missing_files:
@@ -315,7 +327,7 @@ def parse_translate_align(
 
         nuc_to_aa_alignment(
             in_nuc_alignment_fp=BAM_NUC_ALIGNMENT_FILE,
-            in_aa_reference_fp=aa_reference_fp,
+            in_aa_db_fp=aa_db_fp,
             out_aa_alignment_fp=AA_ALIGNMENT_FILE,
         )
 
@@ -373,14 +385,16 @@ def enrich_single_read(read: AlignedRead) -> AlignedRead:
     return enrich_single_read
 
 
-def process_bam_files(bam_splits_fps, nuc_reference_fp, aa_reference_fp, metadata_fp):
+def process_bam_files(
+    bam_splits_fps, nuc_reference_fp, aa_reference_fp, aa_db_fp, metadata_fp
+):
     """Generator to process BAM files and yield JSON strings."""
 
     enrich_single_read = curry_read_with_metadata(metadata_fp)
 
     for bam_split_fp in bam_splits_fps:
         for read in parse_translate_align(
-            nuc_reference_fp, aa_reference_fp, bam_split_fp
+            nuc_reference_fp, aa_reference_fp, bam_split_fp, aa_db_fp
         ).values():
             enriched_read = enrich_single_read(read)
             yield enriched_read.to_silo_json()
@@ -437,6 +451,10 @@ def parse_translate_align_in_batches(
         with tempfile.TemporaryDirectory() as temp_dir:
             temp_dir_path = Path(temp_dir)
 
+            # Create Diamond DB once
+            aa_db_fp = temp_dir_path / "diamond_db.dmnd"
+            _make_diamond_db(aa_reference_fp, aa_db_fp)
+
             bam_splits_fps = convert.split_bam(
                 input_bam=nuc_alignment_fp, out_dir=temp_dir_path, chunk_size=chunk_size
             )
@@ -452,7 +470,11 @@ def parse_translate_align_in_batches(
                 with open(output_fp, "wb") as f, cctx.stream_writer(f) as compressor:
                     buffer = []
                     for json_str in process_bam_files(
-                        bam_splits_fps, nuc_reference_fp, aa_reference_fp, metadata_fp
+                        bam_splits_fps,
+                        nuc_reference_fp,
+                        aa_reference_fp,
+                        aa_db_fp,
+                        metadata_fp,
                     ):
                         buffer.append(json_str)
                         if len(buffer) >= write_chunk_size:
diff --git a/tests/process/conftest.py b/tests/process/conftest.py
@@ -27,12 +27,16 @@ def aligned_reads(aa_ref_sarscov2_fp, nuc_ref_sarscov2_fp) -> Dict[str, AlignedR
     aa_ref_fp = aa_ref_sarscov2_fp
     nuc_alignment_fp = Path("tests/data/bam/combined.bam")
 
-    aligned_reads = translate_align.parse_translate_align(
-        nuc_ref_fp, aa_ref_fp, nuc_alignment_fp
-    )
+    with tempfile.TemporaryDirectory() as temp_dir:
+        aa_db_fp = Path(temp_dir) / "diamond_db.dmnd"
+        translate_align._make_diamond_db(aa_ref_fp, aa_db_fp)
+
+        aligned_reads = translate_align.parse_translate_align(
+            nuc_ref_fp, aa_ref_fp, nuc_alignment_fp, aa_db_fp
+        )
 
     # make mock metadata with all empty strings, but readId
-    # print emoptry ReadMetadata scehma to json
+    # print empty ReadMetadata schema to json
     metadata = {
         "read_id": "readId",
         "sample_id": "",
diff --git a/tests/process/test_translate_align.py b/tests/process/test_translate_align.py
@@ -5,6 +5,7 @@
 import copy
 import logging
 from pathlib import Path
+from unittest.mock import MagicMock, patch
 
 import pytest
 
@@ -165,3 +166,50 @@ def test_curry_read_with_metadata_empty_metadata(tmp_path):
 
     with pytest.raises(ValueError, match="No metadata found in the file"):
         translate_align.curry_read_with_metadata(empty_file)
+
+
+def test_make_diamond_db_success(tmp_path):
+    """Test successful creation of Diamond database."""
+    in_aa_ref = tmp_path / "ref.fasta"
+    out_db = tmp_path / "db.dmnd"
+    in_aa_ref.touch()
+
+    with patch("sr2silo.process.translate_align.subprocess.run") as mock_run:
+        mock_run.return_value = MagicMock(returncode=0)
+        translate_align._make_diamond_db(in_aa_ref, out_db)
+
+        mock_run.assert_called_once()
+        args = mock_run.call_args[0][0]
+        assert args[0] == "diamond"
+        assert args[1] == "makedb"
+        assert args[3] == str(in_aa_ref)
+        assert args[5] == str(out_db)
+
+
+def test_make_diamond_db_failure(tmp_path):
+    """Test failure during Diamond database creation (non-zero return code)."""
+    in_aa_ref = tmp_path / "ref.fasta"
+    out_db = tmp_path / "db.dmnd"
+    in_aa_ref.touch()
+
+    with patch("sr2silo.process.translate_align.subprocess.run") as mock_run:
+        mock_run.return_value = MagicMock(returncode=1)
+
+        with pytest.raises(
+            RuntimeError, match="Error occurred while making sequence DB"
+        ):
+            translate_align._make_diamond_db(in_aa_ref, out_db)
+
+
+def test_make_diamond_db_exception(tmp_path):
+    """Test exception handling during Diamond database creation."""
+    in_aa_ref = tmp_path / "ref.fasta"
+    out_db = tmp_path / "db.dmnd"
+    in_aa_ref.touch()
+
+    with patch(
+        "sr2silo.process.translate_align.subprocess.run",
+        side_effect=OSError("Command not found"),
+    ):
+        with pytest.raises(OSError, match="Command not found"):
+            translate_align._make_diamond_db(in_aa_ref, out_db)