Skip to content

Commit 49cdf5b

Browse files
gordonkoehn (Gordon J. Köhn)
authored and committed
Release v1.6.1 (#391)
* Optimizes Diamond database creation by extracting it into a separate function that can be called once and reused, rather than recreating the database for every batch of reads.
* Various mundane dependency chores.
2 parents 29e66c1 + e7e27ce commit 49cdf5b

File tree

8 files changed

+340
-247
lines changed

8 files changed

+340
-247
lines changed

.github/workflows/docs.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,7 @@ jobs:
1414
deploy:
1515
runs-on: ubuntu-latest
1616
steps:
17-
- uses: actions/checkout@v5
17+
- uses: actions/checkout@v6
1818
- name: Set up Python
1919
uses: actions/setup-python@v6
2020
with:

.github/workflows/test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515

1616
steps:
1717
- name: Checkout code
18-
uses: actions/checkout@v5
18+
uses: actions/checkout@v6
1919

2020
- name: Set up Python
2121
uses: actions/setup-python@v6

conda-recipe/meta.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# conda recipe
22
{% set name = "sr2silo" %}
3-
{% set version = "1.6.0" %}
3+
{% set version = "1.6.1" %}
44

55
package:
66
name: {{ name|lower }}

poetry.lock

Lines changed: 219 additions & 200 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "sr2silo"
3-
version = "1.6.0"
3+
version = "1.6.1"
44
description = "ETL tool for importing short-read sequencing data into SILO database (v0.8.0+), powering Loculus."
55
authors = ["Gordon Julian Koehn <gordon.koehn@dbsse.ethz.ch>"]
66
readme = "README.md"

src/sr2silo/process/translate_align.py

Lines changed: 61 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -26,17 +26,56 @@
2626
)
2727

2828

29+
def _make_diamond_db(in_aa_reference_fp: Path, out_db_fp: Path) -> None:
30+
"""
31+
Creates a Diamond-formatted database file that can be used for protein
32+
sequence alignment with Diamond blastx.
33+
34+
Args:
35+
in_aa_reference_fp (Path): Path to the input amino acid reference FASTA file
36+
with their gene names as headers
37+
out_db_fp (Path): Path to the output Diamond database file.
38+
"""
39+
try:
40+
logging.info("Diamond makedb")
41+
logging.info("== Making Sequence DB ==")
42+
result = subprocess.run(
43+
[
44+
"diamond",
45+
"makedb",
46+
"--in",
47+
str(in_aa_reference_fp),
48+
"-d",
49+
str(out_db_fp),
50+
],
51+
stdout=subprocess.DEVNULL,
52+
stderr=subprocess.DEVNULL,
53+
check=False,
54+
)
55+
if result.returncode != 0:
56+
raise RuntimeError(
57+
f"Error occurred while making sequence DB with diamond makedb "
58+
f"- Error Code: {result.returncode}"
59+
)
60+
except Exception as e:
61+
if not isinstance(e, RuntimeError):
62+
logging.error(
63+
f"An error occurred while making sequence DB - Error Code: {e}"
64+
)
65+
raise
66+
67+
2968
def nuc_to_aa_alignment(
3069
in_nuc_alignment_fp: Path,
31-
in_aa_reference_fp: Path,
70+
in_aa_db_fp: Path,
3271
out_aa_alignment_fp: Path,
3372
) -> None:
3473
"""
3574
Function to convert files and translate and align with Diamond / blastx.
3675
3776
Args:
3877
in_nuc_alignment_fp (Path): Path to the input nucleotide alignment file.
39-
in_aa_reference_fp (Path): Path to the input amino acid reference file.
78+
in_aa_db_fp (Path): Path to the input amino acid diamond database file.
4079
out_aa_alignment_fp (Path): Path to the output amino acid alignment file.
4180
4281
Returns:
@@ -76,36 +115,6 @@ def nuc_to_aa_alignment(
76115

77116
logging.info(f"Using temporary directory for diamond: {temp_dir_path}")
78117

79-
# temporary file file for amino acid reference DB
80-
db_ref_fp = temp_dir_path / Path(in_aa_reference_fp.stem + ".temp.db")
81-
try:
82-
# ==== Make Sequence DB ====
83-
logging.info("Diamond makedb")
84-
logging.info("== Making Sequence DB ==")
85-
result = subprocess.run(
86-
[
87-
"diamond",
88-
"makedb",
89-
"--in",
90-
str(in_aa_reference_fp),
91-
"-d",
92-
str(db_ref_fp),
93-
],
94-
stdout=subprocess.DEVNULL,
95-
stderr=subprocess.DEVNULL,
96-
check=False,
97-
)
98-
if result.returncode != 0:
99-
raise RuntimeError(
100-
f"Error occurred while making sequence DB with diamond makedb "
101-
f"- Error Code: {result}"
102-
)
103-
except Exception as e:
104-
logging.error(
105-
f"An error occurred while making sequence DB - Error Code: {e}"
106-
)
107-
raise
108-
109118
try:
110119
# ==== Alignment ====
111120
logging.info("Diamond blastx alignment")
@@ -114,7 +123,7 @@ def nuc_to_aa_alignment(
114123
"diamond",
115124
"blastx",
116125
"-d",
117-
str(db_ref_fp),
126+
str(in_aa_db_fp),
118127
"-q",
119128
str(fasta_nuc_for_aa_alignment),
120129
"-o",
@@ -287,7 +296,10 @@ def enrich_read_with_aa_seq(
287296

288297

289298
def parse_translate_align(
290-
nuc_reference_fp: Path, aa_reference_fp: Path, nuc_alignment_fp: Path
299+
nuc_reference_fp: Path,
300+
aa_reference_fp: Path,
301+
nuc_alignment_fp: Path,
302+
aa_db_fp: Path,
291303
) -> Dict[str, AlignedRead]:
292304
"""Parse nucleotides, translate and align amino acids the input files."""
293305
with tempfile.TemporaryDirectory() as temp_dir:
@@ -299,7 +311,7 @@ def parse_translate_align(
299311

300312
missing_files = [
301313
str(f)
302-
for f in [nuc_reference_fp, aa_reference_fp, nuc_alignment_fp]
314+
for f in [nuc_reference_fp, aa_reference_fp, nuc_alignment_fp, aa_db_fp]
303315
if not f.exists()
304316
]
305317
if missing_files:
@@ -315,7 +327,7 @@ def parse_translate_align(
315327

316328
nuc_to_aa_alignment(
317329
in_nuc_alignment_fp=BAM_NUC_ALIGNMENT_FILE,
318-
in_aa_reference_fp=aa_reference_fp,
330+
in_aa_db_fp=aa_db_fp,
319331
out_aa_alignment_fp=AA_ALIGNMENT_FILE,
320332
)
321333

@@ -373,14 +385,16 @@ def enrich_single_read(read: AlignedRead) -> AlignedRead:
373385
return enrich_single_read
374386

375387

376-
def process_bam_files(bam_splits_fps, nuc_reference_fp, aa_reference_fp, metadata_fp):
388+
def process_bam_files(
389+
bam_splits_fps, nuc_reference_fp, aa_reference_fp, aa_db_fp, metadata_fp
390+
):
377391
"""Generator to process BAM files and yield JSON strings."""
378392

379393
enrich_single_read = curry_read_with_metadata(metadata_fp)
380394

381395
for bam_split_fp in bam_splits_fps:
382396
for read in parse_translate_align(
383-
nuc_reference_fp, aa_reference_fp, bam_split_fp
397+
nuc_reference_fp, aa_reference_fp, bam_split_fp, aa_db_fp
384398
).values():
385399
enriched_read = enrich_single_read(read)
386400
yield enriched_read.to_silo_json()
@@ -437,6 +451,10 @@ def parse_translate_align_in_batches(
437451
with tempfile.TemporaryDirectory() as temp_dir:
438452
temp_dir_path = Path(temp_dir)
439453

454+
# Create Diamond DB once
455+
aa_db_fp = temp_dir_path / "diamond_db.dmnd"
456+
_make_diamond_db(aa_reference_fp, aa_db_fp)
457+
440458
bam_splits_fps = convert.split_bam(
441459
input_bam=nuc_alignment_fp, out_dir=temp_dir_path, chunk_size=chunk_size
442460
)
@@ -452,7 +470,11 @@ def parse_translate_align_in_batches(
452470
with open(output_fp, "wb") as f, cctx.stream_writer(f) as compressor:
453471
buffer = []
454472
for json_str in process_bam_files(
455-
bam_splits_fps, nuc_reference_fp, aa_reference_fp, metadata_fp
473+
bam_splits_fps,
474+
nuc_reference_fp,
475+
aa_reference_fp,
476+
aa_db_fp,
477+
metadata_fp,
456478
):
457479
buffer.append(json_str)
458480
if len(buffer) >= write_chunk_size:

tests/process/conftest.py

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,12 +27,16 @@ def aligned_reads(aa_ref_sarscov2_fp, nuc_ref_sarscov2_fp) -> Dict[str, AlignedR
2727
aa_ref_fp = aa_ref_sarscov2_fp
2828
nuc_alignment_fp = Path("tests/data/bam/combined.bam")
2929

30-
aligned_reads = translate_align.parse_translate_align(
31-
nuc_ref_fp, aa_ref_fp, nuc_alignment_fp
32-
)
30+
with tempfile.TemporaryDirectory() as temp_dir:
31+
aa_db_fp = Path(temp_dir) / "diamond_db.dmnd"
32+
translate_align._make_diamond_db(aa_ref_fp, aa_db_fp)
33+
34+
aligned_reads = translate_align.parse_translate_align(
35+
nuc_ref_fp, aa_ref_fp, nuc_alignment_fp, aa_db_fp
36+
)
3337

3438
# make mock metadata with all empty strings, but readId
35-
# print emoptry ReadMetadata scehma to json
39+
# print empty ReadMetadata schema to json
3640
metadata = {
3741
"read_id": "readId",
3842
"sample_id": "",

tests/process/test_translate_align.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
import copy
66
import logging
77
from pathlib import Path
8+
from unittest.mock import MagicMock, patch
89

910
import pytest
1011

@@ -165,3 +166,50 @@ def test_curry_read_with_metadata_empty_metadata(tmp_path):
165166

166167
with pytest.raises(ValueError, match="No metadata found in the file"):
167168
translate_align.curry_read_with_metadata(empty_file)
169+
170+
171+
def test_make_diamond_db_success(tmp_path):
172+
"""Test successful creation of Diamond database."""
173+
in_aa_ref = tmp_path / "ref.fasta"
174+
out_db = tmp_path / "db.dmnd"
175+
in_aa_ref.touch()
176+
177+
with patch("sr2silo.process.translate_align.subprocess.run") as mock_run:
178+
mock_run.return_value = MagicMock(returncode=0)
179+
translate_align._make_diamond_db(in_aa_ref, out_db)
180+
181+
mock_run.assert_called_once()
182+
args = mock_run.call_args[0][0]
183+
assert args[0] == "diamond"
184+
assert args[1] == "makedb"
185+
assert args[3] == str(in_aa_ref)
186+
assert args[5] == str(out_db)
187+
188+
189+
def test_make_diamond_db_failure(tmp_path):
190+
"""Test failure during Diamond database creation (non-zero return code)."""
191+
in_aa_ref = tmp_path / "ref.fasta"
192+
out_db = tmp_path / "db.dmnd"
193+
in_aa_ref.touch()
194+
195+
with patch("sr2silo.process.translate_align.subprocess.run") as mock_run:
196+
mock_run.return_value = MagicMock(returncode=1)
197+
198+
with pytest.raises(
199+
RuntimeError, match="Error occurred while making sequence DB"
200+
):
201+
translate_align._make_diamond_db(in_aa_ref, out_db)
202+
203+
204+
def test_make_diamond_db_exception(tmp_path):
205+
"""Test exception handling during Diamond database creation."""
206+
in_aa_ref = tmp_path / "ref.fasta"
207+
out_db = tmp_path / "db.dmnd"
208+
in_aa_ref.touch()
209+
210+
with patch(
211+
"sr2silo.process.translate_align.subprocess.run",
212+
side_effect=OSError("Command not found"),
213+
):
214+
with pytest.raises(OSError, match="Command not found"):
215+
translate_align._make_diamond_db(in_aa_ref, out_db)

0 commit comments

Comments
 (0)