Skip to content

Commit d9a0f0a

Browse files
Gordon J. Köhnclaude
authored andcommitted
feat: add --reference-accession filter for BAM files (#421)
RSV-A/RSV-B BAM files from V-PIPE contain alignments to multiple similar references because the viruses are similar enough that reads cross-align. This adds a --reference-accession CLI option to filter reads by target reference sequence. Changes: - Add --reference-accession CLI flag to process-from-vpipe command - Add REFERENCE_ACCESSION config option for Snakefile workflow - Filter reads in bam_to_fastq_handle_indels() by reference name - Add ZeroFilteredReadsError when filtering results in zero reads - Log filtering statistics (reads kept/filtered with percentages) - Thread target_reference parameter through translate_align.py - Skip filtered reads during AA enrichment Test data: - Add H3_16_2025_11_15 BAM with reads to both RSV-A (3086) and RSV-B (48711) - Update test_convert.py with reference filtering tests - Add snakemake integration test for RSV-A with reference filtering Config: - Add REFERENCE_ACCESSION to deployments/rsva/config.yaml - Update pre-commit to allow large BAM files in tests/data/ Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 13cfc7c commit d9a0f0a

File tree

16 files changed

+490
-5
lines changed

16 files changed

+490
-5
lines changed

.pre-commit-config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ repos:
33
rev: v5.0.0
44
hooks:
55
- id: check-added-large-files
6+
args: ['--maxkb=5000']
7+
exclude: tests/data/.*\.bam$
68
- id: check-case-conflict
79
- id: check-merge-conflict
810
- id: check-symlinks

deployments/covid/config.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,13 @@ RELEASE_DELAY: 180 # Seconds to wait before releasing
4242
ENABLE_SUBSAMPLING: true
4343
SUBSAMPLE_MAX_READS: 4500000
4444

45+
### Reference Filtering
46+
# The reference accession to filter reads by when processing BAM files.
47+
# This should match the @SQ SN field in the BAM header.
48+
# Find this by running: samtools view -H REF_aln_trim.bam | grep @SQ
49+
# Leave empty or omit to process all reads (backward compatible).
50+
REFERENCE_ACCESSION: "" # Not needed - COVID BAM has single reference
51+
4552
### Resource Configuration
4653
SLURM_CPUS: 20
4754
SLURM_MEM: "160G"

deployments/rsva/config.yaml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,13 @@ RELEASE_DELAY: 180 # Seconds to wait before releasing
4242
ENABLE_SUBSAMPLING: true
4343
SUBSAMPLE_MAX_READS: 4500000
4444

45+
### Reference Filtering
46+
# The reference accession to filter reads by when processing BAM files.
47+
# This should match the @SQ SN field in the BAM header.
48+
# Find this by running: samtools view -H REF_aln_trim.bam | grep @SQ
49+
# Leave empty or omit to process all reads (backward compatible).
50+
REFERENCE_ACCESSION: "EPI_ISL_412866" # RSV-A reference
51+
4552
### Resource Configuration
4653
SLURM_CPUS: 4
4754
SLURM_MEM: "16G"

src/sr2silo/main.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,16 @@ def process_from_vpipe(
213213
help="Skip merging of paired-end reads.",
214214
),
215215
] = False,
216+
reference_accession: Annotated[
217+
str | None,
218+
typer.Option(
219+
"--reference-accession",
220+
help="Filter reads to only include those aligned to this reference accession. "
221+
"Should match @SQ SN field in BAM header (find with: "
222+
"samtools view -H file.bam | grep @SQ). "
223+
"If not specified, all reads are processed.",
224+
),
225+
] = None,
216226
) -> None:
217227
"""
218228
V-PIPE to SILO conversion with amino acids and special metadata.
@@ -239,6 +249,10 @@ def process_from_vpipe(
239249
logging.info(f"Using local {organism} references (no Lapis URL provided)")
240250
logging.info(f"Using sample_id: {sample_id}")
241251
logging.info(f"Skip read pair merging: {skip_merge}")
252+
if reference_accession:
253+
logging.info(f"Reference accession filter: {reference_accession}")
254+
else:
255+
logging.info("Reference accession filter: None (processing all reads)")
242256

243257
# check if $TMPDIR is set, if not use /tmp
244258
if "TMPDIR" in os.environ:
@@ -274,6 +288,7 @@ def process_from_vpipe(
274288
skip_merge=skip_merge,
275289
version_info=version_info,
276290
organism=organism,
291+
reference_accession=reference_accession,
277292
)
278293

279294

src/sr2silo/process/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from __future__ import annotations
77

88
from sr2silo.process.convert import (
9+
ZeroFilteredReadsError,
910
bam_to_fasta_query,
1011
bam_to_sam,
1112
get_gene_set_from_ref,
@@ -32,6 +33,7 @@
3233

3334
__all__ = [
3435
# from sr2silo.process.convert
36+
"ZeroFilteredReadsError",
3537
"bam_to_fasta_query",
3638
"bam_to_sam",
3739
"get_gene_set_from_ref",

src/sr2silo/process/convert.py

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,19 @@
1111

1212
from sr2silo.process.interface import Gene, GeneName, GeneSet, Insertion
1313

14+
15+
class ZeroFilteredReadsError(Exception):
16+
"""Raised when reference filtering results in zero reads.
17+
18+
This error indicates that a target reference was specified for filtering,
19+
but no reads in the BAM file aligned to that reference. This typically
20+
means the wrong reference was specified or the data doesn't contain
21+
reads for the expected reference.
22+
"""
23+
24+
pass
25+
26+
1427
logging.basicConfig(
1528
level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
1629
)
@@ -215,6 +228,7 @@ def bam_to_fastq_handle_indels(
215228
out_insertions_fp: Path,
216229
deletion_char: str = "-",
217230
skipped_char: str = "N",
231+
target_reference: str | None = None,
218232
):
219233
"""
220234
Convert a BAM file to a FASTQ file, removing insertions and adding a
@@ -229,14 +243,37 @@ def bam_to_fastq_handle_indels(
229243
:param out_insertions_fp: Path to the output file containing insertions
230244
:param deletion_char: Special character to use for deletions/skipped regions
231245
:param skipped_char: Special character to use for skipped regions
246+
:param target_reference: Filter reads to only include those aligned to this
247+
reference accession. Should match @SQ SN field in
248+
BAM header. If None, all reads are processed.
232249
"""
250+
# Validate target reference against BAM headers if specified
251+
if target_reference:
252+
with pysam.AlignmentFile(str(bam_file), "rb") as bam_check:
253+
bam_references = [sq["SN"] for sq in bam_check.header.get("SQ", [])] # type: ignore
254+
if target_reference not in bam_references:
255+
logging.warning(
256+
f"Target reference '{target_reference}' not found in BAM headers. "
257+
f"Available references: {bam_references}"
258+
)
259+
else:
260+
logging.info(f"Filtering reads to reference: {target_reference}")
261+
262+
processed_count = 0
263+
filtered_count = 0
264+
233265
with (
234266
pysam.AlignmentFile(str(bam_file), "rb") as bam,
235267
open(out_fastq_fp, "w") as fastq,
236268
open(out_insertions_fp, "w") as insertions,
237269
):
238270
for read in bam.fetch():
239271
if not read.is_unmapped:
272+
# Filter by reference if specified
273+
if target_reference and read.reference_name != target_reference:
274+
filtered_count += 1
275+
continue
276+
processed_count += 1
240277
logging.debug(f"Processing read: {read.query_name}")
241278
query_sequence = read.query_sequence if read.query_sequence else ""
242279
query_qualities = read.query_qualities if read.query_qualities else ""
@@ -310,6 +347,29 @@ def bam_to_fastq_handle_indels(
310347
f"{read.query_name}\t{insertion_pos}\t{''.join(insertion_seq)}\t{''.join(insertion_qual)}\n"
311348
)
312349

350+
# Log filtering statistics (use WARNING level to bypass suppress_info_and_below)
351+
if target_reference:
352+
total_reads = processed_count + filtered_count
353+
if total_reads > 0:
354+
kept_pct = (processed_count / total_reads) * 100
355+
filtered_pct = (filtered_count / total_reads) * 100
356+
logging.warning(
357+
f"Reference filtering for '{target_reference}': "
358+
f"{processed_count}/{total_reads} reads kept ({kept_pct:.1f}%), "
359+
f"{filtered_count} filtered out ({filtered_pct:.1f}%)"
360+
)
361+
else:
362+
logging.warning(
363+
f"Reference filtering for '{target_reference}': 0 reads in BAM file"
364+
)
365+
if processed_count == 0 and filtered_count > 0:
366+
raise ZeroFilteredReadsError(
367+
f"No reads matched target reference '{target_reference}'. "
368+
f"All {filtered_count} reads were aligned to other references. "
369+
f"Check that the reference accession matches the expected reference "
370+
f"(found by: samtools view -H <bam> | grep @SQ)."
371+
)
372+
313373

314374
def parse_cigar(cigar: str) -> List[Tuple[int, str]]:
315375
"""Parse the CIGAR string into a list of tuples."""

src/sr2silo/process/translate_align.py

Lines changed: 64 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,12 @@ def enrich_read_with_aa_seq(
255255
fasta_aa_alignment_file: Path,
256256
gene_set: GeneSet,
257257
) -> Dict[str, AlignedRead]:
258-
"""Read in amino acid sequences and insertions from a FASTA file"""
258+
"""Read in amino acid sequences and insertions from a FASTA file.
259+
260+
Note: Reads in the AA alignment file that are not in aligned_reads
261+
(e.g., due to reference filtering) are skipped.
262+
"""
263+
skipped_count = 0
259264
with open(fasta_aa_alignment_file, "r") as f:
260265
total_lines = sum(1 for _ in f)
261266
f.seek(0) # Reset file pointer to the beginning
@@ -268,6 +273,13 @@ def enrich_read_with_aa_seq(
268273
continue
269274
fields = line.strip().split("\t")
270275
read_id = fields[0]
276+
277+
# Skip reads that were filtered out during nucleotide processing
278+
if read_id not in aligned_reads:
279+
skipped_count += 1
280+
pbar.update(1)
281+
continue
282+
271283
gene_name = GeneName(fields[2])
272284
pos = int(fields[3])
273285
cigar = fields[5]
@@ -292,6 +304,12 @@ def enrich_read_with_aa_seq(
292304
pos - 1,
293305
)
294306
pbar.update(1)
307+
308+
if skipped_count > 0:
309+
logging.info(
310+
f"AA enrichment: skipped {skipped_count} reads not in nucleotide alignment "
311+
"(filtered by reference)"
312+
)
295313
return aligned_reads
296314

297315

@@ -300,8 +318,21 @@ def parse_translate_align(
300318
aa_reference_fp: Path,
301319
nuc_alignment_fp: Path,
302320
aa_db_fp: Path,
321+
target_reference: str | None = None,
303322
) -> Dict[str, AlignedRead]:
304-
"""Parse nucleotides, translate and align amino acids the input files."""
323+
"""Parse nucleotides, translate and align amino acids the input files.
324+
325+
Args:
326+
nuc_reference_fp: Path to nucleotide reference FASTA file.
327+
aa_reference_fp: Path to amino acid reference FASTA file.
328+
nuc_alignment_fp: Path to nucleotide alignment BAM file.
329+
aa_db_fp: Path to Diamond database file.
330+
target_reference: Filter reads to only include those aligned to this
331+
reference accession. If None, all reads are processed.
332+
333+
Returns:
334+
Dictionary mapping read IDs to AlignedRead objects.
335+
"""
305336
with tempfile.TemporaryDirectory() as temp_dir:
306337
temp_dir_path = Path(temp_dir)
307338
BAM_NUC_ALIGNMENT_FILE = temp_dir_path / "combined_sorted.bam"
@@ -323,6 +354,7 @@ def parse_translate_align(
323354
bam_file=BAM_NUC_ALIGNMENT_FILE,
324355
out_fastq_fp=FASTQ_NUC_ALIGNMENT_FILE,
325356
out_insertions_fp=FASTA_NUC_INSERTIONS_FILE,
357+
target_reference=target_reference,
326358
)
327359

328360
nuc_to_aa_alignment(
@@ -386,15 +418,37 @@ def enrich_single_read(read: AlignedRead) -> AlignedRead:
386418

387419

388420
def process_bam_files(
389-
bam_splits_fps, nuc_reference_fp, aa_reference_fp, aa_db_fp, metadata_fp
421+
bam_splits_fps,
422+
nuc_reference_fp,
423+
aa_reference_fp,
424+
aa_db_fp,
425+
metadata_fp,
426+
target_reference: str | None = None,
390427
):
391-
"""Generator to process BAM files and yield JSON strings."""
428+
"""Generator to process BAM files and yield JSON strings.
429+
430+
Args:
431+
bam_splits_fps: List of paths to BAM file splits.
432+
nuc_reference_fp: Path to nucleotide reference FASTA file.
433+
aa_reference_fp: Path to amino acid reference FASTA file.
434+
aa_db_fp: Path to Diamond database file.
435+
metadata_fp: Path to metadata JSON file.
436+
target_reference: Filter reads to only include those aligned to this
437+
reference accession. If None, all reads are processed.
438+
439+
Yields:
440+
JSON strings for each processed read.
441+
"""
392442

393443
enrich_single_read = curry_read_with_metadata(metadata_fp)
394444

395445
for bam_split_fp in bam_splits_fps:
396446
for read in parse_translate_align(
397-
nuc_reference_fp, aa_reference_fp, bam_split_fp, aa_db_fp
447+
nuc_reference_fp,
448+
aa_reference_fp,
449+
bam_split_fp,
450+
aa_db_fp,
451+
target_reference=target_reference,
398452
).values():
399453
enriched_read = enrich_single_read(read)
400454
yield enriched_read.to_silo_json()
@@ -408,6 +462,7 @@ def parse_translate_align_in_batches(
408462
output_fp: Path,
409463
chunk_size: int = 100000,
410464
write_chunk_size: int = 20,
465+
target_reference: str | None = None,
411466
) -> Path:
412467
"""Parse nucleotides, translate and align amino acids in batches.
413468
@@ -419,6 +474,9 @@ def parse_translate_align_in_batches(
419474
output_fp (Path): Path to the output file - .ndjson
420475
chunk_size (int): Size of each batch, in number of reads.
421476
write_chunk_size (int): Size of each write batch.
477+
target_reference (str | None): Filter reads to only include those aligned
478+
to this reference accession. Should match @SQ SN field in BAM header.
479+
If None, all reads are processed.
422480
423481
Returns:
424482
Path: The path to the output file with the correct suffix.
@@ -475,6 +533,7 @@ def parse_translate_align_in_batches(
475533
aa_reference_fp,
476534
aa_db_fp,
477535
metadata_fp,
536+
target_reference=target_reference,
478537
):
479538
buffer.append(json_str)
480539
if len(buffer) >= write_chunk_size:

src/sr2silo/process_from_vpipe.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def nuc_align_to_silo_njson(
5050
skip_merge: bool = False,
5151
version_info: str | None = None,
5252
organism: str = "covid",
53+
reference_accession: str | None = None,
5354
) -> bool:
5455
"""Process a given input file.
5556
@@ -66,6 +67,9 @@ def nuc_align_to_silo_njson(
6667
Default is None.
6768
organism (str): The organism identifier (e.g., 'covid', 'rsva').
6869
Used for timeline column mappings. Default is 'covid'.
70+
reference_accession (str | None): Filter reads to only include those
71+
aligned to this reference accession. Should match @SQ SN
72+
field in BAM header. If None, all reads are processed.
6973
7074
Returns:
7175
bool: True if processing was performed, False if skipped (0 reads).
@@ -178,6 +182,7 @@ def nuc_align_to_silo_njson(
178182
nuc_alignment_fp=merged_reads_fp,
179183
metadata_fp=metadata_file,
180184
output_fp=aligned_reads_fp,
185+
target_reference=reference_accession,
181186
)
182187
finally:
183188
# Only remove temporary files if we created them (i.e., if we merged the reads)
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)