cbg-ethz
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
Lines changed: 2 additions & 0 deletions
diff --git a/deployments/covid/config.yaml b/deployments/covid/config.yaml
Lines changed: 7 additions & 0 deletions
diff --git a/deployments/rsva/config.yaml b/deployments/rsva/config.yaml
Lines changed: 7 additions & 0 deletions
diff --git a/src/sr2silo/main.py b/src/sr2silo/main.py
Lines changed: 15 additions & 0 deletions
diff --git a/src/sr2silo/process/__init__.py b/src/sr2silo/process/__init__.py
Lines changed: 2 additions & 0 deletions
diff --git a/src/sr2silo/process/convert.py b/src/sr2silo/process/convert.py
Lines changed: 60 additions & 0 deletions
diff --git a/src/sr2silo/process/translate_align.py b/src/sr2silo/process/translate_align.py
Lines changed: 64 additions & 5 deletions
diff --git a/src/sr2silo/process_from_vpipe.py b/src/sr2silo/process_from_vpipe.py
Lines changed: 5 additions & 0 deletions
diff --git a/tests/data/rsva/H3_16_2025_11_15/20251128_2511665243/alignments/REF_aln_trim.bam b/tests/data/rsva/H3_16_2025_11_15/20251128_2511665243/alignments/REF_aln_trim.bam
4.4 MB
diff --git a/tests/data/rsva/H3_16_2025_11_15/20251128_2511665243/alignments/REF_aln_trim.bam.bai b/tests/data/rsva/H3_16_2025_11_15/20251128_2511665243/alignments/REF_aln_trim.bam.bai
176 Bytes
@@ -3,6 +3,8 @@ repos:
     rev: v5.0.0
     hooks:
       - id: check-added-large-files
+        args: ['--maxkb=5000']
+        exclude: tests/data/.*\.bam$
       - id: check-case-conflict
       - id: check-merge-conflict
       - id: check-symlinks
 
@@ -42,6 +42,13 @@ RELEASE_DELAY: 180  # Seconds to wait before releasing
 ENABLE_SUBSAMPLING: true
 SUBSAMPLE_MAX_READS: 4500000
 
+### Reference Filtering
+# The reference accession to filter reads by when processing BAM files.
+# This should match the @SQ SN field in the BAM header.
+# Find this by running: samtools view -H REF_aln_trim.bam | grep @SQ
+# Leave empty or omit to process all reads (backward compatible).
+REFERENCE_ACCESSION: ""  # Not needed - the COVID BAM has a single reference
+
 ### Resource Configuration
 SLURM_CPUS: 20
 SLURM_MEM: "160G"
@@ -42,6 +42,13 @@ RELEASE_DELAY: 180  # Seconds to wait before releasing
 ENABLE_SUBSAMPLING: true
 SUBSAMPLE_MAX_READS: 4500000
 
+### Reference Filtering
+# The reference accession to filter reads by when processing BAM files.
+# This should match the @SQ SN field in the BAM header.
+# Find this by running: samtools view -H REF_aln_trim.bam | grep @SQ
+# Leave empty or omit to process all reads (backward compatible).
+REFERENCE_ACCESSION: "EPI_ISL_412866"  # RSV-A reference
+
 ### Resource Configuration
 SLURM_CPUS: 4
 SLURM_MEM: "16G"
@@ -213,6 +213,16 @@ def process_from_vpipe(
             help="Skip merging of paired-end reads.",
         ),
     ] = False,
+    reference_accession: Annotated[
+        str | None,
+        typer.Option(
+            "--reference-accession",
+            help="Filter reads to only include those aligned to this reference accession. "
+            "Should match @SQ SN field in BAM header (find with: "
+            "samtools view -H file.bam | grep @SQ). "
+            "If not specified, all reads are processed.",
+        ),
+    ] = None,
 ) -> None:
     """
     V-PIPE to SILO conversion with amino acids and special metadata.
@@ -239,6 +249,10 @@ def process_from_vpipe(
         logging.info(f"Using local {organism} references (no Lapis URL provided)")
     logging.info(f"Using sample_id: {sample_id}")
     logging.info(f"Skip read pair merging: {skip_merge}")
+    if reference_accession:
+        logging.info(f"Reference accession filter: {reference_accession}")
+    else:
+        logging.info("Reference accession filter: None (processing all reads)")
 
     # check if $TMPDIR is set, if not use /tmp
     if "TMPDIR" in os.environ:
@@ -274,6 +288,7 @@ def process_from_vpipe(
         skip_merge=skip_merge,
         version_info=version_info,
         organism=organism,
+        reference_accession=reference_accession,
     )
 
 
 
@@ -6,6 +6,7 @@
 from __future__ import annotations
 
 from sr2silo.process.convert import (
+    ZeroFilteredReadsError,
     bam_to_fasta_query,
     bam_to_sam,
     get_gene_set_from_ref,
@@ -32,6 +33,7 @@
 
 __all__ = [
     # from sr2silo.process.convert
+    "ZeroFilteredReadsError",
     "bam_to_fasta_query",
     "bam_to_sam",
     "get_gene_set_from_ref",
 
@@ -11,6 +11,19 @@
 
 from sr2silo.process.interface import Gene, GeneName, GeneSet, Insertion
 
+
+class ZeroFilteredReadsError(Exception):
+    """Raised when reference filtering results in zero reads.
+
+    This error indicates that a target reference was specified for filtering,
+    but no reads in the BAM file aligned to that reference. This typically
+    means the wrong reference was specified or the data doesn't contain
+    reads for the expected reference.
+
+    It is raised by ``bam_to_fastq_handle_indels`` only when at least one
+    mapped read was seen and every one of them was aligned to a different
+    reference. A BAM file that contains no mapped reads at all only logs a
+    warning and does not raise this error.
+    """
+
+    pass
+
+
 logging.basicConfig(
     level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s"
 )
@@ -215,6 +228,7 @@ def bam_to_fastq_handle_indels(
     out_insertions_fp: Path,
     deletion_char: str = "-",
     skipped_char: str = "N",
+    target_reference: str | None = None,
 ):
     """
     Convert a BAM file to a FASTQ file, removing insertions and adding a
@@ -229,14 +243,37 @@ def bam_to_fastq_handle_indels(
     :param out_insertions_fp: Path to the output file containing insertions
     :param deletion_char: Special character to use for deletions/skipped regions
     :param skipped_char: Special character to use for skipped regions
+    :param target_reference: Filter reads to only include those aligned to this
+                            reference accession. Should match @SQ SN field in
+                            BAM header. If None, all reads are processed.
     """
+    # Validate target reference against BAM headers if specified
+    if target_reference:
+        with pysam.AlignmentFile(str(bam_file), "rb") as bam_check:
+            bam_references = [sq["SN"] for sq in bam_check.header.get("SQ", [])]  # type: ignore
+            if target_reference not in bam_references:
+                logging.warning(
+                    f"Target reference '{target_reference}' not found in BAM headers. "
+                    f"Available references: {bam_references}"
+                )
+            else:
+                logging.info(f"Filtering reads to reference: {target_reference}")
+
+    processed_count = 0
+    filtered_count = 0
+
     with (
         pysam.AlignmentFile(str(bam_file), "rb") as bam,
         open(out_fastq_fp, "w") as fastq,
         open(out_insertions_fp, "w") as insertions,
     ):
         for read in bam.fetch():
             if not read.is_unmapped:
+                # Filter by reference if specified
+                if target_reference and read.reference_name != target_reference:
+                    filtered_count += 1
+                    continue
+                processed_count += 1
                 logging.debug(f"Processing read: {read.query_name}")
                 query_sequence = read.query_sequence if read.query_sequence else ""
                 query_qualities = read.query_qualities if read.query_qualities else ""
@@ -310,6 +347,29 @@ def bam_to_fastq_handle_indels(
                         f"{read.query_name}\t{insertion_pos}\t{''.join(insertion_seq)}\t{''.join(insertion_qual)}\n"
                     )
 
+    # Log filtering statistics (use WARNING level to bypass suppress_info_and_below)
+    if target_reference:
+        total_reads = processed_count + filtered_count
+        if total_reads > 0:
+            kept_pct = (processed_count / total_reads) * 100
+            filtered_pct = (filtered_count / total_reads) * 100
+            logging.warning(
+                f"Reference filtering for '{target_reference}': "
+                f"{processed_count}/{total_reads} reads kept ({kept_pct:.1f}%), "
+                f"{filtered_count} filtered out ({filtered_pct:.1f}%)"
+            )
+        else:
+            logging.warning(
+                f"Reference filtering for '{target_reference}': 0 reads in BAM file"
+            )
+        if processed_count == 0 and filtered_count > 0:
+            raise ZeroFilteredReadsError(
+                f"No reads matched target reference '{target_reference}'. "
+                f"All {filtered_count} reads were aligned to other references. "
+                f"Check that the reference accession matches the expected reference "
+                f"(found by: samtools view -H <bam> | grep @SQ)."
+            )
+
 
 def parse_cigar(cigar: str) -> List[Tuple[int, str]]:
     """Parse the CIGAR string into a list of tuples."""
 
@@ -255,7 +255,12 @@ def enrich_read_with_aa_seq(
     fasta_aa_alignment_file: Path,
     gene_set: GeneSet,
 ) -> Dict[str, AlignedRead]:
-    """Read in amino acid sequences and insertions from a FASTA file"""
+    """Read in amino acid sequences and insertions from a FASTA file.
+
+    Note: Reads in the AA alignment file that are not in aligned_reads
+    (e.g., due to reference filtering) are skipped.
+    """
+    skipped_count = 0
     with open(fasta_aa_alignment_file, "r") as f:
         total_lines = sum(1 for _ in f)
         f.seek(0)  # Reset file pointer to the beginning
@@ -268,6 +273,13 @@ def enrich_read_with_aa_seq(
                     continue
                 fields = line.strip().split("\t")
                 read_id = fields[0]
+
+                # Skip reads that were filtered out during nucleotide processing
+                if read_id not in aligned_reads:
+                    skipped_count += 1
+                    pbar.update(1)
+                    continue
+
                 gene_name = GeneName(fields[2])
                 pos = int(fields[3])
                 cigar = fields[5]
@@ -292,6 +304,12 @@ def enrich_read_with_aa_seq(
                     pos - 1,
                 )
                 pbar.update(1)
+
+    if skipped_count > 0:
+        logging.info(
+            f"AA enrichment: skipped {skipped_count} reads not in nucleotide alignment "
+            "(filtered by reference)"
+        )
     return aligned_reads
 
 
@@ -300,8 +318,21 @@ def parse_translate_align(
     aa_reference_fp: Path,
     nuc_alignment_fp: Path,
     aa_db_fp: Path,
+    target_reference: str | None = None,
 ) -> Dict[str, AlignedRead]:
-    """Parse nucleotides, translate and align amino acids the input files."""
+    """Parse nucleotides, translate and align amino acids from the input files.
+
+    Args:
+        nuc_reference_fp: Path to nucleotide reference FASTA file.
+        aa_reference_fp: Path to amino acid reference FASTA file.
+        nuc_alignment_fp: Path to nucleotide alignment BAM file.
+        aa_db_fp: Path to Diamond database file.
+        target_reference: Filter reads to only include those aligned to this
+                         reference accession. If None, all reads are processed.
+
+    Returns:
+        Dictionary mapping read IDs to AlignedRead objects.
+    """
     with tempfile.TemporaryDirectory() as temp_dir:
         temp_dir_path = Path(temp_dir)
         BAM_NUC_ALIGNMENT_FILE = temp_dir_path / "combined_sorted.bam"
@@ -323,6 +354,7 @@ def parse_translate_align(
             bam_file=BAM_NUC_ALIGNMENT_FILE,
             out_fastq_fp=FASTQ_NUC_ALIGNMENT_FILE,
             out_insertions_fp=FASTA_NUC_INSERTIONS_FILE,
+            target_reference=target_reference,
         )
 
         nuc_to_aa_alignment(
@@ -386,15 +418,37 @@ def enrich_single_read(read: AlignedRead) -> AlignedRead:
 
 
 def process_bam_files(
-    bam_splits_fps, nuc_reference_fp, aa_reference_fp, aa_db_fp, metadata_fp
+    bam_splits_fps,
+    nuc_reference_fp,
+    aa_reference_fp,
+    aa_db_fp,
+    metadata_fp,
+    target_reference: str | None = None,
 ):
-    """Generator to process BAM files and yield JSON strings."""
+    """Generator to process BAM files and yield JSON strings.
+
+    Args:
+        bam_splits_fps: List of paths to BAM file splits.
+        nuc_reference_fp: Path to nucleotide reference FASTA file.
+        aa_reference_fp: Path to amino acid reference FASTA file.
+        aa_db_fp: Path to Diamond database file.
+        metadata_fp: Path to metadata JSON file.
+        target_reference: Filter reads to only include those aligned to this
+                         reference accession. If None, all reads are processed.
+
+    Yields:
+        JSON strings for each processed read.
+    """
 
     enrich_single_read = curry_read_with_metadata(metadata_fp)
 
     for bam_split_fp in bam_splits_fps:
         for read in parse_translate_align(
-            nuc_reference_fp, aa_reference_fp, bam_split_fp, aa_db_fp
+            nuc_reference_fp,
+            aa_reference_fp,
+            bam_split_fp,
+            aa_db_fp,
+            target_reference=target_reference,
         ).values():
             enriched_read = enrich_single_read(read)
             yield enriched_read.to_silo_json()
@@ -408,6 +462,7 @@ def parse_translate_align_in_batches(
     output_fp: Path,
     chunk_size: int = 100000,
     write_chunk_size: int = 20,
+    target_reference: str | None = None,
 ) -> Path:
     """Parse nucleotides, translate and align amino acids in batches.
 
@@ -419,6 +474,9 @@ def parse_translate_align_in_batches(
         output_fp (Path): Path to the output file - .ndjson
         chunk_size (int): Size of each batch, in number of reads.
         write_chunk_size (int): Size of each write batch.
+        target_reference (str | None): Filter reads to only include those aligned
+            to this reference accession. Should match @SQ SN field in BAM header.
+            If None, all reads are processed.
 
     Returns:
         Path: The path to the output file with the correct suffix.
@@ -475,6 +533,7 @@ def parse_translate_align_in_batches(
                         aa_reference_fp,
                         aa_db_fp,
                         metadata_fp,
+                        target_reference=target_reference,
                     ):
                         buffer.append(json_str)
                         if len(buffer) >= write_chunk_size:
 
@@ -50,6 +50,7 @@ def nuc_align_to_silo_njson(
     skip_merge: bool = False,
     version_info: str | None = None,
     organism: str = "covid",
+    reference_accession: str | None = None,
 ) -> bool:
     """Process a given input file.
 
@@ -66,6 +67,9 @@ def nuc_align_to_silo_njson(
                            Default is None.
         organism (str): The organism identifier (e.g., 'covid', 'rsva').
                        Used for timeline column mappings. Default is 'covid'.
+        reference_accession (str | None): Filter reads to only include those
+                       aligned to this reference accession. Should match @SQ SN
+                       field in BAM header. If None, all reads are processed.
 
     Returns:
         bool: True if processing was performed, False if skipped (0 reads).
@@ -178,6 +182,7 @@ def nuc_align_to_silo_njson(
             nuc_alignment_fp=merged_reads_fp,
             metadata_fp=metadata_file,
             output_fp=aligned_reads_fp,
+            target_reference=reference_accession,
         )
     finally:
         # Only remove temporary files if we created them (i.e., if we merged the reads)