Modified minimap2 parameters (splice:hq preset, --secondary=no) so that secondary alignments are no longer output; implemented aggregation of mapping stats over the data processing steps

magmir71 · magmir71 · commit 2c2db17ec1ea · 2026-04-23T15:40:56.000+02:00
diff --git a/bin/aggregate_read_stats.py b/bin/aggregate_read_stats.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+"""Aggregate per-chunk read-count TSV files into one table per sample and step."""
+import argparse
+
+import pandas as pd
+
+# Column layout of the header-less rows in the chunk files.
+COLUMNS = ["sample_id", "step", "total_reads", "unmapped_reads", "um_reads", "mm_reads"]
+KEY_COLUMNS = ["sample_id", "step"]
+
+
+def aggregate(paths):
+    """Sum the count columns of all chunk files per (sample_id, step).
+
+    Args:
+        paths: Paths of header-less, tab-separated files laid out as COLUMNS.
+
+    Returns:
+        DataFrame with one row per (sample_id, step), sorted by those keys.
+    """
+    # Read the keys as strings so numeric-looking sample IDs (e.g. "007")
+    # are not coerced to integers and stripped of their leading zeros.
+    frames = [
+        pd.read_csv(path, sep="\t", header=None, names=COLUMNS,
+                    dtype={key: str for key in KEY_COLUMNS})
+        for path in paths
+    ]
+    combined = pd.concat(frames, ignore_index=True)
+
+    # Group by sample and step, then sum the chunks.
+    totals = combined.groupby(KEY_COLUMNS, as_index=False).sum()
+
+    # Sort alphabetically (step names start with 01_, 02_, etc.).
+    return totals.sort_values(KEY_COLUMNS)
+
+
+def main():
+    """Parse the command line, aggregate the chunk files and write the TSV."""
+    parser = argparse.ArgumentParser(description="Aggregate read counts from Nextflow chunks")
+    parser.add_argument('--input', nargs='+', required=True, help="List of chunk TSV files")
+    parser.add_argument('--output', required=True, help="Output aggregated TSV")
+    args = parser.parse_args()
+
+    aggregate(args.input).to_csv(args.output, sep='\t', index=False)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/bin/count_reads.sh b/bin/count_reads.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Count the reads of a BAM file by mapping status and write one TSV row:
+#   sample, step, total, unmapped, mapped once, mapped more than once.
+#
+# Usage: count_reads.sh SAMPLE STEP BAM CPUS OUT
+
+# Abort on any error, unset variable or failed pipeline stage, so that a
+# failing samtools call cannot silently produce a row of zeros.
+set -euo pipefail
+
+SAMPLE=$1
+STEP=$2
+BAM=$3
+CPUS=$4
+OUT=$5
+
+# Private scratch file in the working directory, removed on every exit path.
+COUNTS=$(mktemp counts.XXXXXX)
+trap 'rm -f "$COUNTS"' EXIT
+
+# Extract read name and whether it mapped to a chromosome (RNAME != *),
+# skipping records with flag 2048 (supplementary), then count the records
+# per (read name, status) pair.
+samtools view -@ "$CPUS" -F 2048 "$BAM" \
+    | awk '{if($3=="*") print $1"\tUN"; else print $1"\tMA"}' \
+    | sort -S 2G | uniq -c > "$COUNTS"
+
+# Sum up the occurrences. uniq -c prefixes each line with its count, so the
+# awk fields are: $1 = record count, $2 = read name, $3 = status.
+UNMAPPED=$(awk '$3=="UN" {sum++} END {print sum+0}' "$COUNTS")
+UM=$(awk '$1==1 && $3=="MA" {sum++} END {print sum+0}' "$COUNTS")
+MM=$(awk '$1>1 && $3=="MA" {sum++} END {print sum+0}' "$COUNTS")
+TOTAL=$((UNMAPPED + UM + MM))
+
+# Output as TSV row; printf keeps backslashes in the arguments literal.
+printf '%s\t%s\t%s\t%s\t%s\t%s\n' "$SAMPLE" "$STEP" "$TOTAL" "$UNMAPPED" "$UM" "$MM" > "$OUT"
diff --git a/envs/processing_bam_with_python.yml b/envs/processing_bam_with_python.yml
@@ -15,4 +15,4 @@ dependencies:
       - git+https://github.com/zavolanlab/zavolab_pyutils.git@dev
       - git+https://github.com/zavolanlab/SCINPAS.git@making_CLI_interface_for_python_functions
 
-# Cache Bust: Added April 21 14:39 to force pip pulling the latest version of zavolab_pyutils
+# Cache Bust: Added April 22 15:07 to force pip to pull the latest version of zavolab_pyutils
diff --git a/main.nf b/main.nf
@@ -37,7 +37,7 @@ workflow {
 
     // 2.c Alignment: Pass both the unmapped BAMs and the compiled index
     minimap2_align(orient_strands.out.ubam, build_minimap2_index.out.mmi)
-
+    
     // ==============================================================================
     // CHUNK-LEVEL PROCESSING (Highly Parallel)
     // ==============================================================================
@@ -47,7 +47,7 @@ workflow {
 
     // 2.e. Fix Softclipped Alignments
     scinpas_fix_softclipped(normalize_umi_lengths.out.bam, params.ref)
-
+    
     // 2.f. Extract PolyA reads
     scinpas_get_polyA(scinpas_fix_softclipped.out.bam, params.ref)
 
@@ -85,10 +85,31 @@ workflow {
 
     // 4.b Redefine NH Tags to correct for alignment filtering after UMI deduplication
     redefine_nh_tags(umi_tools_dedup.out.bam)
-
+    
     // 4.c Generate BigWigs for Cleavage Sites using the custom scripts
     make_bigwig_for_cleavage_sites(redefine_nh_tags.out.bam.join(redefine_nh_tags.out.bai), params.ref)
 
+    // ==============================================================================
+    // QC: MAPPING STATISTICS
+    // ==============================================================================
+    
+    ch_to_count = dorado_basecall.out.ubam.map{ id, bam -> [id, bam, "01_dorado_basecall"] }
+        .mix(
+            orient_strands.out.ubam.map{ id, bam -> [id, bam, "02_reverse_complemented_backward_oriented_reads"] },
+            minimap2_align.out.bam.map{ id, bam -> [id, bam, "03_aligned_with_minimap2"] },
+            normalize_umi_lengths.out.bam.map{ id, bam -> [id, bam, "04_normalized_umi_lengths"] },
+            scinpas_fix_softclipped.out.bam.map{ id, bam -> [id, bam, "05_fixed_softclipped_alignments"] },
+            scinpas_get_polyA.out.polyA_bam.map{ id, bam -> [id, bam, "06_extracted_polyA_reads"] },
+            append_polyA_tails.out.bam.map{ id, bam -> [id, bam, "07_appended_polyA_tails"] },
+            umi_tools_dedup.out.bam.map{ id, bam -> [id, bam, "08_umi_deduped"] },
+            redefine_nh_tags.out.bam.map{ id, bam -> [id, bam, "09_nh_tags_and_MAPQ_redefined"] }
+        )
+
+    // Call count_reads
+    all_read_counts = count_reads(ch_to_count)
+
+    aggregate_read_stats(all_read_counts.collect())
+
     // ==============================================================================
     // ISOFORM ANALYSIS
     // ==============================================================================
@@ -247,7 +268,7 @@ process minimap2_align {
     # 1. samtools fastq -T "*" extracts the fastq AND appends all BAM tags to the header.
     # 2. minimap2 -y reads those tags and securely copies them into the aligned BAM output.
     samtools fastq -@ ${task.cpus} -T "*" ${ubam} \\
-        | minimap2 -y -ax splice -Y -t ${task.cpus} ${mmi_index} - \\
+        | minimap2 -y -ax splice:hq --secondary=no -Y -t ${task.cpus} ${mmi_index} - \\
         | samtools sort -m 2G -@ ${task.cpus} -o \$OUT_BAM -
     """
 }
@@ -562,6 +583,41 @@ process redefine_nh_tags {
     """
 }
 
+process count_reads {
+    tag "${sample_id} - ${step_name}"
+    label 'process_medium'
+    label 'env_samtools'
+    // One task per (sample_id, bam, step_name) tuple received on the input channel.
+    input:
+    tuple val(sample_id), path(bam), val(step_name)
+    // The result file is named after the BAM's base name and the step name.
+    output:
+    path "${bam.baseName}.${step_name}.tsv", emit: tsv
+    // count_reads.sh arguments, in order: sample, step, BAM, thread count, output file.
+    script:
+    """
+    count_reads.sh ${sample_id} ${step_name} ${bam} ${task.cpus} ${bam.baseName}.${step_name}.tsv
+    """
+}
+
+process aggregate_read_stats {
+    publishDir "${params.outdir}/QC_reports", mode: 'copy'
+    label 'process_single'
+    label 'env_bam_processing_with_python'
+    // TSV files to aggregate; presumably all per-chunk files at once (verify against the caller).
+    input:
+    path tsv_files
+    // The aggregated table; publishDir copies it to <params.outdir>/QC_reports.
+    output:
+    path "master_read_tracking_stats.tsv"
+    // ${tsv_files} is interpolated into the --input argument list of the script.
+    script:
+    """
+    aggregate_read_stats.py --input ${tsv_files} --output master_read_tracking_stats.tsv
+    """
+}
+
+
 process make_bigwig_for_cleavage_sites {
     tag "${sample_id}"
     publishDir "${params.outdir}/cleavage_sites_bigwigs", mode: 'copy'