Skip to content

Commit 2c2db17

Browse files
committed
modified minimap2 paramters to not output supplementary alignments, implemented aggregation of mapping stats over data processing steps
1 parent 66188a4 commit 2c2db17

4 files changed

Lines changed: 109 additions & 5 deletions

File tree

bin/aggregate_read_stats.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
#!/usr/bin/env python3
2+
import argparse
3+
import pandas as pd
4+
5+
def main():
6+
parser = argparse.ArgumentParser(description="Aggregate read counts from Nextflow chunks")
7+
parser.add_argument('--input', nargs='+', required=True, help="List of chunk TSV files")
8+
parser.add_argument('--output', required=True, help="Output aggregated TSV")
9+
args = parser.parse_args()
10+
11+
df_list = []
12+
for f in args.input:
13+
df = pd.read_csv(f, sep='\t', names=["sample_id", "step", "total_reads", "unmapped_reads", "um_reads", "mm_reads"])
14+
df_list.append(df)
15+
16+
combined = pd.concat(df_list)
17+
18+
# Group by sample and step, then sum the chunks
19+
agg = combined.groupby(["sample_id", "step"]).sum().reset_index()
20+
21+
# Sort alphabetically (our step names will start with 01_, 02_, etc.)
22+
agg = agg.sort_values(["sample_id", "step"])
23+
24+
agg.to_csv(args.output, sep='\t', index=False)
25+
26+
if __name__ == "__main__":
27+
main()

bin/count_reads.sh

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
#!/bin/bash
2+
3+
SAMPLE=$1
4+
STEP=$2
5+
BAM=$3
6+
CPUS=$4
7+
OUT=$5
8+
9+
# Extract read name and whether it mapped to a chromosome (RNAME != *)
10+
samtools view -@ $CPUS -F 2048 $BAM | awk '{if($3=="*") print $1"\tUN"; else print $1"\tMA"}' | sort -S 2G | uniq -c > counts.tmp
11+
12+
# Sum up the occurrences
13+
UNMAPPED=$(awk '$3=="UN" {sum++} END {print sum+0}' counts.tmp)
14+
UM=$(awk '$1==1 && $3=="MA" {sum++} END {print sum+0}' counts.tmp)
15+
MM=$(awk '$1>1 && $3=="MA" {sum++} END {print sum+0}' counts.tmp)
16+
TOTAL=$((UNMAPPED + UM + MM))
17+
18+
# Output as TSV row
19+
echo -e "${SAMPLE}\t${STEP}\t${TOTAL}\t${UNMAPPED}\t${UM}\t${MM}" > $OUT
20+
21+
rm counts.tmp

envs/processing_bam_with_python.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,4 +15,4 @@ dependencies:
1515
- git+https://github.com/zavolanlab/zavolab_pyutils.git@dev
1616
- git+https://github.com/zavolanlab/SCINPAS.git@making_CLI_interface_for_python_functions
1717

18-
# Cache Bust: Added April 21 14:39 to force pip pulling the latest version of zavolab_pyutils
18+
# Cache Bust: Added April 22 15:07 to force pip pulling the latest version of zavolab_pyutils

main.nf

Lines changed: 60 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ workflow {
3737

3838
// 2.c Alignment: Pass both the unmapped BAMs and the compiled index
3939
minimap2_align(orient_strands.out.ubam, build_minimap2_index.out.mmi)
40-
40+
4141
// ==============================================================================
4242
// CHUNK-LEVEL PROCESSING (Highly Parallel)
4343
// ==============================================================================
@@ -47,7 +47,7 @@ workflow {
4747

4848
// 2.e. Fix Softclipped Alignments
4949
scinpas_fix_softclipped(normalize_umi_lengths.out.bam, params.ref)
50-
50+
5151
// 2.f. Extract PolyA reads
5252
scinpas_get_polyA(scinpas_fix_softclipped.out.bam, params.ref)
5353

@@ -85,10 +85,31 @@ workflow {
8585

8686
// 4.b Redefine NH Tags to correct for alignment filtering after UMI deduplication
8787
redefine_nh_tags(umi_tools_dedup.out.bam)
88-
88+
8989
// 4.c Generate BigWigs for Cleavage Sites using the custom scripts
9090
make_bigwig_for_cleavage_sites(redefine_nh_tags.out.bam.join(redefine_nh_tags.out.bai), params.ref)
9191

92+
// ==============================================================================
93+
// QC: MAPPING STATISTICS
94+
// ==============================================================================
95+
96+
ch_to_count = dorado_basecall.out.ubam.map{ id, bam -> [id, bam, "01_dorado_basecall"] }
97+
.mix(
98+
orient_strands.out.ubam.map{ id, bam -> [id, bam, "02_reverse_complemented_backward_oriented_reads"] },
99+
minimap2_align.out.bam.map{ id, bam -> [id, bam, "03_aligned_with_minimap2"] },
100+
normalize_umi_lengths.out.bam.map{ id, bam -> [id, bam, "04_normalized_umi_lengths"] },
101+
scinpas_fix_softclipped.out.bam.map{ id, bam -> [id, bam, "05_fixed_softclipped_alignments"] },
102+
scinpas_get_polyA.out.polyA_bam.map{ id, bam -> [id, bam, "06_extracted_polyA_reads"] },
103+
append_polyA_tails.out.bam.map{ id, bam -> [id, bam, "07_appended_polyA_tails"] },
104+
umi_tools_dedup.out.bam.map{ id, bam -> [id, bam, "08_umi_deduped"] },
105+
redefine_nh_tags.out.bam.map{ id, bam -> [id, bam, "09_nh_tags_and_MAPQ_redefined"] }
106+
)
107+
108+
// Call count_reads
109+
all_read_counts = count_reads(ch_to_count)
110+
111+
aggregate_read_stats(all_read_counts.collect())
112+
92113
// ==============================================================================
93114
// ISOFORM ANALYSIS
94115
// ==============================================================================
@@ -247,7 +268,7 @@ process minimap2_align {
247268
# 1. samtools fastq -T "*" extracts the fastq AND appends all BAM tags to the header.
248269
# 2. minimap2 -y reads those tags and securely copies them into the aligned BAM output.
249270
samtools fastq -@ ${task.cpus} -T "*" ${ubam} \\
250-
| minimap2 -y -ax splice -Y -t ${task.cpus} ${mmi_index} - \\
271+
| minimap2 -y -ax splice:hq --secondary=no -Y -t ${task.cpus} ${mmi_index} - \\
251272
| samtools sort -m 2G -@ ${task.cpus} -o \$OUT_BAM -
252273
"""
253274
}
@@ -562,6 +583,41 @@ process redefine_nh_tags {
562583
"""
563584
}
564585

586+
process count_reads {
587+
tag "${sample_id} - ${step_name}"
588+
label 'process_medium'
589+
label 'env_samtools'
590+
591+
input:
592+
tuple val(sample_id), path(bam), val(step_name)
593+
594+
output:
595+
path "${bam.baseName}.${step_name}.tsv", emit: tsv
596+
597+
script:
598+
"""
599+
count_reads.sh ${sample_id} ${step_name} ${bam} ${task.cpus} ${bam.baseName}.${step_name}.tsv
600+
"""
601+
}
602+
603+
process aggregate_read_stats {
604+
publishDir "${params.outdir}/QC_reports", mode: 'copy'
605+
label 'process_single'
606+
label 'env_bam_processing_with_python'
607+
608+
input:
609+
path tsv_files
610+
611+
output:
612+
path "master_read_tracking_stats.tsv"
613+
614+
script:
615+
"""
616+
aggregate_read_stats.py --input ${tsv_files} --output master_read_tracking_stats.tsv
617+
"""
618+
}
619+
620+
565621
process make_bigwig_for_cleavage_sites {
566622
tag "${sample_id}"
567623
publishDir "${params.outdir}/cleavage_sites_bigwigs", mode: 'copy'

0 commit comments

Comments
 (0)