Skip to content

Commit fd2778e

Browse files
committed
Merge remote-tracking branch 'origin/dev' into aleksei/strand_tags_and_isoforms
2 parents 58bd6be + 441240f commit fd2778e

2 files changed

Lines changed: 60 additions & 4 deletions

File tree

main.nf

Lines changed: 57 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,13 @@ if( !params.reference_gtf ) { exit 1, "Please provide the reference GTF file wit
1616
params.qc_base_dir = "${params.outdir}/QC_plots/raw_signal_annotation"
1717

1818
workflow {
19-
20-
// 0. Build the minimap2 index once from the reference FASTA
19+
20+
// 0a. Optional: verify that the GTF and genome FASTA are compatible before doing any real work
21+
if (params.check_gtf_compatibility) {
22+
check_gtf_genome_compatibility(params.ref, params.reference_gtf)
23+
}
24+
25+
// 0b. Build the minimap2 index once from the reference FASTA
2126
build_minimap2_index(params.ref)
2227

2328
// 1. Initial Data Ingest: Create a stream of (sample_id, pod5_file)
@@ -739,8 +744,8 @@ process make_bigwig_for_cleavage_sites {
739744
sort -k1,1 -k2,2n minus.bg > minus.sorted.bg
740745
741746
# 6. Convert to BigWig
742-
bedGraphToBigWig plus.sorted.bg ${fasta}.fai ${sample_id}.plus.bigwig
743-
bedGraphToBigWig minus.sorted.bg ${fasta}.fai ${sample_id}.minus.bigwig
747+
[ -s plus.sorted.bg ] && bedGraphToBigWig plus.sorted.bg ${fasta}.fai ${sample_id}.plus.bigwig || touch ${sample_id}.plus.bigwig
748+
[ -s minus.sorted.bg ] && bedGraphToBigWig minus.sorted.bg ${fasta}.fai ${sample_id}.minus.bigwig || touch ${sample_id}.minus.bigwig
744749
745750
# 7. Generate Summary TSV (chr, start, end, strand, weighted_count)
746751
bedtools groupby -i cs.sorted.bed -g 1,2,3,6 -c 5 -o sum > ${sample_id}.read_sum.tsv
@@ -750,6 +755,54 @@ process make_bigwig_for_cleavage_sites {
750755
"""
751756
}
752757

758+
process check_gtf_genome_compatibility {
759+
label 'process_single'
760+
label 'env_samtools'
761+
762+
input:
763+
path fasta
764+
path gtf
765+
766+
output:
767+
path "compatibility_check.txt", emit: report
768+
769+
script:
770+
"""
771+
# Index the genome FASTA and extract chromosome names from column 1 of the .fai
772+
samtools faidx ${fasta}
773+
cut -f1 ${fasta}.fai | sort > genome_chroms.txt
774+
TOTAL_GENOME=\$(wc -l < genome_chroms.txt)
775+
776+
# Extract unique chromosome names from column 1 of the GTF (skip comment lines)
777+
grep -v '^#' ${gtf} | cut -f1 | sort -u > gtf_chroms.txt
778+
779+
# Count how many genome chromosomes appear in the GTF
780+
FOUND_IN_GTF=\$(comm -12 genome_chroms.txt gtf_chroms.txt | wc -l)
781+
782+
{
783+
echo "Genome chromosomes (from .fai): \$TOTAL_GENOME"
784+
echo "Genome chromosomes found in GTF: \$FOUND_IN_GTF"
785+
} | tee compatibility_check.txt
786+
787+
if [ "\$TOTAL_GENOME" -eq 0 ]; then
788+
echo "ERROR: No chromosomes found in genome FASTA index." >&2
789+
exit 1
790+
fi
791+
792+
# Require at least 50% of genome chromosomes to be present in the GTF.
793+
# Uses integer arithmetic: found*100 < total*50 ↔ found/total < 0.5
794+
PCT=\$(( FOUND_IN_GTF * 100 / TOTAL_GENOME ))
795+
if [ \$(( FOUND_IN_GTF * 100 )) -lt \$(( TOTAL_GENOME * 50 )) ]; then
796+
echo "ERROR: Only \${FOUND_IN_GTF}/\${TOTAL_GENOME} genome chromosomes (\${PCT}%) are present in the GTF." >&2
797+
echo "At least 50% of genome chromosomes must appear in the GTF." >&2
798+
echo "Please verify that the genome FASTA and GTF correspond to the same reference assembly," >&2
799+
echo "or set '--check_gtf_compatibility false' to skip this check." >&2
800+
exit 1
801+
fi
802+
803+
echo "OK: \${FOUND_IN_GTF}/\${TOTAL_GENOME} genome chromosomes (\${PCT}%) present in GTF." | tee -a compatibility_check.txt
804+
"""
805+
}
753806
process make_bigwig_for_polya_length {
754807
tag "${sample_id}"
755808
publishDir "${params.outdir}/polya_length_bigwigs", mode: 'copy'

nextflow.config

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,9 @@ params {
3434
gpu_qos = "h200-30min"
3535
gpu_time = "30min"
3636

37+
// Set to false to skip when uncharacterised scaffolds are expected to be absent from the GTF
38+
check_gtf_compatibility = True
39+
3740
// Visualization
3841
num_reads = 10
3942
figwidth = 15.0

0 commit comments

Comments
 (0)