@@ -16,8 +16,13 @@ if( !params.reference_gtf ) { exit 1, "Please provide the reference GTF file wit
1616params. qc_base_dir = " ${ params.outdir} /QC_plots/raw_signal_annotation"
1717
1818workflow {
19-
20- // 0. Build the minimap2 index once from the reference FASTA
19+
20+ // 0a. Optional: verify that the GTF and genome FASTA are compatible before doing any real work
21+ if (params. check_gtf_compatibility) {
22+ check_gtf_genome_compatibility(params. ref, params. reference_gtf)
23+ }
24+
25+ // 0b. Build the minimap2 index once from the reference FASTA
2126 build_minimap2_index(params. ref)
2227
2328 // 1. Initial Data Ingest: Create a stream of (sample_id, pod5_file)
@@ -739,8 +744,8 @@ process make_bigwig_for_cleavage_sites {
739744 sort -k1,1 -k2,2n minus.bg > minus.sorted.bg
740745
741746 # 6. Convert to BigWig
742- bedGraphToBigWig plus.sorted.bg ${ fasta} .fai ${ sample_id} .plus.bigwig
743- bedGraphToBigWig minus.sorted.bg ${ fasta} .fai ${ sample_id} .minus.bigwig
747+ [ -s plus.sorted.bg ] && bedGraphToBigWig plus.sorted.bg ${ fasta} .fai ${ sample_id } .plus.bigwig || touch ${ sample_id} .plus.bigwig
748+ [ -s minus.sorted.bg ] && bedGraphToBigWig minus.sorted.bg ${ fasta} .fai ${ sample_id } .minus.bigwig || touch ${ sample_id} .minus.bigwig
744749
745750 # 7. Generate Summary TSV (chr, start, end, strand, weighted_count)
746751 bedtools groupby -i cs.sorted.bed -g 1,2,3,6 -c 5 -o sum > ${ sample_id} .read_sum.tsv
@@ -750,6 +755,54 @@ process make_bigwig_for_cleavage_sites {
750755 """
751756}
752757
758+ process check_gtf_genome_compatibility {
759+ label ' process_single'
760+ label ' env_samtools'
761+
762+ input:
763+ path fasta
764+ path gtf
765+
766+ output:
767+ path " compatibility_check.txt" , emit: report
768+
769+ script:
770+ """
771+ # Index the genome FASTA and extract chromosome names from column 1 of the .fai
772+ samtools faidx ${ fasta}
773+ cut -f1 ${ fasta} .fai | sort > genome_chroms.txt
774+ TOTAL_GENOME=\$ (wc -l < genome_chroms.txt)
775+
776+ # Extract unique chromosome names from column 1 of the GTF (skip comment lines)
777+ grep -v '^#' ${ gtf} | cut -f1 | sort -u > gtf_chroms.txt
778+
779+ # Count how many genome chromosomes appear in the GTF
780+ FOUND_IN_GTF=\$ (comm -12 genome_chroms.txt gtf_chroms.txt | wc -l)
781+
782+ {
783+ echo "Genome chromosomes (from .fai): \$ TOTAL_GENOME"
784+ echo "Genome chromosomes found in GTF: \$ FOUND_IN_GTF"
785+ } | tee compatibility_check.txt
786+
787+ if [ "\$ TOTAL_GENOME" -eq 0 ]; then
788+ echo "ERROR: No chromosomes found in genome FASTA index." >&2
789+ exit 1
790+ fi
791+
792+ # Require at least 50% of genome chromosomes to be present in the GTF.
793+ # Uses integer arithmetic: found*100 < total*50 ↔ found/total < 0.5
794+ PCT=\$ (( FOUND_IN_GTF * 100 / TOTAL_GENOME ))
795+ if [ \$ (( FOUND_IN_GTF * 100 )) -lt \$ (( TOTAL_GENOME * 50 )) ]; then
796+ echo "ERROR: Only \$ {FOUND_IN_GTF}/\$ {TOTAL_GENOME} genome chromosomes (\$ {PCT}%) are present in the GTF." >&2
797+ echo "At least 50% of genome chromosomes must appear in the GTF." >&2
798+ echo "Please verify that the genome FASTA and GTF correspond to the same reference assembly," >&2
799+ echo "or set '--check_gtf_compatibility false' to skip this check." >&2
800+ exit 1
801+ fi
802+
803+ echo "OK: \$ {FOUND_IN_GTF}/\$ {TOTAL_GENOME} genome chromosomes (\$ {PCT}%) present in GTF." | tee -a compatibility_check.txt
804+ """
805+ }
753806process make_bigwig_for_polya_length {
754807 tag " ${ sample_id} "
755808 publishDir " ${ params.outdir} /polya_length_bigwigs" , mode: ' copy'
0 commit comments