@@ -16,8 +16,13 @@ if( !params.reference_gtf ) { exit 1, "Please provide the reference GTF file wit
1616params. qc_base_dir = " ${ params.outdir} /QC_plots/raw_signal_annotation"
1717
1818workflow {
19-
20- // 0. Build the minimap2 index once from the reference FASTA
19+
20+ // 0a. Optional: verify that the GTF and genome FASTA are compatible before doing any real work
21+ if (params. check_gtf_compatibility) {
22+ check_gtf_genome_compatibility(params. ref, params. reference_gtf)
23+ }
24+
25+ // 0b. Build the minimap2 index once from the reference FASTA
2126 build_minimap2_index(params. ref)
2227
2328 // 1. Initial Data Ingest: Create a stream of (sample_id, pod5_file)
@@ -613,6 +618,55 @@ process make_bigwig_for_cleavage_sites {
613618}
614619
615620
621+ process check_gtf_genome_compatibility {
622+ label ' process_single'
623+ label ' env_samtools'
624+
625+ input:
626+ path fasta
627+ path gtf
628+
629+ output:
630+ path " compatibility_check.txt" , emit: report
631+
632+ script:
633+ """
634+ # Index the genome FASTA and extract chromosome names from column 1 of the .fai
635+ samtools faidx ${ fasta}
636+ cut -f1 ${ fasta} .fai | sort > genome_chroms.txt
637+ TOTAL_GENOME=\$ (wc -l < genome_chroms.txt)
638+
639+ # Extract unique chromosome names from column 1 of the GTF (skip comment lines)
640+ grep -v '^#' ${ gtf} | cut -f1 | sort -u > gtf_chroms.txt
641+
642+ # Count how many genome chromosomes appear in the GTF
643+ FOUND_IN_GTF=\$ (comm -12 genome_chroms.txt gtf_chroms.txt | wc -l)
644+
645+ {
646+ echo "Genome chromosomes (from .fai): \$ TOTAL_GENOME"
647+ echo "Genome chromosomes found in GTF: \$ FOUND_IN_GTF"
648+ } | tee compatibility_check.txt
649+
650+ if [ "\$ TOTAL_GENOME" -eq 0 ]; then
651+ echo "ERROR: No chromosomes found in genome FASTA index." >&2
652+ exit 1
653+ fi
654+
655+ # Require at least 50% of genome chromosomes to be present in the GTF.
656+ # Uses integer arithmetic: found*100 < total*50 ↔ found/total < 0.5
657+ PCT=\$ (( FOUND_IN_GTF * 100 / TOTAL_GENOME ))
658+ if [ \$ (( FOUND_IN_GTF * 100 )) -lt \$ (( TOTAL_GENOME * 50 )) ]; then
659+ echo "ERROR: Only \$ {FOUND_IN_GTF}/\$ {TOTAL_GENOME} genome chromosomes (\$ {PCT}%) are present in the GTF." >&2
660+ echo "At least 50% of genome chromosomes must appear in the GTF." >&2
661+ echo "Please verify that the genome FASTA and GTF correspond to the same reference assembly," >&2
662+ echo "or set '--check_gtf_compatibility false' to skip this check." >&2
663+ exit 1
664+ fi
665+
666+ echo "OK: \$ {FOUND_IN_GTF}/\$ {TOTAL_GENOME} genome chromosomes (\$ {PCT}%) present in GTF." | tee -a compatibility_check.txt
667+ """
668+ }
669+
616670process transcriptome_annotation_enrichment {
617671 publishDir " ${ params.outdir} /transcriptome" , mode: ' copy'
618672 label ' process_high_memory_low_cpu'
0 commit comments