Skip to content

Commit 621f0fd

Browse files
authored
Merge pull request #41 from zavolanlab/krish/check_gtf_compatibility
feat: add gtf compatibility
2 parents fcc099a + 65e84a6 commit 621f0fd

3 files changed

Lines changed: 61 additions & 3 deletions

File tree

envs/processing_bam_with_python.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,11 @@ dependencies:
99
- conda-forge::pigz
1010
- bioconda::ucsc-wigtobigwig
1111
- bioconda::ucsc-bedgraphtobigwig
12+
- bioconda::pysam
1213
- pip
1314
- pip:
1415
# This tells pip to install directly from the branches of the GitHub repos
1516
- git+https://github.com/zavolanlab/zavolab_pyutils.git@dev
1617
- git+https://github.com/zavolanlab/SCINPAS.git@making_CLI_interface_for_python_functions
1718

18-
# Cache Bust: Added April 21 14:39 to force pip pulling the latest version of zavolab_pyutils
19+
# Cache Bust: Apr 22 — added pysam

main.nf

Lines changed: 56 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,13 @@ if( !params.reference_gtf ) { exit 1, "Please provide the reference GTF file wit
1616
params.qc_base_dir = "${params.outdir}/QC_plots/raw_signal_annotation"
1717

1818
workflow {
19-
20-
// 0. Build the minimap2 index once from the reference FASTA
19+
20+
// 0a. Optional: verify that the GTF and genome FASTA are compatible before doing any real work
21+
if (params.check_gtf_compatibility) {
22+
check_gtf_genome_compatibility(params.ref, params.reference_gtf)
23+
}
24+
25+
// 0b. Build the minimap2 index once from the reference FASTA
2126
build_minimap2_index(params.ref)
2227

2328
// 1. Initial Data Ingest: Create a stream of (sample_id, pod5_file)
@@ -613,6 +618,55 @@ process make_bigwig_for_cleavage_sites {
613618
}
614619

615620

621+
process check_gtf_genome_compatibility {
622+
label 'process_single'
623+
label 'env_samtools'
624+
625+
input:
626+
path fasta
627+
path gtf
628+
629+
output:
630+
path "compatibility_check.txt", emit: report
631+
632+
script:
633+
"""
634+
# Index the genome FASTA and extract chromosome names from column 1 of the .fai
635+
samtools faidx ${fasta}
636+
cut -f1 ${fasta}.fai | sort > genome_chroms.txt
637+
TOTAL_GENOME=\$(wc -l < genome_chroms.txt)
638+
639+
# Extract unique chromosome names from column 1 of the GTF (skip comment lines)
640+
grep -v '^#' ${gtf} | cut -f1 | sort -u > gtf_chroms.txt
641+
642+
# Count how many genome chromosomes appear in the GTF
643+
FOUND_IN_GTF=\$(comm -12 genome_chroms.txt gtf_chroms.txt | wc -l)
644+
645+
{
646+
echo "Genome chromosomes (from .fai): \$TOTAL_GENOME"
647+
echo "Genome chromosomes found in GTF: \$FOUND_IN_GTF"
648+
} | tee compatibility_check.txt
649+
650+
if [ "\$TOTAL_GENOME" -eq 0 ]; then
651+
echo "ERROR: No chromosomes found in genome FASTA index." >&2
652+
exit 1
653+
fi
654+
655+
# Require at least 50% of genome chromosomes to be present in the GTF.
656+
# Uses integer arithmetic: found*100 < total*50 ↔ found/total < 0.5
657+
PCT=\$(( FOUND_IN_GTF * 100 / TOTAL_GENOME ))
658+
if [ \$(( FOUND_IN_GTF * 100 )) -lt \$(( TOTAL_GENOME * 50 )) ]; then
659+
echo "ERROR: Only \${FOUND_IN_GTF}/\${TOTAL_GENOME} genome chromosomes (\${PCT}%) are present in the GTF." >&2
660+
echo "At least 50% of genome chromosomes must appear in the GTF." >&2
661+
echo "Please verify that the genome FASTA and GTF correspond to the same reference assembly," >&2
662+
echo "or set '--check_gtf_compatibility false' to skip this check." >&2
663+
exit 1
664+
fi
665+
666+
echo "OK: \${FOUND_IN_GTF}/\${TOTAL_GENOME} genome chromosomes (\${PCT}%) present in GTF." | tee -a compatibility_check.txt
667+
"""
668+
}
669+
616670
process transcriptome_annotation_enrichment {
617671
publishDir "${params.outdir}/transcriptome", mode: 'copy'
618672
label 'process_high_memory_low_cpu'

nextflow.config

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ params {
2929
ref = "Homo_sapiens.GRCh38.dna.primary_assembly.fa"
3030
reference_gtf = "gencode.v42.annotation.gtf"
3131

32+
// Set to false to skip when uncharacterised scaffolds are expected to be absent from the GTF
33+
check_gtf_compatibility = false
34+
3235
// Visualization
3336
num_reads = 10
3437
figwidth = 15.0

0 commit comments

Comments
 (0)