@@ -25,7 +25,7 @@ workflow BuildIndices {
2525 }
2626
2727 # version of this pipeline
28- String pipeline_version = "5.1.0 "
28+ String pipeline_version = "5.1.1 "
2929
3030 parameter_meta {
3131 annotations_gtf : "the annotation file"
@@ -63,13 +63,19 @@ workflow BuildIndices {
6363 File final_genome_fa = select_first ([RemoveDuplicateMitoContig .cleaned_fasta , genome_fa ])
6464 File final_annotations_gtf = select_first ([append_mito_gtf .out_gtf , annotations_gtf ])
6565
66- # ---- GTF modification block ----
66+ # ---- Always-run GTF gene_name fix ----
67+ call FixGeneNames {
68+ input :
69+ annotation_gtf = final_annotations_gtf
70+ }
71+
72+ # ---- Conditional GTF modification block ----
6773 Boolean is_marmoset = (organism == "marmoset" || organism == "Marmoset" )
6874
6975 if (run_modify_gtf && !is_marmoset ) {
7076 call ModifyGTF {
7177 input :
72- annotation_gtf = final_annotations_gtf ,
78+ annotation_gtf = FixGeneNames . fixed_gtf ,
7379 genome_source = genome_source ,
7480 genome_build = genome_build ,
7581 biotypes = select_first ([biotypes ])
@@ -79,12 +85,12 @@ workflow BuildIndices {
7985 if (run_modify_gtf && is_marmoset ) {
8086 call ModifyGTFMarmoset {
8187 input :
82- annotation_gtf = final_annotations_gtf ,
88+ annotation_gtf = FixGeneNames . fixed_gtf ,
8389 organism = organism
8490 }
8591 }
8692
87- File gtf_for_star = select_first ([ModifyGTF .modified_gtf , ModifyGTFMarmoset .modified_gtf , final_annotations_gtf ])
93+ File gtf_for_star = select_first ([ModifyGTF .modified_gtf , ModifyGTFMarmoset .modified_gtf , FixGeneNames . fixed_gtf ])
8894
8995 call BuildStarSingleNucleus {
9096 input :
@@ -124,6 +130,7 @@ workflow BuildIndices {
124130 Array [File ] recorded_outputs = select_all ([
125131 annotate_with_mitofinder .out_fasta , # File? from conditional block
126132 append_mito_gtf .out_gtf , # File? from conditional block
133+ FixGeneNames .fixed_gtf , # Always-run gene_name fix
127134 ModifyGTF .modified_gtf , # File? from conditional block
128135 ModifyGTFMarmoset .modified_gtf , # File? from conditional block
129136 BuildStarSingleNucleus .star_index ,
@@ -149,6 +156,7 @@ workflow BuildIndices {
149156 input_genome_fa = genome_fa ,
150157 mito_annotated_fasta = annotate_with_mitofinder .out_fasta ,
151158 mito_appended_gtf = append_mito_gtf .out_gtf ,
159+ fixed_gtf = FixGeneNames .fixed_gtf ,
152160 modified_gtf = ModifyGTF .modified_gtf ,
153161 modified_gtf_marmoset = ModifyGTFMarmoset .modified_gtf ,
154162 star_annotation_gtf = BuildStarSingleNucleus .modified_annotation_gtf ,
@@ -350,6 +358,66 @@ task CalculateChromosomeSizes {
350358 }
351359}
352360
361+ task FixGeneNames {
362+   # Ensures records that have a gene_id but no gene_name get gene_name copied from gene_id.
363+   # Always emits an uncompressed fixed_annotation.gtf, even when the input is gzipped.
364+   input {
365+     File annotation_gtf  # plain or gzip-compressed GTF; compression is detected by a .gz suffix
366+   }
367+
368+   meta {
369+     description: "Decompress GTF if needed and fix missing gene_name attributes by copying from gene_id"
370+   }
371+
372+   command <<<
373+     set -eo pipefail
374+
375+     # Decompress GTF if gzipped (input path is quoted so paths containing spaces are not word-split)
376+     if [[ "~{annotation_gtf}" == *.gz ]]; then
377+       echo "Detected gzipped GTF file, decompressing..."
378+       gunzip -c "~{annotation_gtf}" > annotation.gtf
379+       GTF_FILE="annotation.gtf"
380+     else
381+       echo "GTF file is not compressed"
382+       GTF_FILE="~{annotation_gtf}"
383+     fi
384+
385+     # Fix missing gene_name attributes
386+     echo "Checking and fixing gene_name attributes in GTF..."
387+     awk -F '\t' 'BEGIN { OFS="\t" }
388+     /^#/ { print; next }  # header and comment lines pass through untouched
389+     {
390+       # Extract the quoted value following gene_id in the attribute column (field 9), if present
391+       gene_id = ""
392+       if (split($9, a, /gene_id "/) > 1) {
393+         split(a[2], b, "\"")
394+         gene_id = b[1]
395+       }
396+
397+       # Check if gene_name is missing and add it
398+       if ($9 !~ /gene_name/ && gene_id != "") {
399+         sub(/[[:space:]]*;[[:space:]]*$/, "", $9)  # remove trailing semicolons/spaces
400+         $9 = $9 "; gene_name \"" gene_id "\";"
401+       }
402+
403+       print
404+     }' "$GTF_FILE" > fixed_annotation.gtf
405+
406+     echo "GTF gene_name fix complete"
407+   >>>
408+
409+   output {
410+     File fixed_gtf = "fixed_annotation.gtf"
411+   }
412+
413+   runtime {
414+     docker: "ubuntu:20.04"
415+     memory: "4 GiB"
416+     disks: "local-disk 50 HDD"
417+     cpu: 1
418+   }
419+ }
420+
353421task BuildStarSingleNucleus {
354422 input {
355423 # GTF annotation version refers to the version (GENCODE) or release (NCBI) listed in the GTF
@@ -426,41 +494,7 @@ task ModifyGTF {
426494 command <<<
427495 set -eo pipefail
428496
429- # Decompress GTF if gzipped
430- if [[ "~{annotation_gtf}" == *.gz ]]; then
431- echo "Detected gzipped GTF file, decompressing..."
432- gunzip -c ~{annotation_gtf } > annotation.gtf
433- GTF_FILE = "annotation.gtf"
434- else
435- echo "GTF file is not compressed"
436- GTF_FILE = "~{annotation_gtf}"
437- fi
438-
439- # Fix missing gene_name attributes
440- echo "Checking and fixing gene_name attributes in GTF..."
441- awk -F '\t' 'BEGIN { OFS="\t" }
442- /^#/ { print; next }
443- {
444- gene_id = ""; gene_name = "";
445- if ($9 ~ /gene_id/) {
446- n = split($9, a, /gene_id "/)
447- if (n > 1) {
448- split(a[2], b, "\"")
449- gene_id = b[1]
450- }
451- }
452-
453- # Check if gene_name is missing and add it
454- if ($9 !~ /gene_name/ && gene_id != "") {
455- sub(/[[:space:]]*;[[:space:]]*$/, "", $9) # remove trailing semicolons/spaces
456- $9 = $9 "; gene_name \"" gene_id "\";"
457- }
458-
459- print
460- }' "$GTF_FILE " > fixed_annotation.gtf
461-
462- GTF_FILE = "fixed_annotation.gtf"
463- echo "GTF gene_name fix complete"
497+ GTF_FILE = "~{annotation_gtf}"
464498
465499 # Validate GTF contains expected genome build
466500 if head -10 ${GTF_FILE } | grep -qi ~{genome_build }
@@ -513,41 +547,7 @@ task ModifyGTFMarmoset {
513547 command <<<
514548 set -eo pipefail
515549
516- # Decompress GTF if gzipped
517- if [[ "~{annotation_gtf}" == *.gz ]]; then
518- echo "Detected gzipped GTF file, decompressing..."
519- gunzip -c ~{annotation_gtf } > annotation.gtf
520- GTF_FILE = "annotation.gtf"
521- else
522- echo "GTF file is not compressed"
523- GTF_FILE = "~{annotation_gtf}"
524- fi
525-
526- # Fix missing gene_name attributes
527- echo "Checking and fixing gene_name attributes in GTF..."
528- awk -F '\t' 'BEGIN { OFS="\t" }
529- /^#/ { print; next }
530- {
531- gene_id = ""; gene_name = "";
532- if ($9 ~ /gene_id/) {
533- n = split($9, a, /gene_id "/)
534- if (n > 1) {
535- split(a[2], b, "\"")
536- gene_id = b[1]
537- }
538- }
539-
540- # Check if gene_name is missing and add it
541- if ($9 !~ /gene_name/ && gene_id != "") {
542- sub(/[[:space:]]*;[[:space:]]*$/, "", $9) # remove trailing semicolons/spaces
543- $9 = $9 "; gene_name \"" gene_id "\";"
544- }
545-
546- print
547- }' "$GTF_FILE " > fixed_annotation.gtf
548-
549- GTF_FILE = "fixed_annotation.gtf"
550- echo "GTF gene_name fix complete"
550+ GTF_FILE = "~{annotation_gtf}"
551551
552552 # Create marmoset header
553553 echo "Marmoset detected, running header modification"
@@ -641,6 +641,7 @@ task RecordMetadata {
641641 # Optional modification outputs for tracking which steps ran
642642 File ? mito_annotated_fasta
643643 File ? mito_appended_gtf
644+ File fixed_gtf
644645 File ? modified_gtf
645646 File ? modified_gtf_marmoset
646647 File star_annotation_gtf
@@ -695,6 +696,10 @@ task RecordMetadata {
695696 echo " [MitoFinder] Skipped" >> metadata.txt
696697 fi
697698
699+ # FixGeneNames (always runs)
700+ echo " [FixGeneNames] Fixed missing gene_name attributes" >> metadata.txt
701+ echo " Output fixed GTF: $(to_gs '~{fixed_gtf}')" >> metadata.txt
702+
698703 # GTF Modification
699704 if [ "~{run_modify_gtf}" = "true" ]; then
700705 if [ "~{is_marmoset}" = "true" ]; then
0 commit comments