Skip to content

Commit 65713ae

Browse files
committed
test suite and update buildindices refactor with FixGeneNames
1 parent 7fc4c32 commit 65713ae

File tree

3 files changed

+420
-77
lines changed

3 files changed

+420
-77
lines changed

pipelines/wdl/build_indices/BuildIndices.changelog.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,14 @@
1-
# 5.1.0
2-
2026-02-11 (Date of Last Commit)
1+
# 5.1.1
2+
2026-02-13 (Date of Last Commit)
33

44
* refactored to cleanly isolate the MitoAnnotate task and its outputs; mitochondrial deduplication is handled only within the MitoAnnotate task
55
* ModifyGTF and ModifyGTFMarmoset are now their own cleanly separated tasks, and the logic to determine which one to run is handled in the main workflow rather than inside BuildStarSingleNucleus
66
* `skip_gtf_modification` has been replaced by `run_modify_gtf`, a required input with no default, so you must know whether your input GTF has already been modified
77
* updated the metadata.txt output to explicitly include versions and which tasks and file modifications were run in the pipeline
88
* biotypes.tsv is now an optional input, as it is only needed when running the ModifyGTF task (and not when running ModifyGTFMarmoset)
9+
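* bugfix: moved the GTF decompression and gene_name fix (copying gene_id into records that lack a gene_name attribute) out of ModifyGTF and ModifyGTFMarmoset into a new FixGeneNames task, which always runs regardless of `run_modify_gtf`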
10+
* added a test suite for the BuildIndices pipeline, found in verification/test-wdls/scripts/test_buildindices_mito_bug.py, which includes a test for the mitochondrial contig deduplication logic
11+
912

1013
# 5.0.4
1114
2026-02-11 (Date of Last Commit)

pipelines/wdl/build_indices/BuildIndices.wdl

Lines changed: 80 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ workflow BuildIndices {
2525
}
2626

2727
# version of this pipeline
28-
String pipeline_version = "5.1.0"
28+
String pipeline_version = "5.1.1"
2929

3030
parameter_meta {
3131
annotations_gtf: "the annotation file"
@@ -63,13 +63,19 @@ workflow BuildIndices {
6363
File final_genome_fa = select_first([RemoveDuplicateMitoContig.cleaned_fasta, genome_fa])
6464
File final_annotations_gtf = select_first([append_mito_gtf.out_gtf, annotations_gtf])
6565
66-
# ---- GTF modification block ----
66+
# ---- Always-run GTF gene_name fix ----
67+
call FixGeneNames {
68+
input:
69+
annotation_gtf = final_annotations_gtf
70+
}
71+
72+
# ---- Conditional GTF modification block ----
6773
Boolean is_marmoset = (organism == "marmoset" || organism == "Marmoset")
6874
6975
if (run_modify_gtf && !is_marmoset) {
7076
call ModifyGTF {
7177
input:
72-
annotation_gtf = final_annotations_gtf,
78+
annotation_gtf = FixGeneNames.fixed_gtf,
7379
genome_source = genome_source,
7480
genome_build = genome_build,
7581
biotypes = select_first([biotypes])
@@ -79,12 +85,12 @@ workflow BuildIndices {
7985
if (run_modify_gtf && is_marmoset) {
8086
call ModifyGTFMarmoset {
8187
input:
82-
annotation_gtf = final_annotations_gtf,
88+
annotation_gtf = FixGeneNames.fixed_gtf,
8389
organism = organism
8490
}
8591
}
8692
87-
File gtf_for_star = select_first([ModifyGTF.modified_gtf, ModifyGTFMarmoset.modified_gtf, final_annotations_gtf])
93+
File gtf_for_star = select_first([ModifyGTF.modified_gtf, ModifyGTFMarmoset.modified_gtf, FixGeneNames.fixed_gtf])
8894
8995
call BuildStarSingleNucleus {
9096
input:
@@ -124,6 +130,7 @@ workflow BuildIndices {
124130
Array[File] recorded_outputs = select_all([
125131
annotate_with_mitofinder.out_fasta, # File? from conditional block
126132
append_mito_gtf.out_gtf, # File? from conditional block
133+
FixGeneNames.fixed_gtf, # Always-run gene_name fix
127134
ModifyGTF.modified_gtf, # File? from conditional block
128135
ModifyGTFMarmoset.modified_gtf, # File? from conditional block
129136
BuildStarSingleNucleus.star_index,
@@ -149,6 +156,7 @@ workflow BuildIndices {
149156
input_genome_fa = genome_fa,
150157
mito_annotated_fasta = annotate_with_mitofinder.out_fasta,
151158
mito_appended_gtf = append_mito_gtf.out_gtf,
159+
fixed_gtf = FixGeneNames.fixed_gtf,
152160
modified_gtf = ModifyGTF.modified_gtf,
153161
modified_gtf_marmoset = ModifyGTFMarmoset.modified_gtf,
154162
star_annotation_gtf = BuildStarSingleNucleus.modified_annotation_gtf,
@@ -350,6 +358,66 @@ task CalculateChromosomeSizes {
350358
}
351359
}
352360

361+
task FixGeneNames {
362+
input {
363+
File annotation_gtf
364+
}
365+
366+
meta {
367+
description: "Decompress GTF if needed and fix missing gene_name attributes by copying from gene_id"
368+
}
369+
370+
command <<<
371+
set -eo pipefail
372+
373+
# Decompress GTF if gzipped
374+
if [[ "~{annotation_gtf}" == *.gz ]]; then
375+
echo "Detected gzipped GTF file, decompressing..."
376+
gunzip -c ~{annotation_gtf} > annotation.gtf
377+
GTF_FILE="annotation.gtf"
378+
else
379+
echo "GTF file is not compressed"
380+
GTF_FILE="~{annotation_gtf}"
381+
fi
382+
383+
# Fix missing gene_name attributes
384+
echo "Checking and fixing gene_name attributes in GTF..."
385+
awk -F'\t' 'BEGIN { OFS="\t" }
386+
/^#/ { print; next }
387+
{
388+
gene_id = ""; gene_name = "";
389+
if ($9 ~ /gene_id/) {
390+
n = split($9, a, /gene_id "/)
391+
if (n > 1) {
392+
split(a[2], b, "\"")
393+
gene_id = b[1]
394+
}
395+
}
396+
397+
# Check if gene_name is missing and add it
398+
if ($9 !~ /gene_name/ && gene_id != "") {
399+
sub(/[[:space:]]*;[[:space:]]*$/, "", $9) # remove trailing semicolons/spaces
400+
$9 = $9 "; gene_name \"" gene_id "\";"
401+
}
402+
403+
print
404+
}' "$GTF_FILE" > fixed_annotation.gtf
405+
406+
echo "GTF gene_name fix complete"
407+
>>>
408+
409+
output {
410+
File fixed_gtf = "fixed_annotation.gtf"
411+
}
412+
413+
runtime {
414+
docker: "ubuntu:20.04"
415+
memory: "4 GiB"
416+
disks: "local-disk 50 HDD"
417+
cpu: 1
418+
}
419+
}
420+
353421
task BuildStarSingleNucleus {
354422
input {
355423
# GTF annotation version refers to the version (GENCODE) or release (NCBI) listed in the GTF
@@ -426,41 +494,7 @@ task ModifyGTF {
426494
command <<<
427495
set -eo pipefail
428496
429-
# Decompress GTF if gzipped
430-
if [[ "~{annotation_gtf}" == *.gz ]]; then
431-
echo "Detected gzipped GTF file, decompressing..."
432-
gunzip -c ~{annotation_gtf} > annotation.gtf
433-
GTF_FILE="annotation.gtf"
434-
else
435-
echo "GTF file is not compressed"
436-
GTF_FILE="~{annotation_gtf}"
437-
fi
438-
439-
# Fix missing gene_name attributes
440-
echo "Checking and fixing gene_name attributes in GTF..."
441-
awk -F'\t' 'BEGIN { OFS="\t" }
442-
/^#/ { print; next }
443-
{
444-
gene_id = ""; gene_name = "";
445-
if ($9 ~ /gene_id/) {
446-
n = split($9, a, /gene_id "/)
447-
if (n > 1) {
448-
split(a[2], b, "\"")
449-
gene_id = b[1]
450-
}
451-
}
452-
453-
# Check if gene_name is missing and add it
454-
if ($9 !~ /gene_name/ && gene_id != "") {
455-
sub(/[[:space:]]*;[[:space:]]*$/, "", $9) # remove trailing semicolons/spaces
456-
$9 = $9 "; gene_name \"" gene_id "\";"
457-
}
458-
459-
print
460-
}' "$GTF_FILE" > fixed_annotation.gtf
461-
462-
GTF_FILE="fixed_annotation.gtf"
463-
echo "GTF gene_name fix complete"
497+
GTF_FILE="~{annotation_gtf}"
464498
465499
# Validate GTF contains expected genome build
466500
if head -10 ${GTF_FILE} | grep -qi ~{genome_build}
@@ -513,41 +547,7 @@ task ModifyGTFMarmoset {
513547
command <<<
514548
set -eo pipefail
515549
516-
# Decompress GTF if gzipped
517-
if [[ "~{annotation_gtf}" == *.gz ]]; then
518-
echo "Detected gzipped GTF file, decompressing..."
519-
gunzip -c ~{annotation_gtf} > annotation.gtf
520-
GTF_FILE="annotation.gtf"
521-
else
522-
echo "GTF file is not compressed"
523-
GTF_FILE="~{annotation_gtf}"
524-
fi
525-
526-
# Fix missing gene_name attributes
527-
echo "Checking and fixing gene_name attributes in GTF..."
528-
awk -F'\t' 'BEGIN { OFS="\t" }
529-
/^#/ { print; next }
530-
{
531-
gene_id = ""; gene_name = "";
532-
if ($9 ~ /gene_id/) {
533-
n = split($9, a, /gene_id "/)
534-
if (n > 1) {
535-
split(a[2], b, "\"")
536-
gene_id = b[1]
537-
}
538-
}
539-
540-
# Check if gene_name is missing and add it
541-
if ($9 !~ /gene_name/ && gene_id != "") {
542-
sub(/[[:space:]]*;[[:space:]]*$/, "", $9) # remove trailing semicolons/spaces
543-
$9 = $9 "; gene_name \"" gene_id "\";"
544-
}
545-
546-
print
547-
}' "$GTF_FILE" > fixed_annotation.gtf
548-
549-
GTF_FILE="fixed_annotation.gtf"
550-
echo "GTF gene_name fix complete"
550+
GTF_FILE="~{annotation_gtf}"
551551
552552
# Create marmoset header
553553
echo "Marmoset detected, running header modification"
@@ -641,6 +641,7 @@ task RecordMetadata {
641641
# Optional modification outputs for tracking which steps ran
642642
File? mito_annotated_fasta
643643
File? mito_appended_gtf
644+
File fixed_gtf
644645
File? modified_gtf
645646
File? modified_gtf_marmoset
646647
File star_annotation_gtf
@@ -695,6 +696,10 @@ task RecordMetadata {
695696
echo " [MitoFinder] Skipped" >> metadata.txt
696697
fi
697698
699+
# FixGeneNames (always runs)
700+
echo " [FixGeneNames] Fixed missing gene_name attributes" >> metadata.txt
701+
echo " Output fixed GTF: $(to_gs '~{fixed_gtf}')" >> metadata.txt
702+
698703
# GTF Modification
699704
if [ "~{run_modify_gtf}" = "true" ]; then
700705
if [ "~{is_marmoset}" = "true" ]; then

0 commit comments

Comments
 (0)