Skip to content

Commit 25d107f

Browse files
authored
Minor bug fixes and updates to make the malaria comparison pipelines run to completion on the new test sets (#510)
- Updated ReblockGVCFs docker image to `broadinstitute/gatk-nightly:2025-08-29-4.6.2.0-17-g2a1f41bf3-NIGHTLY-SNAPSHOT` to fix an exception with certain spanning deletions.
- Updates to malaria workflows for comparisons (`BroadOnPremMalariaPipeline_2_JointVariantCalling.wdl`, `SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS.wdl`, `SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR.wdl`, `BroadOnPremMalariaPipelineTasks.wdl`).
- Fixed a bug in SplitContigToIntervals that caused the resulting bed files to sort out of order.
- Added `SplitMultiSampleVCF.wdl`, which splits a multi-sample VCF into an array of single-sample VCFs using `bcftools +split`.
- Added `SplitMultiSampleVCF.wdl` to dockstore.
- Added bcftools plugins to the lr-basic docker image.
- Updated lr-basic to v0.1.3.
1 parent fba2a46 commit 25d107f

File tree

10 files changed

+206
-25
lines changed

10 files changed

+206
-25
lines changed

.dockstore.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,3 +180,6 @@ workflows:
180180
- name: RemoveSingleOrganismContamination
181181
subclass: wdl
182182
primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/RemoveSingleOrganismContamination.wdl
183+
- name: SplitMultiSampleVCF
184+
subclass: wdl
185+
primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/SplitMultiSampleVCF.wdl

docker/lr-basic/Dockerfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ CMD ["bash"]
5151
# copy from previous stage the binaries from samtools build
5252
COPY --from=0 /usr/local/bin/* /usr/local/bin/
5353

54+
# copy bcftools plugins:
55+
COPY --from=0 /usr/local/libexec/bcftools/ /usr/local/libexec/bcftools/
56+
5457
#### Basic utilities
5558
ARG DEBIAN_FRONTEND=noninteractive
5659
RUN apt-get -qqy update --fix-missing && \

docker/lr-basic/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
VERSION = 0.1.2
1+
VERSION = 0.1.3
22
TAG1 = us.gcr.io/broad-dsp-lrma/lr-basic:$(VERSION)
33
TAG2 = us.gcr.io/broad-dsp-lrma/lr-basic:latest
44

wdl/pipelines/TechAgnostic/Utility/SplitMultiSampleVCF.wdl

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
version 1.0

import "../../../structs/Structs.wdl"

workflow SplitMultiSampleVCF {

    meta {
        description: "Split a multi-sample VCF into individual compressed VCF files, one per sample, with corresponding index files."
        author: "Jonn Smith"
    }

    parameter_meta {
        input_vcf: "Multi-sample VCF file (can be compressed or uncompressed)"
        input_vcf_index: "Index file for the input VCF (required if VCF is compressed)"
        num_samples: "Number of samples in the input VCF (optional; default: 100)"
    }

    input {
        File input_vcf
        File? input_vcf_index

        Int num_samples = 100
    }

    call SplitMultiSampleVCFTask {
        input:
            input_vcf = input_vcf,
            input_vcf_index = input_vcf_index,
            num_samples = num_samples
    }

    output {
        Array[File] sample_vcfs = SplitMultiSampleVCFTask.output_vcfs
        Array[File] sample_vcf_indices = SplitMultiSampleVCFTask.output_vcf_indices
    }
}

task SplitMultiSampleVCFTask {
    meta {
        description: "Split a multi-sample VCF into individual compressed VCF files, one per sample, with corresponding index files"
    }

    parameter_meta {
        input_vcf: "Multi-sample VCF file (can be compressed or uncompressed)"
        input_vcf_index: "Index file for the input VCF (required if VCF is compressed)"
        num_samples: "Number of samples in the input VCF (optional; default: 100)"
        runtime_attr_override: "Override default runtime attributes"
    }

    input {
        File input_vcf
        File? input_vcf_index

        Int num_samples = 100

        RuntimeAttr? runtime_attr_override
    }

    # FIX: the original expression `size([input_vcf, input_vcf_index], "GB")` builds an
    # Array[File?] literal; when `input_vcf_index` is not supplied the array contains an
    # undefined optional and size() fails at evaluation time.  select_all() drops the
    # unset entry so the expression is safe whether or not the index is provided.
    # `num_samples` scales the disk because `bcftools +split` writes one VCF per sample.
    Int disk_size = 10 + num_samples * ceil(size(select_all([input_vcf, input_vcf_index]), "GB"))

    command <<<
        set -euxo pipefail

        mkdir -p out_dir

        # -Oz2   : bgzipped VCF output at compression level 2 (favors speed over size)
        # -W=tbi : write a .tbi index next to each output VCF
        #          (NOTE(review): -W/--write-index requires bcftools >= 1.18 — the
        #           lr-basic:0.1.3 image added the bcftools plugins in this commit;
        #           confirm its bcftools version supports -W.)
        # -o     : for the +split plugin this is the output *directory*, one
        #          <sample>.vcf.gz per sample in the input header
        bcftools +split ~{input_vcf} -Oz2 -W=tbi -o out_dir
    >>>

    output {
        Array[File] output_vcfs = glob("out_dir/*.vcf.gz")
        Array[File] output_vcf_indices = glob("out_dir/*.vcf.gz.tbi")
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores:          2,
        mem_gb:             8,
        disk_gb:            disk_size,
        boot_disk_gb:       25,
        preemptible_tries:  2,
        max_retries:        1,
        docker:             "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.3"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu:            select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory:         select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks:          "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible:    select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries:     select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker:         select_first([runtime_attr.docker, default_attr.docker])
    }
}

wdl/pipelines/Z_One_Off_Analyses/BroadOnPremMalariaPipeline_2_JointVariantCalling.wdl

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,18 @@ workflow BroadOnPremMalariaPipeline_2_JointVariantCalling {
3030
prefix = sample_name,
3131
input_vcfs = vcf_files,
3232
input_vcf_indices = vcf_index_files,
33-
reference_fasta = ref_map["reference_fasta"],
34-
reference_fai = ref_map["reference_fai"],
35-
reference_dict = ref_map["reference_dict"]
33+
reference_fasta = ref_map["fasta"],
34+
reference_fai = ref_map["fai"],
35+
reference_dict = ref_map["dict"]
3636
}
3737
3838
call BroadOnPremMalariaPipelineTasks.VariantRecalibrator as t_002_VariantRecalibrator {
3939
input:
4040
prefix = sample_name,
4141
input_vcf = t_001_GenotypeGVCFs.vcf,
42-
reference_fasta = ref_map["reference_fasta"],
43-
reference_fai = ref_map["reference_fai"],
44-
reference_dict = ref_map["reference_dict"],
42+
reference_fasta = ref_map["fasta"],
43+
reference_fai = ref_map["fai"],
44+
reference_dict = ref_map["dict"],
4545
resource_vcf_7g8_gb4 = resource_vcf_7g8_gb4,
4646
resource_vcf_hb3_dd2 = resource_vcf_hb3_dd2,
4747
resource_vcf_3d7_hb3 = resource_vcf_3d7_hb3
@@ -73,7 +73,13 @@ task GenotypeGVCFs {
7373
RuntimeAttr? runtime_attr_override
7474
}
7575

76-
Int disk_size = 1 + 10*ceil(size([input_vcfs, reference_fasta], "GB"))
76+
Int disk_size = 1 + 10*ceil(
77+
size(input_vcfs, "GB")
78+
+ size(input_vcf_indices, "GB")
79+
+ size(reference_fasta, "GB")
80+
+ size(reference_fai, "GB")
81+
+ size(reference_dict, "GB")
82+
)
7783

7884
command <<<
7985
################################
@@ -108,7 +114,7 @@ task GenotypeGVCFs {
108114
RuntimeAttr default_attr = object {
109115
cpu_cores: 2,
110116
mem_gb: 16,
111-
disk_gb: disk_size,
117+
disk_gb: disk_size, # This uses the variable calculated above
112118
boot_disk_gb: 25,
113119
preemptible_tries: 1,
114120
max_retries: 1,

wdl/pipelines/Z_One_Off_Analyses/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import "../../tasks/Utility/Utils.wdl" as UTILS
66
import "../../tasks/Utility/Finalize.wdl" as FF
77
import "../../tasks/Z_One_Off_Analyses/Pf_Niare_HaplotypeCaller.wdl" as Niare_HC
88

9-
workflow SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR {
9+
workflow SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS {
1010

1111
meta {
1212
author: "Jonn Smith"

wdl/pipelines/Z_One_Off_Analyses/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ workflow SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR {
145145
prefix = prefix + "." + contig,
146146
}
147147
148-
call Niare_HC.ApplyVqsrIndel as ApplyVqsrSnp {
148+
call Niare_HC.ApplyVqsrSnp as ApplyVqsrSnp {
149149
input:
150150
input_vcf = ApplyVqsrIndel.output_vcf,
151151
input_vcf_index = ApplyVqsrIndel.output_vcf_index,

wdl/tasks/Utility/Utils.wdl

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2092,11 +2092,6 @@ task SplitContigToIntervals {
20922092
String contig
20932093
Int size = 200000
20942094

2095-
File ref_fasta
2096-
File ref_fasta_fai
2097-
2098-
String prefix
2099-
21002095
RuntimeAttr? runtime_attr_override
21012096
}
21022097

@@ -2105,16 +2100,20 @@ task SplitContigToIntervals {
21052100
command <<<
21062101
set -euxo pipefail
21072102
2108-
cat ~{ref_dict} | awk '{print $2,$3}' | grep '^SN' | sed -e 's@SN:@@' -e 's@LN:@@' | tr ' ' '\t' > genome.txt
2103+
awk '{print $2,$3}' ~{ref_dict} | grep '^SN' | sed -e 's@SN:@@' -e 's@LN:@@' | tr ' ' '\t' > genome.txt
21092104
grep "~{contig}" genome.txt > genome.contig.txt
21102105
21112106
bedtools makewindows -g genome.contig.txt -w ~{size} > ~{contig}.~{size}bp_intervals.bed
21122107
2108+
max_pos=$(tail -n1 ~{contig}.~{size}bp_intervals.bed | awk '{print $3}')
2109+
21132110
# Make individual bed files from each line:
2114-
while read line ; do
2111+
# NOTE: We need to add leading zeros here for sorting purposes.
2112+
while read -r line ; do
21152113
start=$(echo "${line}" | cut -d $'\t' -f 2)
21162114
end=$(echo "${line}" | cut -d $'\t' -f 3)
2117-
echo "${line}" > ~{contig}.${start}-${end}.single_interval.bed
2115+
new_fn=$(printf "%s.%0${#max_pos}d-%0${#max_pos}d.single_interval.bed" ~{contig} "${start}" "${end}")
2116+
echo "${line}" > "${new_fn}"
21182117
done < ~{contig}.~{size}bp_intervals.bed
21192118
>>>
21202119

wdl/tasks/VariantCalling/HaplotypeCaller.wdl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,8 @@ task ReblockGVCF {
406406
boot_disk_gb: 25,
407407
preemptible_tries: 1,
408408
max_retries: 1,
409-
docker: "broadinstitute/gatk-nightly:2024-04-16-4.5.0.0-25-g986cb1549-NIGHTLY-SNAPSHOT"
409+
# docker: "broadinstitute/gatk-nightly:2024-04-16-4.5.0.0-25-g986cb1549-NIGHTLY-SNAPSHOT"
410+
docker: "broadinstitute/gatk-nightly:2025-08-29-4.6.2.0-17-g2a1f41bf3-NIGHTLY-SNAPSHOT"
410411
}
411412
# TODO: Fix this docker image to a stable version after the next GATK release!
412413

wdl/tasks/Z_One_Off_Analyses/BroadOnPremMalariaPipelineTasks.wdl

Lines changed: 80 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,9 @@ task SortCompressIndexVcf {
142142
RuntimeAttr? runtime_attr_override
143143
}
144144

145-
Int disk_size = 10 + 10*ceil(size(input_vcf, "GB"))
145+
Int disk_size = 10 + 10*ceil(3*size(input_vcf, "GB"))
146+
147+
String output_vcf = basename(input_vcf) + ".gz"
146148

147149
command <<<
148150
################################
@@ -159,14 +161,87 @@ task SortCompressIndexVcf {
159161
tot_mem_mb=$(free -m | grep '^Mem' | awk '{print $2}')
160162
161163
################################
164+
165+
# Sort first because otherwise we'll end up with integers in the INFO fields again.
166+
bcftools sort -m$((tot_mem_mb-2048))M -o tmp.vcf ~{input_vcf}
167+
168+
# Then we need to fix the integer values in the floating point INFO fields.
169+
# Without this fix / hack, downstream GATK3 tools will fail (specifically GenotypeGVCFs)
170+
awk -f - "tmp.vcf" > tmp2.vcf << 'AWK_CODE'
171+
BEGIN {
172+
FS = "\t"; OFS = "\t"
173+
174+
# Define the set of ID keys that require floating point enforcement
175+
targets["BaseQRankSum"] = 1
176+
targets["ClippingRankSum"] = 1
177+
targets["ExcessHet"] = 1
178+
targets["HaplotypeScore"] = 1
179+
targets["InbreedingCoeff"] = 1
180+
targets["MLEAF"] = 1
181+
targets["MQ"] = 1
182+
targets["MQRankSum"] = 1
183+
targets["RAW_MQ"] = 1
184+
targets["ReadPosRankSum"] = 1
185+
}
186+
187+
# Pass header lines through unchanged
188+
/^#/ { print; next }
189+
190+
{
191+
# Column 8 is the INFO column
192+
# Split the INFO string by semicolon into an array
193+
n = split($8, info_fields, ";")
194+
195+
new_info_str = ""
196+
197+
for (i = 1; i <= n; i++) {
198+
# Split Key=Value pairs
199+
# We check if split returns 2 parts to avoid breaking on Boolean Flags
200+
if (split(info_fields[i], kv, "=") == 2) {
201+
key = kv[1]
202+
val = kv[2]
203+
204+
# Check if this key is in our target list
205+
if (key in targets) {
206+
# Handle Number=A (comma-separated lists) like MLEAF
207+
m = split(val, subvals, ",")
208+
new_val_str = ""
209+
210+
for (j = 1; j <= m; j++) {
211+
# Regex Check: Match strictly integers (optional - sign, digits only)
212+
# This ignores values that are already floats (contain a dot)
213+
if (subvals[j] ~ /^-?[0-9]+$/) {
214+
subvals[j] = subvals[j] ".0"
215+
}
216+
# Reconstruct comma-separated list
217+
new_val_str = (j == 1 ? "" : new_val_str ",") subvals[j]
218+
}
219+
# Update the field with the new value
220+
info_fields[i] = key "=" new_val_str
221+
}
222+
}
223+
# Reconstruct the semicolon-separated INFO string
224+
new_info_str = (i == 1 ? "" : new_info_str ";") info_fields[i]
225+
}
226+
227+
# Replace the INFO column and print the line
228+
$8 = new_info_str
229+
print
230+
}
231+
AWK_CODE
232+
233+
################################
234+
235+
# Zip it:
236+
bgzip -c -l2 tmp2.vcf > ~{output_vcf}
162237
163-
bcftools sort -m$((tot_mem_mb-2048))M -Oz2 -o ~{input_vcf}.gz ~{input_vcf}
164-
bcftools index --threads ${np} --tbi ~{input_vcf}.gz
238+
# Index the output:
239+
bcftools index --threads ${np} --tbi ~{output_vcf}
165240
>>>
166241

167242
output {
168-
File vcf = "~{input_vcf}.gz"
169-
File vcf_index = "~{input_vcf}.gz.tbi"
243+
File vcf = output_vcf
244+
File vcf_index = "~{output_vcf}.tbi"
170245
}
171246

172247
#########################

0 commit comments

Comments
 (0)