Skip to content

Commit 25d107f

Browse files
authored
Minor bug fixes and updates to make the malaria comparison pipelines run to completion on the new test sets (#510)
- Updated ReblockGVCFs docker image to `broadinstitute/gatk-nightly:2025-08-29-4.6.2.0-17-g2a1f41bf3-NIGHTLY-SNAPSHOT` to fix an exception with certain spanning deletions.
- Updates to malaria workflows for comparisons (`BroadOnPremMalariaPipeline_2_JointVariantCalling.wdl`, `SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS.wdl`, `SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR.wdl`, `BroadOnPremMalariaPipelineTasks.wdl`).
- Fixed a bug in SplitContigToIntervals that caused the resulting bed files to sort out of order.
- Added `SplitMultiSampleVCF.wdl`, which splits a multi-sample VCF into an array of single-sample VCFs using `bcftools +split`.
- Added `SplitMultiSampleVCF.wdl` to dockstore.
- Added bcftools plugins to the lr-basic docker image.
- Updated lr-basic to v0.1.3.
1 parent fba2a46 commit 25d107f

File tree

10 files changed

+206
-25
lines changed

10 files changed

+206
-25
lines changed

.dockstore.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,3 +180,6 @@ workflows:
180180
- name: RemoveSingleOrganismContamination
181181
subclass: wdl
182182
primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/RemoveSingleOrganismContamination.wdl
183+
- name: SplitMultiSampleVCF
184+
subclass: wdl
185+
primaryDescriptorPath: /wdl/pipelines/TechAgnostic/Utility/SplitMultiSampleVCF.wdl

docker/lr-basic/Dockerfile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,9 @@ CMD ["bash"]
5151
# copy from previous stage the binaries from samtools build
5252
COPY --from=0 /usr/local/bin/* /usr/local/bin/
5353

54+
# copy bcftools plugins:
55+
COPY --from=0 /usr/local/libexec/bcftools/ /usr/local/libexec/bcftools/
56+
5457
#### Basic utilities
5558
ARG DEBIAN_FRONTEND=noninteractive
5659
RUN apt-get -qqy update --fix-missing && \

docker/lr-basic/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
VERSION = 0.1.2
1+
VERSION = 0.1.3
22
TAG1 = us.gcr.io/broad-dsp-lrma/lr-basic:$(VERSION)
33
TAG2 = us.gcr.io/broad-dsp-lrma/lr-basic:latest
44

wdl/pipelines/TechAgnostic/Utility/SplitMultiSampleVCF.wdl

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
version 1.0

import "../../../structs/Structs.wdl"

workflow SplitMultiSampleVCF {

    meta {
        description: "Split a multi-sample VCF into individual compressed VCF files, one per sample, with corresponding index files."
        author: "Jonn Smith"
    }

    parameter_meta {
        input_vcf: "Multi-sample VCF file (can be compressed or uncompressed)"
        input_vcf_index: "Index file for the input VCF (required if VCF is compressed)"
        num_samples: "Number of samples in the input VCF (optional; default: 100)"
    }

    input {
        File input_vcf
        File? input_vcf_index

        Int num_samples = 100
    }

    call SplitMultiSampleVCFTask {
        input:
            input_vcf = input_vcf,
            input_vcf_index = input_vcf_index,
            num_samples = num_samples
    }

    output {
        Array[File] sample_vcfs = SplitMultiSampleVCFTask.output_vcfs
        Array[File] sample_vcf_indices = SplitMultiSampleVCFTask.output_vcf_indices
    }
}

task SplitMultiSampleVCFTask {
    meta {
        description: "Split a multi-sample VCF into individual compressed VCF files, one per sample, with corresponding index files"
    }

    parameter_meta {
        input_vcf: "Multi-sample VCF file (can be compressed or uncompressed)"
        input_vcf_index: "Index file for the input VCF (required if VCF is compressed)"
        num_samples: "Number of samples in the input VCF (optional; default: 100)"
        runtime_attr_override: "Override default runtime attributes"
    }

    input {
        File input_vcf
        File? input_vcf_index

        Int num_samples = 100

        RuntimeAttr? runtime_attr_override
    }

    # FIX: the original expression `size([input_vcf, input_vcf_index], "GB")` builds an
    # Array[File?] literal; when `input_vcf_index` is not supplied the array contains an
    # undefined optional and size() fails at evaluation time.  select_all() drops the
    # unset entry so the expression is safe whether or not the index is provided.
    # `num_samples` scales the disk because `bcftools +split` writes one VCF per sample.
    Int disk_size = 10 + num_samples * ceil(size(select_all([input_vcf, input_vcf_index]), "GB"))

    command <<<
        set -euxo pipefail

        mkdir -p out_dir

        # -Oz2   : bgzipped VCF output at compression level 2 (favors speed over size)
        # -W=tbi : write a .tbi index next to each output VCF
        #          (NOTE(review): -W/--write-index requires bcftools >= 1.18 — the
        #           lr-basic:0.1.3 image added the bcftools plugins in this commit;
        #           confirm its bcftools version supports -W.)
        # -o     : for the +split plugin this is the output *directory*, one
        #          <sample>.vcf.gz per sample in the input header
        bcftools +split ~{input_vcf} -Oz2 -W=tbi -o out_dir
    >>>

    output {
        Array[File] output_vcfs = glob("out_dir/*.vcf.gz")
        Array[File] output_vcf_indices = glob("out_dir/*.vcf.gz.tbi")
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores:          2,
        mem_gb:             8,
        disk_gb:            disk_size,
        boot_disk_gb:       25,
        preemptible_tries:  2,
        max_retries:        1,
        docker:             "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.3"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu:            select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory:         select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks:          "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible:    select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries:     select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker:         select_first([runtime_attr.docker, default_attr.docker])
    }
}

wdl/pipelines/Z_One_Off_Analyses/BroadOnPremMalariaPipeline_2_JointVariantCalling.wdl

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,18 @@ workflow BroadOnPremMalariaPipeline_2_JointVariantCalling {
3030
prefix = sample_name,
3131
input_vcfs = vcf_files,
3232
input_vcf_indices = vcf_index_files,
33-
reference_fasta = ref_map["reference_fasta"],
34-
reference_fai = ref_map["reference_fai"],
35-
reference_dict = ref_map["reference_dict"]
33+
reference_fasta = ref_map["fasta"],
34+
reference_fai = ref_map["fai"],
35+
reference_dict = ref_map["dict"]
3636
}
3737
3838
call BroadOnPremMalariaPipelineTasks.VariantRecalibrator as t_002_VariantRecalibrator {
3939
input:
4040
prefix = sample_name,
4141
input_vcf = t_001_GenotypeGVCFs.vcf,
42-
reference_fasta = ref_map["reference_fasta"],
43-
reference_fai = ref_map["reference_fai"],
44-
reference_dict = ref_map["reference_dict"],
42+
reference_fasta = ref_map["fasta"],
43+
reference_fai = ref_map["fai"],
44+
reference_dict = ref_map["dict"],
4545
resource_vcf_7g8_gb4 = resource_vcf_7g8_gb4,
4646
resource_vcf_hb3_dd2 = resource_vcf_hb3_dd2,
4747
resource_vcf_3d7_hb3 = resource_vcf_3d7_hb3
@@ -73,7 +73,13 @@ task GenotypeGVCFs {
7373
RuntimeAttr? runtime_attr_override
7474
}
7575

76-
Int disk_size = 1 + 10*ceil(size([input_vcfs, reference_fasta], "GB"))
76+
Int disk_size = 1 + 10*ceil(
77+
size(input_vcfs, "GB")
78+
+ size(input_vcf_indices, "GB")
79+
+ size(reference_fasta, "GB")
80+
+ size(reference_fai, "GB")
81+
+ size(reference_dict, "GB")
82+
)
7783

7884
command <<<
7985
################################
@@ -108,7 +114,7 @@ task GenotypeGVCFs {
108114
RuntimeAttr default_attr = object {
109115
cpu_cores: 2,
110116
mem_gb: 16,
111-
disk_gb: disk_size,
117+
disk_gb: disk_size, # This uses the variable calculated above
112118
boot_disk_gb: 25,
113119
preemptible_tries: 1,
114120
max_retries: 1,

wdl/pipelines/Z_One_Off_Analyses/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ import "../../tasks/Utility/Utils.wdl" as UTILS
66
import "../../tasks/Utility/Finalize.wdl" as FF
77
import "../../tasks/Z_One_Off_Analyses/Pf_Niare_HaplotypeCaller.wdl" as Niare_HC
88

9-
workflow SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR {
9+
workflow SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS {
1010

1111
meta {
1212
author: "Jonn Smith"

wdl/pipelines/Z_One_Off_Analyses/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,7 @@ workflow SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR {
145145
prefix = prefix + "." + contig,
146146
}
147147
148-
call Niare_HC.ApplyVqsrIndel as ApplyVqsrSnp {
148+
call Niare_HC.ApplyVqsrSnp as ApplyVqsrSnp {
149149
input:
150150
input_vcf = ApplyVqsrIndel.output_vcf,
151151
input_vcf_index = ApplyVqsrIndel.output_vcf_index,

wdl/tasks/Utility/Utils.wdl

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2092,11 +2092,6 @@ task SplitContigToIntervals {
20922092
String contig
20932093
Int size = 200000
20942094

2095-
File ref_fasta
2096-
File ref_fasta_fai
2097-
2098-
String prefix
2099-
21002095
RuntimeAttr? runtime_attr_override
21012096
}
21022097

@@ -2105,16 +2100,20 @@ task SplitContigToIntervals {
21052100
command <<<
21062101
set -euxo pipefail
21072102
2108-
cat ~{ref_dict} | awk '{print $2,$3}' | grep '^SN' | sed -e 's@SN:@@' -e 's@LN:@@' | tr ' ' '\t' > genome.txt
2103+
awk '{print $2,$3}' ~{ref_dict} | grep '^SN' | sed -e 's@SN:@@' -e 's@LN:@@' | tr ' ' '\t' > genome.txt
21092104
grep "~{contig}" genome.txt > genome.contig.txt
21102105
21112106
bedtools makewindows -g genome.contig.txt -w ~{size} > ~{contig}.~{size}bp_intervals.bed
21122107
2108+
max_pos=$(tail -n1 ~{contig}.~{size}bp_intervals.bed | awk '{print $3}')
2109+
21132110
# Make individual bed files from each line:
2114-
while read line ; do
2111+
# NOTE: We need to add leading zeros here for sorting purposes.
2112+
while read -r line ; do
21152113
start=$(echo "${line}" | cut -d $'\t' -f 2)
21162114
end=$(echo "${line}" | cut -d $'\t' -f 3)
2117-
echo "${line}" > ~{contig}.${start}-${end}.single_interval.bed
2115+
new_fn=$(printf "%s.%0${#max_pos}d-%0${#max_pos}d.single_interval.bed" ~{contig} "${start}" "${end}")
2116+
echo "${line}" > "${new_fn}"
21182117
done < ~{contig}.~{size}bp_intervals.bed
21192118
>>>
21202119

wdl/tasks/VariantCalling/HaplotypeCaller.wdl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -406,7 +406,8 @@ task ReblockGVCF {
406406
boot_disk_gb: 25,
407407
preemptible_tries: 1,
408408
max_retries: 1,
409-
docker: "broadinstitute/gatk-nightly:2024-04-16-4.5.0.0-25-g986cb1549-NIGHTLY-SNAPSHOT"
409+
# docker: "broadinstitute/gatk-nightly:2024-04-16-4.5.0.0-25-g986cb1549-NIGHTLY-SNAPSHOT"
410+
docker: "broadinstitute/gatk-nightly:2025-08-29-4.6.2.0-17-g2a1f41bf3-NIGHTLY-SNAPSHOT"
410411
}
411412
# TODO: Fix this docker image to a stable version after the next GATK release!
412413

wdl/tasks/Z_One_Off_Analyses/BroadOnPremMalariaPipelineTasks.wdl

Lines changed: 80 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,9 @@ task SortCompressIndexVcf {
142142
RuntimeAttr? runtime_attr_override
143143
}
144144

145-
Int disk_size = 10 + 10*ceil(size(input_vcf, "GB"))
145+
Int disk_size = 10 + 10*ceil(3*size(input_vcf, "GB"))
146+
147+
String output_vcf = basename(input_vcf) + ".gz"
146148

147149
command <<<
148150
################################
@@ -159,14 +161,87 @@ task SortCompressIndexVcf {
159161
tot_mem_mb=$(free -m | grep '^Mem' | awk '{print $2}')
160162
161163
################################
164+
165+
# Sort first because otherwise we'll end up with integers in the INFO fields again.
166+
bcftools sort -m$((tot_mem_mb-2048))M -o tmp.vcf ~{input_vcf}
167+
168+
# Then we need to fix the integer values in the floating point INFO fields.
169+
# Without this fix / hack, downstream GATK3 tools will fail (specifically GenotypeGVCFs)
170+
awk -f - "tmp.vcf" > tmp2.vcf << 'AWK_CODE'
171+
BEGIN {
172+
FS = "\t"; OFS = "\t"
173+
174+
# Define the set of ID keys that require floating point enforcement
175+
targets["BaseQRankSum"] = 1
176+
targets["ClippingRankSum"] = 1
177+
targets["ExcessHet"] = 1
178+
targets["HaplotypeScore"] = 1
179+
targets["InbreedingCoeff"] = 1
180+
targets["MLEAF"] = 1
181+
targets["MQ"] = 1
182+
targets["MQRankSum"] = 1
183+
targets["RAW_MQ"] = 1
184+
targets["ReadPosRankSum"] = 1
185+
}
186+
187+
# Pass header lines through unchanged
188+
/^#/ { print; next }
189+
190+
{
191+
# Column 8 is the INFO column
192+
# Split the INFO string by semicolon into an array
193+
n = split($8, info_fields, ";")
194+
195+
new_info_str = ""
196+
197+
for (i = 1; i <= n; i++) {
198+
# Split Key=Value pairs
199+
# We check if split returns 2 parts to avoid breaking on Boolean Flags
200+
if (split(info_fields[i], kv, "=") == 2) {
201+
key = kv[1]
202+
val = kv[2]
203+
204+
# Check if this key is in our target list
205+
if (key in targets) {
206+
# Handle Number=A (comma-separated lists) like MLEAF
207+
m = split(val, subvals, ",")
208+
new_val_str = ""
209+
210+
for (j = 1; j <= m; j++) {
211+
# Regex Check: Match strictly integers (optional - sign, digits only)
212+
# This ignores values that are already floats (contain a dot)
213+
if (subvals[j] ~ /^-?[0-9]+$/) {
214+
subvals[j] = subvals[j] ".0"
215+
}
216+
# Reconstruct comma-separated list
217+
new_val_str = (j == 1 ? "" : new_val_str ",") subvals[j]
218+
}
219+
# Update the field with the new value
220+
info_fields[i] = key "=" new_val_str
221+
}
222+
}
223+
# Reconstruct the semicolon-separated INFO string
224+
new_info_str = (i == 1 ? "" : new_info_str ";") info_fields[i]
225+
}
226+
227+
# Replace the INFO column and print the line
228+
$8 = new_info_str
229+
print
230+
}
231+
AWK_CODE
232+
233+
################################
234+
235+
# Zip it:
236+
bgzip -c -l2 tmp2.vcf > ~{output_vcf}
162237
163-
bcftools sort -m$((tot_mem_mb-2048))M -Oz2 -o ~{input_vcf}.gz ~{input_vcf}
164-
bcftools index --threads ${np} --tbi ~{input_vcf}.gz
238+
# Index the output:
239+
bcftools index --threads ${np} --tbi ~{output_vcf}
165240
>>>
166241

167242
output {
168-
File vcf = "~{input_vcf}.gz"
169-
File vcf_index = "~{input_vcf}.gz.tbi"
243+
File vcf = output_vcf
244+
File vcf_index = "~{output_vcf}.tbi"
170245
}
171246

172247
#########################

0 commit comments

Comments
 (0)