Skip to content

Commit 9c7a822

Browse files
authored
Merge pull request #628 from broadinstitute/dp-bbnorm
Integrate bbnorm normalization and optimize resource allocation
2 parents 89d8dc0 + c503167 commit 9c7a822

16 files changed

+140
-142
lines changed

pipes/WDL/tasks/tasks_assembly.wdl

Lines changed: 41 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@ task assemble {
1515
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt")
1616

1717
Int? machine_mem_gb
18-
String docker = "quay.io/broadinstitute/viral-assemble:2.5.18.0"
18+
Int? cpu
19+
String docker = "quay.io/broadinstitute/viral-assemble:2.5.21.0"
1920
}
2021
parameter_meta{
2122
reads_unmapped_bam: {
@@ -101,9 +102,9 @@ task assemble {
101102

102103
runtime {
103104
docker: docker
104-
memory: select_first([machine_mem_gb, 63]) + " GB"
105-
cpu: 4
106-
disks: "local-disk " + disk_size + " HDD"
105+
memory: select_first([machine_mem_gb, 32]) + " GB"
106+
cpu: select_first([cpu, 8])
107+
disks: "local-disk " + disk_size + " SSD"
107108
disk: disk_size + " GB" # TES
108109
dx_instance_type: "mem1_ssd1_v2_x8"
109110
maxRetries: 2
@@ -124,7 +125,7 @@ task select_references {
124125
Int? skani_c
125126
Int? skani_n
126127

127-
String docker = "quay.io/broadinstitute/viral-assemble:2.5.18.0"
128+
String docker = "quay.io/broadinstitute/viral-assemble:2.5.21.0"
128129
Int machine_mem_gb = 4
129130
Int cpu = 2
130131
Int disk_size = 100
@@ -193,7 +194,7 @@ task select_references {
193194
docker: docker
194195
memory: machine_mem_gb + " GB"
195196
cpu: cpu
196-
disks: "local-disk " + disk_size + " HDD"
197+
disks: "local-disk " + disk_size + " SSD"
197198
disk: disk_size + " GB" # TES
198199
dx_instance_type: "mem1_ssd1_v2_x2"
199200
preemptible: 2
@@ -204,7 +205,7 @@ task select_references {
204205
task scaffold {
205206
input {
206207
File contigs_fasta
207-
File reads_bam
208+
File? reads_bam
208209
Array[File]+ reference_genome_fasta
209210

210211
String aligner="muscle"
@@ -224,16 +225,22 @@ task scaffold {
224225
Float? scaffold_min_pct_contig_aligned
225226

226227
Int? machine_mem_gb
227-
String docker="quay.io/broadinstitute/viral-assemble:2.5.18.0"
228+
String docker="quay.io/broadinstitute/viral-assemble:2.5.21.0"
228229

229230
# do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name
230231
String sample_name = basename(basename(contigs_fasta, ".fasta"), ".assembly1-spades")
231232
}
233+
234+
# Determine whether to run Gap2Seq based on reads_bam size
235+
# Gap2Seq can take 100+ min for large BAMs (>1GB), providing diminishing returns
236+
Float reads_bam_size_gb = if defined(reads_bam) then size(select_first([reads_bam]), "GB") else 0.0
237+
Boolean run_gap2seq = defined(reads_bam) && reads_bam_size_gb < 1.0
238+
232239
parameter_meta {
233240
reads_bam: {
234-
description: "Reads in BAM format.",
241+
description: "Reads in BAM format. If provided, Gap2Seq will attempt to fill gaps using reads. Skipping this for large BAMs (>1GB) can save significant runtime.",
235242
patterns: ["*.bam"],
236-
category: "required"
243+
category: "optional"
237244
}
238245

239246
contigs_fasta: {
@@ -367,13 +374,19 @@ task scaffold {
367374
fi
368375
grep '^>' "~{sample_name}".scaffolding_chosen_ref.fasta | cut -c 2- | cut -f 1 -d ' ' > "~{sample_name}".scaffolding_chosen_refs.txt
369376
370-
assembly.py gapfill_gap2seq \
371-
"~{sample_name}".intermediate_scaffold.fasta \
372-
"~{reads_bam}" \
373-
"~{sample_name}".intermediate_gapfill.fasta \
374-
--memLimitGb $mem_in_gb \
375-
--maskErrors \
376-
--loglevel=DEBUG
377+
# Run Gap2Seq only if reads_bam is provided and smaller than 1GB
378+
if ~{true='true' false='false' run_gap2seq}; then
379+
assembly.py gapfill_gap2seq \
380+
"~{sample_name}".intermediate_scaffold.fasta \
381+
"~{reads_bam}" \
382+
"~{sample_name}".intermediate_gapfill.fasta \
383+
--memLimitGb $mem_in_gb \
384+
--maskErrors \
385+
--loglevel=DEBUG
386+
else
387+
echo "Skipping Gap2Seq: reads_bam not provided or >= 1GB (~{reads_bam_size_gb} GB)" >&2
388+
cp "~{sample_name}".intermediate_scaffold.fasta "~{sample_name}".intermediate_gapfill.fasta
389+
fi
377390
378391
set +e +o pipefail
379392
grep -v '^>' "~{sample_name}".intermediate_gapfill.fasta | tr -d '\n' | wc -c | tee assembly_preimpute_length
@@ -435,9 +448,9 @@ task scaffold {
435448

436449
runtime {
437450
docker: docker
438-
memory: select_first([machine_mem_gb, 63]) + " GB"
451+
memory: select_first([machine_mem_gb, 20]) + " GB"
439452
cpu: 4
440-
disks: "local-disk " + disk_size + " HDD"
453+
disks: "local-disk " + disk_size + " SSD"
441454
disk: disk_size + " GB" # TES
442455
dx_instance_type: "mem1_ssd1_v2_x8"
443456
maxRetries: 2
@@ -457,7 +470,7 @@ task skani_triangle {
457470
Int compression_factor = 10
458471
Int min_aligned_frac = 15
459472

460-
String docker = "quay.io/broadinstitute/viral-assemble:2.5.18.0"
473+
String docker = "quay.io/broadinstitute/viral-assemble:2.5.21.0"
461474
Int machine_mem_gb = 8
462475
Int cpu = 4
463476
Int disk_size = 100
@@ -696,7 +709,7 @@ task align_reads {
696709

697710
Int? cpu
698711
Int? machine_mem_gb
699-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
712+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
700713

701714
String sample_name = basename(basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt"), ".clean")
702715
}
@@ -707,8 +720,8 @@ task align_reads {
707720
# Linear scaling: 8 + (input_GB / 15) * 56, capped at 64, rounded to nearest multiple of 4
708721
Float cpu_unclamped = 8.0 + (size(reads_unmapped_bam, "GB") / 15.0) * 56.0
709722
Int cpu_actual = select_first([cpu, floor(((if cpu_unclamped > 64.0 then 64.0 else cpu_unclamped) + 2.0) / 4.0) * 4])
710-
# Memory scales with CPU at 2x ratio (default), or use override
711-
Int machine_mem_gb_actual = select_first([machine_mem_gb, cpu_actual * 2])
723+
# Memory defaults to 3 GB per CPU (cpu_actual * 3) unless machine_mem_gb is supplied as an override
724+
Int machine_mem_gb_actual = select_first([machine_mem_gb, cpu_actual * 3])
712725

713726
parameter_meta {
714727
reference_fasta: {
@@ -811,7 +824,7 @@ task align_reads {
811824
docker: docker
812825
memory: machine_mem_gb_actual + " GB"
813826
cpu: cpu_actual
814-
disks: "local-disk " + disk_size + " SSD"
827+
disks: "local-disk " + disk_size + " LOCAL"
815828
disk: disk_size + " GB" # TES
816829
dx_instance_type: "mem1_ssd1_v2_x8"
817830
preemptible: 1
@@ -834,8 +847,8 @@ task refine_assembly_with_aligned_reads {
834847
Float major_cutoff = 0.5
835848
Int min_coverage = 3
836849

837-
Int machine_mem_gb = 15
838-
String docker = "quay.io/broadinstitute/viral-assemble:2.5.18.0"
850+
Int machine_mem_gb = 8
851+
String docker = "quay.io/broadinstitute/viral-assemble:2.5.21.0"
839852
}
840853

841854
Int disk_size = 375
@@ -972,7 +985,7 @@ task run_discordance {
972985
String out_basename = "run"
973986
Int min_coverage = 4
974987

975-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
988+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
976989
}
977990
parameter_meta {
978991
reads_aligned_bam: {
@@ -1221,7 +1234,7 @@ task wgsim {
12211234
Int? random_seed
12221235

12231236
Int machine_mem_gb = 7
1224-
String docker = "quay.io/broadinstitute/viral-assemble:2.5.18.0"
1237+
String docker = "quay.io/broadinstitute/viral-assemble:2.5.21.0"
12251238
}
12261239

12271240
parameter_meta {

pipes/WDL/tasks/tasks_demux.wdl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ task merge_tarballs {
66
String out_filename
77

88
Int? machine_mem_gb
9-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
9+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
1010
}
1111

1212
Int disk_size = 2625
@@ -181,7 +181,7 @@ task illumina_demux {
181181
# --- options for VM shape ----------------------
182182
Int? machine_mem_gb
183183
Int disk_size = 2625
184-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
184+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
185185
}
186186

187187
parameter_meta {
@@ -823,7 +823,7 @@ task get_illumina_run_metadata {
823823
String? sequencing_center
824824

825825
Int? machine_mem_gb
826-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
826+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
827827
}
828828

829829
parameter_meta {
@@ -927,7 +927,7 @@ task demux_fastqs {
927927
Int? machine_mem_gb
928928
Int max_cpu = 32 # Maximum CPU cap for autoscaling (use 16 for 2-barcode, 64 for 3-barcode)
929929
Int disk_size = 750
930-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
930+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
931931
}
932932

933933
# Calculate total input size for autoscaling
@@ -1056,7 +1056,7 @@ task merge_demux_metrics {
10561056
input {
10571057
Array[File]+ metrics_files
10581058
String output_filename = "merged_demux_metrics.txt"
1059-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
1059+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
10601060
}
10611061

10621062
parameter_meta {

pipes/WDL/tasks/tasks_interhost.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ task index_ref {
351351
File? novocraft_license
352352

353353
Int? machine_mem_gb
354-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
354+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
355355
}
356356

357357
Int disk_size = 100

pipes/WDL/tasks/tasks_intrahost.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ task lofreq {
178178
docker: docker
179179
cpu: 2
180180
memory: "3 GB"
181-
disks: "local-disk " + disk_size + " HDD"
181+
disks: "local-disk " + disk_size + " SSD"
182182
disk: disk_size + " GB" # TES
183183
dx_instance_type: "mem1_ssd1_v2_x2"
184184
maxRetries: 2

pipes/WDL/tasks/tasks_megablast.wdl

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ task trim_rmdup_subsamp {
1515
Int cpu = 16
1616
Int disk_size_gb = 100
1717

18-
String docker = "quay.io/broadinstitute/viral-assemble:2.5.18.0"
18+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
1919
}
2020

2121
parameter_meta {
@@ -36,17 +36,14 @@ task trim_rmdup_subsamp {
3636
command <<<
3737
set -ex o pipefail
3838
assembly.py --version | tee VERSION
39-
#BAM ->FASTQ-> OutBam? https://github.com/broadinstitute/viral-assemble:2.5.18.0
4039
assembly.py trim_rmdup_subsamp \
41-
"~{inBam}" \
42-
"~{clipDb}" \
43-
"$(pwd)/outbam.bam" \
44-
~{'--n_reads=' + n_reads}
40+
"~{inBam}" \
41+
"~{clipDb}" \
42+
outbam.bam \
43+
~{'--n_reads=' + n_reads}
4544
46-
47-
#samtools [OutBam -> FASTA]
4845
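# SAM flag reference: -f 4 (f = include only) (4 = unmapped reads) https://broadinstitute.github.io/picard/explain-flags.html
# NOTE(review): the samtools fasta call below does not pass -f 4, so no flag filtering is applied here -- confirm this is intended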
49-
samtools fasta "$(pwd)/outbam.bam" > "~{bam_basename}.fasta"
46+
samtools fasta outbam.bam > "~{bam_basename}.fasta"
5047
>>>
5148

5249
output {
@@ -58,7 +55,6 @@ task trim_rmdup_subsamp {
5855
memory: machine_mem_gb + "GB"
5956
cpu: cpu
6057
disks: "local-disk " + disk_size_gb + " LOCAL"
61-
6258
dx_instance_type: "n2-highmem-4"
6359
}
6460
}

pipes/WDL/tasks/tasks_metagenomics.wdl

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ task krakenuniq {
1111
File krona_taxonomy_db_tgz # taxonomy.tab
1212
1313
Int machine_mem_gb = 320
14-
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" #skip-global-version-pin
14+
String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" #skip-global-version-pin
1515
}
1616

1717
Int disk_size = 750
@@ -148,7 +148,7 @@ task build_krakenuniq_db {
148148
Int? zstd_compression_level
149149

150150
Int machine_mem_gb = 240
151-
String docker = "quay.io/broadinstitute/viral-classify:2.2.4.0" #skip-global-version-pin
151+
String docker = "quay.io/broadinstitute/viral-classify:2.1.33.0" #skip-global-version-pin
152152
}
153153

154154
Int disk_size = 750
@@ -243,7 +243,12 @@ task kraken2 {
243243
}
244244

245245
String out_basename = basename(basename(reads_bam, '.bam'), '.fasta')
246-
Int disk_size = 750
246+
247+
# Disk autoscaling: BAM->FASTQ expansion is ~7-8x, plus kraken2 reads output (~1x input),
248+
# plus kraken2 database (1x localized tarball + 2x decompressed = 3x), plus overhead for krona and temp files.
249+
# Minimum 375GB to accommodate typical database sizes.
250+
Int disk_size_auto = ceil((8 * size(reads_bam, "GB") + 3 * size(kraken2_db_tgz, "GB") + 50) / 375.0) * 375
251+
Int disk_size = if disk_size_auto < 375 then 375 else disk_size_auto
247252

248253
command <<<
249254
set -ex -o pipefail
@@ -329,7 +334,7 @@ task kraken2 {
329334
memory: machine_mem_gb + " GB"
330335
cpu: 16
331336
cpuPlatform: "Intel Ice Lake"
332-
disks: "local-disk " + disk_size + " HDD"
337+
disks: "local-disk " + disk_size + " LOCAL"
333338
disk: disk_size + " GB" # TES
334339
dx_instance_type: "mem3_ssd1_v2_x8"
335340
preemptible: 2
@@ -378,7 +383,7 @@ task report_primary_kraken_taxa {
378383
docker: docker
379384
memory: machine_mem_gb + " GB"
380385
cpu: 1
381-
disks: "local-disk " + disk_size + " LOCAL"
386+
disks: "local-disk " + disk_size + " HDD"
382387
disk: disk_size + " GB" # TES
383388
dx_instance_type: "mem1_ssd1_v2_x2"
384389
preemptible: 2

pipes/WDL/tasks/tasks_ncbi.wdl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ task sequencing_platform_from_bam {
183183
input {
184184
File bam
185185

186-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
186+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
187187
}
188188

189189
command <<<
@@ -293,7 +293,7 @@ task structured_comments {
293293

294294
File? filter_to_ids
295295

296-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
296+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
297297
}
298298
String out_base = basename(assembly_stats_tsv, '.txt')
299299
command <<<
@@ -346,7 +346,7 @@ task structured_comments_from_aligned_bam {
346346
String out_basename = basename(aligned_bam, '.bam')
347347
Boolean is_genome_assembly = true
348348
Boolean sanitize_ids = true
349-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
349+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
350350
}
351351
# see https://www.ncbi.nlm.nih.gov/genbank/structuredcomment/
352352
command <<<
@@ -465,7 +465,7 @@ task rename_fasta_header {
465465

466466
String out_basename = basename(genome_fasta, ".fasta")
467467

468-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
468+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
469469
}
470470
command <<<
471471
set -e
@@ -630,7 +630,7 @@ task sra_meta_prep {
630630
Boolean paired
631631

632632
String out_name = "sra_metadata.tsv"
633-
String docker="quay.io/broadinstitute/viral-core:2.5.20"
633+
String docker="quay.io/broadinstitute/viral-core:2.5.21"
634634
}
635635
Int disk_size = 100
636636
parameter_meta {

pipes/WDL/tasks/tasks_nextstrain.wdl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,7 @@ task derived_cols {
332332
String? lab_highlight_loc
333333
Array[File] table_map = []
334334

335-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
335+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
336336
Int disk_size = 50
337337
}
338338
parameter_meta {
@@ -900,7 +900,7 @@ task filter_sequences_to_list {
900900

901901
String out_fname = sub(sub(basename(sequences, ".zst"), ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta")
902902
# Prior docker image: "nextstrain/base:build-20240318T173028Z"
903-
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
903+
String docker = "quay.io/broadinstitute/viral-core:2.5.21"
904904
Int disk_size = 750
905905
}
906906
parameter_meta {

0 commit comments

Comments
 (0)