Skip to content

Commit 89d8dc0

Browse files
authored
Merge pull request #627 from broadinstitute/dp-norm
Upgrade to viral-core 2.5.20 with optimized BAM filtering
2 parents 5b3b9ea + 862af51 commit 89d8dc0

16 files changed

+465
-290
lines changed

CLAUDE.md

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -188,3 +188,50 @@ Image versions are pinned in `requirements-modules.txt` and must be kept in sync
188188
## Dockstore Integration
189189

190190
Workflows are registered on Dockstore for easy import to Terra, DNAnexus, and other platforms. The `.dockstore.yml` file defines all published workflows and their test parameter files.
191+
192+
## Terra Performance Analysis
193+
194+
When analyzing workflow performance from Terra submissions, use the Terra MCP tools for structure/status queries and direct GCS access for log analysis.
195+
196+
### Timing Methodology for WDL Tasks
197+
198+
When measuring task execution time from Terra logs:
199+
200+
1. **Start time**: Use the first Python log timestamp in stderr
201+
- Pattern: `^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}),\d+`
202+
203+
2. **End time**: Use the GCS file modification timestamp of the stderr file
204+
- Get via: `gcloud storage ls -l <path>/stderr`
205+
   - This captures ALL execution, including post-Python BAM I/O
206+
207+
3. **Why not use Python log end time?**
208+
- Many tasks run external tools (Java, pysam) after Python logging ends
209+
- Python logs don't capture full execution time
210+
211+
### Efficient GCS Queries with Wildcards
212+
213+
Use wildcards to batch GCS queries instead of iterating:
214+
```bash
215+
# Get all stderr files from a submission with timestamps in one query
216+
gcloud storage ls -l "gs://bucket/submissions/<sub_id>/classify_single/*/call-deplete/stderr"
217+
gcloud storage ls -l "gs://bucket/submissions/<sub_id>/classify_single/*/call-deplete/attempt-*/stderr"
218+
```
219+
220+
### Handling Preemption Retries
221+
222+
When a task is preempted, Cromwell creates `attempt-*` directories:
223+
```
224+
call-deplete/
225+
stderr # First attempt (may be incomplete)
226+
attempt-2/ # Second attempt
227+
stderr # Final successful run
228+
```
229+
230+
**Always use the final (highest-numbered) attempt** for performance analysis - preemption time shouldn't count against code performance.
231+
232+
### Sample Identification
233+
234+
To identify which workflow corresponds to which sample:
235+
1. Read the first few KB of stderr from each workflow
236+
2. Look for sample name in BAM file paths (e.g., `/S20.l1.xxxx.bam`)
237+
3. Cache the sample-to-workflow mapping for reuse

pipes/WDL/tasks/tasks_assembly.wdl

Lines changed: 50 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -60,7 +60,7 @@ task assemble {
6060

6161
Int disk_size = 375
6262

63-
command {
63+
command <<<
6464
set -ex -o pipefail
6565
6666
# find 90% memory
@@ -82,20 +82,28 @@ task assemble {
8282
--loglevel=DEBUG
8383
8484
samtools view -c ~{sample_name}.subsamp.bam | tee subsample_read_count >&2
85-
}
85+
86+
cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
87+
cat /proc/loadavg | cut -f 3 -d ' ' > LOAD_15M
88+
set +o pipefail
89+
{ if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then cat /sys/fs/cgroup/memory/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi; } > MEM_BYTES
90+
>>>
8691

8792
output {
8893
File contigs_fasta = "~{sample_name}.assembly1-spades.fasta"
8994
File subsampBam = "~{sample_name}.subsamp.bam"
9095
Int subsample_read_count = read_int("subsample_read_count")
96+
Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
97+
Int runtime_sec = ceil(read_float("UPTIME_SEC"))
98+
Int cpu_load_15min = ceil(read_float("LOAD_15M"))
9199
String viralngs_version = read_string("VERSION")
92100
}
93101

94102
runtime {
95103
docker: docker
96104
memory: select_first([machine_mem_gb, 63]) + " GB"
97105
cpu: 4
98-
disks: "local-disk " + disk_size + " LOCAL"
106+
disks: "local-disk " + disk_size + " HDD"
99107
disk: disk_size + " GB" # TES
100108
dx_instance_type: "mem1_ssd1_v2_x8"
101109
maxRetries: 2
@@ -162,6 +170,11 @@ task select_references {
162170
# create top-hits output files
163171
cut -f 1 "~{contigs_basename}.refs_skani_dist.top.tsv" | tail +2 > TOP_FASTAS
164172
for f in $(cat TOP_FASTAS); do basename "$f" .fasta; done > TOP_FASTAS_BASENAMES
173+
174+
cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
175+
cat /proc/loadavg | cut -f 3 -d ' ' > LOAD_15M
176+
set +o pipefail
177+
{ if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then cat /sys/fs/cgroup/memory/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi; } > MEM_BYTES
165178
>>>
166179

167180
output {
@@ -171,13 +184,16 @@ task select_references {
171184
Array[File] top_matches_per_cluster_fastas = read_lines("TOP_FASTAS")
172185
File skani_dist_full_tsv = "~{contigs_basename}.refs_skani_dist.full.tsv"
173186
File skani_dist_top_tsv = "~{contigs_basename}.refs_skani_dist.top.tsv"
187+
Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
188+
Int runtime_sec = ceil(read_float("UPTIME_SEC"))
189+
Int cpu_load_15min = ceil(read_float("LOAD_15M"))
174190
}
175191

176192
runtime {
177193
docker: docker
178194
memory: machine_mem_gb + " GB"
179195
cpu: cpu
180-
disks: "local-disk " + disk_size + " LOCAL"
196+
disks: "local-disk " + disk_size + " HDD"
181197
disk: disk_size + " GB" # TESs
182198
dx_instance_type: "mem1_ssd1_v2_x2"
183199
preemptible: 2
@@ -197,9 +213,9 @@ task scaffold {
197213
Int replace_length=55
198214
Boolean allow_incomplete_output = false
199215

200-
Int? skani_m
201-
Int? skani_s
202-
Int? skani_c
216+
Int? skani_m
217+
Int? skani_s
218+
Int? skani_c
203219

204220
Int? nucmer_max_gap
205221
Int? nucmer_min_match
@@ -387,6 +403,11 @@ task scaffold {
387403
~{'--aligner=' + aligner} \
388404
--loglevel=DEBUG
389405
fi
406+
407+
cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
408+
cat /proc/loadavg | cut -f 3 -d ' ' > LOAD_15M
409+
set +o pipefail
410+
{ if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then cat /sys/fs/cgroup/memory/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi; } > MEM_BYTES
390411
>>>
391412

392413
output {
@@ -406,14 +427,17 @@ task scaffold {
406427
Float skani_ani = read_float("SKANI_ANI")
407428
Float skani_ref_aligned_frac = read_float("SKANI_REF_AF")
408429
Float skani_contigs_aligned_frac = read_float("SKANI_CONTIGS_AF")
430+
Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
431+
Int runtime_sec = ceil(read_float("UPTIME_SEC"))
432+
Int cpu_load_15min = ceil(read_float("LOAD_15M"))
409433
String viralngs_version = read_string("VERSION")
410434
}
411435

412436
runtime {
413437
docker: docker
414438
memory: select_first([machine_mem_gb, 63]) + " GB"
415439
cpu: 4
416-
disks: "local-disk " + disk_size + " LOCAL"
440+
disks: "local-disk " + disk_size + " HDD"
417441
disk: disk_size + " GB" # TES
418442
dx_instance_type: "mem1_ssd1_v2_x8"
419443
maxRetries: 2
@@ -498,7 +522,7 @@ task skani_triangle {
498522
docker: docker
499523
memory: machine_mem_gb + " GB"
500524
cpu: cpu
501-
disks: "local-disk " + disk_size + " LOCAL"
525+
disks: "local-disk " + disk_size + " SSD"
502526
disk: disk_size + " GB" # TES
503527
dx_instance_type: "mem1_ssd1_v2_x4"
504528
preemptible: 2
@@ -550,7 +574,7 @@ task ivar_trim {
550574
}
551575
}
552576

553-
command {
577+
command <<<
554578
ivar version | head -1 | tee VERSION
555579
if [ -f "~{trim_coords_bed}" ]; then
556580
ivar trim -e \
@@ -569,7 +593,7 @@ task ivar_trim {
569593
PCT=$(grep "Trimmed primers from" IVAR_OUT | perl -lape 's/Trimmed primers from (\S+)%.*/$1/')
570594
if [[ $PCT = -* ]]; then echo 0; else echo $PCT; fi > IVAR_TRIM_PCT
571595
grep "Trimmed primers from" IVAR_OUT | perl -lape 's/Trimmed primers from \S+% \((\d+)\).*/$1/' > IVAR_TRIM_COUNT
572-
}
596+
>>>
573597

574598
output {
575599
File aligned_trimmed_bam = "~{bam_basename}.trimmed.bam"
@@ -582,7 +606,7 @@ task ivar_trim {
582606
docker: docker
583607
memory: select_first([machine_mem_gb, 7]) + " GB"
584608
cpu: 4
585-
disks: "local-disk " + disk_size + " LOCAL"
609+
disks: "local-disk " + disk_size + " HDD"
586610
disk: disk_size + " GB" # TES
587611
dx_instance_type: "mem1_ssd1_v2_x4"
588612
maxRetries: 2
@@ -672,7 +696,7 @@ task align_reads {
672696

673697
Int? cpu
674698
Int? machine_mem_gb
675-
String docker = "quay.io/broadinstitute/viral-core:2.5.18"
699+
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
676700

677701
String sample_name = basename(basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt"), ".clean")
678702
}
@@ -787,7 +811,7 @@ task align_reads {
787811
docker: docker
788812
memory: machine_mem_gb_actual + " GB"
789813
cpu: cpu_actual
790-
disks: "local-disk " + disk_size + " LOCAL"
814+
disks: "local-disk " + disk_size + " SSD"
791815
disk: disk_size + " GB" # TES
792816
dx_instance_type: "mem1_ssd1_v2_x8"
793817
preemptible: 1
@@ -905,6 +929,11 @@ task refine_assembly_with_aligned_reads {
905929
set +o pipefail # grep will exit 1 if it fails to find the pattern
906930
grep -v '^>' trimmed.fasta | tr -d '\n' | wc -c | tee assembly_length
907931
grep -v '^>' trimmed.fasta | tr -d '\nNn' | wc -c | tee assembly_length_unambiguous
932+
933+
cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
934+
cat /proc/loadavg | cut -f 3 -d ' ' > LOAD_15M
935+
set +o pipefail
936+
{ if [ -f /sys/fs/cgroup/memory.peak ]; then cat /sys/fs/cgroup/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.peak ]; then cat /sys/fs/cgroup/memory/memory.peak; elif [ -f /sys/fs/cgroup/memory/memory.max_usage_in_bytes ]; then cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes; else echo "0"; fi; } > MEM_BYTES
908937
>>>
909938

910939
output {
@@ -914,14 +943,17 @@ task refine_assembly_with_aligned_reads {
914943
Int assembly_length_unambiguous = read_int("assembly_length_unambiguous")
915944
Int dist_to_ref_snps = read_int("num_snps")
916945
Int dist_to_ref_indels = read_int("num_indels")
946+
Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
947+
Int runtime_sec = ceil(read_float("UPTIME_SEC"))
948+
Int cpu_load_15min = ceil(read_float("LOAD_15M"))
917949
String viralngs_version = read_string("VERSION")
918950
}
919951

920952
runtime {
921953
docker: docker
922954
memory: machine_mem_gb + " GB"
923955
cpu: 8
924-
disks: "local-disk " + disk_size + " LOCAL"
956+
disks: "local-disk " + disk_size + " SSD"
925957
disk: disk_size + " GB" # TES
926958
dx_instance_type: "mem1_ssd1_v2_x8"
927959
maxRetries: 2
@@ -940,7 +972,7 @@ task run_discordance {
940972
String out_basename = "run"
941973
Int min_coverage = 4
942974

943-
String docker = "quay.io/broadinstitute/viral-core:2.5.18"
975+
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
944976
}
945977
parameter_meta {
946978
reads_aligned_bam: {
@@ -960,7 +992,7 @@ task run_discordance {
960992

961993
Int disk_size = 100
962994

963-
command {
995+
command <<<
964996
set -ex -o pipefail
965997
966998
read_utils.py --version | tee VERSION
@@ -1026,7 +1058,7 @@ task run_discordance {
10261058
echo 0 > num_discordant_snps
10271059
echo 0 > num_discordant_indels
10281060
fi
1029-
}
1061+
>>>
10301062

10311063
output {
10321064
File discordant_sites_vcf = "~{out_basename}.discordant.vcf"

pipes/WDL/tasks/tasks_demux.wdl

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,12 @@ task merge_tarballs {
66
String out_filename
77

88
Int? machine_mem_gb
9-
String docker = "quay.io/broadinstitute/viral-core:2.5.18"
9+
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
1010
}
1111

1212
Int disk_size = 2625
1313

14-
command {
14+
command <<<
1515
set -ex -o pipefail
1616
1717
if [ -z "$TMPDIR" ]; then
@@ -23,7 +23,7 @@ task merge_tarballs {
2323
file_utils.py merge_tarballs \
2424
~{out_filename} ~{sep=' ' tar_chunks} \
2525
--loglevel=DEBUG
26-
}
26+
>>>
2727

2828
output {
2929
File combined_tar = "~{out_filename}"
@@ -181,7 +181,7 @@ task illumina_demux {
181181
# --- options for VM shape ----------------------
182182
Int? machine_mem_gb
183183
Int disk_size = 2625
184-
String docker = "quay.io/broadinstitute/viral-core:2.5.18"
184+
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
185185
}
186186

187187
parameter_meta {
@@ -823,7 +823,7 @@ task get_illumina_run_metadata {
823823
String? sequencing_center
824824

825825
Int? machine_mem_gb
826-
String docker = "quay.io/broadinstitute/viral-core:2.5.18"
826+
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
827827
}
828828

829829
parameter_meta {
@@ -927,7 +927,7 @@ task demux_fastqs {
927927
Int? machine_mem_gb
928928
Int max_cpu = 32 # Maximum CPU cap for autoscaling (use 16 for 2-barcode, 64 for 3-barcode)
929929
Int disk_size = 750
930-
String docker = "quay.io/broadinstitute/viral-core:2.5.18"
930+
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
931931
}
932932

933933
# Calculate total input size for autoscaling
@@ -1056,7 +1056,7 @@ task merge_demux_metrics {
10561056
input {
10571057
Array[File]+ metrics_files
10581058
String output_filename = "merged_demux_metrics.txt"
1059-
String docker = "quay.io/broadinstitute/viral-core:2.5.18"
1059+
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
10601060
}
10611061

10621062
parameter_meta {

pipes/WDL/tasks/tasks_interhost.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -351,7 +351,7 @@ task index_ref {
351351
File? novocraft_license
352352

353353
Int? machine_mem_gb
354-
String docker = "quay.io/broadinstitute/viral-core:2.5.18"
354+
String docker = "quay.io/broadinstitute/viral-core:2.5.20"
355355
}
356356

357357
Int disk_size = 100

0 commit comments

Comments
 (0)