Skip to content

Commit 2361eb8

Browse files
committed
now we can rely on Terra itself for fast localization in minimap2 task
add output tracking how long the alignment step took
1 parent 4e451b8 commit 2361eb8

File tree

4 files changed

+59
-17
lines changed

4 files changed

+59
-17
lines changed

wdl/pipelines/ONT/Preprocessing/ONTFlowcellWGSuBAM.wdl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ workflow ONTFlowcellWGSuBAM {
9797

9898
output {
9999
String last_processing_date = today.yyyy_mm_dd
100+
String aln_wallclock_time = ALN.total_runtime
100101

101102
File aligned_bam = FinalizeAlignedBam.gcs_path
102103
File aligned_bai = FinalizeAlignedBai.gcs_path

wdl/tasks/Alignment/AlignONTWGSuBAM.wdl

Lines changed: 38 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@ workflow AlignONTWGSuBAM {
3838
output {
3939
File aligned_bam = aBAM
4040
File aligned_bai = aBAI
41+
42+
String total_runtime = select_first([sumRuntimes.total_runtime, Minimap2.wallclocktime])
4143
}
4244

4345
Map[String, String] ref_map = read_map(ref_map_file)
@@ -113,6 +115,7 @@ workflow AlignONTWGSuBAM {
113115
disk_type = if ('LOCAL' == aln_disk_type) then 'SSD' else 'LOCAL',
114116
timeout_hours = if (100 < size(uBAM, "GiB")) then 10 else 5 # heuristic: longer wait for bigger BAMs
115117
}
118+
call sumRuntimes { input: runtimes = MapShard.wallclocktime }
116119
}
117120
if (emperical_bam_sz_threshold >= ceil(size(uBAM, "GiB"))) {
118121
call AR.Minimap2 {
@@ -212,4 +215,38 @@ task OneOffHandleSpacesInRGLine {
212215
disks: "local-disk 10 HDD"
213216
docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest"
214217
}
215-
}
218+
}
219+
220+
task sumRuntimes {
    meta {
        description: "Add up a list of wall-clock durations formatted as 'XXHYYM' (e.g. '01H08M') and report the total in the same format."
    }
    parameter_meta {
        runtimes: "durations to sum, one per element, each formatted as zero-padded hours and minutes, e.g. '01H08M'"
    }

    input {
        Array[String] runtimes
    }

    output {
        String total_runtime = read_string("total.txt")
    }

    command <<<
        set -euo pipefail

        total_minutes=0

        # Read each runtime and sum up
        while IFS= read -r line; do
            # Skip blank lines instead of treating them as malformed input
            if [[ -z "${line}" ]]; then continue; fi

            # Extract hours and minutes from format "XXHYYM" (e.g. "01H08M").
            # `|| true` keeps a non-matching line from aborting under `set -e`;
            # a missing component simply counts as zero below.
            hours=$(echo "${line}" | grep -oP '\d+(?=H)' || true)
            minutes=$(echo "${line}" | grep -oP '\d+(?=M)' || true)

            # Convert to total minutes and add.
            # The inputs are zero-padded, and bash arithmetic treats a leading 0 as
            # octal, so "08" and "09" would fail with "value too great for base".
            # The 10# prefix forces base-10 interpretation.
            total_minutes=$(( total_minutes + 10#${hours:-0} * 60 + 10#${minutes:-0} ))
        done < ~{write_lines(runtimes)}

        # Convert back to hours and minutes
        final_hours=$(( total_minutes / 60 ))
        final_minutes=$(( total_minutes % 60 ))

        # Output in same format with padding
        printf "%02dH%02dM\n" "${final_hours}" "${final_minutes}" > total.txt
    >>>

    runtime {
        disks: "local-disk 10 HDD"
        docker: "gcr.io/cloud-marketplace/google/ubuntu2004:latest"
    }
}

wdl/tasks/Alignment/AlignReads.wdl

Lines changed: 17 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,7 @@ task Minimap2 {
2424
descrpiton: "A wrapper to minimap2 for mapping & aligning (groups of) sequences to a reference. Note that this only works for reads belonging to a single readgroup."
2525
}
2626
parameter_meta {
27-
reads: {
28-
desciption: "query sequences to be mapped and aligned",
29-
localization_optional: true
30-
}
27+
reads: "query sequences to be mapped and aligned"
3128
reads_file_basenames: "basenames of the BAM files, note this includes the extention (e.g. .bam, .fasta.gz, etc) of files"
3229
longer_ont_read_hint: "hint that the input reads are longer (~>20kb N50) ONT reads; this is used to bump the memory to the task"
3330
ref_fasta: "reference fasta"
@@ -57,14 +54,18 @@ task Minimap2 {
5754
Int mm2_threads = cpus - 2
5855

5956
command <<<
60-
set -euxo pipefail
57+
set -euxo pipefail
58+
start_time=$(date +%s)
59+
60+
results_dir=$(pwd)
6161
6262
FILE="~{reads[0]}"
6363
6464
############
6565
# localize data (much faster than Cromwell)
6666
mkdir -p reads_dir
67-
time gcloud storage cp ~{sep=' ' reads} /cromwell_root/reads_dir/
67+
mv ~{sep=' ' reads} \
68+
reads_dir/
6869
ls reads_dir
6970
7071
cd reads_dir
@@ -169,20 +170,28 @@ task Minimap2 {
169170
170171
############
171172
# move results up
172-
mv ~{prefix}.bam ~{prefix}.bam.bai /cromwell_root
173+
mv ~{prefix}.bam ~{prefix}.bam.bai "${results_dir}/"
174+
175+
end_time=$(date +%s)
176+
total_seconds=$((end_time - start_time))
177+
hours=$((total_seconds / 3600))
178+
minutes=$(((total_seconds % 3600) / 60))
179+
printf "%02dH%02dM\n" $hours $minutes > "${results_dir}/wallclocktime.txt"
173180
>>>
174181

175182
output {
176183
File aligned_bam = "~{prefix}.bam"
177184
File aligned_bai = "~{prefix}.bam.bai"
185+
186+
String wallclocktime = read_string("wallclocktime.txt")
178187
}
179188

180189
#########################
181190
RuntimeAttr default_attr = object {
182191
cpu_cores: cpus,
183192
mem_gb: mem,
184193
disk_gb: disk_size,
185-
preemptible_tries: 3,
194+
preemptible_tries: 1,
186195
max_retries: 0,
187196
docker: "us.gcr.io/broad-dsp-lrma/lr-minimap2:2.26-gcloud"
188197
}

wdl/tasks/Utility/BAMutils.wdl

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1645,7 +1645,6 @@ task SplitNameSortedUbam {
16451645
read_cnt: "number of reads in the uBAM; providing this will reduce run time."
16461646
n_reads: "desired number of reads per split; mutually exclusive with n_files"
16471647
n_files: "desired number of split files; mutually exclusive with n_reads"
1648-
uBAM: { localization_optional: true }
16491648
}
16501649
input {
16511650
File uBAM
@@ -1665,24 +1664,20 @@ task SplitNameSortedUbam {
16651664
String split_arg = if defined(n_reads) then "--SPLIT_TO_N_READS ~{X}" else "--SPLIT_TO_N_FILES ~{X}"
16661665
String helper_arg = if (defined(read_cnt)) then "--TOTAL_READS_IN_INPUT ~{read_cnt}" else " "
16671666

1668-
String base = basename(uBAM, ".bam")
1669-
String local_bam = "/cromwell_root/~{base}.bam"
1670-
16711667
command <<<
16721668
set -eux
16731669
16741670
if ~{fail}; then echo "one and only one of [n_reads, n_files] must be specified" && exit 1; fi
16751671
16761672
# prep
1677-
time gcloud storage cp ~{uBAM} ~{local_bam}
16781673
mkdir -p split_outputs
16791674
16801675
# higher memory, also lower # of reads in memory given ~100 longer reads (1.5E4 bp vs 1.5E2 bp)
16811676
gatk SplitSamByNumberOfReads \
16821677
--java-options "-Xmx28G -Xms24G" \
16831678
-use_jdk_deflater -use_jdk_inflater \
16841679
--MAX_RECORDS_IN_RAM 5000 \
1685-
-I ~{local_bam} \
1680+
-I ~{uBAM} \
16861681
-O split_outputs \
16871682
~{split_arg} \
16881683
~{helper_arg}
@@ -1695,8 +1690,8 @@ task SplitNameSortedUbam {
16951690
mem_gb: 32,
16961691
disk_gb: disk_size,
16971692
boot_disk_gb: 10,
1698-
preemptible_tries: 2,
1699-
max_retries: 1,
1693+
preemptible_tries: 1,
1694+
max_retries: 0,
17001695
docker: "us.gcr.io/broad-gatk/gatk:4.4.0.0"
17011696
}
17021697
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])

0 commit comments

Comments
 (0)