Skip to content

Commit 808a2cd

Browse files
authored
write final mt to gcs (#1804)
1 parent 3fc439d commit 808a2cd

File tree

2 files changed

+22
-8
lines changed

2 files changed

+22
-8
lines changed

all_of_us/mitochondria/mtSwirl_refactor/Terra/Dockerfile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
#
88
# Build context should be the mtSwirl repo root:
99
# docker build -f generate_mtdna_call_mt/Terra/Dockerfile -t aou-mitochondrial-combine-vcfs-covdb:dev .
10+
# The Dockerfile for the base image can be found at: https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/aou_mitochondrial_annotate_coverage/Dockerfile
1011

1112
FROM us.gcr.io/broad-gotc-prod/aou-mitochondrial-annotate-coverage:1.0.0
1213

all_of_us/mitochondria/mt_coverage_merge.wdl

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,11 @@ workflow mt_coverage_merge {
3636
Int finalize_shard_n_partitions = 256
3737
Int finalize_union_n_partitions = 1000
3838

39+
# Step 5 (add_annotations) output bucket
40+
# Annotated outputs are written directly to GCS. A timestamp subdirectory
41+
# is added by each add_annotations call at run time to prevent accidental overwrites.
42+
String annotated_output_bucket
43+
3944
}
4045

4146
if (defined(sample_list_tsv)) {
@@ -231,7 +236,8 @@ workflow mt_coverage_merge {
231236
coverage_tsv = process_tsv_files.processed_tsv, # Path to the coverage input TSV file
232237
vcf_mt = combined_mt_tar, # Path to the MatrixTable
233238
keep_all_samples = true,
234-
output_name = "annotated"
239+
output_name = "annotated",
240+
output_bucket = annotated_output_bucket
235241
}
236242
237243
@@ -241,15 +247,16 @@ workflow mt_coverage_merge {
241247
coverage_tsv = process_tsv_files.processed_tsv, # Path to the coverage input TSV file
242248
vcf_mt = combined_mt_tar, # Path to the MatrixTable
243249
keep_all_samples = false,
244-
output_name = "filt_annotated"
250+
output_name = "filt_annotated",
251+
output_bucket = annotated_output_bucket
245252
}
246253
247254
output {
248255
File processed_tsv = process_tsv_files.processed_tsv
249256
File output_coverage_db = annotate_coverage.output_db
250257
File combined_vcf = combined_mt_tar
251-
File annotated_output_tar = annotated.annotated_output_tar
252-
File filt_annotated_output_tar = filt_annotated.annotated_output_tar
258+
String annotated_output_gcs_path = annotated.annotated_output_gcs_path
259+
String filt_annotated_output_gcs_path = filt_annotated.annotated_output_gcs_path
253260
}
254261
}
255262

@@ -1319,6 +1326,7 @@ task add_annotations {
13191326
File coverage_tsv # Path to the coverage input TSV file
13201327
File vcf_mt # Path to the MatrixTable
13211328
String output_name # directory output name
1329+
String output_bucket # GCS bucket/prefix to write annotated outputs into
13221330
13231331
# Runtime parameters
13241332
Int memory_gb = 96
@@ -1407,6 +1415,11 @@ task add_annotations {
14071415
test -f ./coverage_db/coverage.h5
14081416
14091417
# Run the add_annotations.py script from the baked-in mtSwirl clone (/opt/mtSwirl)
1418+
# Build a GCS output path containing a UTC timestamp (one-second resolution) so separate runs do not overwrite each other.
1419+
TIMESTAMP=$(date -u +%Y%m%dT%H%M%SZ)
1420+
OUT_GCS="~{sub(output_bucket, "/$", "")}/${TIMESTAMP}/~{output_name}"
1421+
echo "Annotated outputs will be written to: ${OUT_GCS}"
1422+
14101423
python3 /opt/mtSwirl/generate_mtdna_call_mt/add_annotations.py \
14111424
--sample-stats=~{coverage_tsv} \
14121425
~{if keep_all_samples then "--keep-all-samples" else ""} \
@@ -1417,7 +1430,7 @@ task add_annotations {
14171430
-v ./~{output_name}/vep \
14181431
-a ~{coverage_tsv} \
14191432
-m "${VCF_MT_DIR}" \
1420-
-d ./~{output_name} \
1433+
-d "${OUT_GCS}" \
14211434
--temp-dir ./tmp
14221435
14231436
echo "DONE WITH ANNOTATION"
@@ -1426,12 +1439,12 @@ task add_annotations {
14261439
echo "Contents of /tmp:"
14271440
ls -lh /tmp
14281441
1429-
# Compress the annotated output directory
1430-
tar -czf $WORK_DIR/annotated_output.tar.gz ~{output_name}
1442+
# Record the GCS output path for the workflow output
1443+
echo -n "${OUT_GCS}" > annotated_output_gcs_path.txt
14311444
>>>
14321445

14331446
output {
1434-
File annotated_output_tar = "annotated_output.tar.gz"
1447+
String annotated_output_gcs_path = read_string("annotated_output_gcs_path.txt")
14351448
}
14361449

14371450
runtime {

0 commit comments

Comments
 (0)