write final mt to gcs (#1804)

jessicaway · web-flow · commit 808a2cda0dcc · 2026-04-10T12:13:05.000-06:00
diff --git a/all_of_us/mitochondria/mtSwirl_refactor/Terra/Dockerfile b/all_of_us/mitochondria/mtSwirl_refactor/Terra/Dockerfile
@@ -7,6 +7,7 @@
 #
 # Build context should be the mtSwirl repo root:
 #   docker build -f generate_mtdna_call_mt/Terra/Dockerfile -t aou-mitochondrial-combine-vcfs-covdb:dev .
+# Dockerfile for base image can be found at: https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/aou_mitochondrial_annotate_coverage/Dockerfile
 
 FROM us.gcr.io/broad-gotc-prod/aou-mitochondrial-annotate-coverage:1.0.0
 
diff --git a/all_of_us/mitochondria/mt_coverage_merge.wdl b/all_of_us/mitochondria/mt_coverage_merge.wdl
@@ -36,6 +36,11 @@ workflow mt_coverage_merge {
         Int finalize_shard_n_partitions = 256
         Int finalize_union_n_partitions = 1000
 
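+        # Step 5 (add_annotations) output location: a GCS bucket or prefix.
+        # Annotated outputs are written directly to GCS under <annotated_output_bucket>/<UTC timestamp>/<output_name>.
+        # Each add_annotations call computes its own timestamp (so the two calls may use different subdirectories), preventing accidental overwrites.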
+        String annotated_output_bucket
+
     }
 
     if (defined(sample_list_tsv)) {
@@ -231,7 +236,8 @@ workflow mt_coverage_merge {
             coverage_tsv = process_tsv_files.processed_tsv,  # Path to the coverage input TSV file
             vcf_mt = combined_mt_tar,  # Path to the MatrixTable
             keep_all_samples = true,
-            output_name = "annotated"
+            output_name = "annotated",
+            output_bucket = annotated_output_bucket
     }
 
 
@@ -241,15 +247,16 @@ workflow mt_coverage_merge {
             coverage_tsv = process_tsv_files.processed_tsv,  # Path to the coverage input TSV file
             vcf_mt = combined_mt_tar,  # Path to the MatrixTable
             keep_all_samples = false,
-            output_name = "filt_annotated"
+            output_name = "filt_annotated",
+            output_bucket = annotated_output_bucket
     }
 
     output {
         File processed_tsv = process_tsv_files.processed_tsv
         File output_coverage_db = annotate_coverage.output_db
         File combined_vcf = combined_mt_tar
-        File annotated_output_tar = annotated.annotated_output_tar
-        File filt_annotated_output_tar = filt_annotated.annotated_output_tar
+        String annotated_output_gcs_path = annotated.annotated_output_gcs_path
+        String filt_annotated_output_gcs_path = filt_annotated.annotated_output_gcs_path
     }
 }
 
@@ -1319,6 +1326,7 @@ task add_annotations {
         File coverage_tsv     # Path to the coverage input TSV file
         File vcf_mt             # Path to the MatrixTable
         String output_name      # directory output name
+        String output_bucket    # GCS bucket/prefix to write annotated outputs into (one trailing "/" is stripped)
         
         # Runtime parameters
         Int memory_gb = 96
@@ -1407,6 +1415,11 @@ task add_annotations {
         test -f ./coverage_db/coverage.h5
 
         # Run the add_annotations.py script baked inside mtSwirl clone
+        # Build a UTC-timestamped GCS output path (one-second resolution) so that separate runs do not overwrite each other.
+        TIMESTAMP=$(date -u +%Y%m%dT%H%M%SZ)
+        OUT_GCS="~{sub(output_bucket, "/$", "")}/${TIMESTAMP}/~{output_name}"
+        echo "Annotated outputs will be written to: ${OUT_GCS}"
+
         python3 /opt/mtSwirl/generate_mtdna_call_mt/add_annotations.py \
             --sample-stats=~{coverage_tsv} \
             ~{if keep_all_samples then "--keep-all-samples" else ""} \
@@ -1417,7 +1430,7 @@ task add_annotations {
             -v ./~{output_name}/vep \
             -a ~{coverage_tsv} \
             -m "${VCF_MT_DIR}" \
-            -d ./~{output_name} \
+            -d "${OUT_GCS}" \
             --temp-dir ./tmp
 
         echo "DONE WITH ANNOTATION"
@@ -1426,12 +1439,12 @@ task add_annotations {
         echo "Contents of /tmp:"
         ls -lh /tmp
 
-        # Compress the annotated output directory
-        tar -czf $WORK_DIR/annotated_output.tar.gz ~{output_name}
+        # Record the GCS output path for the workflow output
+        echo -n "${OUT_GCS}" > annotated_output_gcs_path.txt
     >>>
 
     output {
-        File annotated_output_tar = "annotated_output.tar.gz"
+        String annotated_output_gcs_path = read_string("annotated_output_gcs_path.txt")
     }
 
     runtime {

Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,7 @@`
`7`	`7`	`#`
`8`	`8`	`# Build context should be the mtSwirl repo root:`
`9`	`9`	`# docker build -f generate_mtdna_call_mt/Terra/Dockerfile -t aou-mitochondrial-combine-vcfs-covdb:dev .`
	`10`	`+# Dockerfile for base image can be found at: https://github.com/broadinstitute/warp-tools/blob/develop/3rd-party-tools/aou_mitochondrial_annotate_coverage/Dockerfile`
`10`	`11`
`11`	`12`	`FROM us.gcr.io/broad-gotc-prod/aou-mitochondrial-annotate-coverage:1.0.0`
`12`	`13`