
Commit 5457b7e: Merge pull request #629 from broadinstitute/dp-assemble

Skip indel realignment for large BAMs in align_reads task

2 parents: 9c7a822 + 6f1ca3a

14 files changed: +144 −49 lines
CLAUDE.md
Lines changed: 87 additions & 4 deletions

@@ -164,11 +164,13 @@ GitHub Actions (`.github/workflows/build.yml`) runs on all PRs and pushes:
 - Supports novoalign, bwa, or minimap2 aligners
 - Primary workflow for viral genome assembly
 
-- **assemble_denovo.wdl**: De novo assembly with SPAdes
+- **assemble_denovo_metagenomic.wdl**: De novo metagenomic assembly with SPAdes
 
-- **classify_kraken2.wdl**: Taxonomic classification of reads
+- **classify_single.wdl**: Taxonomic classification and depletion pipeline
 
-- **sarscov2_illumina_full.wdl**: Complete SARS-CoV-2 analysis pipeline
+- **nextclade_single.wdl**: Nextclade analysis for single samples
+
+- **genbank_single.wdl**: GenBank submission preparation for single samples
 
 - **augur_from_assemblies.wdl**: Nextstrain phylogenetic analysis from assemblies
 
@@ -195,7 +197,31 @@ When analyzing workflow performance from Terra submissions, use the Terra MCP to
 
 ### Timing Methodology for WDL Tasks
 
-When measuring task execution time from Terra logs:
+**Preferred method - use `get_batch_job_status`:**
+
+The Terra MCP's `get_batch_job_status` tool returns timing data directly from the Google Batch API:
+
+```
+get_batch_job_status(
+    workspace_namespace="<namespace>",
+    workspace_name="<workspace>",
+    submission_id="<submission-uuid>",
+    workflow_id="<workflow-uuid>",
+    task_name="<task_name>",
+    shard_index=<optional>,
+    attempt=<optional>
+)
+```
+
+Returns timing in the `batch_job.timing` field:
+- **run_duration**: Actual task execution time (what you usually want for performance analysis)
+- **pre_run_duration**: Queue and setup time (VM provisioning, Docker pull, etc.)
+
+This is more accurate than log-based methods because it captures the complete execution, including post-script I/O operations.
+
+**Alternative method - log-based timing (for detailed analysis):**
+
+When you need finer-grained timing within a task (e.g., timing individual steps):
 
 1. **Start time**: Use first Python log timestamp in stderr
    - Pattern: `^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}),\d+`
@@ -210,6 +236,8 @@ When measuring task execution time from Terra logs:
 
 ### Efficient GCS Queries with Wildcards
 
+**Always use `gcloud storage` instead of `gsutil`** - it's faster, more reliable, and the preferred CLI for GCS operations.
+
 Use wildcards to batch GCS queries instead of iterating:
 ```bash
 # Get all stderr files from a submission with timestamps in one query
@@ -235,3 +263,58 @@ To identify which workflow corresponds to which sample:
 1. Read first few KB of stderr from each workflow
 2. Look for sample name in BAM file paths (e.g., `/S20.l1.xxxx.bam`)
 3. Cache the sample-to-workflow mapping for reuse
+
+### Debugging Infrastructure-Level Failures
+
+Some workflow failures have errors that aren't visible in standard stderr logs. These include:
+- Docker pull failures (rate limits, image not found, auth errors)
+- VM provisioning failures
+- Preemption before task execution started
+- Network connectivity issues during container setup
+
+**Signs you need Batch logs instead of stderr:**
+- Batch reports exit code 0 (success) but the task is marked as failed ("GCP Batch task exited with Success(0)")
+- Error message says "The job was stopped before the command finished"
+- stderr is empty or very short
+- Error message says "Executor error" without details
+- Task failed instantly (0 seconds runtime)
+- `get_job_metadata` summary shows failure but no useful error message
+
+**Use `get_batch_job_status` to diagnose infrastructure failures:**
+
+The Terra MCP provides `get_batch_job_status`, which queries the Google Batch API directly:
+
+```
+get_batch_job_status(
+    workspace_namespace="<namespace>",
+    workspace_name="<workspace>",
+    submission_id="<submission-uuid>",
+    workflow_id="<workflow-uuid>",
+    task_name="<task_name>",
+    shard_index=<optional>,  # For scattered tasks
+    attempt=<optional>       # For retried tasks
+)
+```
+
+The tool returns:
+- **Batch job status**: QUEUED, SCHEDULED, RUNNING, SUCCEEDED, or FAILED
+- **Timing**: run_duration and pre_run_duration (queue/setup time)
+- **Resources**: machine_type, CPU, memory, disk sizes
+- **Status events**: State transitions with timestamps
+- **Detected issues**: Auto-detected problems with severity and suggestions
+- **Cloud Logging query**: Ready-to-use gcloud command for deeper debugging
+
+**Recommended debugging workflow:**
+1. `get_submission_status` → identify failed workflows
+2. `get_job_metadata` (summary mode) → identify failed tasks and error messages
+3. `get_workflow_logs` → check stderr for application errors
+4. `get_batch_job_status` → check infrastructure issues if logs don't explain the failure
+
+**Common failure patterns detected:**
+- `"Failed to pull image"` - Check image name, tag, and registry auth
+- `"429 Too Many Requests"` - Registry rate limit, retry later
+- `"manifest unknown"` - Image tag doesn't exist
+- `"unauthorized"` - Service account lacks permission to pull from registry
+- `"PREEMPTED"` - VM was preempted, usually retried automatically
+- `"exit code 137"` - OOM killed (out of memory)
+- `"exit code 1"` - Application error in the task script
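The log-based timing method described in the CLAUDE.md changes (first and last Python log timestamps in stderr, matched with the quoted regex) can be sketched in Python. The stderr excerpt below is invented for illustration; only the timestamp pattern comes from the notes, and the "last timestamp as end time" convention is an assumption:

```python
import re
from datetime import datetime

# Pattern quoted in the notes: Python log timestamps at the start of stderr lines,
# e.g. "2024-05-01 12:00:03,123 - assembly.py - INFO - ..."
TIMESTAMP_RE = re.compile(r"^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}),\d+")

def run_duration_seconds(stderr_text: str) -> float:
    """Estimate task runtime as last minus first timestamped log line (an assumption)."""
    stamps = [
        datetime.strptime(m.group(1), "%Y-%m-%d %H:%M:%S")
        for line in stderr_text.splitlines()
        if (m := TIMESTAMP_RE.match(line))
    ]
    if len(stamps) < 2:
        raise ValueError("need at least two timestamped log lines")
    return (stamps[-1] - stamps[0]).total_seconds()

# Hypothetical stderr excerpt (not from a real run):
stderr = """\
2024-05-01 12:00:03,123 - assembly.py - INFO - starting alignment
some untimestamped tool output
2024-05-01 12:07:33,456 - assembly.py - INFO - done
"""
print(run_duration_seconds(stderr))  # 450.0
```

As the notes say, prefer `get_batch_job_status` timing when available; this log-based estimate misses post-script I/O.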

pipes/WDL/tasks/tasks_assembly.wdl
Lines changed: 7 additions & 1 deletion

@@ -714,7 +714,12 @@ task align_reads {
     String sample_name = basename(basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt"), ".clean")
 }
 
-Int disk_size = ceil((6 * size(reads_unmapped_bam, "GB") + 2 * size(reference_fasta, "GB") + 100) / 375.0) * 375
+# Note: GCP local SSDs must be allocated in pairs (2, 4, 8, 16, 24 × 375GB), so we round to 750GB multiples.
+Int disk_size = ceil((6 * size(reads_unmapped_bam, "GB") + 2 * size(reference_fasta, "GB") + 100) / 750.0) * 750
+
+# Skip indel realignment for large BAMs (>1GB) to save runtime
+Float reads_bam_size_gb = size(reads_unmapped_bam, "GB")
+Boolean skip_realign = reads_bam_size_gb >= 1.0
 
 # Autoscale CPU based on input size: 8 CPUs for small inputs, up to 64 CPUs for ~15 GB inputs
 # Linear scaling: 8 + (input_GB / 15) * 56, capped at 64, rounded to nearest multiple of 4
@@ -773,6 +778,7 @@ task align_reads {
         --aligner ~{aligner} \
         ~{'--aligner_options "' + aligner_options + '"'} \
         ~{true='--skipMarkDupes' false="" skip_mark_dupes} \
+        ~{true='--skipRealign' false="" skip_realign} \
         --JVMmemory "$mem_in_mb"m \
         ~{"--NOVOALIGN_LICENSE_PATH=" + novocraft_license} \
         --loglevel=DEBUG
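The `align_reads` sizing expressions in this hunk are plain arithmetic, so they can be checked outside WDL. A Python sketch of the same logic (input sizes are illustrative, and the function names are mine, not the task's):

```python
import math

def align_reads_disk_gb(reads_bam_gb: float, ref_fasta_gb: float) -> int:
    """Mirror of the WDL disk_size expression: scratch estimate rounded up to a
    750 GB multiple, since GCP local SSDs are allocated in pairs of 375 GB."""
    needed = 6 * reads_bam_gb + 2 * ref_fasta_gb + 100
    return math.ceil(needed / 750.0) * 750

def skip_realign(reads_bam_gb: float) -> bool:
    """Mirror of the skip_realign flag: skip indel realignment at >= 1 GB input."""
    return reads_bam_gb >= 1.0

print(align_reads_disk_gb(10.0, 0.1))   # 6*10 + 0.2 + 100 = 160.2 -> 750
print(align_reads_disk_gb(120.0, 0.1))  # 820.2 -> 1500
print(skip_realign(0.5), skip_realign(2.0))  # False True
```

Note the change of units versus the old code: the divisor and multiplier move together from 375 to 750, so small inputs now get a 750 GB minimum rather than 375 GB.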

pipes/WDL/tasks/tasks_demux.wdl
Lines changed: 2 additions & 1 deletion

@@ -180,7 +180,8 @@ task illumina_demux {
 
     # --- options for VM shape ----------------------
     Int? machine_mem_gb
-    Int disk_size = 2625
+    # Note: GCP local SSDs must be allocated in pairs (2, 4, 8, 16, 24 × 375GB), so use 3000 (8 SSDs) instead of 2625 (7 SSDs)
+    Int disk_size = 3000
     String docker = "quay.io/broadinstitute/viral-core:2.5.21"
 }
 
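The SSD arithmetic in the demux comment above can be verified directly. This sketch assumes the allocation rule exactly as stated in the diff comments (375 GB units, allowed counts 2, 4, 8, 16, 24); the helper name is hypothetical:

```python
import math

# Allowed local SSD counts per the diff comments: 2, 4, 8, 16, 24 (375 GB each).
ALLOWED_SSD_COUNTS = (2, 4, 8, 16, 24)

def local_ssd_disk_gb(requested_gb: int) -> int:
    """Round a disk request up to the next allowed local-SSD allocation."""
    needed = math.ceil(requested_gb / 375)
    count = min(c for c in ALLOWED_SSD_COUNTS if c >= needed)
    return count * 375

print(local_ssd_disk_gb(2625))  # 2625 GB = 7 SSDs -> next allowed count is 8 -> 3000 GB
```

This matches the comment's reasoning: 2625 GB would need 7 SSDs, which is not an allowed count, so the task requests 3000 GB (8 SSDs).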
pipes/WDL/tasks/tasks_interhost.wdl
Lines changed: 3 additions & 3 deletions

@@ -160,7 +160,7 @@ task multi_align_mafft_ref {
     Float? mafft_gapOpeningPenalty
 
     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
 }
 
 String fasta_basename = basename(reference_fasta, '.fasta')
@@ -207,7 +207,7 @@ task multi_align_mafft {
     Float? mafft_gapOpeningPenalty
 
     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
 }
 
 Int disk_size = 200
@@ -476,7 +476,7 @@ task merge_vcfs_gatk {
     File ref_fasta
 
     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
 
     String output_prefix = "merged"
 }

pipes/WDL/tasks/tasks_intrahost.wdl
Lines changed: 4 additions & 4 deletions

@@ -136,7 +136,7 @@ task lofreq {
     File reference_fasta
 
     String out_basename = basename(aligned_bam, '.bam')
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
 }
 Int disk_size = 200
 command <<<
@@ -196,7 +196,7 @@ task isnvs_per_sample {
     Boolean removeDoublyMappedReads = true
 
     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
 
     String sample_name = basename(basename(basename(mapped_bam, ".bam"), ".all"), ".mapped")
 }
@@ -239,7 +239,7 @@ task isnvs_vcf {
     Boolean naiveFilter = false
 
     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
 }
 
 parameter_meta {
@@ -313,7 +313,7 @@ task annotate_vcf_snpeff {
     String? emailAddress
 
     Int? machine_mem_gb
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
 
     String output_basename = basename(basename(in_vcf, ".gz"), ".vcf")
 }

pipes/WDL/tasks/tasks_megablast.wdl
Lines changed: 1 addition & 1 deletion

@@ -75,7 +75,7 @@ task lca_megablast {
     Int cpu = 16
     Int disk_size_gb = 300
 
-    String docker = "quay.io/broadinstitute/viral-classify:2.5.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.5.21.0"
 }
 parameter_meta {
     trimmed_fasta: {

pipes/WDL/tasks/tasks_metagenomics.wdl
Lines changed: 12 additions & 11 deletions

@@ -218,7 +218,7 @@ task kraken2 {
     Int? min_base_qual
 
     Int machine_mem_gb = 90
-    String docker = "quay.io/broadinstitute/viral-classify:2.5.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.5.21.0"
 }
 
 parameter_meta {
@@ -246,9 +246,10 @@ task kraken2 {
 
 # Disk autoscaling: BAM->FASTQ expansion is ~7-8x, plus kraken2 reads output (~1x input),
 # plus kraken2 database (1x localized tarball + 2x decompressed = 3x), plus overhead for krona and temp files.
-# Minimum 375GB to accommodate typical database sizes.
-Int disk_size_auto = ceil((8 * size(reads_bam, "GB") + 3 * size(kraken2_db_tgz, "GB") + 50) / 375.0) * 375
-Int disk_size = if disk_size_auto < 375 then 375 else disk_size_auto
+# Minimum 750GB to accommodate typical database sizes.
+# Note: GCP local SSDs must be allocated in pairs (2, 4, 8, 16, 24 × 375GB), so we round to 750GB multiples.
+Int disk_size_auto = ceil((8 * size(reads_bam, "GB") + 3 * size(kraken2_db_tgz, "GB") + 50) / 750.0) * 750
+Int disk_size = if disk_size_auto < 750 then 750 else disk_size_auto
 
 command <<<
 set -ex -o pipefail
@@ -350,7 +351,7 @@ task report_primary_kraken_taxa {
     File kraken_summary_report
     String focal_taxon = "Viruses"
 
-    String docker = "quay.io/broadinstitute/viral-classify:2.5.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.5.21.0"
 }
 String out_basename = basename(kraken_summary_report, '.txt')
 Int disk_size = 50
@@ -401,7 +402,7 @@ task filter_refs_to_found_taxa {
     File taxdump_tgz
     Int min_read_count = 100
 
-    String docker = "quay.io/broadinstitute/viral-classify:2.5.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.5.21.0"
 }
 String ref_basename = basename(taxid_to_ref_accessions_tsv, '.tsv')
 String hits_basename = basename(focal_report_tsv, '.tsv')
@@ -452,7 +453,7 @@ task build_kraken2_db {
     Int? zstd_compression_level
 
     Int machine_mem_gb = 100
-    String docker = "quay.io/broadinstitute/viral-classify:2.5.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.5.21.0"
 }
 
 Int disk_size = 750
@@ -594,7 +595,7 @@ task blastx {
     File krona_taxonomy_db_tgz
 
     Int machine_mem_gb = 8
-    String docker = "quay.io/broadinstitute/viral-classify:2.5.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.5.21.0"
 }
 
 parameter_meta {
@@ -684,7 +685,7 @@ task krona {
     Int? magnitude_column
 
     Int machine_mem_gb = 3
-    String docker = "quay.io/broadinstitute/viral-classify:2.5.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.5.21.0"
 }
 
 Int disk_size = 50
@@ -791,7 +792,7 @@ task filter_bam_to_taxa {
     String out_filename_suffix = "filtered"
 
     Int machine_mem_gb = 8
-    String docker = "quay.io/broadinstitute/viral-classify:2.5.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.5.21.0"
 }
 
 String out_basename = basename(classified_bam, ".bam") + "." + out_filename_suffix
@@ -884,7 +885,7 @@ task kaiju {
     File krona_taxonomy_db_tgz # taxonomy/taxonomy.tab
 
     Int machine_mem_gb = 100
-    String docker = "quay.io/broadinstitute/viral-classify:2.5.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.5.21.0"
 }
 
 String input_basename = basename(reads_unmapped_bam, ".bam")
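The kraken2 disk autoscaling in the hunk above combines the expansion estimate with a hard floor. A Python sketch of the same expression (function name and example sizes are mine):

```python
import math

def kraken2_disk_gb(reads_bam_gb: float, db_tgz_gb: float) -> int:
    """Mirror of the kraken2 disk sizing in the diff: ~8x BAM->FASTQ expansion,
    3x database (tarball + decompressed), 50 GB overhead, rounded up to a
    750 GB multiple, with an explicit 750 GB floor as in the WDL."""
    auto = math.ceil((8 * reads_bam_gb + 3 * db_tgz_gb + 50) / 750.0) * 750
    return max(auto, 750)

print(kraken2_disk_gb(5.0, 60.0))   # 8*5 + 180 + 50 = 270 -> rounds up to 750
print(kraken2_disk_gb(50.0, 120.0)) # 400 + 360 + 50 = 810 -> rounds up to 1500
```

Since the ceiling of any positive estimate already yields at least one 750 GB unit, the explicit floor is defensive; it mirrors the `if disk_size_auto < 750 then 750 else disk_size_auto` expression in the WDL.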

pipes/WDL/tasks/tasks_ncbi.wdl
Lines changed: 7 additions & 7 deletions

@@ -6,7 +6,7 @@ task download_fasta {
     Array[String]+ accessions
     String emailAddress
 
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
 }
 
 command <<<
@@ -42,7 +42,7 @@ task download_fasta_from_accession_string {
     String out_prefix
     String emailAddress
 
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
 }
 
 command <<<
@@ -94,7 +94,7 @@ task download_annotations {
     String emailAddress
     String combined_out_prefix
 
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
 }
 
 command <<<
@@ -136,7 +136,7 @@ task download_ref_genomes_from_tsv {
     File ref_genomes_tsv # [tax_id, isolate_prefix, taxname, colon_delim_accession_list]
     String emailAddress
 
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
 }
 
 command <<<
@@ -238,7 +238,7 @@ task align_and_annot_transfer_single {
 
     String out_basename = basename(genome_fasta, '.fasta')
     Int machine_mem_gb = 30
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
 }
 
 parameter_meta {
@@ -1246,7 +1246,7 @@ task table2asn {
 
     String out_basename = basename(assembly_fasta, ".fasta")
     Int machine_mem_gb = 8
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0" # this could be a simpler docker image, we don't use anything beyond table2asn itself
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0" # this could be a simpler docker image, we don't use anything beyond table2asn itself
 }
 Int disk_size = 50
 
@@ -1403,7 +1403,7 @@ task genbank_special_taxa {
     Int taxid
     File taxdump_tgz
     File vadr_by_taxid_tsv # "gs://pathogen-public-dbs/viral-references/annotation/vadr/vadr-by-taxid.tsv"
-    String docker = "quay.io/broadinstitute/viral-classify:2.5.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.5.21.0"
 }
 
 command <<<

pipes/WDL/tasks/tasks_nextstrain.wdl
Lines changed: 3 additions & 3 deletions

@@ -5,7 +5,7 @@ task taxid_to_nextclade_dataset_name {
     Int taxid
     File taxdump_tgz
     File nextclade_by_taxid_tsv # "gs://pathogen-public-dbs/viral-references/typing/nextclade-by-taxid.tsv"
-    String docker = "quay.io/broadinstitute/viral-classify:2.5.20.0"
+    String docker = "quay.io/broadinstitute/viral-classify:2.5.21.0"
 }
 command <<<
 set -e
@@ -1001,7 +1001,7 @@ task mafft_one_chr {
     Boolean large = false
     Boolean memsavetree = false
 
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
     Int mem_size = 500
     Int cpus = 64
     Int disk_size = 750
@@ -1091,7 +1091,7 @@ task mafft_one_chr_chunked {
     Int batch_chunk_size = 2000
     Int threads_per_job = 2
 
-    String docker = "quay.io/broadinstitute/viral-phylo:2.5.16.0"
+    String docker = "quay.io/broadinstitute/viral-phylo:2.5.21.0"
     Int mem_size = 32
     Int cpus = 64
     Int disk_size = 750
