From b6e364e1024a4a118f752df9fda9e656ac9745b1 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Wed, 25 Mar 2026 10:49:46 -0400 Subject: [PATCH 01/14] first pass at making the wdl more contig friendly --- .../Glimpse2LowPassImputation.changelog.md | 5 + .../Glimpse2LowPassImputation.wdl | 265 ++++++++++++------ 2 files changed, 183 insertions(+), 87 deletions(-) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.changelog.md b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.changelog.md index 414b86c857..7fd6668410 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.changelog.md +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.changelog.md @@ -1,3 +1,8 @@ +# 0.0.3 +2026-03-25 (Date of Last Commit) + +* Reorganize wdl to be able to run on contigs more easily. Now the workflow is fully driven by the `contigs` input + # 0.0.2 2026-03-19 (Date of Last Commit) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl index 97b275d291..c38e67d583 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl @@ -2,15 +2,12 @@ version 1.0 workflow Glimpse2LowPassImputation { input { - String pipeline_version = "0.0.2" + String pipeline_version = "0.0.3" # List of files, one per line - File reference_chunks - File sites_vcf - File sites_table - File sites_table_index Array[String] contigs + String reference_panel_prefix File? input_vcf File? 
input_vcf_index @@ -32,7 +29,8 @@ workflow Glimpse2LowPassImputation { # batch size used when calling SplitIntoBatches to make variant calls from the crams Int calling_batch_size = 100 - String docker = "us.gcr.io/broad-dsde-methods/glimpse:kachulis_ck_bam_reader_retry_cf5822c" + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.0.0" + String glimpse_docker = "us.gcr.io/broad-dsde-methods/glimpse:kachulis_ck_bam_reader_retry_cf5822c" String docker_extract_num_sites_from_reference_chunk = "us.gcr.io/broad-dsde-methods/glimpse_extract_num_sites_from_reference_chunks:michaelgatzen_edc7f3a" } @@ -45,114 +43,138 @@ workflow Glimpse2LowPassImputation { Int n_samples = select_first([CountSamples.nSamples, length(select_first([crams]))]) - if (defined(crams)) { - if (length(select_first([crams])) > 1) { - call SplitIntoBatches { - input: - batch_size = calling_batch_size, - crams = select_first([crams]), - cram_indices = select_first([cram_indices]), - sample_ids = sample_ids + scatter(contig in contigs) { + File sites_vcf = reference_panel_prefix + "sites." + contig + ".vcf.gz" + File sites_vcf_index =reference_panel_prefix + "sites." + contig + ".vcf.gz.tbi" + File sites_table = reference_panel_prefix + "sites_table." + contig + ".vcf.gz" + File sites_table_index = reference_panel_prefix + "sites_table." + contig + ".vcf.gz.tbi" + File reference_chunks = reference_panel_prefix + "reference_chunks." 
+ contig + ".txt" + + if (defined(crams)) { + if (length(select_first([crams])) > 1) { + call SplitIntoBatches { + input: + batch_size = calling_batch_size, + crams = select_first([crams]), + cram_indices = select_first([cram_indices]), + sample_ids = sample_ids + } } - } - Array[Array[String]] crams_batches = select_first([SplitIntoBatches.crams_batches, [select_first([crams])]]) - Array[Array[String]] cram_indices_batches = select_first([SplitIntoBatches.cram_indices_batches, [select_first([cram_indices])]]) - Array[Array[String]] sample_ids_batches = select_first([SplitIntoBatches.sample_ids_batches, [select_first([sample_ids])]]) - - scatter(i in range(length(crams_batches))) { - call BcftoolsMpileup { - input: - crams = crams_batches[i], - cram_indices = cram_indices_batches[i], - sample_ids = sample_ids_batches[i], - fasta = fasta, - fasta_index = fasta_index, - call_indels = call_indels, - sites_vcf = sites_vcf, + Array[Array[String]] crams_batches = select_first([SplitIntoBatches.crams_batches, [select_first([crams])]]) + Array[Array[String]] cram_indices_batches = select_first([SplitIntoBatches.cram_indices_batches, [select_first([cram_indices])]]) + Array[Array[String]] sample_ids_batches = select_first([SplitIntoBatches.sample_ids_batches, [select_first([sample_ids])]]) + + scatter(i in range(length(crams_batches))) { + call BcftoolsMpileup { + input: + crams = crams_batches[i], + cram_indices = cram_indices_batches[i], + sample_ids = sample_ids_batches[i], + fasta = fasta, + fasta_index = fasta_index, + call_indels = call_indels, + sites_vcf = sites_vcf, + } + + call BcftoolsCall { + input: + mpileup_bcf = BcftoolsMpileup.output_bcf, + sites_table = sites_table, + sites_table_index = sites_table_index, + } + + call BcftoolsNorm { + input: + calls_bcf = BcftoolsCall.output_bcf, + } } - call BcftoolsCall { - input: - mpileup_bcf = BcftoolsMpileup.output_bcf, - sites_table = sites_table, - sites_table_index = sites_table_index, + if 
(length(BcftoolsNorm.output_vcf) > 1) { + call BcftoolsMerge { + input: + vcfs = BcftoolsNorm.output_vcf, + vcf_indices = BcftoolsNorm.output_vcf_index, + output_basename = output_basename + } } - call BcftoolsNorm { - input: - calls_bcf = BcftoolsCall.output_bcf, - } + File merged_vcf = select_first([BcftoolsMerge.merged_vcf, BcftoolsNorm.output_vcf[0]]) + File merged_vcf_index = select_first([BcftoolsMerge.merged_vcf_index, BcftoolsNorm.output_vcf_index[0]]) + } + + ## this task is used to grab the reference chunk but does not affect memory usage of glimpsePhase. + ## still tbd which method makes the most sense cost wise + call ComputeShardsAndMemoryPerShard { + input: + reference_chunks_memory = reference_chunks, + contigs = contigs, + n_samples = n_samples } - if (length(BcftoolsNorm.output_vcf) > 1) { - call BcftoolsMerge { + scatter (reference_chunk_index in range(length(ComputeShardsAndMemoryPerShard.reference_chunk_file_paths))) { + + call GlimpsePhase { input: - vcfs = BcftoolsNorm.output_vcf, - vcf_indices = BcftoolsNorm.output_vcf_index, - output_basename = output_basename + reference_chunk = ComputeShardsAndMemoryPerShard.reference_chunk_file_paths[reference_chunk_index], + input_vcf = select_first([merged_vcf,input_vcf]), + input_vcf_index = select_first([merged_vcf_index,input_vcf_index]), + impute_reference_only_variants = impute_reference_only_variants, + n_burnin = n_burnin, + n_main = n_main, + effective_population_size = effective_population_size, + call_indels = call_indels, + sample_ids = sample_ids, + fasta = fasta, + fasta_index = fasta_index, + docker = glimpse_docker } } - File merged_vcf = select_first([BcftoolsMerge.merged_vcf, BcftoolsNorm.output_vcf[0]]) - File merged_vcf_index = select_first([BcftoolsMerge.merged_vcf_index, BcftoolsNorm.output_vcf_index[0]]) + call GlimpseLigate { + input: + imputed_chunks = GlimpsePhase.imputed_vcf, + imputed_chunks_indices = GlimpsePhase.imputed_vcf_index, + output_basename = output_basename, + 
ref_dict = ref_dict, + docker = glimpse_docker + } + Array[File] contig_coverage_metrics = select_all(GlimpsePhase.coverage_metrics) } - ## this task is used to grab the reference chunk but does not affect memory usage of glimpsePhase. - ## still tbd which method makes the most sense cost wise - call ComputeShardsAndMemoryPerShard { + call GatherVcfsNoIndex { input: - reference_chunks_memory = reference_chunks, - contigs = contigs, - n_samples = n_samples - } - - scatter (reference_chunk_index in range(length(ComputeShardsAndMemoryPerShard.reference_chunk_file_paths))) { - - call GlimpsePhase { - input: - reference_chunk = ComputeShardsAndMemoryPerShard.reference_chunk_file_paths[reference_chunk_index], - input_vcf = select_first([merged_vcf,input_vcf]), - input_vcf_index = select_first([merged_vcf_index,input_vcf_index]), - impute_reference_only_variants = impute_reference_only_variants, - n_burnin = n_burnin, - n_main = n_main, - effective_population_size = effective_population_size, - call_indels = call_indels, - sample_ids = sample_ids, - fasta = fasta, - fasta_index = fasta_index, - docker = docker - } + input_vcfs = GlimpseLigate.imputed_vcf, + output_vcf_basename = output_basename + ".imputed", + gatk_docker = gatk_docker } - call GlimpseLigate { + call CreateVcfIndexAndMd5 { input: - imputed_chunks = GlimpsePhase.imputed_vcf, - imputed_chunks_indices = GlimpsePhase.imputed_vcf_index, - output_basename = output_basename, - ref_dict = ref_dict, - docker = docker + vcf_input = GatherVcfsNoIndex.output_vcf, + gatk_docker = gatk_docker, + preemptible = 0 } - if (length(select_all(GlimpsePhase.coverage_metrics)) > 0) { + Array[File] genome_coverage_metrics = flatten(contig_coverage_metrics) + if (length(genome_coverage_metrics) > 0) { call CombineCoverageMetrics { input: - cov_metrics = select_all(GlimpsePhase.coverage_metrics), + cov_metrics = genome_coverage_metrics, output_basename = output_basename } } call CollectQCMetrics { input: - imputed_vcf = 
GlimpseLigate.imputed_vcf, + imputed_vcf = GatherVcfsNoIndex.output_vcf, output_basename = output_basename } output { - File imputed_vcf = GlimpseLigate.imputed_vcf - File imputed_vcf_index = GlimpseLigate.imputed_vcf_index - File imputed_vcf_md5sum = GlimpseLigate.imputed_vcf_md5sum + File imputed_vcf = CreateVcfIndexAndMd5.output_vcf + File imputed_vcf_index = CreateVcfIndexAndMd5.output_vcf_index + File imputed_vcf_md5sum = CreateVcfIndexAndMd5.output_vcf_md5sum File qc_metrics = CollectQCMetrics.qc_metrics File? coverage_metrics = CombineCoverageMetrics.coverage_metrics @@ -537,9 +559,6 @@ task GlimpseLigate { bcftools view -h --no-version ligated.vcf.gz > old_header.vcf java -jar /picard.jar UpdateVcfSequenceDictionary -I old_header.vcf --SD ~{ref_dict} -O new_header.vcf bcftools reheader -h new_header.vcf -o ~{output_basename}.imputed.vcf.gz ligated.vcf.gz - tabix ~{output_basename}.imputed.vcf.gz - - md5sum ~{output_basename}.imputed.vcf.gz | awk '{ print $1 }' > ~{output_basename}.imputed.vcf.gz.md5sum >>> runtime { @@ -553,8 +572,6 @@ task GlimpseLigate { output { File imputed_vcf = "~{output_basename}.imputed.vcf.gz" - File imputed_vcf_index = "~{output_basename}.imputed.vcf.gz.tbi" - File imputed_vcf_md5sum = "~{output_basename}.imputed.vcf.gz.md5sum" } } @@ -678,3 +695,77 @@ task CombineCoverageMetrics File coverage_metrics="~{output_basename}.coverage_metrics.txt" } } + +task GatherVcfsNoIndex { + input { + Array[File] input_vcfs + String output_vcf_basename + + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.1.0" + Int cpu = 2 + Int memory_mb = 10000 + Int disk_size_gb = ceil(3*size(input_vcfs, "GiB")) + 10 + } + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 + + command <<< + set -e -o pipefail + + gatk --java-options "-Xms~{command_mem}m -Xmx~{max_heap}m" \ + GatherVcfs \ + -I ~{sep=' -I ' input_vcfs} \ + --REORDER_INPUT_BY_FIRST_VARIANT \ + -O ~{output_vcf_basename}.vcf.gz + >>> + runtime { + docker: gatk_docker + disks: 
"local-disk ${disk_size_gb} SSD" + memory: "${memory_mb} MiB" + cpu: cpu + maxRetries: 1 + noAddress: true + } + output { + File output_vcf = "~{output_vcf_basename}.vcf.gz" + } +} + +task CreateVcfIndexAndMd5 { + input { + File vcf_input + + Int disk_size_gb = ceil(1.1*size(vcf_input, "GiB")) + 10 + Int cpu = 1 + Int memory_mb = 6000 + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" + Int preemptible = 3 + } + Int command_mem = memory_mb - 1500 + Int max_heap = memory_mb - 1000 + + String vcf_basename = basename(vcf_input) + + command <<< + set -e -o pipefail + + ln -sf ~{vcf_input} ~{vcf_basename} + + bcftools index -t ~{vcf_basename} + md5sum ~{vcf_basename} | awk '{ print $1 }' > ~{vcf_basename}.md5sum + >>> + runtime { + docker: gatk_docker + disks: "local-disk ${disk_size_gb} SSD" + memory: "${memory_mb} MiB" + cpu: cpu + preemptible: preemptible + maxRetries: 1 + noAddress: true + } + output { + File output_vcf = "~{vcf_basename}" + File output_vcf_index = "~{vcf_basename}.tbi" + File output_vcf_md5sum = "~{vcf_basename}.md5sum" + } +} From 75fe83c683f596b8e13db8f3d13f266a12e44c10 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Wed, 25 Mar 2026 10:58:24 -0400 Subject: [PATCH 02/14] make compute shard and memory task better with this new change --- .../Glimpse2LowPassImputation.wdl | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl index c38e67d583..64553642da 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl @@ -107,7 +107,6 @@ workflow Glimpse2LowPassImputation { call ComputeShardsAndMemoryPerShard { input: reference_chunks_memory = reference_chunks, - contigs = contigs, n_samples = n_samples } @@ -231,7 +230,6 @@ task SplitIntoBatches { task 
ComputeShardsAndMemoryPerShard { input { File reference_chunks_memory - Array[String] contigs Int n_samples } @@ -243,17 +241,13 @@ task ComputeShardsAndMemoryPerShard { df = pd.read_csv('~{reference_chunks_memory}', sep='\t', header=None, names=['contig', 'reference_shard', 'base_gb', 'slope_per_sample_gb']) - # filter dataframe by contig list - chromosomes_to_filter = ["~{sep='", "' contigs}"] - filtered_df = df[df['contig'].isin(chromosomes_to_filter)] - # write out reference shards to process - filtered_df['reference_shard'].to_csv('reference_shard_file_paths.tsv', sep='\t', index=False, header=None) + df['reference_shard'].to_csv('reference_shard_file_paths.tsv', sep='\t', index=False, header=None) # calculate memory usage and save to file - filtered_df['mem_gb'] = filtered_df['base_gb'] + filtered_df['slope_per_sample_gb'] * ~{n_samples} - filtered_df['mem_gb'] = filtered_df['mem_gb'].apply(lambda x: min(256, int(np.ceil(x)))) # cap at 256 GB - filtered_df['mem_gb'].to_csv('memory_per_chunk.tsv', sep='\t', index=False, header=None) + df['mem_gb'] = filtered_df['base_gb'] + filtered_df['slope_per_sample_gb'] * ~{n_samples} + df['mem_gb'] = filtered_df['mem_gb'].apply(lambda x: min(256, int(np.ceil(x)))) # cap at 256 GB + df['mem_gb'].to_csv('memory_per_chunk.tsv', sep='\t', index=False, header=None) EOF >>> From acc1d35ddd92ca3a3cfdbe3c9ae50de72dabf48c Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Wed, 25 Mar 2026 11:04:01 -0400 Subject: [PATCH 03/14] only need to run split batches once per submission --- .../Glimpse2LowPassImputation.wdl | 21 +++++++++++-------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl index 64553642da..0bf8f26004 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl @@ 
-43,6 +43,18 @@ workflow Glimpse2LowPassImputation { Int n_samples = select_first([CountSamples.nSamples, length(select_first([crams]))]) + if (defined(crams)) { + if (length(select_first([crams])) > 1) { + call SplitIntoBatches { + input: + batch_size = calling_batch_size, + crams = select_first([crams]), + cram_indices = select_first([cram_indices]), + sample_ids = sample_ids + } + } + } + scatter(contig in contigs) { File sites_vcf = reference_panel_prefix + "sites." + contig + ".vcf.gz" File sites_vcf_index =reference_panel_prefix + "sites." + contig + ".vcf.gz.tbi" @@ -51,15 +63,6 @@ workflow Glimpse2LowPassImputation { File reference_chunks = reference_panel_prefix + "reference_chunks." + contig + ".txt" if (defined(crams)) { - if (length(select_first([crams])) > 1) { - call SplitIntoBatches { - input: - batch_size = calling_batch_size, - crams = select_first([crams]), - cram_indices = select_first([cram_indices]), - sample_ids = sample_ids - } - } Array[Array[String]] crams_batches = select_first([SplitIntoBatches.crams_batches, [select_first([crams])]]) Array[Array[String]] cram_indices_batches = select_first([SplitIntoBatches.cram_indices_batches, [select_first([cram_indices])]]) Array[Array[String]] sample_ids_batches = select_first([SplitIntoBatches.sample_ids_batches, [select_first([sample_ids])]]) From 28e8813199b5b75a8a70f21d36a6bd53f1c6daa4 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Wed, 25 Mar 2026 11:35:51 -0400 Subject: [PATCH 04/14] fix compute memory and shard task --- .../glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl index 0bf8f26004..36debc1a79 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl @@ -248,8 
+248,8 @@ task ComputeShardsAndMemoryPerShard { df['reference_shard'].to_csv('reference_shard_file_paths.tsv', sep='\t', index=False, header=None) # calculate memory usage and save to file - df['mem_gb'] = filtered_df['base_gb'] + filtered_df['slope_per_sample_gb'] * ~{n_samples} - df['mem_gb'] = filtered_df['mem_gb'].apply(lambda x: min(256, int(np.ceil(x)))) # cap at 256 GB + df['mem_gb'] = df['base_gb'] + df['slope_per_sample_gb'] * ~{n_samples} + df['mem_gb'] = df['mem_gb'].apply(lambda x: min(256, int(np.ceil(x)))) # cap at 256 GB df['mem_gb'].to_csv('memory_per_chunk.tsv', sep='\t', index=False, header=None) EOF >>> From fd8917723383cbd9661fc64dd9220887a927ab64 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Thu, 26 Mar 2026 09:26:42 -0400 Subject: [PATCH 05/14] fix sites table suffix and increase memory for bcftools commands --- .../low_pass_imputation/Glimpse2LowPassImputation.wdl | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl index 36debc1a79..97fefc4123 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl @@ -58,8 +58,8 @@ workflow Glimpse2LowPassImputation { scatter(contig in contigs) { File sites_vcf = reference_panel_prefix + "sites." + contig + ".vcf.gz" File sites_vcf_index =reference_panel_prefix + "sites." + contig + ".vcf.gz.tbi" - File sites_table = reference_panel_prefix + "sites_table." + contig + ".vcf.gz" - File sites_table_index = reference_panel_prefix + "sites_table." + contig + ".vcf.gz.tbi" + File sites_table = reference_panel_prefix + "sites_table." + contig + ".gz" + File sites_table_index = reference_panel_prefix + "sites_table." + contig + ".gz.tbi" File reference_chunks = reference_panel_prefix + "reference_chunks." 
+ contig + ".txt" if (defined(crams)) { @@ -276,7 +276,7 @@ task BcftoolsMpileup { File sites_vcf Int seed = 12345 - Int mem_gb = 4 + Int mem_gb = 6 Int cpu = 1 Int preemptible = 0 } @@ -318,7 +318,7 @@ task BcftoolsCall { File sites_table File sites_table_index - Int mem_gb = 4 + Int mem_gb = 6 Int cpu = 1 Int preemptible = 3 } @@ -350,7 +350,7 @@ task BcftoolsNorm { input { File calls_bcf - Int mem_gb = 4 + Int mem_gb = 6 Int cpu = 1 Int preemptible = 3 } From e0170d8d64b44ef28e6607414dae3fb60bd0d4c7 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Fri, 27 Mar 2026 10:52:14 -0400 Subject: [PATCH 06/14] remove unused optional inputs that are failing and add max retries to bcftools tasks --- .../Glimpse2LowPassImputation.wdl | 34 ++++++++----------- 1 file changed, 14 insertions(+), 20 deletions(-) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl index 97fefc4123..d5c19738a4 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl @@ -22,16 +22,12 @@ workflow Glimpse2LowPassImputation { Boolean impute_reference_only_variants = false Boolean call_indels = false - Int? n_burnin - Int? n_main - Int? 
effective_population_size # batch size used when calling SplitIntoBatches to make variant calls from the crams Int calling_batch_size = 100 String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.6.0.0" String glimpse_docker = "us.gcr.io/broad-dsde-methods/glimpse:kachulis_ck_bam_reader_retry_cf5822c" - String docker_extract_num_sites_from_reference_chunk = "us.gcr.io/broad-dsde-methods/glimpse_extract_num_sites_from_reference_chunks:michaelgatzen_edc7f3a" } if (defined(input_vcf)) { @@ -121,9 +117,6 @@ workflow Glimpse2LowPassImputation { input_vcf = select_first([merged_vcf,input_vcf]), input_vcf_index = select_first([merged_vcf_index,input_vcf_index]), impute_reference_only_variants = impute_reference_only_variants, - n_burnin = n_burnin, - n_main = n_main, - effective_population_size = effective_population_size, call_indels = call_indels, sample_ids = sample_ids, fasta = fasta, @@ -279,12 +272,11 @@ task BcftoolsMpileup { Int mem_gb = 6 Int cpu = 1 Int preemptible = 0 + Int max_retries = 3 } Int disk_size_gb = ceil(1.5*size(crams, "GiB") + size(fasta, "GiB") + size(sites_vcf, "GiB")) + 10 - String out_basename = "batch" - command <<< set -xeuo pipefail @@ -304,6 +296,8 @@ task BcftoolsMpileup { memory: mem_gb + " GiB" cpu: cpu preemptible: preemptible + maxRetries: max_retries + maxRetries: max_retries } output { @@ -321,12 +315,11 @@ task BcftoolsCall { Int mem_gb = 6 Int cpu = 1 Int preemptible = 3 + Int max_retries = 3 } Int disk_size_gb = ceil(3*size(mpileup_bcf, "GiB") + size(sites_table, "GiB")) + 10 - String out_basename = "batch" - command <<< set -xeuo pipefail @@ -339,6 +332,7 @@ task BcftoolsCall { memory: mem_gb + " GiB" cpu: cpu preemptible: preemptible + maxRetries: max_retries } output { @@ -353,18 +347,17 @@ task BcftoolsNorm { Int mem_gb = 6 Int cpu = 1 Int preemptible = 3 + Int max_retries = 3 } Int disk_size_gb = ceil(3*size(calls_bcf, "GiB")) + 10 - String out_basename = "batch" - command <<< set -xeuo pipefail - bcftools norm -m -both -Oz -o 
~{out_basename}.vcf.gz ~{calls_bcf} - bcftools index -t ~{out_basename}.vcf.gz + bcftools norm -m -both -Oz -o normalized.vcf.gz ~{calls_bcf} + bcftools index -t normalized.vcf.gz >>> runtime { @@ -373,11 +366,12 @@ task BcftoolsNorm { memory: mem_gb + " GiB" cpu: cpu preemptible: preemptible + maxRetries: max_retries } output { - File output_vcf = "~{out_basename}.vcf.gz" - File output_vcf_index = "~{out_basename}.vcf.gz.tbi" + File output_vcf = "normalized.vcf.gz" + File output_vcf_index = "normalized.vcf.gz.tbi" } } @@ -385,9 +379,10 @@ task BcftoolsMerge { input { Array[File] vcfs Array[File] vcf_indices - Int mem_gb = 4 + Int mem_gb = 6 Int cpu = 1 Int preemptible = 0 + Int max_retries = 3 String output_basename } @@ -406,6 +401,7 @@ task BcftoolsMerge { memory: mem_gb + " GiB" cpu: cpu preemptible: preemptible + maxRetries: max_retries } output { @@ -738,8 +734,6 @@ task CreateVcfIndexAndMd5 { String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.5.0.0" Int preemptible = 3 } - Int command_mem = memory_mb - 1500 - Int max_heap = memory_mb - 1000 String vcf_basename = basename(vcf_input) From 96151a6ea170c512fc4c8906bf391355869e820d Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Sat, 28 Mar 2026 10:08:16 -0400 Subject: [PATCH 07/14] more memory for bcftools call task and try to fix optional variable passed down to nested scatters --- .../low_pass_imputation/Glimpse2LowPassImputation.wdl | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl index d5c19738a4..765bb1ce14 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl @@ -58,6 +58,9 @@ workflow Glimpse2LowPassImputation { File sites_table_index = reference_panel_prefix + "sites_table." 
+ contig + ".gz.tbi" File reference_chunks = reference_panel_prefix + "reference_chunks." + contig + ".txt" + File? input_vcf = input_vcf + File? input_vcf_index = input_vcf_index + if (defined(crams)) { Array[Array[String]] crams_batches = select_first([SplitIntoBatches.crams_batches, [select_first([crams])]]) Array[Array[String]] cram_indices_batches = select_first([SplitIntoBatches.cram_indices_batches, [select_first([cram_indices])]]) @@ -111,6 +114,9 @@ workflow Glimpse2LowPassImputation { scatter (reference_chunk_index in range(length(ComputeShardsAndMemoryPerShard.reference_chunk_file_paths))) { + File? input_vcf = input_vcf + File? input_vcf_index = input_vcf_index + call GlimpsePhase { input: reference_chunk = ComputeShardsAndMemoryPerShard.reference_chunk_file_paths[reference_chunk_index], @@ -312,7 +318,7 @@ task BcftoolsCall { File sites_table File sites_table_index - Int mem_gb = 6 + Int mem_gb = 12 Int cpu = 1 Int preemptible = 3 Int max_retries = 3 From e6b84a36b5397b39040a3fc13f6175a0625463a5 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Sat, 28 Mar 2026 10:11:23 -0400 Subject: [PATCH 08/14] thihs is dumb --- .../low_pass_imputation/Glimpse2LowPassImputation.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl index 765bb1ce14..40375d63e6 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl @@ -58,8 +58,8 @@ workflow Glimpse2LowPassImputation { File sites_table_index = reference_panel_prefix + "sites_table." + contig + ".gz.tbi" File reference_chunks = reference_panel_prefix + "reference_chunks." + contig + ".txt" - File? input_vcf = input_vcf - File? input_vcf_index = input_vcf_index + File? input_vcf_scatter_1 = input_vcf + File? 
input_vcf_scatter_1_index = input_vcf_index if (defined(crams)) { Array[Array[String]] crams_batches = select_first([SplitIntoBatches.crams_batches, [select_first([crams])]]) @@ -114,8 +114,8 @@ workflow Glimpse2LowPassImputation { scatter (reference_chunk_index in range(length(ComputeShardsAndMemoryPerShard.reference_chunk_file_paths))) { - File? input_vcf = input_vcf - File? input_vcf_index = input_vcf_index + File? input_vcf_scatter_2 = input_vcf_scatter_1 + File? input_vcf_scatter_2_index = input_vcf_scatter_1_index call GlimpsePhase { input: From fe66126f087ef7d3318795d3309a5992a5a1bc8f Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Sat, 28 Mar 2026 10:12:15 -0400 Subject: [PATCH 09/14] use dumbest fix i can think of --- .../glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl index 40375d63e6..fe20266ac4 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl @@ -120,8 +120,8 @@ workflow Glimpse2LowPassImputation { call GlimpsePhase { input: reference_chunk = ComputeShardsAndMemoryPerShard.reference_chunk_file_paths[reference_chunk_index], - input_vcf = select_first([merged_vcf,input_vcf]), - input_vcf_index = select_first([merged_vcf_index,input_vcf_index]), + input_vcf = select_first([merged_vcf,input_vcf_scatter_2]), + input_vcf_index = select_first([merged_vcf_index,input_vcf_scatter_2_index]), impute_reference_only_variants = impute_reference_only_variants, call_indels = call_indels, sample_ids = sample_ids, From 7fde2e2268fec48530eeaebf09e5477a585d461e Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Sun, 29 Mar 2026 01:39:43 -0400 Subject: [PATCH 10/14] figure out mor eoptional inputs --- .../Glimpse2LowPassImputation.wdl | 14 
++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl index fe20266ac4..7fdf70da4a 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl @@ -58,9 +58,6 @@ workflow Glimpse2LowPassImputation { File sites_table_index = reference_panel_prefix + "sites_table." + contig + ".gz.tbi" File reference_chunks = reference_panel_prefix + "reference_chunks." + contig + ".txt" - File? input_vcf_scatter_1 = input_vcf - File? input_vcf_scatter_1_index = input_vcf_index - if (defined(crams)) { Array[Array[String]] crams_batches = select_first([SplitIntoBatches.crams_batches, [select_first([crams])]]) Array[Array[String]] cram_indices_batches = select_first([SplitIntoBatches.cram_indices_batches, [select_first([cram_indices])]]) @@ -100,8 +97,8 @@ workflow Glimpse2LowPassImputation { } } - File merged_vcf = select_first([BcftoolsMerge.merged_vcf, BcftoolsNorm.output_vcf[0]]) - File merged_vcf_index = select_first([BcftoolsMerge.merged_vcf_index, BcftoolsNorm.output_vcf_index[0]]) + File phase_input_vcf = select_first([BcftoolsMerge.merged_vcf, BcftoolsNorm.output_vcf[0], input_vcf]) + File phase_input_vcf_index = select_first([BcftoolsMerge.merged_vcf_index, BcftoolsNorm.output_vcf_index[0],input_vcf_index]) } ## this task is used to grab the reference chunk but does not affect memory usage of glimpsePhase. @@ -114,14 +111,11 @@ workflow Glimpse2LowPassImputation { scatter (reference_chunk_index in range(length(ComputeShardsAndMemoryPerShard.reference_chunk_file_paths))) { - File? input_vcf_scatter_2 = input_vcf_scatter_1 - File? 
input_vcf_scatter_2_index = input_vcf_scatter_1_index - call GlimpsePhase { input: reference_chunk = ComputeShardsAndMemoryPerShard.reference_chunk_file_paths[reference_chunk_index], - input_vcf = select_first([merged_vcf,input_vcf_scatter_2]), - input_vcf_index = select_first([merged_vcf_index,input_vcf_scatter_2_index]), + input_vcf = phase_input_vcf, + input_vcf_index = phase_input_vcf_index, impute_reference_only_variants = impute_reference_only_variants, call_indels = call_indels, sample_ids = sample_ids, From e63bf8f062a9b3e5e6062bf385287cfda448c31b Mon Sep 17 00:00:00 2001 From: GitHub Action Date: Mon, 30 Mar 2026 13:38:02 +0000 Subject: [PATCH 11/14] Updated pipeline_versions.txt with all pipeline version information --- pipeline_versions.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipeline_versions.txt b/pipeline_versions.txt index c0d8e61068..e9ccdc4382 100644 --- a/pipeline_versions.txt +++ b/pipeline_versions.txt @@ -4,7 +4,7 @@ BuildIndices 5.1.0 2026-02-13 CramToUnmappedBams 1.1.3 2024-08-02 ExomeGermlineSingleSample 3.2.7 2026-01-21 ExomeReprocessing 3.3.7 2026-01-21 -Glimpse2LowPassImputation 0.0.2 2026-03-19 +Glimpse2LowPassImputation 0.0.3 2026-03-25 IlluminaGenotypingArray 1.12.27 2026-01-21 Imputation 1.1.23 2025-10-03 ImputationBeagle 3.0.1 2026-02-23 From ad8864619443e570f29770966602f4375ab0ee2d Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Mon, 30 Mar 2026 10:48:42 -0400 Subject: [PATCH 12/14] update changelog --- .../low_pass_imputation/Glimpse2LowPassImputation.changelog.md | 1 + 1 file changed, 1 insertion(+) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.changelog.md b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.changelog.md index 7fd6668410..803cdc7954 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.changelog.md +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.changelog.md @@ -2,6 +2,7 @@ 
2026-03-25 (Date of Last Commit) * Reorganize wdl to be able to run on contigs more easily. Now the workflow is fully driven by the `contigs` input +* The wdl now expects all reference-related files to live under the same cloud base path (`reference_panel_prefix`) # 0.0.2 2026-03-19 (Date of Last Commit) From 20151cb4335eef29a44716bfbe91f1dada535847 Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Wed, 1 Apr 2026 10:27:46 -0400 Subject: [PATCH 13/14] pr feedback1 --- .../low_pass_imputation/Glimpse2LowPassImputation.wdl | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl index 7fdf70da4a..5173bb47ba 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl @@ -4,9 +4,9 @@ workflow Glimpse2LowPassImputation { input { String pipeline_version = "0.0.3" - # List of files, one per line - Array[String] contigs + + # this is the path to a directory that contains the sites vcf, sites table, and reference chunks files. It should end with a "/" String reference_panel_prefix File? 
input_vcf @@ -297,7 +297,6 @@ task BcftoolsMpileup { cpu: cpu preemptible: preemptible maxRetries: max_retries - maxRetries: max_retries } output { From bc32bc346e702ca03b24564cb5198503ff1640ac Mon Sep 17 00:00:00 2001 From: Jose Soto Date: Wed, 1 Apr 2026 10:29:02 -0400 Subject: [PATCH 14/14] add space aafeter comma --- .../glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl index 5173bb47ba..5e99b82089 100644 --- a/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl +++ b/pipelines/wdl/glimpse/low_pass_imputation/Glimpse2LowPassImputation.wdl @@ -98,7 +98,7 @@ workflow Glimpse2LowPassImputation { } File phase_input_vcf = select_first([BcftoolsMerge.merged_vcf, BcftoolsNorm.output_vcf[0], input_vcf]) - File phase_input_vcf_index = select_first([BcftoolsMerge.merged_vcf_index, BcftoolsNorm.output_vcf_index[0],input_vcf_index]) + File phase_input_vcf_index = select_first([BcftoolsMerge.merged_vcf_index, BcftoolsNorm.output_vcf_index[0], input_vcf_index]) } ## this task is used to grab the reference chunk but does not affect memory usage of glimpsePhase.