Skip to content

Commit 01ed8b5

Browse files
jsotobroadJose Soto
authored and committed
TSPS-341 remove tasks for recovering variants not in the reference panel (#1468)
* remove tasks for recovering variants not in the reference panel and separate out beagle tasks from imputation tasks * remove prechunk wdl and references to it remove "Beagle" from task names in BeagleTasks.wdl --------- Co-authored-by: Jose Soto <jsoto@broadinstitute.org>
1 parent 1612966 commit 01ed8b5

File tree

6 files changed

+198
-678
lines changed

6 files changed

+198
-678
lines changed

.dockstore.yml

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,10 +87,6 @@ workflows:
8787
subclass: WDL
8888
primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl
8989

90-
- name: ImputationBeaglePreChunk
91-
subclass: WDL
92-
primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.wdl
93-
9490
- name: LiftoverVcfs
9591
subclass: WDL
9692
primaryDescriptorPath: /pipelines/broad/arrays/imputation_beagle/LiftoverVcfs.wdl

pipelines/broad/arrays/imputation_beagle/ImputationBeagle.wdl

Lines changed: 17 additions & 84 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
version 1.0
22

33
import "../../../../tasks/broad/ImputationTasks.wdl" as tasks
4+
import "../../../../tasks/broad/ImputationBeagleTasks.wdl" as beagleTasks
45

56
workflow ImputationBeagle {
67

@@ -84,58 +85,38 @@ workflow ImputationBeagle {
8485
gatk_docker = gatk_docker
8586
}
8687
87-
call tasks.CountVariantsInChunksBeagle {
88+
call beagleTasks.CountVariantsInChunks {
8889
input:
8990
vcf = GenerateChunk.output_vcf,
9091
vcf_index = GenerateChunk.output_vcf_index,
9192
panel_bed_file = referencePanelContig.bed,
9293
gatk_docker = gatk_docker
9394
}
9495
95-
call tasks.CheckChunksBeagle {
96+
call beagleTasks.CheckChunks {
9697
input:
97-
var_in_original = CountVariantsInChunksBeagle.var_in_original,
98-
var_also_in_reference = CountVariantsInChunksBeagle.var_also_in_reference
99-
}
100-
101-
# create chunk without overlaps to get sites to impute
102-
call tasks.SubsetVcfToRegion {
103-
input:
104-
vcf = CreateVcfIndex.vcf,
105-
vcf_index = CreateVcfIndex.vcf_index,
106-
output_basename = "input_samples_subset_to_chunk",
107-
contig = referencePanelContig.contig,
108-
start = start,
109-
end = end,
110-
gatk_docker = gatk_docker
111-
}
112-
113-
call tasks.SetIDs as SetIdsVcfToImpute {
114-
input:
115-
vcf = SubsetVcfToRegion.output_vcf,
116-
output_basename = "input_samples_with_variant_ids"
98+
var_in_original = CountVariantsInChunks.var_in_original,
99+
var_also_in_reference = CountVariantsInChunks.var_also_in_reference
117100
}
118101
}
119102
120103
Array[File] chunkedVcfsWithOverlapsForImputation = GenerateChunk.output_vcf
121-
Array[File] chunkedVcfsWithoutOverlapsForSiteIds = SetIdsVcfToImpute.output_vcf
122-
Array[File] chunkedVcfIndexesWithoutOverlapsForSiteIds = SetIdsVcfToImpute.output_vcf_index
123104
124105
call tasks.StoreChunksInfo as StoreContigLevelChunksInfo {
125106
input:
126107
chroms = chunk_contig,
127108
starts = start,
128109
ends = end,
129-
vars_in_array = CountVariantsInChunksBeagle.var_in_original,
130-
vars_in_panel = CountVariantsInChunksBeagle.var_also_in_reference,
131-
valids = CheckChunksBeagle.valid,
110+
vars_in_array = CountVariantsInChunks.var_in_original,
111+
vars_in_panel = CountVariantsInChunks.var_also_in_reference,
112+
valids = CheckChunks.valid,
132113
basename = output_basename
133114
}
134115
135116
# if any chunk for any chromosome fails CheckChunks, then we will not run any task in the next scatter,
136117
# namely phasing and imputing which would be the most costly to throw away
137118
Int n_failed_chunks_int = select_first([error_count_override, read_int(StoreContigLevelChunksInfo.n_failed_chunks)])
138-
call tasks.ErrorWithMessageIfErrorCountNotZero as FailQCNChunks {
119+
call beagleTasks.ErrorWithMessageIfErrorCountNotZero as FailQCNChunks {
139120
input:
140121
errorCount = n_failed_chunks_int,
141122
message = "contig " + referencePanelContig.contig + " had " + n_failed_chunks_int + " failing chunks"
@@ -144,21 +125,14 @@ workflow ImputationBeagle {
144125
scatter (i in range(num_chunks)) {
145126
String chunk_basename_imputed = referencePanelContig.contig + "_chunk_" + i + "_imputed"
146127

147-
call tasks.ExtractIDs as ExtractIdsVcfToImpute {
148-
input:
149-
vcf = chunkedVcfsWithoutOverlapsForSiteIds[i],
150-
output_basename = "imputed_sites",
151-
for_dependency = FailQCNChunks.done # these shenanigans can be replaced with `after` in wdl 1.1
152-
}
153-
154128
# max amount of cpus you can ask for is 96 so at a max of 10k samples we can only ask for 9 cpus per 1000 samples.
155129
# these values are based on trying to optimize for pre-emptibility using a 400k sample reference panel
156130
# and up to a 10k sample input vcf
157131
Int beagle_cpu = if (CountSamples.nSamples <= 1000) then 8 else floor(CountSamples.nSamples / 1000) * 9
158132
Int beagle_phase_memory_in_gb = if (CountSamples.nSamples <= 1000) then 22 else ceil(beagle_cpu * 1.5)
159133
Int beagle_impute_memory_in_gb = if (CountSamples.nSamples <= 1000) then 30 else ceil(beagle_cpu * 4.3)
160134

161-
call tasks.PhaseBeagle {
135+
call beagleTasks.Phase {
162136
input:
163137
dataset_vcf = chunkedVcfsWithOverlapsForImputation[i],
164138
ref_panel_bref3 = referencePanelContig.bref3,
@@ -171,9 +145,9 @@ workflow ImputationBeagle {
171145
memory_mb = beagle_phase_memory_in_gb * 1024
172146
}
173147
174-
call tasks.ImputeBeagle {
148+
call beagleTasks.Impute {
175149
input:
176-
dataset_vcf = PhaseBeagle.vcf,
150+
dataset_vcf = Phase.vcf,
177151
ref_panel_bref3 = referencePanelContig.bref3,
178152
chrom = referencePanelContig.contig,
179153
basename = chunk_basename_imputed,
@@ -186,7 +160,7 @@ workflow ImputationBeagle {
186160
187161
call tasks.CreateVcfIndex as IndexImputedBeagle {
188162
input:
189-
vcf_input = ImputeBeagle.vcf,
163+
vcf_input = Impute.vcf,
190164
gatk_docker = gatk_docker
191165
}
192166
@@ -214,50 +188,9 @@ workflow ImputationBeagle {
214188
output_basename = chunk_basename_imputed,
215189
gatk_docker = gatk_docker
216190
}
217-
218-
call tasks.SetIDs {
219-
input:
220-
vcf = RemoveSymbolicAlleles.output_vcf,
221-
output_basename = chunk_basename_imputed
222-
}
223-
224-
call tasks.ExtractIDs {
225-
input:
226-
vcf = SetIDs.output_vcf,
227-
output_basename = "imputed_sites"
228-
}
229-
230-
call tasks.FindSitesUniqueToFileTwoOnly {
231-
input:
232-
file1 = select_first([ExtractIDs.ids, write_lines([])]),
233-
file2 = ExtractIdsVcfToImpute.ids,
234-
ubuntu_docker = ubuntu_docker
235-
}
236-
237-
call tasks.SelectVariantsByIds {
238-
input:
239-
vcf = chunkedVcfsWithoutOverlapsForSiteIds[i],
240-
vcf_index = chunkedVcfIndexesWithoutOverlapsForSiteIds[i],
241-
ids = FindSitesUniqueToFileTwoOnly.missing_sites,
242-
basename = "imputed_sites_to_recover",
243-
gatk_docker = gatk_docker
244-
}
245-
246-
call tasks.RemoveAnnotations {
247-
input:
248-
vcf = SelectVariantsByIds.output_vcf,
249-
basename = "imputed_sites_to_recover_annotations_removed"
250-
}
251-
252-
call tasks.InterleaveVariants {
253-
input:
254-
vcfs = select_all([RemoveAnnotations.output_vcf, SetIDs.output_vcf]),
255-
basename = output_basename, # TODO consider using a contig/chunk labeled basename
256-
gatk_docker = gatk_docker
257-
}
258191
}
259192
260-
Array[File] chromosome_vcfs = select_all(InterleaveVariants.output_vcf)
193+
Array[File] chromosome_vcfs = select_all(RemoveSymbolicAlleles.output_vcf)
261194
}
262195
263196
call tasks.GatherVcfs {
@@ -272,9 +205,9 @@ workflow ImputationBeagle {
272205
chroms = flatten(chunk_contig),
273206
starts = flatten(start),
274207
ends = flatten(end),
275-
vars_in_array = flatten(CountVariantsInChunksBeagle.var_in_original),
276-
vars_in_panel = flatten(CountVariantsInChunksBeagle.var_also_in_reference),
277-
valids = flatten(CheckChunksBeagle.valid),
208+
vars_in_array = flatten(CountVariantsInChunks.var_in_original),
209+
vars_in_panel = flatten(CountVariantsInChunks.var_also_in_reference),
210+
valids = flatten(CheckChunks.valid),
278211
basename = output_basename
279212
}
280213

pipelines/broad/arrays/imputation_beagle/ImputationBeaglePreChunk.changelog.md

Lines changed: 0 additions & 4 deletions
This file was deleted.

0 commit comments

Comments
 (0)