11version 1.0
22
33import "../../../../tasks/broad/ImputationTasks.wdl" as tasks
4+ import "../../../../tasks/broad/ImputationBeagleTasks.wdl" as beagleTasks
45
56workflow ImputationBeagle {
67
@@ -84,58 +85,38 @@ workflow ImputationBeagle {
8485 gatk_docker = gatk_docker
8586 }
8687
87- call tasks . CountVariantsInChunksBeagle {
88+ call beagleTasks . CountVariantsInChunks {
8889 input :
8990 vcf = GenerateChunk .output_vcf ,
9091 vcf_index = GenerateChunk .output_vcf_index ,
9192 panel_bed_file = referencePanelContig .bed ,
9293 gatk_docker = gatk_docker
9394 }
9495
95- call tasks . CheckChunksBeagle {
96+ call beagleTasks . CheckChunks {
9697 input :
97- var_in_original = CountVariantsInChunksBeagle .var_in_original ,
98- var_also_in_reference = CountVariantsInChunksBeagle .var_also_in_reference
99- }
100-
101- # create chunk without overlaps to get sites to impute
102- call tasks .SubsetVcfToRegion {
103- input :
104- vcf = CreateVcfIndex .vcf ,
105- vcf_index = CreateVcfIndex .vcf_index ,
106- output_basename = "input_samples_subset_to_chunk" ,
107- contig = referencePanelContig .contig ,
108- start = start ,
109- end = end ,
110- gatk_docker = gatk_docker
111- }
112-
113- call tasks .SetIDs as SetIdsVcfToImpute {
114- input :
115- vcf = SubsetVcfToRegion .output_vcf ,
116- output_basename = "input_samples_with_variant_ids"
98+ var_in_original = CountVariantsInChunks .var_in_original ,
99+ var_also_in_reference = CountVariantsInChunks .var_also_in_reference
117100 }
118101 }
119102
120103 Array [File ] chunkedVcfsWithOverlapsForImputation = GenerateChunk .output_vcf
121- Array [File ] chunkedVcfsWithoutOverlapsForSiteIds = SetIdsVcfToImpute .output_vcf
122- Array [File ] chunkedVcfIndexesWithoutOverlapsForSiteIds = SetIdsVcfToImpute .output_vcf_index
123104
124105 call tasks .StoreChunksInfo as StoreContigLevelChunksInfo {
125106 input :
126107 chroms = chunk_contig ,
127108 starts = start ,
128109 ends = end ,
129- vars_in_array = CountVariantsInChunksBeagle .var_in_original ,
130- vars_in_panel = CountVariantsInChunksBeagle .var_also_in_reference ,
131- valids = CheckChunksBeagle .valid ,
110+ vars_in_array = CountVariantsInChunks .var_in_original ,
111+ vars_in_panel = CountVariantsInChunks .var_also_in_reference ,
112+ valids = CheckChunks .valid ,
132113 basename = output_basename
133114 }
134115
135116 # if any chunk for any chromosome fails CheckChunks, then we will not run any task in the next scatter,
136117 # namely phasing and imputing, which would be the most costly to throw away
137118 Int n_failed_chunks_int = select_first ([error_count_override , read_int (StoreContigLevelChunksInfo .n_failed_chunks )])
138- call tasks .ErrorWithMessageIfErrorCountNotZero as FailQCNChunks {
119+ call beagleTasks .ErrorWithMessageIfErrorCountNotZero as FailQCNChunks {
139120 input :
140121 errorCount = n_failed_chunks_int ,
141122 message = "contig " + referencePanelContig .contig + " had " + n_failed_chunks_int + " failing chunks"
@@ -144,21 +125,14 @@ workflow ImputationBeagle {
144125 scatter (i in range (num_chunks )) {
145126 String chunk_basename_imputed = referencePanelContig .contig + "_chunk_" + i + "_imputed"
146127
147- call tasks .ExtractIDs as ExtractIdsVcfToImpute {
148- input :
149- vcf = chunkedVcfsWithoutOverlapsForSiteIds [i ],
150- output_basename = "imputed_sites" ,
151- for_dependency = FailQCNChunks .done # these shenanigans can be replaced with `after` in wdl 1.1
152- }
153-
154128 # max amount of cpus you can ask for is 96, so at a max of 10k samples we can only ask for 9 cpus per 1000 samples.
155129 # these values are based on trying to optimize for pre-emptibility using a 400k sample reference panel
156130 # and up to a 10k sample input vcf
157131 Int beagle_cpu = if (CountSamples .nSamples <= 1000 ) then 8 else floor (CountSamples .nSamples / 1000 ) * 9
158132 Int beagle_phase_memory_in_gb = if (CountSamples .nSamples <= 1000 ) then 22 else ceil (beagle_cpu * 1.5 )
159133 Int beagle_impute_memory_in_gb = if (CountSamples .nSamples <= 1000 ) then 30 else ceil (beagle_cpu * 4.3 )
160134
161- call tasks . PhaseBeagle {
135+ call beagleTasks . Phase {
162136 input :
163137 dataset_vcf = chunkedVcfsWithOverlapsForImputation [i ],
164138 ref_panel_bref3 = referencePanelContig .bref3 ,
@@ -171,9 +145,9 @@ workflow ImputationBeagle {
171145 memory_mb = beagle_phase_memory_in_gb * 1024
172146 }
173147
174- call tasks . ImputeBeagle {
148+ call beagleTasks . Impute {
175149 input :
176- dataset_vcf = PhaseBeagle .vcf ,
150+ dataset_vcf = Phase .vcf ,
177151 ref_panel_bref3 = referencePanelContig .bref3 ,
178152 chrom = referencePanelContig .contig ,
179153 basename = chunk_basename_imputed ,
@@ -186,7 +160,7 @@ workflow ImputationBeagle {
186160
187161 call tasks .CreateVcfIndex as IndexImputedBeagle {
188162 input :
189- vcf_input = ImputeBeagle .vcf ,
163+ vcf_input = Impute .vcf ,
190164 gatk_docker = gatk_docker
191165 }
192166
@@ -214,50 +188,9 @@ workflow ImputationBeagle {
214188 output_basename = chunk_basename_imputed ,
215189 gatk_docker = gatk_docker
216190 }
217-
218- call tasks .SetIDs {
219- input :
220- vcf = RemoveSymbolicAlleles .output_vcf ,
221- output_basename = chunk_basename_imputed
222- }
223-
224- call tasks .ExtractIDs {
225- input :
226- vcf = SetIDs .output_vcf ,
227- output_basename = "imputed_sites"
228- }
229-
230- call tasks .FindSitesUniqueToFileTwoOnly {
231- input :
232- file1 = select_first ([ExtractIDs .ids , write_lines ([])]),
233- file2 = ExtractIdsVcfToImpute .ids ,
234- ubuntu_docker = ubuntu_docker
235- }
236-
237- call tasks .SelectVariantsByIds {
238- input :
239- vcf = chunkedVcfsWithoutOverlapsForSiteIds [i ],
240- vcf_index = chunkedVcfIndexesWithoutOverlapsForSiteIds [i ],
241- ids = FindSitesUniqueToFileTwoOnly .missing_sites ,
242- basename = "imputed_sites_to_recover" ,
243- gatk_docker = gatk_docker
244- }
245-
246- call tasks .RemoveAnnotations {
247- input :
248- vcf = SelectVariantsByIds .output_vcf ,
249- basename = "imputed_sites_to_recover_annotations_removed"
250- }
251-
252- call tasks .InterleaveVariants {
253- input :
254- vcfs = select_all ([RemoveAnnotations .output_vcf , SetIDs .output_vcf ]),
255- basename = output_basename , # TODO consider using a contig/chunk labeled basename
256- gatk_docker = gatk_docker
257- }
258191 }
259192
260- Array [File ] chromosome_vcfs = select_all (InterleaveVariants .output_vcf )
193+ Array [File ] chromosome_vcfs = select_all (RemoveSymbolicAlleles .output_vcf )
261194 }
262195
263196 call tasks .GatherVcfs {
@@ -272,9 +205,9 @@ workflow ImputationBeagle {
272205 chroms = flatten (chunk_contig ),
273206 starts = flatten (start ),
274207 ends = flatten (end ),
275- vars_in_array = flatten (CountVariantsInChunksBeagle .var_in_original ),
276- vars_in_panel = flatten (CountVariantsInChunksBeagle .var_also_in_reference ),
277- valids = flatten (CheckChunksBeagle .valid ),
208+ vars_in_array = flatten (CountVariantsInChunks .var_in_original ),
209+ vars_in_panel = flatten (CountVariantsInChunks .var_also_in_reference ),
210+ valids = flatten (CheckChunks .valid ),
278211 basename = output_basename
279212 }
280213
0 commit comments