Skip to content

Commit 0f389ef

Browse files
authored
Merge pull request #39 from broadinstitute/sc-nextstrain
add in nextstrain/augur WDLs
2 parents e92abee + 364c008 commit 0f389ef

File tree

2 files changed

+382
-0
lines changed

2 files changed

+382
-0
lines changed
Lines changed: 309 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,309 @@
1+
version 1.0
2+
3+
task concatenate {
    # Joins every file of `infiles`, in array order, into one file named
    # `output_name`. This is a plain unix `cat`; no parsing or validation
    # of the file contents takes place.
    input {
        Array[File] infiles        # files to join, space-separated on the cat command line
        String      output_name    # name of the joined file written to the working directory
    }

    command {
        cat ~{sep=" " infiles} > "~{output_name}"
    }

    runtime {
        docker: "ubuntu"
        memory: "1 GB"
        cpu: 1
        disks: "local-disk 375 LOCAL"
        dx_instance_type: "mem1_ssd1_v2_x2"
    }

    output {
        # the concatenation of all inputs
        File combined = "~{output_name}"
    }
}
23+
24+
task filter_segments {
    # Pulls the records of a single segment out of a combined multi-FASTA.
    #
    # A record is kept when its header line contains the tag "-<segment>"
    # (e.g. "-1"); the tag is removed from the header of each kept record.
    # When `pre_assembled_samples_fasta` is supplied, its contents are
    # appended unchanged after the kept records. The resulting FASTA is
    # written to stdout, which is captured as `assembly_of_segment`.
    input {
        File  all_samples_fasta              # combined multi-FASTA to filter
        Int?  segment = 1                    # segment number looked for in headers as "-<segment>"
        File? pre_assembled_samples_fasta    # optional FASTA appended verbatim to the output
    }

    command <<<
        python3 <<CODE
        # tag that marks the wanted segment inside a FASTA header
        segment = "-"+'~{segment}'
        segment_fasta = ""
        with open('~{all_samples_fasta}', 'r') as fasta:
            records = fasta.read().split(">")
        for r in records:
            # a usable record has a header line plus at least one further line
            if len(r.split("\n")) > 1:
                header = r.split("\n")[0]
                if segment in header:
                    new_header = header.replace(segment, "")
                    # rewrite the header line only; the sequence lines are copied untouched
                    contents = new_header + r[len(header):]
                    # every record must end with a newline, otherwise the next
                    # header would be glued onto this record's sequence line
                    if not contents.endswith("\n"):
                        contents += "\n"
                    segment_fasta += ">" + contents
        # an unset optional File interpolates to an empty string, which is falsy
        if '~{pre_assembled_samples_fasta}':
            with open('~{pre_assembled_samples_fasta}', 'r') as pre_assembled:
                segment_fasta += pre_assembled.read()
        print(segment_fasta)
        CODE
    >>>

    runtime {
        docker: "python"
        memory: "3 GB"
        cpu: 1
        disks: "local-disk 375 LOCAL"
        dx_instance_type: "mem1_ssd1_v2_x2"
    }

    output {
        # filtered FASTA as printed by the script
        File assembly_of_segment = stdout()
    }
}
66+
67+
task augur_mafft_align {
    # Runs `augur align` on `sequences` against `ref_fasta` and writes
    # "<basename>_aligned.fasta". After the alignment, the container's
    # peak memory counter is echoed to stdout and saved to the file MAX_RAM.
    #
    # NOTE(review): augur's CLI appears to document --existing-alignment as
    # taking a FASTA path rather than being a bare switch; confirm before
    # setting `existing_alignment` to true.
    input {
        File     sequences       # sequences to align
        File     ref_fasta       # passed as --reference-sequence
        String   basename        # prefix of the output file name

        Boolean? existing_alignment = false
        Boolean? debug              = false
        Boolean? fill_gaps          = true
        Boolean? remove_reference   = true

        String   docker = "nextstrain/base"
    }

    command {
        augur align \
            --sequences ~{sequences} \
            --reference-sequence ~{ref_fasta} \
            --output ~{basename}_aligned.fasta \
            ~{true="--fill-gaps" false="" fill_gaps} \
            ~{true="--existing-alignment " false="" existing_alignment} \
            ~{true="--remove-reference" false="" remove_reference} \
            ~{true="--debug" false="" debug} \
            --nthreads auto
        cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes | tee MAX_RAM
    }

    runtime {
        docker: docker
        memory: "104 GB"
        cpu: 16
        disks: "local-disk 375 LOCAL"
        preemptible: 2
        dx_instance_type: "mem3_ssd2_v2_x16"
    }

    output {
        File aligned_sequences      = "~{basename}_aligned.fasta"
        # everything the command printed to stdout
        File align_troubleshoot     = stdout()
        # value read back from the MAX_RAM file written by the command
        Int  max_ram_usage_in_bytes = read_int("MAX_RAM")
    }
}
105+
106+
task draft_augur_tree {
    # Builds a raw tree from an alignment with `augur tree` and writes it
    # to "<basename>_raw_tree.nwk".
    input {
        File    aligned_fasta         # alignment handed to --alignment
        String  basename              # prefix of the output file name

        String? method                # falls back to "iqtree" when unset
        String? substitution_model    # falls back to "GTR" when unset
        File?   exclude_sites         # forwarded as --exclude-sites only when supplied
        File?   vcf_reference         # forwarded as --vcf-reference only when supplied

        String  docker = "nextstrain/base"
    }

    command {
        augur tree \
            --alignment ~{aligned_fasta} \
            --output ~{basename}_raw_tree.nwk \
            --method ~{default="iqtree" method} \
            --substitution-model ~{default="GTR" substitution_model} \
            ~{"--exclude-sites " + exclude_sites} \
            ~{"--vcf-reference " + vcf_reference} \
            --nthreads auto
    }

    runtime {
        docker: docker
        memory: "30 GB"
        cpu: 16
        disks: "local-disk 375 LOCAL"
        dx_instance_type: "mem1_ssd1_v2_x16"
        preemptible: 2
    }

    output {
        File aligned_tree = "~{basename}_raw_tree.nwk"
    }
}
139+
140+
task refine_augur_tree {
    # Runs `augur refine --timetree` on a raw tree and its alignment, using
    # `metadata`, and writes "<basename>_refined_tree.nwk" together with
    # "<basename>_branch_lengths.json".
    #
    # Boolean inputs add the matching bare switch when true and nothing when
    # false. Unset optional value inputs fall back to the defaults spelled
    # out in the command below.
    #
    # NOTE(review): augur's CLI appears to document --clock-rate,
    # --clock-std-dev and --clock-filter-iqd as options that take a numeric
    # value; they are emitted here as bare switches. Confirm before setting
    # any of those three inputs to true.
    input {
        File     raw_tree
        File     aligned_fasta
        File     metadata
        String   basename                  # prefix of both output file names

        Int?     gen_per_year              # 50 when unset
        Boolean? clock_rate       = false
        Boolean? clock_std_dev    = false
        Boolean? keep_root        = false
        String?  root                      # "best" when unset
        Boolean? covariance       = false
        Boolean? no_covariance    = false
        Boolean? keep_polytomies  = false
        Int?     precision                 # 1 when unset
        Boolean? date_confidence  = false
        String?  date_inference            # "joint" when unset
        String?  branch_length_inference   # "auto" when unset
        Boolean? clock_filter_iqd = false
        String?  divergence_units          # "mutations-per-site" when unset
        File?    vcf_reference             # forwarded as --vcf-reference only when supplied

        String   docker = "nextstrain/base"
    }

    command {
        augur refine \
            --tree ~{raw_tree} \
            --alignment ~{aligned_fasta} \
            --metadata ~{metadata} \
            --output-tree ~{basename}_refined_tree.nwk \
            --output-node-data ~{basename}_branch_lengths.json \
            --timetree \
            ~{true="--clock-rate" false="" clock_rate} \
            ~{true="--clock-std-dev" false="" clock_std_dev} \
            --gen-per-year ~{default=50 gen_per_year} \
            ~{true="--covariance" false="" covariance} \
            ~{true="--no-covariance" false="" no_covariance} \
            --root ~{default="best" root} \
            ~{true="--keep-root" false="" keep_root} \
            --precision ~{default=1 precision} \
            ~{true="--keep-polytomies" false="" keep_polytomies} \
            --date-inference ~{default="joint" date_inference} \
            ~{true="--date-confidence" false="" date_confidence} \
            --branch-length-inference ~{default="auto" branch_length_inference} \
            ~{true="--clock-filter-iqd" false="" clock_filter_iqd} \
            --divergence-units ~{default="mutations-per-site" divergence_units} \
            ~{"--vcf-reference " + vcf_reference}
    }

    runtime {
        docker: docker
        memory: "30 GB"
        cpu: 16
        disks: "local-disk 375 LOCAL"
        dx_instance_type: "mem1_ssd1_v2_x16"
        preemptible: 2
    }

    output {
        File tree_refined   = "~{basename}_refined_tree.nwk"
        File branch_lengths = "~{basename}_branch_lengths.json"
    }
}
201+
202+
task ancestral_tree {
    # Runs `augur ancestral` on a refined tree and its alignment. Writes
    # "<basename>_nt_muts.json" (node data) and
    # "<basename>_ancestral_sequences.fasta".
    #
    # Boolean inputs add the matching switch when true and nothing when
    # false. `inference` falls back to "joint" when unset.
    input {
        File     refined_tree
        File     aligned_fasta
        String   basename                  # prefix of both output file names

        String?  inference                 # "joint" when unset
        Boolean? keep_ambiguous  = false
        Boolean? infer_ambiguous = false
        Boolean? keep_overhangs  = false
        File?    vcf_reference             # forwarded as --vcf-reference only when supplied
        File?    output_vcf                # forwarded as --output-vcf only when supplied

        String   docker = "nextstrain/base"
    }

    # The overhang switch is spelled --keep-overhangs; the earlier spelling
    # "--keep-0verhands" is not an augur option and made the command fail
    # whenever keep_overhangs was true.
    command {
        augur ancestral \
            --tree ~{refined_tree} \
            --alignment ~{aligned_fasta} \
            --output-node-data ~{basename}_nt_muts.json \
            ~{"--vcf-reference " + vcf_reference} \
            ~{"--output-vcf " + output_vcf} \
            --output-sequences ~{basename}_ancestral_sequences.fasta \
            ~{true="--keep-overhangs" false="" keep_overhangs} \
            --inference ~{default="joint" inference} \
            ~{true="--keep-ambiguous" false="" keep_ambiguous} \
            ~{true="--infer-ambiguous" false="" infer_ambiguous}
    }

    runtime {
        docker: docker
        memory: "7 GB"
        cpu: 4
        disks: "local-disk 50 HDD"
        dx_instance_type: "mem1_ssd1_v2_x4"
        preemptible: 2
    }

    output {
        File nt_muts_json = "~{basename}_nt_muts.json"
        File sequences    = "~{basename}_ancestral_sequences.fasta"
    }
}
243+
244+
task translate_augur_tree {
    # Runs `augur translate` with a refined tree, the nucleotide-mutation
    # node data and a reference annotation, and writes
    # "<basename>_aa_muts.json".
    input {
        String basename                # prefix of the output file name
        File   refined_tree
        File   nt_muts                 # handed to --ancestral-sequences
        File   genbank_gb              # handed to --reference-sequence

        File?  genes                   # forwarded as --genes only when supplied
        File?  vcf_reference_output    # forwarded as --vcf-reference-output only when supplied
        File?  vcf_reference           # forwarded as --vcf-reference only when supplied

        String docker = "nextstrain/base"
    }

    command {
        augur translate \
            --tree ~{refined_tree} \
            --ancestral-sequences ~{nt_muts} \
            --reference-sequence ~{genbank_gb} \
            ~{"--vcf-reference-output " + vcf_reference_output} \
            ~{"--vcf-reference " + vcf_reference} \
            ~{"--genes " + genes} \
            --output-node-data ~{basename}_aa_muts.json
    }

    runtime {
        docker: docker
        memory: "3 GB"
        cpu: 2
        disks: "local-disk 50 HDD"
        dx_instance_type: "mem1_ssd1_v2_x2"
        preemptible: 2
    }

    output {
        File aa_muts_json = "~{basename}_aa_muts.json"
    }
}
278+
279+
task export_auspice_json {
    # Runs `augur export v2` to combine the refined tree, the metadata and
    # three node-data files (branch lengths, nucleotide mutations, amino
    # acid mutations) into "<basename>_auspice.json".
    input {
        File   auspice_config
        File   metadata
        File   refined_tree
        File   branch_lengths    # node data, first --node-data argument
        File   nt_muts           # node data, second --node-data argument
        File   aa_muts           # node data, third --node-data argument
        String basename          # prefix of the output file name

        String docker = "nextstrain/base"
    }

    # Every continued line ends in " \" with a space before the backslash.
    # Without it the last --node-data path is only kept apart from the next
    # option by that option's leading indentation.
    command {
        augur export v2 \
            --tree ~{refined_tree} \
            --metadata ~{metadata} \
            --node-data ~{branch_lengths} ~{nt_muts} ~{aa_muts} \
            --auspice-config ~{auspice_config} \
            --output ~{basename}_auspice.json
    }

    runtime {
        docker: docker
        memory: "3 GB"
        cpu: 2
        disks: "local-disk 100 HDD"
        dx_instance_type: "mem1_ssd1_v2_x2"
        preemptible: 2
    }

    output {
        File virus_json = "~{basename}_auspice.json"
    }
}
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
version 1.0
2+
3+
import "../tasks/tasks_nextstrain.wdl" as nextstrain
4+
5+
workflow build_augur_tree {
    # Chains the nextstrain tasks end to end:
    #   concatenate -> filter_segments -> augur_mafft_align -> draft_augur_tree
    #   -> refine_augur_tree -> ancestral_tree -> translate_augur_tree
    #   -> export_auspice_json
    # `virus` is used as the basename of every file the augur tasks produce.
    input {
        Array[File] assembly_fastas    # fasta header records need to be "|" delimited for each metadata value
        File        metadata
        String      virus
        File        ref_fasta          # reference genome (often RefSeq)
        File        genbank_gb         # Genbank file for amino acid annotations (same coord space as ref_fasta, typically RefSeq)
    }

    # merge all per-sample assemblies into one FASTA
    call nextstrain.concatenate {
        input:
            infiles     = assembly_fastas,
            output_name = "all_samples_combined_assembly.fasta"
    }

    # keep a single segment; the task's default segment is used
    call nextstrain.filter_segments {
        input:
            all_samples_fasta = concatenate.combined
    }

    call nextstrain.augur_mafft_align {
        input:
            sequences = filter_segments.assembly_of_segment,
            ref_fasta = ref_fasta,
            basename  = virus
    }

    call nextstrain.draft_augur_tree {
        input:
            aligned_fasta = augur_mafft_align.aligned_sequences,
            basename      = virus
    }

    call nextstrain.refine_augur_tree {
        input:
            raw_tree      = draft_augur_tree.aligned_tree,
            aligned_fasta = augur_mafft_align.aligned_sequences,
            metadata      = metadata,
            basename      = virus
    }

    call nextstrain.ancestral_tree {
        input:
            refined_tree  = refine_augur_tree.tree_refined,
            aligned_fasta = augur_mafft_align.aligned_sequences,
            basename      = virus
    }

    call nextstrain.translate_augur_tree {
        input:
            basename     = virus,
            refined_tree = refine_augur_tree.tree_refined,
            nt_muts      = ancestral_tree.nt_muts_json,
            genbank_gb   = genbank_gb
    }

    call nextstrain.export_auspice_json {
        input:
            refined_tree   = refine_augur_tree.tree_refined,
            metadata       = metadata,
            branch_lengths = refine_augur_tree.branch_lengths,
            nt_muts        = ancestral_tree.nt_muts_json,
            aa_muts        = translate_augur_tree.aa_muts_json,
            basename       = virus
    }

    output {
        File combined_assembly_fasta   = concatenate.combined
        File augur_aligned_fasta       = augur_mafft_align.aligned_sequences
        File raw_tree                  = draft_augur_tree.aligned_tree
        File refined_tree              = refine_augur_tree.tree_refined
        File branch_lengths            = refine_augur_tree.branch_lengths
        File json_nt_muts              = ancestral_tree.nt_muts_json
        File ancestral_sequences_fasta = ancestral_tree.sequences
        File json_aa_muts              = translate_augur_tree.aa_muts_json
        File auspice_input_json        = export_auspice_json.virus_json
    }
}

0 commit comments

Comments
 (0)