Skip to content

Commit 65eb6fc

Browse files
authored
Merge pull request #371 from broadinstitute/pangolin
update pangolin, pangolearn, nextclade
2 parents 687f21f + d7cabfb commit 65eb6fc

File tree

6 files changed

+192
-178
lines changed

6 files changed

+192
-178
lines changed

pipes/WDL/tasks/tasks_nextstrain.wdl

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,181 @@
11
version 1.0
22

3+
task nextclade_one_sample {
    meta {
        description: "Nextclade classification of one sample. Optional reference inputs left unspecified default to reference.fasta, tree.json, qc.json, genemap.gff and primers.csv in the working directory, which is where the dataset named by dataset_name is downloaded."
    }
    input {
        # Genome to classify; its file name minus ".fasta" prefixes every output file.
        File    genome_fasta
        # Each optional file below overrides the same-purpose file in the working directory.
        File?   root_sequence
        File?   auspice_reference_tree_json
        File?   qc_config_json
        File?   gene_annotations_json
        File?   pcr_primers_csv
        # When set, `nextclade dataset get --name=<dataset_name>` is run before classification.
        String? dataset_name
        String  docker = "nextstrain/nextclade:1.4.0"
    }
    String basename = basename(genome_fasta, ".fasta")
    command <<<
        set -e
        # python3 is installed at run time for the helper scripts below
        apt-get update
        apt-get -y install python3

        export NEXTCLADE_VERSION="$(nextclade --version)"
        echo $NEXTCLADE_VERSION > VERSION

        # Grab the named nextclade dataset and record its name and tag in VERSION.
        # The heredoc lines are kept at the base indentation of this command block so
        # that the Python code and the terminator start at column 0 after de-indenting.
        if [ -n "~{default='' dataset_name}" ]; then
            nextclade dataset get --name="~{default='' dataset_name}" --output-dir=.
            python3 <<'CODE1'
        import json, os
        with open('tag.json', 'rt') as inf:
            datasetinfo = json.load(inf)
        with open('VERSION', 'wt') as outf:
            outf.write(os.environ['NEXTCLADE_VERSION'] + "; name=" + datasetinfo['name'] + "; tag=" + datasetinfo['tag'] + "\n")
        CODE1
        fi

        # Paths are quoted so that localized file names containing spaces survive.
        nextclade \
            --input-fasta "~{genome_fasta}" \
            --input-root-seq "~{default='reference.fasta' root_sequence}" \
            --input-tree "~{default='tree.json' auspice_reference_tree_json}" \
            --input-qc-config "~{default='qc.json' qc_config_json}" \
            --input-gene-map "~{default='genemap.gff' gene_annotations_json}" \
            --input-pcr-primers "~{default='primers.csv' pcr_primers_csv}" \
            --output-json "~{basename}".nextclade.json \
            --output-tsv "~{basename}".nextclade.tsv \
            --output-tree "~{basename}".nextclade.auspice.json
        cp "~{basename}".nextclade.tsv input.tsv

        python3 <<'CODE'
        # Pull clade / aaSubstitutions / aaDeletions of the first result row into
        # one-line files. Columns are looked up by exact header name, so empty
        # trailing fields or other columns sharing a name prefix cannot shift or
        # shadow the values.
        import csv, sys
        with open('input.tsv', 'r', encoding='utf-8', newline='') as inf:
            first_row = next(csv.DictReader(inf, delimiter='\t'), None)
        if first_row is None:
            # Without a result row there is nothing to report: fail the task.
            sys.exit('nextclade TSV contains no result rows')
        targets = (
            ('clade', 'NEXTCLADE_CLADE'),
            ('aaSubstitutions', 'NEXTCLADE_AASUBS'),
            ('aaDeletions', 'NEXTCLADE_AADELS'),
        )
        for column, out_name in targets:
            # A missing column raises KeyError, which also fails the task.
            with open(out_name, 'w', encoding='utf-8') as outf:
                outf.write((first_row[column] or '') + '\n')
        CODE
    >>>
    runtime {
        docker: docker
        memory: "3 GB"
        cpu:    2
        disks: "local-disk 50 HDD"
        dx_instance_type: "mem1_ssd1_v2_x2"
    }
    output {
        String nextclade_version = read_string("VERSION")
        File   nextclade_json    = "~{basename}.nextclade.json"
        File   auspice_json      = "~{basename}.nextclade.auspice.json"
        File   nextclade_tsv     = "~{basename}.nextclade.tsv"
        String nextclade_clade   = read_string("NEXTCLADE_CLADE")
        String aa_subs_csv       = read_string("NEXTCLADE_AASUBS")
        String aa_dels_csv       = read_string("NEXTCLADE_AADELS")
    }
}
78+
79+
task nextclade_many_samples {
    meta {
        description: "Nextclade classification of many samples. Optional reference inputs left unspecified default to reference.fasta, tree.json, qc.json, genemap.gff and primers.csv in the working directory, which is where the dataset named by dataset_name is downloaded."
    }
    input {
        # Genomes to classify; they are concatenated into one FASTA before the run.
        Array[File]+ genome_fastas
        # Each optional file below overrides the same-purpose file in the working directory.
        File?        root_sequence
        File?        auspice_reference_tree_json
        File?        qc_config_json
        File?        gene_annotations_json
        File?        pcr_primers_csv
        # When set, `nextclade dataset get --name=<dataset_name>` is run before classification.
        String?      dataset_name
        # Prefix of every output file name.
        String       basename
        String       docker = "nextstrain/nextclade:1.4.0"
    }
    command <<<
        set -e
        # python3 is installed at run time for the helper scripts below
        apt-get update
        apt-get -y install python3

        export NEXTCLADE_VERSION="$(nextclade --version)"
        echo $NEXTCLADE_VERSION > VERSION

        # Grab the named nextclade dataset and record its name and tag in VERSION.
        # The heredoc lines are kept at the base indentation of this command block so
        # that the Python code and the terminator start at column 0 after de-indenting.
        if [ -n "~{default='' dataset_name}" ]; then
            nextclade dataset get --name="~{default='' dataset_name}" --output-dir=.
            python3 <<'CODE1'
        import json, os
        with open('tag.json', 'rt') as inf:
            datasetinfo = json.load(inf)
        with open('VERSION', 'wt') as outf:
            outf.write(os.environ['NEXTCLADE_VERSION'] + "; name=" + datasetinfo['name'] + "; tag=" + datasetinfo['tag'] + "\n")
        CODE1
        fi

        cat ~{sep=" " genome_fastas} > genomes.fasta
        # Paths are quoted so that localized file names containing spaces survive.
        nextclade \
            --input-fasta genomes.fasta \
            --input-root-seq "~{default='reference.fasta' root_sequence}" \
            --input-tree "~{default='tree.json' auspice_reference_tree_json}" \
            --input-qc-config "~{default='qc.json' qc_config_json}" \
            --input-gene-map "~{default='genemap.gff' gene_annotations_json}" \
            --input-pcr-primers "~{default='primers.csv' pcr_primers_csv}" \
            --output-json "~{basename}".nextclade.json \
            --output-tsv "~{basename}".nextclade.tsv \
            --output-tree "~{basename}".nextclade.auspice.json

        cp genomes.aligned.fasta "~{basename}".nextalign.msa.fasta

        python3 <<'CODE'
        # Build per-sequence lookups (seqName -> value) for clade, aaSubstitutions
        # and aaDeletions: once as two-column TSV and once as JSON maps, the latter
        # being what the task outputs read.
        import csv, json
        targets = (
            ('clade', 'NEXTCLADE_CLADE'),
            ('aaSubstitutions', 'NEXTCLADE_AASUBS'),
            ('aaDeletions', 'NEXTCLADE_AADELS'),
        )
        with open('~{basename}.nextclade.tsv', 'r', encoding='utf-8', newline='') as inf:
            rows = list(csv.DictReader(inf, delimiter='\t'))
        for column, out_name in targets:
            by_seq = {}
            with open(out_name, 'w', encoding='utf-8') as outf:
                for row in rows:
                    outf.write('\t'.join([row['seqName'], row[column]]) + '\n')
                    by_seq[row['seqName']] = row[column]
            with open(out_name + '.json', 'w', encoding='utf-8') as outf:
                json.dump(by_seq, outf)
        CODE

        # gather runtime metrics
        cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
        cat /proc/loadavg > CPU_LOAD
        # The cgroup-v1 counter is tried first; if it is absent, fall back to the
        # cgroup-v2 peak file and finally to 0, so that a missing metric cannot
        # abort (via set -e) a run whose classification already succeeded.
        { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes 2>/dev/null || cat /sys/fs/cgroup/memory.peak 2>/dev/null || echo 0; } > MEM_BYTES
    >>>
    runtime {
        docker: docker
        memory: "14 GB"
        cpu:    16
        disks: "local-disk 100 HDD"
        dx_instance_type: "mem1_ssd1_v2_x16"
    }
    output {
        # The JSON maps are read instead of the two-column TSV files
        # (NEXTCLADE_CLADE, NEXTCLADE_AASUBS, NEXTCLADE_AADELS) written alongside them.
        Map[String,String] nextclade_clade   = read_json("NEXTCLADE_CLADE.json")
        Map[String,String] aa_subs_csv       = read_json("NEXTCLADE_AASUBS.json")
        Map[String,String] aa_dels_csv       = read_json("NEXTCLADE_AADELS.json")
        String             nextclade_version = read_string("VERSION")
        File               nextalign_msa     = "~{basename}.nextalign.msa.fasta"
        File               nextclade_json    = "~{basename}.nextclade.json"
        File               auspice_json      = "~{basename}.nextclade.auspice.json"
        File               nextclade_tsv     = "~{basename}.nextclade.tsv"
        Int                max_ram_gb        = ceil(read_float("MEM_BYTES")/1000000000)
        Int                runtime_sec       = ceil(read_float("UPTIME_SEC"))
        String             cpu_load          = read_string("CPU_LOAD")
    }
}
3179

4180
task nextmeta_prep {
5181
input {

pipes/WDL/tasks/tasks_sarscov2.wdl

Lines changed: 2 additions & 169 deletions
Original file line numberDiff line numberDiff line change
@@ -1,172 +1,5 @@
11
version 1.0
22

3-
task nextclade_one_sample {
4-
meta {
5-
description: "Nextclade classification of one sample. Leaving optional inputs unspecified will use SARS-CoV-2 defaults."
6-
}
7-
input {
8-
File genome_fasta
9-
File? root_sequence
10-
File? auspice_reference_tree_json
11-
File? qc_config_json
12-
File? gene_annotations_json
13-
File? pcr_primers_csv
14-
String docker = "nextstrain/nextclade:1.2.3"
15-
}
16-
String basename = basename(genome_fasta, ".fasta")
17-
command {
18-
set -e
19-
apt-get update
20-
apt-get -y install python3
21-
22-
URI=$(echo "~{docker}" | sed 's|:|/|g')
23-
NEXTCLADE_VERSION="$(nextclade --version)"
24-
echo $NEXTCLADE_VERSION > VERSION
25-
26-
# grab reference data for SARS-CoV-2
27-
curl https://raw.githubusercontent.com/$URI/data/sars-cov-2/reference.fasta > reference.fasta
28-
curl https://raw.githubusercontent.com/$URI/data/sars-cov-2/genemap.gff > genemap.gff
29-
curl https://raw.githubusercontent.com/$URI/data/sars-cov-2/tree.json > tree.json
30-
curl https://raw.githubusercontent.com/$URI/data/sars-cov-2/qc.json > qc.json
31-
curl https://raw.githubusercontent.com/$URI/data/sars-cov-2/primers.csv > primers.csv
32-
33-
nextclade \
34-
--input-fasta "~{genome_fasta}" \
35-
--input-root-seq ~{default="reference.fasta" root_sequence} \
36-
--input-tree ~{default="tree.json" auspice_reference_tree_json} \
37-
--input-qc-config ~{default="qc.json" qc_config_json} \
38-
--input-gene-map ~{default="genemap.gff" gene_annotations_json} \
39-
--input-pcr-primers ~{default="primers.csv" pcr_primers_csv} \
40-
--output-json "~{basename}".nextclade.json \
41-
--output-tsv "~{basename}".nextclade.tsv \
42-
--output-tree "~{basename}".nextclade.auspice.json
43-
cp "~{basename}".nextclade.tsv input.tsv
44-
python3 <<CODE
45-
# transpose table
46-
import codecs
47-
with codecs.open('input.tsv', 'r', encoding='utf-8') as inf:
48-
with codecs.open('transposed.tsv', 'w', encoding='utf-8') as outf:
49-
for c in zip(*(l.rstrip().split('\t') for l in inf)):
50-
outf.write('\t'.join(c)+'\n')
51-
CODE
52-
grep ^clade transposed.tsv | cut -f 2 | grep -v clade > NEXTCLADE_CLADE
53-
grep ^aaSubstitutions transposed.tsv | cut -f 2 | grep -v aaSubstitutions > NEXTCLADE_AASUBS
54-
grep ^aaDeletions transposed.tsv | cut -f 2 | grep -v aaDeletions > NEXTCLADE_AADELS
55-
}
56-
runtime {
57-
docker: docker
58-
memory: "3 GB"
59-
cpu: 2
60-
disks: "local-disk 50 HDD"
61-
dx_instance_type: "mem1_ssd1_v2_x2"
62-
}
63-
output {
64-
String nextclade_version = read_string("VERSION")
65-
File nextclade_json = "~{basename}.nextclade.json"
66-
File auspice_json = "~{basename}.nextclade.auspice.json"
67-
File nextclade_tsv = "~{basename}.nextclade.tsv"
68-
String nextclade_clade = read_string("NEXTCLADE_CLADE")
69-
String aa_subs_csv = read_string("NEXTCLADE_AASUBS")
70-
String aa_dels_csv = read_string("NEXTCLADE_AADELS")
71-
}
72-
}
73-
74-
task nextclade_many_samples {
75-
meta {
76-
description: "Nextclade classification of many samples. Leaving optional inputs unspecified will use SARS-CoV-2 defaults."
77-
}
78-
input {
79-
Array[File]+ genome_fastas
80-
File? root_sequence
81-
File? auspice_reference_tree_json
82-
File? qc_config_json
83-
File? gene_annotations_json
84-
File? pcr_primers_csv
85-
String basename
86-
String docker = "nextstrain/nextclade:1.2.3"
87-
}
88-
command <<<
89-
set -e
90-
apt-get update
91-
apt-get -y install python3
92-
93-
URI=$(echo "~{docker}" | sed 's|:|/|g')
94-
NEXTCLADE_VERSION="$(nextclade --version)"
95-
echo $NEXTCLADE_VERSION > VERSION
96-
97-
# grab reference data for SARS-CoV-2
98-
curl https://raw.githubusercontent.com/$URI/data/sars-cov-2/reference.fasta > reference.fasta
99-
curl https://raw.githubusercontent.com/$URI/data/sars-cov-2/genemap.gff > genemap.gff
100-
curl https://raw.githubusercontent.com/$URI/data/sars-cov-2/tree.json > tree.json
101-
curl https://raw.githubusercontent.com/$URI/data/sars-cov-2/qc.json > qc.json
102-
curl https://raw.githubusercontent.com/$URI/data/sars-cov-2/primers.csv > primers.csv
103-
104-
cat ~{sep=" " genome_fastas} > genomes.fasta
105-
nextclade \
106-
--input-fasta genomes.fasta \
107-
--input-root-seq ~{default="reference.fasta" root_sequence} \
108-
--input-tree ~{default="tree.json" auspice_reference_tree_json} \
109-
--input-qc-config ~{default="qc.json" qc_config_json} \
110-
--input-gene-map ~{default="genemap.gff" gene_annotations_json} \
111-
--input-pcr-primers ~{default="primers.csv" pcr_primers_csv} \
112-
--output-json "~{basename}".nextclade.json \
113-
--output-tsv "~{basename}".nextclade.tsv \
114-
--output-tree "~{basename}".nextclade.auspice.json
115-
116-
cp genomes.aligned.fasta "~{basename}".nextalign.msa.fasta
117-
118-
python3 <<CODE
119-
# transpose table
120-
import codecs, csv, json
121-
out_maps = {'clade':{}, 'aaSubstitutions':{}, 'aaDeletions':{}}
122-
with codecs.open('~{basename}.nextclade.tsv', 'r', encoding='utf-8') as inf:
123-
with codecs.open('NEXTCLADE_CLADE', 'w', encoding='utf-8') as outf_clade:
124-
with codecs.open('NEXTCLADE_AASUBS', 'w', encoding='utf-8') as outf_aasubs:
125-
with codecs.open('NEXTCLADE_AADELS', 'w', encoding='utf-8') as outf_aadels:
126-
for row in csv.DictReader(inf, delimiter='\t'):
127-
outf_clade.write('\t'.join([row['seqName'], row['clade']])+'\n')
128-
outf_aasubs.write('\t'.join([row['seqName'], row['aaSubstitutions']])+'\n')
129-
outf_aadels.write('\t'.join([row['seqName'], row['aaDeletions']])+'\n')
130-
for k in ('clade','aaSubstitutions','aaDeletions'):
131-
out_maps[k][row['seqName']] = row[k]
132-
with codecs.open('NEXTCLADE_CLADE.json', 'w', encoding='utf-8') as outf:
133-
json.dump(out_maps['clade'], outf)
134-
with codecs.open('NEXTCLADE_AASUBS.json', 'w', encoding='utf-8') as outf:
135-
json.dump(out_maps['aaSubstitutions'], outf)
136-
with codecs.open('NEXTCLADE_AADELS.json', 'w', encoding='utf-8') as outf:
137-
json.dump(out_maps['aaDeletions'], outf)
138-
CODE
139-
140-
# gather runtime metrics
141-
cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC
142-
cat /proc/loadavg > CPU_LOAD
143-
cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes > MEM_BYTES
144-
>>>
145-
runtime {
146-
docker: docker
147-
memory: "14 GB"
148-
cpu: 16
149-
disks: "local-disk 100 HDD"
150-
dx_instance_type: "mem1_ssd1_v2_x16"
151-
}
152-
output {
153-
#Map[String,String] nextclade_clade = read_map("NEXTCLADE_CLADE")
154-
#Map[String,String] aa_subs_csv = read_map("NEXTCLADE_AASUBS")
155-
#Map[String,String] aa_dels_csv = read_map("NEXTCLADE_AADELS")
156-
Map[String,String] nextclade_clade = read_json("NEXTCLADE_CLADE.json")
157-
Map[String,String] aa_subs_csv = read_json("NEXTCLADE_AASUBS.json")
158-
Map[String,String] aa_dels_csv = read_json("NEXTCLADE_AADELS.json")
159-
String nextclade_version = read_string("VERSION")
160-
File nextalign_msa = "~{basename}.nextalign.msa.fasta"
161-
File nextclade_json = "~{basename}.nextclade.json"
162-
File auspice_json = "~{basename}.nextclade.auspice.json"
163-
File nextclade_tsv = "~{basename}.nextclade.tsv"
164-
Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000)
165-
Int runtime_sec = ceil(read_float("UPTIME_SEC"))
166-
String cpu_load = read_string("CPU_LOAD")
167-
}
168-
}
169-
1703
task pangolin_one_sample {
1714
meta {
1725
description: "Pangolin classification of one SARS-CoV-2 sample."
@@ -176,7 +9,7 @@ task pangolin_one_sample {
1769
Int? min_length
17710
Float? max_ambig
17811
Boolean inference_usher=true
179-
String docker = "quay.io/staphb/pangolin:3.1.11-pangolearn-2021-09-17"
12+
String docker = "quay.io/staphb/pangolin:3.1.14-pangolearn-2021-09-28"
18013
}
18114
String basename = basename(genome_fasta, ".fasta")
18215
command <<<
@@ -248,7 +81,7 @@ task pangolin_many_samples {
24881
Float? max_ambig
24982
Boolean inference_usher=true
25083
String basename
251-
String docker = "quay.io/staphb/pangolin:3.1.11-pangolearn-2021-09-17"
84+
String docker = "quay.io/staphb/pangolin:3.1.14-pangolearn-2021-09-28"
25285
}
25386
command <<<
25487
date | tee DATE

pipes/WDL/workflows/sarscov2_batch_relineage.wdl

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
version 1.0
22

3+
import "../tasks/tasks_nextstrain.wdl" as nextstrain
34
import "../tasks/tasks_sarscov2.wdl" as sarscov2
45
import "../tasks/tasks_utils.wdl" as utils
56

@@ -33,10 +34,11 @@ workflow sarscov2_batch_relineage {
3334
sequences_fasta = filter_sequences_by_length.filtered_fasta
3435
}
3536
36-
call sarscov2.nextclade_many_samples {
37+
call nextstrain.nextclade_many_samples {
3738
input:
3839
genome_fastas = [filter_sequences_by_length.filtered_fasta],
39-
basename = "nextclade-~{flowcell_id}"
40+
basename = "nextclade-~{flowcell_id}",
41+
dataset_name = "sars-cov-2"
4042
}
4143
4244
call sarscov2.pangolin_many_samples {

0 commit comments

Comments
 (0)