Skip to content

Commit ff81708

Browse files
authored
Merge pull request #200 from broadinstitute/dp-consensus
sarscov2 large workflow improvements
2 parents e04bb34 + 7220216 commit ff81708

File tree

7 files changed

+168
-108
lines changed

7 files changed

+168
-108
lines changed

pipes/WDL/tasks/tasks_assembly.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,7 @@ task refine_assembly_with_aligned_reads {
418418

419419
runtime {
420420
docker: "${docker}"
421-
memory: select_first([machine_mem_gb, 7]) + " GB"
421+
memory: select_first([machine_mem_gb, 15]) + " GB"
422422
cpu: 8
423423
disks: "local-disk 375 LOCAL"
424424
dx_instance_type: "mem1_ssd1_v2_x8"

pipes/WDL/tasks/tasks_ncbi.wdl

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -252,8 +252,8 @@ task prefix_fasta_header {
252252
input {
253253
File genome_fasta
254254
String prefix
255+
String out_basename = basename(genome_fasta, ".fasta")
255256
}
256-
String out_basename = basename(genome_fasta, ".fasta")
257257
command <<<
258258
set -e
259259
python3 <<CODE
@@ -308,6 +308,12 @@ task gisaid_meta_prep {
308308
String out_name
309309
String continent = "North America"
310310
Boolean strict = true
311+
String? username
312+
String? submitting_lab_name
313+
String? submitting_lab_addr
314+
String? originating_lab_addr
315+
String? authors
316+
String? fasta_filename
311317
}
312318
command <<<
313319
python3 << CODE
@@ -347,12 +353,12 @@ task gisaid_meta_prep {
347353
'covv_seq_technology': sample_to_cmt[row['Sequence_ID']]['Sequencing Technology'],
348354
349355
'covv_orig_lab': row['collected_by'],
350-
'covv_subm_lab': 'REQUIRED',
351-
'covv_authors': 'REQUIRED',
352-
'covv_orig_lab_addr': 'REQUIRED',
353-
'covv_subm_lab_addr': 'REQUIRED',
354-
'submitter': 'REQUIRED',
355-
'fn': 'REQUIRED',
356+
'covv_subm_lab': "~{default='REQUIRED' submitting_lab_name}",
357+
'covv_authors': "~{default='REQUIRED' authors}",
358+
'covv_orig_lab_addr': "~{default='REQUIRED' originating_lab_addr}",
359+
'covv_subm_lab_addr': "~{default='REQUIRED' submitting_lab_addr}",
360+
'submitter': "~{default='REQUIRED' username}",
361+
'fn': "~{default='REQUIRED' fasta_filename}",
356362
})
357363
358364
#covv_specimen
@@ -551,10 +557,12 @@ task biosample_to_genbank {
551557
--biosample_in_smt \
552558
--iso_dates \
553559
--loglevel DEBUG
560+
cut -f 1 "${base}.genbank.src" | tail +2 > "${base}.sample_ids.txt"
554561
}
555562
output {
556563
File genbank_source_modifier_table = "${base}.genbank.src"
557564
File biosample_map = "${base}.biosample.map.txt"
565+
File sample_ids = "${base}.sample_ids.txt"
558566
}
559567
runtime {
560568
docker: docker

pipes/WDL/tasks/tasks_ncbi_tools.wdl

Lines changed: 19 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,7 @@ task group_sra_bams_by_biosample {
146146
Array[String] biosamples
147147
Array[File] biosample_attributes_jsons
148148
Array[String] library_strategies
149+
Array[String] seq_platforms
149150
}
150151
parameter_meta {
151152
bam_filepaths: {
@@ -165,14 +166,16 @@ task group_sra_bams_by_biosample {
165166
biosample_accs = '~{sep="*" biosamples}'.split('*')
166167
attributes = '~{sep="*" biosample_attributes_jsons}'.split('*')
167168
libstrats = '~{sep="*" library_strategies}'.split('*')
168-
assert len(bam_uris) == len(biosample_accs) == len(attributes) == len(libstrats)
169+
seqplats = '~{sep="*" seq_platforms}'.split('*')
170+
assert len(bam_uris) == len(biosample_accs) == len(attributes) == len(libstrats) == len(seqplats)
169171
170172
# lookup table files to dicts
171173
sample_to_bams = {}
172174
sample_to_attributes = {}
173175
sample_to_libstrat = {}
176+
sample_to_seqplat = {}
174177
attr_keys = set()
175-
for samn,bam,attr_file,libstrat in zip(biosample_accs,bam_uris, attributes, libstrats):
178+
for samn,bam,attr_file,libstrat,seqplat in zip(biosample_accs,bam_uris, attributes, libstrats, seqplats):
176179
sample_to_bams.setdefault(samn, [])
177180
sample_to_bams[samn].append(bam)
178181
with open(attr_file, 'rt') as inf:
@@ -181,24 +184,30 @@ task group_sra_bams_by_biosample {
181184
attr_keys.update(k for k,v in attr.items() if v)
182185
sample_to_libstrat.setdefault(samn, set())
183186
sample_to_libstrat[samn].add(libstrat)
187+
sample_to_seqplat.setdefault(samn, set())
188+
sample_to_seqplat[samn].add(seqplat)
184189
185190
# write outputs
186-
with open('attributes.json', 'wt') as out_attr:
187-
json.dump(sample_to_attributes, out_attr)
188-
with open('attributes.tsv', 'wt') as out_attr:
191+
with open('attributes.json', 'wt') as outf:
192+
json.dump(sample_to_attributes, outf)
193+
with open('attributes.tsv', 'wt') as outf:
189194
headers = tuple(sorted(attr_keys))
190-
out_attr.write('\t'.join(headers)+'\n')
195+
outf.write('\t'.join(headers)+'\n')
191196
for sample in sorted(sample_to_bams.keys()):
192-
out_attr.write('\t'.join(sample_to_attributes[sample].get(h,'') for h in headers)+'\n')
197+
outf.write('\t'.join(sample_to_attributes[sample].get(h,'') for h in headers)+'\n')
193198
with open('grouped_bams', 'wt') as out_groups:
194199
with open('samns', 'wt') as out_samples:
195200
for sample in sorted(sample_to_bams.keys()):
196201
out_samples.write(sample+'\n')
197202
out_groups.write('\t'.join(sample_to_bams[sample])+'\n')
198-
with open('library_strategies.json', 'wt') as out_attr:
203+
with open('library_strategies.json', 'wt') as outf:
199204
for k,v in sample_to_libstrat.items():
200205
sample_to_libstrat[k] = ';'.join(sorted(v))
201-
json.dump(sample_to_libstrat, out_attr)
206+
json.dump(sample_to_libstrat, outf)
207+
with open('sequencing_platforms.json', 'wt') as outf:
208+
for k,v in sample_to_seqplat.items():
209+
sample_to_seqplat[k] = ';'.join(sorted(v))
210+
json.dump(sample_to_seqplat, outf)
202211
CODE
203212
>>>
204213
output {
@@ -207,6 +216,7 @@ task group_sra_bams_by_biosample {
207216
Map[String,Map[String,String]] samn_to_attributes = read_json('attributes.json')
208217
File biosample_attributes_tsv = 'attributes.tsv'
209218
Map[String,String] samn_to_library_strategy = read_json('library_strategies.json')
219+
Map[String,String] samn_to_sequencing_platform = read_json('sequencing_platforms.json')
210220
}
211221
runtime {
212222
docker: "python:slim"

pipes/WDL/tasks/tasks_read_utils.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ task max {
1212
CODE
1313
>>>
1414
output {
15-
Int max = read_int(stdout())
15+
Int out = read_int(stdout())
1616
}
1717
runtime {
1818
docker: "python:slim"

pipes/WDL/workflows/demux_deplete.wdl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -152,6 +152,7 @@ workflow demux_deplete {
152152
Array[Int] read_counts_depleted = deplete.depletion_read_count_post
153153

154154
File? sra_metadata = sra_meta_prep.sra_metadata
155+
File? cleaned_bam_uris = sra_meta_prep.cleaned_bam_uris
155156

156157
Array[File] demux_metrics = illumina_demux.metrics
157158
Array[File] demux_commonBarcodes = illumina_demux.commonBarcodes

pipes/WDL/workflows/sarscov2_illumina_full.wdl

Lines changed: 84 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ version 1.0
33
import "../tasks/tasks_read_utils.wdl" as read_utils
44
import "../tasks/tasks_ncbi.wdl" as ncbi
55
import "../tasks/tasks_nextstrain.wdl" as nextstrain
6+
import "../tasks/tasks_reports.wdl" as reports
67

78
import "demux_deplete.wdl"
89
import "assemble_refbased.wdl"
@@ -38,22 +39,33 @@ workflow sarscov2_illumina_full {
3839
}
3940

4041
input {
42+
File flowcell_tgz
4143
File reference_fasta
4244
String amplicon_bed_prefix
4345

44-
File biosample_attributes
46+
Array[File] biosample_attributes
4547
String instrument_model
4648
String sra_title
4749

48-
Int min_genome_bases = 20000
50+
Int min_genome_bases = 15000
4951
}
5052
Int taxid = 2697049
5153
String gisaid_prefix = 'hCoV-19/'
54+
String flowcell_id = basename(basename(basename(basename(flowcell_tgz, ".gz"), ".zst"), ".tar"), ".tgz")
55+
56+
# merge biosample attributes tables
57+
call reports.tsv_join as biosample_merge {
58+
input:
59+
input_tsvs = biosample_attributes,
60+
id_col = 'accession',
61+
out_basename = "biosample_attributes-merged"
62+
}
5263
5364
### demux, deplete, SRA submission prep, fastqc/multiqc
5465
call demux_deplete.demux_deplete {
5566
input:
56-
biosample_map = biosample_attributes,
67+
flowcell_tgz = flowcell_tgz,
68+
biosample_map = biosample_merge.out_tsv,
5769
instrument_model = instrument_model,
5870
sra_title = sra_title
5971
}
@@ -99,7 +111,7 @@ workflow sarscov2_illumina_full {
99111
100112
File passing_assemblies = rename_fasta_header.renamed_fasta
101113
String passing_assembly_ids = orig_name
102-
Array[String] assembly_cmt = [orig_name, "Broad viral-ngs v. " + demux_deplete.demux_viral_core_version, assemble_refbased.assembly_mean_coverage]
114+
Array[String] assembly_cmt = [orig_name, "Broad viral-ngs v. " + demux_deplete.demux_viral_core_version, assemble_refbased.assembly_mean_coverage, instrument_model]
103115
104116
# lineage assignment
105117
call sarscov2_lineages.sarscov2_lineages {
@@ -124,81 +136,100 @@ workflow sarscov2_illumina_full {
124136
String failed_assembly_id = orig_name
125137
}
126138

127-
Map[String,String?] assembly_stats = {
128-
'sample_orig': orig_name,
129-
'sample': name_reads.left,
130-
'amplicon_set': demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"],
131-
'assembly_mean_coverage': assemble_refbased.assembly_mean_coverage,
132-
'nextclade_clade': sarscov2_lineages.nextclade_clade,
133-
'nextclade_aa_subs': sarscov2_lineages.nextclade_aa_subs,
134-
'nextclade_aa_dels': sarscov2_lineages.nextclade_aa_dels,
135-
'pango_lineage': sarscov2_lineages.pango_lineage
136-
}
137-
Map[String,File?] assembly_files = {
138-
'assembly_fasta': assemble_refbased.assembly_fasta,
139-
'coverage_plot': assemble_refbased.align_to_ref_merged_coverage_plot,
140-
'aligned_bam': assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam,
141-
'replicate_discordant_vcf': assemble_refbased.replicate_discordant_vcf,
142-
'nextclade_tsv': sarscov2_lineages.nextclade_tsv,
143-
'pangolin_csv': sarscov2_lineages.pangolin_csv,
144-
'vadr_tgz': vadr.outputs_tgz
145-
}
146-
Map[String,Int?] assembly_metrics = {
147-
'assembly_length_unambiguous': assemble_refbased.assembly_length_unambiguous,
148-
'dist_to_ref_snps': assemble_refbased.dist_to_ref_snps,
149-
'dist_to_ref_indels': assemble_refbased.dist_to_ref_indels,
150-
'replicate_concordant_sites': assemble_refbased.replicate_concordant_sites,
151-
'replicate_discordant_snps': assemble_refbased.replicate_discordant_snps,
152-
'replicate_discordant_indels': assemble_refbased.replicate_discordant_indels,
153-
'num_read_groups': assemble_refbased.num_read_groups,
154-
'num_libraries': assemble_refbased.num_libraries,
155-
'vadr_num_alerts': vadr.num_alerts
156-
}
157-
139+
Array[String] assembly_tsv_row = [
140+
orig_name,
141+
name_reads.left,
142+
demux_deplete.meta_by_sample[name_reads.left]["amplicon_set"],
143+
assemble_refbased.assembly_mean_coverage,
144+
assemble_refbased.assembly_length_unambiguous,
145+
select_first([sarscov2_lineages.nextclade_clade, ""]),
146+
select_first([sarscov2_lineages.nextclade_aa_subs, ""]),
147+
select_first([sarscov2_lineages.nextclade_aa_dels, ""]),
148+
select_first([sarscov2_lineages.pango_lineage, ""]),
149+
assemble_refbased.dist_to_ref_snps,
150+
assemble_refbased.dist_to_ref_indels,
151+
select_first([vadr.num_alerts, ""]),
152+
assemble_refbased.assembly_fasta,
153+
assemble_refbased.align_to_ref_merged_coverage_plot,
154+
assemble_refbased.align_to_ref_merged_aligned_trimmed_only_bam,
155+
assemble_refbased.replicate_discordant_vcf,
156+
select_first([sarscov2_lineages.nextclade_tsv, ""]),
157+
select_first([sarscov2_lineages.pangolin_csv, ""]),
158+
select_first([vadr.outputs_tgz, ""]),
159+
assemble_refbased.replicate_concordant_sites,
160+
assemble_refbased.replicate_discordant_snps,
161+
assemble_refbased.replicate_discordant_indels,
162+
assemble_refbased.num_read_groups,
163+
assemble_refbased.num_libraries,
164+
]
165+
}
166+
Array[String] assembly_tsv_header = [
167+
'sample', 'sample_sanitized', 'amplicon_set', 'assembly_mean_coverage', 'assembly_length_unambiguous',
168+
'nextclade_clade', 'nextclade_aa_subs', 'nextclade_aa_dels', 'pango_lineage',
169+
'dist_to_ref_snps', 'dist_to_ref_indels', 'vadr_num_alerts',
170+
'assembly_fasta', 'coverage_plot', 'aligned_bam', 'replicate_discordant_vcf',
171+
'nextclade_tsv', 'pangolin_csv', 'vadr_tgz',
172+
'replicate_concordant_sites', 'replicate_discordant_snps', 'replicate_discordant_indels', 'num_read_groups', 'num_libraries',
173+
]
174+
175+
call nextstrain.concatenate as assembly_meta_tsv {
176+
input:
177+
infiles = [write_tsv([assembly_tsv_header]), write_tsv(assembly_tsv_row)],
178+
output_name = "assembly_metadata-~{flowcell_id}.tsv"
158179
}
159180
160-
# TO DO: filter out genomes from submission that are less than ntc_bases.max
161-
call read_utils.max as ntc {
181+
182+
# TO DO: filter out genomes from submission that are less than ntc_max.out
183+
call read_utils.max as ntc_max {
162184
input:
163185
list = select_all(ntc_bases)
164186
}
165187
166188
### prep genbank submission
167-
call nextstrain.concatenate as submit_genomes {
168-
input:
169-
infiles = select_all(submittable_genomes),
170-
output_name = "assemblies.fasta"
171-
}
172189
call ncbi.biosample_to_genbank {
173190
input:
174-
biosample_attributes = biosample_attributes,
191+
biosample_attributes = biosample_merge.out_tsv,
175192
num_segments = 1,
176193
taxid = taxid,
177194
filter_to_ids = write_lines(select_all(submittable_id))
178195
}
179196
call ncbi.structured_comments {
180197
input:
181-
assembly_stats_tsv = write_tsv(flatten([[['SeqID','Assembly Method','Coverage']],select_all(assembly_cmt)])),
182-
filter_to_ids = write_lines(select_all(submittable_id))
198+
assembly_stats_tsv = write_tsv(flatten([[['SeqID','Assembly Method','Coverage','Sequencing Technology']],select_all(assembly_cmt)])),
199+
filter_to_ids = biosample_to_genbank.sample_ids
200+
}
201+
call nextstrain.concatenate as passing_genomes {
202+
input:
203+
infiles = select_all(submittable_genomes),
204+
output_name = "assemblies.fasta"
205+
}
206+
call nextstrain.filter_sequences_to_list as submit_genomes {
207+
input:
208+
sequences = passing_genomes.combined,
209+
keep_list = [biosample_to_genbank.sample_ids]
183210
}
184211
call ncbi.package_genbank_ftp_submission {
185212
input:
186-
sequences_fasta = submit_genomes.combined,
213+
sequences_fasta = submit_genomes.filtered_fasta,
187214
source_modifier_table = biosample_to_genbank.genbank_source_modifier_table,
188-
structured_comment_table = structured_comments.structured_comment_table
215+
structured_comment_table = structured_comments.structured_comment_table,
216+
submission_name = flowcell_id,
217+
submission_uid = flowcell_id
189218
}
190219
191220
### prep gisaid submission
192221
call ncbi.prefix_fasta_header as prefix_gisaid {
193222
input:
194-
genome_fasta = submit_genomes.combined,
195-
prefix = gisaid_prefix
223+
genome_fasta = submit_genomes.filtered_fasta,
224+
prefix = gisaid_prefix,
225+
out_basename = "gisaid-sequences-~{flowcell_id}"
196226
}
197227
call ncbi.gisaid_meta_prep {
198228
input:
199229
source_modifier_table = biosample_to_genbank.genbank_source_modifier_table,
200230
structured_comments = structured_comments.structured_comment_table,
201-
out_name = "gisaid_meta.tsv"
231+
fasta_filename = "gisaid-sequences-~{flowcell_id}.fasta",
232+
out_name = "gisaid-meta-~{flowcell_id}.tsv"
202233
}
203234
204235
output {
@@ -212,12 +243,13 @@ workflow sarscov2_illumina_full {
212243
Array[Int] read_counts_depleted = demux_deplete.read_counts_depleted
213244

214245
File sra_metadata = select_first([demux_deplete.sra_metadata])
246+
File cleaned_bam_uris = select_first([demux_deplete.cleaned_bam_uris])
215247

216248
Array[File] assemblies_fasta = assemble_refbased.assembly_fasta
217249
Array[File] passing_assemblies_fasta = select_all(passing_assemblies)
218250
Array[File] submittable_assemblies_fasta = select_all(submittable_genomes)
219251

220-
Int max_ntc_bases = ntc.max
252+
Int max_ntc_bases = ntc_max.out
221253

222254
Array[File] demux_metrics = demux_deplete.demux_metrics
223255
Array[File] demux_commonBarcodes = demux_deplete.demux_commonBarcodes
@@ -227,9 +259,7 @@ workflow sarscov2_illumina_full {
227259
File multiqc_report_cleaned = demux_deplete.multiqc_report_cleaned
228260
File spikein_counts = demux_deplete.spikein_counts
229261

230-
Array[Map[String,String?]] per_assembly_stats = assembly_stats
231-
Array[Map[String,File?]] per_assembly_files = assembly_files
232-
Array[Map[String,Int?]] per_assembly_metrics = assembly_metrics
262+
File assembly_stats_tsv = assembly_meta_tsv.combined
233263

234264
File submission_zip = package_genbank_ftp_submission.submission_zip
235265
File submission_xml = package_genbank_ftp_submission.submission_xml

0 commit comments

Comments (0)