@@ -3,6 +3,7 @@ version 1.0
33import "../tasks/tasks_read_utils.wdl" as read_utils
44import "../tasks/tasks_ncbi.wdl" as ncbi
55import "../tasks/tasks_nextstrain.wdl" as nextstrain
6+ import "../tasks/tasks_reports.wdl" as reports
67
78import "demux_deplete.wdl"
89import "assemble_refbased.wdl"
@@ -38,22 +39,33 @@ workflow sarscov2_illumina_full {
3839 }
3940
4041 input {
42+ File flowcell_tgz
4143 File reference_fasta
4244 String amplicon_bed_prefix
4345
44- File biosample_attributes
46+ Array [ File ] biosample_attributes
4547 String instrument_model
4648 String sra_title
4749
48- Int min_genome_bases = 20000
50+ Int min_genome_bases = 15000
4951 }
5052 Int taxid = 2697049
5153 String gisaid_prefix = 'hCoV-19/'
54+ String flowcell_id = basename (basename (basename (basename (flowcell_tgz , ".gz" ), ".zst" ), ".tar" ), ".tgz" )
55+
56+ # merge biosample attributes tables
57+ call reports .tsv_join as biosample_merge {
58+ input :
59+ input_tsvs = biosample_attributes ,
60+ id_col = 'accession' ,
61+ out_basename = "biosample_attributes-merged"
62+ }
5263
5364 ### demux, deplete, SRA submission prep, fastqc/multiqc
5465 call demux_deplete .demux_deplete {
5566 input :
56- biosample_map = biosample_attributes ,
67+ flowcell_tgz = flowcell_tgz ,
68+ biosample_map = biosample_merge .out_tsv ,
5769 instrument_model = instrument_model ,
5870 sra_title = sra_title
5971 }
@@ -99,7 +111,7 @@ workflow sarscov2_illumina_full {
99111
100112 File passing_assemblies = rename_fasta_header .renamed_fasta
101113 String passing_assembly_ids = orig_name
102- Array [String ] assembly_cmt = [orig_name , "Broad viral-ngs v. " + demux_deplete .demux_viral_core_version , assemble_refbased .assembly_mean_coverage ]
114+ Array [String ] assembly_cmt = [orig_name , "Broad viral-ngs v. " + demux_deplete .demux_viral_core_version , assemble_refbased .assembly_mean_coverage , instrument_model ]
103115
104116 # lineage assignment
105117 call sarscov2_lineages .sarscov2_lineages {
@@ -124,81 +136,100 @@ workflow sarscov2_illumina_full {
124136 String failed_assembly_id = orig_name
125137 }
126138
127- Map [String ,String ?] assembly_stats = {
128- 'sample_orig' : orig_name ,
129- 'sample' : name_reads .left ,
130- 'amplicon_set' : demux_deplete .meta_by_sample [name_reads .left ]["amplicon_set" ],
131- 'assembly_mean_coverage' : assemble_refbased .assembly_mean_coverage ,
132- 'nextclade_clade' : sarscov2_lineages .nextclade_clade ,
133- 'nextclade_aa_subs' : sarscov2_lineages .nextclade_aa_subs ,
134- 'nextclade_aa_dels' : sarscov2_lineages .nextclade_aa_dels ,
135- 'pango_lineage' : sarscov2_lineages .pango_lineage
136- }
137- Map [String ,File ?] assembly_files = {
138- 'assembly_fasta' : assemble_refbased .assembly_fasta ,
139- 'coverage_plot' : assemble_refbased .align_to_ref_merged_coverage_plot ,
140- 'aligned_bam' : assemble_refbased .align_to_ref_merged_aligned_trimmed_only_bam ,
141- 'replicate_discordant_vcf' : assemble_refbased .replicate_discordant_vcf ,
142- 'nextclade_tsv' : sarscov2_lineages .nextclade_tsv ,
143- 'pangolin_csv' : sarscov2_lineages .pangolin_csv ,
144- 'vadr_tgz' : vadr .outputs_tgz
145- }
146- Map [String ,Int ?] assembly_metrics = {
147- 'assembly_length_unambiguous' : assemble_refbased .assembly_length_unambiguous ,
148- 'dist_to_ref_snps' : assemble_refbased .dist_to_ref_snps ,
149- 'dist_to_ref_indels' : assemble_refbased .dist_to_ref_indels ,
150- 'replicate_concordant_sites' : assemble_refbased .replicate_concordant_sites ,
151- 'replicate_discordant_snps' : assemble_refbased .replicate_discordant_snps ,
152- 'replicate_discordant_indels' : assemble_refbased .replicate_discordant_indels ,
153- 'num_read_groups' : assemble_refbased .num_read_groups ,
154- 'num_libraries' : assemble_refbased .num_libraries ,
155- 'vadr_num_alerts' : vadr .num_alerts
156- }
157-
139+ Array [String ] assembly_tsv_row = [
140+ orig_name ,
141+ name_reads .left ,
142+ demux_deplete .meta_by_sample [name_reads .left ]["amplicon_set" ],
143+ assemble_refbased .assembly_mean_coverage ,
144+ assemble_refbased .assembly_length_unambiguous ,
145+ select_first ([sarscov2_lineages .nextclade_clade , "" ]),
146+ select_first ([sarscov2_lineages .nextclade_aa_subs , "" ]),
147+ select_first ([sarscov2_lineages .nextclade_aa_dels , "" ]),
148+ select_first ([sarscov2_lineages .pango_lineage , "" ]),
149+ assemble_refbased .dist_to_ref_snps ,
150+ assemble_refbased .dist_to_ref_indels ,
151+ select_first ([vadr .num_alerts , "" ]),
152+ assemble_refbased .assembly_fasta ,
153+ assemble_refbased .align_to_ref_merged_coverage_plot ,
154+ assemble_refbased .align_to_ref_merged_aligned_trimmed_only_bam ,
155+ assemble_refbased .replicate_discordant_vcf ,
156+ select_first ([sarscov2_lineages .nextclade_tsv , "" ]),
157+ select_first ([sarscov2_lineages .pangolin_csv , "" ]),
158+ select_first ([vadr .outputs_tgz , "" ]),
159+ assemble_refbased .replicate_concordant_sites ,
160+ assemble_refbased .replicate_discordant_snps ,
161+ assemble_refbased .replicate_discordant_indels ,
162+ assemble_refbased .num_read_groups ,
163+ assemble_refbased .num_libraries ,
164+ ]
165+ }
166+ Array [String ] assembly_tsv_header = [
167+ 'sample' , 'sample_sanitized' , 'amplicon_set' , 'assembly_mean_coverage' , 'assembly_length_unambiguous' ,
168+ 'nextclade_clade' , 'nextclade_aa_subs' , 'nextclade_aa_dels' , 'pango_lineage' ,
169+ 'dist_to_ref_snps' , 'dist_to_ref_indels' , 'vadr_num_alerts' ,
170+ 'assembly_fasta' , 'coverage_plot' , 'aligned_bam' , 'replicate_discordant_vcf' ,
171+ 'nextclade_tsv' , 'pangolin_csv' , 'vadr_tgz' ,
172+ 'replicate_concordant_sites' , 'replicate_discordant_snps' , 'replicate_discordant_indels' , 'num_read_groups' , 'num_libraries' ,
173+ ]
174+
175+ call nextstrain .concatenate as assembly_meta_tsv {
176+ input :
177+ infiles = [write_tsv ([assembly_tsv_header ]), write_tsv (assembly_tsv_row )],
178+ output_name = "assembly_metadata-~{flowcell_id }.tsv"
158179 }
159180
160- # TO DO: filter out genomes from submission that are less than ntc_bases.max
161- call read_utils .max as ntc {
181+
182+ # TODO: filter out of the submission any genomes whose base count is below ntc_max.out
183+ call read_utils .max as ntc_max {
162184 input :
163185 list = select_all (ntc_bases )
164186 }
165187
166188 ### prep genbank submission
167- call nextstrain .concatenate as submit_genomes {
168- input :
169- infiles = select_all (submittable_genomes ),
170- output_name = "assemblies.fasta"
171- }
172189 call ncbi .biosample_to_genbank {
173190 input :
174- biosample_attributes = biosample_attributes ,
191+ biosample_attributes = biosample_merge . out_tsv ,
175192 num_segments = 1 ,
176193 taxid = taxid ,
177194 filter_to_ids = write_lines (select_all (submittable_id ))
178195 }
179196 call ncbi .structured_comments {
180197 input :
181- assembly_stats_tsv = write_tsv (flatten ([[['SeqID' ,'Assembly Method' ,'Coverage' ]],select_all (assembly_cmt )])),
182- filter_to_ids = write_lines (select_all (submittable_id ))
198+ assembly_stats_tsv = write_tsv (flatten ([[['SeqID' ,'Assembly Method' ,'Coverage' ,'Sequencing Technology' ]],select_all (assembly_cmt )])),
199+ filter_to_ids = biosample_to_genbank .sample_ids
200+ }
201+ call nextstrain .concatenate as passing_genomes {
202+ input :
203+ infiles = select_all (submittable_genomes ),
204+ output_name = "assemblies.fasta"
205+ }
206+ call nextstrain .filter_sequences_to_list as submit_genomes {
207+ input :
208+ sequences = passing_genomes .combined ,
209+ keep_list = [biosample_to_genbank .sample_ids ]
183210 }
184211 call ncbi .package_genbank_ftp_submission {
185212 input :
186- sequences_fasta = submit_genomes .combined ,
213+ sequences_fasta = submit_genomes .filtered_fasta ,
187214 source_modifier_table = biosample_to_genbank .genbank_source_modifier_table ,
188- structured_comment_table = structured_comments .structured_comment_table
215+ structured_comment_table = structured_comments .structured_comment_table ,
216+ submission_name = flowcell_id ,
217+ submission_uid = flowcell_id
189218 }
190219
191220 ### prep gisaid submission
192221 call ncbi .prefix_fasta_header as prefix_gisaid {
193222 input :
194- genome_fasta = submit_genomes .combined ,
195- prefix = gisaid_prefix
223+ genome_fasta = submit_genomes .filtered_fasta ,
224+ prefix = gisaid_prefix ,
225+ out_basename = "gisaid-sequences-~{flowcell_id }"
196226 }
197227 call ncbi .gisaid_meta_prep {
198228 input :
199229 source_modifier_table = biosample_to_genbank .genbank_source_modifier_table ,
200230 structured_comments = structured_comments .structured_comment_table ,
201- out_name = "gisaid_meta.tsv"
231+ fasta_filename = "gisaid-sequences-~{flowcell_id }.fasta" ,
232+ out_name = "gisaid-meta-~{flowcell_id }.tsv"
202233 }
203234
204235 output {
@@ -212,12 +243,13 @@ workflow sarscov2_illumina_full {
212243 Array [Int ] read_counts_depleted = demux_deplete .read_counts_depleted
213244
214245 File sra_metadata = select_first ([demux_deplete .sra_metadata ])
246+ File cleaned_bam_uris = select_first ([demux_deplete .cleaned_bam_uris ])
215247
216248 Array [File ] assemblies_fasta = assemble_refbased .assembly_fasta
217249 Array [File ] passing_assemblies_fasta = select_all (passing_assemblies )
218250 Array [File ] submittable_assemblies_fasta = select_all (submittable_genomes )
219251
220- Int max_ntc_bases = ntc . max
252+ Int max_ntc_bases = ntc_max . out
221253
222254 Array [File ] demux_metrics = demux_deplete .demux_metrics
223255 Array [File ] demux_commonBarcodes = demux_deplete .demux_commonBarcodes
@@ -227,9 +259,7 @@ workflow sarscov2_illumina_full {
227259 File multiqc_report_cleaned = demux_deplete .multiqc_report_cleaned
228260 File spikein_counts = demux_deplete .spikein_counts
229261
230- Array [Map [String ,String ?]] per_assembly_stats = assembly_stats
231- Array [Map [String ,File ?]] per_assembly_files = assembly_files
232- Array [Map [String ,Int ?]] per_assembly_metrics = assembly_metrics
262+ File assembly_stats_tsv = assembly_meta_tsv .combined
233263
234264 File submission_zip = package_genbank_ftp_submission .submission_zip
235265 File submission_xml = package_genbank_ftp_submission .submission_xml
0 commit comments