@@ -68,6 +68,11 @@ workflow sarscov2_illumina_full {
6868 id_col = 'accession' ,
6969 out_basename = "biosample_attributes-merged"
7070 }
71+ call utils .fetch_col_from_tsv as accessioned_samples {
72+ input :
73+ tsv = biosample_merge .out_tsv ,
74+ col = 'sample_name'
75+ }
7176
7277 ### demux, deplete, SRA submission prep, fastqc/multiqc
7378 call demux_deplete .demux_deplete {
@@ -223,7 +228,7 @@ workflow sarscov2_illumina_full {
223228 output_name = "assembly_metadata-~{flowcell_id }.tsv"
224229 }
225230
226- ### filter out batches where NTCs assemble
231+ ### mark up the bad batches or lanes where NTCs assemble
227232 call assembly .filter_bad_ntc_batches {
228233 input :
229234 seqid_list = write_lines (select_all (passing_assembly_ids )),
@@ -264,6 +269,7 @@ workflow sarscov2_illumina_full {
264269
265270 ### filter and concatenate final sets for delivery ("passing" and "submittable")
266271 call sarscov2 .sc2_meta_final {
272+ # this decorates assembly_meta_tsv with collab/internal IDs, genome_status, and many other columns
267273 input :
268274 assembly_stats_tsv = assembly_meta_tsv .combined ,
269275 collab_ids_tsv = collab_ids_tsv ,
@@ -272,24 +278,34 @@ workflow sarscov2_illumina_full {
272278 genome_status_json = filter_bad_ntc_batches .fail_meta_json
273279 }
274280 call utils .concatenate as passing_cat_prefilter {
281+ # this emits a fasta of only genomes that pass min_unambig
275282 input :
276283 infiles = select_all (passing_assemblies ),
277284 output_name = "assemblies_passing-~{flowcell_id }.prefilter.fasta"
278285 }
279- call nextstrain .filter_sequences_to_list as passing_cat {
286+ call nextstrain .filter_sequences_to_list as passing_ntc {
287+ # this drops all genomes that are failed_NTC
280288 input :
281289 sequences = passing_cat_prefilter .combined ,
282- keep_list = [filter_bad_ntc_batches .seqids_kept ],
290+ keep_list = [filter_bad_ntc_batches .seqids_kept ]
291+ }
292+ call nextstrain .filter_sequences_to_list as passing_cat {
293+ # this drops all genomes that don't have BioSample accessions (e.g. control libraries)
294+ input :
295+ sequences = passing_ntc .filtered_fasta ,
296+ keep_list = [accessioned_samples .out_txt ],
283297 out_fname = "assemblies_passing-~{flowcell_id }.fasta"
284298 }
285299 call nextstrain .filter_sequences_to_list as submittable_filter {
300+ # this drops all failed_annotation (aka VADR fails)
286301 input :
287302 sequences = passing_cat .filtered_fasta ,
288303 keep_list = [write_lines (select_all (submittable_id ))]
289304 }
290305
291306 ### prep genbank submission
292307 call ncbi .biosample_to_genbank {
308+ # this takes a BioSample attributes file and emits a Genbank Source Modifier Table
293309 input :
294310 biosample_attributes = biosample_merge .out_tsv ,
295311 num_segments = 1 ,
@@ -301,14 +317,9 @@ workflow sarscov2_illumina_full {
301317 assembly_stats_tsv = write_tsv (flatten ([[['SeqID' ,'Assembly Method' ,'Coverage' ,'Sequencing Technology' ]],select_all (assembly_cmt )])),
302318 filter_to_ids = biosample_to_genbank .sample_ids
303319 }
304- call utils .concatenate as passing_genomes {
305- input :
306- infiles = select_all (submittable_genomes ),
307- output_name = "assemblies.fasta"
308- }
309320 call nextstrain .filter_sequences_to_list as submit_genomes {
310321 input :
311- sequences = passing_genomes . combined ,
322+ sequences = submittable_filter . filtered_fasta ,
312323 keep_list = [biosample_to_genbank .sample_ids ]
313324 }
314325 call ncbi .package_genbank_ftp_submission {
0 commit comments