Skip to content

Commit c1d835f

Browse files
authored
Merge pull request #278 from broadinstitute/dp-cdc-delivery
cdc delivery refine
2 parents 69f019c + 4b49a7d commit c1d835f

File tree

2 files changed

+57
-9
lines changed

2 files changed

+57
-9
lines changed

pipes/WDL/tasks/tasks_utils.wdl

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,43 @@ task fetch_row_from_tsv {
132132
}
133133
}
134134

135+
task fetch_col_from_tsv {
    meta {
        description: "Extract one named column from a TSV file that has a header row and write its values, one per line, to a text file. The input is read through gzip when its path ends in .gz."
    }
    input {
        File    tsv
        String  col
        Boolean drop_empty = true
        Boolean drop_header = true
        String  out_name = "~{basename(basename(tsv, '.txt'), '.tsv')}-~{col}.txt"
    }
    parameter_meta {
        tsv:         "Tab-delimited input table with a header row; may be gzipped."
        col:         "Header name of the column to extract. A column that is absent from the header yields only empty values."
        drop_empty:  "When true, empty values are skipped instead of being written as blank lines."
        drop_header: "When false, the column name is written as the first output line."
        out_name:    "Name of the output text file."
    }
    command <<<
        python3 << CODE
        import csv, gzip

        col = "~{col}"
        drop_empty = ~{true="True" false="False" drop_empty}
        drop_header = ~{true="True" false="False" drop_header}

        def open_or_gzopen(path, *args, **kwargs):
            # Pick the gzip reader purely from the file name suffix.
            opener = gzip.open if path.endswith('.gz') else open
            return opener(path, *args, **kwargs)

        with open_or_gzopen('~{tsv}', 'rt') as inf, open('~{out_name}', 'wt') as outf:
            if not drop_header:
                outf.write(col + '\n')
            for row in csv.DictReader(inf, delimiter='\t'):
                # DictReader pads short rows with None, so a plain default on get()
                # is not enough: normalise both "key absent" and "value None" to ''
                # to keep the write below from raising TypeError when drop_empty is off.
                x = row.get(col) or ''
                if x or not drop_empty:
                    outf.write(x + '\n')
        CODE
    >>>
    output {
        File out_txt = "~{out_name}"
    }
    runtime {
        docker: "python:slim"
        memory: "1 GB"
        cpu: 1
        disks: "local-disk 50 HDD"
        dx_instance_type: "mem1_ssd1_v2_x2"
    }
}
171+
135172
task tsv_join {
136173
meta {
137174
description: "Perform a full left outer join on multiple TSV tables. Each input tsv must have a header row, and each must contain the value of id_col in its header. Inputs may or may not be gzipped. Unix/Mac/Win line endings are tolerated on input, Unix line endings are emitted as output. Unicode text safe."

pipes/WDL/workflows/sarscov2_illumina_full.wdl

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,11 @@ workflow sarscov2_illumina_full {
6868
id_col = 'accession',
6969
out_basename = "biosample_attributes-merged"
7070
}
71+
call utils.fetch_col_from_tsv as accessioned_samples {
72+
input:
73+
tsv = biosample_merge.out_tsv,
74+
col = 'sample_name'
75+
}
7176
7277
### demux, deplete, SRA submission prep, fastqc/multiqc
7378
call demux_deplete.demux_deplete {
@@ -223,7 +228,7 @@ workflow sarscov2_illumina_full {
223228
output_name = "assembly_metadata-~{flowcell_id}.tsv"
224229
}
225230
226-
### filter out batches where NTCs assemble
231+
### mark up the bad batches or lanes where NTCs assemble
227232
call assembly.filter_bad_ntc_batches {
228233
input:
229234
seqid_list = write_lines(select_all(passing_assembly_ids)),
@@ -264,6 +269,7 @@ workflow sarscov2_illumina_full {
264269
265270
### filter and concatenate final sets for delivery ("passing" and "submittable")
266271
call sarscov2.sc2_meta_final {
272+
# this decorates assembly_meta_tsv with collab/internal IDs, genome_status, and many other columns
267273
input:
268274
assembly_stats_tsv = assembly_meta_tsv.combined,
269275
collab_ids_tsv = collab_ids_tsv,
@@ -272,24 +278,34 @@ workflow sarscov2_illumina_full {
272278
genome_status_json = filter_bad_ntc_batches.fail_meta_json
273279
}
274280
call utils.concatenate as passing_cat_prefilter {
281+
# this emits a fasta of only genomes that pass min_unambig
275282
input:
276283
infiles = select_all(passing_assemblies),
277284
output_name = "assemblies_passing-~{flowcell_id}.prefilter.fasta"
278285
}
279-
call nextstrain.filter_sequences_to_list as passing_cat {
286+
call nextstrain.filter_sequences_to_list as passing_ntc {
287+
# this drops all genomes that are failed_NTC
280288
input:
281289
sequences = passing_cat_prefilter.combined,
282-
keep_list = [filter_bad_ntc_batches.seqids_kept],
290+
keep_list = [filter_bad_ntc_batches.seqids_kept]
291+
}
292+
call nextstrain.filter_sequences_to_list as passing_cat {
293+
# this drops all genomes that don't have BioSample accessions (e.g. control libraries)
294+
input:
295+
sequences = passing_ntc.filtered_fasta,
296+
keep_list = [accessioned_samples.out_txt],
283297
out_fname = "assemblies_passing-~{flowcell_id}.fasta"
284298
}
285299
call nextstrain.filter_sequences_to_list as submittable_filter {
300+
# this drops all failed_annotation (aka VADR fails)
286301
input:
287302
sequences = passing_cat.filtered_fasta,
288303
keep_list = [write_lines(select_all(submittable_id))]
289304
}
290305
291306
### prep genbank submission
292307
call ncbi.biosample_to_genbank {
308+
# this takes a BioSample attributes file and emits a Genbank Source Modifier Table
293309
input:
294310
biosample_attributes = biosample_merge.out_tsv,
295311
num_segments = 1,
@@ -301,14 +317,9 @@ workflow sarscov2_illumina_full {
301317
assembly_stats_tsv = write_tsv(flatten([[['SeqID','Assembly Method','Coverage','Sequencing Technology']],select_all(assembly_cmt)])),
302318
filter_to_ids = biosample_to_genbank.sample_ids
303319
}
304-
call utils.concatenate as passing_genomes {
305-
input:
306-
infiles = select_all(submittable_genomes),
307-
output_name = "assemblies.fasta"
308-
}
309320
call nextstrain.filter_sequences_to_list as submit_genomes {
310321
input:
311-
sequences = passing_genomes.combined,
322+
sequences = submittable_filter.filtered_fasta,
312323
keep_list = [biosample_to_genbank.sample_ids]
313324
}
314325
call ncbi.package_genbank_ftp_submission {

0 commit comments

Comments
 (0)