Merge pull request #278 from broadinstitute/dp-cdc-delivery

dpark01 · web-flow · commit c1d835ff31ff · 2021-04-30T18:50:29.000-04:00
cdc delivery refine
diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl
@@ -132,6 +132,43 @@ task fetch_row_from_tsv {
   }
 }
 
+task fetch_col_from_tsv {
+  meta { description: "Extract a single named column from a TSV that has a header row and write it one value per line. Input may be gzipped (detected by a .gz suffix)." }
+  input {
+    File          tsv                  # tab-delimited table; first row is read as the header
+    String        col                  # header name of the column to extract
+    Boolean       drop_empty = true    # when true, rows with an empty value in col are skipped
+    Boolean       drop_header = true   # when false, the column name is written as the first output line
+    String        out_name = "~{basename(basename(tsv, '.txt'), '.tsv')}-~{col}.txt"
+  }
+  command <<<
+    python3 << CODE
+    import csv, gzip
+    col = "~{col}"
+    drop_empty = ~{true="True" false="False" drop_empty}
+    drop_header = ~{true="True" false="False" drop_header}
+    open_or_gzopen = lambda *args, **kwargs: gzip.open(*args, **kwargs) if args[0].endswith('.gz') else open(*args, **kwargs)
+    with open_or_gzopen('~{tsv}', 'rt') as inf, open('~{out_name}', 'wt') as outf:
+      if not drop_header:
+        outf.write(col+'\n')
+      for row in csv.DictReader(inf, delimiter='\t'):
+        x = row.get(col) or ''  # DictReader fills short rows with None; treat that like an empty cell
+        if x or not drop_empty:
+          outf.write(x+'\n')
+    CODE
+  >>>
+  output {
+    File  out_txt  = "~{out_name}"
+  }
+  runtime {
+    docker: "python:slim"
+    memory: "1 GB"
+    cpu: 1
+    disks: "local-disk 50 HDD"
+    dx_instance_type: "mem1_ssd1_v2_x2"
+  }
+}
+
 task tsv_join {
   meta {
       description: "Perform a full left outer join on multiple TSV tables. Each input tsv must have a header row, and each must must contain the value of id_col in its header. Inputs may or may not be gzipped. Unix/Mac/Win line endings are tolerated on input, Unix line endings are emitted as output. Unicode text safe."
diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl
@@ -68,6 +68,11 @@ workflow sarscov2_illumina_full {
             id_col       = 'accession',
             out_basename = "biosample_attributes-merged"
     }
+    call utils.fetch_col_from_tsv as accessioned_samples {
+      input:
+        tsv = biosample_merge.out_tsv,
+        col = 'sample_name'
+    }
 
     ### demux, deplete, SRA submission prep, fastqc/multiqc
     call demux_deplete.demux_deplete {
@@ -223,7 +228,7 @@ workflow sarscov2_illumina_full {
         output_name = "assembly_metadata-~{flowcell_id}.tsv"
     }
 
-    ### filter out batches where NTCs assemble
+    ### mark up the bad batches or lanes where NTCs assemble
     call assembly.filter_bad_ntc_batches {
       input:
         seqid_list = write_lines(select_all(passing_assembly_ids)),
@@ -264,6 +269,7 @@ workflow sarscov2_illumina_full {
 
     ### filter and concatenate final sets for delivery ("passing" and "submittable")
     call sarscov2.sc2_meta_final {
+      # this decorates assembly_meta_tsv with collab/internal IDs, genome_status, and many other columns
       input:
         assembly_stats_tsv = assembly_meta_tsv.combined,
         collab_ids_tsv = collab_ids_tsv,
@@ -272,24 +278,34 @@ workflow sarscov2_illumina_full {
         genome_status_json = filter_bad_ntc_batches.fail_meta_json
     }
     call utils.concatenate as passing_cat_prefilter {
+      # this emits a fasta of only genomes that pass min_unambig
       input:
         infiles     = select_all(passing_assemblies),
         output_name = "assemblies_passing-~{flowcell_id}.prefilter.fasta"
     }
-    call nextstrain.filter_sequences_to_list as passing_cat {
+    call nextstrain.filter_sequences_to_list as passing_ntc {
+      # this drops all genomes that are failed_NTC
       input:
         sequences = passing_cat_prefilter.combined,
-        keep_list = [filter_bad_ntc_batches.seqids_kept],
+        keep_list = [filter_bad_ntc_batches.seqids_kept]
+    }
+    call nextstrain.filter_sequences_to_list as passing_cat {
+      # this drops all genomes that don't have BioSample accessions (e.g. control libraries)
+      input:
+        sequences = passing_ntc.filtered_fasta,
+        keep_list = [accessioned_samples.out_txt],
         out_fname = "assemblies_passing-~{flowcell_id}.fasta"
     }
     call nextstrain.filter_sequences_to_list as submittable_filter {
+      # this drops all failed_annotation (aka VADR fails)
       input:
         sequences = passing_cat.filtered_fasta,
         keep_list = [write_lines(select_all(submittable_id))]
     }
 
     ### prep genbank submission
     call ncbi.biosample_to_genbank {
+      # this takes a BioSample attributes file and emits a Genbank Source Modifier Table
       input:
         biosample_attributes = biosample_merge.out_tsv,
         num_segments         = 1,
@@ -301,14 +317,9 @@ workflow sarscov2_illumina_full {
         assembly_stats_tsv = write_tsv(flatten([[['SeqID','Assembly Method','Coverage','Sequencing Technology']],select_all(assembly_cmt)])),
         filter_to_ids      = biosample_to_genbank.sample_ids
     }
-    call utils.concatenate as passing_genomes {
-      input:
-        infiles     = select_all(submittable_genomes),
-        output_name = "assemblies.fasta"
-    }
     call nextstrain.filter_sequences_to_list as submit_genomes {
       input:
-        sequences = passing_genomes.combined,
+        sequences = submittable_filter.filtered_fasta,
         keep_list = [biosample_to_genbank.sample_ids]
     }
     call ncbi.package_genbank_ftp_submission {