|
| 1 | +version 1.0 |
| 2 | + |
| 3 | +import "../tasks/tasks_assembly.wdl" as assembly |
| 4 | +import "../tasks/tasks_metagenomics.wdl" as metagenomics |
| 5 | +import "../tasks/tasks_ncbi.wdl" as ncbi |
| 6 | +import "../tasks/tasks_read_utils.wdl" as read_utils |
| 7 | +import "../tasks/tasks_reports.wdl" as reports |
| 8 | +import "../tasks/tasks_utils.wdl" as utils |
| 9 | +import "assemble_refbased.wdl" as assemble_refbased |
| 10 | + |
| 11 | +workflow assemble_denovo_metagenomic { |
| 12 | + meta { |
| 13 | + description: "Performs viral de novo assembly on metagenomic reads against a large range of possible reference genomes. Runs raw reads through taxonomic classification (Kraken2), human read depletion (based on Kraken2), de novo assembly (SPAdes), and FASTQC/multiQC of reads. Scaffold de novo contigs against a set of possible references and subsequently polish with reads. This workflow accepts a very large set of input reference genomes. It will subset the reference genomes to those with ANI hits to the provided contigs/MAGs and cluster the reference hits by any ANI similarity to each other. It will choose the top reference from each cluster and produce one assembly for each cluster. This is intended to allow for the presence of multiple diverse viral taxa (coinfections) while forcing a choice of the best assembly from groups of related reference genomes." |
| 14 | + author: "Broad Viral Genomics" |
| 15 | + email: "viral-ngs@broadinstitute.org" |
| 16 | + allowNestedInputs: true |
| 17 | + } |
| 18 | + |
| 19 | + input { |
| 20 | + String sample_id |
| 21 | + String? sample_name |
| 22 | + String? biosample_accession |
| 23 | + |
| 24 | + Array[File]+ reads_bams |
| 25 | + |
| 26 | + File ncbi_taxdump_tgz |
| 27 | + |
| 28 | + File spikein_db |
| 29 | + File trim_clip_db |
| 30 | + |
| 31 | + File kraken2_db_tgz |
| 32 | + File krona_taxonomy_db_kraken2_tgz |
| 33 | + |
| 34 | + File taxid_to_ref_accessions_tsv |
| 35 | + |
| 36 | + Array[String] taxa_to_dehost = ["Vertebrata"] |
| 37 | + Array[String] taxa_to_avoid_assembly = ["Vertebrata", "other sequences", "Bacteria"] |
| 38 | + |
| 39 | + } |
| 40 | + |
| 41 | + Int min_scaffold_unambig = 300 # in base-pairs; any scaffolded assembly < this length will not be refined/polished |
| 42 | + String sample_original_name = select_first([sample_name, sample_id]) |
| 43 | + |
| 44 | + parameter_meta { |
| 45 | + reads_bams: { |
| 46 | + description: "Reads to classify. May be unmapped or mapped or both, paired-end or single-end. Multiple input files will be merged first.", |
| 47 | + patterns: ["*.bam"] |
| 48 | + } |
| 49 | + spikein_db: { |
| 50 | + description: "ERCC spike-in sequences", |
| 51 | + patterns: ["*.fasta", "*.fasta.gz", "*.fasta.zst"] |
| 52 | + } |
| 53 | + trim_clip_db: { |
| 54 | + description: "Adapter sequences to remove via trimmomatic prior to SPAdes assembly", |
| 55 | + patterns: ["*.fasta", "*.fasta.gz", "*.fasta.zst"] |
| 56 | + } |
| 57 | + kraken2_db_tgz: { |
| 58 | + description: "Pre-built Kraken database tarball containing three files: hash.k2d, opts.k2d, and taxo.k2d.", |
| 59 | + patterns: ["*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"] |
| 60 | + } |
| 61 | + krona_taxonomy_db_kraken2_tgz: { |
| 62 | + description: "Krona taxonomy database containing a single file: taxonomy.tab, or possibly just a compressed taxonomy.tab", |
| 63 | + patterns: ["*.tab.zst", "*.tab.gz", "*.tab", "*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"] |
| 64 | + } |
| 65 | + ncbi_taxdump_tgz: { |
| 66 | + description: "An NCBI taxdump.tar.gz file that contains, at the minimum, a nodes.dmp and names.dmp file.", |
| 67 | + patterns: ["*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"] |
| 68 | + } |
| 69 | + cleaned_fastqc: { |
| 70 | + description: "Output cleaned fastqc reports in HTML.", |
| 71 | + category: "other" |
| 72 | + } |
| 73 | + deduplicated_reads_unaligned: { |
| 74 | + description: "Deduplication on unaligned reads in BAM format using mvicuna or cdhit.", |
| 75 | + category: "other" |
| 76 | + } |
| 77 | + kraken2_krona_plot: { |
| 78 | + description:"Visualize the results of the Kraken2 analysis with Krona, which disaplys taxonmic hierarchiral data in multi-layerd pie.", |
| 79 | + category:"other" |
| 80 | + } |
| 81 | + kraken2_summary_report: { |
| 82 | + description: "Kraken report output file.", |
| 83 | + category: "other" |
| 84 | + } |
| 85 | + raw_fastqc:{ |
| 86 | + description: "Merged raw fastqc reads.", |
| 87 | + category: "other" |
| 88 | + } |
| 89 | + |
| 90 | + } |
| 91 | + |
| 92 | + call read_utils.merge_and_reheader_bams as merge_raw_reads { |
| 93 | + input: |
| 94 | + in_bams = reads_bams |
| 95 | + } |
| 96 | + File reads_bam = merge_raw_reads.out_bam |
| 97 | +
|
| 98 | + call reports.align_and_count as spikein { |
| 99 | + input: |
| 100 | + reads_bam = reads_bam, |
| 101 | + ref_db = spikein_db |
| 102 | + } |
| 103 | + call metagenomics.kraken2 as kraken2 { |
| 104 | + input: |
| 105 | + reads_bam = reads_bam, |
| 106 | + kraken2_db_tgz = kraken2_db_tgz, |
| 107 | + krona_taxonomy_db_tgz = krona_taxonomy_db_kraken2_tgz |
| 108 | + } |
| 109 | + call metagenomics.filter_bam_to_taxa as deplete { |
| 110 | + input: |
| 111 | + classified_bam = reads_bam, |
| 112 | + classified_reads_txt_gz = kraken2.kraken2_reads_report, |
| 113 | + ncbi_taxonomy_db_tgz = ncbi_taxdump_tgz, |
| 114 | + exclude_taxa = true, |
| 115 | + taxonomic_names = taxa_to_dehost, |
| 116 | + out_filename_suffix = "hs_depleted" |
| 117 | + } |
| 118 | + call reports.fastqc as fastqc_cleaned { |
| 119 | + input: reads_bam = deplete.bam_filtered_to_taxa |
| 120 | + } |
| 121 | + call metagenomics.filter_bam_to_taxa as filter_acellular { |
| 122 | + input: |
| 123 | + classified_bam = reads_bam, |
| 124 | + classified_reads_txt_gz = kraken2.kraken2_reads_report, |
| 125 | + ncbi_taxonomy_db_tgz = ncbi_taxdump_tgz, |
| 126 | + exclude_taxa = true, |
| 127 | + taxonomic_names = taxa_to_avoid_assembly, |
| 128 | + out_filename_suffix = "acellular" |
| 129 | + } |
| 130 | + call read_utils.rmdup_ubam { |
| 131 | + input: |
| 132 | + reads_unmapped_bam = filter_acellular.bam_filtered_to_taxa |
| 133 | + } |
| 134 | + call assembly.assemble as spades { |
| 135 | + input: |
| 136 | + reads_unmapped_bam = rmdup_ubam.dedup_bam, |
| 137 | + trim_clip_db = trim_clip_db, |
| 138 | + always_succeed = true |
| 139 | + } |
| 140 | + call metagenomics.report_primary_kraken_taxa { |
| 141 | + input: |
| 142 | + kraken_summary_report = kraken2.kraken2_summary_report |
| 143 | + } |
| 144 | +
|
| 145 | + # download (multi-segment) genomes for each reference, fasta filename = colon-concatenated accession list |
| 146 | + scatter(taxon in read_tsv(taxid_to_ref_accessions_tsv)) { |
| 147 | + # taxon = [taxid, isolate_prefix, taxname, semicolon_delim_accession_list] |
| 148 | + call utils.string_split { |
| 149 | + input: |
| 150 | + joined_string = taxon[3], |
| 151 | + delimiter = ":" |
| 152 | + } |
| 153 | + call ncbi.download_annotations { |
| 154 | + input: |
| 155 | + accessions = string_split.tokens, |
| 156 | + combined_out_prefix = sub(taxon[3], ":", "-") # singularity does not like colons in filenames |
| 157 | + } |
| 158 | + } |
| 159 | +
|
| 160 | + # subset reference genomes to those with ANI hits to contigs and cluster reference hits by any ANI similarity to each other |
| 161 | + call assembly.select_references { |
| 162 | + input: |
| 163 | + reference_genomes_fastas = download_annotations.combined_fasta, |
| 164 | + contigs_fasta = spades.contigs_fasta |
| 165 | + } |
| 166 | +
|
| 167 | + # assemble and produce stats for every reference cluster |
| 168 | + Array[String] assembly_header = ["entity:assembly_id", "assembly_name", "sample_id", "sample_name", "taxid", "tax_name", "tax_shortname", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "scaffolding_num_segments_recovered", "reference_num_segments_required", "reference_length", "reference_accessions", "skani_num_ref_clusters", "skani_this_cluster_num_refs", "skani_dist_tsv", "scaffolding_ani", "scaffolding_pct_ref_cov", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned", "assembly_method", "assembly_method_version", "biosample_accession", "sample"] |
| 169 | + scatter(ref_cluster_tar in select_references.matched_reference_clusters_fastas_tars) { |
| 170 | + |
| 171 | + call utils.tar_extract { |
| 172 | + input: |
| 173 | + tar_file = ref_cluster_tar |
| 174 | + } |
| 175 | +
|
| 176 | + # assemble (scaffold-and-refine) genome against this reference cluster |
| 177 | + call assembly.scaffold { |
| 178 | + input: |
| 179 | + reads_bam = deplete.bam_filtered_to_taxa, |
| 180 | + contigs_fasta = spades.contigs_fasta, |
| 181 | + reference_genome_fasta = tar_extract.files, |
| 182 | + min_length_fraction = 0, |
| 183 | + min_unambig = 0, |
| 184 | + allow_incomplete_output = true |
| 185 | + } |
| 186 | + if (scaffold.assembly_preimpute_length_unambiguous > min_scaffold_unambig) { |
| 187 | + # polish de novo assembly with reads |
| 188 | + call assemble_refbased.assemble_refbased as refine { |
| 189 | + input: |
| 190 | + reads_unmapped_bams = [deplete.bam_filtered_to_taxa], |
| 191 | + reference_fasta = scaffold.scaffold_fasta, |
| 192 | + sample_name = sample_id, |
| 193 | + sample_original_name = sample_original_name |
| 194 | + } |
| 195 | + } |
| 196 | +
|
| 197 | + # get taxid and taxname from taxid_to_ref_accessions_tsv |
| 198 | + call utils.fetch_row_from_tsv as tax_lookup { |
| 199 | + input: |
| 200 | + tsv = taxid_to_ref_accessions_tsv, |
| 201 | + idx_col = "accessions", |
| 202 | + idx_val = sub(scaffold.scaffolding_chosen_ref_basename, "-", ":"), |
| 203 | + add_header = ["taxid", "isolate_prefix", "taxname", "accessions"] |
| 204 | + } |
| 205 | + String taxid = tax_lookup.map["taxid"] |
| 206 | + String tax_name = tax_lookup.map["taxname"] |
| 207 | + String isolate_prefix = tax_lookup.map["isolate_prefix"] |
| 208 | +
|
| 209 | + # build output tsv row |
| 210 | + Int assembly_length_unambiguous = select_first([refine.assembly_length_unambiguous, 0]) |
| 211 | + Float percent_reference_covered = 1.0 * assembly_length_unambiguous / scaffold.reference_length |
| 212 | + File assembly_fasta = select_first([refine.assembly_fasta, scaffold.intermediate_gapfill_fasta]) |
| 213 | + Map[String, String] stats_by_taxon = { |
| 214 | + "entity:assembly_id" : sample_id + "-" + taxid, |
| 215 | + "assembly_name" : tax_name + ": " + sample_original_name, |
| 216 | + "sample_id" : sample_id, |
| 217 | + "sample_name" : sample_original_name, |
| 218 | + "taxid" : taxid, |
| 219 | + "tax_name" : tax_name, |
| 220 | + "tax_shortname" : isolate_prefix, |
| 221 | + |
| 222 | + "assembly_fasta" : assembly_fasta, |
| 223 | + "aligned_only_reads_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ""]), |
| 224 | + "coverage_plot" : select_first([refine.align_to_self_merged_coverage_plot, ""]), |
| 225 | + "assembly_length" : select_first([refine.assembly_length, "0"]), |
| 226 | + "assembly_length_unambiguous" : assembly_length_unambiguous, |
| 227 | + "reads_aligned" : select_first([refine.align_to_self_merged_reads_aligned, "0"]), |
| 228 | + "mean_coverage" : select_first([refine.align_to_self_merged_mean_coverage, "0"]), |
| 229 | + "percent_reference_covered" : percent_reference_covered, |
| 230 | + "scaffolding_num_segments_recovered" : scaffold.assembly_num_segments_recovered, |
| 231 | + "reference_num_segments_required" : scaffold.reference_num_segments_required, |
| 232 | + "reference_length" : scaffold.reference_length, |
| 233 | + "reference_accessions" : tax_lookup.map["accessions"], |
| 234 | + |
| 235 | + "skani_num_ref_clusters" : length(select_references.matched_reference_clusters_fastas_tars), |
| 236 | + "skani_this_cluster_num_refs" : length(tar_extract.files), |
| 237 | + "skani_dist_tsv" : scaffold.scaffolding_stats, |
| 238 | + "scaffolding_ani" : scaffold.skani_ani, |
| 239 | + "scaffolding_pct_ref_cov" : scaffold.skani_ref_aligned_frac, |
| 240 | + |
| 241 | + "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta, |
| 242 | + "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous, |
| 243 | + |
| 244 | + "replicate_concordant_sites" : select_first([refine.replicate_concordant_sites, "0"]), |
| 245 | + "replicate_discordant_snps" : select_first([refine.replicate_discordant_snps, "0"]), |
| 246 | + "replicate_discordant_indels" : select_first([refine.replicate_discordant_indels, "0"]), |
| 247 | + "replicate_discordant_vcf" : select_first([refine.replicate_discordant_vcf, ""]), |
| 248 | + |
| 249 | + "isnvsFile" : select_first([refine.align_to_self_isnvs_vcf, ""]), |
| 250 | + "aligned_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ""]), |
| 251 | + "coverage_tsv" : select_first([refine.align_to_self_merged_coverage_tsv, ""]), |
| 252 | + "read_pairs_aligned" : select_first([refine.align_to_self_merged_read_pairs_aligned, "0"]), |
| 253 | + "bases_aligned" : select_first([refine.align_to_self_merged_bases_aligned, "0"]), |
| 254 | + |
| 255 | + "assembly_method" : "viral-ngs/assemble_denovo", |
| 256 | + "assembly_method_version" : scaffold.viralngs_version, |
| 257 | + |
| 258 | + "biosample_accession" : select_first([biosample_accession, ""]), |
| 259 | + |
| 260 | + "sample": '{"entityType":"sample","entityName":"' + sample_id + '"}' |
| 261 | + } |
| 262 | + |
| 263 | + if(assembly_length_unambiguous > min_scaffold_unambig) { |
| 264 | + scatter(h in assembly_header) { |
| 265 | + String stat_by_taxon = stats_by_taxon[h] |
| 266 | + } |
| 267 | + } |
| 268 | + } |
| 269 | + |
| 270 | + ### summary stats |
| 271 | + if (length(select_all(stat_by_taxon)) > 0) { |
| 272 | + call utils.concatenate as assembly_stats_non_empty { |
| 273 | + input: |
| 274 | + infiles = [write_tsv([assembly_header]), write_tsv(select_all(stat_by_taxon))], |
| 275 | + output_name = "assembly_metadata-~{sample_id}.tsv" |
| 276 | + } |
| 277 | + } |
| 278 | + if (length(select_all(stat_by_taxon)) == 0) { |
| 279 | + call utils.concatenate as assembly_stats_empty { |
| 280 | + input: |
| 281 | + infiles = [write_tsv([assembly_header])], |
| 282 | + output_name = "assembly_metadata-~{sample_id}.tsv" |
| 283 | + } |
| 284 | + } |
| 285 | +
|
| 286 | + output { |
| 287 | + File cleaned_reads_unaligned_bam = deplete.bam_filtered_to_taxa |
| 288 | + File deduplicated_reads_unaligned = rmdup_ubam.dedup_bam |
| 289 | + File contigs_fasta = spades.contigs_fasta |
| 290 | + |
| 291 | + Int read_counts_raw = deplete.classified_taxonomic_filter_read_count_pre |
| 292 | + Int read_counts_depleted = deplete.classified_taxonomic_filter_read_count_post |
| 293 | + Int read_counts_dedup = rmdup_ubam.dedup_read_count_post |
| 294 | + Int read_counts_prespades_subsample = spades.subsample_read_count |
| 295 | + |
| 296 | + File kraken2_summary_report = kraken2.kraken2_summary_report |
| 297 | + File kraken2_krona_plot = kraken2.krona_report_html |
| 298 | + File kraken2_top_taxa_report = report_primary_kraken_taxa.ranked_focal_report |
| 299 | + String kraken2_focal_taxon_name = report_primary_kraken_taxa.focal_tax_name |
| 300 | + Int kraken2_focal_total_reads = report_primary_kraken_taxa.total_focal_reads |
| 301 | + String kraken2_top_taxon_id = report_primary_kraken_taxa.tax_id |
| 302 | + String kraken2_top_taxon_name = report_primary_kraken_taxa.tax_name |
| 303 | + String kraken2_top_taxon_rank = report_primary_kraken_taxa.tax_rank |
| 304 | + Int kraken2_top_taxon_num_reads = report_primary_kraken_taxa.num_reads |
| 305 | + Float kraken2_top_taxon_pct_of_focal = report_primary_kraken_taxa.percent_of_focal |
| 306 | + |
| 307 | + File raw_fastqc = merge_raw_reads.fastqc |
| 308 | + File cleaned_fastqc = fastqc_cleaned.fastqc_html |
| 309 | + File spikein_report = spikein.report |
| 310 | + String spikein_tophit = spikein.top_hit_id |
| 311 | + String spikein_pct_of_total_reads = spikein.pct_total_reads_mapped |
| 312 | + String spikein_pct_lesser_hits = spikein.pct_lesser_hits_of_mapped |
| 313 | + |
| 314 | + String viral_classify_version = kraken2.viralngs_version |
| 315 | + String viral_assemble_version = spades.viralngs_version |
| 316 | + |
| 317 | + Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon |
| 318 | + File assembly_stats_by_taxon_tsv = select_first([assembly_stats_non_empty.combined, assembly_stats_empty.combined]) |
| 319 | + String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa" |
| 320 | + |
| 321 | + Int skani_num_ref_clusters = length(select_references.matched_reference_clusters_fastas_tars) |
| 322 | + File skani_contigs_to_refs_dist_tsv = select_references.skani_dist_full_tsv |
| 323 | + |
| 324 | + Array[String] assembly_all_taxids = taxid |
| 325 | + Array[String] assembly_all_taxnames = tax_name |
| 326 | + Array[Int] assembly_all_lengths_unambig = assembly_length_unambiguous |
| 327 | + Array[Float] assembly_all_pct_ref_cov = percent_reference_covered |
| 328 | + Array[File] assembly_all_fastas = assembly_fasta |
| 329 | + } |
| 330 | +} |
0 commit comments