Skip to content

Commit 900a7e8

Browse files
committed
add assemble_denovo_metagenomic
1 parent da896b6 commit 900a7e8

File tree

2 files changed

+333
-0
lines changed

2 files changed

+333
-0
lines changed

.dockstore.yml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,9 @@ workflows:
1616
primaryDescriptorPath: /pipes/WDL/workflows/assemble_denovo.wdl
1717
testParameterFiles:
1818
- /test/input/WDL/test_inputs-assemble_denovo-local.json
19+
- name: assemble_denovo_metagenomic
20+
subclass: WDL
21+
primaryDescriptorPath: /pipes/WDL/workflows/assemble_denovo_metagenomic.wdl
1922
- name: assemble_refbased
2023
subclass: WDL
2124
primaryDescriptorPath: /pipes/WDL/workflows/assemble_refbased.wdl
Lines changed: 330 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,330 @@
version 1.0

import "../tasks/tasks_assembly.wdl" as assembly
import "../tasks/tasks_metagenomics.wdl" as metagenomics
import "../tasks/tasks_ncbi.wdl" as ncbi
import "../tasks/tasks_read_utils.wdl" as read_utils
import "../tasks/tasks_reports.wdl" as reports
import "../tasks/tasks_utils.wdl" as utils
import "assemble_refbased.wdl" as assemble_refbased

workflow assemble_denovo_metagenomic {
    meta {
        description: "Performs viral de novo assembly on metagenomic reads against a large range of possible reference genomes. Runs raw reads through taxonomic classification (Kraken2), human read depletion (based on Kraken2), de novo assembly (SPAdes), and FASTQC/multiQC of reads. Scaffold de novo contigs against a set of possible references and subsequently polish with reads. This workflow accepts a very large set of input reference genomes. It will subset the reference genomes to those with ANI hits to the provided contigs/MAGs and cluster the reference hits by any ANI similarity to each other. It will choose the top reference from each cluster and produce one assembly for each cluster. This is intended to allow for the presence of multiple diverse viral taxa (coinfections) while forcing a choice of the best assembly from groups of related reference genomes."
        author: "Broad Viral Genomics"
        email:  "viral-ngs@broadinstitute.org"
        allowNestedInputs: true
    }

    input {
        String  sample_id
        String? sample_name
        String? biosample_accession

        Array[File]+ reads_bams

        File ncbi_taxdump_tgz

        File spikein_db
        File trim_clip_db

        File kraken2_db_tgz
        File krona_taxonomy_db_kraken2_tgz

        File taxid_to_ref_accessions_tsv

        Array[String] taxa_to_dehost         = ["Vertebrata"]
        Array[String] taxa_to_avoid_assembly = ["Vertebrata", "other sequences", "Bacteria"]
    }

    Int min_scaffold_unambig = 300 # in base-pairs; any scaffolded assembly < this length will not be refined/polished
    String sample_original_name = select_first([sample_name, sample_id])

    parameter_meta {
        reads_bams: {
            description: "Reads to classify. May be unmapped or mapped or both, paired-end or single-end. Multiple input files will be merged first.",
            patterns: ["*.bam"]
        }
        spikein_db: {
            description: "ERCC spike-in sequences",
            patterns: ["*.fasta", "*.fasta.gz", "*.fasta.zst"]
        }
        trim_clip_db: {
            description: "Adapter sequences to remove via trimmomatic prior to SPAdes assembly",
            patterns: ["*.fasta", "*.fasta.gz", "*.fasta.zst"]
        }
        kraken2_db_tgz: {
            description: "Pre-built Kraken database tarball containing three files: hash.k2d, opts.k2d, and taxo.k2d.",
            patterns: ["*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"]
        }
        krona_taxonomy_db_kraken2_tgz: {
            description: "Krona taxonomy database containing a single file: taxonomy.tab, or possibly just a compressed taxonomy.tab",
            patterns: ["*.tab.zst", "*.tab.gz", "*.tab", "*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"]
        }
        ncbi_taxdump_tgz: {
            description: "An NCBI taxdump.tar.gz file that contains, at the minimum, a nodes.dmp and names.dmp file.",
            patterns: ["*.tar.gz", "*.tar.lz4", "*.tar.bz2", "*.tar.zst"]
        }
        cleaned_fastqc: {
            description: "Output cleaned fastqc reports in HTML.",
            category: "other"
        }
        deduplicated_reads_unaligned: {
            description: "Deduplication on unaligned reads in BAM format using mvicuna or cdhit.",
            category: "other"
        }
        kraken2_krona_plot: {
            description: "Visualize the results of the Kraken2 analysis with Krona, which displays taxonomic hierarchical data in a multi-layered pie chart.",
            category: "other"
        }
        kraken2_summary_report: {
            description: "Kraken report output file.",
            category: "other"
        }
        raw_fastqc: {
            description: "Merged raw fastqc reads.",
            category: "other"
        }
    }

    # Merge all input read files into a single BAM before classification/assembly.
    call read_utils.merge_and_reheader_bams as merge_raw_reads {
        input:
            in_bams = reads_bams
    }
    File reads_bam = merge_raw_reads.out_bam

    call reports.align_and_count as spikein {
        input:
            reads_bam = reads_bam,
            ref_db    = spikein_db
    }
    call metagenomics.kraken2 as kraken2 {
        input:
            reads_bam             = reads_bam,
            kraken2_db_tgz        = kraken2_db_tgz,
            krona_taxonomy_db_tgz = krona_taxonomy_db_kraken2_tgz
    }
    # Deplete host reads (taxa_to_dehost, default Vertebrata) based on the Kraken2 read-level classifications.
    call metagenomics.filter_bam_to_taxa as deplete {
        input:
            classified_bam          = reads_bam,
            classified_reads_txt_gz = kraken2.kraken2_reads_report,
            ncbi_taxonomy_db_tgz    = ncbi_taxdump_tgz,
            exclude_taxa            = true,
            taxonomic_names         = taxa_to_dehost,
            out_filename_suffix     = "hs_depleted"
    }
    call reports.fastqc as fastqc_cleaned {
        input: reads_bam = deplete.bam_filtered_to_taxa
    }
    # Separately exclude taxa_to_avoid_assembly (host, bacteria, etc.) to produce the read set fed to SPAdes.
    call metagenomics.filter_bam_to_taxa as filter_acellular {
        input:
            classified_bam          = reads_bam,
            classified_reads_txt_gz = kraken2.kraken2_reads_report,
            ncbi_taxonomy_db_tgz    = ncbi_taxdump_tgz,
            exclude_taxa            = true,
            taxonomic_names         = taxa_to_avoid_assembly,
            out_filename_suffix     = "acellular"
    }
    call read_utils.rmdup_ubam {
        input:
            reads_unmapped_bam = filter_acellular.bam_filtered_to_taxa
    }
    call assembly.assemble as spades {
        input:
            reads_unmapped_bam = rmdup_ubam.dedup_bam,
            trim_clip_db       = trim_clip_db,
            always_succeed     = true
    }
    call metagenomics.report_primary_kraken_taxa {
        input:
            kraken_summary_report = kraken2.kraken2_summary_report
    }

    # download (multi-segment) genomes for each reference, fasta filename = colon-concatenated accession list
    scatter(taxon in read_tsv(taxid_to_ref_accessions_tsv)) {
        # taxon = [taxid, isolate_prefix, taxname, colon_delim_accession_list]
        call utils.string_split {
            input:
                joined_string = taxon[3],
                delimiter     = ":"
        }
        call ncbi.download_annotations {
            input:
                accessions          = string_split.tokens,
                combined_out_prefix = sub(taxon[3], ":", "-") # singularity does not like colons in filenames
        }
    }

    # subset reference genomes to those with ANI hits to contigs and cluster reference hits by any ANI similarity to each other
    call assembly.select_references {
        input:
            reference_genomes_fastas = download_annotations.combined_fasta,
            contigs_fasta            = spades.contigs_fasta
    }

    # assemble and produce stats for every reference cluster
    Array[String] assembly_header = ["entity:assembly_id", "assembly_name", "sample_id", "sample_name", "taxid", "tax_name", "tax_shortname", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "scaffolding_num_segments_recovered", "reference_num_segments_required", "reference_length", "reference_accessions", "skani_num_ref_clusters", "skani_this_cluster_num_refs", "skani_dist_tsv", "scaffolding_ani", "scaffolding_pct_ref_cov", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned", "assembly_method", "assembly_method_version", "biosample_accession", "sample"]
    scatter(ref_cluster_tar in select_references.matched_reference_clusters_fastas_tars) {

        call utils.tar_extract {
            input:
                tar_file = ref_cluster_tar
        }

        # assemble (scaffold-and-refine) genome against this reference cluster
        call assembly.scaffold {
            input:
                reads_bam               = deplete.bam_filtered_to_taxa,
                contigs_fasta           = spades.contigs_fasta,
                reference_genome_fasta  = tar_extract.files,
                min_length_fraction     = 0,
                min_unambig             = 0,
                allow_incomplete_output = true
        }
        if (scaffold.assembly_preimpute_length_unambiguous > min_scaffold_unambig) {
            # polish de novo assembly with reads
            call assemble_refbased.assemble_refbased as refine {
                input:
                    reads_unmapped_bams  = [deplete.bam_filtered_to_taxa],
                    reference_fasta      = scaffold.scaffold_fasta,
                    sample_name          = sample_id,
                    sample_original_name = sample_original_name
            }
        }

        # get taxid and taxname from taxid_to_ref_accessions_tsv
        call utils.fetch_row_from_tsv as tax_lookup {
            input:
                tsv        = taxid_to_ref_accessions_tsv,
                idx_col    = "accessions",
                idx_val    = sub(scaffold.scaffolding_chosen_ref_basename, "-", ":"),
                add_header = ["taxid", "isolate_prefix", "taxname", "accessions"]
        }
        String taxid          = tax_lookup.map["taxid"]
        String tax_name       = tax_lookup.map["taxname"]
        String isolate_prefix = tax_lookup.map["isolate_prefix"]

        # build output tsv row
        # If refine was skipped (scaffold too short), fall back to zeros / empty strings / gapfill fasta.
        Int   assembly_length_unambiguous = select_first([refine.assembly_length_unambiguous, 0])
        Float percent_reference_covered   = 1.0 * assembly_length_unambiguous / scaffold.reference_length
        File  assembly_fasta              = select_first([refine.assembly_fasta, scaffold.intermediate_gapfill_fasta])
        Map[String, String] stats_by_taxon = {
            "entity:assembly_id" : sample_id + "-" + taxid,
            "assembly_name" :  tax_name + ": " + sample_original_name,
            "sample_id" : sample_id,
            "sample_name" : sample_original_name,
            "taxid" : taxid,
            "tax_name" : tax_name,
            "tax_shortname" : isolate_prefix,

            "assembly_fasta" : assembly_fasta,
            "aligned_only_reads_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ""]),
            "coverage_plot" : select_first([refine.align_to_self_merged_coverage_plot, ""]),
            "assembly_length" : select_first([refine.assembly_length, "0"]),
            "assembly_length_unambiguous" : assembly_length_unambiguous,
            "reads_aligned" : select_first([refine.align_to_self_merged_reads_aligned, "0"]),
            "mean_coverage" : select_first([refine.align_to_self_merged_mean_coverage, "0"]),
            "percent_reference_covered" : percent_reference_covered,
            "scaffolding_num_segments_recovered" : scaffold.assembly_num_segments_recovered,
            "reference_num_segments_required" : scaffold.reference_num_segments_required,
            "reference_length" : scaffold.reference_length,
            "reference_accessions" : tax_lookup.map["accessions"],

            "skani_num_ref_clusters" : length(select_references.matched_reference_clusters_fastas_tars),
            "skani_this_cluster_num_refs" : length(tar_extract.files),
            "skani_dist_tsv" : scaffold.scaffolding_stats,
            "scaffolding_ani" : scaffold.skani_ani,
            "scaffolding_pct_ref_cov" : scaffold.skani_ref_aligned_frac,

            "intermediate_gapfill_fasta" : scaffold.intermediate_gapfill_fasta,
            "assembly_preimpute_length_unambiguous" : scaffold.assembly_preimpute_length_unambiguous,

            "replicate_concordant_sites" : select_first([refine.replicate_concordant_sites, "0"]),
            "replicate_discordant_snps" : select_first([refine.replicate_discordant_snps, "0"]),
            "replicate_discordant_indels" : select_first([refine.replicate_discordant_indels, "0"]),
            "replicate_discordant_vcf" : select_first([refine.replicate_discordant_vcf, ""]),

            "isnvsFile" : select_first([refine.align_to_self_isnvs_vcf, ""]),
            "aligned_bam" : select_first([refine.align_to_self_merged_aligned_only_bam, ""]),
            "coverage_tsv" : select_first([refine.align_to_self_merged_coverage_tsv, ""]),
            "read_pairs_aligned" : select_first([refine.align_to_self_merged_read_pairs_aligned, "0"]),
            "bases_aligned" : select_first([refine.align_to_self_merged_bases_aligned, "0"]),

            # NOTE(review): this value disagrees with the workflow-level output
            # assembly_method = "viral-ngs/scaffold_and_refine_multitaxa" below — confirm which is intended.
            "assembly_method" : "viral-ngs/assemble_denovo",
            "assembly_method_version" : scaffold.viralngs_version,

            "biosample_accession" : select_first([biosample_accession, ""]),

            "sample": '{"entityType":"sample","entityName":"' + sample_id + '"}'
        }

        # Only emit a stats row for assemblies that passed the length threshold.
        if(assembly_length_unambiguous > min_scaffold_unambig) {
            scatter(h in assembly_header) {
                String stat_by_taxon = stats_by_taxon[h]
            }
        }
    }

    ### summary stats
    if (length(select_all(stat_by_taxon)) > 0) {
        call utils.concatenate as assembly_stats_non_empty {
            input:
                infiles     = [write_tsv([assembly_header]), write_tsv(select_all(stat_by_taxon))],
                output_name = "assembly_metadata-~{sample_id}.tsv"
        }
    }
    if (length(select_all(stat_by_taxon)) == 0) {
        # No assembly passed the threshold: emit a header-only metadata tsv.
        call utils.concatenate as assembly_stats_empty {
            input:
                infiles     = [write_tsv([assembly_header])],
                output_name = "assembly_metadata-~{sample_id}.tsv"
        }
    }

    output {
        File cleaned_reads_unaligned_bam = deplete.bam_filtered_to_taxa
        File deduplicated_reads_unaligned = rmdup_ubam.dedup_bam
        File contigs_fasta = spades.contigs_fasta

        Int read_counts_raw = deplete.classified_taxonomic_filter_read_count_pre
        Int read_counts_depleted = deplete.classified_taxonomic_filter_read_count_post
        Int read_counts_dedup = rmdup_ubam.dedup_read_count_post
        Int read_counts_prespades_subsample = spades.subsample_read_count

        File kraken2_summary_report = kraken2.kraken2_summary_report
        File kraken2_krona_plot = kraken2.krona_report_html
        File kraken2_top_taxa_report = report_primary_kraken_taxa.ranked_focal_report
        String kraken2_focal_taxon_name = report_primary_kraken_taxa.focal_tax_name
        Int kraken2_focal_total_reads = report_primary_kraken_taxa.total_focal_reads
        String kraken2_top_taxon_id = report_primary_kraken_taxa.tax_id
        String kraken2_top_taxon_name = report_primary_kraken_taxa.tax_name
        String kraken2_top_taxon_rank = report_primary_kraken_taxa.tax_rank
        Int kraken2_top_taxon_num_reads = report_primary_kraken_taxa.num_reads
        Float kraken2_top_taxon_pct_of_focal = report_primary_kraken_taxa.percent_of_focal

        File raw_fastqc = merge_raw_reads.fastqc
        File cleaned_fastqc = fastqc_cleaned.fastqc_html
        File spikein_report = spikein.report
        String spikein_tophit = spikein.top_hit_id
        String spikein_pct_of_total_reads = spikein.pct_total_reads_mapped
        String spikein_pct_lesser_hits = spikein.pct_lesser_hits_of_mapped

        String viral_classify_version = kraken2.viralngs_version
        String viral_assemble_version = spades.viralngs_version

        Array[Map[String,String]] assembly_stats_by_taxon = stats_by_taxon
        File assembly_stats_by_taxon_tsv = select_first([assembly_stats_non_empty.combined, assembly_stats_empty.combined])
        String assembly_method = "viral-ngs/scaffold_and_refine_multitaxa"

        Int skani_num_ref_clusters = length(select_references.matched_reference_clusters_fastas_tars)
        File skani_contigs_to_refs_dist_tsv = select_references.skani_dist_full_tsv

        Array[String] assembly_all_taxids = taxid
        Array[String] assembly_all_taxnames = tax_name
        Array[Int] assembly_all_lengths_unambig = assembly_length_unambiguous
        Array[Float] assembly_all_pct_ref_cov = percent_reference_covered
        Array[File] assembly_all_fastas = assembly_fasta
    }
}

0 commit comments

Comments
 (0)