Skip to content

Commit 6486c78

Browse files
authored
Merge pull request #594 from broadinstitute/dp-cleanup
metagenomic denovo optimizations and fixes
2 parents fd77585 + 4dd8d3b commit 6486c78

File tree

3 files changed

+67
-34
lines changed

3 files changed

+67
-34
lines changed

pipes/WDL/tasks/tasks_ncbi.wdl

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,54 @@ task download_annotations {
7575
}
7676
}
7777

78+
task download_ref_genomes_from_tsv {
79+
input {
80+
File ref_genomes_tsv # [tax_id, isolate_prefix, taxname, colon_delim_accession_list]
81+
String emailAddress
82+
83+
String docker = "quay.io/broadinstitute/viral-phylo:2.4.1.0"
84+
}
85+
86+
command <<<
87+
set -ex -o pipefail
88+
ncbi.py --version | tee VERSION
89+
mkdir -p combined
90+
91+
python3<<CODE
92+
import csv
93+
import phylo.genbank
94+
with open("~{ref_genomes_tsv}", 'rt') as inf:
95+
reader = csv.DictReader(inf, delimiter='\t',
96+
fieldnames=['tax_id', 'isolate_prefix', 'taxname', 'accessions']) # backwards support for headerless tsvs
97+
# for the future: batch all the downloads in a single call and re-organize output files afterwards
98+
for ref_genome in reader:
99+
if ref_genome['tax_id'] != 'tax_id': # skip header
100+
accessions = ref_genome['accessions'].split(':')
101+
phylo.genbank.fetch_fastas_from_genbank(
102+
accessionList=accessions,
103+
destinationDir=".",
104+
emailAddress="~{emailAddress}",
105+
forceOverwrite=True,
106+
combinedFilePrefix="combined/" + '-'.join(accessions),
107+
removeSeparateFiles=False,
108+
chunkSize=500)
109+
CODE
110+
>>>
111+
112+
output {
113+
Array[File] ref_genomes_fastas = glob("combined/*.fasta")
114+
Int num_references = length(ref_genomes_fastas)
115+
}
116+
117+
runtime {
118+
docker: docker
119+
memory: "7 GB"
120+
cpu: 2
121+
dx_instance_type: "mem2_ssd1_v2_x2"
122+
maxRetries: 2
123+
}
124+
}
125+
78126
task sequencing_platform_from_bam {
79127
input {
80128
File bam

pipes/WDL/workflows/assemble_denovo_metagenomic.wdl

Lines changed: 10 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ workflow assemble_denovo_metagenomic {
3838
Array[String] taxa_to_dehost = ["Vertebrata"]
3939
Array[String] taxa_to_avoid_assembly = ["Vertebrata", "other sequences", "Bacteria"]
4040

41+
String table_name = "sample"
4142
}
4243

4344
Int min_scaffold_unambig = 300 # in base-pairs; any scaffolded assembly < this length will not be refined/polished
@@ -149,30 +150,21 @@ workflow assemble_denovo_metagenomic {
149150
kraken_summary_report = kraken2.kraken2_summary_report
150151
}
151152
152-
# download (multi-segment) genomes for each reference, fasta filename = colon-concatenated accession list
153-
scatter(taxon in read_tsv(taxid_to_ref_accessions_tsv)) {
154-
# taxon = [taxid, isolate_prefix, taxname, semicolon_delim_accession_list]
155-
call utils.string_split {
156-
input:
157-
joined_string = taxon[3],
158-
delimiter = ":"
159-
}
160-
call ncbi.download_annotations {
161-
input:
162-
accessions = string_split.tokens,
163-
combined_out_prefix = sub(taxon[3], ":", "-") # singularity does not like colons in filenames
164-
}
153+
# download (multi-segment) genomes for each reference, fasta filename = dash-concatenated accession list
154+
call ncbi.download_ref_genomes_from_tsv {
155+
input:
156+
ref_genomes_tsv = taxid_to_ref_accessions_tsv
165157
}
166158
167159
# subset reference genomes to those with ANI hits to contigs and cluster reference hits by any ANI similarity to each other
168160
call assembly.select_references {
169161
input:
170-
reference_genomes_fastas = download_annotations.combined_fasta,
162+
reference_genomes_fastas = download_ref_genomes_from_tsv.ref_genomes_fastas,
171163
contigs_fasta = spades.contigs_fasta
172164
}
173165
174166
# assemble and produce stats for every reference cluster
175-
Array[String] assembly_header = ["entity:assembly_id", "assembly_name", "sample_id", "sample_name", "taxid", "tax_name", "tax_shortname", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "scaffolding_num_segments_recovered", "reference_num_segments_required", "reference_length", "reference_accessions", "skani_num_ref_clusters", "skani_this_cluster_num_refs", "skani_dist_tsv", "scaffolding_ani", "scaffolding_pct_ref_cov", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned", "assembly_method", "assembly_method_version", "biosample_accession", "batch_ids", "sample"]
167+
Array[String] assembly_header = ["entity:assembly_id", "assembly_name", "sample_id", "sample_name", "taxid", "tax_name", "tax_shortname", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "scaffolding_num_segments_recovered", "reference_num_segments_required", "reference_length", "reference_accessions", "skani_num_ref_clusters", "skani_this_cluster_num_refs", "skani_dist_tsv", "scaffolding_ani", "scaffolding_pct_ref_cov", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned", "assembly_method", "assembly_method_version", "biosample_accession", "batch_ids", "~{table_name}"]
176168
scatter(ref_cluster_tar in select_references.matched_reference_clusters_fastas_tars) {
177169

178170
call utils.tar_extract {
@@ -197,9 +189,9 @@ workflow assemble_denovo_metagenomic {
197189
tsv = taxid_to_ref_accessions_tsv,
198190
idx_col = "accessions",
199191
idx_val = sub(scaffold.scaffolding_chosen_ref_basename, "-", ":"),
200-
add_header = ["taxid", "isolate_prefix", "taxname", "accessions"]
192+
add_header = ["tax_id", "isolate_prefix", "taxname", "accessions"]
201193
}
202-
String taxid = tax_lookup.map["taxid"]
194+
String taxid = tax_lookup.map["tax_id"]
203195
String tax_name = tax_lookup.map["taxname"]
204196
String isolate_prefix = tax_lookup.map["isolate_prefix"]
205197
@@ -266,7 +258,7 @@ workflow assemble_denovo_metagenomic {
266258

267259
"batch_ids" : unique_batch_ids.sorted_unique_joined,
268260

269-
"sample": '{"entityType":"sample","entityName":"' + sample_id + '"}'
261+
"~{table_name}": '{"entityType":"~{table_name}","entityName":"' + sample_id + '"}'
270262
}
271263

272264
if(assembly_length_unambiguous > min_scaffold_unambig) {

pipes/WDL/workflows/scaffold_and_refine_multitaxa.wdl

Lines changed: 9 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -22,35 +22,28 @@ workflow scaffold_and_refine_multitaxa {
2222
File taxid_to_ref_accessions_tsv
2323

2424
String? biosample_accession
25+
26+
String table_name = "sample"
2527
}
2628

2729
Int min_scaffold_unambig = 300 # in base-pairs; any scaffolded assembly < this length will not be refined/polished
2830
String sample_original_name = select_first([sample_name, sample_id])
2931

30-
# download (multi-segment) genomes for each reference, fasta filename = colon-concatenated accession list
31-
scatter(taxon in read_tsv(taxid_to_ref_accessions_tsv)) {
32-
# taxon = [taxid, isolate_prefix, taxname, semicolon_delim_accession_list]
33-
call utils.string_split {
34-
input:
35-
joined_string = taxon[3],
36-
delimiter = ":"
37-
}
38-
call ncbi.download_annotations {
39-
input:
40-
accessions = string_split.tokens,
41-
combined_out_prefix = sub(taxon[3], ":", "-") # singularity does not like colons in filenames
42-
}
32+
# download (multi-segment) genomes for each reference, fasta filename = dash-concatenated accession list
33+
call ncbi.download_ref_genomes_from_tsv {
34+
input:
35+
ref_genomes_tsv = taxid_to_ref_accessions_tsv
4336
}
4437
4538
# subset reference genomes to those with ANI hits to contigs and cluster reference hits by any ANI similarity to each other
4639
call assembly.select_references {
4740
input:
48-
reference_genomes_fastas = download_annotations.combined_fasta,
41+
reference_genomes_fastas = download_ref_genomes_from_tsv.ref_genomes_fastas,
4942
contigs_fasta = contigs_fasta
5043
}
5144
5245
# assemble and produce stats for every reference cluster
53-
Array[String] assembly_header = ["entity:assembly_id", "assembly_name", "sample_id", "sample_name", "taxid", "tax_name", "tax_shortname", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "scaffolding_num_segments_recovered", "reference_num_segments_required", "reference_length", "reference_accessions", "skani_num_ref_clusters", "skani_this_cluster_num_refs", "skani_dist_tsv", "scaffolding_ani", "scaffolding_pct_ref_cov", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned", "assembly_method", "assembly_method_version", "biosample_accession", "sample"]
46+
Array[String] assembly_header = ["entity:assembly_id", "assembly_name", "sample_id", "sample_name", "taxid", "tax_name", "tax_shortname", "assembly_fasta", "aligned_only_reads_bam", "coverage_plot", "assembly_length", "assembly_length_unambiguous", "reads_aligned", "mean_coverage", "percent_reference_covered", "scaffolding_num_segments_recovered", "reference_num_segments_required", "reference_length", "reference_accessions", "skani_num_ref_clusters", "skani_this_cluster_num_refs", "skani_dist_tsv", "scaffolding_ani", "scaffolding_pct_ref_cov", "intermediate_gapfill_fasta", "assembly_preimpute_length_unambiguous", "replicate_concordant_sites", "replicate_discordant_snps", "replicate_discordant_indels", "replicate_discordant_vcf", "isnvsFile", "aligned_bam", "coverage_tsv", "read_pairs_aligned", "bases_aligned", "assembly_method", "assembly_method_version", "biosample_accession", "~{table_name}"]
5447
scatter(ref_cluster_tar in select_references.matched_reference_clusters_fastas_tars) {
5548

5649
call utils.tar_extract {
@@ -142,7 +135,7 @@ workflow scaffold_and_refine_multitaxa {
142135

143136
"biosample_accession" : select_first([biosample_accession, ""]),
144137

145-
"sample": '{"entityType":"sample","entityName":"' + sample_id + '"}'
138+
"~{table_name}": '{"entityType":"~{table_name}","entityName":"' + sample_id + '"}'
146139
}
147140

148141
if(assembly_length_unambiguous > min_scaffold_unambig) {

0 commit comments

Comments
 (0)