Skip to content

Commit 799e054

Browse files
authored
Merge pull request #196 from broadinstitute/dp-genbank
improvements to sarscov2_illumina_full
2 parents ddbe7dc + fedc2f1 commit 799e054

14 files changed

+613
-120
lines changed

.dockstore.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -230,6 +230,11 @@ workflows:
230230
primaryDescriptorPath: /pipes/WDL/workflows/sarscov2_genbank.wdl
231231
testParameterFiles:
232232
- empty.json
233+
- name: sarscov2_illumina_full
234+
subclass: WDL
235+
primaryDescriptorPath: /pipes/WDL/workflows/sarscov2_illumina_full.wdl
236+
testParameterFiles:
237+
- empty.json
233238
- name: sarscov2_lineages
234239
subclass: WDL
235240
primaryDescriptorPath: /pipes/WDL/workflows/sarscov2_lineages.wdl

pipes/WDL/tasks/tasks_assembly.wdl

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ task assemble {
1515
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt")
1616

1717
Int? machine_mem_gb
18-
String docker="quay.io/broadinstitute/viral-assemble:2.1.16.0"
18+
String docker="quay.io/broadinstitute/viral-assemble:2.1.16.1"
1919
}
2020

2121
command {
@@ -80,7 +80,7 @@ task scaffold {
8080
Float? scaffold_min_pct_contig_aligned
8181

8282
Int? machine_mem_gb
83-
String docker="quay.io/broadinstitute/viral-assemble:2.1.16.0"
83+
String docker="quay.io/broadinstitute/viral-assemble:2.1.16.1"
8484

8585
# do this in multiple steps in case the input doesn't actually have "assembly1-x" in the name
8686
String sample_name = basename(basename(contigs_fasta, ".fasta"), ".assembly1-spades")
@@ -226,7 +226,7 @@ task align_reads {
226226
Boolean? skip_mark_dupes=false
227227

228228
Int? machine_mem_gb
229-
String docker="quay.io/broadinstitute/viral-core:2.1.16"
229+
String docker="quay.io/broadinstitute/viral-core:2.1.18"
230230

231231
String sample_name = basename(basename(basename(reads_unmapped_bam, ".bam"), ".taxfilt"), ".clean")
232232
}
@@ -342,7 +342,7 @@ task refine_assembly_with_aligned_reads {
342342
Int? min_coverage=3
343343

344344
Int? machine_mem_gb
345-
String docker="quay.io/broadinstitute/viral-assemble:2.1.16.0"
345+
String docker="quay.io/broadinstitute/viral-assemble:2.1.16.1"
346346
}
347347

348348
parameter_meta {
@@ -389,9 +389,16 @@ task refine_assembly_with_aligned_reads {
389389
refined.fasta "${sample_name}.fasta" "${sample_name}"
390390
391391
# collect variant counts
392-
bcftools filter -e "FMT/DP<${min_coverage}" -S . "${sample_name}.sites.vcf.gz" -Ou | bcftools filter -i "AC>1" -Ou > "${sample_name}.diffs.vcf"
393-
bcftools filter -i 'TYPE="snp"' "${sample_name}.diffs.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_snps
394-
bcftools filter -i 'TYPE!="snp"' "${sample_name}.diffs.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_indels
392+
if (( $(cat refined.fasta | wc -l) > 1 )); then
393+
bcftools filter -e "FMT/DP<${min_coverage}" -S . "${sample_name}.sites.vcf.gz" -Ou | bcftools filter -i "AC>1" -Ou > "${sample_name}.diffs.vcf"
394+
bcftools filter -i 'TYPE="snp"' "${sample_name}.diffs.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_snps
395+
bcftools filter -i 'TYPE!="snp"' "${sample_name}.diffs.vcf" | bcftools query -f '%POS\n' | wc -l | tee num_indels
396+
else
397+
# empty output
398+
echo "0" > num_snps
399+
echo "0" > num_indels
400+
cp "${sample_name}.sites.vcf.gz" "${sample_name}.diffs.vcf"
401+
fi
395402
396403
# collect figures of merit
397404
set +o pipefail # grep will exit 1 if it fails to find the pattern
@@ -434,7 +441,7 @@ task refine {
434441
Int? min_coverage=1
435442

436443
Int? machine_mem_gb
437-
String docker="quay.io/broadinstitute/viral-assemble:2.1.16.0"
444+
String docker="quay.io/broadinstitute/viral-assemble:2.1.16.1"
438445

439446
String assembly_basename=basename(basename(assembly_fasta, ".fasta"), ".scaffold")
440447
}
@@ -504,7 +511,7 @@ task refine_2x_and_plot {
504511
String? plot_coverage_novoalign_options="-r Random -l 40 -g 40 -x 20 -t 100 -k"
505512

506513
Int? machine_mem_gb
507-
String docker="quay.io/broadinstitute/viral-assemble:2.1.16.0"
514+
String docker="quay.io/broadinstitute/viral-assemble:2.1.16.1"
508515

509516
# do this in two steps in case the input doesn't actually have "cleaned" in the name
510517
String sample_name = basename(basename(reads_unmapped_bam, ".bam"), ".cleaned")
@@ -636,7 +643,7 @@ task run_discordance {
636643
String out_basename = "run"
637644
Int min_coverage=4
638645

639-
String docker="quay.io/broadinstitute/viral-core:2.1.16"
646+
String docker="quay.io/broadinstitute/viral-core:2.1.18"
640647
}
641648

642649
command {

pipes/WDL/tasks/tasks_demux.wdl

Lines changed: 142 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ task merge_tarballs {
66
String out_filename
77

88
Int? machine_mem_gb
9-
String docker="quay.io/broadinstitute/viral-core:2.1.16"
9+
String docker="quay.io/broadinstitute/viral-core:2.1.18"
1010
}
1111

1212
command {
@@ -19,17 +19,17 @@ task merge_tarballs {
1919
file_utils.py --version | tee VERSION
2020
2121
file_utils.py merge_tarballs \
22-
${out_filename} ${sep=' ' tar_chunks} \
22+
~{out_filename} ~{sep=' ' tar_chunks} \
2323
--loglevel=DEBUG
2424
}
2525

2626
output {
27-
File combined_tar = "${out_filename}"
27+
File combined_tar = "~{out_filename}"
2828
String viralngs_version = read_string("VERSION")
2929
}
3030

3131
runtime {
32-
docker: "${docker}"
32+
docker: docker
3333
memory: select_first([machine_mem_gb, 7]) + " GB"
3434
cpu: 16
3535
disks: "local-disk 2625 LOCAL"
@@ -38,6 +38,47 @@ task merge_tarballs {
3838
}
3939
}
4040

41+
task samplesheet_rename_ids {
  # Rewrites the 'sample' column of a tab-delimited sample sheet, replacing
  # each id found in the (optional) rename_map with its mapped value. Ids
  # that have no entry in the map are written through unchanged.
  input {
    File   old_sheet
    File?  rename_map
    String old_id_col = 'internal_id'
    String new_id_col = 'external_id'
  }

  # output file name is derived from the input sheet, minus a '.txt' suffix
  String new_base = basename(old_sheet, '.txt')

  command <<<
    python3 << CODE
    import csv

    # build the old-id -> new-id lookup; when no rename_map is supplied the
    # placeholder resolves to /dev/null and the lookup stays empty
    with open('~{default="/dev/null" rename_map}', 'rt') as map_fh:
      id_lookup = {
        rec['~{old_id_col}']: rec['~{new_id_col}']
        for rec in csv.DictReader(map_fh, delimiter='\t')
      }

    # stream the sheet through, swapping ids in the sample column
    with open('~{old_sheet}', 'rt') as sheet_fh:
      sheet_reader = csv.DictReader(sheet_fh, delimiter='\t')
      with open('~{new_base}.renamed.txt', 'w', newline='') as renamed_fh:
        sheet_writer = csv.DictWriter(
          renamed_fh,
          sheet_reader.fieldnames,
          delimiter='\t',
          dialect=csv.unix_dialect,
          quoting=csv.QUOTE_MINIMAL)
        sheet_writer.writeheader()
        for rec in sheet_reader:
          current_id = rec['sample']
          rec['sample'] = id_lookup.get(current_id, current_id)
          sheet_writer.writerow(rec)
    CODE
  >>>

  output {
    File new_sheet = '~{new_base}.renamed.txt'
  }

  runtime {
    docker: "python:slim"
    memory: "1 GB"
    cpu: 1
    disks: "local-disk 50 HDD"
    dx_instance_type: "mem1_ssd1_v2_x2"
  }
}
81+
4182
task illumina_demux {
4283
input {
4384
File flowcell_tgz
@@ -60,7 +101,13 @@ task illumina_demux {
60101
Boolean? forceGC=true
61102

62103
Int? machine_mem_gb
63-
String docker="quay.io/broadinstitute/viral-core:2.1.16"
104+
String docker="quay.io/broadinstitute/viral-core:2.1.18"
105+
}
106+
parameter_meta {
  # Every entry in 'patterns' is a file-name glob, so each one needs the
  # leading '*' wildcard; a bare ".tgz" would only match a file literally
  # named ".tgz".
  flowcell_tgz: {
    description: "Illumina BCL directory compressed as tarball. Must contain RunInfo.xml (unless overridden by runinfo), SampleSheet.csv (unless overridden by samplesheet), RTAComplete.txt, and Data/Intensities/BaseCalls/*",
    patterns: ["*.tar.gz", "*.tar.zst", "*.tar.bz2", "*.tar.lz4", "*.tgz"]
  }
}
65112

66113
command {
@@ -77,12 +124,12 @@ task illumina_demux {
77124
read_utils.py --version | tee VERSION
78125
79126
read_utils.py extract_tarball \
80-
${flowcell_tgz} $FLOWCELL_DIR \
127+
~{flowcell_tgz} $FLOWCELL_DIR \
81128
--loglevel=DEBUG
82129
83130
# if we are overriding the RunInfo file, use the path of the file provided. Otherwise find the file
84-
if [ -n "${runinfo}" ]; then
85-
RUNINFO_FILE="${runinfo}"
131+
if [ -n "~{runinfo}" ]; then
132+
RUNINFO_FILE="~{runinfo}"
86133
else
87134
# full RunInfo.xml path
88135
RUNINFO_FILE="$(find $FLOWCELL_DIR -type f -name RunInfo.xml | head -n 1)"
@@ -157,45 +204,48 @@ task illumina_demux {
157204
158205
# use the passed-in (or default) WDL value first, then fall back to the auto-scaled value
159206
# if the result is empty (no value was passed in and no auto-scaled value exists), the parameter is omitted from the command
160-
if [ -n "${minimumBaseQuality}" ]; then demux_min_base_quality="${minimumBaseQuality}"; else demux_min_base_quality="$demux_min_base_quality"; fi
207+
if [ -n "~{minimumBaseQuality}" ]; then demux_min_base_quality="~{minimumBaseQuality}"; else demux_min_base_quality="$demux_min_base_quality"; fi
161208
if [ -n "$demux_min_base_quality" ]; then demux_min_base_quality="--minimum_base_quality=$demux_min_base_quality";fi
162209
163-
if [ -n "${threads}" ]; then demux_threads="${threads}"; else demux_threads="$demux_threads"; fi
210+
if [ -n "~{threads}" ]; then demux_threads="~{threads}"; else demux_threads="$demux_threads"; fi
164211
if [ -n "$demux_threads" ]; then demux_threads="--threads=$demux_threads"; fi
165212
166213
167-
if [ -n "${maxReadsInRamPerTile}" ]; then max_reads_in_ram_per_tile="${maxReadsInRamPerTile}"; else max_reads_in_ram_per_tile="$max_reads_in_ram_per_tile"; fi
214+
if [ -n "~{maxReadsInRamPerTile}" ]; then max_reads_in_ram_per_tile="~{maxReadsInRamPerTile}"; else max_reads_in_ram_per_tile="$max_reads_in_ram_per_tile"; fi
168215
if [ -n "$max_reads_in_ram_per_tile" ]; then max_reads_in_ram_per_tile="--max_reads_in_ram_per_tile=$max_reads_in_ram_per_tile"; fi
169216
170-
if [ -n "${maxRecordsInRam}" ]; then max_records_in_ram="${maxRecordsInRam}"; else max_records_in_ram="$max_records_in_ram"; fi
217+
if [ -n "~{maxRecordsInRam}" ]; then max_records_in_ram="~{maxRecordsInRam}"; else max_records_in_ram="$max_records_in_ram"; fi
171218
if [ -n "$max_records_in_ram" ]; then max_records_in_ram="--max_records_in_ram=$max_records_in_ram"; fi
172219
173220
# note that we are intentionally setting --threads to about 2x the core
174221
# count. seems to still provide speed benefit (over 1x) when doing so.
175222
illumina.py illumina_demux \
176223
$FLOWCELL_DIR \
177-
${lane} \
224+
~{lane} \
178225
. \
179-
${'--sampleSheet=' + samplesheet} \
180-
${'--runInfo=' + runinfo} \
181-
${'--sequencing_center=' + sequencingCenter} \
226+
~{'--sampleSheet=' + samplesheet} \
227+
~{'--runInfo=' + runinfo} \
228+
~{'--sequencing_center=' + sequencingCenter} \
182229
--outMetrics=metrics.txt \
183230
--commonBarcodes=barcodes.txt \
184-
${'--flowcell=' + flowcell} \
231+
~{'--flowcell=' + flowcell} \
185232
$demux_min_base_quality \
186-
${'--max_mismatches=' + maxMismatches} \
187-
${'--min_mismatch_delta=' + minMismatchDelta} \
188-
${'--max_no_calls=' + maxNoCalls} \
189-
${'--read_structure=' + readStructure} \
190-
${'--minimum_quality=' + minimumQuality} \
191-
${'--run_start_date=' + runStartDate} \
233+
~{'--max_mismatches=' + maxMismatches} \
234+
~{'--min_mismatch_delta=' + minMismatchDelta} \
235+
~{'--max_no_calls=' + maxNoCalls} \
236+
~{'--read_structure=' + readStructure} \
237+
~{'--minimum_quality=' + minimumQuality} \
238+
~{'--run_start_date=' + runStartDate} \
192239
$max_reads_in_ram_per_tile \
193240
$max_records_in_ram \
194241
--JVMmemory="$mem_in_mb"m \
195242
$demux_threads \
196-
${true='--force_gc=true' false="--force_gc=false" forceGC} \
243+
~{true='--force_gc=true' false="--force_gc=false" forceGC} \
197244
--append_run_id \
198245
--compression_level=5 \
246+
--out_meta_by_sample meta_by_sample.json \
247+
--out_meta_by_filename meta_by_fname.json \
248+
--out_runinfo runinfo.json \
199249
--loglevel=DEBUG
200250
201251
illumina.py guess_barcodes --expected_assigned_fraction=0 barcodes.txt metrics.txt barcodes_outliers.txt
@@ -208,14 +258,15 @@ task illumina_demux {
208258
echo "$(basename $bam .bam)" >> $OUT_BASENAMES
209259
done
210260
261+
# fastqc
211262
FASTQC_HARDCODED_MEM_PER_THREAD=250 # the value fastqc sets for -Xmx per thread, not adjustable
212263
num_cpus=$(nproc)
213264
num_bam_files=$(cat $OUT_BASENAMES | wc -l)
214265
num_fastqc_jobs=1
215266
num_fastqc_threads=1
216267
total_ram_needed_mb=250
217268
218-
# determine the number of fastq jobs
269+
# determine the number of fastqc jobs
219270
while [[ $total_ram_needed_mb -lt $mem_in_mb ]] && [[ $num_fastqc_jobs -lt $num_cpus ]] && [[ $num_fastqc_jobs -lt $num_bam_files ]]; do
220271
num_fastqc_jobs=$(($num_fastqc_jobs+1))
221272
total_ram_needed_mb=$(($total_ram_needed_mb+$FASTQC_HARDCODED_MEM_PER_THREAD))
@@ -257,10 +308,17 @@ task illumina_demux {
257308
Int runtime_sec = ceil(read_float("UPTIME_SEC"))
258309
Int cpu_load_15min = ceil(read_float("LOAD_15M"))
259310
String viralngs_version = read_string("VERSION")
311+
312+
Map[String,Map[String,String]] meta_by_sample = read_json('meta_by_sample.json')
313+
Map[String,Map[String,String]] meta_by_filename = read_json('meta_by_fname.json')
314+
Map[String,String] run_info = read_json('runinfo.json')
315+
File meta_by_sample_json = 'meta_by_sample.json'
316+
File meta_by_filename_json = 'meta_by_fname.json'
317+
File run_info_json = 'runinfo.json'
260318
}
261319

262320
runtime {
263-
docker: "${docker}"
321+
docker: docker
264322
memory: select_first([machine_mem_gb, 200]) + " GB"
265323
cpu: 32
266324
disks: "local-disk 2625 LOCAL"
@@ -269,3 +327,61 @@ task illumina_demux {
269327
preemptible: 0 # this is the very first operation before scatter, so let's get it done quickly & reliably
270328
}
271329
}
330+
331+
task map_map_setdefault {
  # Loads a JSON object of objects (outer key -> inner map) and ensures every
  # inner map contains each of the requested sub_keys, inserting an empty
  # string for any that are missing. Existing values are never overwritten.
  # The result is written to out.json.
  input {
    File          map_map_json
    Array[String] sub_keys
  }
  command <<<
    python3 << CODE
    import json

    # sub_keys arrive as one '*'-joined string. An empty array interpolates
    # to an empty string, which split() turns into a single empty entry, so
    # empty entries are dropped to avoid adding a bogus "" key to every map.
    sub_keys = [key for key in '~{sep="*" sub_keys}'.split('*') if key]

    with open('~{map_map_json}', 'rt') as inf:
      out = json.load(inf)

    for k in out.keys():
      for sub_key in sub_keys:
        out[k].setdefault(sub_key, "")

    with open('out.json', 'wt') as outf:
      json.dump(out, outf, indent=2)
    CODE
  >>>
  output {
    File out_json = 'out.json'
  }
  runtime {
    docker: "python:slim"
    memory: "1 GB"
    cpu: 1
    disks: "local-disk 20 HDD"
    dx_instance_type: "mem1_ssd1_v2_x2"
  }
}
360+
361+
task merge_maps {
  # Merges several JSON objects into one. Files are applied in the order
  # given, so when the same top-level key appears in more than one file the
  # value from the later file wins. The merged object is written to out.json
  # and also returned as a WDL Map.
  input {
    Array[File] maps_jsons
  }
  command <<<
    python3 << CODE
    import json

    # File paths arrive as one '*'-joined string. An empty array interpolates
    # to an empty string, which split() turns into a single empty path; drop
    # empty entries so an empty input yields an empty merged map instead of
    # failing on open('').
    infiles = [fname for fname in '~{sep='*' maps_jsons}'.split('*') if fname]

    out = {}
    for fname in infiles:
      with open(fname, 'rt') as inf:
        out.update(json.load(inf))

    with open('out.json', 'wt') as outf:
      json.dump(out, outf, indent=2)
    CODE
  >>>
  output {
    Map[String,Map[String,String]] merged = read_json('out.json')
  }
  runtime {
    docker: "python:slim"
    memory: "1 GB"
    cpu: 1
    disks: "local-disk 20 HDD"
    dx_instance_type: "mem1_ssd1_v2_x2"
  }
}

pipes/WDL/tasks/tasks_interhost.wdl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -142,7 +142,7 @@ task index_ref {
142142
File? novocraft_license
143143

144144
Int? machine_mem_gb
145-
String docker="quay.io/broadinstitute/viral-core:2.1.16"
145+
String docker="quay.io/broadinstitute/viral-core:2.1.18"
146146
}
147147

148148
command {

0 commit comments

Comments
 (0)