Skip to content

Commit a08f288

Browse files
authored
Adjust bracken results for unclassified reads (#69)
1 parent 3c05ec6 commit a08f288

File tree

3 files changed

+202
-11
lines changed

3 files changed

+202
-11
lines changed
Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,188 @@
1+
#!/usr/bin/env python3
2+
3+
import argparse
4+
import csv
5+
import json
6+
7+
8+
def parse_kraken_report(kraken_report_path):
9+
kraken_report = []
10+
11+
fieldnames = [
12+
'percent_seqs_this_clade',
13+
'num_seqs_this_clade',
14+
'num_seqs_this_taxon',
15+
'taxonomic_level',
16+
'ncbi_taxonomy_id',
17+
'taxon_name',
18+
]
19+
int_fields = [
20+
'num_seqs_this_clade',
21+
'num_seqs_this_taxon',
22+
]
23+
float_fields = [
24+
'percent_seqs_this_clade',
25+
]
26+
with open(kraken_report_path, 'r') as f:
27+
reader = csv.DictReader(f, fieldnames=fieldnames, dialect='excel-tab')
28+
for row in reader:
29+
for field in int_fields:
30+
try:
31+
row[field] = int(row[field])
32+
except ValueError as e:
33+
row[field] = None
34+
for field in float_fields:
35+
try:
36+
row[field] = float(row[field])
37+
except ValueError as e:
38+
row[field] = None
39+
kraken_report.append(row)
40+
41+
return kraken_report
42+
43+
44+
def parse_bracken_abundances(bracken_abundances_path):
45+
bracken_abundances = []
46+
int_fields = [
47+
'kraken_assigned_reads',
48+
'added_reads',
49+
'new_est_reads',
50+
]
51+
float_fields = [
52+
'fraction_total_reads',
53+
]
54+
with open(bracken_abundances_path, 'r') as f:
55+
reader = csv.DictReader(f, dialect='excel-tab')
56+
for row in reader:
57+
for field in int_fields:
58+
try:
59+
row[field] = int(row[field])
60+
except ValueError as e:
61+
row[field] = None
62+
for field in float_fields:
63+
try:
64+
row[field] = float(row[field])
65+
except ValueError as e:
66+
row[field] = None
67+
bracken_abundances.append(row)
68+
69+
return bracken_abundances
70+
71+
72+
def get_num_unclassified_seqs(parsed_kraken_report):
73+
unclassified_records = list(filter(lambda x: x['ncbi_taxonomy_id'] == "0", parsed_kraken_report))
74+
num_unclassified_seqs = 0
75+
if len(unclassified_records) > 0:
76+
unclassified_record = unclassified_records[0]
77+
if 'num_seqs_this_taxon' in unclassified_record:
78+
num_unclassified_seqs = unclassified_record['num_seqs_this_taxon']
79+
80+
return num_unclassified_seqs
81+
82+
83+
def get_num_classified_seqs(parsed_kraken_report):
84+
root_records = list(filter(lambda x: x['ncbi_taxonomy_id'] == "1", parsed_kraken_report))
85+
num_classified_seqs = 0
86+
if len(root_records) > 0:
87+
root_record = root_records[0]
88+
if 'num_seqs_this_clade' in root_record:
89+
num_classified_seqs = root_record['num_seqs_this_clade']
90+
91+
return num_classified_seqs
92+
93+
94+
def adjust_bracken_report(bracken_report, num_unclassified_seqs):
95+
adjusted_bracken_report = []
96+
unclassified_record = {
97+
"num_seqs_this_clade": num_unclassified_seqs,
98+
"num_seqs_this_taxon": num_unclassified_seqs,
99+
"taxonomic_level": "U",
100+
"ncbi_taxonomy_id": "0",
101+
"taxon_name": "unclassified"
102+
}
103+
root_bracken_records = list(filter(lambda x: x['ncbi_taxonomy_id'] == "1", bracken_report))
104+
105+
if len(root_bracken_records) > 0:
106+
root_bracken_record = root_bracken_records[0]
107+
num_classified_seqs = root_bracken_record['num_seqs_this_clade']
108+
total_seqs = num_classified_seqs + num_unclassified_seqs
109+
if total_seqs > 0:
110+
unclassified_record['percent_seqs_this_clade'] = round(unclassified_record['num_seqs_this_clade'] / total_seqs * 100, 2)
111+
adjusted_bracken_report.append(unclassified_record)
112+
for bracken_report_record in bracken_report:
113+
bracken_report_record['percent_seqs_this_clade'] = round(bracken_report_record['num_seqs_this_clade'] / total_seqs * 100, 2)
114+
adjusted_bracken_report.append(bracken_report_record)
115+
116+
return adjusted_bracken_report
117+
118+
119+
def adjust_bracken_abundances(bracken_abundances, num_total_seqs, num_unclassified_seqs):
120+
adjusted_bracken_abundances = []
121+
unclassified_record = {
122+
"name": "unclassified",
123+
"taxonomy_id": "0",
124+
"taxonomy_lvl": "U",
125+
"kraken_assigned_reads": num_unclassified_seqs,
126+
"added_reads": 0,
127+
"new_est_reads": num_unclassified_seqs,
128+
}
129+
if num_total_seqs > 0:
130+
unclassified_record['fraction_total_reads'] = round(num_unclassified_seqs / num_total_seqs, 6)
131+
adjusted_bracken_abundances.append(unclassified_record)
132+
for bracken_abundance_record in bracken_abundances:
133+
bracken_abundance_record['fraction_total_reads'] = round(bracken_abundance_record['new_est_reads'] / num_total_seqs, 6)
134+
adjusted_bracken_abundances.append(bracken_abundance_record)
135+
136+
return adjusted_bracken_abundances
137+
138+
139+
def main(args):
140+
141+
kraken_report = parse_kraken_report(args.kraken_report)
142+
num_unclassified_seqs = get_num_unclassified_seqs(kraken_report)
143+
num_classified_seqs = get_num_classified_seqs(kraken_report)
144+
num_total_seqs = num_unclassified_seqs + num_classified_seqs
145+
146+
bracken_report = parse_kraken_report(args.bracken_report)
147+
adjusted_bracken_report = adjust_bracken_report(bracken_report, num_unclassified_seqs)
148+
bracken_abundances = parse_bracken_abundances(args.bracken_abundances)
149+
adjusted_bracken_abundances = adjust_bracken_abundances(bracken_abundances, num_total_seqs, num_unclassified_seqs)
150+
151+
abundances_output_fieldnames = [
152+
'name',
153+
'taxonomy_id',
154+
'taxonomy_lvl',
155+
'kraken_assigned_reads',
156+
'added_reads',
157+
'new_est_reads',
158+
'fraction_total_reads',
159+
]
160+
with open(args.adjusted_abundances, 'w') as f:
161+
writer = csv.DictWriter(f, fieldnames=abundances_output_fieldnames, dialect='excel-tab', quoting=csv.QUOTE_MINIMAL)
162+
writer.writeheader()
163+
for row in adjusted_bracken_abundances:
164+
writer.writerow(row)
165+
166+
report_output_fieldnames = [
167+
'percent_seqs_this_clade',
168+
'num_seqs_this_clade',
169+
'num_seqs_this_taxon',
170+
'taxonomic_level',
171+
'ncbi_taxonomy_id',
172+
'taxon_name',
173+
]
174+
with open(args.adjusted_report, 'w') as f:
175+
writer = csv.DictWriter(f, fieldnames=report_output_fieldnames, dialect='excel-tab', quoting=csv.QUOTE_MINIMAL)
176+
for row in adjusted_bracken_report:
177+
writer.writerow(row)
178+
179+
180+
if __name__ == '__main__':
181+
parser = argparse.ArgumentParser()
182+
parser.add_argument('-k', '--kraken-report')
183+
parser.add_argument('-b', '--bracken-report')
184+
parser.add_argument('-a', '--bracken-abundances')
185+
parser.add_argument('--adjusted-report')
186+
parser.add_argument('--adjusted-abundances')
187+
args = parser.parse_args()
188+
main(args)

main.nf

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -47,13 +47,13 @@ workflow {
4747
kraken2(ch_fastq.combine(ch_kraken2_db))
4848
bracken(kraken2.out.combine(ch_bracken_db).combine(parse_sample_sheet.out).combine(ch_taxonomic_levels).unique{ it -> [it[0], it[4]] })
4949

50-
abundance_top_n(bracken.out)
50+
abundance_top_n(bracken.out.adjusted)
5151

5252
abundance_top_n.out.filter{ it[2] == 'Genus' }.map{ it -> it[1] }.collectFile(keepHeader: true, sort: { it.text }, name: "top_3_abundances_genus.csv", storeDir: "${params.outdir}/abundance_top_n")
5353
abundance_top_n.out.filter{ it[2] == 'Species' }.map{ it -> it[1] }.collectFile(keepHeader: true, sort: { it.text }, name: "top_5_abundances_species.csv", storeDir: "${params.outdir}/abundance_top_n")
5454

5555
ch_fastqc_collected = fastqc.out.map{ it -> [it[1], it[2]] }.collect()
56-
ch_bracken_species_multiqc_collected = bracken.out.filter{ it[4] == 'Species' }.map{ it -> it[2] }.collect()
56+
ch_bracken_species_multiqc_collected = bracken.out.adjusted.filter{ it[3] == 'Species' }.map{ it -> it[1] }.collect()
5757
ch_all_qc_outputs = interop_summary.out.map{ it -> it.drop(1) }.combine(ch_fastqc_collected).combine(ch_bracken_species_multiqc_collected)
5858

5959
combine_qc_stats(abundance_top_n.out.filter{ it[2] == 'Species' }.map{ it -> [it[0], it[1]] }.join(seqtk_fqchk_summary.out.map{ it -> [it[0], it[1]] }).join(mash_sketch_summary.out)).map{ it -> it[1] }.collectFile(keepHeader: true, sort: { it.text }, name: "basic_qc_stats.csv", storeDir: "${params.outdir}/basic_qc_stats")

modules/bracken.nf

Lines changed: 12 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -12,30 +12,33 @@ process bracken {
1212
tuple val(sample_id), path(kraken2_report), path(bracken_db), path(sample_sheet_json), val(taxonomic_level)
1313

1414
output:
15-
tuple val(sample_id), path("${sample_id}_${taxonomic_level}_bracken.txt"), path("${sample_id}_${taxonomic_level}_multiqc_bracken.txt"), path("${sample_id}_${taxonomic_level}_bracken_abundances.tsv"), val(taxonomic_level)
15+
tuple val(sample_id), path("${sample_id}_${taxonomic_level}_bracken.txt"), path("${sample_id}_${taxonomic_level}_bracken_abundances.tsv"), val(taxonomic_level), emit: unadjusted
16+
tuple val(sample_id), path("${sample_id}_${taxonomic_level}_bracken_adjusted.txt"), path("${sample_id}_${taxonomic_level}_bracken_abundances_adjusted.tsv"), val(taxonomic_level), emit: adjusted
1617

1718
script:
1819
taxonomic_level_char = taxonomic_level.substring(0,1)
19-
// MultiQC uses the following regex on the first two lines of a file to identify it as a kraken output:
20-
// '^\s{1,2}(\d{1,2}\.\d{1,2})\t(\d+)\t(\d+)\t([\dUDKPCOFGS-]{1,3})\t(\d+)\s+(.+)'
21-
// The output is modified slightly to mimic kraken2 output so that it can be parsed by MultiQC.
22-
// The original outputs are stored to the output dir, and the modified ones are sent to MultiQC.
2320
"""
2421
bracken -d ${bracken_db} \
2522
-i ${kraken2_report} \
2623
-w ${sample_id}_${taxonomic_level}_bracken.txt \
2724
-o ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted.tsv \
2825
-r \$(get_read_length.py ${sample_sheet_json}) \
2926
-l ${taxonomic_level_char}
27+
3028
head -n 1 ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted.tsv > bracken_abundances_header.tsv
3129
tail -n+2 ${sample_id}_${taxonomic_level}_bracken_abundances_unsorted.tsv | sort -t \$'\\t' -nrk 7,7 > ${sample_id}_${taxonomic_level}_bracken_abundances_data.tsv
3230
cat bracken_abundances_header.tsv ${sample_id}_${taxonomic_level}_bracken_abundances_data.tsv > ${sample_id}_${taxonomic_level}_bracken_abundances.tsv
33-
sed 's/100\\.00/99\\.99/' ${sample_id}_${taxonomic_level}_bracken.txt | awk 'NR != 2' | awk '{print " ", \$0}' > ${sample_id}_${taxonomic_level}_bracken_tmp.txt
34-
echo -e " 0.01\\t1\\t1\\tU\\t1\\tunclassified" > unclassified_placeholder.tsv
35-
cat unclassified_placeholder.tsv ${sample_id}_${taxonomic_level}_bracken_tmp.txt > ${sample_id}_${taxonomic_level}_multiqc_bracken.txt
31+
32+
adjust_for_unclassified_reads.py \
33+
--kraken-report ${kraken2_report} \
34+
--bracken-report ${sample_id}_${taxonomic_level}_bracken.txt \
35+
--bracken-abundances ${sample_id}_${taxonomic_level}_bracken_abundances.tsv \
36+
--adjusted-report ${sample_id}_${taxonomic_level}_bracken_adjusted.txt \
37+
--adjusted-abundances ${sample_id}_${taxonomic_level}_bracken_abundances_adjusted.tsv
3638
"""
3739
}
3840

41+
3942
process abundance_top_n {
4043

4144
tag { sample_id + " / " + taxonomic_level }
@@ -47,7 +50,7 @@ process abundance_top_n {
4750
cpus 1
4851

4952
input:
50-
tuple val(sample_id), path(_), path(_2), path(bracken_abundances), val(taxonomic_level)
53+
tuple val(sample_id), path(_), path(bracken_abundances), val(taxonomic_level)
5154

5255
output:
5356
tuple val(sample_id), path("${sample_id}_${taxonomic_level}_top_*.tsv"), val(taxonomic_level)

0 commit comments

Comments
 (0)