
Commit 51b89cc

Merge pull request #2 from BCCDC-PHL/add-bracken
Add bracken
2 parents 9a2696a + 2fa7315

13 files changed: +313 −31 lines

README.md (+8)

````diff
@@ -4,6 +4,14 @@ A generic pipeline that can be run routinely on all Illumina sequence runs, rega
 * Sequence quality information
 * Possible contamination

+## Analyses
+
+* Parse run-level QC statistics from the 'InterOp' directory and write to `.csv` and `.json` format.
+* [FastQC](https://www.bioinformatics.babraham.ac.uk/projects/fastqc/): sample-level sequence quality metrics
+* [Kraken2](https://github.com/DerrickWood/kraken2) + [Bracken](https://github.com/jenniferlu717/Bracken): Taxonomic classification
+of reads. Estimation of relative abundances of taxonomic groups (genus, species) in each sample.
+* [MultiQC](https://github.com/ewels/MultiQC): Collect several QC metrics into a single interactive HTML report.
+
 ## Usage

 ```
````

assets/multiqc_config_base.yaml (new file, +12)

```yaml
show_analysis_paths: False
fn_clean_exts:
  - "_"
table_columns_visible:
  Kraken: False
top_modules:
  - "interop"
  - "fastqc"
remove_sections:
  - fastqc_per_base_sequence_content
  - fastqc_per_base_n_content
  - fastqc_sequence_length_distribution
```

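As a point of reference, MultiQC picks up settings like these when the file is supplied via its `--config` option; the pipeline's `multiqc` module (not shown in this commit) presumably forwards `ch_multiqc_config` the same way. A minimal standalone sketch, with the results directory name made up:

```python
import subprocess

# Hypothetical standalone run: apply the base config to an existing results directory.
# "pipeline_output/" is a placeholder; MultiQC's -c/--config flag is standard.
subprocess.run(
    ["multiqc", "--config", "assets/multiqc_config_base.yaml", "pipeline_output/"],
    check=True,
)
```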
bin/bracken_top_n_linelist.py (new file, +52)

```python
#!/usr/bin/env python

import argparse
import csv
import sys
import re
import json


def parse_bracken_report(bracken_report_path):
    bracken_report_lines = []
    with open(bracken_report_path, 'r') as f:
        reader = csv.DictReader(f, dialect='excel-tab')
        for row in reader:
            bracken_report_lines.append(row)

    return bracken_report_lines


def main(args):
    bracken_report = parse_bracken_report(args.bracken_report)

    bracken_report_sorted = sorted(bracken_report, key=lambda k: k['fraction_total_reads'], reverse=True)

    output_fields = ['sample_id', 'taxonomy_level']
    output_line = {
        'sample_id': args.sample_id,
        'taxonomy_level': bracken_report_sorted[0]['taxonomy_lvl']
    }

    for n in range(args.top_n):
        num = str(n + 1)
        name_field = 'abundance_' + num + '_name'
        output_line[name_field] = bracken_report_sorted[n]['name']
        output_fields.append(name_field)
        fraction_total_reads_field = 'abundance_' + num + '_fraction_total_reads'
        output_line[fraction_total_reads_field] = bracken_report_sorted[n]['fraction_total_reads']
        output_fields.append(fraction_total_reads_field)

    csv.register_dialect('unix-csv-quote-minimal', delimiter=',', doublequote=False, lineterminator='\n', quoting=csv.QUOTE_MINIMAL)
    writer = csv.DictWriter(sys.stdout, fieldnames=output_fields, dialect='unix-csv-quote-minimal')
    writer.writeheader()
    writer.writerow(output_line)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('bracken_report')
    parser.add_argument('-s', '--sample-id')
    parser.add_argument('-n', '--top-n', type=int)
    args = parser.parse_args()
    main(args)
```

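To make the expected input and output concrete, the sketch below mimics the script's sorting and line-list construction on a toy Bracken species report. The column names follow the usual Bracken output layout (`name`, `taxonomy_id`, `taxonomy_lvl`, `kraken_assigned_reads`, `added_reads`, `new_est_reads`, `fraction_total_reads`); only `name`, `taxonomy_lvl`, and `fraction_total_reads` are actually read by the script, and all values here are invented. Note the sort key is the raw `fraction_total_reads` string as it appears in the report.

```python
import csv
import io

# Toy Bracken species report (tab-separated); values are illustrative only.
toy_report = (
    "name\ttaxonomy_id\ttaxonomy_lvl\tkraken_assigned_reads\tadded_reads\tnew_est_reads\tfraction_total_reads\n"
    "Escherichia coli\t562\tS\t800\t50\t850\t0.85000\n"
    "Staphylococcus aureus\t1280\tS\t120\t10\t130\t0.13000\n"
)

rows = list(csv.DictReader(io.StringIO(toy_report), dialect='excel-tab'))

# Same ordering the script uses: the raw fraction_total_reads string, descending.
rows_sorted = sorted(rows, key=lambda r: r['fraction_total_reads'], reverse=True)

# Mirrors the one-row "line list" written to stdout for --top-n 2.
line_list = {'sample_id': 'sample01', 'taxonomy_level': rows_sorted[0]['taxonomy_lvl']}
for i, row in enumerate(rows_sorted[:2], start=1):
    line_list[f'abundance_{i}_name'] = row['name']
    line_list[f'abundance_{i}_fraction_total_reads'] = row['fraction_total_reads']

print(line_list)
# {'sample_id': 'sample01', 'taxonomy_level': 'S', 'abundance_1_name': 'Escherichia coli', ...}
```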
bin/get_read_length.py (new file, +33)

```python
#!/usr/bin/env python

import argparse
import json


def main(args):
    with open(args.sample_sheet_json, 'r') as f:
        sample_sheet = json.load(f)
        actual_read_length = sample_sheet['reads'][0]

    if actual_read_length < 60:
        read_length = 50
    elif actual_read_length < 90:
        read_length = 75
    elif actual_read_length < 125:
        read_length = 100
    elif actual_read_length < 175:
        read_length = 150
    elif actual_read_length < 225:
        read_length = 200
    elif actual_read_length < 275:
        read_length = 250
    else:
        read_length = 300
    print(read_length)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('sample_sheet_json')
    args = parser.parse_args()
    main(args)
```

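For illustration, a sketch of the input this helper expects, assuming the JSON produced by the pipeline's `parse_sample_sheet` step exposes a `reads` list of cycle counts (only `reads[0]` is used; the rest of the structure is not shown in this commit). The bucketed values (50/75/100/150/200/250/300) presumably select a matching Bracken database read length.

```python
import json

# Toy parsed-SampleSheet JSON; the 'reads' list of cycle counts is assumed from
# how the script indexes sample_sheet['reads'][0]. Shown: a 2x151 run with two
# 8-cycle index reads (values invented).
with open("sample_sheet_toy.json", "w") as f:
    json.dump({"reads": [151, 8, 8, 151]}, f)

# The script maps the first read length onto one of the supported values,
# e.g. 151 -> 150, 76 -> 75, 251 -> 250. Hypothetical invocation:
#   get_read_length.py sample_sheet_toy.json    # prints: 150
```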
bin/kraken_parser.py (new file, +64)

```python
#!/usr/bin/env python3

import argparse
import json
import sys


def main(args):

    taxonomic_levels = {'U', 'D', 'K', 'P', 'C', 'O', 'F', 'G', 'S'}
    # assert args.taxonomic_level in rank_codes, "Rank must be one of [(U)nclassified, (D)omain, (K)ingdom, (P)hylum, (C)lass, (O)rder, (F)amily, (G)enus, or (S)pecies.]"

    total_reads = 0
    unclassified_reads = 0
    total_reads_reported = 0
    other_reads = 0

    headers = [
        "percent_reads_in_clade",
        "num_reads_in_clade",
        "ncbi_taxonomy_id",
        "taxon_name",
    ]

    print('\t'.join(headers))

    with open(args.kraken_report, 'r') as f:
        for line in f:
            record = {}
            split_line = [record.strip() for record in line.strip().split("\t")]

            record['percent_reads_clade'] = float(split_line[0])
            record['num_reads_clade'] = int(split_line[1])
            record['num_reads_taxon'] = int(split_line[2])
            record['taxonomic_level'] = split_line[3]
            record['ncbi_taxonomy_id'] = split_line[4]
            record['clade_name'] = split_line[5]

            if record['clade_name'] == "unclassified":
                unclassified_reads = record['num_reads_clade']
                total_reads += record['num_reads_clade']
            elif record['clade_name'] == "root":
                total_reads += record['num_reads_clade']

            # record['percent_reads_in_clade'] = record['num_reads_clade'] / float(total_reads) * 100

            if record['taxonomic_level'] == args.taxonomic_level and record['percent_reads_clade'] >= args.threshold_percent:
                print('\t'.join([str(record['percent_reads_clade']), str(record['num_reads_clade']), str(record['ncbi_taxonomy_id']), record['clade_name']]))
                total_reads_reported += record['num_reads_clade']

    print(str('%.3f' % (unclassified_reads / float(total_reads) * 100)) + '\t' + str(unclassified_reads) + '\t\t' + 'unclassified')
    total_reads_reported += unclassified_reads
    other_reads = total_reads - total_reads_reported
    print(str('%.3f' % (other_reads / float(total_reads) * 100)) + '\t' + str(other_reads) + '\t\t' + 'other')


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('kraken_report')
    parser.add_argument('-l', '--taxonomic_level')
    parser.add_argument('-t', '--threshold_percent', type=float)
    args = parser.parse_args()
    main(args)
```

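For context, the parsing above assumes the standard six-column Kraken2 report layout: percent of reads in the clade, reads in the clade, reads assigned directly to the taxon, rank code, NCBI taxonomy ID, and the (indented) taxon name. A toy line handled the same way, with invented numbers:

```python
# One line of a Kraken2 report (tab-separated); values are illustrative.
line = " 12.50\t1250\t300\tG\t561\t      Escherichia\n"

fields = [field.strip() for field in line.strip().split("\t")]
record = {
    'percent_reads_clade': float(fields[0]),   # % of reads in this clade
    'num_reads_clade': int(fields[1]),         # reads in the clade (incl. children)
    'num_reads_taxon': int(fields[2]),         # reads assigned directly to this taxon
    'taxonomic_level': fields[3],              # rank code: U/D/K/P/C/O/F/G/S
    'ncbi_taxonomy_id': fields[4],
    'clade_name': fields[5],
}
print(record)
# With -l G -t 1.0 the script would emit this record: the rank code matches and 12.5 >= 1.0.
```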
bin/parse_run_summary.py (+61 −15)

```diff
@@ -8,24 +8,28 @@
 def parse_read_summary(summary_path):
     read_summary_headers = []
     read_summary_lines = []
-    # Basic approach to parsing text between two specific lines
-    # described here: https://stackoverflow.com/a/7559542/780188
+
+    replaced_fields = {'%>=q30': 'percent_greater_than_q30',
+                       '%_occupied': 'percent_occupied'}
+
     with open(summary_path) as summary:
         for line in summary:
             if re.match("^Level", line):
                 read_summary_headers = re.split("\s*,", line.rstrip())
                 read_summary_headers = [
                     x.lower().replace(" ", "_") for x in read_summary_headers
                 ]
-                read_summary_headers = [
-                    x.replace("%>=q30", "percent_greater_than_q30") for x in read_summary_headers
-                ]
+                for idx, header in enumerate(read_summary_headers):
+                    if header in replaced_fields:
+                        read_summary_headers[idx] = replaced_fields[header]
+
                 break
         for line in summary:
             if re.match("^Total", line):
-                read_summary_lines.append(re.split("\s*,", line.rstrip()))
+                read_summary_lines.append(re.split(",", line.rstrip()))
                 break
-            read_summary_lines.append(re.split("\s*,", line.rstrip()))
+            else:
+                read_summary_lines.append(re.split(",", line.rstrip()))

     read_summary = []
     for line in read_summary_lines:
@@ -41,14 +45,15 @@ def parse_read_summary(summary_path):

     return read_summary

+
 def parse_read_summary_detail(summary_path):
     headers = [
         'lane',
         'surface',
         'tiles',
         'density',
         'clusters_passing_filter',
-        'legacy_pasing_prephasing_rate',
+        'legacy_phasing_prephasing_rate',
         'phasing_slope_offset',
         'prephasing_slope_offset',
         'reads',
@@ -64,6 +69,31 @@ def parse_read_summary_detail(summary_path):
         'percent_occupied',
         'intensity_at_cycle_1',
     ]
+    average_stdev_fields = [
+        'aligned',
+        'clusters_passing_filter',
+        'density',
+        'error',
+        'error_100',
+        'error_75',
+        'error_35',
+        'intensity_at_cycle_1',
+        'percent_occupied',
+    ]
+    slash_fields = { 'legacy_phasing_prephasing_rate': {'numerator_field': 'legacy_phasing_rate',
+                                                        'denominator_field': 'legacy_prephasing_rate'},
+                     'phasing_slope_offset': {'numerator_field': 'phasing_slope',
+                                              'denominator_field': 'phasing_offset'},
+                     'prephasing_slope_offset': {'numerator_field': 'prephasing_slope',
+                                                 'denominator_field': 'prephasing_offset'},
+    }
+    float_fields = [
+        'percent_greater_than_q30',
+        'reads',
+        'reads_passing_filter',
+        'yield',
+    ]
+
     lines_by_read = {
         'read_1': [],
         'read_i1': [],
@@ -73,15 +103,13 @@
     with open(summary_path) as summary:
         current_read = None
         for line in summary:
-            if re.match("^Read 1$", line):
+            if re.match("^Read 1\n$", line):
                 current_read = 'read_1'
-            elif re.match("^Read 2 \(I\)$", line):
+            elif re.match("^Read 2 \(I\)\n$", line):
                 current_read = 'read_i1'
-            elif re.match("^Read 3 \(I\)$", line):
+            elif re.match("^Read 3 \(I\)\n$", line):
                 current_read = 'read_i2'
-            elif re.match("^Read 4$", line):
-                current_read = 'read_2'
-            elif re.match("^Read 4$", line):
+            elif re.match("^Read 4$\n", line):
                 current_read = 'read_2'
             elif re.match("^Extracted", line) or re.match("^Called", line) or re.match("^Scored", line):
                 current_read = None
@@ -91,15 +119,33 @@
                 lines_by_read[current_read].append(read_line_dict)
             else:
                 pass
+
+            for field in average_stdev_fields:
+                string_value = read_line_dict[field]
+                [average, stdev] = [float(value) for value in string_value.split(' +/- ')]
+                read_line_dict[field] = { 'average': average,
+                                          'stdev': stdev }
+
+            for field, num_denom in slash_fields.items():
+                string_value = read_line_dict[field]
+                numerator_field = num_denom['numerator_field']
+                denominator_field = num_denom['denominator_field']
+                [numerator, denominator] = [float(value) for value in string_value.split(' / ')]
+                read_line_dict[numerator_field] = numerator
+                read_line_dict[denominator_field] = denominator
+                read_line_dict.pop(field, None)

     return lines_by_read

+
 def main(args):
     read_summary = parse_read_summary(args.summary)
     read_summary_detail = parse_read_summary_detail(args.summary)

+    output = {'read_summary': read_summary,
+              'read_details': read_summary_detail}
     # print(json.dumps(read_summary))
-    print(json.dumps(read_summary_detail))
+    print(json.dumps(output))

 if __name__ == '__main__':
     parser = argparse.ArgumentParser()
```

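To make the new field handling concrete, a short sketch of the two value formats the added code splits apart, with invented numbers in the InterOp summary style (`average +/- stdev` and `numerator / denominator`):

```python
# "average +/- stdev" fields (e.g. density, error rate) become nested dicts.
density = "2345 +/- 67"
average, stdev = [float(v) for v in density.split(' +/- ')]
print({'average': average, 'stdev': stdev})      # {'average': 2345.0, 'stdev': 67.0}

# "numerator / denominator" fields are split into two separate keys, e.g.
# phasing_slope_offset -> phasing_slope and phasing_offset.
phasing_slope_offset = "0.102 / 0.001"
slope, offset = [float(v) for v in phasing_slope_offset.split(' / ')]
print({'phasing_slope': slope, 'phasing_offset': offset})
```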
main.nf (+14 −7)

```diff
@@ -7,14 +7,19 @@ include { multiqc } from './modules/multiqc.nf'
 include { interop_summary } from './modules/interop.nf'
 include { parse_sample_sheet } from './modules/sample-sheet.nf'
 include { kraken2 } from './modules/kraken2.nf'
+include { bracken } from './modules/bracken.nf'
+include { abundance_top_n } from './modules/bracken.nf'

 workflow {
   ch_fastq = Channel.fromFilePairs( "${params.run_dir}/Data/Intensities/BaseCalls/*_{R1,R2}_*.fastq.gz" )
   ch_sample_sheet = Channel.fromPath( "${params.run_dir}/SampleSheet.csv" )
+  ch_multiqc_config = Channel.fromPath( "${projectDir}/assets/multiqc_config_base.yaml" )
   ch_run_dir = Channel.fromPath(params.run_dir)
   ch_run_id = Channel.fromPath(params.run_dir).map{ it -> it.baseName }
   ch_kraken2_db = Channel.fromPath(params.kraken2_db)
-
+  ch_bracken_db = Channel.fromPath(params.bracken_db)
+  ch_taxonomic_levels = Channel.from('Genus', 'Species')
+
   main:
     interop_summary(ch_run_id.combine(ch_run_dir))

@@ -23,15 +28,17 @@ workflow {
     fastqc(ch_fastq)

     kraken2(ch_fastq.combine(ch_kraken2_db))
+    bracken(kraken2.out.combine(ch_bracken_db).combine(parse_sample_sheet.out).combine(ch_taxonomic_levels))
+
+    abundance_top_n(bracken.out)

-    // Line below is just composing the run_id and the list of fastqc_outdirs into a new list. There must be a better way(?)
-    // "run_id" + ["fastqc_outdir1", "fastqc_outdir2", ...] => ["run_id", ["fastqc_outdir1", "fastqc_outdir2"]]
+    abundance_top_n.out.filter{ it[2] == 'Genus' }.map{ it -> it[1] }.collectFile(keepHeader: true, sort: { it.text }, name: "top_3_abundances_genus.csv", storeDir: "${params.outdir}/abundance_top_n")
+    abundance_top_n.out.filter{ it[2] == 'Species' }.map{ it -> it[1] }.collectFile(keepHeader: true, sort: { it.text }, name: "top_5_abundances_species.csv", storeDir: "${params.outdir}/abundance_top_n")

     ch_fastqc_collected = fastqc.out.map{ it -> [it[1], it[2]] }.collect()
-    ch_kraken2_collected = kraken2.out.collect()
-    ch_all_qc_outputs = ch_fastqc_collected.combine(ch_kraken2_collected)
+    ch_bracken_species_multiqc_collected = bracken.out.filter{ it[4] == 'Species' }.map{ it -> it[2] }.collect()
+    ch_all_qc_outputs = interop_summary.out.map{ it -> it.drop(1) }.combine(ch_fastqc_collected).combine(ch_bracken_species_multiqc_collected)

     ch_all_qc_outputs_with_run_id = ch_run_id.combine(ch_all_qc_outputs).map{ it -> [it[0], it.drop(1)] }
-    multiqc(ch_all_qc_outputs_with_run_id)
-
+    multiqc(ch_multiqc_config.combine(ch_all_qc_outputs_with_run_id))
 }
```
