Skip to content

Commit 168f4ec

Browse files
authored
Add GC content to basic QC stats table (#39)
1 parent 89f93f4 commit 168f4ec

File tree

2 files changed

+14
-7
lines changed

2 files changed

+14
-7
lines changed

bin/summarize_seqtk_fqchk.py

+13-6
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,17 @@ def parse_seqtk_fqchk_output(seqtk_fqchk_output_path, quality_threshold):
99
with open(seqtk_fqchk_output_path, 'r') as f:
1010
reader = csv.DictReader(f)
1111
for row in reader:
12-
num_bases_and_avg_q = {}
12+
parsed_row = {}
1313
if row['position'] == 'ALL':
1414
percent_above_header = 'percent_bases_above_q' + str(quality_threshold)
15-
num_bases_and_avg_q['num_bases'] = int(row['num_bases'])
16-
num_bases_and_avg_q['average_q'] = float(row['average_q'])
17-
num_bases_and_avg_q[percent_above_header] = float(row[percent_above_header])
18-
output.append(num_bases_and_avg_q)
19-
15+
parsed_row['num_bases'] = int(row['num_bases'])
16+
parsed_row['average_q'] = float(row['average_q'])
17+
parsed_row[percent_above_header] = float(row[percent_above_header])
18+
percent_g = float(row['percent_g'])
19+
percent_c = float(row['percent_c'])
20+
parsed_row['percent_gc'] = percent_g + percent_c
21+
output.append(parsed_row)
22+
2023
return output
2124

2225

@@ -34,6 +37,8 @@ def main(args):
3437

3538
total_bases = sum([x['num_bases'] for x in seqtk_fqchk_output])
3639

40+
overall_percent_gc = sum([x['percent_gc'] * x['num_bases'] for x in seqtk_fqchk_output]) / total_bases
41+
3742
overall_average_q = sum([x['average_q'] * x['num_bases'] for x in seqtk_fqchk_output]) / total_bases
3843

3944
percent_above_header = 'percent_bases_above_q' + str(quality_threshold)
@@ -42,13 +47,15 @@ def main(args):
4247

4348
print(','.join([
4449
'sample_id',
50+
'percent_gc',
4551
'total_bases',
4652
'average_base_quality',
4753
percent_above_header,
4854
]))
4955

5056
print(','.join([
5157
args.sample_id,
58+
str(round(overall_percent_gc, 3)),
5259
str(total_bases),
5360
str(round(overall_average_q, 3)),
5461
str(round(overall_percent_above_threshold, 3)),

modules/combine_qc_stats.nf

+1-1
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ process combine_qc_stats {
1515
printf "sample_id\\n${sample_id}\\n" > sample_id.csv
1616
awk -F ',' 'BEGIN {OFS=FS}; NR==1 { for (i=1; i<=NF; i++) {idx[\$i] = i} }; { print \$(idx["abundance_1_name"]), \$(idx["abundance_1_fraction_total_reads"]) }' ${species_abundance} | sed -s 's/abundance_1/most_abundant_species/g' > species_abundance_stats.csv
1717
awk -F ',' 'BEGIN {OFS=FS}; NR==1 { for (i=1; i<=NF; i++) {idx[\$i] = i} }; { print \$(idx["estimated_genome_size_bp"]), \$(idx["estimated_depth_coverage"]) }' ${estimated_coverage} > estimated_coverage_stats.csv
18-
awk -F ',' 'BEGIN {OFS=FS}; NR==1 { for (i=1; i<=NF; i++) {idx[\$i] = i} }; { print \$(idx["total_bases"]), \$(idx["average_base_quality"]), \$(idx["percent_bases_above_q${params.seqtk_fqchk_threshold}"]) }' ${sequence_quality} > sequence_quality_stats.csv
18+
awk -F ',' 'BEGIN {OFS=FS}; NR==1 { for (i=1; i<=NF; i++) {idx[\$i] = i} }; { print \$(idx["total_bases"]), \$(idx["average_base_quality"]), \$(idx["percent_bases_above_q${params.seqtk_fqchk_threshold}"]), \$(idx["percent_gc"]) }' ${sequence_quality} > sequence_quality_stats.csv
1919
paste -d ',' sample_id.csv species_abundance_stats.csv estimated_coverage_stats.csv sequence_quality_stats.csv > ${sample_id}_combined_qc_stats.csv
2020
"""
2121
}

0 commit comments

Comments
 (0)