Skip to content

Commit aa60649

Browse files
matrulda, mahesh-panchal, b97pla, alvaannett
authored
v1.1.0: rRNA plots and updated nextflow version (#45)
* Update DSL2 enable
* Update minimum Nextflow version
* Uppercase workflow and process names according to DSL2 convention
* Remove unsupported publish directive
* Add channel element structure comments
* Add comment
* Fix lowercase process names
* Fix unqualified input value declaration
* Plot rRNA statistics from FastQScreen output (#44)
* extract rRNA numbers from fastq_screen and plot with MultiQC
* fix syntax
* custom rrna plots into main multiqc config
* refactored rrna extraction
* refactored combining rrna data
* rename channel
* added explanatory comments
* Update nextflow.config update version to 1.1.0-rc1
* Bump version to 1.1.0 to make production release

Co-authored-by: Mahesh Binzer-Panchal <[email protected]>
Co-authored-by: Pontus Larsson <[email protected]>
Co-authored-by: alvaannett <[email protected]>
1 parent 2cbbeba commit aa60649

File tree

6 files changed

+209
-58
lines changed

6 files changed

+209
-58
lines changed

config/compute_resources.config

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
process {
2-
withName: 'fastq_screen' {
2+
withName: 'FASTQ_SCREEN' {
33
memory = '4G'
44
}
5-
withName: 'get_QC_thresholds' {
5+
withName: 'GET_QC_THRESHOLDS' {
66
errorStrategy = 'ignore'
77
}
88
withLabel: 'high_memory' {

config/multiqc_flowcell_config.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,3 +17,4 @@ top_modules:
1717
- 'custom_content'
1818
- 'bcl2fastq'
1919
- 'interop'
20+

config/multiqc_main_config.yaml

Lines changed: 128 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,131 @@ custom_plot_config:
2323

2424
remove_sections:
2525
- 'fastqc_sequence_counts'
26+
27+
custom_data:
28+
rrna_plot:
29+
id: "rrna_plot"
30+
section_name: "Ribosomal RNA - plot"
31+
parent_id: "rrna"
32+
parent_name: "Ribosomal RNA"
33+
title: "rRNA mapping statistics extracted from FastQScreen output"
34+
description: "shows the FastQScreen mapping statistics for the rRNA genome. The statistics have been extracted from the full FastQScreen output shown elsewhere in this report in order to highlight the rRNA contents."
35+
file_format: "tsv"
36+
plot_type: "bargraph"
37+
categories:
38+
- "#Unmapped"
39+
- "#One_hit_one_genome"
40+
- "#Multiple_hits_one_genome"
41+
- "#One_hit_multiple_genomes"
42+
- "Multiple_hits_multiple_genomes"
43+
pconfig:
44+
hide_zero_cats: False
45+
cpswitch_c_active: False
46+
title: "Reads mapped to rRNA genome"
47+
rrna_table:
48+
id: "rrna_table"
49+
parent_id: "rrna"
50+
parent_name: "Ribosomal RNA"
51+
section_name: "Ribosomal RNA - table"
52+
file_format: "tsv"
53+
plot_type: "table"
54+
description: "shows the FastQScreen mapping statistics for the rRNA genome. The statistics have been extracted from the full FastQScreen output shown elsewhere in this report in order to highlight the rRNA contents."
55+
pconfig:
56+
sortRows: True
57+
table_title: "rRNA mapping statistics extracted from FastQScreen output"
58+
headers:
59+
"Genome":
60+
title: 'Genome'
61+
description: screened genome
62+
hidden: True
63+
"#Reads_processed":
64+
namespace: 'rRNA number'
65+
title: 'Reads_processed'
66+
format: '{:,.0f}'
67+
description: number of sampled reads for the screen
68+
"#Unmapped":
69+
title: 'Unmapped'
70+
namespace: 'rRNA number'
71+
hidden: True
72+
format: '{:,.0f}'
73+
description: reads with no hits in any of the screened genomes
74+
"%Unmapped":
75+
namespace: 'rRNA percentage'
76+
title: 'Unmapped'
77+
suffix: '%'
78+
max: 100
79+
min: 0
80+
ceiling: 100
81+
floor: 0
82+
scale: 'RdYlGn'
83+
description: reads with no hits in any of the screened genomes
84+
"#One_hit_one_genome":
85+
namespace: 'rRNA number'
86+
title: 'One_hit_one_genome'
87+
hidden: True
88+
format: '{:,.0f}'
89+
description: reads with a unique hit only in the specified genome
90+
"%One_hit_one_genome":
91+
namespace: 'rRNA percentage'
92+
title: 'One_hit_one_genome'
93+
suffix: '%'
94+
max: 100
95+
min: 0
96+
ceiling: 100
97+
floor: 0
98+
scale: 'Reds'
99+
description: reads with a unique hit only in the specified genome
100+
"#Multiple_hits_one_genome":
101+
namespace: 'rRNA number'
102+
title: 'Multiple_hits_one_genome'
103+
hidden: True
104+
format: '{:,.0f}'
105+
description: reads with multiple hits only in the specified genome
106+
"%Multiple_hits_one_genome":
107+
namespace: 'rRNA percentage'
108+
title: 'Multiple_hits_one_genome'
109+
suffix: '%'
110+
max: 100
111+
min: 0
112+
ceiling: 100
113+
floor: 0
114+
scale: 'Reds'
115+
description: reads with multiple hits only in the specified genome
116+
"#One_hit_multiple_genomes":
117+
namespace: 'rRNA number'
118+
title: 'One_hit_multiple_genomes'
119+
hidden: True
120+
format: '{:,.0f}'
121+
description: reads with a unique hit in multiple screened genomes
122+
"%One_hit_multiple_genomes":
123+
namespace: 'rRNA percentage'
124+
title: 'One_hit_multiple_genomes'
125+
suffix: '%'
126+
max: 100
127+
min: 0
128+
ceiling: 100
129+
floor: 0
130+
scale: 'Reds'
131+
description: reads with a unique hit in multiple screened genomes
132+
"#Multiple_hits_multiple_genomes":
133+
namespace: 'rRNA number'
134+
title: 'Multiple_hits_multiple_genomes'
135+
hidden: True
136+
format: '{:,.0f}'
137+
description: reads with multiple hits in multiple screened genomes
138+
"%Multiple_hits_multiple_genomes":
139+
namespace: 'rRNA percentage'
140+
title: 'Multiple_hits_multiple_genomes'
141+
suffix: '%'
142+
max: 100
143+
min: 0
144+
ceiling: 100
145+
floor: 0
146+
scale: 'Reds'
147+
description: reads with multiple hits in multiple screened genomes
148+
149+
sp:
150+
rrna_table:
151+
fn: "rrna_table.tsv"
152+
rrna_plot:
153+
fn: "rrna_plot.tsv"

config/multiqc_project_config.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,3 @@ table_columns_visible:
1111

1212
top_modules:
1313
- 'custom_content'
14-

main.nf

Lines changed: 67 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
#! /usr/bin/env nextflow
22

3-
nextflow.preview.dsl=2
3+
nextflow.enable.dsl=2
44
/* ####################################################
55
66
seqreports: SNP & SEQ Run folder QC pipeline
@@ -80,11 +80,7 @@ workflow {
8080
Channel.fromPath(params.run_folder,checkIfExists:true)
8181
.ifEmpty { "Error: No run folder (--run_folder) given."; exit 1 }
8282
.set {run_folder}
83-
check_run_quality(run_folder)
84-
85-
publish:
86-
check_run_quality.out.projectqc to: "${params.result_dir}/projects", mode: 'copy', overwrite: true
87-
check_run_quality.out.flowcellqc to: "${params.result_dir}/flowcell_report", mode: 'copy', overwrite: true
83+
CHECK_RUN_QUALITY(run_folder)
8884

8985
}
9086

@@ -106,13 +102,20 @@ def get_project_and_reads(run_folder) {
106102

107103
}
108104

109-
def combine_results_by_project (fastqc_results,fastq_screen_results) {
105+
def combine_results_by_project (fastqc_results,fastq_screen_results,rrna_results) {
106+
// fastqc_results // [Project, [fqcfiles1, fqcfiles2, fqcfiles3]]
107+
// fastq_screen_results // [Project, [fqsfiles1, fqsfiles2, fqsfiles3]]
108+
// rrna_results // [Project, [rrnafiles1, rrnafiles2, rrnafiles3]]
110109

111-
fastqc_results.mix(fastq_screen_results).groupTuple().map { it -> tuple(it[0],it[1][0].flatten(),it[1][1].flatten()) }
110+
fastqc_results.join(fastq_screen_results)
111+
.join(
112+
rrna_results.collectFile(keepHeader:true,skip:1,sort:true) { it -> ["${it[0]}_rrna_table.tsv", it[1]] }
113+
.map { it -> tuple((it.name - ~/_rrna_table.tsv/), [it]) })
114+
// [Project, [fqcfiles1,fqcfiles2,fqcfiles3],[fqsfiles1,fqsfiles2,fqsfiles3],[Project_rrna_table.tsv]]
112115

113116
}
114117

115-
workflow check_run_quality {
118+
workflow CHECK_RUN_QUALITY {
116119

117120
/* Workflow Graph
118121
@@ -128,46 +131,46 @@ workflow check_run_quality {
128131
run_folder
129132

130133
main:
131-
interop_summary(run_folder)
132-
get_QC_thresholds(run_folder)
133-
get_metadata(run_folder)
134+
INTEROP_SUMMARY(run_folder)
135+
GET_QC_THRESHOLDS(run_folder)
136+
GET_METADATA(run_folder)
134137
project_and_reads = get_project_and_reads(params.run_folder)
135-
fastqc(project_and_reads)
136-
fastq_screen(project_and_reads,
138+
FASTQC(project_and_reads)
139+
FASTQ_SCREEN(project_and_reads,
137140
params.config_dir,
138141
params.fastqscreen_databases)
139-
multiqc_per_flowcell( params.run_folder,
140-
fastqc.out.map{ it[1] }.collect(),
141-
fastq_screen.out.map{ it[1] }.collect(),
142-
interop_summary.out.collect(),
143-
get_QC_thresholds.out.collect().ifEmpty([]),
144-
get_metadata.out.collect(),
142+
MULTIQC_PER_FLOWCELL( params.run_folder,
143+
FASTQC.out.map{ it[1] }.collect(),
144+
FASTQ_SCREEN.out.results.map{ it[1] }.collect(),
145+
FASTQ_SCREEN.out.tsv.map{ it[1] }.collectFile(keepHeader:true,skip:1,sort:true),
146+
INTEROP_SUMMARY.out.collect(),
147+
GET_QC_THRESHOLDS.out.collect().ifEmpty([]),
148+
GET_METADATA.out.collect(),
145149
Channel.fromPath("${params.run_folder}/${params.bcl2fastq_outdir}/Stats/Stats.json").collect().ifEmpty([]),
146150
params.assets_dir,
147151
params.config_dir)
148-
multiqc_per_project( params.run_folder,
149-
combine_results_by_project(fastqc.out.groupTuple(),fastq_screen.out.groupTuple()),
150-
get_metadata.out.collect(),
152+
MULTIQC_PER_PROJECT( params.run_folder,
153+
combine_results_by_project(
154+
FASTQC.out.groupTuple(),
155+
FASTQ_SCREEN.out.results.groupTuple(),
156+
FASTQ_SCREEN.out.tsv),
157+
GET_METADATA.out.collect(),
151158
params.assets_dir,
152159
params.config_dir)
153-
154-
emit:
155-
flowcellqc = multiqc_per_flowcell.out
156-
projectqc = multiqc_per_project.out
157-
158160
}
159161

160162
// ---------------------------------------------------
161163
// Processes
162164
// ---------------------------------------------------
163165

164-
process fastqc {
166+
167+
process FASTQC {
165168

166169
input:
167-
tuple project, path(fastq_file)
170+
tuple val(project), path(fastq_file)
168171

169172
output:
170-
tuple project, path("*_results")
173+
tuple val(project), path("*_results")
171174

172175
script:
173176
"""
@@ -176,17 +179,20 @@ process fastqc {
176179
"""
177180
}
178181

179-
process fastq_screen {
182+
process FASTQ_SCREEN {
180183

181184
input:
182-
tuple project, path(fastq_file)
185+
tuple val(project), path(fastq_file)
183186
path config_dir
184187
path fastqscreen_databases
185188

186189
output:
187-
tuple project, path("*_results")
190+
tuple val(project), path("*_results"), emit: results
191+
tuple val(project), path("rrna.tsv"), emit: tsv
188192

189193
script:
194+
outdir = fastq_file + "_fastq_screen_results"
195+
sample_name = (fastq_file.name =~ /^(.*_S\d+_L\d{3}_R\d+).*/)[0][1]
190196
"""
191197
sed -E 's/^(THREADS[[:blank:]]+)[[:digit:]]+/\1${task.cpus}/' \\
192198
${config_dir}/fastq_screen.conf > fastq_screen.conf
@@ -195,12 +201,18 @@ process fastq_screen {
195201
elif [ "${fastqscreen_databases}" != "${fastqscreen_default_databases}" ]; then
196202
sed -i 's#${fastqscreen_default_databases}#${fastqscreen_databases}#' fastq_screen.conf
197203
fi
198-
mkdir -p $fastq_file"_fastq_screen_results"
199-
fastq_screen --conf fastq_screen.conf --outdir $fastq_file"_fastq_screen_results" $fastq_file
204+
mkdir -p $outdir
205+
fastq_screen --conf fastq_screen.conf --outdir $outdir $fastq_file
206+
207+
# extract rRNA numbers for custom plotting with MultiQC
208+
printf \"Sample\\t\" > rrna.tsv
209+
grep -e '^Genome' -m1 -h $outdir/*_screen.txt >> rrna.tsv
210+
printf \"$sample_name\\t\" >> rrna.tsv
211+
grep -e '^rRNA' -h $outdir/*_screen.txt >> rrna.tsv
200212
"""
201213
}
202214

203-
process get_QC_thresholds {
215+
process GET_QC_THRESHOLDS {
204216

205217
input:
206218
path runfolder
@@ -220,7 +232,7 @@ process get_QC_thresholds {
220232
"""
221233
}
222234

223-
process get_metadata {
235+
process GET_METADATA {
224236

225237
input:
226238
path runfolder
@@ -240,7 +252,7 @@ process get_metadata {
240252
"""
241253
}
242254

243-
process interop_summary {
255+
process INTEROP_SUMMARY {
244256

245257
input:
246258
path runfolder
@@ -254,26 +266,32 @@ process interop_summary {
254266
"""
255267
}
256268

257-
process multiqc_per_flowcell {
269+
process MULTIQC_PER_FLOWCELL {
270+
271+
publishDir "${params.result_dir}/flowcell_report", mode: 'copy', overwrite: true
258272
label 'high_memory'
259273

260274
input:
261275
val runfolder_name // Run folder name
262276
path ('FastQC/*') // Fastqc logs
263277
path ('FastqScreen/*') // Fastq screen logs
278+
path ('rRNA/rrna_table.tsv') // Extracted rRNA values
264279
path ('Interop_summary/*') // Interop log
265280
path qc_thresholds // Quality check thresholds (optional)
266281
path sequencing_metadata // Sequencing meta data ( custom content data )
267282
path bcl2fastq_stats // Bcl2Fastq logs
268283
path assets // Staged copy of assets folder
269-
path config_dir
284+
path config_dir // Staged copy of config folder
270285

271286
output:
272287
tuple path("*multiqc_report.html"), path("*_data.zip")
273288

274289
script:
275290
threshold_parameter = qc_thresholds ? "-c ${qc_thresholds}" : ""
276291
"""
292+
# Copy the rRNA table to a second file used for plotting, since MultiQC custom content allows only one plot per section,
293+
# as described here: https://multiqc.info/docs/#introduction-1
294+
cp rRNA/rrna_table.tsv rRNA/rrna_plot.tsv
277295
RUNFOLDER=\$( basename ${runfolder_name} )
278296
multiqc \\
279297
--title "Flowcell report for \${RUNFOLDER}" \\
@@ -286,21 +304,26 @@ process multiqc_per_flowcell {
286304

287305
}
288306

289-
process multiqc_per_project {
307+
process MULTIQC_PER_PROJECT {
308+
309+
publishDir "${params.result_dir}/projects", mode: 'copy', overwrite: true
290310
label 'high_memory'
291311

292312
input:
293313
val runfolder_name
294-
tuple project, path("FastQC/*"), path("FastqScreen/*")
314+
tuple val(project), path("FastQC/*"), path("FastqScreen/*"), path("rRNA/rrna_table.tsv")
295315
path sequencing_metadata
296316
path assets // Staged copy of assets folder
297-
path config_dir
317+
path config_dir // Staged copy of config folder
298318

299319
output:
300320
tuple path("${project}/*multiqc_report.html"), path("${project}/*_data.zip")
301321

302322
script:
303323
"""
324+
# Copy the rRNA table to a second file used for plotting, since MultiQC custom content allows only one plot per section,
325+
# as described here: https://multiqc.info/docs/#introduction-1
326+
cp rRNA/rrna_table.tsv rRNA/rrna_plot.tsv
304327
RUNFOLDER=\$( basename ${runfolder_name} )
305328
multiqc \\
306329
--title "Report for project ${project} on runfolder \${RUNFOLDER}" \\

0 commit comments

Comments
 (0)