Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions .github/PULL_REQUEST_TEMPLATE.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
**What problems does this PR solve?**
Provide a short description or reference to the relevant issue, explaining what problems this PR solves.

**An outline of the validation procedure for this feature**
In addition to automatic tests, has any manual testing been carried out?

**Risk analysis - Reasons for careful code review**
If any of the boxes below are checked, extra careful code review should be initiated.

- [ ] This PR contains code that could remove data
67 changes: 44 additions & 23 deletions bin/get_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import xmltodict
from collections import OrderedDict
import re
import glob
import csv
import argparse
import os
import json
Expand All @@ -12,11 +14,12 @@


class RunfolderInfo:
def __init__(self, runfolder, bcl2fastq_outdir):
def __init__(self, runfolder, demultiplexer_outdir, demultiplexer):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Given my other comments, I don't think we need the demultiplexer parameters in this class? Please double check.

self.runfolder = runfolder
self.demultiplexer = demultiplexer
self.run_info = self.read_run_info()
self.run_parameters = self.read_run_parameters()
self.stats_json = self.read_stats_json(bcl2fastq_outdir)
self.stats_json = self.read_stats_json(demultiplexer_outdir, demultiplexer)
self.description_and_identifier = OrderedDict()
self.run_parameters_tags = {
"RunId": "Run ID",
Expand Down Expand Up @@ -79,27 +82,38 @@ def find_flowcell_type_novaseqx(self):
return None
return {"Flowcell type": flowcell_type}

def read_stats_json(self, bcl2fastq_outdir):
stats_json_path = os.path.join(
self.runfolder, bcl2fastq_outdir, "Stats/Stats.json"
)
def read_stats_json(self, demultiplexer_outdir, demultiplexer):
stats_path = "Reports" if demultiplexer == "bclconvert" else "Stats/Stats.json"
stats_json_path = os.path.join(self.runfolder, demultiplexer_outdir, stats_path)
if os.path.exists(stats_json_path):
with open(stats_json_path) as f:
return json.load(f)
if demultiplexer == "bclconvert":
# Bclconvert produces multiple statistical output files
files = glob.glob(stats_json_path + "/*.csv")
bclconvert_data = {}
for file in files:
with open(file) as csvfile:
reader = csv.reader(csvfile)
file_name = re.sub(r".*/|\.csv", "", file)
bclconvert_data[file_name] = [row for row in reader]
return bclconvert_data
else:
with open(stats_json_path) as f:
return json.load(f)
else:
return None

def get_bcl2fastq_version(self, runfolder):
with open(os.path.join(runfolder, "bcl2fastq_version")) as f:
bcl2fastq_str = f.read()
return bcl2fastq_str.split("v")[1].strip()
def get_demultiplexer_version(self, runfolder):
with open(os.path.join(runfolder, f"{self.demultiplexer}_version")) as f:
demultiplexer_str = f.read()
return demultiplexer_str.split("v")[1].strip()

def get_software_version(self, runfolder):
with open(
Path(runfolder)
/ "pipeline_info"
/ "nf_core_pipeline_software_mqc_versions.yml"
) as f:
pipeline_info_filename = (
"nf_core_pipeline_software_mqc_versions.yml"
if self.demultiplexer == "bcl2fastq"
else "nf_core_demultiplex_software_mqc_versions.yml"
)
with open(Path(runfolder) / "pipeline_info" / pipeline_info_filename) as f:
return {
software: version
for software_dict in yaml.safe_load(f).values()
Expand Down Expand Up @@ -154,7 +168,7 @@ def get_demultiplexing_info(self):
try:
return {
"Demultiplexing": {
"bcl2fastq": self.get_bcl2fastq_version(self.runfolder)
self.demultiplexer: self.get_demultiplexer_version(self.runfolder)
}
}
except FileNotFoundError:
Expand All @@ -174,17 +188,24 @@ def get_demultiplexing_info(self):
"--runfolder", type=str, required=True, help="Path to runfolder"
)
parser.add_argument(
"--bcl2fastq-outdir",
"--demultiplexer",
type=str,
default="bcl2fastq",
help="Name of demultiplexer used",
)
parser.add_argument(
"--demultiplexer-outdir",
type=str,
default="Data/Intensities/BaseCalls",
help="Path to bcl2fastq output folder relative to the runfolder",
default="Unaligned",
help="Path to demultiplexer output folder relative to the runfolder",
)

args = parser.parse_args()
runfolder = args.runfolder
bcl2fastq_outdir = args.bcl2fastq_outdir
demultiplexer = args.demultiplexer
demultiplexer_outdir = args.demultiplexer_outdir

runfolder_info = RunfolderInfo(runfolder, bcl2fastq_outdir)
runfolder_info = RunfolderInfo(runfolder, demultiplexer_outdir, demultiplexer)
info = runfolder_info.get_info()

print(
Expand Down
20 changes: 20 additions & 0 deletions config/tool_config/bclconvert/multiqc_flowcell_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# Config for flowcell reports

run_modules:
- fastqc
- fastq_screen
- bclconvert
- interop
- custom_content

table_columns_visible:
FastQC:
percent_duplicates: False
percent_gc: False
total_sequences: False

top_modules:
- 'custom_content'
- 'bclconvert'
- 'interop'

61 changes: 46 additions & 15 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@ params.run_folder = "/path/to/run_folder"
params.result_dir = "results"
fastqscreen_default_databases = "FastQ_Screen_Genomes"
params.fastqscreen_databases = fastqscreen_default_databases
params.bcl2fastq_outdir = "Unaligned"
params.demultiplexer = "bcl2fastq"
params.demultiplexer_outdir = "Unaligned"

params.checkqc_config = "" // See: https://github.com/Molmed/checkQC
params.assets_dir = "$baseDir/assets"
params.config_dir = "$baseDir/config/tool_config"
Expand Down Expand Up @@ -47,11 +49,12 @@ def helpMessage() {

Optional parameters:
--result_dir Path to write results (default: results)
--bcl2fastq_outdir Folder name to check for fastq.gz files and demultiplexing stats (default: Unaligned)
--demultiplexer_outdir Folder name to check for fastq.gz files and demultiplexing stats (default: Unaligned)
--checkqc_config Configuration file for CheckQC
--assets_dir Location of project assests (default: "\$baseDir/assets").
--config_dir Location of tool configuration files (default: "\$baseDir/config/tool_config").
--script_dir Location of project scripts (default: "\$baseDir/bin")
--demultiplexer Name of demultiplexer used e.g 'bcl2fastq' or 'bclconvert'

--help Print this help message.

Expand All @@ -65,14 +68,19 @@ if (params.help || !params.run_folder){
helpMessage()
exit 0
}
if (params.help || !params.demultiplexer){
helpMessage()
exit 0
}

workflow {

main:
Channel.fromPath(params.run_folder,checkIfExists:true)
.ifEmpty { "Error: No run folder (--run_folder) given."; exit 1 }
.set {run_folder}
CHECK_RUN_QUALITY(run_folder)
Channel.value(params.demultiplexer).set {demultiplexer}
CHECK_RUN_QUALITY(run_folder, demultiplexer)

}

Expand All @@ -83,13 +91,13 @@ workflow.onComplete {
def get_project_and_reads(run_folder) {

Channel
.fromPath("${run_folder}/${params.bcl2fastq_outdir}/**.fastq.gz" )
.fromPath("${run_folder}/${params.demultiplexer_outdir}/**.fastq.gz" )
.filter( ~/.*_[^I]\d_001\.fastq\.gz$/ )
.ifEmpty { "Error: No fastq files found under ${run_folder}/ !\n"; exit 1 }
.map {
it.toString().indexOf('Undetermined') > 0 ?
['NoProject', it] :
[(it.toString() =~ /^.*\/${params.bcl2fastq_outdir}\/([^\/]+)\/.*\.fastq\.gz$/)[0][1],it]
[(it.toString() =~ /^.*\/${params.demultiplexer_outdir}\/([^\/]+)\/.*\.fastq\.gz$/)[0][1],it]
}

}
Expand Down Expand Up @@ -121,27 +129,42 @@ workflow CHECK_RUN_QUALITY {

take:
run_folder
demultiplexer

main:
if (params.demultiplexer == 'bclconvert') {
Channel.fromPath([
"${params.run_folder}/${params.demultiplexer_outdir}/Reports/*.csv",
"${params.run_folder}/RunInfo.xml"])
.collect().ifEmpty([])
.set { demux_stats }
} else {
Channel.fromPath("${params.run_folder}/${params.demultiplexer_outdir}/Stats/Stats.json")
.collect().ifEmpty([])
.set { demux_stats }
}

INTEROP_SUMMARY(run_folder)
GET_QC_THRESHOLDS(run_folder)
GET_METADATA(run_folder)
GET_METADATA(run_folder, demultiplexer)
project_and_reads = get_project_and_reads(params.run_folder)
FASTQC(project_and_reads,
params.config_dir)
FASTQ_SCREEN(project_and_reads,
params.config_dir,
params.fastqscreen_databases)
MULTIQC_PER_FLOWCELL( params.run_folder,
MULTIQC_PER_FLOWCELL(
params.run_folder,
FASTQC.out.map{ it[1] }.collect(),
FASTQ_SCREEN.out.results.map{ it[1] }.collect(),
FASTQ_SCREEN.out.tsv.map{ it[1] }.collectFile(keepHeader:true,skip:1,sort:true),
INTEROP_SUMMARY.out.collect(),
GET_QC_THRESHOLDS.out.collect().ifEmpty([]),
GET_METADATA.out.collect(),
Channel.fromPath("${params.run_folder}/${params.bcl2fastq_outdir}/Stats/Stats.json").collect().ifEmpty([]),
demux_stats,
params.assets_dir,
params.config_dir)
params.config_dir,
demultiplexer)
MULTIQC_PER_PROJECT( params.run_folder,
combine_results_by_project(
FASTQC.out.groupTuple(),
Expand Down Expand Up @@ -234,19 +257,21 @@ process GET_METADATA {

input:
path runfolder
val demultiplexer

output:
path 'sequencing_metadata_mqc.yaml'

script:
if ( params.bcl2fastq_outdir ){
bcl2fastq_outdir_section = "--bcl2fastq-outdir ${params.bcl2fastq_outdir}"
if ( params.demultiplexer_outdir ){
demultiplexer_outdir_section = "--demultiplexer-outdir ${params.demultiplexer_outdir}"
} else {
bcl2fastq_outdir_section = ""
demultiplexer_outdir_section = ""
}
"""
python ${params.script_dir}/get_metadata.py --runfolder $runfolder \\
$bcl2fastq_outdir_section &> sequencing_metadata_mqc.yaml
--demultiplexer $demultiplexer \\
$demultiplexer_outdir_section &> sequencing_metadata_mqc.yaml
"""
}

Expand Down Expand Up @@ -277,10 +302,16 @@ process MULTIQC_PER_FLOWCELL {
path ('Interop_summary/*') // Interop log
path qc_thresholds // Quality check thresholds (optional)
path sequencing_metadata // Sequencing meta data ( custom content data )
path bcl2fastq_stats // Bcl2Fastq logs
path demux_stats // demux logs
path assets // Staged copy of assets folder
path config_dir // Staged copy of config folder
val demultiplexer // Demultiplexer name

script:
// """
// echo $demux_stats
// echo demultiplexer: $demultiplexer
// """
output:
tuple path("*multiqc_report.html"), path("*_data.zip")

Expand All @@ -295,7 +326,7 @@ process MULTIQC_PER_FLOWCELL {
--title "Flowcell report for \${RUNFOLDER}" \\
--filename \${RUNFOLDER}_multiqc_report.html -z \\
-c ${config_dir}/multiqc_main_config.yaml \\
-c ${config_dir}/multiqc_flowcell_config.yaml \\
-c ${config_dir}/${demultiplexer}/multiqc_flowcell_config.yaml \\
${threshold_parameter} \\
.
"""
Expand Down
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
15 changes: 15 additions & 0 deletions test_data/230825_M04034_0043_000000000-L6NVV/RunInfo.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
<?xml version="1.0"?>
<RunInfo xmlns:xsd="http://www.w3.org/2001/XMLSchema" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Version="2">
<Run Id="230825_M04034_0043_000000000-L6NVV" Number="43">
<Flowcell>000000000-L6NVV</Flowcell>
<Instrument>M04034</Instrument>
<Date>230825</Date>
<Reads>
<Read NumCycles="151" Number="1" IsIndexedRead="N" />
<Read NumCycles="8" Number="2" IsIndexedRead="Y" />
<Read NumCycles="8" Number="3" IsIndexedRead="Y" />
<Read NumCycles="151" Number="4" IsIndexedRead="N" />
</Reads>
<FlowcellLayout LaneCount="1" SurfaceCount="2" SwathCount="1" TileCount="14" />
</Run>
</RunInfo>
Loading