Skip to content

Commit 031528d

Browse files
authored
Merge pull request #19 from nkongenelly/DATAOPS-1178_use_checkqc_handler
Using checkqc illumina parser for bclconvert
2 parents 18e7feb + 0c61041 commit 031528d

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

50 files changed

+21466
-39
lines changed

projman_filler/app.py

Lines changed: 105 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,13 @@
1111
from projman_filler.models.db_models import FlowcellRunfolder
1212
from projman_filler.bcl2fastq_run_stats_parser import Bcl2fastqRunStatsParser
1313
from projman_filler.interop_run_stats_parser import InteropRunStatsParser
14+
from projman_filler.qc_data_parser import QCDataParser
1415

1516
from projman_filler.repositories.sample_results_repo import SampleResultRepo
1617
from projman_filler.repositories.flowcell_lane_results_repo import FlowcellLaneResultsRepo
1718
from projman_filler.repositories.flowcell_runfolder_repo import FlowcellRunfolderRepo
18-
19+
from checkQC.qc_data import QCData
20+
from checkQC.config import ConfigFactory
1921

2022
class App(object):
2123

@@ -43,8 +45,8 @@ def delete_existing_flowcell_from_db(self, flowcell_name, force):
4345
return
4446

4547
if force:
46-
print("Found the specified runfolder in the db, but got a force option, so will proceed to "
47-
"delete it and insert new values.")
48+
print("Found the specified runfolder in the db, but got a force "
49+
"option, so will proceed to delete it and insert new values.")
4850
self.flowcell_lane_results_repo.delete_by_flowcell_name(flowcell_name)
4951
self.flowcell_runfolder_repo.delete_by_flowcell_name(flowcell_name)
5052
self.sample_results_repo.delete_by_flowcell_name(flowcell_name)
@@ -59,50 +61,131 @@ def insert_flowcell_runfolder_into_db(self, runfolder, flowcell_name):
5961
run_date=runfolder_date)
6062
self.flowcell_runfolder_repo.add(flowcell_runfolder)
6163

62-
def insert_runfolder_into_db(self, runfolder, bcl2fastq_stats_dir,
                             demultiplexer="bcl2fastq", ch_config_path=None,
                             force=False, atac_seq_mode=False,
                             olink_mode=False):
    """
    Inserts runfolder data into the database based on the specified
    demultiplexer and mode.

    Olink mode takes precedence over the demultiplexer choice: when it is
    set, only InterOp-based lane statistics are stored and the call is
    delegated entirely to `insert_olink_runfolder_into_db`.

    :param runfolder (str): Path to the runfolder directory.
    :param bcl2fastq_stats_dir (str): Subdirectory containing bcl2fastq statistics.
    :param demultiplexer (str, optional): Demultiplexer used (e.g 'bcl2fastq' or
        'bclconvert'). Defaults to 'bcl2fastq'.
    :param ch_config_path (Path, optional): Path to the checkQC configuration
        file. Only used when the demultiplexer is not 'bcl2fastq'.
    :param force (bool, optional): If True, existing flowcell data will be overwritten.
    :param atac_seq_mode (bool, optional): If True, enables ATAC-seq specific processing.
    :param olink_mode (bool, optional): If True, enables Olink-specific processing.
    """
    # Olink must be checked first: 'bcl2fastq' is the default demultiplexer,
    # so testing the demultiplexer first would make this branch unreachable
    # for callers that only pass olink_mode.
    if olink_mode:
        print("Olink mode activated. Will read lane-level statistics from "
              "InterOp files instead of bcl2fastq Stats.json.")
        return self.insert_olink_runfolder_into_db(runfolder, force)

    if demultiplexer == "bcl2fastq":
        flowcell_name = self._handle_bcl2fastq(
            runfolder, bcl2fastq_stats_dir, force, atac_seq_mode
        )
    else:
        flowcell_name = self._handle_other_demultiplexer(
            runfolder, demultiplexer, ch_config_path, force
        )

    self.insert_flowcell_runfolder_into_db(runfolder, flowcell_name)
95+
96+
def _handle_bcl2fastq(self, runfolder, stats_dir, force, atac_seq_mode):
    """
    Processes a runfolder that was demultiplexed with bcl2fastq.

    Lane-level and sample-level results are added to their repositories
    before the flowcell name is returned.

    :param runfolder (str): Path to the runfolder directory.
    :param stats_dir (str): Subdirectory containing bcl2fastq statistics.
    :param force (bool): If True, existing flowcell data will be overwritten.
    :param atac_seq_mode (bool): If True, enables ATAC-seq specific processing.

    :return flowcell_name (str): The flowcell name extracted from the bcl2fastq
        statistics (after saving flowcell lane and sample data in DB).
    """
    stats_parser = Bcl2fastqRunStatsParser(os.path.join(runfolder, stats_dir))

    flowcell_name = stats_parser.get_flowcell_name()
    reads_and_cycles = stats_parser.get_reads_and_cycles()
    conversion_results = stats_parser.get_conversion_results()

    # Check if flowcell exists and should be overridden
    self.delete_existing_flowcell_from_db(flowcell_name, force)

    # For ATAC-seq, bcl2fastq is run with special parameters declaring that
    # the second index should be interpreted as a non-index read, so the
    # InterOp list of non-index reads may be overridden with a custom list
    # obtained from the bcl2fastq stats.
    non_index_reads = None
    if atac_seq_mode:
        print("ATAC-seq mode activated. Will re-map read numbers according "
              "to settings used by bcl2fastq.")
        non_index_reads = stats_parser.get_non_index_reads()

    interop_parser = InteropRunStatsParser(runfolder, non_index_reads)
    lane_statistics = calculate_lane_statistics(
        interop_parser, flowcell_name, conversion_results
    )
    self.flowcell_lane_results_repo.add(list(lane_statistics))

    samplesheet_path = os.path.join(runfolder, "SampleSheet.csv")
    sample_statistics = calculate_sample_statistics(
        flowcell_name,
        conversion_results,
        reads_and_cycles,
        Samplesheet(samplesheet_path),
    )
    self.sample_results_repo.add(list(sample_statistics))

    return flowcell_name
96136

97137
def insert_olink_runfolder_into_db(self, runfolder, force=False):
    """
    Inserts runfolder data into the database using Olink-specific processing.

    Only lane-level results, read from the InterOp files, and the flowcell
    runfolder entry are stored; no sample-level results are added here.

    :param runfolder (str): Path to the runfolder directory.
    :param force (bool, optional): If True, existing flowcell data will be
        overwritten. Defaults to False.
    """
    interop_parser = InteropRunStatsParser(runfolder)
    flowcell_name = interop_parser.get_flowcell_name()

    # Check if flowcell exists and should be overridden
    self.delete_existing_flowcell_from_db(flowcell_name, force)

    lane_statistics = calculate_lane_statistics(
        interop_parser,
        flowcell_name,
        interop_parser.get_conversion_results(),
    )

    self.flowcell_lane_results_repo.add(list(lane_statistics))
    self.insert_flowcell_runfolder_into_db(runfolder, flowcell_name)
158+
159+
def _handle_other_demultiplexer(self, runfolder, demultiplexer, ch_config_path,
                                force):
    """
    Handles runfolder processing for demultiplexers other than bcl2fastq.

    The QC data is built through the `from_<demultiplexer>` constructor of
    checkQC's `QCData`, using the matching entry under
    `parser_configurations` in the checkQC configuration (an empty dict
    when no such entry exists).

    :param runfolder (str): Path to the runfolder directory.
    :param demultiplexer (str): Demultiplexer used (e.g., 'bclconvert').
    :param ch_config_path (Path): Path to the checkQC configuration file
    :param force (bool): If True, existing flowcell data will be overwritten.

    :return flowcell_name (str): The flowcell name extracted from the runfolder
        (after saving flowcell lane and sample data in DB).
    :raises ValueError: If `QCData` has no `from_<demultiplexer>` constructor.
    """
    parser_name = f"from_{demultiplexer}"

    # Fail early with a readable message instead of a bare AttributeError
    # when the demultiplexer has no matching QCData constructor.
    build_qc_data = getattr(QCData, parser_name, None)
    if build_qc_data is None:
        raise ValueError(
            f"Unsupported demultiplexer '{demultiplexer}': "
            f"QCData has no '{parser_name}' constructor."
        )

    # NOTE(review): `_config` is a private attribute of the object returned
    # by ConfigFactory.from_config_path - confirm there is no public accessor.
    checkqc_conf = ConfigFactory.from_config_path(ch_config_path)._config
    parser_config = (
        checkqc_conf
        .get("parser_configurations", {})
        .get(parser_name, {})
    )
    qc_data = build_qc_data(
        runfolder_path=runfolder,
        parser_config=parser_config,
    )

    qc_data_parser = QCDataParser(qc_data, runfolder)
    flowcell_name = qc_data_parser.flowcell_id
    self.delete_existing_flowcell_from_db(flowcell_name, force)

    lane_results = qc_data_parser._build_lane_results()
    sample_results = qc_data_parser._build_sample_results()
    self.flowcell_lane_results_repo.add(lane_results)
    self.sample_results_repo.add(sample_results)

    return flowcell_name

projman_filler/cli.py

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,15 +18,30 @@
1818
@click.option('--debug', is_flag=True)
1919
@click.option('--atac-seq-mode', is_flag=True)
2020
@click.option('--olink-mode', is_flag=True)
21-
@click.option('-b', '--bcl2fastq-stats', default="Unaligned/Stats", type=click.Path())
21+
@click.option('-b', '--bcl2fastq-stats', default="Unaligned/Stats",
22+
type=click.Path()
23+
)
24+
@click.option('-d', '--demultiplexer', default ="bcl2fastq",
25+
type=click.Choice(['bcl2fastq', 'bclconvert'])
26+
)
27+
@click.option(
28+
"--config",
29+
default="/etc/arteria/checkqc_config/checkqc.config",
30+
type=click.Path(exists=True, dir_okay=False),
31+
help="Path to the checkQC configuration file",
32+
)
2233
@click.argument('runfolder', type=click.Path())
23-
def main(runfolder, force, atac_seq_mode, olink_mode, bcl2fastq_stats, debug):
34+
def main(runfolder, force, atac_seq_mode, olink_mode, bcl2fastq_stats,
35+
demultiplexer, config, debug):
2436
"""Console script for projman_filler."""
2537
print("projman_filler v{}".format(projman_filler_version))
2638
try:
2739
db_connection_string = os.environ["PROJMAN_DB"]
2840
app = App(db_connection_string, debug)
29-
app.insert_runfolder_into_db(runfolder, bcl2fastq_stats, force=force, atac_seq_mode=atac_seq_mode, olink_mode=olink_mode)
41+
app.insert_runfolder_into_db(
42+
runfolder, bcl2fastq_stats, demultiplexer, config, force=force,
43+
atac_seq_mode=atac_seq_mode, olink_mode=olink_mode
44+
)
3045
except FlowcellAlreadyInDb:
3146
print("Flowcell was already present in db.")
3247
sys.exit(1)

projman_filler/interop_run_stats_parser.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -128,5 +128,4 @@ def _get_conversion_results(self) -> list:
128128

129129
lanes.append(Lane(l, total_clusters_raw, total_clusters_pf))
130130
return lanes
131-
132-
131+

projman_filler/qc_data_parser.py

Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
import pandas as pd
2+
from math import isnan
3+
4+
from checkQC.parsers.illumina import _read_interop_summary
5+
from projman_filler.models.db_models import FlowcellLaneResult, SampleResult
6+
7+
8+
class QCDataParser:
    """
    Builds database result objects (`FlowcellLaneResult`, `SampleResult`)
    from a checkQC QC data object and the InterOp summary of a runfolder.
    """

    def __init__(self, qc_data, runfolder):
        """
        Initializes the QCDataParser with QC data.

        :param qc_data: QC data object from checkQC parser. Its `samplesheet`
            and `sequencing_metrics` attributes are read by this class.
        :param runfolder: Path to the sequencing run folder.
        """
        # The index summary returned by checkQC is not needed here.
        self.run_summary, _index_summary, run_info = \
            _read_interop_summary(runfolder)
        self.flowcell_id = run_info.flowcell_id()
        self.samplesheet_df = pd.DataFrame(qc_data.samplesheet)
        self.qc_data = qc_data

    @staticmethod
    def _nan_to_none(value):
        """
        Returns None when `value` is NaN, otherwise `value` unchanged.

        Falsy values (None, 0) are returned as they are.
        """
        return None if value and isnan(value) else value

    @staticmethod
    def _iter_non_index_reads(reads):
        """
        Yields `(read_no, number, read_data)` for every non-index read.

        :param reads (dict): Mapping of read number to read data; each read
            data entry has an "is_index" key.
        :return: Generator where `read_no` is a 1-based counter over the
            non-index reads only and `number` is the original key in `reads`.
        """
        read_no = 0
        for number, read_data in reads.items():
            if read_data["is_index"]:
                continue
            read_no += 1  # counts only non-index reads
            yield read_no, number, read_data

    @staticmethod
    def _first_value(sample_row, column):
        """
        Returns the value of `column` in the first row of `sample_row` as a
        stripped string, or None when there is no row or the cell is
        missing (None/NaN) or blank.

        Reading the scalar directly avoids `Series.to_string`, which renders
        a missing cell as the text "NaN" and an empty selection as the
        text "Series([], )".
        """
        if sample_row.empty:
            return None
        value = sample_row[column].iloc[0]
        if pd.isna(value):
            return None
        text = str(value).strip()
        return text or None

    def _total_cycles(self, number):
        """
        Returns the total number of cycles of read `number` according to the
        InterOp run summary (`number` is 1-based, the summary 0-based).
        """
        return self.run_summary.at(number - 1).read().total_cycles()

    def _build_lane_results(self):
        """
        Constructs lane-level statistics from checkQC sequencing metrics results
        from illumina parser.

        :return results (List[FlowcellLaneResult]): Lane-level statistics for
            non-index reads.
        """
        results = []
        for lane_no, lane_data in self.qc_data.sequencing_metrics.items():
            non_index_reads = self._iter_non_index_reads(lane_data["reads"])
            for read_no, number, read_data in non_index_reads:
                results.append(FlowcellLaneResult(
                    flowcell_id=self.flowcell_id,
                    lane_num=lane_no,
                    read_num=read_no,
                    raw_density=lane_data["raw_density"],
                    pf_density=lane_data["pf_density"],
                    error_rate=self._nan_to_none(read_data["mean_error_rate"]),
                    pf_clusters=lane_data["total_reads_pf"],
                    raw_clusters=lane_data["total_reads"],
                    cycles=self._total_cycles(number),
                    # NOTE(review): lane-level Q30 is divided by 100 while
                    # the sample-level value is stored as-is - confirm the
                    # scale expected by each table.
                    pct_q30=read_data["percent_q30"] / 100,
                ))
        return results

    def _build_sample_results(self):
        """
        Constructs sample-level statistics by combining sequencing metrics
        and sample sheet data from the checkQC parser.

        One result is produced per sample and non-index read. Sample sheet
        fields that cannot be found for a sample are stored as None.

        :return results (List[SampleResult]): Sample-level statistics including
            quality scores, indexing accuracy, and library metadata.
        """
        results = []
        for lane_no, lane_data in self.qc_data.sequencing_metrics.items():
            # Read numbering and cycles are the same for every sample in the
            # lane, so resolve them once per lane.
            reads_and_cycles = [
                (read_no, self._total_cycles(number))
                for read_no, number, _ in
                self._iter_non_index_reads(lane_data["reads"])
            ]
            no_of_reads = len(reads_and_cycles)

            for sample_data in lane_data["reads_per_sample"]:
                sample_id = sample_data["sample_id"]
                sample_row = self.samplesheet_df[
                    (self.samplesheet_df['lane'] == lane_no) &
                    (self.samplesheet_df['sample_id'] == sample_id)
                ]

                description = self._first_value(sample_row, 'custom_description')
                library_name = (
                    description.split("LIBRARY_NAME:")[-1].strip()
                    if description else None
                )
                index1 = self._first_value(sample_row, 'index')
                index2 = self._first_value(sample_row, 'index2')
                sample_project = self._first_value(sample_row, 'sample_project')

                for read_no, cycles in reads_and_cycles:
                    results.append(SampleResult(
                        flowcell_id=self.flowcell_id,
                        project_id=sample_project,
                        # Drops everything up to the first underscore of
                        # the sample id.
                        sample_name="_".join(sample_id.split("_")[1:]),
                        tag_seq=f"{index1}-{index2}" if index2 else index1,
                        lane_num=lane_no,
                        read_num=read_no,
                        cycles=cycles,
                        pct_lane=sample_data["percent_of_lane"],
                        # The sample's cluster count is split evenly over
                        # the non-index reads.
                        pf_clusters=float(
                            sample_data["cluster_count"] / no_of_reads
                        ),
                        pct_q30=sample_data["percent_q30"],
                        pct_tag_err=100 - sample_data["percent_perfect_index_reads"],
                        library_name=library_name,
                        # NOTE(review): the source key is "mean_q30" -
                        # confirm it holds the mean quality score.
                        mean_q=sample_data["mean_q30"],
                    ))
        return results

setup.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,10 @@
1616
'SQLAlchemy~=2.0.40',
1717
'pymssql~=2.3.4',
1818
'pandas~=2.2.3',
19-
'numpy~=2.2.4'
19+
'numpy~=2.2.4',
20+
# CheckQC pinned installation to be changed once the correct checkQC version is published to PyPI
21+
'checkQC@git+https://github.com/Molmed/checkQC.git@master#egg=checkQC',
22+
2023
]
2124

2225
setup_requirements = [
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
[Header],,,,,
2+
Date,6/24/2020,,,,
3+
Application,Illumina DRAGEN COVIDSeq Test Pipeline,,,,
4+
Instrument Type,NovaSeq6000,,,,
5+
Assay,Illumina COVIDSeq Test,,,,
6+
Index Adapters,IDT-ILMN DNA-RNA UDP Indexes ,,,,
7+
Chemistry,Amplicon,,,,
8+
,,,,,
9+
[Settings],,,,,,,
10+
,,,,,
11+
[Data],,,,,
12+
Lane,Sample_ID,Sample_Name,Sample_Project,Index_ID,index,index2,Description
13+
1,Sample_AB-1234-14574-Qiagen-IndexSet1-SP-Lane1,AB-1234-14574-Qiagen-IndexSet1-SP-Lane1,AB-1234,UDP0001,GAACTGAGCG,TCGTGGAGCG,LIBRARY_NAME:test
14+
2,Sample_AB-1234-14574-Qiagen-IndexSet1-SP-Lane2,AB-1234-14574-Qiagen-IndexSet1-SP-Lane2,AB-1234,UDP0001,GAACTGAGCG,TCGTGGAGCG,LIBRARY_NAME:test

0 commit comments

Comments
 (0)