Skip to content

Commit b7abf00

Browse files
authored
Merge pull request #134 from nkongenelly/DATAOPS_1178_update_checkqc_for_projman
Dataops 1178 update checkqc for projman
2 parents c68852c + 18b6e7e commit b7abf00

File tree

13 files changed

+321
-232
lines changed

13 files changed

+321
-232
lines changed

.github/workflows/unit_tests.yml

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,17 @@ on: [push, pull_request]
55
jobs:
66
build:
77
runs-on: ubuntu-22.04
8+
strategy:
9+
matrix:
10+
python-version: ['3.10', '3.11', '3.12','3.13']
11+
name: Set up Python ${{ matrix.python-version }}
812
steps:
913
- uses: actions/checkout@v4
1014

11-
- name: Set up Python 3.10
15+
- name: Set up Python ${{ matrix.python-version }}
1216
uses: actions/setup-python@v4
1317
with:
14-
python-version: '3.10'
18+
python-version: ${{ matrix.python-version }}
1519

1620
- name: Install dependencies
1721
run: |

checkQC/parsers/illumina.py

Lines changed: 48 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -12,24 +12,33 @@ def from_bclconvert(cls, runfolder_path, parser_config):
1212
runfolder_path = pathlib.Path(runfolder_path)
1313
assert runfolder_path.is_dir()
1414

15-
summary, index_summary = _read_interop_summary(runfolder_path)
16-
quality_metrics = _read_quality_metrics(
15+
summary, index_summary, run_info = _read_interop_summary(runfolder_path)
16+
quality_metrics = _read_demultiplexing_metrics(
1717
runfolder_path
1818
/ parser_config["reports_location"]
1919
/ "Quality_Metrics.csv"
2020
)
21-
top_unknown_barcodes = _read_top_unknown_barcodes(
21+
top_unknown_barcodes = _read_demultiplexing_metrics(
2222
runfolder_path
2323
/ parser_config["reports_location"]
2424
/ "Top_Unknown_Barcodes.csv"
2525
)
26+
27+
demultiplex_stats = _read_demultiplexing_metrics(
28+
runfolder_path
29+
/ parser_config["reports_location"]
30+
/ "Demultiplex_Stats.csv"
31+
)
2632
samplesheet = _read_samplesheet(runfolder_path)
2733

2834
instrument, read_length = _read_run_metadata(runfolder_path)
2935

3036
sequencing_metrics = {
3137
lane + 1: {
32-
"total_cluster_pf": summary.at(0).at(lane).reads_pf(),
38+
"total_reads_pf": summary.at(0).at(lane).reads_pf(),
39+
"total_reads": summary.at(0).at(lane).reads(),
40+
"raw_density":summary.at(0).at(lane).density().mean(),
41+
"pf_density":summary.at(0).at(lane).density_pf().mean(),
3342
"yield": sum(
3443
int(row["Yield"])
3544
for row in quality_metrics
@@ -69,6 +78,36 @@ def from_bclconvert(cls, runfolder_path, parser_config):
6978
sample_summary := index_summary.at(lane).at(sample_no)
7079
).sample_id(),
7180
"cluster_count": sample_summary.cluster_count(),
81+
"percent_of_lane": next(
82+
round(float(sample_stat["% Reads"]) * 100, 2)
83+
for sample_stat in demultiplex_stats
84+
if sample_stat["Lane"] == str(lane + 1) and
85+
sample_stat["SampleID"] == sample_summary.sample_id()
86+
),
87+
"percent_perfect_index_reads": next(
88+
round(float(sample_stat["% Perfect Index Reads"]) * 100, 2)
89+
for sample_stat in demultiplex_stats
90+
if sample_stat["Lane"] == str(lane + 1) and
91+
sample_stat["SampleID"] == sample_summary.sample_id()
92+
),
93+
"mean_q30": next(
94+
float(row["Mean Quality Score (PF)"])
95+
for row in quality_metrics
96+
if (
97+
row["Lane"] == str(lane + 1)
98+
and row["SampleID"] == sample_summary.sample_id()
99+
)
100+
),
101+
"percent_q30": next(
102+
float(row["% Q30"]) * 100
103+
for row in quality_metrics
104+
if (
105+
row["Lane"] == str(lane + 1)
106+
and row["SampleID"] == sample_summary.sample_id()
107+
)
108+
)
109+
110+
72111
}
73112
for sample_no in range(index_summary.at(lane).size())
74113
],
@@ -104,24 +143,16 @@ def _read_interop_summary(runfolder_path):
104143
index_summary = interop.py_interop_summary.index_flowcell_summary()
105144
interop.py_interop_summary.summarize_index_metrics(run_metrics, index_summary)
106145

107-
return run_summary, index_summary
146+
return run_summary, index_summary, run_info
108147

109148

110-
def _read_quality_metrics(quality_metrics_path):
149+
def _read_demultiplexing_metrics(metrics_path):
    """
    Read demultiplexing metrics file

    :param metrics_path: path to the CSV report to read
    :returns: list with one dict per data row, keyed by the header row
    """
    # newline="" hands line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are kept exactly as written in the file.
    with open(metrics_path, encoding="utf-8", newline="") as csvfile:
        return list(csv.DictReader(csvfile))
116-
117-
118-
def _read_top_unknown_barcodes(top_unknown_barcodes_path):
119-
"""
120-
Read top unknown barcodes file
121-
"""
122-
with open(top_unknown_barcodes_path, encoding="utf-8") as csvfile:
123-
return list(csv.DictReader(csvfile))
124-
155+
125156

126157
def _read_run_metadata(runfolder_path):
127158
"""

checkQC/qc_checkers/cluster_pf.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -17,33 +17,33 @@ def cluster_pf(
1717
if warning_threshold != "unknown":
1818
warning_threshold = int(warning_threshold * 10**6)
1919

20-
def format_msg(total_cluster_pf, threshold, lane, **kwargs):
21-
return f"Clusters PF {total_cluster_pf / 10**6}M < {threshold / 10**6}M on lane {lane}"
20+
def format_msg(total_reads_pf, threshold, lane, **kwargs):
21+
return f"Clusters PF {total_reads_pf / 10**6}M < {threshold / 10**6}M on lane {lane}"
2222

23-
def _qualify_error(total_cluster_pf, lane):
23+
def _qualify_error(total_reads_pf, lane):
2424
data = {
2525
"lane": lane,
26-
"total_cluster_pf": total_cluster_pf,
26+
"total_reads_pf": total_reads_pf,
2727
"qc_checker": "cluster_pf",
2828
}
2929

30-
match total_cluster_pf:
31-
case total_cluster_pf if (
30+
match total_reads_pf:
31+
case total_reads_pf if (
3232
error_threshold != "unknown"
33-
and total_cluster_pf < error_threshold
33+
and total_reads_pf < error_threshold
3434
):
3535
data["threshold"] = error_threshold
3636
return QCErrorFatal(format_msg(**data), data=data)
37-
case total_cluster_pf if (
37+
case total_reads_pf if (
3838
warning_threshold != "unknown"
39-
and total_cluster_pf < warning_threshold
39+
and total_reads_pf < warning_threshold
4040
):
4141
data["threshold"] = warning_threshold
4242
return QCErrorWarning(format_msg(**data), data=data)
4343

4444
return [
4545
qc_report
4646
for lane, lane_data in qc_data.sequencing_metrics.items()
47-
if (qc_report := _qualify_error(lane_data["total_cluster_pf"], lane))
47+
if (qc_report := _qualify_error(lane_data["total_reads_pf"], lane))
4848
]
4949

checkQC/qc_checkers/unidentified_index.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@ def unidentified_index(
4040
qc_errors = []
4141
for lane, lane_data in qc_data.sequencing_metrics.items():
4242
for barcode in lane_data["top_unknown_barcodes"]:
43-
significance = barcode["count"] / lane_data["total_cluster_pf"] * 100.
43+
significance = barcode["count"] / lane_data["total_reads_pf"] * 100.
4444
if significance < significance_threshold:
4545
continue
4646
index = (

checkQC/qc_data_utils.py

Lines changed: 204 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
import numpy as np
2+
from checkQC.parsers.illumina import _read_interop_summary
3+
4+
5+
def bclconvert_test_runfolder(qc_data, runfolder_path):
    """
    Return the expected parser results for the BCL Convert test runfolder

    The flowcell id is read from the run info returned by
    `_read_interop_summary`. Expected values exist only for the run whose
    flowcell id contains "HMTFYDRXX".

    :param qc_data: object returned unchanged under the "qc_data" key
    :param runfolder_path: path to the runfolder, passed on to
        `_read_interop_summary`
    :returns: dict with the keys "qc_data", "expected_instrument",
        "expected_read_length", "expected_samplesheet" and
        "expected_sequencing_metrics"
    :raises ValueError: if the flowcell id of the runfolder does not contain
        "HMTFYDRXX"
    """
    _, _, run_info = _read_interop_summary(runfolder_path)
    flowcell_id = run_info.flowcell_id()

    # Guard clause: reject any other run up front so the fixture data below
    # is not nested inside a conditional. ValueError is a subclass of
    # Exception, so callers catching Exception keep working.
    if "HMTFYDRXX" not in flowcell_id:
        raise ValueError(
            "This function is only compatible with the run with flowcell_id: 'HMTFYDRXX', "
            f"the supplied runfolder has flowcell_id: {flowcell_id}"
        )

    return {
        "qc_data": qc_data,
        "expected_instrument": "novaseq_SP",
        "expected_read_length": 36,
        "expected_samplesheet": {
            "len": 4,
            "head": [
                {
                    "lane": 1,
                    "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1",
                    "index": "GAACTGAGCG",
                    "index2": "TCGTGGAGCG",
                    "sample_project": "AB-1234",
                    "overridecycles": "Y36;I10;I10",
                    "custom_description": "LIBRARY_NAME:test",
                },
                {
                    "lane": 1,
                    "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1",
                    "index": "AGGTCAGATA",
                    "index2": "CTACAAGATA",
                    "sample_project": "CD-5678",
                    "overridecycles": "Y36;I10;I10",
                    "custom_description": "LIBRARY_NAME:test",
                },
                {
                    "lane": 2,
                    "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2",
                    "index": "GAACTGAGCG",
                    "index2": "TCGTGGAGCG",
                    "sample_project": "AB-1234",
                    "overridecycles": "Y36;I10;I10",
                    "custom_description": "LIBRARY_NAME:test",
                },
                {
                    "lane": 2,
                    "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2",
                    "index": "AGGTCAGATA",
                    "index2": "CTACAAGATA",
                    "sample_project": "CD-5678",
                    "overridecycles": "Y36;I10;I10",
                    "custom_description": "LIBRARY_NAME:test",
                },
            ],
        },
        # Keyed by 1-based lane number.
        "expected_sequencing_metrics": {
            1: {
                "total_reads_pf": 532_464_327,
                "total_reads": 638_337_024,
                "raw_density": 2_961_270.5,
                "pf_density": 2_470_118.25,
                "yield": 122_605_416,
                "yield_undetermined": 121_940_136,
                "top_unknown_barcodes": {
                    "len": 1029,
                    "head": [
                        {
                            'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT',
                            'count': 12857,
                        },
                        {
                            'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT',
                            'count': 12406,
                        },
                        {
                            'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG',
                            'count': 12177,
                        },
                        {
                            'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT',
                            'count': 11590,
                        },
                        {
                            'index': 'GGTCCGCTTC', 'index2': 'CTCACACAAG',
                            'count': 11509,
                        },
                    ],
                },
                "reads": {
                    1: {
                        "mean_error_rate": np.nan,
                        "percent_q30": 95.70932006835938,
                        "is_index": False,
                        "mean_percent_phix_aligned": 0.,
                    },
                    2: {
                        "mean_error_rate": np.nan,
                        "percent_q30": 92.57965850830078,
                        "is_index": True,
                        "mean_percent_phix_aligned": np.nan,
                    },
                    3: {
                        "mean_error_rate": np.nan,
                        "percent_q30": 90.3790283203125,
                        "is_index": True,
                        "mean_percent_phix_aligned": np.nan,
                    },
                },
                "reads_per_sample": [
                    {
                        "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1",
                        "cluster_count": 9920,
                        "percent_of_lane": 0.29,
                        "percent_perfect_index_reads": 97.96,
                        "mean_q30": 36.37,
                        "percent_q30": 96,
                    },
                    {
                        "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1",
                        "cluster_count": 8560,
                        "percent_of_lane": 0.25,
                        "percent_perfect_index_reads": 98.15,
                        "mean_q30": 36.43,
                        "percent_q30": 96,
                    },
                ],
            },
            2: {
                "total_reads_pf": 530_917_565,
                "total_reads": 638_337_024,
                "raw_density": 2_961_270.5,
                "pf_density": 2_462_942.5,
                "yield": 124_497_108,
                "yield_undetermined": 123_817_428,
                "top_unknown_barcodes": {
                    "len": 1055,
                    "head": [
                        {
                            'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT',
                            'count': 13176,
                        },
                        {
                            'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG',
                            'count': 12395,
                        },
                        {
                            'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT',
                            'count': 12247,
                        },
                        {
                            'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT',
                            'count': 11909,
                        },
                        {
                            'index': 'TAATTAGCGT', 'index2': 'TGGTTAAGAA',
                            'count': 11330,
                        },
                    ],
                },
                "reads": {
                    1: {
                        "mean_error_rate": np.nan,
                        "percent_q30": 95.75276184082031,
                        "is_index": False,
                        "mean_percent_phix_aligned": 0.,
                    },
                    2: {
                        "mean_error_rate": np.nan,
                        "percent_q30": 92.60448455810547,
                        "is_index": True,
                        "mean_percent_phix_aligned": np.nan,
                    },
                    3: {
                        "mean_error_rate": np.nan,
                        "percent_q30": 90.2811050415039,
                        "is_index": True,
                        "mean_percent_phix_aligned": np.nan,
                    },
                },
                "reads_per_sample": [
                    {
                        "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2",
                        "cluster_count": 10208,
                        "percent_of_lane": 0.3,
                        "percent_perfect_index_reads": 98.2,
                        "mean_q30": 36.4,
                        "percent_q30": 96,
                    },
                    {
                        "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2",
                        "cluster_count": 8672,
                        "percent_of_lane": 0.25,
                        "percent_perfect_index_reads": 98.29,
                        "mean_q30": 36.48,
                        "percent_q30": 97,
                    },
                ],
            },
        },
    }
204+

requirements/prod

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,10 @@
11
click~=8.1.1
22
PyYAML~=6.0
3-
interop~=1.3.2
3+
interop~=1.4.0
44
xmltodict~=0.13.0
55
tornado~=6.3.2
66
sample_sheet~=0.13.0
77
pandas~=2.2.2
8-
numpy~=1.26.4
8+
numpy~=2.2.4
99
samshee~=0.2.3
1010
jsonschema~=4.23.0

0 commit comments

Comments
 (0)