Skip to content
Merged
Show file tree
Hide file tree
Changes from 15 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@ on: [push, pull_request]
jobs:
build:
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: ['3.10', '3.11', '3.12','3.13']
name: Set up Python ${{ matrix.python-version }}
steps:
- uses: actions/checkout@v4

- name: Set up Python 3.10
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
Expand Down
65 changes: 48 additions & 17 deletions checkQC/parsers/illumina.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,24 +12,33 @@ def from_bclconvert(cls, runfolder_path, parser_config):
runfolder_path = pathlib.Path(runfolder_path)
assert runfolder_path.is_dir()

summary, index_summary = _read_interop_summary(runfolder_path)
quality_metrics = _read_quality_metrics(
summary, index_summary, run_info = _read_interop_summary(runfolder_path)
quality_metrics = _read_demultiplexing_metrics(
runfolder_path
/ parser_config["reports_location"]
/ "Quality_Metrics.csv"
)
top_unknown_barcodes = _read_top_unknown_barcodes(
top_unknown_barcodes = _read_demultiplexing_metrics(
runfolder_path
/ parser_config["reports_location"]
/ "Top_Unknown_Barcodes.csv"
)

demultiplex_stats = _read_demultiplexing_metrics(
runfolder_path
/ parser_config["reports_location"]
/ "Demultiplex_Stats.csv"
)
samplesheet = _read_samplesheet(runfolder_path)

instrument, read_length = _read_run_metadata(runfolder_path)

sequencing_metrics = {
lane + 1: {
"total_cluster_pf": summary.at(0).at(lane).reads_pf(),
"total_reads_pf": summary.at(0).at(lane).reads_pf(),
"total_reads": summary.at(0).at(lane).reads(),
"raw_density":summary.at(0).at(lane).density().mean(),
"pf_density":summary.at(0).at(lane).density_pf().mean(),
"yield": sum(
int(row["Yield"])
for row in quality_metrics
Expand Down Expand Up @@ -69,6 +78,36 @@ def from_bclconvert(cls, runfolder_path, parser_config):
sample_summary := index_summary.at(lane).at(sample_no)
).sample_id(),
"cluster_count": sample_summary.cluster_count(),
"percent_of_lane": next(
round(float(sample_stat["% Reads"]) * 100, 2)
for sample_stat in demultiplex_stats
if sample_stat["Lane"] == str(lane + 1) and
sample_stat["SampleID"] == sample_summary.sample_id()
),
"percent_perfect_index_reads": next(
round(float(sample_stat["% Perfect Index Reads"]) * 100, 2)
for sample_stat in demultiplex_stats
if sample_stat["Lane"] == str(lane + 1) and
sample_stat["SampleID"] == sample_summary.sample_id()
),
"mean_q30": next(
float(row["Mean Quality Score (PF)"])
for row in quality_metrics
if (
row["Lane"] == str(lane + 1)
and row["SampleID"] == sample_summary.sample_id()
)
),
"percent_q30": next(
float(row["% Q30"]) * 100
for row in quality_metrics
if (
row["Lane"] == str(lane + 1)
and row["SampleID"] == sample_summary.sample_id()
)
)


}
for sample_no in range(index_summary.at(lane).size())
],
Expand Down Expand Up @@ -104,24 +143,16 @@ def _read_interop_summary(runfolder_path):
index_summary = interop.py_interop_summary.index_flowcell_summary()
interop.py_interop_summary.summarize_index_metrics(run_metrics, index_summary)

return run_summary, index_summary
return run_summary, index_summary, run_info


def _read_quality_metrics(quality_metrics_path):
def _read_demultiplexing_metrics(metrics_path):
"""
Read quality metrics file
Read demultiplexing metrics file
"""
with open(quality_metrics_path, encoding="utf-8") as csvfile:
with open(metrics_path, encoding="utf-8") as csvfile:
return list(csv.DictReader(csvfile))


def _read_top_unknown_barcodes(top_unknown_barcodes_path):
"""
Read top unknown barcodes file
"""
with open(top_unknown_barcodes_path, encoding="utf-8") as csvfile:
return list(csv.DictReader(csvfile))



def _read_run_metadata(runfolder_path):
"""
Expand Down
20 changes: 10 additions & 10 deletions checkQC/qc_checkers/cluster_pf.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,33 +17,33 @@ def cluster_pf(
if warning_threshold != "unknown":
warning_threshold = int(warning_threshold * 10**6)

def format_msg(total_cluster_pf, threshold, lane, **kwargs):
return f"Clusters PF {total_cluster_pf / 10**6}M < {threshold / 10**6}M on lane {lane}"
def format_msg(total_reads_pf, threshold, lane, **kwargs):
return f"Clusters PF {total_reads_pf / 10**6}M < {threshold / 10**6}M on lane {lane}"

def _qualify_error(total_cluster_pf, lane):
def _qualify_error(total_reads_pf, lane):
data = {
"lane": lane,
"total_cluster_pf": total_cluster_pf,
"total_reads_pf": total_reads_pf,
"qc_checker": "cluster_pf",
}

match total_cluster_pf:
case total_cluster_pf if (
match total_reads_pf:
case total_reads_pf if (
error_threshold != "unknown"
and total_cluster_pf < error_threshold
and total_reads_pf < error_threshold
):
data["threshold"] = error_threshold
return QCErrorFatal(format_msg(**data), data=data)
case total_cluster_pf if (
case total_reads_pf if (
warning_threshold != "unknown"
and total_cluster_pf < warning_threshold
and total_reads_pf < warning_threshold
):
data["threshold"] = warning_threshold
return QCErrorWarning(format_msg(**data), data=data)

return [
qc_report
for lane, lane_data in qc_data.sequencing_metrics.items()
if (qc_report := _qualify_error(lane_data["total_cluster_pf"], lane))
if (qc_report := _qualify_error(lane_data["total_reads_pf"], lane))
]

2 changes: 1 addition & 1 deletion checkQC/qc_checkers/unidentified_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def unidentified_index(
qc_errors = []
for lane, lane_data in qc_data.sequencing_metrics.items():
for barcode in lane_data["top_unknown_barcodes"]:
significance = barcode["count"] / lane_data["total_cluster_pf"] * 100.
significance = barcode["count"] / lane_data["total_reads_pf"] * 100.
if significance < significance_threshold:
continue
index = (
Expand Down
199 changes: 199 additions & 0 deletions checkQC/qc_data_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
import numpy as np
from checkQC.parsers.illumina import _read_interop_summary


def bclconvert_test_runfolder(qc_data, runfolder_path):
    """
    Bundle ``qc_data`` with the expected values for the HMTFYDRXX test runfolder.

    The expected samplesheet and sequencing-metrics values returned here are
    hard-coded for the HMTFYDRXX test flowcell, so the runfolder's flowcell ID
    is checked first and any other runfolder is rejected.

    :param qc_data: the parsed QC data to bundle with the expected values
    :param runfolder_path: path to the runfolder; its InterOp data is read to
        determine the flowcell ID
    :return: dict with keys ``qc_data``, ``expected_instrument``,
        ``expected_read_length``, ``expected_samplesheet`` and
        ``expected_sequencing_metrics``
    :raises ValueError: if the runfolder's flowcell ID is not HMTFYDRXX, since
        the expected values below are only valid for that specific run
    """
    _, _, run_info = _read_interop_summary(runfolder_path)
    flowcell_id = run_info.flowcell_id()
    if "HMTFYDRXX" not in flowcell_id:
        raise ValueError(
            f"Unexpected flowcell ID {flowcell_id!r}: the expected values "
            "returned by this function are adapted to the HMTFYDRXX test "
            "runfolder."
        )
    return {
        "qc_data": qc_data,
        "expected_instrument": "novaseq_SP",
        "expected_read_length": 36,
        "expected_samplesheet": {
            "len": 4,
            "head": [
                {
                    "lane": 1,
                    "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1",
                    "index": "GAACTGAGCG",
                    "index2": "TCGTGGAGCG",
                    "sample_project": "AB-1234",
                    "overridecycles": "Y36;I10;I10",
                    "custom_description": "LIBRARY_NAME:test",
                },
                {
                    "lane": 1,
                    "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1",
                    "index": "AGGTCAGATA",
                    "index2": "CTACAAGATA",
                    "sample_project": "CD-5678",
                    "overridecycles": "Y36;I10;I10",
                    "custom_description": "LIBRARY_NAME:test",
                },
                {
                    "lane": 2,
                    "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2",
                    "index": "GAACTGAGCG",
                    "index2": "TCGTGGAGCG",
                    "sample_project": "AB-1234",
                    "overridecycles": "Y36;I10;I10",
                    "custom_description": "LIBRARY_NAME:test",
                },
                {
                    "lane": 2,
                    "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2",
                    "index": "AGGTCAGATA",
                    "index2": "CTACAAGATA",
                    "sample_project": "CD-5678",
                    "overridecycles": "Y36;I10;I10",
                    "custom_description": "LIBRARY_NAME:test",
                },
            ],
        },
        "expected_sequencing_metrics": {
            1: {
                "total_reads_pf": 532_464_327,
                "total_reads": 638_337_024,
                "raw_density": 2_961_270.5,
                "pf_density": 2_470_118.25,
                "yield": 122_605_416,
                "yield_undetermined": 121_940_136,
                "top_unknown_barcodes": {
                    "len": 1029,
                    "head": [
                        {
                            "index": "ATATCTGCTT", "index2": "TAGACAATCT",
                            "count": 12857,
                        },
                        {
                            "index": "CACCTCTCTT", "index2": "CTCGACTCCT",
                            "count": 12406,
                        },
                        {
                            "index": "ATGTAACGTT", "index2": "ACGATTGCTG",
                            "count": 12177,
                        },
                        {
                            "index": "TTCGGTGTGA", "index2": "GAACAAGTAT",
                            "count": 11590,
                        },
                        {
                            "index": "GGTCCGCTTC", "index2": "CTCACACAAG",
                            "count": 11509,
                        },
                    ],
                },
                "reads": {
                    1: {
                        "mean_error_rate": np.nan,
                        "percent_q30": 95.70932006835938,
                        "is_index": False,
                        "mean_percent_phix_aligned": 0.,
                    },
                    2: {
                        "mean_error_rate": np.nan,
                        "percent_q30": 92.57965850830078,
                        "is_index": True,
                        "mean_percent_phix_aligned": np.nan,
                    },
                    3: {
                        "mean_error_rate": np.nan,
                        "percent_q30": 90.3790283203125,
                        "is_index": True,
                        "mean_percent_phix_aligned": np.nan,
                    },
                },
                "reads_per_sample": [
                    {
                        "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1",
                        "cluster_count": 9920,
                        "percent_of_lane": 0.29,
                        "percent_perfect_index_reads": 97.96,
                        "mean_q30": 36.37,
                        "percent_q30": 96,
                    },
                    {
                        "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1",
                        "cluster_count": 8560,
                        "percent_of_lane": 0.25,
                        "percent_perfect_index_reads": 98.15,
                        "mean_q30": 36.43,
                        "percent_q30": 96,
                    },
                ],
            },
            2: {
                "total_reads_pf": 530_917_565,
                "total_reads": 638_337_024,
                "raw_density": 2_961_270.5,
                "pf_density": 2_462_942.5,
                "yield": 124_497_108,
                "yield_undetermined": 123_817_428,
                "top_unknown_barcodes": {
                    "len": 1055,
                    "head": [
                        {
                            "index": "ATATCTGCTT", "index2": "TAGACAATCT",
                            "count": 13176,
                        },
                        {
                            "index": "ATGTAACGTT", "index2": "ACGATTGCTG",
                            "count": 12395,
                        },
                        {
                            "index": "CACCTCTCTT", "index2": "CTCGACTCCT",
                            "count": 12247,
                        },
                        {
                            "index": "TTCGGTGTGA", "index2": "GAACAAGTAT",
                            "count": 11909,
                        },
                        {
                            "index": "TAATTAGCGT", "index2": "TGGTTAAGAA",
                            "count": 11330,
                        },
                    ],
                },
                "reads": {
                    1: {
                        "mean_error_rate": np.nan,
                        "percent_q30": 95.75276184082031,
                        "is_index": False,
                        "mean_percent_phix_aligned": 0.,
                    },
                    2: {
                        "mean_error_rate": np.nan,
                        "percent_q30": 92.60448455810547,
                        "is_index": True,
                        "mean_percent_phix_aligned": np.nan,
                    },
                    3: {
                        "mean_error_rate": np.nan,
                        "percent_q30": 90.2811050415039,
                        "is_index": True,
                        "mean_percent_phix_aligned": np.nan,
                    },
                },
                "reads_per_sample": [
                    {
                        "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2",
                        "cluster_count": 10208,
                        "percent_of_lane": 0.3,
                        "percent_perfect_index_reads": 98.2,
                        "mean_q30": 36.4,
                        "percent_q30": 96,
                    },
                    {
                        "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2",
                        "cluster_count": 8672,
                        "percent_of_lane": 0.25,
                        "percent_perfect_index_reads": 98.29,
                        "mean_q30": 36.48,
                        "percent_q30": 97,
                    },
                ],
            },
        },
    }

4 changes: 2 additions & 2 deletions requirements/prod
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
click~=8.1.1
PyYAML~=6.0
interop~=1.3.2
interop~=1.4.0
xmltodict~=0.13.0
tornado~=6.3.2
sample_sheet~=0.13.0
pandas~=2.2.2
numpy~=1.26.4
numpy~=2.2.4
samshee~=0.2.3
jsonschema~=4.23.0
Loading