Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/workflows/unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,17 @@ on: [push, pull_request]
jobs:
build:
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: ['3.10', '3.11', '3.12','3.13']
name: Set up Python ${{ matrix.python-version }}
steps:
- uses: actions/checkout@v4

- name: Set up Python 3.10
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v4
with:
python-version: '3.10'
python-version: ${{ matrix.python-version }}

- name: Install dependencies
run: |
Expand Down
61 changes: 46 additions & 15 deletions checkQC/parsers/illumina.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,23 +13,32 @@ def from_bclconvert(cls, runfolder_path, parser_config):
assert runfolder_path.is_dir()

summary, index_summary = _read_interop_summary(runfolder_path)
quality_metrics = _read_quality_metrics(
quality_metrics = _read_demultiplexing_metrics(
runfolder_path
/ parser_config["reports_location"]
/ "Quality_Metrics.csv"
)
top_unknown_barcodes = _read_top_unknown_barcodes(
top_unknown_barcodes = _read_demultiplexing_metrics(
runfolder_path
/ parser_config["reports_location"]
/ "Top_Unknown_Barcodes.csv"
)

demultiplex_stats = _read_demultiplexing_metrics(
runfolder_path
/ parser_config["reports_location"]
/ "Demultiplex_Stats.csv"
)
samplesheet = _read_samplesheet(runfolder_path)

instrument, read_length = _read_run_metadata(runfolder_path)

sequencing_metrics = {
lane + 1: {
"total_cluster_pf": summary.at(0).at(lane).reads_pf(),
"total_reads_pf": summary.at(0).at(lane).reads_pf(),
"total_reads": summary.at(0).at(lane).reads(),
"raw_density":summary.at(0).at(lane).density().mean(),
"pf_density":summary.at(0).at(lane).density_pf().mean(),
"yield": sum(
int(row["Yield"])
for row in quality_metrics
Expand Down Expand Up @@ -69,6 +78,36 @@ def from_bclconvert(cls, runfolder_path, parser_config):
sample_summary := index_summary.at(lane).at(sample_no)
).sample_id(),
"cluster_count": sample_summary.cluster_count(),
"percent_of_lane": next(
round(float(sample_stat["% Reads"]) * 100, 2)
for sample_stat in demultiplex_stats
if sample_stat["Lane"] == str(lane + 1) and
sample_stat["SampleID"] == sample_summary.sample_id()
),
"percent_perfect_index_reads": next(
round(float(sample_stat["% Perfect Index Reads"]) * 100, 2)
for sample_stat in demultiplex_stats
if sample_stat["Lane"] == str(lane + 1) and
sample_stat["SampleID"] == sample_summary.sample_id()
),
"mean_q30": next(
float(row["Mean Quality Score (PF)"])
for row in quality_metrics
if (
row["Lane"] == str(lane + 1)
and row["SampleID"] == sample_summary.sample_id()
)
),
"percent_q30": next(
float(row["% Q30"]) * 100
for row in quality_metrics
if (
row["Lane"] == str(lane + 1)
and row["SampleID"] == sample_summary.sample_id()
)
)


}
for sample_no in range(index_summary.at(lane).size())
],
Expand Down Expand Up @@ -107,21 +146,13 @@ def _read_interop_summary(runfolder_path):
return run_summary, index_summary


def _read_quality_metrics(quality_metrics_path):
def _read_demultiplexing_metrics(metrics_path):
"""
Read quality metrics file
Read demultiplexing metrics file
"""
with open(quality_metrics_path, encoding="utf-8") as csvfile:
with open(metrics_path, encoding="utf-8") as csvfile:
return list(csv.DictReader(csvfile))


def _read_top_unknown_barcodes(top_unknown_barcodes_path):
"""
Read top unknown barcodes file
"""
with open(top_unknown_barcodes_path, encoding="utf-8") as csvfile:
return list(csv.DictReader(csvfile))



def _read_run_metadata(runfolder_path):
"""
Expand Down
20 changes: 10 additions & 10 deletions checkQC/qc_checkers/cluster_pf.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,33 +17,33 @@ def cluster_pf(
if warning_threshold != "unknown":
warning_threshold = int(warning_threshold * 10**6)

def format_msg(total_cluster_pf, threshold, lane, **kwargs):
return f"Clusters PF {total_cluster_pf / 10**6}M < {threshold / 10**6}M on lane {lane}"
def format_msg(total_reads_pf, threshold, lane, **kwargs):
return f"Clusters PF {total_reads_pf / 10**6}M < {threshold / 10**6}M on lane {lane}"

def _qualify_error(total_cluster_pf, lane):
def _qualify_error(total_reads_pf, lane):
data = {
"lane": lane,
"total_cluster_pf": total_cluster_pf,
"total_reads_pf": total_reads_pf,
"qc_checker": "cluster_pf",
}

match total_cluster_pf:
case total_cluster_pf if (
match total_reads_pf:
case total_reads_pf if (
error_threshold != "unknown"
and total_cluster_pf < error_threshold
and total_reads_pf < error_threshold
):
data["threshold"] = error_threshold
return QCErrorFatal(format_msg(**data), data=data)
case total_cluster_pf if (
case total_reads_pf if (
warning_threshold != "unknown"
and total_cluster_pf < warning_threshold
and total_reads_pf < warning_threshold
):
data["threshold"] = warning_threshold
return QCErrorWarning(format_msg(**data), data=data)

return [
qc_report
for lane, lane_data in qc_data.sequencing_metrics.items()
if (qc_report := _qualify_error(lane_data["total_cluster_pf"], lane))
if (qc_report := _qualify_error(lane_data["total_reads_pf"], lane))
]

2 changes: 1 addition & 1 deletion checkQC/qc_checkers/unidentified_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def unidentified_index(
qc_errors = []
for lane, lane_data in qc_data.sequencing_metrics.items():
for barcode in lane_data["top_unknown_barcodes"]:
significance = barcode["count"] / lane_data["total_cluster_pf"] * 100.
significance = barcode["count"] / lane_data["total_reads_pf"] * 100.
if significance < significance_threshold:
continue
index = (
Expand Down
4 changes: 2 additions & 2 deletions requirements/prod
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
click~=8.1.1
PyYAML~=6.0
interop~=1.3.2
interop~=1.4.0
xmltodict~=0.13.0
tornado~=6.3.2
sample_sheet~=0.13.0
pandas~=2.2.2
numpy~=1.26.4
numpy~=2.2.4
samshee~=0.2.3
jsonschema~=4.23.0
19 changes: 12 additions & 7 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,14 +11,19 @@
author_email='[email protected]',
url="https://www.github.com/Molmed/checkQC",
download_url='https://github.com/Molmed/checkQC/archive/{}.tar.gz'.format(__version__),
python_requires='>3.10, <3.11',
python_requires='>3.10',
install_requires=[
"click",
"PyYAML>=6.0",
"interop>=1.2.4",
"xmltodict",
"tornado",
"sample_sheet"],
"click~=8.1.1",
"PyYAML~=6.0",
"interop~=1.4.0",
" xmltodict~=0.13.0",
"tornado~=6.3.2",
" sample_sheet~=0.13.0",
" pandas~=2.2.2",
"numpy~=2.2.4",
"samshee~=0.2.3",
"jsonschema~=4.23.0",
],
packages=find_packages(exclude=["tests*"]),
test_suite="tests",
package_data={
Expand Down
12 changes: 6 additions & 6 deletions tests/parsers/test_illumina_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@

from checkQC.parsers.illumina import (
_read_interop_summary,
_read_quality_metrics,
_read_top_unknown_barcodes,
_read_demultiplexing_metrics,
_read_run_metadata,
_read_samplesheet,
)
Expand All @@ -22,15 +21,15 @@ def runfolder_path():
def test_read_interop_summary(runfolder_path):
run_summary, index_summary = _read_interop_summary(runfolder_path)

total_cluster_pf = run_summary.at(0).at(0).reads_pf()
assert total_cluster_pf == 532464327
total_reads_pf = run_summary.at(0).at(0).reads_pf()
assert total_reads_pf == 532464327

sample_id = index_summary.at(0).at(0).sample_id()
assert sample_id == "Sample_14574-Qiagen-IndexSet1-SP-Lane1"


def test_read_quality_metrics(runfolder_path):
quality_metrics = _read_quality_metrics(
quality_metrics = _read_demultiplexing_metrics(
runfolder_path / "Reports/Quality_Metrics.csv")

assert len(quality_metrics) == 6
Expand All @@ -50,7 +49,7 @@ def test_read_quality_metrics(runfolder_path):


def test_read_to_unknown_barcodes(runfolder_path):
top_unknown_barcodes = _read_top_unknown_barcodes(
top_unknown_barcodes = _read_demultiplexing_metrics(
runfolder_path / "Reports/Top_Unknown_Barcodes.csv")

assert len(top_unknown_barcodes) == 2084
Expand Down Expand Up @@ -108,4 +107,5 @@ def test_read_samplesheet(runfolder_path):
'lane': 1,
'sample_id': 'Sample_14574-Qiagen-IndexSet1-SP-Lane1',
'sample_project': 'AB-1234',
'custom_description': 'LIBRARY_NAME:test',
}
22 changes: 11 additions & 11 deletions tests/qc_checkers/test_cluster_pf.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@
def qc_data():
return namedtuple("QCData", "sequencing_metrics")(
{
1: {"total_cluster_pf": 1_000_000_000},
2: {"total_cluster_pf": 10_000_000},
3: {"total_cluster_pf": 100_000_000},
4: {"total_cluster_pf": 10_000_000_000},
1: {"total_reads_pf": 1_000_000_000},
2: {"total_reads_pf": 10_000_000},
3: {"total_reads_pf": 100_000_000},
4: {"total_reads_pf": 10_000_000_000},
}
)


def format_msg(total_cluster_pf, threshold, lane, **kwargs):
return f"Clusters PF {total_cluster_pf / 10**6}M < {threshold / 10**6}M on lane {lane}"
def format_msg(total_reads_pf, threshold, lane, **kwargs):
return f"Clusters PF {total_reads_pf / 10**6}M < {threshold / 10**6}M on lane {lane}"


def test_cluster_pf(qc_data):
Expand All @@ -34,7 +34,7 @@ def test_cluster_pf(qc_data):
match lane:
case 2:
exp_data = {
"total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"],
"total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"],
"threshold": 50_000_000,
"lane": lane,
"qc_checker": "cluster_pf",
Expand All @@ -44,7 +44,7 @@ def test_cluster_pf(qc_data):
assert report.data == exp_data
case 3:
exp_data = {
"total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"],
"total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"],
"threshold": 500_500_000,
"lane": lane,
"qc_checker": "cluster_pf",
Expand All @@ -69,7 +69,7 @@ def test_cluster_pf_error_unknown(qc_data):
match lane:
case 2:
exp_data = {
"total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"],
"total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"],
"threshold": 500_000_000,
"lane": lane,
"qc_checker": "cluster_pf",
Expand All @@ -79,7 +79,7 @@ def test_cluster_pf_error_unknown(qc_data):
assert report.data == exp_data
case 3:
exp_data = {
"total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"],
"total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"],
"threshold": 500_000_000,
"lane": lane,
"qc_checker": "cluster_pf",
Expand All @@ -104,7 +104,7 @@ def test_cluster_pf_warning_unknown(qc_data):
match lane:
case 2:
exp_data = {
"total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"],
"total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"],
"threshold": 50_000_000,
"lane": lane,
"qc_checker": "cluster_pf",
Expand Down
2 changes: 1 addition & 1 deletion tests/qc_checkers/test_unidentified_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def qc_data():
return namedtuple("QCData", ["sequencing_metrics", "samplesheet"])(
{
1: {
"total_cluster_pf": 100,
"total_reads_pf": 100,
"top_unknown_barcodes": [
{"lane": 1, "index": "ACCT", "count": 10},
{"lane": 1, "index": "AC", "count": 50},
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
Lane,SampleID,Sample_Project,Index,# Reads,# Perfect Index Reads,# One Mismatch Index Reads,# Two Mismatch Index Reads,% Reads,% Perfect Index Reads,% One Mismatch Index Reads,% Two Mismatch Index Reads
1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,AB-1234,GAACTGAGCG-TCGTGGAGCG,9920,9718,202,0,0.0029,0.9796,0.0204,0.0000
1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,CD-5678,AGGTCAGATA-CTACAAGATA,8560,8402,158,0,0.0025,0.9815,0.0185,0.0000
1,Undetermined,Undetermined,,3387226,3387226,0,0,0.9946,1.0000,0.0000,0.0000
2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,AB-1234,GAACTGAGCG-TCGTGGAGCG,10208,10024,184,0,0.0030,0.9820,0.0180,0.0000
2,Sample_14575-Qiagen-IndexSet1-SP-Lane2,CD-5678,AGGTCAGATA-CTACAAGATA,8672,8524,148,0,0.0025,0.9829,0.0171,0.0000
2,Undetermined,Undetermined,,3439373,3439373,0,0,0.9945,1.0000,0.0000,0.0000
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,10 @@ Date,6/24/2020,,
Application,Illumina DRAGEN COVIDSeq Test Pipeline,,
Instrument Type,NovaSeq6000,,
Assay,Illumina COVIDSeq Test,,
Index Adapters,IDT-ILMN DNA-RNA UDP Indexes ,,
Index Adapters,"IDT-ILMN DNA-RNA UDP Indexes ",,
Chemistry,Amplicon,,
,,,
[Reads],,,,,,
[Reads],,,,,
Read1Cycles,36,,
Index1Cycles,10,,
Index2Cycles,10,,
Expand All @@ -18,8 +18,8 @@ FastqCompressionFormat,gzip,,
SoftwareVersion,4.1.5,,
,,,
[BCLConvert_Data],,,
Lane,Sample_ID,Index,Index2,Sample_Project
1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,GAACTGAGCG,TCGTGGAGCG,AB-1234
1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,AGGTCAGATA,CTACAAGATA,CD-5678
2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,GAACTGAGCG,TCGTGGAGCG,AB-1234
2,Sample_14575-Qiagen-IndexSet1-SP-Lane2,AGGTC AGATA,C TACAA GATA,CD-5678
Lane,Sample_ID,Index,Index2,Sample_Project,custom_Description
1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,GAACTGAGCG,TCGTGGAGCG,AB-1234,LIBRARY_NAME:test
1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,AGGTCAGATA,CTACAAGATA,CD-5678,LIBRARY_NAME:test
2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,GAACTGAGCG,TCGTGGAGCG,AB-1234,LIBRARY_NAME:test
2,Sample_14575-Qiagen-IndexSet1-SP-Lane2,AGGTC AGATA,C TACAA GATA,CD-5678,LIBRARY_NAME:test
Loading