Molmed
diff --git a/‎.github/workflows/unit_tests.yml‎
Lines changed: 6 additions & 2 deletions b/‎.github/workflows/unit_tests.yml‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎checkQC/parsers/illumina.py‎
Lines changed: 48 additions & 17 deletions b/‎checkQC/parsers/illumina.py‎
Lines changed: 48 additions & 17 deletions
diff --git a/‎checkQC/qc_checkers/cluster_pf.py‎
Lines changed: 10 additions & 10 deletions b/‎checkQC/qc_checkers/cluster_pf.py‎
Lines changed: 10 additions & 10 deletions
diff --git a/‎checkQC/qc_checkers/unidentified_index.py‎
Lines changed: 1 addition & 1 deletion b/‎checkQC/qc_checkers/unidentified_index.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎checkQC/qc_data_utils.py‎
Lines changed: 204 additions & 0 deletions b/‎checkQC/qc_data_utils.py‎
Lines changed: 204 additions & 0 deletions
diff --git a/‎requirements/prod‎
Lines changed: 2 additions & 2 deletions b/‎requirements/prod‎
Lines changed: 2 additions & 2 deletions
@@ -5,13 +5,17 @@ on: [push, pull_request]
 jobs:
   build:
     runs-on: ubuntu-22.04
+    strategy:
+      # Run this job once for every Python version listed in the matrix.
+      matrix:
+        python-version: ['3.10', '3.11', '3.12','3.13']
+    # Job-level display name (shown per matrix entry); the setup-python step
+    # below carries its own, identically worded, step name.
+    name: Set up Python ${{ matrix.python-version }}
     steps:
     - uses: actions/checkout@v4
 
-    - name: Set up Python 3.10
+    - name: Set up Python ${{ matrix.python-version }}
       uses: actions/setup-python@v4
       with:
-        python-version: '3.10'
+        python-version: ${{ matrix.python-version }}
 
     - name: Install dependencies
       run: |
 
@@ -12,24 +12,33 @@ def from_bclconvert(cls, runfolder_path, parser_config):
     runfolder_path = pathlib.Path(runfolder_path)
     assert runfolder_path.is_dir()
 
-    summary, index_summary = _read_interop_summary(runfolder_path)
-    quality_metrics = _read_quality_metrics(
+    summary, index_summary, run_info = _read_interop_summary(runfolder_path)
+    quality_metrics = _read_demultiplexing_metrics(
         runfolder_path
         / parser_config["reports_location"]
         / "Quality_Metrics.csv"
     )
-    top_unknown_barcodes = _read_top_unknown_barcodes(
+    top_unknown_barcodes = _read_demultiplexing_metrics(
         runfolder_path
         / parser_config["reports_location"]
         / "Top_Unknown_Barcodes.csv"
     )
+
+    demultiplex_stats = _read_demultiplexing_metrics(
+        runfolder_path
+        / parser_config["reports_location"]
+        / "Demultiplex_Stats.csv"
+    )
     samplesheet = _read_samplesheet(runfolder_path)
 
     instrument, read_length = _read_run_metadata(runfolder_path)
 
     sequencing_metrics = {
         lane + 1: {
-            "total_cluster_pf": summary.at(0).at(lane).reads_pf(),
+            "total_reads_pf": summary.at(0).at(lane).reads_pf(),
+            "total_reads": summary.at(0).at(lane).reads(),
+            "raw_density":summary.at(0).at(lane).density().mean(),
+            "pf_density":summary.at(0).at(lane).density_pf().mean(),
             "yield": sum(
                 int(row["Yield"])
                 for row in quality_metrics
@@ -69,6 +78,36 @@ def from_bclconvert(cls, runfolder_path, parser_config):
                         sample_summary := index_summary.at(lane).at(sample_no)
                     ).sample_id(),
                     "cluster_count": sample_summary.cluster_count(),
+                    "percent_of_lane": next(
+                        round(float(sample_stat["% Reads"]) * 100, 2)
+                        for sample_stat in demultiplex_stats
+                        if sample_stat["Lane"] == str(lane + 1) and
+                        sample_stat["SampleID"] == sample_summary.sample_id()
+                    ),
+                    "percent_perfect_index_reads": next(
+                        round(float(sample_stat["% Perfect Index Reads"]) * 100, 2)
+                        for sample_stat in demultiplex_stats
+                        if sample_stat["Lane"] == str(lane + 1) and
+                        sample_stat["SampleID"] == sample_summary.sample_id()
+                    ),
+                    "mean_q30": next(
+                        float(row["Mean Quality Score (PF)"])
+                        for row in quality_metrics
+                        if (
+                            row["Lane"] == str(lane + 1)
+                            and row["SampleID"] == sample_summary.sample_id()
+                        )
+                    ),
+                    "percent_q30": next(
+                        float(row["% Q30"]) * 100
+                        for row in quality_metrics
+                        if (
+                            row["Lane"] == str(lane + 1)
+                            and row["SampleID"] == sample_summary.sample_id()
+                        )
+                    )
+
+
                 }
                 for sample_no in range(index_summary.at(lane).size())
             ],
@@ -104,24 +143,16 @@ def _read_interop_summary(runfolder_path):
     index_summary = interop.py_interop_summary.index_flowcell_summary()
     interop.py_interop_summary.summarize_index_metrics(run_metrics, index_summary)
 
-    return run_summary, index_summary
+    return run_summary, index_summary, run_info
 
 
-def _read_quality_metrics(quality_metrics_path):
+def _read_demultiplexing_metrics(metrics_path):
     """
-    Read quality metrics file
+    Read one demultiplexing report CSV file (e.g. Quality_Metrics.csv,
+    Top_Unknown_Barcodes.csv or Demultiplex_Stats.csv).
+
+    The first row is used as the header. Returns a list with one dict per
+    data row, keyed by column name; values are left as read by
+    csv.DictReader (no type conversion is done here).
     """
-    with open(quality_metrics_path, encoding="utf-8") as csvfile:
+    # newline="" leaves line-ending handling to the csv module, as its
+    # documentation requires for files handed to a reader; otherwise line
+    # endings inside quoted fields would be altered.
+    with open(metrics_path, encoding="utf-8", newline="") as csvfile:
         return list(csv.DictReader(csvfile))
-
-
-def _read_top_unknown_barcodes(top_unknown_barcodes_path):
-    """
-    Read top unknown barcodes file
-    """
-    with open(top_unknown_barcodes_path, encoding="utf-8") as csvfile:
-        return list(csv.DictReader(csvfile))
-
+    
 
 def _read_run_metadata(runfolder_path):
     """
 
@@ -17,33 +17,33 @@ def cluster_pf(
     if warning_threshold != "unknown":
         warning_threshold = int(warning_threshold * 10**6)
 
-    def format_msg(total_cluster_pf, threshold, lane, **kwargs):
-        return f"Clusters PF {total_cluster_pf / 10**6}M < {threshold / 10**6}M on lane {lane}"
+    def format_msg(total_reads_pf, threshold, lane, **kwargs):
+        # Called as format_msg(**data): **kwargs swallows the keys of `data`
+        # that the message does not use (e.g. "qc_checker").
+        # Both counts are divided by 10**6 so the message reads in millions.
+        return f"Clusters PF {total_reads_pf / 10**6}M < {threshold / 10**6}M on lane {lane}"
 
-    def _qualify_error(total_cluster_pf, lane):
+    def _qualify_error(total_reads_pf, lane):
+        # Build the QC report for one lane.
+        # Returns QCErrorFatal when the count is below error_threshold,
+        # otherwise QCErrorWarning when it is below warning_threshold.
+        # When neither case matches the function falls off the end and
+        # returns None, which the caller's walrus filter discards.
         data = {
             "lane": lane,
-            "total_cluster_pf": total_cluster_pf,
+            "total_reads_pf": total_reads_pf,
             "qc_checker": "cluster_pf",
         }
 
-        match total_cluster_pf:
-            case total_cluster_pf if (
+        # The capture patterns always match; only the guards decide.
+        # The error case is listed first, so it takes precedence over the warning.
+        match total_reads_pf:
+            case total_reads_pf if (
+                    # a threshold of "unknown" disables this check
                     error_threshold != "unknown"
-                    and total_cluster_pf < error_threshold
+                    and total_reads_pf < error_threshold
                 ):
                     data["threshold"] = error_threshold
                     return QCErrorFatal(format_msg(**data), data=data)
-            case total_cluster_pf if (
+            case total_reads_pf if (
                     warning_threshold != "unknown"
-                    and total_cluster_pf < warning_threshold
+                    and total_reads_pf < warning_threshold
                 ):
                     data["threshold"] = warning_threshold
                     return QCErrorWarning(format_msg(**data), data=data)
 
     return [
         qc_report
         for lane, lane_data in qc_data.sequencing_metrics.items()
-        if (qc_report := _qualify_error(lane_data["total_cluster_pf"], lane))
+        if (qc_report := _qualify_error(lane_data["total_reads_pf"], lane))
     ]
 
@@ -40,7 +40,7 @@ def unidentified_index(
     qc_errors = []
     for lane, lane_data in qc_data.sequencing_metrics.items():
         for barcode in lane_data["top_unknown_barcodes"]:
-            significance = barcode["count"] / lane_data["total_cluster_pf"] * 100.
+            significance = barcode["count"] / lane_data["total_reads_pf"] * 100.
             if significance < significance_threshold:
                 continue
             index = (
 
@@ -0,0 +1,204 @@
+import numpy as np
+from checkQC.parsers.illumina import _read_interop_summary
+
+
+def bclconvert_test_runfolder(qc_data, runfolder_path):
+    """
+    Bundle parsed QC data with the reference values expected for the
+    HMTFYDRXX test runfolder.
+
+    The flowcell id is read from the runfolder's InterOp run info; the
+    reference values are only returned when it contains "HMTFYDRXX".
+
+    :param qc_data: object returned untouched under the "qc_data" key
+    :param runfolder_path: runfolder to read the flowcell id from
+    :returns: dict holding "qc_data" together with "expected_instrument",
+        "expected_read_length", "expected_samplesheet" and
+        "expected_sequencing_metrics"
+    :raises ValueError: if the runfolder belongs to another flowcell
+    """
+    _, _, run_info = _read_interop_summary(runfolder_path)
+    flowcell_id = run_info.flowcell_id()
+    if "HMTFYDRXX" not in flowcell_id:
+        # Fail fast: the reference values below describe one specific run.
+        raise ValueError(
+            "This function is only compatible with the run with flowcell_id: 'HMTFYDRXX', "
+            f"the supplied runfolder has flowcell_id: {flowcell_id}"
+        )
+
+    return {
+        "qc_data": qc_data,
+        "expected_instrument": "novaseq_SP",
+        "expected_read_length": 36,
+        "expected_samplesheet": {
+            "len": 4,
+            "head": [
+                {
+                    "lane": 1,
+                    "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1",
+                    "index": "GAACTGAGCG",
+                    "index2": "TCGTGGAGCG",
+                    "sample_project": "AB-1234",
+                    "overridecycles": "Y36;I10;I10",
+                    "custom_description": "LIBRARY_NAME:test",
+                },
+                {
+                    "lane": 1,
+                    "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1",
+                    "index": "AGGTCAGATA",
+                    "index2": "CTACAAGATA",
+                    "sample_project": "CD-5678",
+                    "overridecycles": "Y36;I10;I10",
+                    "custom_description": "LIBRARY_NAME:test",
+                },
+                {
+                    "lane": 2,
+                    "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2",
+                    "index": "GAACTGAGCG",
+                    "index2": "TCGTGGAGCG",
+                    "sample_project": "AB-1234",
+                    "overridecycles": "Y36;I10;I10",
+                    "custom_description": "LIBRARY_NAME:test",
+                },
+                {
+                    "lane": 2,
+                    "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2",
+                    "index": "AGGTCAGATA",
+                    "index2": "CTACAAGATA",
+                    "sample_project": "CD-5678",
+                    "overridecycles": "Y36;I10;I10",
+                    "custom_description": "LIBRARY_NAME:test",
+                },
+            ],
+        },
+        "expected_sequencing_metrics": {
+            1: {
+                "total_reads_pf": 532_464_327,
+                "total_reads": 638_337_024,
+                "raw_density": 2_961_270.5,
+                "pf_density": 2_470_118.25,
+                "yield": 122_605_416,
+                "yield_undetermined": 121_940_136,
+                "top_unknown_barcodes": {
+                    "len": 1029,
+                    "head": [
+                        {
+                            'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT',
+                            'count': 12857,
+                        },
+                        {
+                            'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT',
+                            'count': 12406,
+                        },
+                        {
+                            'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG',
+                            'count': 12177,
+                        },
+                        {
+                            'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT',
+                            'count': 11590,
+                        },
+                        {
+                            'index': 'GGTCCGCTTC', 'index2': 'CTCACACAAG',
+                            'count': 11509,
+                        },
+                    ],
+                },
+                "reads": {
+                    1: {
+                        "mean_error_rate": np.nan,
+                        "percent_q30": 95.70932006835938,
+                        "is_index": False,
+                        "mean_percent_phix_aligned": 0.,
+                    },
+                    2: {
+                        "mean_error_rate": np.nan,
+                        "percent_q30": 92.57965850830078,
+                        "is_index": True,
+                        "mean_percent_phix_aligned": np.nan,
+                    },
+                    3: {
+                        "mean_error_rate": np.nan,
+                        "percent_q30": 90.3790283203125,
+                        "is_index": True,
+                        "mean_percent_phix_aligned": np.nan,
+                    },
+                },
+                "reads_per_sample": [
+                    {
+                        "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1",
+                        "cluster_count": 9920,
+                        "percent_of_lane": 0.29,
+                        "percent_perfect_index_reads": 97.96,
+                        "mean_q30": 36.37,
+                        "percent_q30": 96,
+                    },
+                    {
+                        "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1",
+                        "cluster_count": 8560,
+                        "percent_of_lane": 0.25,
+                        "percent_perfect_index_reads": 98.15,
+                        "mean_q30": 36.43,
+                        "percent_q30": 96,
+                    },
+                ],
+            },
+            2: {
+                "total_reads_pf": 530_917_565,
+                "total_reads": 638_337_024,
+                "raw_density": 2_961_270.5,
+                "pf_density": 2_462_942.5,
+                "yield": 124_497_108,
+                "yield_undetermined": 123_817_428,
+                "top_unknown_barcodes": {
+                    "len": 1055,
+                    "head": [
+                        {
+                            'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT',
+                            'count': 13176,
+                        },
+                        {
+                            'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG',
+                            'count': 12395,
+                        },
+                        {
+                            'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT',
+                            'count': 12247,
+                        },
+                        {
+                            'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT',
+                            'count': 11909,
+                        },
+                        {
+                            'index': 'TAATTAGCGT', 'index2': 'TGGTTAAGAA',
+                            'count': 11330,
+                        },
+                    ],
+                },
+                "reads": {
+                    1: {
+                        "mean_error_rate": np.nan,
+                        "percent_q30": 95.75276184082031,
+                        "is_index": False,
+                        "mean_percent_phix_aligned": 0.,
+                    },
+                    2: {
+                        "mean_error_rate": np.nan,
+                        "percent_q30": 92.60448455810547,
+                        "is_index": True,
+                        "mean_percent_phix_aligned": np.nan,
+                    },
+                    3: {
+                        "mean_error_rate": np.nan,
+                        "percent_q30": 90.2811050415039,
+                        "is_index": True,
+                        "mean_percent_phix_aligned": np.nan,
+                    },
+                },
+                "reads_per_sample": [
+                    {
+                        "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2",
+                        "cluster_count": 10208,
+                        "percent_of_lane": 0.3,
+                        "percent_perfect_index_reads": 98.2,
+                        "mean_q30": 36.4,
+                        "percent_q30": 96,
+                    },
+                    {
+                        "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2",
+                        "cluster_count": 8672,
+                        "percent_of_lane": 0.25,
+                        "percent_perfect_index_reads": 98.29,
+                        "mean_q30": 36.48,
+                        "percent_q30": 97,
+                    },
+                ],
+            },
+        },
+    }
+
@@ -1,10 +1,10 @@
 click~=8.1.1
 PyYAML~=6.0
-interop~=1.3.2
+interop~=1.4.0
 xmltodict~=0.13.0
 tornado~=6.3.2
 sample_sheet~=0.13.0
 pandas~=2.2.2
-numpy~=1.26.4
+numpy~=2.2.4
 samshee~=0.2.3
 jsonschema~=4.23.0