From c9bb46611cddb7e384411d7c12f9073d839e9f6f Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Thu, 21 Aug 2025 09:58:27 +0200 Subject: [PATCH 01/17] Updating illumina parser to return values for projman --- checkQC/parsers/illumina.py | 61 ++++++++++++++----- tests/parsers/test_illumina_parser.py | 7 +-- .../Reports/Demultiplex_Stats.csv | 7 +++ .../SampleSheet.csv | 10 +-- 4 files changed, 62 insertions(+), 23 deletions(-) create mode 100644 tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/Reports/Demultiplex_Stats.csv diff --git a/checkQC/parsers/illumina.py b/checkQC/parsers/illumina.py index da55c69..e4de080 100644 --- a/checkQC/parsers/illumina.py +++ b/checkQC/parsers/illumina.py @@ -13,16 +13,22 @@ def from_bclconvert(cls, runfolder_path, parser_config): assert runfolder_path.is_dir() summary, index_summary = _read_interop_summary(runfolder_path) - quality_metrics = _read_quality_metrics( + quality_metrics = _read_demultiplexing_metrics( runfolder_path / parser_config["reports_location"] / "Quality_Metrics.csv" ) - top_unknown_barcodes = _read_top_unknown_barcodes( + top_unknown_barcodes = _read_demultiplexing_metrics( runfolder_path / parser_config["reports_location"] / "Top_Unknown_Barcodes.csv" ) + + demultiplex_stats = _read_demultiplexing_metrics( + runfolder_path + / parser_config["reports_location"] + / "Demultiplex_Stats.csv" + ) samplesheet = _read_samplesheet(runfolder_path) instrument, read_length = _read_run_metadata(runfolder_path) @@ -30,6 +36,11 @@ def from_bclconvert(cls, runfolder_path, parser_config): sequencing_metrics = { lane + 1: { "total_cluster_pf": summary.at(0).at(lane).reads_pf(), + "total_cluster": summary.at(0).at(lane).reads(), + "pf_clusters": summary.at(0).at(lane).cluster_count_pf().mean(), + "raw_clusters": summary.at(0).at(lane).cluster_count().mean(), + "raw_density":summary.at(0).at(lane).density().mean(), + "pf_density":summary.at(0).at(lane).density_pf().mean(), "yield": sum( int(row["Yield"]) for row in quality_metrics @@ -69,6 +80,36 @@ def from_bclconvert(cls, runfolder_path, parser_config): sample_summary := index_summary.at(lane).at(sample_no) ).sample_id(), "cluster_count": sample_summary.cluster_count(), + "percent_of_lane":next( + round(float(sample_stat["% Reads"]) * 100, 2) + for sample_stat in demultiplex_stats + if sample_stat["Lane"] == str(lane + 1) and + sample_stat["SampleID"] == sample_summary.sample_id() + ), + "percent_perfect_index_reads": next( + round(float(sample_stat["% Perfect Index Reads"]) * 100, 2) + for sample_stat in demultiplex_stats + if sample_stat["Lane"] == str(lane + 1) and + sample_stat["SampleID"] == sample_summary.sample_id() + ), + "mean_q30": next( + float(row["Mean Quality Score (PF)"]) + for row in quality_metrics + if ( + row["Lane"] == str(lane + 1) + and row["SampleID"] == sample_summary.sample_id() + ) + ), + "percent_q30": next( + float(row["% Q30"]) + for row in quality_metrics + if ( + row["Lane"] == str(lane + 1) + and row["SampleID"] == sample_summary.sample_id() + ) + ) + + } for sample_no in range(index_summary.at(lane).size()) ], @@ -107,21 +148,13 @@ def _read_interop_summary(runfolder_path): return run_summary, index_summary -def _read_quality_metrics(quality_metrics_path): +def _read_demultiplexing_metrics(metrics_path): """ - Read quality metrics file + Read demultiplexing metrics file """ - with open(quality_metrics_path, encoding="utf-8") as csvfile: + with open(metrics_path, encoding="utf-8") as csvfile: return list(csv.DictReader(csvfile)) - - -def _read_top_unknown_barcodes(top_unknown_barcodes_path): - """ - Read top unknown barcodes file - """ - with open(top_unknown_barcodes_path, encoding="utf-8") as csvfile: - return list(csv.DictReader(csvfile)) - + def _read_run_metadata(runfolder_path): """ diff --git a/tests/parsers/test_illumina_parser.py b/tests/parsers/test_illumina_parser.py index 2071cab..4c7c862 100644 --- a/tests/parsers/test_illumina_parser.py +++ b/tests/parsers/test_illumina_parser.py @@ -4,8 +4,7 @@ from checkQC.parsers.illumina import ( _read_interop_summary, - _read_quality_metrics, - _read_top_unknown_barcodes, + _read_demultiplexing_metrics, _read_run_metadata, _read_samplesheet, ) @@ -30,7 +29,7 @@ def test_read_interop_summary(runfolder_path): def test_read_quality_metrics(runfolder_path): - quality_metrics = _read_quality_metrics( + quality_metrics = _read_demultiplexing_metrics( runfolder_path / "Reports/Quality_Metrics.csv") assert len(quality_metrics) == 6 @@ -50,7 +49,7 @@ def test_read_quality_metrics(runfolder_path): def test_read_to_unknown_barcodes(runfolder_path): - top_unknown_barcodes = _read_top_unknown_barcodes( + top_unknown_barcodes = _read_demultiplexing_metrics( runfolder_path / "Reports/Top_Unknown_Barcodes.csv") assert len(top_unknown_barcodes) == 2084 diff --git a/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/Reports/Demultiplex_Stats.csv b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/Reports/Demultiplex_Stats.csv new file mode 100644 index 0000000..f8afe72 --- /dev/null +++ b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/Reports/Demultiplex_Stats.csv @@ -0,0 +1,7 @@ +Lane,SampleID,Sample_Project,Index,# Reads,# Perfect Index Reads,# One Mismatch Index Reads,# Two Mismatch Index Reads,% Reads,% Perfect Index Reads,% One Mismatch Index Reads,% Two Mismatch Index Reads +1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,AB-1234,GAACTGAGCG-TCGTGGAGCG,9920,9718,202,0,0.0029,0.9796,0.0204,0.0000 +1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,CD-5678,AGGTCAGATA-CTACAAGATA,8560,8402,158,0,0.0025,0.9815,0.0185,0.0000 +1,Undetermined,Undetermined,,3387226,3387226,0,0,0.9946,1.0000,0.0000,0.0000 +2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,AB-1234,GAACTGAGCG-TCGTGGAGCG,10208,10024,184,0,0.0030,0.9820,0.0180,0.0000 +2,Sample_14575-Qiagen-IndexSet1-SP-Lane2,CD-5678,AGGTCAGATA-CTACAAGATA,8672,8524,148,0,0.0025,0.9829,0.0171,0.0000 +2,Undetermined,Undetermined,,3439373,3439373,0,0,0.9945,1.0000,0.0000,0.0000 diff --git a/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv index abb88cc..8b8f3c4 100755 --- a/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv +++ b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv @@ -18,8 +18,8 @@ FastqCompressionFormat,gzip,, SoftwareVersion,4.1.5,, ,,, [BCLConvert_Data],,, -Lane,Sample_ID,Index,Index2,Sample_Project -1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,GAACTGAGCG,TCGTGGAGCG,AB-1234 -1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,AGGTCAGATA,CTACAAGATA,CD-5678 -2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,GAACTGAGCG,TCGTGGAGCG,AB-1234 -2,Sample_14575-Qiagen-IndexSet1-SP-Lane2,AGGTC AGATA,C TACAA GATA,CD-5678 +Lane,Sample_ID,Index,Index2,Sample_Project, Description +1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,GAACTGAGCG,TCGTGGAGCG,AB-1234,LIBRARY_NAME:test +1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,AGGTCAGATA,CTACAAGATA,CD-5678,LIBRARY_NAME:test +2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,GAACTGAGCG,TCGTGGAGCG,AB-1234,LIBRARY_NAME:test +2,Sample_14575-Qiagen-IndexSet1-SP-Lane2,AGGTC AGATA,C TACAA GATA,CD-5678,LIBRARY_NAME:test From 09aae6f0e7f0871e8e0c959056199f15cd722708 Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Wed, 27 Aug 2025 16:11:12 +0200 Subject: [PATCH 02/17] Corrected tests after adding more details in qcData sequencing_metrics --- checkQC/parsers/illumina.py | 1 - requirements/prod | 4 +-- setup.py | 19 ++++++++----- tests/parsers/test_illumina_parser.py | 1 + .../SampleSheet.csv | 2 +- tests/test_qc_data.py | 28 +++++++++++++++++++ 6 files changed, 44 insertions(+), 11 deletions(-) diff --git a/checkQC/parsers/illumina.py b/checkQC/parsers/illumina.py index e4de080..f99bf22 100644 --- a/checkQC/parsers/illumina.py +++ b/checkQC/parsers/illumina.py @@ -36,7 +36,6 @@ def from_bclconvert(cls, runfolder_path, parser_config): sequencing_metrics = { lane + 1: { "total_cluster_pf": summary.at(0).at(lane).reads_pf(), - "total_cluster": summary.at(0).at(lane).reads(), "pf_clusters": summary.at(0).at(lane).cluster_count_pf().mean(), "raw_clusters": summary.at(0).at(lane).cluster_count().mean(), "raw_density":summary.at(0).at(lane).density().mean(), diff --git a/requirements/prod b/requirements/prod index 07cc4bb..777b608 100644 --- a/requirements/prod +++ b/requirements/prod @@ -1,10 +1,10 @@ click~=8.1.1 PyYAML~=6.0 -interop~=1.3.2 +interop~=1.4.0 xmltodict~=0.13.0 tornado~=6.3.2 sample_sheet~=0.13.0 pandas~=2.2.2 -numpy~=1.26.4 +numpy~=2.2.4 samshee~=0.2.3 jsonschema~=4.23.0 diff --git a/setup.py b/setup.py index 0c36f37..7b4c546 100644 --- a/setup.py +++ b/setup.py @@ -11,14 +11,19 @@ author_email='johan.dahlberg@medsci.uu.se', url="https://www.github.com/Molmed/checkQC", download_url='https://github.com/Molmed/checkQC/archive/{}.tar.gz'.format(__version__), - python_requires='>3.10, <3.11', + python_requires='>3.10', install_requires=[ - "click", - "PyYAML>=6.0", - "interop>=1.2.4", - "xmltodict", - "tornado", - "sample_sheet"], + "click~=8.1.1", + "PyYAML~=6.0", + "interop~=1.4.0", + " xmltodict~=0.13.0", + "tornado~=6.3.2", + " sample_sheet~=0.13.0", + " pandas~=2.2.2", + "numpy~=2.2.4", + "samshee~=0.2.3", + "jsonschema~=4.23.0", + ], packages=find_packages(exclude=["tests*"]), test_suite="tests", package_data={ diff --git a/tests/parsers/test_illumina_parser.py b/tests/parsers/test_illumina_parser.py index 4c7c862..6743ef9 100644 --- a/tests/parsers/test_illumina_parser.py +++ b/tests/parsers/test_illumina_parser.py @@ -107,4 +107,5 @@ def test_read_samplesheet(runfolder_path): 'lane': 1, 'sample_id': 'Sample_14574-Qiagen-IndexSet1-SP-Lane1', 'sample_project': 'AB-1234', + 'description': 'LIBRARY_NAME:test', } diff --git a/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv index 8b8f3c4..4699910 100755 --- a/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv +++ b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv @@ -18,7 +18,7 @@ FastqCompressionFormat,gzip,, SoftwareVersion,4.1.5,, ,,, [BCLConvert_Data],,, -Lane,Sample_ID,Index,Index2,Sample_Project, Description +Lane,Sample_ID,Index,Index2,Sample_Project,Description 1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,GAACTGAGCG,TCGTGGAGCG,AB-1234,LIBRARY_NAME:test 1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,AGGTCAGATA,CTACAAGATA,CD-5678,LIBRARY_NAME:test 2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,GAACTGAGCG,TCGTGGAGCG,AB-1234,LIBRARY_NAME:test diff --git a/tests/test_qc_data.py b/tests/test_qc_data.py index 35e3dee..22a258c 100644 --- a/tests/test_qc_data.py +++ b/tests/test_qc_data.py @@ -35,6 +35,7 @@ def bclconvert_runfolder(): "index": "GAACTGAGCG", "index2": "TCGTGGAGCG", "sample_project": "AB-1234", + "description": "LIBRARY_NAME:test", }, { "lane": 1, @@ -42,6 +43,7 @@ def bclconvert_runfolder(): "index": "AGGTCAGATA", "index2": "CTACAAGATA", "sample_project": "CD-5678", + "description": "LIBRARY_NAME:test", }, { "lane": 2, @@ -49,6 +51,7 @@ def bclconvert_runfolder(): "index": "GAACTGAGCG", "index2": "TCGTGGAGCG", "sample_project": "AB-1234", + "description": "LIBRARY_NAME:test", }, { "lane": 2, @@ -56,12 +59,17 @@ def bclconvert_runfolder(): "index": "AGGTCAGATA", "index2": "CTACAAGATA", "sample_project": "CD-5678", + "description": "LIBRARY_NAME:test", }, ], }, "expected_sequencing_metrics": { 1: { "total_cluster_pf": 532_464_327, + "pf_clusters": 3_413_232.5, + "raw_clusters": 4_091_904.0, + "raw_density": 2_961_270.5, + "pf_density": 2_470_118.25, "yield": 122_605_416, "yield_undetermined": 121_940_136, "top_unknown_barcodes": { @@ -113,15 +121,27 @@ def bclconvert_runfolder(): { "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1", "cluster_count": 9920, + "percent_of_lane": 0.29, + "percent_perfect_index_reads": 97.96, + "mean_q30": 36.37, + "percent_q30": 0.96, }, { "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", "cluster_count": 8560, + "percent_of_lane": 0.25, + "percent_perfect_index_reads": 98.15, + "mean_q30": 36.43, + "percent_q30": 0.96, }, ], }, 2: { "total_cluster_pf": 530_917_565, + "pf_clusters": 3_403_318.25, + "raw_clusters": 4_091_904.0, + "raw_density": 2_961_270.5, + "pf_density": 2_462_942.5, "yield": 124_497_108, "yield_undetermined": 123_817_428, "top_unknown_barcodes": { @@ -173,10 +193,18 @@ def bclconvert_runfolder(): { "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", "cluster_count": 10208, + "percent_of_lane": 0.3, + "percent_perfect_index_reads": 98.2, + "mean_q30": 36.4, + "percent_q30": 0.96, }, { "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", "cluster_count": 8672, + "percent_of_lane": 0.25, + "percent_perfect_index_reads": 98.29, + "mean_q30": 36.48, + "percent_q30": 0.97, }, ], }, From c95b2edb1bdd3ea38ba963709120d66b467b3399 Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Wed, 3 Sep 2025 15:26:22 +0200 Subject: [PATCH 03/17] Refactored code --- checkQC/parsers/illumina.py | 4 ++-- tests/test_qc_data.py | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/checkQC/parsers/illumina.py b/checkQC/parsers/illumina.py index f99bf22..a7857c7 100644 --- a/checkQC/parsers/illumina.py +++ b/checkQC/parsers/illumina.py @@ -79,7 +79,7 @@ def from_bclconvert(cls, runfolder_path, parser_config): sample_summary := index_summary.at(lane).at(sample_no) ).sample_id(), "cluster_count": sample_summary.cluster_count(), - "percent_of_lane":next( + "percent_of_lane": next( round(float(sample_stat["% Reads"]) * 100, 2) for sample_stat in demultiplex_stats if sample_stat["Lane"] == str(lane + 1) and @@ -100,7 +100,7 @@ def from_bclconvert(cls, runfolder_path, parser_config): ) ), "percent_q30": next( - float(row["% Q30"]) + float(row["% Q30"]) * 100 for row in quality_metrics if ( row["Lane"] == str(lane + 1) diff --git a/tests/test_qc_data.py b/tests/test_qc_data.py index 22a258c..8597a6c 100644 --- a/tests/test_qc_data.py +++ b/tests/test_qc_data.py @@ -124,7 +124,7 @@ def bclconvert_runfolder(): "percent_of_lane": 0.29, "percent_perfect_index_reads": 97.96, "mean_q30": 36.37, - "percent_q30": 0.96, + "percent_q30": 96, }, { "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", @@ -132,7 +132,7 @@ def bclconvert_runfolder(): "percent_of_lane": 0.25, "percent_perfect_index_reads": 98.15, "mean_q30": 36.43, - "percent_q30": 0.96, + "percent_q30": 96, }, ], }, @@ -196,7 +196,7 @@ def bclconvert_runfolder(): "percent_of_lane": 0.3, "percent_perfect_index_reads": 98.2, "mean_q30": 36.4, - "percent_q30": 0.96, + "percent_q30": 96, }, { "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", @@ -204,7 +204,7 @@ def bclconvert_runfolder(): "percent_of_lane": 0.25, "percent_perfect_index_reads": 98.29, "mean_q30": 36.48, - "percent_q30": 0.97, + "percent_q30": 97, }, ], }, From 0ad571cf6366af5e9d38ec305ab173f738aacb69 Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Wed, 3 Sep 2025 15:31:48 +0200 Subject: [PATCH 04/17] Testing GHA with python 3.11 --- .github/workflows/unit_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index bd34ea1..ac1e4f0 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -8,10 +8,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python 3.10 + - name: Set up Python 3.11 uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: '3.11' - name: Install dependencies run: | From ca0a8c573a3f45e01b69278ec5ebe6b3819ae8ad Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Wed, 3 Sep 2025 15:33:53 +0200 Subject: [PATCH 05/17] Testing GHA with python 3.12 --- .github/workflows/unit_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index ac1e4f0..c1d3bc4 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -8,10 +8,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python 3.11 + - name: Set up Python 3.12 uses: actions/setup-python@v4 with: - python-version: '3.11' + python-version: '3.12' - name: Install dependencies run: | From a9971fa25f7b434a016314274e8bcc4faf98cd11 Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Wed, 3 Sep 2025 15:36:34 +0200 Subject: [PATCH 06/17] Testing GHA with python 3.13 --- .github/workflows/unit_tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index c1d3bc4..6ba0924 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -8,10 +8,10 @@ jobs: steps: - uses: actions/checkout@v4 - - name: Set up Python 3.12 + - name: Set up Python 3.13 uses: actions/setup-python@v4 with: - python-version: '3.12' + python-version: '3.13' - name: Install dependencies run: | From 4938558b61b95a2c67161a957ab2c74b6d109d0a Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Fri, 5 Sep 2025 07:27:29 +0200 Subject: [PATCH 07/17] Using python-versio matrix i GHA workflow --- .github/workflows/unit_tests.yml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 6ba0924..443af1f 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -5,13 +5,17 @@ on: [push, pull_request] jobs: build: runs-on: ubuntu-22.04 + strategy: + matrix: + python-version: ['3.10', '3.11', '3.12','3.13'] + name: Set up Python ${{ matrix.python-version } steps: - uses: actions/checkout@v4 - - name: Set up Python 3.13 + - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: - python-version: '3.13' + python-version: ${{ matrix.python-version } - name: Install dependencies run: | From c1c2744b7696f0ef9003c9b83c3066546679252f Mon Sep 17 00:00:00 2001 From: nkongenelly Date: Fri, 5 Sep 2025 09:12:23 +0200 Subject: [PATCH 08/17] Update .github/workflows/unit_tests.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Matilda Åslin --- .github/workflows/unit_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 443af1f..a24677b 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -8,7 +8,7 @@ jobs: strategy: matrix: python-version: ['3.10', '3.11', '3.12','3.13'] - name: Set up Python ${{ matrix.python-version } + name: Set up Python ${{ matrix.python-version }} steps: - uses: actions/checkout@v4 From 1e186865cdb3da6af009387559eb5af2732a7faf Mon Sep 17 00:00:00 2001 From: nkongenelly Date: Fri, 5 Sep 2025 09:12:33 +0200 Subject: [PATCH 09/17] Update .github/workflows/unit_tests.yml MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Matilda Åslin --- .github/workflows/unit_tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index a24677b..11041de 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -15,7 +15,7 @@ jobs: - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v4 with: - python-version: ${{ matrix.python-version } + python-version: ${{ matrix.python-version }} - name: Install dependencies run: | From ba0a7ed8043d07c62acdf8959889966657fffc4a Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Mon, 15 Sep 2025 16:11:42 +0200 Subject: [PATCH 10/17] removed pf_clusters from bclconvert sequencing metrics returned --- checkQC/parsers/illumina.py | 1 - tests/test_qc_data.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/checkQC/parsers/illumina.py b/checkQC/parsers/illumina.py index a7857c7..4e3cf5d 100644 --- a/checkQC/parsers/illumina.py +++ b/checkQC/parsers/illumina.py @@ -36,7 +36,6 @@ def from_bclconvert(cls, runfolder_path, parser_config): sequencing_metrics = { lane + 1: { "total_cluster_pf": summary.at(0).at(lane).reads_pf(), - "pf_clusters": summary.at(0).at(lane).cluster_count_pf().mean(), "raw_clusters": summary.at(0).at(lane).cluster_count().mean(), "raw_density":summary.at(0).at(lane).density().mean(), "pf_density":summary.at(0).at(lane).density_pf().mean(), diff --git a/tests/test_qc_data.py b/tests/test_qc_data.py index 8597a6c..b14432d 100644 --- a/tests/test_qc_data.py +++ b/tests/test_qc_data.py @@ -66,7 +66,6 @@ def bclconvert_runfolder(): "expected_sequencing_metrics": { 1: { "total_cluster_pf": 532_464_327, - "pf_clusters": 3_413_232.5, "raw_clusters": 4_091_904.0, "raw_density": 2_961_270.5, "pf_density": 2_470_118.25, @@ -138,7 +137,6 @@ def bclconvert_runfolder(): }, 2: { "total_cluster_pf": 530_917_565, - "pf_clusters": 3_403_318.25, "raw_clusters": 4_091_904.0, "raw_density": 2_961_270.5, "pf_density": 2_462_942.5, From 5d7a4535ce98369f6623d57e366c03069c5e39f9 Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Wed, 24 Sep 2025 17:35:13 +0200 Subject: [PATCH 11/17] Updated samplesheet v2 structure --- checkQC/parsers/illumina.py | 4 ++-- checkQC/qc_checkers/cluster_pf.py | 20 ++++++++--------- checkQC/qc_checkers/unidentified_index.py | 2 +- tests/parsers/test_illumina_parser.py | 6 ++--- tests/qc_checkers/test_cluster_pf.py | 22 +++++++++---------- tests/qc_checkers/test_unidentified_index.py | 2 +- .../SampleSheet.csv | 6 ++--- tests/test_qc_data.py | 16 +++++++------- 8 files changed, 39 insertions(+), 39 deletions(-) diff --git a/checkQC/parsers/illumina.py b/checkQC/parsers/illumina.py index 4e3cf5d..7cb444c 100644 --- a/checkQC/parsers/illumina.py +++ b/checkQC/parsers/illumina.py @@ -35,8 +35,8 @@ def from_bclconvert(cls, runfolder_path, parser_config): sequencing_metrics = { lane + 1: { - "total_cluster_pf": summary.at(0).at(lane).reads_pf(), - "raw_clusters": summary.at(0).at(lane).cluster_count().mean(), + "total_reads_pf": summary.at(0).at(lane).reads_pf(), + "total_reads": summary.at(0).at(lane).reads(), "raw_density":summary.at(0).at(lane).density().mean(), "pf_density":summary.at(0).at(lane).density_pf().mean(), "yield": sum( diff --git a/checkQC/qc_checkers/cluster_pf.py b/checkQC/qc_checkers/cluster_pf.py index 867516b..2e4eb0c 100644 --- a/checkQC/qc_checkers/cluster_pf.py +++ b/checkQC/qc_checkers/cluster_pf.py @@ -17,26 +17,26 @@ def cluster_pf( if warning_threshold != "unknown": warning_threshold = int(warning_threshold * 10**6) - def format_msg(total_cluster_pf, threshold, lane, **kwargs): - return f"Clusters PF {total_cluster_pf / 10**6}M < {threshold / 10**6}M on lane {lane}" + def format_msg(total_reads_pf, threshold, lane, **kwargs): + return f"Clusters PF {total_reads_pf / 10**6}M < {threshold / 10**6}M on lane {lane}" - def _qualify_error(total_cluster_pf, lane): + def _qualify_error(total_reads_pf, lane): data = { "lane": lane, - "total_cluster_pf": total_cluster_pf, + "total_reads_pf": total_reads_pf, "qc_checker": "cluster_pf", } - match total_cluster_pf: - case total_cluster_pf if ( + match total_reads_pf: + case total_reads_pf if ( error_threshold != "unknown" - and total_cluster_pf < error_threshold + and total_reads_pf < error_threshold ): data["threshold"] = error_threshold return QCErrorFatal(format_msg(**data), data=data) - case total_cluster_pf if ( + case total_reads_pf if ( warning_threshold != "unknown" - and total_cluster_pf < warning_threshold + and total_reads_pf < warning_threshold ): data["threshold"] = warning_threshold return QCErrorWarning(format_msg(**data), data=data) @@ -44,6 +44,6 @@ def _qualify_error(total_cluster_pf, lane): return [ qc_report for lane, lane_data in qc_data.sequencing_metrics.items() - if (qc_report := _qualify_error(lane_data["total_cluster_pf"], lane)) + if (qc_report := _qualify_error(lane_data["total_reads_pf"], lane)) ] diff --git a/checkQC/qc_checkers/unidentified_index.py b/checkQC/qc_checkers/unidentified_index.py index ce12352..c824c0d 100644 --- a/checkQC/qc_checkers/unidentified_index.py +++ b/checkQC/qc_checkers/unidentified_index.py @@ -40,7 +40,7 @@ def unidentified_index( qc_errors = [] for lane, lane_data in qc_data.sequencing_metrics.items(): for barcode in lane_data["top_unknown_barcodes"]: - significance = barcode["count"] / lane_data["total_cluster_pf"] * 100. + significance = barcode["count"] / lane_data["total_reads_pf"] * 100. if significance < significance_threshold: continue index = ( diff --git a/tests/parsers/test_illumina_parser.py b/tests/parsers/test_illumina_parser.py index 6743ef9..d8b645e 100644 --- a/tests/parsers/test_illumina_parser.py +++ b/tests/parsers/test_illumina_parser.py @@ -21,8 +21,8 @@ def runfolder_path(): def test_read_interop_summary(runfolder_path): run_summary, index_summary = _read_interop_summary(runfolder_path) - total_cluster_pf = run_summary.at(0).at(0).reads_pf() - assert total_cluster_pf == 532464327 + total_reads_pf = run_summary.at(0).at(0).reads_pf() + assert total_reads_pf == 532464327 sample_id = index_summary.at(0).at(0).sample_id() assert sample_id == "Sample_14574-Qiagen-IndexSet1-SP-Lane1" @@ -107,5 +107,5 @@ def test_read_samplesheet(runfolder_path): 'lane': 1, 'sample_id': 'Sample_14574-Qiagen-IndexSet1-SP-Lane1', 'sample_project': 'AB-1234', - 'description': 'LIBRARY_NAME:test', + 'custom_description': 'LIBRARY_NAME:test', } diff --git a/tests/qc_checkers/test_cluster_pf.py b/tests/qc_checkers/test_cluster_pf.py index e8fce39..ec613fb 100644 --- a/tests/qc_checkers/test_cluster_pf.py +++ b/tests/qc_checkers/test_cluster_pf.py @@ -8,16 +8,16 @@ def qc_data(): return namedtuple("QCData", "sequencing_metrics")( { - 1: {"total_cluster_pf": 1_000_000_000}, - 2: {"total_cluster_pf": 10_000_000}, - 3: {"total_cluster_pf": 100_000_000}, - 4: {"total_cluster_pf": 10_000_000_000}, + 1: {"total_reads_pf": 1_000_000_000}, + 2: {"total_reads_pf": 10_000_000}, + 3: {"total_reads_pf": 100_000_000}, + 4: {"total_reads_pf": 10_000_000_000}, } ) -def format_msg(total_cluster_pf, threshold, lane, **kwargs): - return f"Clusters PF {total_cluster_pf / 10**6}M < {threshold / 10**6}M on lane {lane}" +def format_msg(total_reads_pf, threshold, lane, **kwargs): + return f"Clusters PF {total_reads_pf / 10**6}M < {threshold / 10**6}M on lane {lane}" def test_cluster_pf(qc_data): @@ -34,7 +34,7 @@ def test_cluster_pf(qc_data): match lane: case 2: exp_data = { - "total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"], + "total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"], "threshold": 50_000_000, "lane": lane, "qc_checker": "cluster_pf", @@ -44,7 +44,7 @@ def test_cluster_pf(qc_data): assert report.data == exp_data case 3: exp_data = { - "total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"], + "total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"], "threshold": 500_500_000, "lane": lane, "qc_checker": "cluster_pf", @@ -69,7 +69,7 @@ def test_cluster_pf_error_unknown(qc_data): match lane: case 2: exp_data = { - "total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"], + "total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"], "threshold": 500_000_000, "lane": lane, "qc_checker": "cluster_pf", @@ -79,7 +79,7 @@ def test_cluster_pf_error_unknown(qc_data): assert report.data == exp_data case 3: exp_data = { - "total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"], + "total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"], "threshold": 500_000_000, "lane": lane, "qc_checker": "cluster_pf", @@ -104,7 +104,7 @@ def test_cluster_pf_warning_unknown(qc_data): match lane: case 2: exp_data = { - "total_cluster_pf": qc_data.sequencing_metrics[lane]["total_cluster_pf"], + "total_reads_pf": qc_data.sequencing_metrics[lane]["total_reads_pf"], "threshold": 50_000_000, "lane": lane, "qc_checker": "cluster_pf", diff --git a/tests/qc_checkers/test_unidentified_index.py b/tests/qc_checkers/test_unidentified_index.py index 5dbe481..4c81282 100644 --- a/tests/qc_checkers/test_unidentified_index.py +++ b/tests/qc_checkers/test_unidentified_index.py @@ -165,7 +165,7 @@ def qc_data(): return namedtuple("QCData", ["sequencing_metrics", "samplesheet"])( { 1: { - "total_cluster_pf": 100, + "total_reads_pf": 100, "top_unknown_barcodes": [ {"lane": 1, "index": "ACCT", "count": 10}, {"lane": 1, "index": "AC", "count": 50}, diff --git a/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv index 4699910..5f193cf 100755 --- a/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv +++ b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv @@ -4,10 +4,10 @@ Date,6/24/2020,, Application,Illumina DRAGEN COVIDSeq Test Pipeline,, Instrument Type,NovaSeq6000,, Assay,Illumina COVIDSeq Test,, -Index Adapters,IDT-ILMN DNA-RNA UDP Indexes ,, +Index Adapters,"IDT-ILMN DNA-RNA UDP Indexes ",, Chemistry,Amplicon,, ,,, -[Reads],,,,,, +[Reads],,,,, Read1Cycles,36,, Index1Cycles,10,, Index2Cycles,10,, @@ -18,7 +18,7 @@ FastqCompressionFormat,gzip,, SoftwareVersion,4.1.5,, ,,, [BCLConvert_Data],,, -Lane,Sample_ID,Index,Index2,Sample_Project,Description +Lane,Sample_ID,Index,Index2,Sample_Project,custom_Description 1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,GAACTGAGCG,TCGTGGAGCG,AB-1234,LIBRARY_NAME:test 1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,AGGTCAGATA,CTACAAGATA,CD-5678,LIBRARY_NAME:test 2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,GAACTGAGCG,TCGTGGAGCG,AB-1234,LIBRARY_NAME:test diff --git a/tests/test_qc_data.py b/tests/test_qc_data.py index b14432d..473e8a5 100644 --- a/tests/test_qc_data.py +++ b/tests/test_qc_data.py @@ -35,7 +35,7 @@ def bclconvert_runfolder(): "index": "GAACTGAGCG", "index2": "TCGTGGAGCG", "sample_project": "AB-1234", - "description": "LIBRARY_NAME:test", + "custom_description": "LIBRARY_NAME:test", }, { "lane": 1, @@ -43,7 +43,7 @@ def bclconvert_runfolder(): "index": "AGGTCAGATA", "index2": "CTACAAGATA", "sample_project": "CD-5678", - "description": "LIBRARY_NAME:test", + "custom_description": "LIBRARY_NAME:test", }, { "lane": 2, @@ -51,7 +51,7 @@ def bclconvert_runfolder(): "index": "GAACTGAGCG", "index2": "TCGTGGAGCG", "sample_project": "AB-1234", - "description": "LIBRARY_NAME:test", + "custom_description": "LIBRARY_NAME:test", }, { "lane": 2, @@ -59,14 +59,14 @@ def bclconvert_runfolder(): "index": "AGGTCAGATA", "index2": "CTACAAGATA", "sample_project": "CD-5678", - "description": "LIBRARY_NAME:test", + "custom_description": "LIBRARY_NAME:test", }, ], }, "expected_sequencing_metrics": { 1: { - "total_cluster_pf": 532_464_327, - "raw_clusters": 4_091_904.0, + "total_reads_pf": 532_464_327, + "total_reads": 638337024, "raw_density": 2_961_270.5, "pf_density": 2_470_118.25, "yield": 122_605_416, @@ -136,8 +136,8 @@ def bclconvert_runfolder(): ], }, 2: { - "total_cluster_pf": 530_917_565, - "raw_clusters": 4_091_904.0, + "total_reads_pf": 530_917_565, + "total_reads": 638337024, "raw_density": 2_961_270.5, "pf_density": 2_462_942.5, "yield": 124_497_108, From 394f852188312c72ae7cc54650a694ae293a4ef7 Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Mon, 29 Sep 2025 09:28:41 +0200 Subject: [PATCH 12/17] Updated test data format --- tests/test_qc_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_qc_data.py b/tests/test_qc_data.py index 473e8a5..0238a10 100644 --- a/tests/test_qc_data.py +++ b/tests/test_qc_data.py @@ -66,7 +66,7 @@ def bclconvert_runfolder(): "expected_sequencing_metrics": { 1: { "total_reads_pf": 532_464_327, - "total_reads": 638337024, + "total_reads": 638_337_024, "raw_density": 2_961_270.5, "pf_density": 2_470_118.25, "yield": 122_605_416, @@ -137,7 +137,7 @@ def bclconvert_runfolder(): }, 2: { "total_reads_pf": 530_917_565, - "total_reads": 638337024, + "total_reads": 638_337_024, "raw_density": 2_961_270.5, "pf_density": 2_462_942.5, "yield": 124_497_108, From e3ae2312df85918e7313c4c287621685c2d70e8a Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Mon, 6 Oct 2025 13:18:25 +0200 Subject: [PATCH 13/17] Made run_info available in qc_data and test_runfolders available in module --- checkQC/parsers/illumina.py | 4 +- checkQC/qc_data_utils.py | 191 +++++++++++++++++++++++++ tests/parsers/test_illumina_parser.py | 2 +- tests/test_qc_data.py | 193 +------------------------- 4 files changed, 196 insertions(+), 194 deletions(-) create mode 100644 checkQC/qc_data_utils.py diff --git a/checkQC/parsers/illumina.py b/checkQC/parsers/illumina.py index 7cb444c..0fe0705 100644 --- a/checkQC/parsers/illumina.py +++ b/checkQC/parsers/illumina.py @@ -12,7 +12,7 @@ def from_bclconvert(cls, runfolder_path, parser_config): runfolder_path = pathlib.Path(runfolder_path) assert runfolder_path.is_dir() - summary, index_summary = _read_interop_summary(runfolder_path) + summary, index_summary, run_info = _read_interop_summary(runfolder_path) quality_metrics = _read_demultiplexing_metrics( runfolder_path / parser_config["reports_location"] @@ -143,7 +143,7 @@ def _read_interop_summary(runfolder_path): index_summary = interop.py_interop_summary.index_flowcell_summary() interop.py_interop_summary.summarize_index_metrics(run_metrics, index_summary) - return run_summary, index_summary + return run_summary, index_summary, run_info def _read_demultiplexing_metrics(metrics_path): diff --git a/checkQC/qc_data_utils.py b/checkQC/qc_data_utils.py new file mode 100644 index 0000000..e503ecc --- /dev/null +++ b/checkQC/qc_data_utils.py @@ -0,0 +1,191 @@ +import numpy as np + + +def bclconvert_test_runfolder(qc_data): + return { + "qc_data": qc_data, + "expected_instrument": "novaseq_SP", + "expected_read_length": 36, + "expected_samplesheet": { + "len": 4, + "head": [ + { + "lane": 1, + "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1", + "index": "GAACTGAGCG", + "index2": "TCGTGGAGCG", + "sample_project": "AB-1234", + "custom_description": "LIBRARY_NAME:test", + }, + { + "lane": 1, + "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", + "index": "AGGTCAGATA", + "index2": "CTACAAGATA", + "sample_project": "CD-5678", + "custom_description": "LIBRARY_NAME:test", + }, + { + "lane": 2, + "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", + "index": "GAACTGAGCG", + "index2": "TCGTGGAGCG", + "sample_project": "AB-1234", + "custom_description": "LIBRARY_NAME:test", + }, + { + "lane": 2, + "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", + "index": "AGGTCAGATA", + "index2": "CTACAAGATA", + "sample_project": "CD-5678", + "custom_description": "LIBRARY_NAME:test", + }, + ], + }, + "expected_sequencing_metrics": { + 1: { + "total_reads_pf": 532_464_327, + "total_reads": 638_337_024, + "raw_density": 2_961_270.5, + "pf_density": 2_470_118.25, + "yield": 122_605_416, + "yield_undetermined": 121_940_136, + "top_unknown_barcodes": { + "len": 1029, + "head": [ + { + 'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT', + 'count': 12857, + }, + { + 'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT', + 'count': 12406, + }, + { + 'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG', + 'count': 12177, + }, + { + 'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT', + 'count': 11590, + }, + { + 'index': 'GGTCCGCTTC', 'index2': 'CTCACACAAG', + 'count': 11509, + }, + ], + }, + "reads": { + 1: { + "mean_error_rate": np.nan, + "percent_q30": 95.70932006835938, + "is_index": False, + "mean_percent_phix_aligned": 0., + }, + 2: { + "mean_error_rate": np.nan, + "percent_q30": 92.57965850830078, + "is_index": True, + "mean_percent_phix_aligned": np.nan, + }, + 3: { + "mean_error_rate": np.nan, + "percent_q30": 90.3790283203125, + "is_index": True, + "mean_percent_phix_aligned": np.nan, + }, + }, + "reads_per_sample": [ + { + "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1", + "cluster_count": 9920, + "percent_of_lane": 0.29, + "percent_perfect_index_reads": 97.96, + "mean_q30": 36.37, + "percent_q30": 96, + }, + { + "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", + "cluster_count": 8560, + "percent_of_lane": 0.25, + "percent_perfect_index_reads": 98.15, + "mean_q30": 36.43, + "percent_q30": 96, + }, + ], + }, + 2: { + "total_reads_pf": 530_917_565, + "total_reads": 638_337_024, + "raw_density": 2_961_270.5, + "pf_density": 2_462_942.5, + "yield": 124_497_108, + "yield_undetermined": 123_817_428, + "top_unknown_barcodes": { + "len": 1055, + "head": [ + { + 'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT', + 'count': 13176, + }, + { + 'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG', + 'count': 12395, + }, + { + 'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT', + 'count': 12247, + }, + { + 'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT', + 'count': 11909, + }, + { + 'index': 'TAATTAGCGT', 'index2': 'TGGTTAAGAA', + 'count': 11330, + }, + ], + }, + "reads": { + 1: { + "mean_error_rate": np.nan, + "percent_q30": 95.75276184082031, + "is_index": False, + "mean_percent_phix_aligned": 0., + }, + 2: { + "mean_error_rate": np.nan, + "percent_q30": 92.60448455810547, + "is_index": True, + "mean_percent_phix_aligned": np.nan, + }, + 3: { + "mean_error_rate": np.nan, + "percent_q30": 90.2811050415039, + "is_index": True, + "mean_percent_phix_aligned": np.nan, + }, + }, + "reads_per_sample": [ + { + "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", + "cluster_count": 10208, + "percent_of_lane": 0.3, + "percent_perfect_index_reads": 98.2, + "mean_q30": 36.4, + "percent_q30": 96, + }, + { + "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", + "cluster_count": 8672, + "percent_of_lane": 0.25, + "percent_perfect_index_reads": 98.29, + "mean_q30": 36.48, + "percent_q30": 97, + }, + ], + }, + }, + } + diff --git a/tests/parsers/test_illumina_parser.py b/tests/parsers/test_illumina_parser.py index d8b645e..a695e80 100644 --- a/tests/parsers/test_illumina_parser.py +++ b/tests/parsers/test_illumina_parser.py @@ -19,7 +19,7 @@ def runfolder_path(): def test_read_interop_summary(runfolder_path): - run_summary, index_summary = _read_interop_summary(runfolder_path) + run_summary, index_summary, _ = _read_interop_summary(runfolder_path) total_reads_pf = run_summary.at(0).at(0).reads_pf() assert total_reads_pf == 532464327 diff --git a/tests/test_qc_data.py b/tests/test_qc_data.py index 0238a10..753a452 100644 --- a/tests/test_qc_data.py +++ b/tests/test_qc_data.py @@ -1,11 +1,8 @@ from pathlib import Path -from unittest import mock - -import numpy as np import pytest +from checkQC.qc_data_utils import bclconvert_test_runfolder from checkQC.qc_data import QCData -from checkQC.handlers.qc_handler import QCErrorFatal, QCErrorWarning from tests.test_utils import float_eq @@ -20,195 +17,9 @@ def bclconvert_runfolder(): Path(__file__).parent / "resources/bclconvert/200624_A00834_0183_BHMTFYTINY", parser_config, ) + return bclconvert_test_runfolder(qc_data) - return { - "qc_data": qc_data, - "expected_instrument": "novaseq_SP", - "expected_read_length": 36, - "expected_samplesheet": { - "len": 4, - "head": [ - { - "lane": 1, - "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1", - "index": "GAACTGAGCG", - "index2": "TCGTGGAGCG", - "sample_project": "AB-1234", - "custom_description": "LIBRARY_NAME:test", - }, - { - "lane": 1, - "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", - "index": "AGGTCAGATA", - "index2": "CTACAAGATA", - "sample_project": "CD-5678", - "custom_description": "LIBRARY_NAME:test", - }, - { - "lane": 2, - "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", - "index": "GAACTGAGCG", - "index2": "TCGTGGAGCG", - "sample_project": "AB-1234", - "custom_description": "LIBRARY_NAME:test", - }, - { - "lane": 2, - "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", - "index": "AGGTCAGATA", - "index2": "CTACAAGATA", - "sample_project": "CD-5678", - "custom_description": "LIBRARY_NAME:test", - }, - ], - }, - "expected_sequencing_metrics": { - 1: { - "total_reads_pf": 532_464_327, - "total_reads": 638_337_024, - "raw_density": 2_961_270.5, - "pf_density": 2_470_118.25, - "yield": 122_605_416, - "yield_undetermined": 121_940_136, - "top_unknown_barcodes": { - "len": 1029, - "head": [ - { - 'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT', - 'count': 12857, - }, - { - 'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT', - 'count': 12406, - }, - { - 'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG', - 'count': 12177, - }, - { - 'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT', - 'count': 11590, - }, - { - 'index': 'GGTCCGCTTC', 'index2': 'CTCACACAAG', - 'count': 11509, - }, - ], - }, - "reads": { - 1: { - "mean_error_rate": np.nan, - "percent_q30": 95.70932006835938, - "is_index": False, - "mean_percent_phix_aligned": 0., - }, - 2: { - "mean_error_rate": np.nan, - "percent_q30": 92.57965850830078, - "is_index": True, - "mean_percent_phix_aligned": np.nan, - }, - 3: { - "mean_error_rate": np.nan, - "percent_q30": 90.3790283203125, - "is_index": True, - "mean_percent_phix_aligned": np.nan, - }, - }, - "reads_per_sample": [ - { - "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1", - "cluster_count": 9920, - "percent_of_lane": 0.29, - "percent_perfect_index_reads": 97.96, - "mean_q30": 36.37, - "percent_q30": 96, - }, - { - "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", - "cluster_count": 8560, - "percent_of_lane": 0.25, - "percent_perfect_index_reads": 98.15, - "mean_q30": 36.43, - "percent_q30": 96, - }, - ], - }, - 2: { - "total_reads_pf": 530_917_565, - "total_reads": 638_337_024, - "raw_density": 2_961_270.5, - "pf_density": 2_462_942.5, - "yield": 124_497_108, - "yield_undetermined": 123_817_428, - "top_unknown_barcodes": { - "len": 1055, - "head": [ - { - 'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT', - 'count': 13176, - }, - { - 'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG', - 'count': 12395, - }, - { - 'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT', - 'count': 12247, - }, - { - 'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT', - 'count': 11909, - }, - { - 'index': 'TAATTAGCGT', 'index2': 'TGGTTAAGAA', - 'count': 11330, - }, - ], - }, - "reads": { - 1: { - "mean_error_rate": np.nan, - "percent_q30": 95.75276184082031, - "is_index": False, - "mean_percent_phix_aligned": 0., - }, - 2: { - "mean_error_rate": np.nan, - "percent_q30": 92.60448455810547, - "is_index": True, - "mean_percent_phix_aligned": np.nan, - }, - 3: { - "mean_error_rate": np.nan, - "percent_q30": 90.2811050415039, - "is_index": True, - "mean_percent_phix_aligned": np.nan, - }, - }, - "reads_per_sample": [ - { - "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", - "cluster_count": 10208, - "percent_of_lane": 0.3, - "percent_perfect_index_reads": 98.2, - "mean_q30": 36.4, - "percent_q30": 96, - }, - { - "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", - "cluster_count": 8672, - "percent_of_lane": 0.25, - "percent_perfect_index_reads": 98.29, - "mean_q30": 36.48, - "percent_q30": 97, - }, - ], - }, - }, - } - def test_qc_data(bclconvert_runfolder): qc_data = bclconvert_runfolder["qc_data"] From 9d971deb0595f3964859d75898582fe90496d692 Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Wed, 8 Oct 2025 15:25:39 +0200 Subject: [PATCH 14/17] Added OverrideCycles in bclconvert samplesheet --- checkQC/qc_data_utils.py | 4 ++++ tests/parsers/test_illumina_parser.py | 1 + .../200624_A00834_0183_BHMTFYTINY/SampleSheet.csv | 10 +++++----- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/checkQC/qc_data_utils.py b/checkQC/qc_data_utils.py index e503ecc..4967a16 100644 --- a/checkQC/qc_data_utils.py +++ b/checkQC/qc_data_utils.py @@ -15,6 +15,7 @@ def bclconvert_test_runfolder(qc_data): "index": "GAACTGAGCG", "index2": "TCGTGGAGCG", "sample_project": "AB-1234", + "overridecycles": "Y36;I10;I10", "custom_description": "LIBRARY_NAME:test", }, { @@ -23,6 +24,7 @@ def bclconvert_test_runfolder(qc_data): "index": "AGGTCAGATA", "index2": "CTACAAGATA", "sample_project": "CD-5678", + "overridecycles": "Y36;I10;I10", "custom_description": "LIBRARY_NAME:test", }, { @@ -31,6 +33,7 @@ def bclconvert_test_runfolder(qc_data): "index": "GAACTGAGCG", "index2": "TCGTGGAGCG", "sample_project": "AB-1234", + "overridecycles": "Y36;I10;I10", "custom_description": "LIBRARY_NAME:test", }, { @@ -39,6 +42,7 @@ def bclconvert_test_runfolder(qc_data): "index": "AGGTCAGATA", "index2": "CTACAAGATA", "sample_project": "CD-5678", + "overridecycles": "Y36;I10;I10", "custom_description": "LIBRARY_NAME:test", }, ], diff --git a/tests/parsers/test_illumina_parser.py b/tests/parsers/test_illumina_parser.py index a695e80..7f10343 100644 --- a/tests/parsers/test_illumina_parser.py +++ b/tests/parsers/test_illumina_parser.py @@ -107,5 +107,6 @@ def test_read_samplesheet(runfolder_path): 'lane': 1, 'sample_id': 'Sample_14574-Qiagen-IndexSet1-SP-Lane1', 'sample_project': 'AB-1234', + "overridecycles": "Y36;I10;I10", 'custom_description': 'LIBRARY_NAME:test', } diff --git a/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv index 5f193cf..c051b24 100755 --- a/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv +++ b/tests/resources/bclconvert/200624_A00834_0183_BHMTFYTINY/SampleSheet.csv @@ -18,8 +18,8 @@ FastqCompressionFormat,gzip,, SoftwareVersion,4.1.5,, ,,, [BCLConvert_Data],,, -Lane,Sample_ID,Index,Index2,Sample_Project,custom_Description -1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,GAACTGAGCG,TCGTGGAGCG,AB-1234,LIBRARY_NAME:test -1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,AGGTCAGATA,CTACAAGATA,CD-5678,LIBRARY_NAME:test -2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,GAACTGAGCG,TCGTGGAGCG,AB-1234,LIBRARY_NAME:test -2,Sample_14575-Qiagen-IndexSet1-SP-Lane2,AGGTC AGATA,C TACAA GATA,CD-5678,LIBRARY_NAME:test +Lane,Sample_ID,Index,Index2,Sample_Project,OverrideCycles,custom_Description +1,Sample_14574-Qiagen-IndexSet1-SP-Lane1,GAACTGAGCG,TCGTGGAGCG,AB-1234,Y36;I10;I10,LIBRARY_NAME:test +1,Sample_14575-Qiagen-IndexSet1-SP-Lane1,AGGTCAGATA,CTACAAGATA,CD-5678,Y36;I10;I10,LIBRARY_NAME:test +2,Sample_14574-Qiagen-IndexSet1-SP-Lane2,GAACTGAGCG,TCGTGGAGCG,AB-1234,Y36;I10;I10,LIBRARY_NAME:test +2,Sample_14575-Qiagen-IndexSet1-SP-Lane2,AGGTC AGATA,C TACAA GATA,CD-5678,Y36;I10;I10,LIBRARY_NAME:test From a946b2ad0d922ad3f858fbae1e9e619accd19fbe Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Mon, 20 Oct 2025 16:04:04 +0200 Subject: [PATCH 15/17] Passing runfolder to qc_data_utils --- checkQC/qc_data_utils.py | 344 ++++++++++++++++++++------------------- tests/test_qc_data.py | 6 +- 2 files changed, 177 insertions(+), 173 deletions(-) diff --git a/checkQC/qc_data_utils.py b/checkQC/qc_data_utils.py index 4967a16..a79c22c 100644 --- a/checkQC/qc_data_utils.py +++ b/checkQC/qc_data_utils.py @@ -1,195 +1,199 @@ import numpy as np +from checkQC.parsers.illumina import _read_interop_summary -def bclconvert_test_runfolder(qc_data): - return { - "qc_data": qc_data, - "expected_instrument": "novaseq_SP", - "expected_read_length": 36, - "expected_samplesheet": { - "len": 4, - "head": [ - { - "lane": 1, - "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1", - "index": "GAACTGAGCG", - "index2": "TCGTGGAGCG", - "sample_project": "AB-1234", - "overridecycles": "Y36;I10;I10", - "custom_description": "LIBRARY_NAME:test", - }, - { - "lane": 1, - "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", - "index": "AGGTCAGATA", - "index2": "CTACAAGATA", - "sample_project": "CD-5678", - "overridecycles": "Y36;I10;I10", - "custom_description": "LIBRARY_NAME:test", - }, - { - "lane": 2, - "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", - "index": "GAACTGAGCG", - "index2": "TCGTGGAGCG", - "sample_project": "AB-1234", - "overridecycles": "Y36;I10;I10", - "custom_description": "LIBRARY_NAME:test", - }, - { - "lane": 2, - "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", - "index": "AGGTCAGATA", - "index2": "CTACAAGATA", - "sample_project": "CD-5678", - "overridecycles": "Y36;I10;I10", - "custom_description": "LIBRARY_NAME:test", - }, - ], - }, - "expected_sequencing_metrics": { - 1: { - "total_reads_pf": 532_464_327, - "total_reads": 638_337_024, - "raw_density": 2_961_270.5, - "pf_density": 2_470_118.25, - "yield": 122_605_416, - "yield_undetermined": 121_940_136, - "top_unknown_barcodes": { - "len": 1029, - "head": [ - { - 'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT', - 'count': 12857, - }, - { - 'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT', - 'count': 12406, - }, - { - 'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG', - 'count': 12177, - }, - { - 'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT', - 'count': 11590, - }, - { - 'index': 'GGTCCGCTTC', 'index2': 'CTCACACAAG', - 'count': 11509, - }, - ], - }, - "reads": { - 1: { - "mean_error_rate": np.nan, - "percent_q30": 95.70932006835938, - "is_index": False, - "mean_percent_phix_aligned": 0., - }, - 2: { - "mean_error_rate": np.nan, - "percent_q30": 92.57965850830078, - "is_index": True, - "mean_percent_phix_aligned": np.nan, - }, - 3: { - "mean_error_rate": np.nan, - "percent_q30": 90.3790283203125, - "is_index": True, - "mean_percent_phix_aligned": np.nan, - }, - }, - "reads_per_sample": [ +def bclconvert_test_runfolder(qc_data, runfolder_path): + _, _, run_info = _read_interop_summary(runfolder_path) + flowcell_id = run_info.flowcell_id() + if "HMTFYDRXX" in flowcell_id: + return { + "qc_data": qc_data, + "expected_instrument": "novaseq_SP", + "expected_read_length": 36, + "expected_samplesheet": { + "len": 4, + "head": [ { + "lane": 1, "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1", - "cluster_count": 9920, - "percent_of_lane": 0.29, - "percent_perfect_index_reads": 97.96, - "mean_q30": 36.37, - "percent_q30": 96, + "index": "GAACTGAGCG", + "index2": "TCGTGGAGCG", + "sample_project": "AB-1234", + "overridecycles": "Y36;I10;I10", + "custom_description": "LIBRARY_NAME:test", }, { + "lane": 1, "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", - "cluster_count": 8560, - "percent_of_lane": 0.25, - "percent_perfect_index_reads": 98.15, - "mean_q30": 36.43, - "percent_q30": 96, + "index": "AGGTCAGATA", + "index2": "CTACAAGATA", + "sample_project": "CD-5678", + "overridecycles": "Y36;I10;I10", + "custom_description": "LIBRARY_NAME:test", + }, + { + "lane": 2, + "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", + "index": "GAACTGAGCG", + "index2": "TCGTGGAGCG", + "sample_project": "AB-1234", + "overridecycles": "Y36;I10;I10", + "custom_description": "LIBRARY_NAME:test", + }, + { + "lane": 2, + "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", + "index": "AGGTCAGATA", + "index2": "CTACAAGATA", + "sample_project": "CD-5678", + "overridecycles": "Y36;I10;I10", + "custom_description": "LIBRARY_NAME:test", }, ], }, - 2: { - "total_reads_pf": 530_917_565, - "total_reads": 638_337_024, - "raw_density": 2_961_270.5, - "pf_density": 2_462_942.5, - "yield": 124_497_108, - "yield_undetermined": 123_817_428, - "top_unknown_barcodes": { - "len": 1055, - "head": [ - { - 'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT', - 'count': 13176, + "expected_sequencing_metrics": { + 1: { + "total_reads_pf": 532_464_327, + "total_reads": 638_337_024, + "raw_density": 2_961_270.5, + "pf_density": 2_470_118.25, + "yield": 122_605_416, + "yield_undetermined": 121_940_136, + "top_unknown_barcodes": { + "len": 1029, + "head": [ + { + 'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT', + 'count': 12857, + }, + { + 'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT', + 'count': 12406, + }, + { + 'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG', + 'count': 12177, + }, + { + 'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT', + 'count': 11590, + }, + { + 'index': 'GGTCCGCTTC', 'index2': 'CTCACACAAG', + 'count': 11509, + }, + ], + }, + "reads": { + 1: { + "mean_error_rate": np.nan, + "percent_q30": 95.70932006835938, + "is_index": False, + "mean_percent_phix_aligned": 0., }, - { - 'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG', - 'count': 12395, + 2: { + "mean_error_rate": np.nan, + "percent_q30": 92.57965850830078, + "is_index": True, + "mean_percent_phix_aligned": np.nan, }, - { - 'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT', - 'count': 12247, + 3: { + "mean_error_rate": np.nan, + "percent_q30": 90.3790283203125, + "is_index": True, + "mean_percent_phix_aligned": np.nan, }, + }, + "reads_per_sample": [ { - 'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT', - 'count': 11909, + "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane1", + "cluster_count": 9920, + "percent_of_lane": 0.29, + "percent_perfect_index_reads": 97.96, + "mean_q30": 36.37, + "percent_q30": 96, }, { - 'index': 'TAATTAGCGT', 'index2': 'TGGTTAAGAA', - 'count': 11330, + "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane1", + "cluster_count": 8560, + "percent_of_lane": 0.25, + "percent_perfect_index_reads": 98.15, + "mean_q30": 36.43, + "percent_q30": 96, }, ], }, - "reads": { - 1: { - "mean_error_rate": np.nan, - "percent_q30": 95.75276184082031, - "is_index": False, - "mean_percent_phix_aligned": 0., - }, - 2: { - "mean_error_rate": np.nan, - "percent_q30": 92.60448455810547, - "is_index": True, - "mean_percent_phix_aligned": np.nan, + 2: { + "total_reads_pf": 530_917_565, + "total_reads": 638_337_024, + "raw_density": 2_961_270.5, + "pf_density": 2_462_942.5, + "yield": 124_497_108, + "yield_undetermined": 123_817_428, + "top_unknown_barcodes": { + "len": 1055, + "head": [ + { + 'index': 'ATATCTGCTT', 'index2': 'TAGACAATCT', + 'count': 13176, + }, + { + 'index': 'ATGTAACGTT', 'index2': 'ACGATTGCTG', + 'count': 12395, + }, + { + 'index': 'CACCTCTCTT', 'index2': 'CTCGACTCCT', + 'count': 12247, + }, + { + 'index': 'TTCGGTGTGA', 'index2': 'GAACAAGTAT', + 'count': 11909, + }, + { + 'index': 'TAATTAGCGT', 'index2': 'TGGTTAAGAA', + 'count': 11330, + }, + ], }, - 3: { - "mean_error_rate": np.nan, - "percent_q30": 90.2811050415039, - "is_index": True, - "mean_percent_phix_aligned": np.nan, + "reads": { + 1: { + "mean_error_rate": np.nan, + "percent_q30": 95.75276184082031, + "is_index": False, + "mean_percent_phix_aligned": 0., + }, + 2: { + "mean_error_rate": np.nan, + "percent_q30": 92.60448455810547, + "is_index": True, + "mean_percent_phix_aligned": np.nan, + }, + 3: { + "mean_error_rate": np.nan, + "percent_q30": 90.2811050415039, + "is_index": True, + "mean_percent_phix_aligned": np.nan, + }, }, + "reads_per_sample": [ + { + "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", + "cluster_count": 10208, + "percent_of_lane": 0.3, + "percent_perfect_index_reads": 98.2, + "mean_q30": 36.4, + "percent_q30": 96, + }, + { + "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", + "cluster_count": 8672, + "percent_of_lane": 0.25, + "percent_perfect_index_reads": 98.29, + "mean_q30": 36.48, + "percent_q30": 97, + }, + ], }, - "reads_per_sample": [ - { - "sample_id": "Sample_14574-Qiagen-IndexSet1-SP-Lane2", - "cluster_count": 10208, - "percent_of_lane": 0.3, - "percent_perfect_index_reads": 98.2, - "mean_q30": 36.4, - "percent_q30": 96, - }, - { - "sample_id": "Sample_14575-Qiagen-IndexSet1-SP-Lane2", - "cluster_count": 8672, - "percent_of_lane": 0.25, - "percent_perfect_index_reads": 98.29, - "mean_q30": 36.48, - "percent_q30": 97, - }, - ], }, - }, - } + } diff --git a/tests/test_qc_data.py b/tests/test_qc_data.py index 753a452..79776b0 100644 --- a/tests/test_qc_data.py +++ b/tests/test_qc_data.py @@ -12,12 +12,12 @@ def bclconvert_runfolder(): parser_config = { "reports_location": "Reports" } - + runfolder_path = Path(__file__).parent / f"resources/bclconvert/200624_A00834_0183_BHMTFYTINY" qc_data = QCData.from_bclconvert( - Path(__file__).parent / "resources/bclconvert/200624_A00834_0183_BHMTFYTINY", + runfolder_path, parser_config, ) - return bclconvert_test_runfolder(qc_data) + return bclconvert_test_runfolder(qc_data, runfolder_path) From 861ea510b4738b1d58f745b8b9c137c914f81b19 Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Wed, 22 Oct 2025 15:27:00 +0200 Subject: [PATCH 16/17] Added exception for bclconvert_test_runfolder --- checkQC/qc_data_utils.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/checkQC/qc_data_utils.py b/checkQC/qc_data_utils.py index a79c22c..2c3a9bb 100644 --- a/checkQC/qc_data_utils.py +++ b/checkQC/qc_data_utils.py @@ -196,4 +196,8 @@ def bclconvert_test_runfolder(qc_data, runfolder_path): }, }, } + else: + raise Exception("Excpected flowcell_id value as 'HMTFYDRXX' only for " + f"this fuction but got {flowcell_id}" + ) From 18b6e7e047aa73d6d8f641c35559e9ab1de38a80 Mon Sep 17 00:00:00 2001 From: nelnk861 Date: Thu, 23 Oct 2025 17:01:17 +0200 Subject: [PATCH 17/17] Refactoring code --- checkQC/qc_data_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/checkQC/qc_data_utils.py b/checkQC/qc_data_utils.py index 2c3a9bb..55f7456 100644 --- a/checkQC/qc_data_utils.py +++ b/checkQC/qc_data_utils.py @@ -197,7 +197,8 @@ def bclconvert_test_runfolder(qc_data, runfolder_path): }, } else: - raise Exception("Excpected flowcell_id value as 'HMTFYDRXX' only for " - f"this fuction but got {flowcell_id}" + raise Exception( + "This function is only compatible with the run with flowcell_id: 'HMTFYDRXX', " + f"the supplied runfolder has flowcell_id: {flowcell_id}" )