diff --git a/CHANGELOG.md b/CHANGELOG.md
index 208aa1061..6d9c6e2ea 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Tests for call_repeat_expansions and qc_bam subworkflows [#713](https://github.com/nf-core/raredisease/pull/713)
- Feature to subsample mitochondrial alignments based on number of reads [#748](https://github.com/nf-core/raredisease/pull/748)
- Functionality to generate coverage information using Sambamba depth [#752](https://github.com/nf-core/raredisease/pull/752)
+- Added GATK contamination check for WES/WGS samples as complement to VerifyBamID2 [#758](https://github.com/nf-core/raredisease/pull/758)
+- GATK Contamination results displayed in MultiQC with color-coded thresholds [#758](https://github.com/nf-core/raredisease/pull/758)
### `Changed`
@@ -31,9 +33,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
### Parameters
-| Old parameter | New parameter |
-| ------------- | ---------------- |
-| | sambamba_regions |
+| Old parameter | New parameter |
+| ------------- | ----------------------- |
+| | sambamba_regions |
+| | run_contamination |
+| | contamination_sites |
+| | contamination_sites_tbi |
### Tool updates
@@ -48,6 +53,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
| gens-preproc | 1.0.11 | 1.1.2 |
| samtools (sort & view) | 1.21 | 1.22.1 |
| sambamba | | 1.0.1 |
+| gatk4/calculatecontamination | | 4.6.2.0 |
+| gatk4/getpileupsummaries | | 4.6.2.0 |
## 2.6.0 - Cacofonix [2025-06-25]
diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml
index 034fb00ed..bd604186a 100644
--- a/assets/multiqc_config.yml
+++ b/assets/multiqc_config.yml
@@ -6,7 +6,6 @@ report_comment: >
target="_blank">nf-core/raredisease analysis pipeline. For information about
how to interpret these results, please see the documentation.
-
report_section_order:
"nf-core-raredisease-methods-description":
order: -1000
@@ -14,9 +13,9 @@ report_section_order:
order: -1001
"nf-core-raredisease-summary":
order: -1002
-
+ gatk_contamination:
+ order: 1050
export_plots: true
-
run_modules:
- fastqc
- fastp
@@ -28,7 +27,6 @@ run_modules:
- peddy
- verifybamid
- custom_content
-
module_order:
- fastqc:
name: "FastQC"
@@ -51,8 +49,52 @@ module_order:
- verifybamid:
name: "VerifyBamID2"
+# Custom content configuration for GATK contamination
+custom_data:
+ gatk_contamination:
+ id: "gatk_contamination"
+ section_name: "GATK Contamination"
+ description: "Sample contamination estimates from GATK CalculateContamination based on common variant allele frequencies"
+ plot_type: "generalstats"
+ pconfig:
+ contamination_pct:
+ title: "Contamination"
+ description: "Estimated sample contamination percentage"
+ max: 10
+ min: 0
+ scale: "RdYlGn-rev"
+ suffix: "%"
+ format: "{:,.2f}"
+ shared_key: "contamination"
+
+# Make contamination visible in general stats by default
+table_columns_visible:
+ gatk_contamination:
+ contamination_pct: true
+
+# Color coding thresholds for contamination
+table_cond_formatting_rules:
+ contamination_pct:
+ pass:
+ - s_eq: "pass"
+ - lt: 2.0
+ warn:
+ - s_eq: "warn"
+ - lt: 5.0
+ - gte: 2.0
+ fail:
+ - s_eq: "fail"
+ - gte: 5.0
+
+# Add to General Statistics table configuration
+table_columns_placement:
+ gatk_contamination:
+ contamination_pct: 900
+
extra_fn_clean_exts:
- "_sorted_md"
+ - "_contamination"
+ - "_pileups"
- type: regex
pattern: "_LNUMBER[0-9]{1,}"
diff --git a/conf/modules/contamination.config b/conf/modules/contamination.config
new file mode 100644
index 000000000..f07ebeca2
--- /dev/null
+++ b/conf/modules/contamination.config
@@ -0,0 +1,46 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ Config file for GATK contamination checking modules
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+*/
+
+process {
+
+ //
+ // GATK GetPileupSummaries
+ //
+ withName: '.*:CONTAMINATION_CHECK:GATK4_GETPILEUPSUMMARIES' {
+ ext.args = ''
+ ext.prefix = { "${meta.id}_pileups" }
+ publishDir = [
+ path: { "${params.outdir}/qc/contamination/pileups" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ //
+ // GATK CalculateContamination
+ //
+ withName: '.*:CONTAMINATION_CHECK:GATK4_CALCULATECONTAMINATION' {
+ ext.args = ''
+ ext.prefix = { "${meta.id}_contamination" }
+ publishDir = [
+ path: { "${params.outdir}/qc/contamination" },
+ mode: params.publish_dir_mode,
+ saveAs: { filename -> filename.equals('versions.yml') ? null : filename }
+ ]
+ }
+
+ //
+ // Parse contamination results for MultiQC
+ //
+ withName: '.*:RAREDISEASE:PARSE_CONTAMINATION' {
+ ext.prefix = { "${meta.id}_contamination" }
+ publishDir = [
+ path: { "${params.outdir}/multiqc" },
+ mode: params.publish_dir_mode,
+ pattern: '*_mqc.tsv'
+ ]
+ }
+}
diff --git a/modules.json b/modules.json
index 56535bc9b..be7408569 100644
--- a/modules.json
+++ b/modules.json
@@ -257,6 +257,16 @@
"git_sha": "666652151335353eef2fcd58880bcef5bc2928e1",
"installed_by": ["modules"]
},
+ "gatk4/calculatecontamination": {
+ "branch": "master",
+ "git_sha": "ae8cd884f895585c6799ab4eb6a2c9f44df03336",
+ "installed_by": ["modules"]
+ },
+ "gatk4/getpileupsummaries": {
+ "branch": "master",
+ "git_sha": "ae8cd884f895585c6799ab4eb6a2c9f44df03336",
+ "installed_by": ["modules"]
+ },
"gawk": {
"branch": "master",
"git_sha": "5ee4d69ed992c3ce81cfbbdd0bef932fcb81c75a",
diff --git a/modules/local/parse_contamination/bin/parse_contamination.py b/modules/local/parse_contamination/bin/parse_contamination.py
new file mode 100755
index 000000000..5f5b3f1e0
--- /dev/null
+++ b/modules/local/parse_contamination/bin/parse_contamination.py
@@ -0,0 +1,40 @@
+#!/usr/bin/env python3
+"""Parse GATK CalculateContamination output for MultiQC."""
+
+import argparse
+
+
+def parse_contamination(contamination_table, sample_id, prefix):
+ """Parse contamination table and write MultiQC output."""
+ with open(contamination_table, 'r') as f:
+ lines = f.readlines()
+ data_line = lines[1].strip().split('\t')
+ contamination = float(data_line[1])
+ contamination_pct = contamination * 100
+
+ with open(f"{prefix}_contamination_mqc.tsv", 'w') as out:
+ out.write("# id: 'gatk_contamination'\n")
+ out.write("# section_name: 'GATK Contamination'\n")
+ out.write("# description: 'Sample contamination estimates from GATK CalculateContamination'\n")
+ out.write("# plot_type: 'generalstats'\n")
+ out.write("# pconfig:\n")
+ out.write("# contamination_pct:\n")
+ out.write("# title: 'Contamination'\n")
+ out.write("# description: 'Estimated sample contamination percentage'\n")
+ out.write("# max: 10\n")
+ out.write("# min: 0\n")
+ out.write("# scale: 'RdYlGn-rev'\n")
+ out.write("# suffix: '%'\n")
+ out.write("# format: '{:,.2f}'\n")
+ out.write("Sample\tcontamination_pct\n")
+ out.write(f"{sample_id}\t{contamination_pct:.4f}\n")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Parse GATK contamination output for MultiQC")
+ parser.add_argument("--input", required=True, help="GATK contamination table")
+ parser.add_argument("--sample_id", required=True, help="Sample ID")
+ parser.add_argument("--prefix", required=True, help="Output prefix")
+ args = parser.parse_args()
+
+ parse_contamination(args.input, args.sample_id, args.prefix)
diff --git a/modules/local/parse_contamination/environment.yml b/modules/local/parse_contamination/environment.yml
new file mode 100644
index 000000000..b5260d32d
--- /dev/null
+++ b/modules/local/parse_contamination/environment.yml
@@ -0,0 +1,7 @@
+name: parse_contamination
+channels:
+ - conda-forge
+ - bioconda
+ - defaults
+dependencies:
+ - python=3.11
diff --git a/modules/local/parse_contamination/main.nf b/modules/local/parse_contamination/main.nf
new file mode 100644
index 000000000..d8be68d13
--- /dev/null
+++ b/modules/local/parse_contamination/main.nf
@@ -0,0 +1,44 @@
+process PARSE_CONTAMINATION {
+ tag "$meta.id"
+ label 'process_single'
+
+ conda "${moduleDir}/environment.yml"
+ container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+ 'https://depot.galaxyproject.org/singularity/python:3.11' :
+ 'biocontainers/python:3.11' }"
+
+ input:
+ tuple val(meta), path(contamination_table)
+
+ output:
+ tuple val(meta), path("*_contamination_mqc.tsv"), emit: mqc_table
+ path "versions.yml" , emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ python3 ${moduleDir}/bin/parse_contamination.py \\
+ --input ${contamination_table} \\
+ --sample_id ${meta.id} \\
+ --prefix ${prefix}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ python: \$(python --version | sed 's/Python //')
+ END_VERSIONS
+ """
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ touch ${prefix}_contamination_mqc.tsv
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ python: \$(python --version | sed 's/Python //')
+ END_VERSIONS
+ """
+}
diff --git a/modules/local/parse_contamination/meta.yml b/modules/local/parse_contamination/meta.yml
new file mode 100644
index 000000000..6018fd2df
--- /dev/null
+++ b/modules/local/parse_contamination/meta.yml
@@ -0,0 +1,29 @@
+name: parse_contamination
+description: Parse GATK CalculateContamination output for MultiQC
+keywords:
+ - contamination
+ - MultiQC
+ - parsing
+tools:
+ - python:
+ description: Python programming language
+ homepage: https://www.python.org/
+input:
+ - meta:
+ type: map
+ description: Groovy Map containing sample information
+ - contamination_table:
+ type: file
+ description: GATK CalculateContamination output table
+ pattern: "*.contamination.table"
+output:
+ - mqc_table:
+ type: file
+ description: MultiQC custom content table
+ pattern: "*_contamination_mqc.tsv"
+ - versions:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "56053vujinovic"
diff --git a/modules/nf-core/gatk4/calculatecontamination/environment.yml b/modules/nf-core/gatk4/calculatecontamination/environment.yml
new file mode 100644
index 000000000..67e0eb860
--- /dev/null
+++ b/modules/nf-core/gatk4/calculatecontamination/environment.yml
@@ -0,0 +1,10 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ # renovate: datasource=conda depName=bioconda/gatk4
+ - bioconda::gatk4=4.6.2.0
+ # renovate: datasource=conda depName=bioconda/gcnvkernel
+ - bioconda::gcnvkernel=0.9
diff --git a/modules/nf-core/gatk4/calculatecontamination/main.nf b/modules/nf-core/gatk4/calculatecontamination/main.nf
new file mode 100644
index 000000000..0b8706e76
--- /dev/null
+++ b/modules/nf-core/gatk4/calculatecontamination/main.nf
@@ -0,0 +1,59 @@
+process GATK4_CALCULATECONTAMINATION {
+ tag "${meta.id}"
+ label 'process_low'
+
+ conda "${moduleDir}/environment.yml"
+ container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+ ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ce/ced519873646379e287bc28738bdf88e975edd39a92e7bc6a34bccd37153d9d0/data'
+ : 'community.wave.seqera.io/library/gatk4_gcnvkernel:edb12e4f0bf02cd3'}"
+
+ input:
+ tuple val(meta), path(pileup), path(matched)
+
+ output:
+ tuple val(meta), path('*.contamination.table'), emit: contamination
+ tuple val(meta), path('*.segmentation.table'), emit: segmentation, optional: true
+ path "versions.yml", emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ def matched_command = matched ? "--matched-normal ${matched}" : ''
+
+ def avail_mem = 3072
+ if (!task.memory) {
+ log.info('[GATK CalculateContamination] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.')
+ }
+ else {
+ avail_mem = (task.memory.mega * 0.8).intValue()
+ }
+ """
+ gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\
+ CalculateContamination \\
+ --input ${pileup} \\
+ --output ${prefix}.contamination.table \\
+ ${matched_command} \\
+ --tmp-dir . \\
+ ${args}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+ END_VERSIONS
+ """
+
+ stub:
+ prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ touch ${prefix}.contamination.table
+ touch ${prefix}.segmentation.table
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+ END_VERSIONS
+ """
+}
diff --git a/modules/nf-core/gatk4/calculatecontamination/meta.yml b/modules/nf-core/gatk4/calculatecontamination/meta.yml
new file mode 100644
index 000000000..477e9d3b1
--- /dev/null
+++ b/modules/nf-core/gatk4/calculatecontamination/meta.yml
@@ -0,0 +1,76 @@
+name: gatk4_calculatecontamination
+description: |
+ Calculates the fraction of reads from cross-sample contamination based on summary tables from getpileupsummaries. Output to be used with filtermutectcalls.
+keywords:
+ - gatk4
+ - calculatecontamination
+ - cross-samplecontamination
+ - getpileupsummaries
+ - filtermutectcalls
+tools:
+ - gatk4:
+ description: |
+ Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+ with a primary focus on variant discovery and genotyping. Its powerful processing engine
+ and high-performance computing features make it capable of taking on projects of any size.
+ homepage: https://gatk.broadinstitute.org/hc/en-us
+ documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s
+ doi: 10.1158/1538-7445.AM2017-3590
+ licence: ["Apache-2.0"]
+ identifier: ""
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test' ]
+ - pileup:
+ type: file
+ description: File containing the pileups summary table of a tumor sample to
+ be used to calculate contamination.
+ pattern: "*.pileups.table"
+ ontologies: []
+ - matched:
+ type: file
+ description: File containing the pileups summary table of a normal sample that
+ matches with the tumor sample specified in pileup argument. This is an optional
+ input.
+ pattern: "*.pileups.table"
+ ontologies: []
+output:
+ contamination:
+ - - meta:
+ type: file
+ description: File containing the contamination table.
+ pattern: "*.contamination.table"
+ ontologies: []
+ - "*.contamination.table":
+ type: file
+ description: File containing the contamination table.
+ pattern: "*.contamination.table"
+ ontologies: []
+ segmentation:
+ - - meta:
+ type: file
+ description: File containing the contamination table.
+ pattern: "*.contamination.table"
+ ontologies: []
+ - "*.segmentation.table":
+ type: file
+ description: output table containing segmentation of tumor minor allele fractions
+ (optional)
+ pattern: "*.segmentation.table"
+ ontologies: []
+ versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+ ontologies:
+ - edam: http://edamontology.org/format_3750 # YAML
+authors:
+ - "@GCJMackenzie"
+ - "@maxulysse"
+maintainers:
+ - "@GCJMackenzie"
+ - "@maxulysse"
diff --git a/modules/nf-core/gatk4/calculatecontamination/tests/main.nf.test b/modules/nf-core/gatk4/calculatecontamination/tests/main.nf.test
new file mode 100644
index 000000000..81f048f67
--- /dev/null
+++ b/modules/nf-core/gatk4/calculatecontamination/tests/main.nf.test
@@ -0,0 +1,106 @@
+nextflow_process {
+
+ name "Test Process GATK4_CALCULATECONTAMINATION"
+ script "../main.nf"
+ process "GATK4_CALCULATECONTAMINATION"
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "gatk4"
+ tag "gatk4/calculatecontamination"
+
+ test("human - pileup-table") {
+
+ when {
+ process {
+ """
+ input[0] = [ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/test2.pileups.table', checkIfExists: true),
+ [] ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out.versions).match("versions") },
+ { assert snapshot(file(process.out.contamination.get(0).get(1)).readLines()[0]).match() }
+ )
+ }
+
+ }
+
+ test("human - pileup-table - matched-pair") {
+
+ when {
+ process {
+ """
+ input[0] = [ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/test2.pileups.table', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/test.pileups.table', checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out.versions).match("versions_pair") },
+ { assert snapshot(file(process.out.contamination.get(0).get(1)).readLines()[0]).match() }
+ )
+ }
+
+ }
+
+ test("human - pileup-table - segmentation") {
+
+ config "./nextflow.config"
+
+ when {
+ process {
+ """
+ input[0] = [ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/test2.pileups.table', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/test.pileups.table', checkIfExists: true)
+ ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out.versions).match("versions_segmentation") },
+ { assert snapshot(file(process.out.contamination.get(0).get(1)).readLines()[0]).match("contamination") },
+ { assert snapshot(file(process.out.segmentation.get(0).get(1)).readLines()[0]).match("segmentation") }
+ )
+ }
+
+ }
+
+ test("human - pileup-table - stub") {
+
+ options "-stub"
+
+ when {
+ process {
+ """
+ input[0] = [ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/test2.pileups.table', checkIfExists: true),
+ [] ]
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+}
diff --git a/modules/nf-core/gatk4/calculatecontamination/tests/main.nf.test.snap b/modules/nf-core/gatk4/calculatecontamination/tests/main.nf.test.snap
new file mode 100644
index 000000000..4ef184bfc
--- /dev/null
+++ b/modules/nf-core/gatk4/calculatecontamination/tests/main.nf.test.snap
@@ -0,0 +1,127 @@
+{
+ "versions_pair": {
+ "content": [
+ [
+ "versions.yml:md5,703ef3c3104ffcb977090771a8761da2"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "25.04.7"
+ },
+ "timestamp": "2025-09-15T12:29:33.077296321"
+ },
+ "versions": {
+ "content": [
+ [
+ "versions.yml:md5,703ef3c3104ffcb977090771a8761da2"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "25.04.7"
+ },
+ "timestamp": "2025-09-15T12:29:20.224234796"
+ },
+ "versions_segmentation": {
+ "content": [
+ [
+ "versions.yml:md5,703ef3c3104ffcb977090771a8761da2"
+ ]
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "25.04.7"
+ },
+ "timestamp": "2025-09-15T12:29:45.806413439"
+ },
+ "segmentation": {
+ "content": [
+ "#SAMPLE=tumour"
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "23.10.0"
+ },
+ "timestamp": "2024-05-15T12:57:38.845287"
+ },
+ "human - pileup-table": {
+ "content": [
+ "sample\tcontamination\terror"
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "23.10.0"
+ },
+ "timestamp": "2024-05-15T11:09:16.292509"
+ },
+ "human - pileup-table - stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.contamination.table:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "1": [
+ [
+ {
+ "id": "test"
+ },
+ "test.segmentation.table:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "2": [
+ "versions.yml:md5,703ef3c3104ffcb977090771a8761da2"
+ ],
+ "contamination": [
+ [
+ {
+ "id": "test"
+ },
+ "test.contamination.table:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "segmentation": [
+ [
+ {
+ "id": "test"
+ },
+ "test.segmentation.table:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,703ef3c3104ffcb977090771a8761da2"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "25.04.7"
+ },
+ "timestamp": "2025-09-15T12:29:55.994434812"
+ },
+ "human - pileup-table - matched-pair": {
+ "content": [
+ "sample\tcontamination\terror"
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "23.10.0"
+ },
+ "timestamp": "2024-05-15T11:09:26.408848"
+ },
+ "contamination": {
+ "content": [
+ "sample\tcontamination\terror"
+ ],
+ "meta": {
+ "nf-test": "0.8.4",
+ "nextflow": "23.10.0"
+ },
+ "timestamp": "2024-05-15T12:57:38.840651"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/gatk4/calculatecontamination/tests/nextflow.config b/modules/nf-core/gatk4/calculatecontamination/tests/nextflow.config
new file mode 100644
index 000000000..db836b7f8
--- /dev/null
+++ b/modules/nf-core/gatk4/calculatecontamination/tests/nextflow.config
@@ -0,0 +1,5 @@
+process {
+ withName: GATK4_CALCULATECONTAMINATION {
+ ext.args = { "--tumor-segmentation ${meta.id}.segmentation.table" }
+ }
+}
diff --git a/modules/nf-core/gatk4/getpileupsummaries/environment.yml b/modules/nf-core/gatk4/getpileupsummaries/environment.yml
new file mode 100644
index 000000000..67e0eb860
--- /dev/null
+++ b/modules/nf-core/gatk4/getpileupsummaries/environment.yml
@@ -0,0 +1,10 @@
+---
+# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json
+channels:
+ - conda-forge
+ - bioconda
+dependencies:
+ # renovate: datasource=conda depName=bioconda/gatk4
+ - bioconda::gatk4=4.6.2.0
+ # renovate: datasource=conda depName=bioconda/gcnvkernel
+ - bioconda::gcnvkernel=0.9
diff --git a/modules/nf-core/gatk4/getpileupsummaries/main.nf b/modules/nf-core/gatk4/getpileupsummaries/main.nf
new file mode 100644
index 000000000..d0cabdaf8
--- /dev/null
+++ b/modules/nf-core/gatk4/getpileupsummaries/main.nf
@@ -0,0 +1,65 @@
+process GATK4_GETPILEUPSUMMARIES {
+ tag "${meta.id}"
+ label 'process_low'
+
+ conda "${moduleDir}/environment.yml"
+ container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container
+ ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ce/ced519873646379e287bc28738bdf88e975edd39a92e7bc6a34bccd37153d9d0/data'
+ : 'community.wave.seqera.io/library/gatk4_gcnvkernel:edb12e4f0bf02cd3'}"
+
+ input:
+ tuple val(meta), path(input), path(index), path(intervals)
+ tuple val(meta2), path(fasta)
+ tuple val(meta3), path(fai)
+ tuple val(meta4), path(dict)
+ path variants
+ path variants_tbi
+
+ output:
+ tuple val(meta), path('*.pileups.table'), emit: table
+ path "versions.yml", emit: versions
+
+ when:
+ task.ext.when == null || task.ext.when
+
+ script:
+ def args = task.ext.args ?: ''
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ def interval_command = intervals ? "--intervals ${intervals}" : "--intervals ${variants}"
+ def reference_command = fasta ? "--reference ${fasta}" : ''
+
+ def avail_mem = 3072
+ if (!task.memory) {
+ log.info('[GATK GetPileupSummaries] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.')
+ }
+ else {
+ avail_mem = (task.memory.mega * 0.8).intValue()
+ }
+ """
+ gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\
+ GetPileupSummaries \\
+ --input ${input} \\
+ --variant ${variants} \\
+ --output ${prefix}.pileups.table \\
+ ${reference_command} \\
+ ${interval_command} \\
+ --tmp-dir . \\
+ ${args}
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+ END_VERSIONS
+ """
+
+ stub:
+ def prefix = task.ext.prefix ?: "${meta.id}"
+ """
+ touch ${prefix}.pileups.table
+
+ cat <<-END_VERSIONS > versions.yml
+ "${task.process}":
+ gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//')
+ END_VERSIONS
+ """
+}
diff --git a/modules/nf-core/gatk4/getpileupsummaries/meta.yml b/modules/nf-core/gatk4/getpileupsummaries/meta.yml
new file mode 100644
index 000000000..d7f66ad19
--- /dev/null
+++ b/modules/nf-core/gatk4/getpileupsummaries/meta.yml
@@ -0,0 +1,106 @@
+name: gatk4_getpileupsummaries
+description: |
+ Summarizes counts of reads that support reference, alternate and other alleles for given sites. Results can be used with CalculateContamination. Requires a common germline variant sites file, such as from gnomAD.
+keywords:
+ - gatk4
+ - germlinevariantsites
+ - getpileupsumaries
+ - readcountssummary
+tools:
+ - gatk4:
+ description: |
+ Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools
+ with a primary focus on variant discovery and genotyping. Its powerful processing engine
+ and high-performance computing features make it capable of taking on projects of any size.
+ homepage: https://gatk.broadinstitute.org/hc/en-us
+ documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s
+ doi: 10.1158/1538-7445.AM2017-3590
+ licence: ["Apache-2.0"]
+ identifier: ""
+input:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test' ]
+ - input:
+ type: file
+ description: BAM/CRAM file to be summarised.
+ pattern: "*.{bam,cram}"
+ ontologies: []
+ - index:
+ type: file
+ description: Index file for the input BAM/CRAM file.
+ pattern: "*.{bam.bai,cram.crai}"
+ ontologies: []
+ - intervals:
+ type: file
+ description: File containing specified sites to be used for the summary. If
+ this option is not specified, variants file is used instead automatically.
+ pattern: "*.interval_list"
+ ontologies: []
+ - - meta2:
+ type: map
+ description: |
+ Groovy Map containing reference information
+ e.g. [ id:'genome' ]
+ - fasta:
+ type: file
+ description: The reference fasta file
+ pattern: "*.fasta"
+ ontologies: []
+ - - meta3:
+ type: map
+ description: |
+ Groovy Map containing reference information
+ e.g. [ id:'genome' ]
+ - fai:
+ type: file
+ description: Index of reference fasta file
+ pattern: "*.fasta.fai"
+ ontologies: []
+ - - meta4:
+ type: map
+ description: |
+ Groovy Map containing reference information
+ e.g. [ id:'genome' ]
+ - dict:
+ type: file
+ description: GATK sequence dictionary
+ pattern: "*.dict"
+ ontologies: []
+ - variants:
+ type: file
+ description: Population vcf of germline sequencing, containing allele fractions.
+ Is also used as sites file if no separate sites file is specified.
+ pattern: "*.vcf.gz"
+ ontologies:
+ - edam: http://edamontology.org/format_3989 # GZIP format
+ - variants_tbi:
+ type: file
+ description: Index file for the germline resource.
+ pattern: "*.vcf.gz.tbi"
+ ontologies: []
+output:
+ table:
+ - - meta:
+ type: map
+ description: |
+ Groovy Map containing sample information
+ e.g. [ id:'test' ]
+ - "*.pileups.table":
+ type: file
+ description: Table containing read counts for each site.
+ pattern: "*.pileups.table"
+ ontologies: []
+ versions:
+ - versions.yml:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+ ontologies:
+ - edam: http://edamontology.org/format_3750 # YAML
+authors:
+ - "@GCJMackenzie"
+maintainers:
+ - "@GCJMackenzie"
diff --git a/modules/nf-core/gatk4/getpileupsummaries/tests/main.nf.test b/modules/nf-core/gatk4/getpileupsummaries/tests/main.nf.test
new file mode 100644
index 000000000..79cd6344c
--- /dev/null
+++ b/modules/nf-core/gatk4/getpileupsummaries/tests/main.nf.test
@@ -0,0 +1,104 @@
+nextflow_process {
+
+ name "Test Process GATK4_GETPILEUPSUMMARIES"
+ script "../main.nf"
+ process "GATK4_GETPILEUPSUMMARIES"
+
+ tag "modules"
+ tag "modules_nfcore"
+ tag "gatk4"
+ tag "gatk4/getpileupsummaries"
+
+ test("human - bam") {
+
+ when {
+ process {
+ """
+ input[0] = [ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam', checkIfExists: true) ,
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam.bai', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.interval_list', checkIfExists: true)
+ ]
+ input[1] = [[],[]] // fasta
+ input[2] = [[],[]] // fai
+ input[3] = [[],[]] // dict
+ input[4] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz', checkIfExists: true)
+ input[5] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz.tbi', checkIfExists: true)
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+ test("human - cram") {
+
+ when {
+ process {
+ """
+ input[0] = [ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true) ,
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.interval_list', checkIfExists: true)
+ ]
+ input[1] = [ [ id:'genome' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.fasta', checkIfExists: true)
+ ]
+ input[2] = [ [ id:'genome' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.fasta.fai', checkIfExists: true)
+ ]
+ input[3] = [ [ id:'genome' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.dict', checkIfExists: true)
+ ]
+ input[4] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz', checkIfExists: true)
+ input[5] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz.tbi', checkIfExists: true)
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+ test("human - bam - stub") {
+
+ options "-stub"
+
+ when {
+ process {
+ """
+ input[0] = [ [ id:'test' ], // meta map
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam', checkIfExists: true) ,
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam.bai', checkIfExists: true),
+ file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.interval_list', checkIfExists: true)
+ ]
+ input[1] = [[],[]] // fasta
+ input[2] = [[],[]] // fai
+ input[3] = [[],[]] // dict
+ input[4] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz', checkIfExists: true)
+ input[5] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz.tbi', checkIfExists: true)
+ """
+ }
+ }
+
+ then {
+ assertAll(
+ { assert process.success },
+ { assert snapshot(process.out).match() }
+ )
+ }
+
+ }
+
+}
diff --git a/modules/nf-core/gatk4/getpileupsummaries/tests/main.nf.test.snap b/modules/nf-core/gatk4/getpileupsummaries/tests/main.nf.test.snap
new file mode 100644
index 000000000..fe713315a
--- /dev/null
+++ b/modules/nf-core/gatk4/getpileupsummaries/tests/main.nf.test.snap
@@ -0,0 +1,101 @@
+{
+ "human - bam - stub": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.pileups.table:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,e06a2d6fcf646da962971eab2035b3ee"
+ ],
+ "table": [
+ [
+ {
+ "id": "test"
+ },
+ "test.pileups.table:md5,d41d8cd98f00b204e9800998ecf8427e"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,e06a2d6fcf646da962971eab2035b3ee"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "25.04.7"
+ },
+ "timestamp": "2025-09-15T15:15:36.702715205"
+ },
+ "human - bam": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.pileups.table:md5,2d7ce4a54df6b9249e12d60737a67dac"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,e06a2d6fcf646da962971eab2035b3ee"
+ ],
+ "table": [
+ [
+ {
+ "id": "test"
+ },
+ "test.pileups.table:md5,2d7ce4a54df6b9249e12d60737a67dac"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,e06a2d6fcf646da962971eab2035b3ee"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "25.04.7"
+ },
+ "timestamp": "2025-09-15T15:14:59.43518059"
+ },
+ "human - cram": {
+ "content": [
+ {
+ "0": [
+ [
+ {
+ "id": "test"
+ },
+ "test.pileups.table:md5,2d7ce4a54df6b9249e12d60737a67dac"
+ ]
+ ],
+ "1": [
+ "versions.yml:md5,e06a2d6fcf646da962971eab2035b3ee"
+ ],
+ "table": [
+ [
+ {
+ "id": "test"
+ },
+ "test.pileups.table:md5,2d7ce4a54df6b9249e12d60737a67dac"
+ ]
+ ],
+ "versions": [
+ "versions.yml:md5,e06a2d6fcf646da962971eab2035b3ee"
+ ]
+ }
+ ],
+ "meta": {
+ "nf-test": "0.9.2",
+ "nextflow": "25.04.7"
+ },
+ "timestamp": "2025-09-15T15:15:25.628760353"
+ }
+}
\ No newline at end of file
diff --git a/modules/nf-core/svdb/merge/main.nf b/modules/nf-core/svdb/merge/main.nf
index 104f5ad79..75b1373b6 100644
--- a/modules/nf-core/svdb/merge/main.nf
+++ b/modules/nf-core/svdb/merge/main.nf
@@ -102,4 +102,4 @@ process SVDB_MERGE {
bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//')
END_VERSIONS
"""
-}
+}
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index 159a6cbd3..0b59d3b00 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -32,6 +32,7 @@ params {
run_mt_for_wes = false
run_rtgvcfeval = false
save_mapped_as_cram = false
+ run_contamination = false
scatter_count = 20
skip_tools = null
skip_subworkflows = null
@@ -91,6 +92,8 @@ params {
vcfanno_lua = null
vep_cache = null
vep_plugin_files = null
+ contamination_sites = null
+ contamination_sites_tbi = null
modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/'
pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/raredisease'
@@ -486,6 +489,8 @@ includeConfig 'conf/modules/variant_evaluation.config'
includeConfig 'conf/modules/subsample_mt_frac.config'
includeConfig 'conf/modules/subsample_mt_reads.config'
includeConfig 'conf/modules/annotate_rhocallviz.config'
+includeConfig 'conf/modules/contamination.config'
+
// Nextflow plugins
plugins {
diff --git a/nextflow_schema.json b/nextflow_schema.json
index 357b122e6..8ee70783b 100644
--- a/nextflow_schema.json
+++ b/nextflow_schema.json
@@ -443,6 +443,30 @@
"description": "Path to UD matrix file from SVD result of genotype matrix. Used by verifybamid2.",
"fa_icon": "fas fa-file-csv"
},
+ "run_contamination": {
+ "type": "boolean",
+ "default": false,
+ "description": "Run GATK contamination check in addition to VerifyBamID2.",
+ "help_text": "GATK contamination check works better for WES samples and provides validation for WGS.",
+ "fa_icon": "fas fa-toggle-on"
+ },
+ "contamination_sites": {
+ "type": "string",
+ "exists": true,
+ "format": "file-path",
+ "pattern": "^\\S+\\.vcf(\\.gz)?$",
+ "description": "Path to VCF file with common variants for GATK contamination detection.",
+ "help_text": "Use small_exac_common_3.hg38.vcf.gz for no_alt reference genomes. This file should contain common SNP sites with population allele frequencies.",
+ "fa_icon": "fas fa-file"
+ },
+ "contamination_sites_tbi": {
+ "type": "string",
+ "exists": true,
+ "format": "file-path",
+ "pattern": "^\\S+\\.vcf(\\.gz)?\\.tbi$",
+ "description": "Index file for contamination sites VCF.",
+ "fa_icon": "fas fa-file"
+ },
"vcf2cytosure_blacklist": {
"type": "string",
"exists": true,
diff --git a/subworkflows/local/contamination_check/main.nf b/subworkflows/local/contamination_check/main.nf
new file mode 100644
index 000000000..e9ef9c757
--- /dev/null
+++ b/subworkflows/local/contamination_check/main.nf
@@ -0,0 +1,67 @@
+//
+// Subworkflow: Check sample contamination using GATK
+//
+
+include { GATK4_GETPILEUPSUMMARIES } from '../../../modules/nf-core/gatk4/getpileupsummaries/main'
+include { GATK4_CALCULATECONTAMINATION } from '../../../modules/nf-core/gatk4/calculatecontamination/main'
+
+workflow CONTAMINATION_CHECK {
+
+ take:
+ ch_bam // channel: [ val(meta), path(bam), path(bai) ]
+ ch_fasta // channel: [ val(meta), path(fasta) ]
+ ch_fai // channel: [ val(meta), path(fai) ]
+ ch_dict // channel: [ val(meta), path(dict) ]
+ ch_contamination_vcf // channel: [ path(vcf), path(tbi) ]
+ ch_intervals // channel: [ path(bed) ] - only used for WES, empty for WGS
+
+ main:
+ ch_versions = Channel.empty()
+
+ // Prepare BAM with intervals - conditionally based on analysis type
+ // For WGS: intervals will be empty [], for WES: intervals will contain the BED file
+ ch_bam_with_intervals = ch_bam
+ .combine(ch_intervals.ifEmpty([[]]))
+ .map { meta, bam, bai, bed ->
+ // If bed is an empty list, pass empty list; otherwise pass bed file
+ def intervals = (bed instanceof List && bed.isEmpty()) ? [] : (bed ?: [])
+ [ meta, bam, bai, intervals ]
+ }
+
+ // Separate VCF and TBI - collect them to make value channels
+ ch_variants_vcf = ch_contamination_vcf
+ .map { vcf, tbi -> vcf }
+ .collect()
+
+ ch_variants_tbi = ch_contamination_vcf
+ .map { vcf, tbi -> tbi }
+ .collect()
+
+ // Run GetPileupSummaries
+ GATK4_GETPILEUPSUMMARIES (
+ ch_bam_with_intervals, // [meta, bam, bai, intervals]
+ ch_fasta, // [meta2, fasta]
+ ch_fai, // [meta3, fai]
+ ch_dict, // [meta4, dict]
+ ch_variants_vcf, // path(vcf)
+ ch_variants_tbi // path(tbi)
+ )
+ ch_versions = ch_versions.mix(GATK4_GETPILEUPSUMMARIES.out.versions.first())
+
+ // Run CalculateContamination (tumor-only, no matched normal)
+ // Format: [meta, pileup_table, matched_normal_table]
+ // matched_normal_table is empty [] for tumor-only mode
+ ch_contamination_input = GATK4_GETPILEUPSUMMARIES.out.table
+ .map { meta, table -> [ meta, table, [] ] }
+
+ GATK4_CALCULATECONTAMINATION (
+ ch_contamination_input
+ )
+ ch_versions = ch_versions.mix(GATK4_CALCULATECONTAMINATION.out.versions.first())
+
+ emit:
+ contamination_table = GATK4_CALCULATECONTAMINATION.out.contamination
+ segmentation_table = GATK4_CALCULATECONTAMINATION.out.segmentation
+ pileup_table = GATK4_GETPILEUPSUMMARIES.out.table
+ versions = ch_versions
+}
diff --git a/subworkflows/local/contamination_check/meta.yml b/subworkflows/local/contamination_check/meta.yml
new file mode 100644
index 000000000..ab7f5937b
--- /dev/null
+++ b/subworkflows/local/contamination_check/meta.yml
@@ -0,0 +1,41 @@
+name: contamination_check
+description: Estimate sample contamination using GATK GetPileupSummaries and CalculateContamination
+keywords:
+ - contamination
+ - QC
+ - GATK
+components:
+ - gatk4/getpileupsummaries
+ - gatk4/calculatecontamination
+input:
+ - ch_bam:
+ type: file
+ description: BAM files with their indices
+ pattern: "*.{bam,cram}"
+ - ch_fasta:
+ type: file
+ description: Reference genome FASTA file
+ pattern: "*.{fa,fasta}"
+ - ch_contamination_vcf:
+ type: file
+ description: VCF file containing common germline variants (gnomAD)
+ pattern: "*.vcf.gz"
+ - ch_intervals:
+ type: file
+ description: BED file with target intervals
+ pattern: "*.bed"
+output:
+ - contamination_table:
+ type: file
+ description: Table containing contamination estimates
+ pattern: "*.contamination.table"
+ - segmentation_table:
+ type: file
+ description: Table containing tumor segmentation
+ pattern: "*.segmentation.table"
+ - versions:
+ type: file
+ description: File containing software versions
+ pattern: "versions.yml"
+authors:
+ - "56053vujinovic"
diff --git a/subworkflows/local/contamination_check/tests/main.nf.test b/subworkflows/local/contamination_check/tests/main.nf.test
new file mode 100644
index 000000000..2909dccd7
--- /dev/null
+++ b/subworkflows/local/contamination_check/tests/main.nf.test
@@ -0,0 +1,112 @@
+nextflow_workflow {
+
+ name "Test Workflow CONTAMINATION_CHECK"
+ script "subworkflows/local/contamination_check/main.nf"
+ workflow "CONTAMINATION_CHECK"
+ tag "subworkflows"
+ tag "contamination_check"
+ config "./nextflow.config"
+
+ test("CONTAMINATION_CHECK - WGS, no intervals") {
+
+ when {
+ params {
+ outdir = "$outputDir"
+ }
+ workflow {
+ """
+ input[0] = channel.of(
+ [
+ [id:'earlycasualcaiman', sample:'earlycasualcaiman', single_end:false, sex:1, phenotype:1, case_id:'justhusky'],
+ file(params.pipelines_testdata_base_path + 'testdata/earlycasualcaiman_sorted_md.bam', checkIfExists: true),
+ file(params.pipelines_testdata_base_path + 'testdata/earlycasualcaiman_sorted_md.bam.bai', checkIfExists: true)
+ ]
+ )
+ input[1] = channel.of([id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta', checkIfExists: true)).collect()
+ input[2] = channel.of([id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true)).collect()
+ input[3] = channel.of([id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.dict', checkIfExists: true)).collect()
+ input[4] = channel.of(
+ [
+ file(params.pipelines_testdata_base_path + 'reference/grch37_gnomad_-r2.1.1-.vcf.gz', checkIfExists: true),
+ file(params.pipelines_testdata_base_path + 'reference/grch37_gnomad_-r2.1.1-.vcf.gz.tbi', checkIfExists: true)
+ ]
+ ).collect()
+ input[5] = Channel.empty()
+ """
+ }
+ }
+
+ then {
+ assertAll (
+ { assert workflow.success },
+ { assert snapshot(
+ workflow.out.contamination_table
+ .collect { it[1] }
+ .flatten()
+ .collect { file(it).name }
+ .toSorted(),
+ workflow.out.pileup_table
+ .collect { it[1] }
+ .flatten()
+ .collect { file(it).name }
+ .toSorted(),
+ workflow.out.versions
+ ).match()
+ }
+ )
+ }
+
+ }
+
+ test("CONTAMINATION_CHECK - WES, with intervals") {
+
+ when {
+ params {
+ outdir = "$outputDir"
+ }
+ workflow {
+ """
+ input[0] = channel.of(
+ [
+ [id:'earlycasualcaiman', sample:'earlycasualcaiman', single_end:false, sex:1, phenotype:1, case_id:'justhusky'],
+ file(params.pipelines_testdata_base_path + 'testdata/earlycasualcaiman_sorted_md.bam', checkIfExists: true),
+ file(params.pipelines_testdata_base_path + 'testdata/earlycasualcaiman_sorted_md.bam.bai', checkIfExists: true)
+ ]
+ )
+ input[1] = channel.of([id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta', checkIfExists: true)).collect()
+ input[2] = channel.of([id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true)).collect()
+ input[3] = channel.of([id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.dict', checkIfExists: true)).collect()
+ input[4] = channel.of(
+ [
+ file(params.pipelines_testdata_base_path + 'reference/grch37_gnomad_-r2.1.1-.vcf.gz', checkIfExists: true),
+ file(params.pipelines_testdata_base_path + 'reference/grch37_gnomad_-r2.1.1-.vcf.gz.tbi', checkIfExists: true)
+ ]
+ ).collect()
+ input[5] = Channel.fromPath(params.pipelines_testdata_base_path + 'reference/target.bed', checkIfExists: true).collect()
+ """
+ }
+ }
+
+ then {
+ assertAll (
+ { assert workflow.success },
+ { assert snapshot(
+ workflow.out.contamination_table
+ .collect { it[1] }
+ .flatten()
+ .collect { file(it).name }
+ .toSorted(),
+ workflow.out.pileup_table
+ .collect { it[1] }
+ .flatten()
+ .collect { file(it).name }
+ .toSorted(),
+ workflow.out.versions
+ ).match()
+ }
+ )
+ }
+
+ }
+
+}
diff --git a/subworkflows/local/contamination_check/tests/main.nf.test.snap b/subworkflows/local/contamination_check/tests/main.nf.test.snap
new file mode 100644
index 000000000..c9b91a552
--- /dev/null
+++ b/subworkflows/local/contamination_check/tests/main.nf.test.snap
@@ -0,0 +1,40 @@
+{
+ "CONTAMINATION_CHECK - WGS, no intervals": {
+ "content": [
+ [
+ "earlycasualcaiman_contamination.contamination.table"
+ ],
+ [
+ "earlycasualcaiman_pileups.pileups.table"
+ ],
+ [
+ "versions.yml:md5,46fc6219f36f2e8e436a824f3c0f9a4a",
+ "versions.yml:md5,dd08885235dcc7e9f408eba91ef7303e"
+ ]
+ ],
+ "timestamp": "2026-03-23T14:29:41.579345381",
+ "meta": {
+ "nf-test": "0.9.5",
+ "nextflow": "25.10.4"
+ }
+ },
+ "CONTAMINATION_CHECK - WES, with intervals": {
+ "content": [
+ [
+ "earlycasualcaiman_contamination.contamination.table"
+ ],
+ [
+ "earlycasualcaiman_pileups.pileups.table"
+ ],
+ [
+ "versions.yml:md5,46fc6219f36f2e8e436a824f3c0f9a4a",
+ "versions.yml:md5,dd08885235dcc7e9f408eba91ef7303e"
+ ]
+ ],
+ "timestamp": "2026-03-23T14:31:02.497064504",
+ "meta": {
+ "nf-test": "0.9.5",
+ "nextflow": "25.10.4"
+ }
+ }
+}
\ No newline at end of file
diff --git a/subworkflows/local/contamination_check/tests/nextflow.config b/subworkflows/local/contamination_check/tests/nextflow.config
new file mode 100644
index 000000000..3a5a56e5c
--- /dev/null
+++ b/subworkflows/local/contamination_check/tests/nextflow.config
@@ -0,0 +1,12 @@
+process {
+
+ withName: '.*CONTAMINATION_CHECK:GATK4_GETPILEUPSUMMARIES' {
+ ext.args = '--disable-sequence-dictionary-validation'
+ ext.prefix = { "${meta.id}_pileups" }
+ }
+
+ withName: '.*CONTAMINATION_CHECK:GATK4_CALCULATECONTAMINATION' {
+ ext.args = ''
+ ext.prefix = { "${meta.id}_contamination" }
+ }
+}
diff --git a/workflows/raredisease.nf b/workflows/raredisease.nf
index 7d7bd3d24..460a10366 100644
--- a/workflows/raredisease.nf
+++ b/workflows/raredisease.nf
@@ -27,13 +27,15 @@ include { SPRING_DECOMPRESS as SPRING_DECOMPRESS_TO_R1_FQ } from '../modules/n
include { SPRING_DECOMPRESS as SPRING_DECOMPRESS_TO_R2_FQ } from '../modules/nf-core/spring/decompress/main'
include { SPRING_DECOMPRESS as SPRING_DECOMPRESS_TO_FQ_PAIR } from '../modules/nf-core/spring/decompress/main'
include { STRANGER } from '../modules/nf-core/stranger/main'
-
//
// MODULE: Local modules
//
include { RENAME_ALIGN_FILES as RENAME_BAM } from '../modules/local/rename_align_files'
include { RENAME_ALIGN_FILES as RENAME_BAI } from '../modules/local/rename_align_files'
+include { CREATE_HGNCIDS_FILE } from '../modules/local/create_hgncids_file'
+include { CREATE_PEDIGREE_FILE } from '../modules/local/create_pedigree_file'
+include { PARSE_CONTAMINATION } from '../modules/local/parse_contamination/main'
//
// SUBWORKFLOWS
@@ -52,6 +54,7 @@ include { CALL_MOBILE_ELEMENTS } from '..
include { CALL_REPEAT_EXPANSIONS } from '../subworkflows/local/call_repeat_expansions'
include { CALL_SNV } from '../subworkflows/local/call_snv'
include { CALL_STRUCTURAL_VARIANTS } from '../subworkflows/local/call_structural_variants'
+include { CONTAMINATION_CHECK } from '../subworkflows/local/contamination_check/main'
include { GENERATE_CYTOSURE_FILES } from '../subworkflows/local/generate_cytosure_files'
include { GENS } from '../subworkflows/local/gens'
include { PREPARE_REFERENCES } from '../subworkflows/local/prepare_references'
@@ -294,7 +297,53 @@ workflow RAREDISEASE {
)
ch_versions = ch_versions.mix(QC_BAM.out.versions)
+ //
+ // SUBWORKFLOW: Check for contamination using GATK
+ //
+ ch_contamination_mqc = Channel.empty()
+
+ if (params.run_contamination && params.contamination_sites) {
+
+ log.info "=== CONTAMINATION CHECK ENABLED ==="
+ log.info "Analysis type: ${params.analysis_type}"
+
+ // Prepare contamination sites channel
+ ch_contamination_sites = Channel.of([
+ file(params.contamination_sites, checkIfExists: true),
+ file(params.contamination_sites_tbi, checkIfExists: true)
+ ]).collect()
+
+ // Prepare intervals channel - CRITICAL: Use Channel.empty() for WGS, not Channel.of([])
+ if (params.analysis_type == 'wes' && params.target_bed) {
+ ch_intervals_contamination = Channel.fromPath(params.target_bed).collect()
+ log.info "Using target BED for WES contamination check"
+ } else {
+ ch_intervals_contamination = Channel.empty()
+ log.info "No intervals for WGS contamination check (genome-wide)"
+ }
+
+ // Prepare BAM input with BAI
+ ch_bam_for_contamination = ch_mapped.genome_marked_bam
+ .join(ch_mapped.genome_marked_bai, failOnMismatch:true, failOnDuplicate:true)
+ CONTAMINATION_CHECK (
+ ch_bam_for_contamination,
+ ch_genome_fasta,
+ ch_genome_fai,
+ ch_genome_dictionary,
+ ch_contamination_sites,
+ ch_intervals_contamination
+ )
+
+ // Parse for MultiQC
+ PARSE_CONTAMINATION (
+ CONTAMINATION_CHECK.out.contamination_table
+ )
+
+ ch_contamination_mqc = PARSE_CONTAMINATION.out.mqc_table
+ ch_versions = ch_versions.mix(CONTAMINATION_CHECK.out.versions)
+ ch_versions = ch_versions.mix(PARSE_CONTAMINATION.out.versions)
+ }
/*
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
RENAME ALIGNMENT FILES FOR SMNCOPYNUMBERCALLER & REPEATCALLING
@@ -848,6 +897,9 @@ workflow RAREDISEASE {
ch_multiqc_files = ch_multiqc_files.mix(QC_BAM.out.global_dist.map{_meta, reports -> reports}.collect().ifEmpty([]))
ch_multiqc_files = ch_multiqc_files.mix(QC_BAM.out.cov.map{_meta, reports -> reports}.collect().ifEmpty([]))
ch_multiqc_files = ch_multiqc_files.mix(QC_BAM.out.self_sm.map{_meta, reports -> reports}.collect().ifEmpty([]))
+
+ // Add contamination results to MultiQC
+ ch_multiqc_files = ch_multiqc_files.mix(ch_contamination_mqc.map { _meta, file -> file })
if (!skip_peddy) {
ch_multiqc_files = ch_multiqc_files.mix(PEDDY.out.ped.map{_meta, reports -> reports}.collect().ifEmpty([]))