diff --git a/CHANGELOG.md b/CHANGELOG.md index 208aa1061..6d9c6e2ea 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Tests for call_repeat_expansions and qc_bam subworkflows [#713](https://github.com/nf-core/raredisease/pull/713) - Feature to subsample mitochondrial alignments based on number of reads [#748](https://github.com/nf-core/raredisease/pull/748) - Functionality to generate coverage information using Sambamba depth [#752](https://github.com/nf-core/raredisease/pull/752) +- Added GATK contamination check for WES/WGS samples as complement to VerifyBamID2 [#758](https://github.com/nf-core/raredisease/pull/758) +- GATK Contamination results displayed in MultiQC with color-coded thresholds [#758](https://github.com/nf-core/raredisease/pull/758) ### `Changed` @@ -31,9 +33,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Parameters -| Old parameter | New parameter | -| ------------- | ---------------- | -| | sambamba_regions | +| Old parameter | New parameter | +| ------------- | ----------------------- | +| | sambamba_regions | +| | run_contamination | +| | contamination_sites | +| | contamination_sites_tbi | ### Tool updates @@ -48,6 +53,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | gens-preproc | 1.0.11 | 1.1.2 | | samtools (sort & view) | 1.21 | 1.22.1 | | sambamba | | 1.0.1 | +| gatk4/calculatecontamination | | 4.6.2.0 | +| gatk4/getpileupsummaries | | 4.6.2.0 | ## 2.6.0 - Cacofonix [2025-06-25] diff --git a/assets/multiqc_config.yml b/assets/multiqc_config.yml index 034fb00ed..bd604186a 100644 --- a/assets/multiqc_config.yml +++ b/assets/multiqc_config.yml @@ -6,7 +6,6 @@ report_comment: > target="_blank">nf-core/raredisease analysis pipeline. For information about how to interpret these results, please see the documentation. - report_section_order: "nf-core-raredisease-methods-description": order: -1000 @@ -14,9 +13,9 @@ report_section_order: order: -1001 "nf-core-raredisease-summary": order: -1002 - + gatk_contamination: + order: 1050 export_plots: true - run_modules: - fastqc - fastp @@ -28,7 +27,6 @@ run_modules: - peddy - verifybamid - custom_content - module_order: - fastqc: name: "FastQC" @@ -51,8 +49,52 @@ module_order: - verifybamid: name: "VerifyBamID2" +# Custom content configuration for GATK contamination +custom_data: + gatk_contamination: + id: "gatk_contamination" + section_name: "GATK Contamination" + description: "Sample contamination estimates from GATK CalculateContamination based on common variant allele frequencies" + plot_type: "generalstats" + pconfig: + contamination_pct: + title: "Contamination" + description: "Estimated sample contamination percentage" + max: 10 + min: 0 + scale: "RdYlGn-rev" + suffix: "%" + format: "{:,.2f}" + shared_key: "contamination" + +# Make contamination visible in general stats by default +table_columns_visible: + gatk_contamination: + contamination_pct: true + +# Color coding thresholds for contamination +table_cond_formatting_rules: + contamination_pct: + pass: + - s_eq: "pass" + - lt: 2.0 + warn: + - s_eq: "warn" + - lt: 5.0 + - gte: 2.0 + fail: + - s_eq: "fail" + - gte: 5.0 + +# Add to General Statistics table configuration +table_columns_placement: + gatk_contamination: + contamination_pct: 900 + extra_fn_clean_exts: - "_sorted_md" + - "_contamination" + - "_pileups" - type: regex pattern: "_LNUMBER[0-9]{1,}" diff --git a/conf/modules/contamination.config b/conf/modules/contamination.config new file mode 100644 index 000000000..f07ebeca2 --- /dev/null +++ b/conf/modules/contamination.config @@ -0,0 +1,46 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for GATK contamination checking modules +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +process { + + // + // GATK GetPileupSummaries + // + withName: '.*:CONTAMINATION_CHECK:GATK4_GETPILEUPSUMMARIES' { + ext.args = '' + ext.prefix = { "${meta.id}_pileups" } + publishDir = [ + path: { "${params.outdir}/qc/contamination/pileups" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + // + // GATK CalculateContamination + // + withName: '.*:CONTAMINATION_CHECK:GATK4_CALCULATECONTAMINATION' { + ext.args = '' + ext.prefix = { "${meta.id}_contamination" } + publishDir = [ + path: { "${params.outdir}/qc/contamination" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + // + // Parse contamination results for MultiQC + // + withName: '.*:RAREDISEASE:PARSE_CONTAMINATION' { + ext.prefix = { "${meta.id}_contamination" } + publishDir = [ + path: { "${params.outdir}/multiqc" }, + mode: params.publish_dir_mode, + pattern: '*_mqc.tsv' + ] + } +} diff --git a/modules.json b/modules.json index 56535bc9b..be7408569 100644 --- a/modules.json +++ b/modules.json @@ -257,6 +257,16 @@ "git_sha": "666652151335353eef2fcd58880bcef5bc2928e1", "installed_by": ["modules"] }, + "gatk4/calculatecontamination": { + "branch": "master", + "git_sha": "ae8cd884f895585c6799ab4eb6a2c9f44df03336", + "installed_by": ["modules"] + }, + "gatk4/getpileupsummaries": { + "branch": "master", + "git_sha": "ae8cd884f895585c6799ab4eb6a2c9f44df03336", + "installed_by": ["modules"] + }, "gawk": { "branch": "master", "git_sha": "5ee4d69ed992c3ce81cfbbdd0bef932fcb81c75a", diff --git a/modules/local/parse_contamination/bin/parse_contamination.py b/modules/local/parse_contamination/bin/parse_contamination.py new file mode 100755 index 000000000..5f5b3f1e0 --- /dev/null +++ b/modules/local/parse_contamination/bin/parse_contamination.py @@ -0,0 +1,40 @@ +#!/usr/bin/env python3 +"""Parse GATK CalculateContamination output for MultiQC.""" + +import argparse + + +def parse_contamination(contamination_table, sample_id, prefix): + """Parse contamination table and write MultiQC output.""" + with open(contamination_table, 'r') as f: + lines = f.readlines() + data_line = lines[1].strip().split('\t') + contamination = float(data_line[1]) + contamination_pct = contamination * 100 + + with open(f"{prefix}_contamination_mqc.tsv", 'w') as out: + out.write("# id: 'gatk_contamination'\n") + out.write("# section_name: 'GATK Contamination'\n") + out.write("# description: 'Sample contamination estimates from GATK CalculateContamination'\n") + out.write("# plot_type: 'generalstats'\n") + out.write("# pconfig:\n") + out.write("# contamination_pct:\n") + out.write("# title: 'Contamination'\n") + out.write("# description: 'Estimated sample contamination percentage'\n") + out.write("# max: 10\n") + out.write("# min: 0\n") + out.write("# scale: 'RdYlGn-rev'\n") + out.write("# suffix: '%'\n") + out.write("# format: '{:,.2f}'\n") + out.write("Sample\tcontamination_pct\n") + out.write(f"{sample_id}\t{contamination_pct:.4f}\n") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Parse GATK contamination output for MultiQC") + parser.add_argument("--input", required=True, help="GATK contamination table") + parser.add_argument("--sample_id", required=True, help="Sample ID") + parser.add_argument("--prefix", required=True, help="Output prefix") + args = parser.parse_args() + + parse_contamination(args.input, args.sample_id, args.prefix) diff --git a/modules/local/parse_contamination/environment.yml b/modules/local/parse_contamination/environment.yml new file mode 100644 index 000000000..b5260d32d --- /dev/null +++ b/modules/local/parse_contamination/environment.yml @@ -0,0 +1,7 @@ +name: parse_contamination +channels: + - conda-forge + - bioconda + - defaults +dependencies: + - python=3.11 diff --git a/modules/local/parse_contamination/main.nf b/modules/local/parse_contamination/main.nf new file mode 100644 index 000000000..d8be68d13 --- /dev/null +++ b/modules/local/parse_contamination/main.nf @@ -0,0 +1,44 @@ +process PARSE_CONTAMINATION { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/python:3.11' : + 'biocontainers/python:3.11' }" + + input: + tuple val(meta), path(contamination_table) + + output: + tuple val(meta), path("*_contamination_mqc.tsv"), emit: mqc_table + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + python3 ${moduleDir}/bin/parse_contamination.py \\ + --input ${contamination_table} \\ + --sample_id ${meta.id} \\ + --prefix ${prefix} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}_contamination_mqc.tsv + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + python: \$(python --version | sed 's/Python //') + END_VERSIONS + """ +} diff --git a/modules/local/parse_contamination/meta.yml b/modules/local/parse_contamination/meta.yml new file mode 100644 index 000000000..6018fd2df --- /dev/null +++ b/modules/local/parse_contamination/meta.yml @@ -0,0 +1,29 @@ +name: parse_contamination +description: Parse GATK CalculateContamination output for MultiQC +keywords: + - contamination + - MultiQC + - parsing +tools: + - python: + description: Python programming language + homepage: https://www.python.org/ +input: + - meta: + type: map + description: Groovy Map containing sample information + - contamination_table: + type: file + description: GATK CalculateContamination output table + pattern: "*.contamination.table" +output: + - mqc_table: + type: file + description: MultiQC custom content table + pattern: "*_contamination_mqc.tsv" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "56053vujinovic" diff --git a/modules/nf-core/gatk4/calculatecontamination/environment.yml b/modules/nf-core/gatk4/calculatecontamination/environment.yml new file mode 100644 index 000000000..67e0eb860 --- /dev/null +++ b/modules/nf-core/gatk4/calculatecontamination/environment.yml @@ -0,0 +1,10 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + # renovate: datasource=conda depName=bioconda/gatk4 + - bioconda::gatk4=4.6.2.0 + # renovate: datasource=conda depName=bioconda/gcnvkernel + - bioconda::gcnvkernel=0.9 diff --git a/modules/nf-core/gatk4/calculatecontamination/main.nf b/modules/nf-core/gatk4/calculatecontamination/main.nf new file mode 100644 index 000000000..0b8706e76 --- /dev/null +++ b/modules/nf-core/gatk4/calculatecontamination/main.nf @@ -0,0 +1,59 @@ +process GATK4_CALCULATECONTAMINATION { + tag "${meta.id}" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ce/ced519873646379e287bc28738bdf88e975edd39a92e7bc6a34bccd37153d9d0/data' + : 'community.wave.seqera.io/library/gatk4_gcnvkernel:edb12e4f0bf02cd3'}" + + input: + tuple val(meta), path(pileup), path(matched) + + output: + tuple val(meta), path('*.contamination.table'), emit: contamination + tuple val(meta), path('*.segmentation.table'), emit: segmentation, optional: true + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def matched_command = matched ? "--matched-normal ${matched}" : '' + + def avail_mem = 3072 + if (!task.memory) { + log.info('[GATK CalculateContamination] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.') + } + else { + avail_mem = (task.memory.mega * 0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + CalculateContamination \\ + --input ${pileup} \\ + --output ${prefix}.contamination.table \\ + ${matched_command} \\ + --tmp-dir . \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.contamination.table + touch ${prefix}.segmentation.table + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/calculatecontamination/meta.yml b/modules/nf-core/gatk4/calculatecontamination/meta.yml new file mode 100644 index 000000000..477e9d3b1 --- /dev/null +++ b/modules/nf-core/gatk4/calculatecontamination/meta.yml @@ -0,0 +1,76 @@ +name: gatk4_calculatecontamination +description: | + Calculates the fraction of reads from cross-sample contamination based on summary tables from getpileupsummaries. Output to be used with filtermutectcalls. +keywords: + - gatk4 + - calculatecontamination + - cross-samplecontamination + - getpileupsummaries + - filtermutectcalls +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - pileup: + type: file + description: File containing the pileups summary table of a tumor sample to + be used to calculate contamination. + pattern: "*.pileups.table" + ontologies: [] + - matched: + type: file + description: File containing the pileups summary table of a normal sample that + matches with the tumor sample specified in pileup argument. This is an optional + input. + pattern: "*.pileups.table" + ontologies: [] +output: + contamination: + - - meta: + type: file + description: File containing the contamination table. + pattern: "*.contamination.table" + ontologies: [] + - "*.contamination.table": + type: file + description: File containing the contamination table. + pattern: "*.contamination.table" + ontologies: [] + segmentation: + - - meta: + type: file + description: File containing the contamination table. + pattern: "*.contamination.table" + ontologies: [] + - "*.segmentation.table": + type: file + description: output table containing segmentation of tumor minor allele fractions + (optional) + pattern: "*.segmentation.table" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@GCJMackenzie" + - "@maxulysse" +maintainers: + - "@GCJMackenzie" + - "@maxulysse" diff --git a/modules/nf-core/gatk4/calculatecontamination/tests/main.nf.test b/modules/nf-core/gatk4/calculatecontamination/tests/main.nf.test new file mode 100644 index 000000000..81f048f67 --- /dev/null +++ b/modules/nf-core/gatk4/calculatecontamination/tests/main.nf.test @@ -0,0 +1,106 @@ +nextflow_process { + + name "Test Process GATK4_CALCULATECONTAMINATION" + script "../main.nf" + process "GATK4_CALCULATECONTAMINATION" + + tag "modules" + tag "modules_nfcore" + tag "gatk4" + tag "gatk4/calculatecontamination" + + test("human - pileup-table") { + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/test2.pileups.table', checkIfExists: true), + [] ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions") }, + { assert snapshot(file(process.out.contamination.get(0).get(1)).readLines()[0]).match() } + ) + } + + } + + test("human - pileup-table - matched-pair") { + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/test2.pileups.table', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/test.pileups.table', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions_pair") }, + { assert snapshot(file(process.out.contamination.get(0).get(1)).readLines()[0]).match() } + ) + } + + } + + test("human - pileup-table - segmentation") { + + config "./nextflow.config" + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/test2.pileups.table', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/test.pileups.table', checkIfExists: true) + ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out.versions).match("versions_segmentation") }, + { assert snapshot(file(process.out.contamination.get(0).get(1)).readLines()[0]).match("contamination") }, + { assert snapshot(file(process.out.segmentation.get(0).get(1)).readLines()[0]).match("segmentation") } + ) + } + + } + + test("human - pileup-table - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gatk/test2.pileups.table', checkIfExists: true), + [] ] + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gatk4/calculatecontamination/tests/main.nf.test.snap b/modules/nf-core/gatk4/calculatecontamination/tests/main.nf.test.snap new file mode 100644 index 000000000..4ef184bfc --- /dev/null +++ b/modules/nf-core/gatk4/calculatecontamination/tests/main.nf.test.snap @@ -0,0 +1,127 @@ +{ + "versions_pair": { + "content": [ + [ + "versions.yml:md5,703ef3c3104ffcb977090771a8761da2" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.7" + }, + "timestamp": "2025-09-15T12:29:33.077296321" + }, + "versions": { + "content": [ + [ + "versions.yml:md5,703ef3c3104ffcb977090771a8761da2" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.7" + }, + "timestamp": "2025-09-15T12:29:20.224234796" + }, + "versions_segmentation": { + "content": [ + [ + "versions.yml:md5,703ef3c3104ffcb977090771a8761da2" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.7" + }, + "timestamp": "2025-09-15T12:29:45.806413439" + }, + "segmentation": { + "content": [ + "#SAMPLE=tumour" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2024-05-15T12:57:38.845287" + }, + "human - pileup-table": { + "content": [ + "sample\tcontamination\terror" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2024-05-15T11:09:16.292509" + }, + "human - pileup-table - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.contamination.table:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "test.segmentation.table:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "2": [ + "versions.yml:md5,703ef3c3104ffcb977090771a8761da2" + ], + "contamination": [ + [ + { + "id": "test" + }, + "test.contamination.table:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "segmentation": [ + [ + { + "id": "test" + }, + "test.segmentation.table:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,703ef3c3104ffcb977090771a8761da2" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.7" + }, + "timestamp": "2025-09-15T12:29:55.994434812" + }, + "human - pileup-table - matched-pair": { + "content": [ + "sample\tcontamination\terror" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2024-05-15T11:09:26.408848" + }, + "contamination": { + "content": [ + "sample\tcontamination\terror" + ], + "meta": { + "nf-test": "0.8.4", + "nextflow": "23.10.0" + }, + "timestamp": "2024-05-15T12:57:38.840651" + } +} \ No newline at end of file diff --git a/modules/nf-core/gatk4/calculatecontamination/tests/nextflow.config b/modules/nf-core/gatk4/calculatecontamination/tests/nextflow.config new file mode 100644 index 000000000..db836b7f8 --- /dev/null +++ b/modules/nf-core/gatk4/calculatecontamination/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: GATK4_CALCULATECONTAMINATION { + ext.args = { "--tumor-segmentation ${meta.id}.segmentation.table" } + } +} diff --git a/modules/nf-core/gatk4/getpileupsummaries/environment.yml b/modules/nf-core/gatk4/getpileupsummaries/environment.yml new file mode 100644 index 000000000..67e0eb860 --- /dev/null +++ b/modules/nf-core/gatk4/getpileupsummaries/environment.yml @@ -0,0 +1,10 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + # renovate: datasource=conda depName=bioconda/gatk4 + - bioconda::gatk4=4.6.2.0 + # renovate: datasource=conda depName=bioconda/gcnvkernel + - bioconda::gcnvkernel=0.9 diff --git a/modules/nf-core/gatk4/getpileupsummaries/main.nf b/modules/nf-core/gatk4/getpileupsummaries/main.nf new file mode 100644 index 000000000..d0cabdaf8 --- /dev/null +++ b/modules/nf-core/gatk4/getpileupsummaries/main.nf @@ -0,0 +1,65 @@ +process GATK4_GETPILEUPSUMMARIES { + tag "${meta.id}" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container + ? 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/ce/ced519873646379e287bc28738bdf88e975edd39a92e7bc6a34bccd37153d9d0/data' + : 'community.wave.seqera.io/library/gatk4_gcnvkernel:edb12e4f0bf02cd3'}" + + input: + tuple val(meta), path(input), path(index), path(intervals) + tuple val(meta2), path(fasta) + tuple val(meta3), path(fai) + tuple val(meta4), path(dict) + path variants + path variants_tbi + + output: + tuple val(meta), path('*.pileups.table'), emit: table + path "versions.yml", emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def interval_command = intervals ? "--intervals ${intervals}" : "--intervals ${variants}" + def reference_command = fasta ? "--reference ${fasta}" : '' + + def avail_mem = 3072 + if (!task.memory) { + log.info('[GATK GetPileupSummaries] Available memory not known - defaulting to 3GB. Specify process memory requirements to change this.') + } + else { + avail_mem = (task.memory.mega * 0.8).intValue() + } + """ + gatk --java-options "-Xmx${avail_mem}M -XX:-UsePerfData" \\ + GetPileupSummaries \\ + --input ${input} \\ + --variant ${variants} \\ + --output ${prefix}.pileups.table \\ + ${reference_command} \\ + ${interval_command} \\ + --tmp-dir . \\ + ${args} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + """ + touch ${prefix}.pileups.table + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + gatk4: \$(echo \$(gatk --version 2>&1) | sed 's/^.*(GATK) v//; s/ .*\$//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/gatk4/getpileupsummaries/meta.yml b/modules/nf-core/gatk4/getpileupsummaries/meta.yml new file mode 100644 index 000000000..d7f66ad19 --- /dev/null +++ b/modules/nf-core/gatk4/getpileupsummaries/meta.yml @@ -0,0 +1,106 @@ +name: gatk4_getpileupsummaries +description: | + Summarizes counts of reads that support reference, alternate and other alleles for given sites. Results can be used with CalculateContamination. Requires a common germline variant sites file, such as from gnomAD. +keywords: + - gatk4 + - germlinevariantsites + - getpileupsumaries + - readcountssummary +tools: + - gatk4: + description: | + Developed in the Data Sciences Platform at the Broad Institute, the toolkit offers a wide variety of tools + with a primary focus on variant discovery and genotyping. Its powerful processing engine + and high-performance computing features make it capable of taking on projects of any size. + homepage: https://gatk.broadinstitute.org/hc/en-us + documentation: https://gatk.broadinstitute.org/hc/en-us/categories/360002369672s + doi: 10.1158/1538-7445.AM2017-3590 + licence: ["Apache-2.0"] + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - input: + type: file + description: BAM/CRAM file to be summarised. + pattern: "*.{bam,cram}" + ontologies: [] + - index: + type: file + description: Index file for the input BAM/CRAM file. + pattern: "*.{bam.bai,cram.crai}" + ontologies: [] + - intervals: + type: file + description: File containing specified sites to be used for the summary. If + this option is not specified, variants file is used instead automatically. + pattern: "*.interval_list" + ontologies: [] + - - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fasta: + type: file + description: The reference fasta file + pattern: "*.fasta" + ontologies: [] + - - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - fai: + type: file + description: Index of reference fasta file + pattern: "*.fasta.fai" + ontologies: [] + - - meta4: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'genome' ] + - dict: + type: file + description: GATK sequence dictionary + pattern: "*.dict" + ontologies: [] + - variants: + type: file + description: Population vcf of germline sequencing, containing allele fractions. + Is also used as sites file if no separate sites file is specified. + pattern: "*.vcf.gz" + ontologies: + - edam: http://edamontology.org/format_3989 # GZIP format + - variants_tbi: + type: file + description: Index file for the germline resource. + pattern: "*.vcf.gz.tbi" + ontologies: [] +output: + table: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test' ] + - "*.pileups.table": + type: file + description: Table containing read counts for each site. + pattern: "*.pileups.table" + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@GCJMackenzie" +maintainers: + - "@GCJMackenzie" diff --git a/modules/nf-core/gatk4/getpileupsummaries/tests/main.nf.test b/modules/nf-core/gatk4/getpileupsummaries/tests/main.nf.test new file mode 100644 index 000000000..79cd6344c --- /dev/null +++ b/modules/nf-core/gatk4/getpileupsummaries/tests/main.nf.test @@ -0,0 +1,104 @@ +nextflow_process { + + name "Test Process GATK4_GETPILEUPSUMMARIES" + script "../main.nf" + process "GATK4_GETPILEUPSUMMARIES" + + tag "modules" + tag "modules_nfcore" + tag "gatk4" + tag "gatk4/getpileupsummaries" + + test("human - bam") { + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam', checkIfExists: true) , + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam.bai', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.interval_list', checkIfExists: true) + ] + input[1] = [[],[]] // fasta + input[2] = [[],[]] // fai + input[3] = [[],[]] // dict + input[4] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz', checkIfExists: true) + input[5] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz.tbi', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("human - cram") { + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram', checkIfExists: true) , + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/cram/test.paired_end.recalibrated.sorted.cram.crai', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.interval_list', checkIfExists: true) + ] + input[1] = [ [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.fasta', checkIfExists: true) + ] + input[2] = [ [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.fasta.fai', checkIfExists: true) + ] + input[3] = [ [ id:'genome' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.dict', checkIfExists: true) + ] + input[4] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz', checkIfExists: true) + input[5] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz.tbi', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + + test("human - bam - stub") { + + options "-stub" + + when { + process { + """ + input[0] = [ [ id:'test' ], // meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam', checkIfExists: true) , + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/bam/test.paired_end.recalibrated.sorted.bam.bai', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/sequence/genome.interval_list', checkIfExists: true) + ] + input[1] = [[],[]] // fasta + input[2] = [[],[]] // fai + input[3] = [[],[]] // dict + input[4] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz', checkIfExists: true) + input[5] = file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/chr21/germlineresources/gnomAD.r2.1.1.vcf.gz.tbi', checkIfExists: true) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot(process.out).match() } + ) + } + + } + +} diff --git a/modules/nf-core/gatk4/getpileupsummaries/tests/main.nf.test.snap b/modules/nf-core/gatk4/getpileupsummaries/tests/main.nf.test.snap new file mode 100644 index 000000000..fe713315a --- /dev/null +++ b/modules/nf-core/gatk4/getpileupsummaries/tests/main.nf.test.snap @@ -0,0 +1,101 @@ +{ + "human - bam - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.pileups.table:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + "versions.yml:md5,e06a2d6fcf646da962971eab2035b3ee" + ], + "table": [ + [ + { + "id": "test" + }, + "test.pileups.table:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions": [ + "versions.yml:md5,e06a2d6fcf646da962971eab2035b3ee" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.7" + }, + "timestamp": "2025-09-15T15:15:36.702715205" + }, + "human - bam": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.pileups.table:md5,2d7ce4a54df6b9249e12d60737a67dac" + ] + ], + "1": [ + "versions.yml:md5,e06a2d6fcf646da962971eab2035b3ee" + ], + "table": [ + [ + { + "id": "test" + }, + "test.pileups.table:md5,2d7ce4a54df6b9249e12d60737a67dac" + ] + ], + "versions": [ + "versions.yml:md5,e06a2d6fcf646da962971eab2035b3ee" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.7" + }, + "timestamp": "2025-09-15T15:14:59.43518059" + }, + "human - cram": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.pileups.table:md5,2d7ce4a54df6b9249e12d60737a67dac" + ] + ], + "1": [ + "versions.yml:md5,e06a2d6fcf646da962971eab2035b3ee" + ], + "table": [ + [ + { + "id": "test" + }, + "test.pileups.table:md5,2d7ce4a54df6b9249e12d60737a67dac" + ] + ], + "versions": [ + "versions.yml:md5,e06a2d6fcf646da962971eab2035b3ee" + ] + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.7" + }, + "timestamp": "2025-09-15T15:15:25.628760353" + } +} \ No newline at end of file diff --git a/modules/nf-core/svdb/merge/main.nf b/modules/nf-core/svdb/merge/main.nf index 104f5ad79..75b1373b6 100644 --- a/modules/nf-core/svdb/merge/main.nf +++ b/modules/nf-core/svdb/merge/main.nf @@ -102,4 +102,4 @@ process SVDB_MERGE { bcftools: \$(bcftools --version 2>&1 | head -n1 | sed 's/^.*bcftools //; s/ .*\$//') END_VERSIONS """ -} +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 159a6cbd3..0b59d3b00 100644 --- a/nextflow.config +++ b/nextflow.config @@ -32,6 +32,7 @@ params { run_mt_for_wes = false run_rtgvcfeval = false save_mapped_as_cram = false + run_contamination = false scatter_count = 20 skip_tools = null skip_subworkflows = null @@ -91,6 +92,8 @@ params { vcfanno_lua = null vep_cache = null vep_plugin_files = null + contamination_sites = null + contamination_sites_tbi = null modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/raredisease' @@ -486,6 +489,8 @@ includeConfig 'conf/modules/variant_evaluation.config' includeConfig 'conf/modules/subsample_mt_frac.config' includeConfig 'conf/modules/subsample_mt_reads.config' includeConfig 'conf/modules/annotate_rhocallviz.config' +includeConfig 'conf/modules/contamination.config' + // Nextflow plugins plugins { diff --git a/nextflow_schema.json b/nextflow_schema.json index 357b122e6..8ee70783b 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -443,6 +443,30 @@ "description": "Path to UD matrix file from SVD result of genotype matrix. Used by verifybamid2.", "fa_icon": "fas fa-file-csv" }, + "run_contamination": { + "type": "boolean", + "default": false, + "description": "Run GATK contamination check in addition to VerifyBamID2.", + "help_text": "GATK contamination check works better for WES samples and provides validation for WGS.", + "fa_icon": "fas fa-toggle-on" + }, + "contamination_sites": { + "type": "string", + "exists": true, + "format": "file-path", + "pattern": "^\\S+\\.vcf(\\.gz)?$", + "description": "Path to VCF file with common variants for GATK contamination detection.", + "help_text": "Use small_exac_common_3.hg38.vcf.gz for no_alt reference genomes. This file should contain common SNP sites with population allele frequencies.", + "fa_icon": "fas fa-file" + }, + "contamination_sites_tbi": { + "type": "string", + "exists": true, + "format": "file-path", + "pattern": "^\\S+\\.vcf(\\.gz)?\\.tbi$", + "description": "Index file for contamination sites VCF.", + "fa_icon": "fas fa-file" + }, "vcf2cytosure_blacklist": { "type": "string", "exists": true, diff --git a/subworkflows/local/contamination_check/main.nf b/subworkflows/local/contamination_check/main.nf new file mode 100644 index 000000000..e9ef9c757 --- /dev/null +++ b/subworkflows/local/contamination_check/main.nf @@ -0,0 +1,67 @@ +// +// Subworkflow: Check sample contamination using GATK +// + +include { GATK4_GETPILEUPSUMMARIES } from '../../../modules/nf-core/gatk4/getpileupsummaries/main' +include { GATK4_CALCULATECONTAMINATION } from '../../../modules/nf-core/gatk4/calculatecontamination/main' + +workflow CONTAMINATION_CHECK { + + take: + ch_bam // channel: [ val(meta), path(bam), path(bai) ] + ch_fasta // channel: [ val(meta), path(fasta) ] + ch_fai // channel: [ val(meta), path(fai) ] + ch_dict // channel: [ val(meta), path(dict) ] + ch_contamination_vcf // channel: [ path(vcf), path(tbi) ] + ch_intervals // channel: [ path(bed) ] - only used for WES, empty for WGS + + main: + ch_versions = Channel.empty() + + // Prepare BAM with intervals - conditionally based on analysis type + // For WGS: intervals will be empty [], for WES: intervals will contain the BED file + ch_bam_with_intervals = ch_bam + .combine(ch_intervals.ifEmpty([[]])) + .map { meta, bam, bai, bed -> + // If bed is an empty list, pass empty list; otherwise pass bed file + def intervals = (bed instanceof List && bed.isEmpty()) ? [] : (bed ?: []) + [ meta, bam, bai, intervals ] + } + + // Separate VCF and TBI - collect them to make value channels + ch_variants_vcf = ch_contamination_vcf + .map { vcf, tbi -> vcf } + .collect() + + ch_variants_tbi = ch_contamination_vcf + .map { vcf, tbi -> tbi } + .collect() + + // Run GetPileupSummaries + GATK4_GETPILEUPSUMMARIES ( + ch_bam_with_intervals, // [meta, bam, bai, intervals] + ch_fasta, // [meta2, fasta] + ch_fai, // [meta3, fai] + ch_dict, // [meta4, dict] + ch_variants_vcf, // path(vcf) + ch_variants_tbi // path(tbi) + ) + ch_versions = ch_versions.mix(GATK4_GETPILEUPSUMMARIES.out.versions.first()) + + // Run CalculateContamination (tumor-only, no matched normal) + // Format: [meta, pileup_table, matched_normal_table] + // matched_normal_table is empty [] for tumor-only mode + ch_contamination_input = GATK4_GETPILEUPSUMMARIES.out.table + .map { meta, table -> [ meta, table, [] ] } + + GATK4_CALCULATECONTAMINATION ( + ch_contamination_input + ) + ch_versions = ch_versions.mix(GATK4_CALCULATECONTAMINATION.out.versions.first()) + + emit: + contamination_table = GATK4_CALCULATECONTAMINATION.out.contamination + segmentation_table = GATK4_CALCULATECONTAMINATION.out.segmentation + pileup_table = GATK4_GETPILEUPSUMMARIES.out.table + versions = ch_versions +} diff --git a/subworkflows/local/contamination_check/meta.yml b/subworkflows/local/contamination_check/meta.yml new file mode 100644 index 000000000..ab7f5937b --- /dev/null +++ b/subworkflows/local/contamination_check/meta.yml @@ -0,0 +1,41 @@ +name: contamination_check +description: Estimate sample contamination using GATK GetPileupSummaries and CalculateContamination +keywords: + - contamination + - QC + - GATK +components: + - gatk4/getpileupsummaries + - gatk4/calculatecontamination +input: + - ch_bam: + type: file + description: BAM files with their indices + pattern: "*.{bam,cram}" + - ch_fasta: + type: file + description: Reference genome FASTA file + pattern: "*.{fa,fasta}" + - ch_contamination_vcf: + type: file + description: VCF file containing common germline variants (gnomAD) + pattern: "*.vcf.gz" + - ch_intervals: + type: file + description: BED file with target intervals + pattern: "*.bed" +output: + - contamination_table: + type: file + description: Table containing contamination estimates + pattern: "*.contamination.table" + - segmentation_table: + type: file + description: Table containing tumor segmentation + pattern: "*.segmentation.table" + - versions: + type: file + description: File containing software versions + pattern: "versions.yml" +authors: + - "56053vujinovic" diff --git a/subworkflows/local/contamination_check/tests/main.nf.test b/subworkflows/local/contamination_check/tests/main.nf.test new file mode 100644 index 000000000..2909dccd7 --- /dev/null +++ b/subworkflows/local/contamination_check/tests/main.nf.test @@ -0,0 +1,112 @@ +nextflow_workflow { + + name "Test Workflow CONTAMINATION_CHECK" + script "subworkflows/local/contamination_check/main.nf" + workflow "CONTAMINATION_CHECK" + tag "subworkflows" + tag "contamination_check" + config "./nextflow.config" + + test("CONTAMINATION_CHECK - WGS, no intervals") { + + when { + params { + outdir = "$outputDir" + } + workflow { + """ + input[0] = channel.of( + [ + [id:'earlycasualcaiman', sample:'earlycasualcaiman', single_end:false, sex:1, phenotype:1, case_id:'justhusky'], + file(params.pipelines_testdata_base_path + 'testdata/earlycasualcaiman_sorted_md.bam', checkIfExists: true), + file(params.pipelines_testdata_base_path + 'testdata/earlycasualcaiman_sorted_md.bam.bai', checkIfExists: true) + ] + ) + input[1] = channel.of([id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta', checkIfExists: true)).collect() + input[2] = channel.of([id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true)).collect() + input[3] = channel.of([id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.dict', checkIfExists: true)).collect() + input[4] = channel.of( + [ + file(params.pipelines_testdata_base_path + 'reference/grch37_gnomad_-r2.1.1-.vcf.gz', checkIfExists: true), + file(params.pipelines_testdata_base_path + 'reference/grch37_gnomad_-r2.1.1-.vcf.gz.tbi', checkIfExists: true) + ] + ).collect() + input[5] = Channel.empty() + """ + } + } + + then { + assertAll ( + { assert workflow.success }, + { assert snapshot( + workflow.out.contamination_table + .collect { it[1] } + .flatten() + .collect { file(it).name } + .toSorted(), + workflow.out.pileup_table + .collect { it[1] } + .flatten() + .collect { file(it).name } + .toSorted(), + workflow.out.versions + ).match() + } + ) + } + + } + + test("CONTAMINATION_CHECK - WES, with intervals") { + + when { + params { + outdir = "$outputDir" + } + workflow { + """ + input[0] = channel.of( + [ + [id:'earlycasualcaiman', sample:'earlycasualcaiman', single_end:false, sex:1, phenotype:1, case_id:'justhusky'], + file(params.pipelines_testdata_base_path + 'testdata/earlycasualcaiman_sorted_md.bam', checkIfExists: true), + file(params.pipelines_testdata_base_path + 'testdata/earlycasualcaiman_sorted_md.bam.bai', checkIfExists: true) + ] + ) + input[1] = channel.of([id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta', checkIfExists: true)).collect() + input[2] = channel.of([id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true)).collect() + input[3] = channel.of([id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.dict', checkIfExists: true)).collect() + input[4] = channel.of( + [ + file(params.pipelines_testdata_base_path + 'reference/grch37_gnomad_-r2.1.1-.vcf.gz', checkIfExists: true), + file(params.pipelines_testdata_base_path + 'reference/grch37_gnomad_-r2.1.1-.vcf.gz.tbi', checkIfExists: true) + ] + ).collect() + input[5] = Channel.fromPath(params.pipelines_testdata_base_path + 'reference/target.bed', checkIfExists: true).collect() + """ + } + } + + then { + assertAll ( + { assert workflow.success }, + { assert snapshot( + workflow.out.contamination_table + .collect { it[1] } + .flatten() + .collect { file(it).name } + .toSorted(), + workflow.out.pileup_table + .collect { it[1] } + .flatten() + .collect { file(it).name } + .toSorted(), + workflow.out.versions + ).match() + } + ) + } + + } + +} diff --git a/subworkflows/local/contamination_check/tests/main.nf.test.snap b/subworkflows/local/contamination_check/tests/main.nf.test.snap new file mode 100644 index 000000000..c9b91a552 --- /dev/null +++ b/subworkflows/local/contamination_check/tests/main.nf.test.snap @@ -0,0 +1,40 @@ +{ + "CONTAMINATION_CHECK - WGS, no intervals": { + "content": [ + [ + "earlycasualcaiman_contamination.contamination.table" + ], + [ + "earlycasualcaiman_pileups.pileups.table" + ], + [ + "versions.yml:md5,46fc6219f36f2e8e436a824f3c0f9a4a", + "versions.yml:md5,dd08885235dcc7e9f408eba91ef7303e" + ] + ], + "timestamp": "2026-03-23T14:29:41.579345381", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + }, + "CONTAMINATION_CHECK - WES, with intervals": { + "content": [ + [ + "earlycasualcaiman_contamination.contamination.table" + ], + [ + "earlycasualcaiman_pileups.pileups.table" + ], + [ + "versions.yml:md5,46fc6219f36f2e8e436a824f3c0f9a4a", + "versions.yml:md5,dd08885235dcc7e9f408eba91ef7303e" + ] + ], + "timestamp": "2026-03-23T14:31:02.497064504", + "meta": { + "nf-test": "0.9.5", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/subworkflows/local/contamination_check/tests/nextflow.config b/subworkflows/local/contamination_check/tests/nextflow.config new file mode 100644 index 000000000..3a5a56e5c --- /dev/null +++ b/subworkflows/local/contamination_check/tests/nextflow.config @@ -0,0 +1,12 @@ +process { + + withName: '.*CONTAMINATION_CHECK:GATK4_GETPILEUPSUMMARIES' { + ext.args = '--disable-sequence-dictionary-validation' + ext.prefix = { "${meta.id}_pileups" } + } + + withName: '.*CONTAMINATION_CHECK:GATK4_CALCULATECONTAMINATION' { + ext.args = '' + ext.prefix = { "${meta.id}_contamination" } + } +} diff --git a/workflows/raredisease.nf b/workflows/raredisease.nf index 7d7bd3d24..460a10366 100644 --- a/workflows/raredisease.nf +++ b/workflows/raredisease.nf @@ -27,13 +27,15 @@ include { SPRING_DECOMPRESS as SPRING_DECOMPRESS_TO_R1_FQ } from '../modules/n include { SPRING_DECOMPRESS as SPRING_DECOMPRESS_TO_R2_FQ } from '../modules/nf-core/spring/decompress/main' include { SPRING_DECOMPRESS as SPRING_DECOMPRESS_TO_FQ_PAIR } from '../modules/nf-core/spring/decompress/main' include { STRANGER } from '../modules/nf-core/stranger/main' - // // MODULE: Local modules // include { RENAME_ALIGN_FILES as RENAME_BAM } from '../modules/local/rename_align_files' include { RENAME_ALIGN_FILES as RENAME_BAI } from '../modules/local/rename_align_files' +include { CREATE_HGNCIDS_FILE } from '../modules/local/create_hgncids_file' +include { CREATE_PEDIGREE_FILE } from '../modules/local/create_pedigree_file' +include { PARSE_CONTAMINATION } from '../modules/local/parse_contamination/main' // // SUBWORKFLOWS @@ -52,6 +54,7 @@ include { CALL_MOBILE_ELEMENTS } from '.. include { CALL_REPEAT_EXPANSIONS } from '../subworkflows/local/call_repeat_expansions' include { CALL_SNV } from '../subworkflows/local/call_snv' include { CALL_STRUCTURAL_VARIANTS } from '../subworkflows/local/call_structural_variants' +include { CONTAMINATION_CHECK } from '../subworkflows/local/contamination_check/main' include { GENERATE_CYTOSURE_FILES } from '../subworkflows/local/generate_cytosure_files' include { GENS } from '../subworkflows/local/gens' include { PREPARE_REFERENCES } from '../subworkflows/local/prepare_references' @@ -294,7 +297,53 @@ workflow RAREDISEASE { ) ch_versions = ch_versions.mix(QC_BAM.out.versions) + // + // SUBWORKFLOW: Check for contamination using GATK + // + ch_contamination_mqc = Channel.empty() + + if (params.run_contamination && params.contamination_sites) { + + log.info "=== CONTAMINATION CHECK ENABLED ===" + log.info "Analysis type: ${params.analysis_type}" + + // Prepare contamination sites channel + ch_contamination_sites = Channel.of([ + file(params.contamination_sites, checkIfExists: true), + file(params.contamination_sites_tbi, checkIfExists: true) + ]).collect() + + // Prepare intervals channel - CRITICAL: Use Channel.empty() for WGS, not Channel.of([]) + if (params.analysis_type == 'wes' && params.target_bed) { + ch_intervals_contamination = Channel.fromPath(params.target_bed).collect() + log.info "Using target BED for WES contamination check" + } else { + ch_intervals_contamination = Channel.empty() + log.info "No intervals for WGS contamination check (genome-wide)" + } + + // Prepare BAM input with BAI + ch_bam_for_contamination = ch_mapped.genome_marked_bam + .join(ch_mapped.genome_marked_bai, failOnMismatch:true, failOnDuplicate:true) + CONTAMINATION_CHECK ( + ch_bam_for_contamination, + ch_genome_fasta, + ch_genome_fai, + ch_genome_dictionary, + ch_contamination_sites, + ch_intervals_contamination + ) + + // Parse for MultiQC + PARSE_CONTAMINATION ( + CONTAMINATION_CHECK.out.contamination_table + ) + + ch_contamination_mqc = PARSE_CONTAMINATION.out.mqc_table + ch_versions = ch_versions.mix(CONTAMINATION_CHECK.out.versions) + ch_versions = ch_versions.mix(PARSE_CONTAMINATION.out.versions) + } /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ RENAME ALIGNMENT FILES FOR SMNCOPYNUMBERCALLER & REPEATCALLING @@ -848,6 +897,9 @@ workflow RAREDISEASE { ch_multiqc_files = ch_multiqc_files.mix(QC_BAM.out.global_dist.map{_meta, reports -> reports}.collect().ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(QC_BAM.out.cov.map{_meta, reports -> reports}.collect().ifEmpty([])) ch_multiqc_files = ch_multiqc_files.mix(QC_BAM.out.self_sm.map{_meta, reports -> reports}.collect().ifEmpty([])) + + // Add contamination results to MultiQC + ch_multiqc_files = ch_multiqc_files.mix(ch_contamination_mqc.map { _meta, file -> file }) if (!skip_peddy) { ch_multiqc_files = ch_multiqc_files.mix(PEDDY.out.ped.map{_meta, reports -> reports}.collect().ifEmpty([]))