From f7d2a3d08475c6ab781a1682fda57bd5dd9e0478 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Wed, 11 Feb 2026 15:35:14 +0100 Subject: [PATCH 01/23] add cadd -raw --- assets/cadd_to_vcf_header_-1.0-.txt | 1 + conf/test.config | 4 ++ modules/nf-core/cadd/environment.yml | 9 +++ modules/nf-core/cadd/main.nf | 63 +++++++++++++++++ modules/nf-core/cadd/meta.yml | 72 ++++++++++++++++++++ modules/nf-core/cadd/tests/main.nf.test | 37 ++++++++++ modules/nf-core/cadd/tests/main.nf.test.snap | 44 ++++++++++++ nextflow.config | 5 ++ nextflow_schema.json | 17 +++++ subworkflows/local/annotate_cadd/main.nf | 44 ++++++++++++ workflows/oncorefiner.nf | 35 +++++++++- 11 files changed, 329 insertions(+), 2 deletions(-) create mode 100644 assets/cadd_to_vcf_header_-1.0-.txt create mode 100644 modules/nf-core/cadd/environment.yml create mode 100644 modules/nf-core/cadd/main.nf create mode 100644 modules/nf-core/cadd/meta.yml create mode 100644 modules/nf-core/cadd/tests/main.nf.test create mode 100644 modules/nf-core/cadd/tests/main.nf.test.snap create mode 100644 subworkflows/local/annotate_cadd/main.nf diff --git a/assets/cadd_to_vcf_header_-1.0-.txt b/assets/cadd_to_vcf_header_-1.0-.txt new file mode 100644 index 0000000..8deee48 --- /dev/null +++ b/assets/cadd_to_vcf_header_-1.0-.txt @@ -0,0 +1 @@ +##INFO= diff --git a/conf/test.config b/conf/test.config index e6cf193..e21a556 100644 --- a/conf/test.config +++ b/conf/test.config @@ -46,4 +46,8 @@ params { svdb_query_dbs = params.pipelines_testdata_base_path + 'reference/svdb_querydb_files.csv' + // Mock input for CADD + cadd_resources = params.pipelines_testdata_base_path + "/assets" //TODO add + cadd_prescored_indels = params.pipelines_testdata_base_path + "docs" //TODO add + } diff --git a/modules/nf-core/cadd/environment.yml b/modules/nf-core/cadd/environment.yml new file mode 100644 index 0000000..d98de65 --- /dev/null +++ b/modules/nf-core/cadd/environment.yml @@ -0,0 +1,9 @@ +--- +# yaml-language-server: 
$schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::cadd-scripts=1.6.post1 + - conda-forge::conda=4.14.0 + - conda-forge::mamba=1.4.0 diff --git a/modules/nf-core/cadd/main.nf b/modules/nf-core/cadd/main.nf new file mode 100644 index 0000000..0e3c79b --- /dev/null +++ b/modules/nf-core/cadd/main.nf @@ -0,0 +1,63 @@ +process CADD { + tag "${meta.id}" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container 'docker.io/biocontainers/cadd-scripts-with-envs:1.6.post1_cv1' + + containerOptions { + if (prescored_dir) { + ['singularity', 'apptainer'].contains(workflow.containerEngine) ? + "-B ${annotation_dir}:/opt/CADD-scripts-1.6.post1/data/annotations -B ${prescored_dir}:/opt/CADD-scripts-1.6.post1/data/prescored" : + "-v ${annotation_dir}:/opt/CADD-scripts-1.6.post1/data/annotations -v ${prescored_dir}:/opt/CADD-scripts-1.6.post1/data/prescored" + } else { + ['singularity', 'apptainer'].contains(workflow.containerEngine) ? + "-B ${annotation_dir}:/opt/CADD-scripts-1.6.post1/data/annotations" : + "-v ${annotation_dir}:/opt/CADD-scripts-1.6.post1/data/annotations" + } + } + + input: + tuple val(meta), path(vcf) + tuple val(meta2), path(annotation_dir) + + output: + tuple val(meta), path("*.tsv.gz"), emit: tsv + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = "1.6.post1" + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. 
+ """ + export XDG_CACHE_HOME=\$PWD/snakemake_cache + mkdir -p \$XDG_CACHE_HOME + + cadd.sh \\ + -o ${prefix}.tsv.gz \\ + ${args} \\ + ${vcf} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cadd: ${VERSION} + END_VERSIONS + """ + + stub: + def prefix = task.ext.prefix ?: "${meta.id}" + def VERSION = "1.6.post1" + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + """ + echo "" | gzip > ${prefix}.tsv.gz + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + cadd: ${VERSION} + END_VERSIONS + """ +} diff --git a/modules/nf-core/cadd/meta.yml b/modules/nf-core/cadd/meta.yml new file mode 100644 index 0000000..60c863c --- /dev/null +++ b/modules/nf-core/cadd/meta.yml @@ -0,0 +1,72 @@ +name: "cadd" +description: CADD is a tool for scoring the deleteriousness of single nucleotide variants + as well as insertion/deletions variants in the human genome. +keywords: + - cadd + - annotate + - variants +tools: + - "cadd": + description: "CADD scripts release for offline scoring" + homepage: "https://cadd.gs.washington.edu/" + documentation: "https://github.com/kircherlab/CADD-scripts/blob/master/README.md" + tool_dev_url: "https://github.com/kircherlab/CADD-scripts/" + doi: "10.1093/nar/gky1016" + licence: + - Restricted. Free for non-commercial users. + identifier: biotools:cadd_phred +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: Input file for annotation in vcf or vcf.gz format + pattern: "*.{vcf,vcf.gz}" + ontologies: [] + - - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - annotation_dir: + type: directory + description: | + Path to folder containing the vcf files with precomputed CADD scores. 
+ This folder contains the uncompressed files that would otherwise be in data/annotation folder as described in https://github.com/kircherlab/CADD-scripts/#manual-installation. + - - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - prescored_dir: + type: directory + description: | + Path to folder containing prescored files. + This folder contains the uncompressed files that would otherwise be in data/prescored/${GENOME_BUILD}_${VERSION}/ folder as described in https://github.com/kircherlab/CADD-scripts/#manual-installation. +output: + tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.tsv.gz": + type: file + description: Annotated tsv file + pattern: "*.{tsv,tsv.gz}" + ontologies: + - edam: http://edamontology.org/format_3475 # TSV + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@ramprasadn" +maintainers: + - "@ramprasadn" diff --git a/modules/nf-core/cadd/tests/main.nf.test b/modules/nf-core/cadd/tests/main.nf.test new file mode 100644 index 0000000..cc36d0c --- /dev/null +++ b/modules/nf-core/cadd/tests/main.nf.test @@ -0,0 +1,37 @@ +nextflow_process { + + name "Test Process CADD" + + script "../main.nf" + process "CADD" + + tag "modules" + tag "modules_nfcore" + tag "cadd" + + test("test_cadd - stub") { + options '-stub' + when { + + process { + """ + input[0] = [ + [id:'test',single_end:false],// meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gvcf/test.genome.vcf',checkIfExists:true) + ] + input[1] = Channel.from("\$PWD").map { dir -> [ [ id: dir ], dir ] } + input[2] = Channel.from("/").map { dir -> [ [ id: dir ], dir ] } + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out, + 
process.out.versions.collect{ path(it).yaml } + ).match() } + ) + } + } +} diff --git a/modules/nf-core/cadd/tests/main.nf.test.snap b/modules/nf-core/cadd/tests/main.nf.test.snap new file mode 100644 index 0000000..15a0fa1 --- /dev/null +++ b/modules/nf-core/cadd/tests/main.nf.test.snap @@ -0,0 +1,44 @@ +{ + "test_cadd - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + "versions.yml:md5,ef02d93c7627a5a20a25326b5d7ebffc" + ], + "tsv": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions": [ + "versions.yml:md5,ef02d93c7627a5a20a25326b5d7ebffc" + ] + }, + [ + { + "CADD": { + "cadd": "1.6.post1" + } + } + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.5" + }, + "timestamp": "2025-04-16T09:56:33.347204138" + } +} \ No newline at end of file diff --git a/nextflow.config b/nextflow.config index 43091a6..9e791da 100644 --- a/nextflow.config +++ b/nextflow.config @@ -18,6 +18,11 @@ params { snv_vcf = null sv_vcf = null + // CADD + cadd_resources = null + cadd_prescored_indels = null + + // Vep vep_cache_version = 112 vep_plugin_files = null diff --git a/nextflow_schema.json b/nextflow_schema.json index b9dcf56..76265a4 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -88,7 +88,24 @@ "fa_icon": "fas fa-ban", "hidden": true, "default": "s3://ngi-igenomes/igenomes/" + }, + "cadd_prescored_indels": { + "type": "string", + "exists": true, + "format": "directory-path", + "fa_icon": "fas fa-file", + "description": "Path to a directory containing prescored indels for CADD.", + "help_text": "This folder contains the compressed files and indexes that would otherwise be in data/prescored folder as described in https://github.com/kircherlab/CADD-scripts/#manual-installation." 
+ }, + "cadd_resources": { + "type": "string", + "exists": true, + "format": "directory-path", + "fa_icon": "fas fa-file", + "description": "Path to the directory containing cadd annotations.", + "help_text": "This folder contains the uncompressed files that would otherwise be in data/annotation folder as described in https://github.com/kircherlab/CADD-scripts/#manual-installation." } + } }, "annotation_options": { diff --git a/subworkflows/local/annotate_cadd/main.nf b/subworkflows/local/annotate_cadd/main.nf new file mode 100644 index 0000000..a3097b9 --- /dev/null +++ b/subworkflows/local/annotate_cadd/main.nf @@ -0,0 +1,44 @@ +// +// A subworkflow to annotate cadd +// + +include { BCFTOOLS_ANNOTATE } from '../../../modules/nf-core/bcftools/annotate/main' +include { CADD } from '../../../modules/nf-core/cadd/main' +include { TABIX_TABIX as TABIX_CADD } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_ANNOTATE } from '../../../modules/nf-core/tabix/tabix/main' + + +workflow ANNOTATE_CADD { + + take: + ch_snv_vcf // channel: [mandatory] [ val(meta), path(vcfs), path(idx) ] + ch_cadd_header // channel: [mandatory] [ path(txt) ] + ch_cadd_resources // channel: [mandatory] [ path(dir) ] + ch_cadd_prescored_indels // channel: [mandatory] [ val(meta), path(dir) ] + + main: + ch_versions = channel.empty() + + CADD(ch_snv_vcf, ch_cadd_resources, ch_cadd_prescored_indels) + + TABIX_CADD(CADD.out.tsv) + + ch_snv_vcf + .join(CADD.out.tsv) + .join(TABIX_CADD.out.tbi) + .set { ch_annotate_in } + + BCFTOOLS_ANNOTATE(ch_annotate_in, ch_cadd_header ) + + TABIX_ANNOTATE (BCFTOOLS_ANNOTATE.out.vcf) + + ch_versions = ch_versions.mix(CADD.out.versions.first()) + ch_versions = ch_versions.mix(TABIX_CADD.out.versions.first()) + ch_versions = ch_versions.mix(BCFTOOLS_ANNOTATE.out.versions.first()) + ch_versions = ch_versions.mix(TABIX_ANNOTATE.out.versions.first()) + + emit: + vcf = BCFTOOLS_ANNOTATE.out.vcf // channel: [ val(meta), path(vcf) ] + tbi = 
TABIX_ANNOTATE.out.tbi + versions = ch_versions +} diff --git a/workflows/oncorefiner.nf b/workflows/oncorefiner.nf index c9df030..2d57e5a 100644 --- a/workflows/oncorefiner.nf +++ b/workflows/oncorefiner.nf @@ -19,6 +19,7 @@ include { SVDB_QUERY as SVDB_QUERY_DB } from '../modules/nf-core/sv include { ENSEMBLVEP_VEP as ENSEMBLVEP_SV } from '../modules/nf-core/ensemblvep/vep/main' include { BCFTOOLS_VIEW as RESEARCH_FILTERING_SV } from '../modules/nf-core/bcftools/view/main' include { BCFTOOLS_VIEW as CLINICAL_FILTERING_SV } from '../modules/nf-core/bcftools/view/main' +include { TABIX_TABIX as TABIX_RESEARCH_FILTERING } from '../modules/nf-core/tabix/tabix/main' // // MODULE: Local modules @@ -33,6 +34,7 @@ include { paramsSummaryMultiqc } from '../subworkflows/nf-core/utils_nfcore_pi include { softwareVersionsToYAML } from '../subworkflows/nf-core/utils_nfcore_pipeline' include { methodsDescriptionText } from '../subworkflows/local/utils_nfcore_oncorefiner_pipeline' include { PREPARE_REFERENCES } from '../subworkflows/local/prepare_references' +include { ANNOTATE_CADD } from '../subworkflows/local/annotate_cadd' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -71,7 +73,13 @@ workflow ONCOREFINER { // Gather or get from params ch_vep_cache = ( params.vep_cache && params.vep_cache.endsWith("tar.gz") ) ? ch_references.vep_resources - : ( params.vep_cache ? channel.fromPath(params.vep_cache).collect() : channel.value([]) ) + : ( params.vep_cache ? channel.fromPath(params.vep_cache).collect() : channel.value([]) ) + + ch_cadd_header = Channel.fromPath("$projectDir/assets/cadd_to_vcf_header_-1.0-.txt", checkIfExists: true).collect() + ch_cadd_resources = params.cadd_resources ? 
Channel.fromPath(params.cadd_resources).collect() + : Channel.value([]) + ch_cadd_prescored_indels = createReferenceChannelFromPath(params.cadd_prescored_indels) // align with above + // // Read and store paths in the vep_plugin_files file @@ -133,16 +141,39 @@ workflow ONCOREFINER { tuple(meta, vcf, tbi) } .set { ch_research_filtering_in } + RESEARCH_FILTERING(ch_research_filtering_in, [], [], []) + /* // VEP RESEARCH_FILTERING.out.vcf .map { meta, vcf -> def custom_extra_files = params.custom_extra_files ? file(params.custom_extra_files) : [] tuple(meta, vcf, custom_extra_files) } - .set { ch_vep_snv } + .set { ch_cadd_snv } + */ + + + // ANNOTATE WITH CADD + if (params.cadd_resources != null) { + TABIX_RESEARCH_FILTERING (RESEARCH_FILTERING.out.vcf) + + RESEARCH_FILTERING.out.vcf + .join(TABIX_RESEARCH_FILTERING.out.tbi, failOnMismatch:true, failOnDuplicate:true) + .set {ch_cadd_snv} + + ANNOTATE_CADD ( + ch_cadd_snv, + ch_cadd_header, + ch_cadd_resources, + ch_cadd_prescored_indels + ) + ch_vep_snv = ANNOTATE_CADD.out.vcf + ch_versions = ch_versions.mix(ANNOTATE_CADD.out.versions) + + } ENSEMBLVEP_SNV ( ch_vep_snv, From 45cb012cf9887f6aa3bbdae82a8aae1b946b0682 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Mon, 23 Mar 2026 15:15:20 +0100 Subject: [PATCH 02/23] update --- conf/subworkflows/annotate_cadd.config | 9 ++++ conf/test.config | 5 ++- modules.json | 5 +++ modules/nf-core/cadd/environment.yml | 4 +- modules/nf-core/cadd/main.nf | 40 +++++++---------- modules/nf-core/cadd/meta.yml | 46 +++++++++++++++----- modules/nf-core/cadd/tests/main.nf.test | 2 + modules/nf-core/cadd/tests/main.nf.test.snap | 28 +++++++----- modules/nf-core/cadd/tests/nextflow.config | 5 +++ nextflow.config | 1 + workflows/oncorefiner.nf | 15 +++---- 11 files changed, 97 insertions(+), 63 deletions(-) create mode 100644 conf/subworkflows/annotate_cadd.config create mode 100644 modules/nf-core/cadd/tests/nextflow.config diff --git a/conf/subworkflows/annotate_cadd.config 
b/conf/subworkflows/annotate_cadd.config new file mode 100644 index 0000000..635799a --- /dev/null +++ b/conf/subworkflows/annotate_cadd.config @@ -0,0 +1,9 @@ +/* +Annotate with CADD +*/ + + +process { + + +} diff --git a/conf/test.config b/conf/test.config index 0836cf8..4a750f4 100644 --- a/conf/test.config +++ b/conf/test.config @@ -47,7 +47,8 @@ params { svdb_query_dbs = params.pipelines_testdata_base_path + 'reference/svdb_querydb_files.csv' // Mock input for CADD - cadd_resources = params.pipelines_testdata_base_path + "/assets" //TODO add - cadd_prescored_indels = params.pipelines_testdata_base_path + "docs" //TODO add + cadd_resources = '../test-datasets' + //cadd_resources = params.pipelines_testdata_base_path + "assets" //TODO add + //cadd_prescored_indels = params.pipelines_testdata_base_path + "docs" //TODO add } diff --git a/modules.json b/modules.json index 8a4a313..ab7e9cb 100644 --- a/modules.json +++ b/modules.json @@ -25,6 +25,11 @@ "git_sha": "6383d8fe58f9498eecd5aa303e71a4a932d1e9f6", "installed_by": ["modules"] }, + "cadd": { + "branch": "master", + "git_sha": "64ab14a6905e5c9d649f61e2757a1e600dbdb8e0", + "installed_by": ["modules"] + }, "ensemblvep/vep": { "branch": "master", "git_sha": "34505e1fc5e9f4fd641210ca440acff6bd33b842", diff --git a/modules/nf-core/cadd/environment.yml b/modules/nf-core/cadd/environment.yml index d98de65..39701b4 100644 --- a/modules/nf-core/cadd/environment.yml +++ b/modules/nf-core/cadd/environment.yml @@ -4,6 +4,4 @@ channels: - conda-forge - bioconda dependencies: - - bioconda::cadd-scripts=1.6.post1 - - conda-forge::conda=4.14.0 - - conda-forge::mamba=1.4.0 + - bioconda::cadd-scripts=1.7.3 diff --git a/modules/nf-core/cadd/main.nf b/modules/nf-core/cadd/main.nf index 0e3c79b..771d144 100644 --- a/modules/nf-core/cadd/main.nf +++ b/modules/nf-core/cadd/main.nf @@ -3,61 +3,51 @@ process CADD { label 'process_medium' conda "${moduleDir}/environment.yml" - container 
'docker.io/biocontainers/cadd-scripts-with-envs:1.6.post1_cv1' + container 'docker.io/clinicalgenomics/cadd-with-scripts:1.7.3' containerOptions { if (prescored_dir) { ['singularity', 'apptainer'].contains(workflow.containerEngine) ? - "-B ${annotation_dir}:/opt/CADD-scripts-1.6.post1/data/annotations -B ${prescored_dir}:/opt/CADD-scripts-1.6.post1/data/prescored" : - "-v ${annotation_dir}:/opt/CADD-scripts-1.6.post1/data/annotations -v ${prescored_dir}:/opt/CADD-scripts-1.6.post1/data/prescored" + "-B ${annotation_dir}:/cadd-scripts/data/annotations -B ${prescored_dir}:/cadd-scripts/data/prescored" : + "-v ${annotation_dir}:/cadd-scripts/data/annotations -v ${prescored_dir}:/cadd-scripts/data/prescored" } else { ['singularity', 'apptainer'].contains(workflow.containerEngine) ? - "-B ${annotation_dir}:/opt/CADD-scripts-1.6.post1/data/annotations" : - "-v ${annotation_dir}:/opt/CADD-scripts-1.6.post1/data/annotations" + "-B ${annotation_dir}:/cadd-scripts/data/annotations" : + "-v ${annotation_dir}:/cadd-scripts/data/annotations" } } input: tuple val(meta), path(vcf) - tuple val(meta2), path(annotation_dir) + tuple val(meta2), val(annotation_dir) + tuple val(meta3), val(prescored_dir) output: - tuple val(meta), path("*.tsv.gz"), emit: tsv - path "versions.yml" , emit: versions + tuple val(meta), path("${prefix}.tsv.gz"), emit: tsv + tuple val("${task.process}"), val("cadd"), val("1.7.3"), emit: versions_cadd, topic: versions + // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. when: task.ext.when == null || task.ext.when script: def args = task.ext.args ?: '' - def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = "1.6.post1" - // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + prefix = task.ext.prefix ?: "${meta.id}" """ export XDG_CACHE_HOME=\$PWD/snakemake_cache + export MPLCONFIGDIR=. 
mkdir -p \$XDG_CACHE_HOME - cadd.sh \\ + CADD.sh \\ + -m \\ -o ${prefix}.tsv.gz \\ ${args} \\ ${vcf} - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - cadd: ${VERSION} - END_VERSIONS """ stub: - def prefix = task.ext.prefix ?: "${meta.id}" - def VERSION = "1.6.post1" - // WARN: Version information not provided by tool on CLI. Please update version string below when bumping container versions. + prefix = task.ext.prefix ?: "${meta.id}" """ echo "" | gzip > ${prefix}.tsv.gz - - cat <<-END_VERSIONS > versions.yml - "${task.process}": - cadd: ${VERSION} - END_VERSIONS """ } diff --git a/modules/nf-core/cadd/meta.yml b/modules/nf-core/cadd/meta.yml index 60c863c..1efaa94 100644 --- a/modules/nf-core/cadd/meta.yml +++ b/modules/nf-core/cadd/meta.yml @@ -1,6 +1,6 @@ name: "cadd" -description: CADD is a tool for scoring the deleteriousness of single nucleotide variants - as well as insertion/deletions variants in the human genome. +description: CADD is a tool for scoring the deleteriousness of single nucleotide + variants as well as insertion/deletions variants in the human genome. keywords: - cadd - annotate @@ -44,8 +44,16 @@ input: - prescored_dir: type: directory description: | - Path to folder containing prescored files. - This folder contains the uncompressed files that would otherwise be in data/prescored/${GENOME_BUILD}_${VERSION}/ folder as described in https://github.com/kircherlab/CADD-scripts/#manual-installation. + Path to folder containing prescored CADD score files. + Expected structure mirrors data/prescored/ from the CADD-scripts installation: + / + GRCh38_v1.7/ + incl_anno/ # *.tsv.gz + *.tsv.gz.tbi (scores with annotations) + no_anno/ # *.tsv.gz + *.tsv.gz.tbi (scores only) + GRCh37_v1.7/ + incl_anno/ + no_anno/ + See https://github.com/kircherlab/CADD-scripts/#manual-installation for details. output: tsv: - - meta: @@ -53,19 +61,33 @@ output: description: | Groovy Map containing sample information e.g. 
[ id:'test', single_end:false ] - - "*.tsv.gz": + - ${prefix}.tsv.gz: type: file description: Annotated tsv file pattern: "*.{tsv,tsv.gz}" ontologies: - - edam: http://edamontology.org/format_3475 # TSV + - edam: http://edamontology.org/format_3475 + versions_cadd: + - - ${task.process}: + type: string + description: The name of the process + - cadd: + type: string + description: The name of the tool + - 1.7.3: + type: string + description: The expression to obtain the version of the tool +topics: versions: - - versions.yml: - type: file - description: File containing software versions - pattern: "versions.yml" - ontologies: - - edam: http://edamontology.org/format_3750 # YAML + - - ${task.process}: + type: string + description: The name of the process + - cadd: + type: string + description: The name of the tool + - 1.7.3: + type: string + description: The expression to obtain the version of the tool authors: - "@ramprasadn" maintainers: diff --git a/modules/nf-core/cadd/tests/main.nf.test b/modules/nf-core/cadd/tests/main.nf.test index cc36d0c..c328790 100644 --- a/modules/nf-core/cadd/tests/main.nf.test +++ b/modules/nf-core/cadd/tests/main.nf.test @@ -9,6 +9,8 @@ nextflow_process { tag "modules_nfcore" tag "cadd" + config "./nextflow.config" + test("test_cadd - stub") { options '-stub' when { diff --git a/modules/nf-core/cadd/tests/main.nf.test.snap b/modules/nf-core/cadd/tests/main.nf.test.snap index 15a0fa1..5e38eea 100644 --- a/modules/nf-core/cadd/tests/main.nf.test.snap +++ b/modules/nf-core/cadd/tests/main.nf.test.snap @@ -12,7 +12,11 @@ ] ], "1": [ - "versions.yml:md5,ef02d93c7627a5a20a25326b5d7ebffc" + [ + "CADD", + "cadd", + "1.7.3" + ] ], "tsv": [ [ @@ -23,22 +27,22 @@ "test.tsv.gz:md5,68b329da9893e34099c7d8ad5cb9c940" ] ], - "versions": [ - "versions.yml:md5,ef02d93c7627a5a20a25326b5d7ebffc" + "versions_cadd": [ + [ + "CADD", + "cadd", + "1.7.3" + ] ] }, [ - { - "CADD": { - "cadd": "1.6.post1" - } - } + ] ], + "timestamp": 
"2026-03-01T12:08:37.372500636", "meta": { - "nf-test": "0.9.2", - "nextflow": "24.10.5" - }, - "timestamp": "2025-04-16T09:56:33.347204138" + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } } } \ No newline at end of file diff --git a/modules/nf-core/cadd/tests/nextflow.config b/modules/nf-core/cadd/tests/nextflow.config new file mode 100644 index 0000000..bd24d9f --- /dev/null +++ b/modules/nf-core/cadd/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'CADD' { + container = "nf-core/ubuntu:22.04" //Using an basic container because v1.7.3 is too big for CI. + } +} diff --git a/nextflow.config b/nextflow.config index d96ef24..4df69fa 100644 --- a/nextflow.config +++ b/nextflow.config @@ -293,3 +293,4 @@ validation { // Load modules.config for DSL2 module specific options includeConfig 'conf/modules.config' includeConfig 'conf/modules/prepare_references.config' +includeConfig 'conf/subworkflows/annotate_cadd.config' diff --git a/workflows/oncorefiner.nf b/workflows/oncorefiner.nf index 16f1c74..4308373 100644 --- a/workflows/oncorefiner.nf +++ b/workflows/oncorefiner.nf @@ -144,27 +144,24 @@ workflow ONCOREFINER { RESEARCH_FILTERING(ch_research_filtering_in, [], [], []) - /* + // VEP RESEARCH_FILTERING.out.vcf .map { meta, vcf -> def custom_extra_files = params.custom_extra_files ? 
file(params.custom_extra_files) : [] tuple(meta, vcf, custom_extra_files) } - .set { ch_cadd_snv } - */ + //.set { ch_cadd_snv } + .set {ch_vep_snv} + // ANNOTATE WITH CADD if (params.cadd_resources != null) { - TABIX_RESEARCH_FILTERING (RESEARCH_FILTERING.out.vcf) - - RESEARCH_FILTERING.out.vcf - .join(TABIX_RESEARCH_FILTERING.out.tbi, failOnMismatch:true, failOnDuplicate:true) - .set {ch_cadd_snv} ANNOTATE_CADD ( - ch_cadd_snv, + ch_vep_snv, + //ch_cadd_snv, ch_cadd_header, ch_cadd_resources, ch_cadd_prescored_indels From 8b6b8a04b4cb73188d381d664573055732fe81a6 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Thu, 26 Mar 2026 13:12:13 +0100 Subject: [PATCH 03/23] commit update --- assets/cadd_to_vcf_header.txt | 1 + modules/nf-core/gawk/environment.yml | 7 + modules/nf-core/gawk/main.nf | 60 ++++++ modules/nf-core/gawk/meta.yml | 84 ++++++++ modules/nf-core/gawk/tests/main.nf.test | 211 +++++++++++++++++++ modules/nf-core/gawk/tests/main.nf.test.snap | 199 +++++++++++++++++ modules/nf-core/gawk/tests/nextflow.config | 6 + 7 files changed, 568 insertions(+) create mode 100644 assets/cadd_to_vcf_header.txt create mode 100644 modules/nf-core/gawk/environment.yml create mode 100644 modules/nf-core/gawk/main.nf create mode 100644 modules/nf-core/gawk/meta.yml create mode 100644 modules/nf-core/gawk/tests/main.nf.test create mode 100644 modules/nf-core/gawk/tests/main.nf.test.snap create mode 100644 modules/nf-core/gawk/tests/nextflow.config diff --git a/assets/cadd_to_vcf_header.txt b/assets/cadd_to_vcf_header.txt new file mode 100644 index 0000000..8deee48 --- /dev/null +++ b/assets/cadd_to_vcf_header.txt @@ -0,0 +1 @@ +##INFO= diff --git a/modules/nf-core/gawk/environment.yml b/modules/nf-core/gawk/environment.yml new file mode 100644 index 0000000..185a0f5 --- /dev/null +++ b/modules/nf-core/gawk/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json 
+channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::gawk=5.3.1 diff --git a/modules/nf-core/gawk/main.nf b/modules/nf-core/gawk/main.nf new file mode 100644 index 0000000..33dd24c --- /dev/null +++ b/modules/nf-core/gawk/main.nf @@ -0,0 +1,60 @@ +process GAWK { + tag "$meta.id" + label 'process_single' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/a1/a125c778baf3865331101a104b60d249ee15fe1dca13bdafd888926cc5490a34/data' : + 'community.wave.seqera.io/library/gawk:5.3.1--e09efb5dfc4b8156' }" + + input: + tuple val(meta), path(input, arity: '0..*') + path(program_file) + val(disable_redirect_output) + + output: + tuple val(meta), path("*.${suffix}"), emit: output + tuple val("${task.process}"), val('gawk'), eval("awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//'"), topic: versions, emit: versions_gawk + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' // args is used for the main arguments of the tool + def args2 = task.ext.args2 ?: '' // args2 is used to specify a program when no program file has been given + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.collect{ file -> file.getExtension()}.get(0)}" // use the first extension of the input files + + program = program_file ? "-f ${program_file}" : "${args2}" + lst_gz = input.findResults{ file -> file.getExtension().endsWith("gz") ? file.toString() : null } + unzip = lst_gz ? "gunzip -q -f ${lst_gz.join(" ")}" : "" + input_cmd = input.collect { file -> file.toString() - ~/\.gz$/ }.join(" ") + output_cmd = suffix.endsWith("gz") ? "| gzip > ${prefix}.${suffix}" : "> ${prefix}.${suffix}" + output = disable_redirect_output ? "" : output_cmd + cleanup = lst_gz ? 
"rm ${lst_gz.collect{ file -> file - ~/\.gz$/ }.join(" ")}" : "" + + input.collect{ file -> + assert file.name != "${prefix}.${suffix}" : "Input and output names are the same, set prefix in module configuration to disambiguate!" + } + + """ + ${unzip} + + awk \\ + ${args} \\ + ${program} \\ + ${input_cmd} \\ + ${output} + + ${cleanup} + """ + + stub: + prefix = task.ext.prefix ?: "${meta.id}" + suffix = task.ext.suffix ?: "${input.collect{ file -> file.getExtension()}.get(0)}" + def create_cmd = suffix.endsWith("gz") ? "echo '' | gzip >" : "touch" + + """ + ${create_cmd} ${prefix}.${suffix} + """ +} diff --git a/modules/nf-core/gawk/meta.yml b/modules/nf-core/gawk/meta.yml new file mode 100644 index 0000000..96cd0c7 --- /dev/null +++ b/modules/nf-core/gawk/meta.yml @@ -0,0 +1,84 @@ +name: "gawk" +description: | + If you are like many computer users, you would frequently like to make changes in various text files + wherever certain patterns appear, or extract data from parts of certain lines while discarding the rest. + The job is easy with awk, especially the GNU implementation gawk. +keywords: + - gawk + - awk + - txt + - text + - file parsing +tools: + - "gawk": + description: "GNU awk" + homepage: "https://www.gnu.org/software/gawk/" + documentation: "https://www.gnu.org/software/gawk/manual/" + tool_dev_url: "https://www.gnu.org/prep/ftp.html" + licence: + - "GPL v3" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: The input file - Specify the logic that needs to be executed + on this file on the `ext.args2` or in the program file. If the files + have a `.gz` extension, they will be unzipped using `zcat`. + pattern: "*" + ontologies: [] + - program_file: + type: file + description: Optional file containing logic for awk to execute. If you don't + wish to use a file, you can use `ext.args2` to specify the logic. 
+ pattern: "*" + ontologies: [] + - disable_redirect_output: + type: boolean + description: Disable the redirection of awk output to a given file. This is + useful if you want to use awk's built-in redirect to write files instead + of the shell's redirect. +output: + output: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - "*.${suffix}": + type: file + description: The output file - if using shell redirection, specify the + name of this file using `ext.prefix` and the extension using + `ext.suffix`. Otherwise, ensure the awk program produces files with + the extension in `ext.suffix`. + pattern: "*" + ontologies: [] + versions_gawk: + - - ${task.process}: + type: string + description: The name of the process + - gawk: + type: string + description: The name of the tool + - awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//': + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - gawk: + type: string + description: The name of the tool + - awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//': + type: eval + description: The expression to obtain the version of the tool +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/modules/nf-core/gawk/tests/main.nf.test b/modules/nf-core/gawk/tests/main.nf.test new file mode 100644 index 0000000..3bd0a43 --- /dev/null +++ b/modules/nf-core/gawk/tests/main.nf.test @@ -0,0 +1,211 @@ +nextflow_process { + + name "Test Process GAWK" + script "../main.nf" + process "GAWK" + + tag "modules" + tag "modules_nfcore" + tag "gawk" + + config "./nextflow.config" + + test("Convert fasta to bed") { + when { + params { + gawk_suffix = "bed" + gawk_args2 = '\'BEGIN { FS = OFS = "\t"}; { print \$1, "0", \$2 }\'' + } + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 
'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = [] + input[2] = false + """ + } + } + + then { + assert process.success + assert snapshot(sanitizeOutput(process.out)).match() + } + } + + test("Convert fasta to bed - stub") { + + options "-stub" + + when { + params { + gawk_suffix = "bed" + gawk_args2 = '\'BEGIN { FS = OFS = "\t"}; { print \$1, "0", \$2 }\'' + } + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = [] + input[2] = false + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } + + test("Convert fasta to bed with program file") { + when { + params { + gawk_suffix = "bed" + gawk_args2 = "" + } + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = Channel.of('BEGIN { FS = OFS = "\t"}; { print \$1, "0", \$2 }').collectFile(name:"program.awk") + input[2] = false + """ + } + } + + then { + assert process.success + assert snapshot(sanitizeOutput(process.out)).match() + } + } + + test("Convert fasta to bed using awk redirect instead of shell redirect") { + when { + params { + gawk_suffix = "bed" + gawk_args2 = '\'BEGIN { FS = OFS = "\t"}; { print \$1, "0", \$2 > "test.bed" }\'' + } + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = [] + input[2] = true + """ + } + } + + then { + assert process.success + assert snapshot(sanitizeOutput(process.out)).match() + } + } + + test("Extract first column from multiple files") { + when { + params { + gawk_suffix = "bed" + gawk_args2 = "" + } + process { + """ + input[0] = [ + [ id:'test' ], + [file(params.modules_testdata_base_path + 'generic/txt/hello.txt', 
checkIfExists: true), + file(params.modules_testdata_base_path + 'generic/txt/species_names.txt', checkIfExists: true)] + ] + input[1] = Channel.of('BEGIN {FS=" "}; {print \$1}').collectFile(name:"program.awk") + input[2] = false + """ + } + } + + then { + assert process.success + assert snapshot(sanitizeOutput(process.out)).match() + } + } + + test("Unzip files before processing") { + when { + params { + gawk_suffix = "bed" + gawk_args2 = "" + } + process { + """ + input[0] = [ + [ id:'test' ], + [file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/vcf/NA12878_chrM.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz', checkIfExists: true)] + ] + input[1] = Channel.of('/^#CHROM/ { print \$1, \$10 }').collectFile(name:"column_header.awk") + input[2] = false + """ + } + } + + then { + assert process.success + assert snapshot(sanitizeOutput(process.out)).match() + } + } + + test("Compress after processing") { + when { + params { + gawk_suffix = "txt.gz" + gawk_args2 = '\'BEGIN { FS = OFS = "\t"}; { print \$1, "0", \$2 }\'' + } + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = [] + input[2] = false + """ + } + } + + then { + assert process.success + assert snapshot(sanitizeOutput(process.out)).match() + } + } + + test("Input and output files are similar") { + when { + params { + gawk_suffix = "txt" + gawk_args = "" + gawk_args2 = "" + } + process { + """ + input[0] = [ + [ id:'hello' ], + [file(params.modules_testdata_base_path + 'generic/txt/hello.txt', checkIfExists: true), + file(params.modules_testdata_base_path + 'generic/txt/species_names.txt', checkIfExists: true)] + ] + input[1] = Channel.of('BEGIN {FS=" "}; {print \$1}').collectFile(name:"program.awk") + input[2] = false + """ + } + } + + then { + assert process.failed + assert 
process.errorReport.contains("Input and output names are the same, set prefix in module configuration to disambiguate!") + } + } +} diff --git a/modules/nf-core/gawk/tests/main.nf.test.snap b/modules/nf-core/gawk/tests/main.nf.test.snap new file mode 100644 index 0000000..9d6a369 --- /dev/null +++ b/modules/nf-core/gawk/tests/main.nf.test.snap @@ -0,0 +1,199 @@ +{ + "Compress after processing": { + "content": [ + { + "output": [ + [ + { + "id": "test" + }, + "test.txt.gz:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:31:50.761549948", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Convert fasta to bed": { + "content": [ + { + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:30:50.804933797", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Convert fasta to bed with program file": { + "content": [ + { + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:31:10.838989113", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Convert fasta to bed - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.bed:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ], + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:31:00.182649403", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Extract first column from multiple files": { + "content": [ + { + "output": [ + [ + { + "id": "test" + }, + 
"test.bed:md5,566c51674bd643227bb2d83e0963376d" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:31:30.796772884", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Unzip files before processing": { + "content": [ + { + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,1e31ebd4a060aab5433bbbd9ab24e403" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:31:40.72259289", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Convert fasta to bed using awk redirect instead of shell redirect": { + "content": [ + { + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:31:20.33222004", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/gawk/tests/nextflow.config b/modules/nf-core/gawk/tests/nextflow.config new file mode 100644 index 0000000..895709a --- /dev/null +++ b/modules/nf-core/gawk/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + withName: GAWK { + ext.suffix = params.gawk_suffix + ext.args2 = params.gawk_args2 + } +} From 73774a1227806c313cf28d35a0dee6e66d4240ba Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Thu, 26 Mar 2026 18:03:55 +0100 Subject: [PATCH 04/23] fixing subwf --- assets/cadd_to_vcf_header_-1.0-.txt | 1 - conf/subworkflows/annotate_cadd.config | 35 +++++++++++ conf/test.config | 3 +- modules.json | 5 ++ subworkflows/local/annotate_cadd/main.nf | 78 ++++++++++++++++++------ workflows/oncorefiner.nf | 34 ++++++----- 6 files changed, 119 insertions(+), 37 deletions(-) delete mode 100644 assets/cadd_to_vcf_header_-1.0-.txt diff --git a/assets/cadd_to_vcf_header_-1.0-.txt b/assets/cadd_to_vcf_header_-1.0-.txt deleted file mode 100644 index 8deee48..0000000 --- 
a/assets/cadd_to_vcf_header_-1.0-.txt +++ /dev/null @@ -1 +0,0 @@ -##INFO= diff --git a/conf/subworkflows/annotate_cadd.config b/conf/subworkflows/annotate_cadd.config index 635799a..1214262 100644 --- a/conf/subworkflows/annotate_cadd.config +++ b/conf/subworkflows/annotate_cadd.config @@ -5,5 +5,40 @@ Annotate with CADD process { + withName: '.*:ANNOTATE_CADD:.*' { + publishDir = [ + enabled: false + ] + } + withName: '.*:ANNOTATE_CADD:BCFTOOLS_VIEW' { + ext.args = { "--output-type z --types indels,other" } + ext.prefix = { "${vcf.simpleName}_indels" } + } + + withName: '.*:ANNOTATE_CADD:CADD' { + ext.args = { "-g ${params.genome}" } + ext.prefix = { "${vcf.simpleName}_cadd" } + } + + withName: '.*:ANNOTATE_CADD:TABIX_CADD' { + ext.args = { "--force --sequence 1 --begin 2 --end 2" } + } + + withName: '.*:ANNOTATE_CADD:CADD_TO_REFERENCE_CHRNAMES' { + ext.args2 = '\'{original=$1; sub("chr","",$1); print $1, original}\'' + ext.prefix = "cadd_to_reference" + ext.suffix = "txt" + } + + withName: '.*:ANNOTATE_CADD:REFERENCE_TO_CADD_CHRNAMES' { + ext.args2 = '\'{original=$1; sub("chr","",$1); print original, $1}\'' + ext.prefix = "reference_to_cadd" + ext.suffix = "txt" + } + + withName: '.*:ANNOTATE_CADD:ANNOTATE_INDELS' { + ext.args = { "--columns Chrom,Pos,Ref,Alt,-,CADD --output-type z --write-index=tbi" } + ext.prefix = { "${input.simpleName}_ann" } + } } diff --git a/conf/test.config b/conf/test.config index 4a750f4..d7bf962 100644 --- a/conf/test.config +++ b/conf/test.config @@ -47,7 +47,8 @@ params { svdb_query_dbs = params.pipelines_testdata_base_path + 'reference/svdb_querydb_files.csv' // Mock input for CADD - cadd_resources = '../test-datasets' + cadd_resources = '../test-datasets/reference' + cadd_prescored_indels = '../test-datasets/' //cadd_resources = params.pipelines_testdata_base_path + "assets" //TODO add //cadd_prescored_indels = params.pipelines_testdata_base_path + "docs" //TODO add diff --git a/modules.json b/modules.json index ab7e9cb..9632cda 
100644 --- a/modules.json +++ b/modules.json @@ -35,6 +35,11 @@ "git_sha": "34505e1fc5e9f4fd641210ca440acff6bd33b842", "installed_by": ["modules"] }, + "gawk": { + "branch": "master", + "git_sha": "c0da8f3a26835d663873001382a708f75766fec6", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "2c73cc8fa92cf48de3da0b643fdf357a8a290b36", diff --git a/subworkflows/local/annotate_cadd/main.nf b/subworkflows/local/annotate_cadd/main.nf index a3097b9..98da106 100644 --- a/subworkflows/local/annotate_cadd/main.nf +++ b/subworkflows/local/annotate_cadd/main.nf @@ -2,8 +2,12 @@ // A subworkflow to annotate cadd // -include { BCFTOOLS_ANNOTATE } from '../../../modules/nf-core/bcftools/annotate/main' +include { BCFTOOLS_ANNOTATE as RENAME_CHR_CADD } from '../../../modules/nf-core/bcftools/annotate/main' +include { BCFTOOLS_ANNOTATE as ANNOTATE_INDELS } from '../../../modules/nf-core/bcftools/annotate/main' +include { BCFTOOLS_VIEW } from '../../../modules/nf-core/bcftools/view/main' include { CADD } from '../../../modules/nf-core/cadd/main' +include { GAWK as REFERENCE_TO_CADD_CHRNAMES } from '../../../modules/nf-core/gawk/main' +include { GAWK as CADD_TO_REFERENCE_CHRNAMES } from '../../../modules/nf-core/gawk/main' include { TABIX_TABIX as TABIX_CADD } from '../../../modules/nf-core/tabix/tabix/main' include { TABIX_TABIX as TABIX_ANNOTATE } from '../../../modules/nf-core/tabix/tabix/main' @@ -11,34 +15,68 @@ include { TABIX_TABIX as TABIX_ANNOTATE } from '../../../modules/nf-c workflow ANNOTATE_CADD { take: - ch_snv_vcf // channel: [mandatory] [ val(meta), path(vcfs), path(idx) ] - ch_cadd_header // channel: [mandatory] [ path(txt) ] - ch_cadd_resources // channel: [mandatory] [ path(dir) ] - ch_cadd_prescored_indels // channel: [mandatory] [ val(meta), path(dir) ] + ch_vcf // channel: [mandatory] [ val(meta), path(vcf), path(idx) ] + val_genome // string: [mandatory] GRCh37 or GRCh38 + ch_fai // channel: [mandatory] [ val(meta), path(fai) ] + 
ch_header // channel: [mandatory] [ path(txt) ] + ch_cadd_resources // channel: [mandatory] [ val(meta), path(dir) ] + ch_cadd_prescored_indels // channel: [mandatory] [ val(meta), path(dir) ] main: - ch_versions = channel.empty() - CADD(ch_snv_vcf, ch_cadd_resources, ch_cadd_prescored_indels) + ch_rename_chrs_ref = [] + // Create files and rename chromosomes if reference is GRCh38 + if (val_genome.equals('GRCh38')) { // TODO change to 38 + + // Create txt files for changing of chromosomes + REFERENCE_TO_CADD_CHRNAMES ( ch_fai , [], false ) + + REFERENCE_TO_CADD_CHRNAMES.out.output.map { _meta, txt -> txt } + .set {ch_chrnames_cadd} + + CADD_TO_REFERENCE_CHRNAMES ( ch_fai , [], false ) + + CADD_TO_REFERENCE_CHRNAMES.out.output.map { _meta, txt -> txt } + .set { ch_rename_chrs_ref } + + ch_vcf + .combine(ch_chrnames_cadd) + .map { meta, vcf, tbi, txt -> tuple( meta, vcf, tbi, [], [], [], [], txt ) } + .set {rename_chrnames_in} + + // Change chr names to CADD compatible names + RENAME_CHR_CADD( rename_chrnames_in ) + + RENAME_CHR_CADD.out.vcf + .map {meta, vcf -> tuple( meta , vcf, [] )} + .set { ch_vcf } + } + + // Filter to extract indels + BCFTOOLS_VIEW(ch_vcf, [], [], []) + + // CADD + CADD(BCFTOOLS_VIEW.out.vcf, ch_cadd_resources, ch_cadd_prescored_indels) + + // Index CADD TABIX_CADD(CADD.out.tsv) - ch_snv_vcf - .join(CADD.out.tsv) - .join(TABIX_CADD.out.tbi) - .set { ch_annotate_in } + // Change chr names back to desired naming and annotate original vcf with cadd results + ch_vcf + .join(CADD.out.tsv, failOnMismatch: true, failOnDuplicate: true) + .join(TABIX_CADD.out.index, failOnMismatch: true, failOnDuplicate: true) + .combine( ch_header ) + .combine( ch_rename_chrs_ref ) + .map { meta, vcf, tbi, annotations, annotations_index, header, txt -> tuple( meta, vcf, [], annotations, annotations_index, [], header, txt ) } + .set { ch_annotate } - BCFTOOLS_ANNOTATE(ch_annotate_in, ch_cadd_header ) - TABIX_ANNOTATE (BCFTOOLS_ANNOTATE.out.vcf) + ANNOTATE_INDELS( 
ch_annotate ) - ch_versions = ch_versions.mix(CADD.out.versions.first()) - ch_versions = ch_versions.mix(TABIX_CADD.out.versions.first()) - ch_versions = ch_versions.mix(BCFTOOLS_ANNOTATE.out.versions.first()) - ch_versions = ch_versions.mix(TABIX_ANNOTATE.out.versions.first()) + ANNOTATE_INDELS.out.vcf.view() //TODO fix emit: - vcf = BCFTOOLS_ANNOTATE.out.vcf // channel: [ val(meta), path(vcf) ] - tbi = TABIX_ANNOTATE.out.tbi - versions = ch_versions + vcf = ANNOTATE_INDELS.out.vcf // channel: [ val(meta), path(vcf) ] + tbi = ANNOTATE_INDELS.out.tbi // channel: [ val(meta), path(tbi) ] } diff --git a/workflows/oncorefiner.nf b/workflows/oncorefiner.nf index 691554f..aa14054 100644 --- a/workflows/oncorefiner.nf +++ b/workflows/oncorefiner.nf @@ -60,6 +60,7 @@ workflow ONCOREFINER { // Reference files ch_genome_fasta = channel.fromPath(params.fasta).map { it -> [[id:it.simpleName], it] }.collect() + ch_genome_fai = channel.fromPath(params.fai).map {it -> [[id:it.simpleName], it] }.collect() // File channels for PREPARE_REFERENCES ch_vep_cache_unprocessed = params.vep_cache ? channel.fromPath(params.vep_cache).map { it -> [[id:'vep_cache'], it] }.collect() @@ -71,14 +72,14 @@ workflow ONCOREFINER { .set { ch_references } // Gather or get from params - ch_vep_cache = ( params.vep_cache && params.vep_cache.endsWith("tar.gz") ) ? ch_references.vep_resources + ch_vep_cache = ( params.vep_cache && params.vep_cache.endsWith("tar.gz") ) ? ch_references.vep_resources : ( params.vep_cache ? channel.fromPath(params.vep_cache).collect() : channel.value([]) ) - ch_cadd_header = Channel.fromPath("$projectDir/assets/cadd_to_vcf_header_-1.0-.txt", checkIfExists: true).collect() - ch_cadd_resources = params.cadd_resources ? 
Channel.fromPath(params.cadd_resources).collect() - : Channel.value([]) - ch_cadd_prescored_indels = createReferenceChannelFromPath(params.cadd_prescored_indels) // align with above - + ch_cadd_header = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() + ch_cadd_resources = params.cadd_resources ? channel.fromPath(params.cadd_resources).map { it -> [[id:'cadd_resources'], it] }.collect() + : channel.value([]) + ch_cadd_prescored_indels = params.cadd_prescored_indels ? channel.fromPath(params.cadd_prescored_indels).map { it -> [[id:'cadd_prescored_indels'], it] }.collect() + : channel.value([]) // // Read and store paths in the vep_plugin_files file @@ -143,9 +144,7 @@ workflow ONCOREFINER { RESEARCH_FILTERING(ch_research_filtering_in, [], [], []) - - - // VEP + // TODO remove or move down - not used if cadd output is input to vep RESEARCH_FILTERING.out.vcf .map { meta, vcf -> tuple(meta, vcf, []) @@ -153,23 +152,28 @@ workflow ONCOREFINER { //.set { ch_cadd_snv } .set {ch_vep_snv} + // ANNOTATE WITH CADD - currently depends on resources - could be variable instead (ref optional wf refinement)? 
+ if (params.cadd_resources != null) { + TABIX_RESEARCH_FILTERING(RESEARCH_FILTERING.out.vcf) - // ANNOTATE WITH CADD - if (params.cadd_resources != null) { + RESEARCH_FILTERING.out.vcf + .join(TABIX_RESEARCH_FILTERING.out.index, failOnMismatch:true, failOnDuplicate:true) + .set{ ch_cadd_in } ANNOTATE_CADD ( - ch_vep_snv, - //ch_cadd_snv, + ch_cadd_in, + params.genome, + ch_genome_fai, ch_cadd_header, ch_cadd_resources, ch_cadd_prescored_indels ) - ch_vep_snv = ANNOTATE_CADD.out.vcf - ch_versions = ch_versions.mix(ANNOTATE_CADD.out.versions) + //ch_vep_snv = ANNOTATE_CADD.out.vcf } + // VEP ENSEMBLVEP_SNV ( ch_vep_snv, params.genome, From eafa726bafbc62d9a915b913499f219eee36cc10 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Fri, 27 Mar 2026 16:17:10 +0100 Subject: [PATCH 05/23] update subwf and test --- conf/subworkflows/annotate_cadd.config | 5 ++ subworkflows/local/annotate_cadd/main.nf | 8 +-- .../local/annotate_cadd/tests/main.nf.test | 67 ++++++++++++++++++ .../annotate_cadd/tests/main.nf.test.snap | 68 +++++++++++++++++++ .../local/annotate_cadd/tests/nextflow.config | 40 +++++++++++ tests/nextflow.config | 2 +- workflows/oncorefiner.nf | 11 +-- 7 files changed, 191 insertions(+), 10 deletions(-) create mode 100644 subworkflows/local/annotate_cadd/tests/main.nf.test create mode 100644 subworkflows/local/annotate_cadd/tests/main.nf.test.snap create mode 100644 subworkflows/local/annotate_cadd/tests/nextflow.config diff --git a/conf/subworkflows/annotate_cadd.config b/conf/subworkflows/annotate_cadd.config index 1214262..55bcf0d 100644 --- a/conf/subworkflows/annotate_cadd.config +++ b/conf/subworkflows/annotate_cadd.config @@ -11,6 +11,11 @@ process { ] } + withName: 'RENAME_CHR_CADD' { + ext.args = { "--output-type z" } + ext.prefix = { "${vcf.simpleName}_indels" } + } + withName: '.*:ANNOTATE_CADD:BCFTOOLS_VIEW' { ext.args = { "--output-type z --types indels,other" } ext.prefix = { "${vcf.simpleName}_indels" } diff --git 
a/subworkflows/local/annotate_cadd/main.nf b/subworkflows/local/annotate_cadd/main.nf index 98da106..286dc89 100644 --- a/subworkflows/local/annotate_cadd/main.nf +++ b/subworkflows/local/annotate_cadd/main.nf @@ -3,7 +3,7 @@ // include { BCFTOOLS_ANNOTATE as RENAME_CHR_CADD } from '../../../modules/nf-core/bcftools/annotate/main' -include { BCFTOOLS_ANNOTATE as ANNOTATE_INDELS } from '../../../modules/nf-core/bcftools/annotate/main' +include { BCFTOOLS_ANNOTATE as ANNOTATE_INDELS } from '../../../modules/nf-core/bcftools/annotate/main' include { BCFTOOLS_VIEW } from '../../../modules/nf-core/bcftools/view/main' include { CADD } from '../../../modules/nf-core/cadd/main' include { GAWK as REFERENCE_TO_CADD_CHRNAMES } from '../../../modules/nf-core/gawk/main' @@ -24,10 +24,10 @@ workflow ANNOTATE_CADD { main: - ch_rename_chrs_ref = [] + ch_rename_chrs_ref = channel.value([[]]) // Create files and rename chromosomes if reference is GRCh38 - if (val_genome.equals('GRCh38')) { // TODO change to 38 + if (val_genome.equals('GRCh38')) { // Create txt files for changing of chromosomes REFERENCE_TO_CADD_CHRNAMES ( ch_fai , [], false ) @@ -74,8 +74,6 @@ workflow ANNOTATE_CADD { ANNOTATE_INDELS( ch_annotate ) - ANNOTATE_INDELS.out.vcf.view() //TODO fix - emit: vcf = ANNOTATE_INDELS.out.vcf // channel: [ val(meta), path(vcf) ] tbi = ANNOTATE_INDELS.out.tbi // channel: [ val(meta), path(tbi) ] diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test b/subworkflows/local/annotate_cadd/tests/main.nf.test new file mode 100644 index 0000000..e01f85c --- /dev/null +++ b/subworkflows/local/annotate_cadd/tests/main.nf.test @@ -0,0 +1,67 @@ +nextflow_workflow { + + name "Test Workflow ANNOTATE_CADD" + script "subworkflows/local/annotate_cadd/main.nf" + workflow "ANNOTATE_CADD" + tag "subworkflows" + tag "annotate_cadd" + config "./nextflow.config" + + test("ANNOTATE_CADD - GRCh37, stub") { + + options "-stub" + + when { + params { + genome = "GRCh37" + } + workflow { + """ + 
input[0] = channel.of([ [id:'test'], file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz'), file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz.tbi') ]).collect() + input[1] = 'GRCh37' + input[2] = channel.of([ [id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true)]).collect() + input[3] = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() + input[4] = channel.from("\$PWD").map { dir -> [ [ id: 'cadd_resources' ], dir ] } + input[5] = channel.from("\$PWD").map { dir -> [ [ id: 'cadd_prescored_indels' ], dir ] } + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + } + + test("ANNOTATE_CADD - GRCh38, stub") { + + options "-stub" + + when { + params { + genome = "GRCh38" + } + workflow { + """ + input[0] = channel.of([ [id:'test'], file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz'), file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz.tbi') ]).collect() + input[1] = 'GRCh38' + input[2] = channel.fromPath( params.pipelines_testdata_base_path + 'reference/reference.fasta.fai' ).map { it -> [[id:it.simpleName], it] }.collect() + input[3] = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() + input[4] = channel.from("\$PWD").map { dir -> [ [ id: 'cadd_resources' ], dir ] } + input[5] = channel.from("\$PWD").map { dir -> [ [ id: 'cadd_prescored_indels' ], dir ] } + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out).match() } + ) + } + } + +} diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test.snap b/subworkflows/local/annotate_cadd/tests/main.nf.test.snap new file mode 100644 index 0000000..301b1a4 --- /dev/null +++ 
b/subworkflows/local/annotate_cadd/tests/main.nf.test.snap @@ -0,0 +1,68 @@ +{ + "ANNOTATE_CADD - GRCh37, stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "SNV_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "SNV_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tbi": [ + [ + { + "id": "test" + }, + "SNV_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "vcf": [ + [ + { + "id": "test" + }, + "SNV_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-27T16:04:52.225642" + }, + "ANNOTATE_CADD - GRCh38, stub": { + "content": [ + { + "0": [ + + ], + "1": [ + + ], + "tbi": [ + + ], + "vcf": [ + + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-03-27T15:48:16.48239" + } +} \ No newline at end of file diff --git a/subworkflows/local/annotate_cadd/tests/nextflow.config b/subworkflows/local/annotate_cadd/tests/nextflow.config new file mode 100644 index 0000000..3bdabb5 --- /dev/null +++ b/subworkflows/local/annotate_cadd/tests/nextflow.config @@ -0,0 +1,40 @@ +process { + + withName: 'BCFTOOLS_VIEW' { + ext.args = { "--output-type z --types indels,other" } + ext.prefix = { "${vcf.simpleName}_indels" } + } + + withName: 'CADD' { + container = "nf-core/ubuntu:22.04" //Using a basic container because v1.7.3 is too big for CI.
+ ext.args = { "-g ${params.genome}" } + ext.prefix = { "${vcf.simpleName}_cadd" } + } + + withName: 'TABIX_CADD' { + ext.args = { "--force --sequence 1 --begin 2 --end 2" } + } + + withName: 'CADD_TO_REFERENCE_CHRNAMES' { + ext.args2 = '\'{original=$1; sub("chr","",$1); print $1, original}\'' + ext.prefix = "cadd_to_reference" + ext.suffix = "txt" + } + + withName: 'REFERENCE_TO_CADD_CHRNAMES' { + ext.args2 = '\'{original=$1; sub("chr","",$1); print original, $1}\'' + ext.prefix = "reference_to_cadd" + ext.suffix = "txt" + } + + withName: 'ANNOTATE_INDELS' { + ext.args = { "--columns Chrom,Pos,Ref,Alt,-,CADD --output-type z --write-index=tbi" } + ext.prefix = { "${input.simpleName}_ann" } + } + + withName: 'RENAME_CHR_CADD' { + ext.args = { "--output-type z" } + ext.prefix = { "${vcf.simpleName}_indels" } + } + +} diff --git a/tests/nextflow.config b/tests/nextflow.config index b705059..03c6297 100644 --- a/tests/nextflow.config +++ b/tests/nextflow.config @@ -12,7 +12,7 @@ params { config_profile_name = 'Test profile' config_profile_description = 'Minimal test dataset to check pipeline function' modules_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/modules/data/' - pipelines_testdata_base_path = 'https://raw.githubusercontent.com/Clinical-Genomics/test-datasets/tree/1184e1c31b5e47055e3580c7e0f65240a1c005d0/' + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/Clinical-Genomics/test-datasets/1184e1c31b5e47055e3580c7e0f65240a1c005d0/' } diff --git a/workflows/oncorefiner.nf b/workflows/oncorefiner.nf index aa14054..210a294 100644 --- a/workflows/oncorefiner.nf +++ b/workflows/oncorefiner.nf @@ -144,12 +144,10 @@ workflow ONCOREFINER { RESEARCH_FILTERING(ch_research_filtering_in, [], [], []) - // TODO remove or move down - not used if cadd output is input to vep RESEARCH_FILTERING.out.vcf .map { meta, vcf -> tuple(meta, vcf, []) } - //.set { ch_cadd_snv } .set {ch_vep_snv} // ANNOTATE WITH CADD - currently depends on 
resources - could be variable instead (ref optional wf refinement)? @@ -163,16 +161,21 @@ workflow ONCOREFINER { ANNOTATE_CADD ( ch_cadd_in, - params.genome, + params.genome, //TODO pull dev and change to val_genome ch_genome_fai, ch_cadd_header, ch_cadd_resources, ch_cadd_prescored_indels ) - //ch_vep_snv = ANNOTATE_CADD.out.vcf + ch_cadd_snv = ANNOTATE_CADD.out.vcf } + ch_cadd_snv // Q: is it better to make this channel in the annotate cadd subwf? + .join(ANNOTATE_CADD.out.tbi) + .map { meta, vcf, tbi -> tuple(meta, vcf, tbi) } + .set { ch_vep_snv } + // VEP ENSEMBLVEP_SNV ( ch_vep_snv, From 1448d888b1063e0cd5f399c09c5ac1e2d7733d51 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Fri, 27 Mar 2026 16:32:19 +0100 Subject: [PATCH 06/23] fixing inputs --- workflows/oncorefiner.nf | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/workflows/oncorefiner.nf b/workflows/oncorefiner.nf index fa2aa29..06bbed1 100644 --- a/workflows/oncorefiner.nf +++ b/workflows/oncorefiner.nf @@ -63,6 +63,13 @@ workflow ONCOREFINER { ch_genome_fasta = channel.fromPath(params.fasta).map { it -> [[id:it.simpleName], it] }.collect() ch_genome_fai = channel.fromPath(params.fai).map {it -> [[id:it.simpleName], it] }.collect() + ch_cadd_header = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() + ch_cadd_resources = params.cadd_resources ? channel.fromPath(params.cadd_resources).map { it -> [[id:'cadd_resources'], it] }.collect() + : channel.value([]) + ch_cadd_prescored_indels = params.cadd_prescored_indels ? 
channel.fromPath(params.cadd_prescored_indels).map { it -> [[id:'cadd_prescored_indels'], it] }.collect() + : channel.value([]) + + // // Read and store paths in the vep_plugin_files file // From 175639a1e47e81e8f2a8f4d7b335fac79118bd04 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Mon, 30 Mar 2026 14:00:13 +0200 Subject: [PATCH 07/23] update test --- conf/subworkflows/annotate_cadd.config | 4 +-- subworkflows/local/annotate_cadd/main.nf | 1 - .../local/annotate_cadd/tests/main.nf.test | 18 +++++++---- .../annotate_cadd/tests/main.nf.test.snap | 30 +++++++++++++++---- .../local/annotate_cadd/tests/nextflow.config | 10 +++---- 5 files changed, 45 insertions(+), 18 deletions(-) diff --git a/conf/subworkflows/annotate_cadd.config b/conf/subworkflows/annotate_cadd.config index 55bcf0d..0ce8b02 100644 --- a/conf/subworkflows/annotate_cadd.config +++ b/conf/subworkflows/annotate_cadd.config @@ -11,9 +11,9 @@ process { ] } - withName: 'RENAME_CHR_CADD' { + withName: '.*:ANNOTATE_CADD:RENAME_CHR_CADD' { ext.args = { "--output-type z" } - ext.prefix = { "${vcf.simpleName}_indels" } + ext.prefix = { "${input.simpleName}_indels" } } withName: '.*:ANNOTATE_CADD:BCFTOOLS_VIEW' { diff --git a/subworkflows/local/annotate_cadd/main.nf b/subworkflows/local/annotate_cadd/main.nf index 286dc89..287046e 100644 --- a/subworkflows/local/annotate_cadd/main.nf +++ b/subworkflows/local/annotate_cadd/main.nf @@ -9,7 +9,6 @@ include { CADD } from '../../../modules/nf-c include { GAWK as REFERENCE_TO_CADD_CHRNAMES } from '../../../modules/nf-core/gawk/main' include { GAWK as CADD_TO_REFERENCE_CHRNAMES } from '../../../modules/nf-core/gawk/main' include { TABIX_TABIX as TABIX_CADD } from '../../../modules/nf-core/tabix/tabix/main' -include { TABIX_TABIX as TABIX_ANNOTATE } from '../../../modules/nf-core/tabix/tabix/main' workflow ANNOTATE_CADD { diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test b/subworkflows/local/annotate_cadd/tests/main.nf.test index e01f85c..9023868 
100644 --- a/subworkflows/local/annotate_cadd/tests/main.nf.test +++ b/subworkflows/local/annotate_cadd/tests/main.nf.test @@ -1,7 +1,7 @@ nextflow_workflow { - name "Test Workflow ANNOTATE_CADD" - script "subworkflows/local/annotate_cadd/main.nf" + name "Test Subworkflow ANNOTATE_CADD" + script "../main.nf" workflow "ANNOTATE_CADD" tag "subworkflows" tag "annotate_cadd" @@ -17,7 +17,11 @@ nextflow_workflow { } workflow { """ - input[0] = channel.of([ [id:'test'], file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz'), file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz.tbi') ]).collect() + input[0] = channel.of([ + [id:'test'], + file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz', checkIfExists: true), + file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz.tbi', checkIfExists: true) + ]) input[1] = 'GRCh37' input[2] = channel.of([ [id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true)]).collect() input[3] = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() @@ -45,9 +49,13 @@ nextflow_workflow { } workflow { """ - input[0] = channel.of([ [id:'test'], file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz'), file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz.tbi') ]).collect() + input[0] = channel.of([ + [id:'test'], + file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz', checkIfExists: true), + file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz.tbi', checkIfExists: true) + ]) input[1] = 'GRCh38' - input[2] = channel.fromPath( params.pipelines_testdata_base_path + 'reference/reference.fasta.fai' ).map { it -> [[id:it.simpleName], it] }.collect() + input[2] = 
channel.fromPath(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true).map {it -> [[id:it.simpleName], it] }.collect() input[3] = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() input[4] = channel.from("\$PWD").map { dir -> [ [ id: 'cadd_resources' ], dir ] } input[5] = channel.from("\$PWD").map { dir -> [ [ id: 'cadd_prescored_indels' ], dir ] } diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test.snap b/subworkflows/local/annotate_cadd/tests/main.nf.test.snap index 301b1a4..f05c43f 100644 --- a/subworkflows/local/annotate_cadd/tests/main.nf.test.snap +++ b/subworkflows/local/annotate_cadd/tests/main.nf.test.snap @@ -46,16 +46,36 @@ "content": [ { "0": [ - + [ + { + "id": "test" + }, + "SNV_indels_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] ], "1": [ - + [ + { + "id": "test" + }, + "SNV_indels_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] ], "tbi": [ - + [ + { + "id": "test" + }, + "SNV_indels_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] ], "vcf": [ - + [ + { + "id": "test" + }, + "SNV_indels_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] ] } ], @@ -63,6 +83,6 @@ "nf-test": "0.9.3", "nextflow": "25.10.4" }, - "timestamp": "2026-03-27T15:48:16.48239" + "timestamp": "2026-03-30T13:58:31.285282" } } \ No newline at end of file diff --git a/subworkflows/local/annotate_cadd/tests/nextflow.config b/subworkflows/local/annotate_cadd/tests/nextflow.config index 3bdabb5..8dfa90f 100644 --- a/subworkflows/local/annotate_cadd/tests/nextflow.config +++ b/subworkflows/local/annotate_cadd/tests/nextflow.config @@ -1,5 +1,10 @@ process { + withName: 'RENAME_CHR_CADD' { + ext.args = { "--output-type z" } + ext.prefix = { "${input.simpleName}_indels" } + } + withName: 'BCFTOOLS_VIEW' { ext.args = { "--output-type z --types indels,other" } ext.prefix = { "${vcf.simpleName}_indels" } @@ -32,9 +37,4 @@ process { ext.prefix = { "${input.simpleName}_ann" } 
} - withName: 'RENAME_CHR_CADD' { - ext.args = { "--output-type z" } - ext.prefix = { "${vcf.simpleName}_indels" } - } - } From 728f2b916c9fb4cfab5b89108556958e57466b1b Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Mon, 30 Mar 2026 16:48:29 +0200 Subject: [PATCH 08/23] fix so default test does not run cadd --- conf/subworkflows/annotate_cadd.config | 1 + conf/test.config | 6 +----- main.nf | 3 ++- nextflow.config | 4 ++-- tests/default.nf.test | 1 + workflows/oncorefiner.nf | 18 ++++++++---------- 6 files changed, 15 insertions(+), 18 deletions(-) diff --git a/conf/subworkflows/annotate_cadd.config b/conf/subworkflows/annotate_cadd.config index 0ce8b02..3e2e16a 100644 --- a/conf/subworkflows/annotate_cadd.config +++ b/conf/subworkflows/annotate_cadd.config @@ -9,6 +9,7 @@ process { publishDir = [ enabled: false ] + //ext.when = { ( !(workflow.profile.tokenize(',').intersect(['test', 'test_full']).size() >= 1) || workflow.stubRun) } } withName: '.*:ANNOTATE_CADD:RENAME_CHR_CADD' { diff --git a/conf/test.config b/conf/test.config index d7bf962..27bbb59 100644 --- a/conf/test.config +++ b/conf/test.config @@ -46,10 +46,6 @@ params { svdb_query_dbs = params.pipelines_testdata_base_path + 'reference/svdb_querydb_files.csv' - // Mock input for CADD - cadd_resources = '../test-datasets/reference' - cadd_prescored_indels = '../test-datasets/' - //cadd_resources = params.pipelines_testdata_base_path + "assets" //TODO add - //cadd_prescored_indels = params.pipelines_testdata_base_path + "docs" //TODO add + // TODO make mock input for CADD } diff --git a/main.nf b/main.nf index 178461a..d47a5ba 100644 --- a/main.nf +++ b/main.nf @@ -78,10 +78,10 @@ workflow CLINICALGENOMICS_ONCOREFINER { ch_cadd_header = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() ch_cadd_resources = val_cadd_resources ? 
channel.fromPath(val_cadd_resources).map { it -> [[id:'cadd_resources'], it] }.collect() : channel.value([]) + ch_cadd_prescored_indels = val_cadd_prescored_indels ? channel.fromPath(val_cadd_prescored_indels).map { it -> [[id:'cadd_prescored_indels'], it] }.collect() : channel.value([]) - // Input for VEP ch_vep_extra_files_unsplit = val_vep_plugin_files ? channel.fromPath(val_vep_plugin_files).collect() : channel.value([]) if (val_vep_plugin_files) { @@ -132,6 +132,7 @@ workflow CLINICALGENOMICS_ONCOREFINER { ch_vcfanno_toml, PREPARE_REFERENCES.out.vep_resources, ch_vep_extra_files, + val_cadd_resources, val_genome, val_species, val_vep_cache_version diff --git a/nextflow.config b/nextflow.config index 27454f0..3876482 100644 --- a/nextflow.config +++ b/nextflow.config @@ -19,8 +19,8 @@ params { sv_vcf = null // CADD - cadd_resources = null - cadd_prescored_indels = null + cadd_resources = null + cadd_prescored_indels = null // Vep diff --git a/tests/default.nf.test b/tests/default.nf.test index dc67666..8af67e6 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -9,6 +9,7 @@ nextflow_pipeline { when { params { outdir = "$outputDir" + //skip_tools = ".*:ANNOTATE_CADD:.*" } } diff --git a/workflows/oncorefiner.nf b/workflows/oncorefiner.nf index 0e9cd60..c854f55 100644 --- a/workflows/oncorefiner.nf +++ b/workflows/oncorefiner.nf @@ -62,6 +62,7 @@ workflow ONCOREFINER { ch_vcfanno_toml // channel: [optional] [path(toml_file)] ch_vep_cache // channel: [optional] [vep_cache_files] ch_vep_extra_files // channel: [optional] [path(plugin_file1), path(plugin_file2), ...] + val_cadd_resources // string: [optional] path to CADD resources directory val_genome // string: [optional] genome assembly (e.g. "GRCh38") val_species // string: [optional] species (e.g. "homo_sapiens") val_vep_cache_version // string: [optional] version of vep cache to use (e.g. 
"107") @@ -107,13 +108,14 @@ workflow ONCOREFINER { } .set {ch_vep_snv} - // ANNOTATE WITH CADD - currently depends on resources - could be variable instead (ref optional wf refinement)? - if (params.cadd_resources != null) { + // ANNOTATE WITH CADD - currently depends on val_cadd_resources - could be improved? + if (val_cadd_resources) { - TABIX_RESEARCH_FILTERING(RESEARCH_FILTERING.out.vcf) + TABIX_RESEARCH_FILTERING(RESEARCH_FILTERING.out.vcf) //CADD needs tabix index RESEARCH_FILTERING.out.vcf .join(TABIX_RESEARCH_FILTERING.out.index, failOnMismatch:true, failOnDuplicate:true) + .view() .set{ ch_cadd_in } ANNOTATE_CADD ( @@ -124,15 +126,11 @@ workflow ONCOREFINER { ch_cadd_resources, ch_cadd_prescored_indels ) - ch_cadd_snv = ANNOTATE_CADD.out.vcf - + ANNOTATE_CADD.out.vcf + .join(ANNOTATE_CADD.out.tbi) + .set { ch_vep_snv } } - ch_cadd_snv // Q: is it better to make this channel in the annotate cadd subwf? - .join(ANNOTATE_CADD.out.tbi) - .map { meta, vcf, tbi -> tuple(meta, vcf, tbi) } - .set { ch_vep_snv } - // VEP ENSEMBLVEP_SNV ( ch_vep_snv, From 8e0d78574e5ab9ac2b1193d8163b76646fb7dfb0 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Mon, 30 Mar 2026 17:05:57 +0200 Subject: [PATCH 09/23] changelog and small fixes --- CHANGELOG.md | 1 + conf/subworkflows/annotate_cadd.config | 1 - conf/test.config | 3 +-- subworkflows/local/annotate_cadd/tests/main.nf.test | 2 +- tests/default.nf.test | 1 - workflows/oncorefiner.nf | 6 +++--- 6 files changed, 6 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2defb47..4e4d6b9 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -19,6 +19,7 @@ Initial release of Clinical-Genomics/oncorefiner, created with the [nf-core](htt - Added parameters documentation [#25](https://github.com/Clinical-Genomics/oncorefiner/pull/25) - Added pre-commit hook for automatic generation of parameters documentation [#25](https://github.com/Clinical-Genomics/oncorefiner/pull/25) - Added Nextflow strict syntax 
compatibility [#30](https://github.com/Clinical-Genomics/oncorefiner/pull/30) +- Added CADD scoring for InDels, and a test for the subworkflow [#59](https://github.com/Clinical-Genomics/oncorefiner/pull/59) ### Changed diff --git a/conf/subworkflows/annotate_cadd.config b/conf/subworkflows/annotate_cadd.config index 3e2e16a..0ce8b02 100644 --- a/conf/subworkflows/annotate_cadd.config +++ b/conf/subworkflows/annotate_cadd.config @@ -9,7 +9,6 @@ process { publishDir = [ enabled: false ] - //ext.when = { ( !(workflow.profile.tokenize(',').intersect(['test', 'test_full']).size() >= 1) || workflow.stubRun) } } withName: '.*:ANNOTATE_CADD:RENAME_CHR_CADD' { diff --git a/conf/test.config b/conf/test.config index 27bbb59..d89964c 100644 --- a/conf/test.config +++ b/conf/test.config @@ -46,6 +46,5 @@ params { svdb_query_dbs = params.pipelines_testdata_base_path + 'reference/svdb_querydb_files.csv' - // TODO make mock input for CADD - + // TODO make/insert mock input for CADD } diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test b/subworkflows/local/annotate_cadd/tests/main.nf.test index 9023868..6d57147 100644 --- a/subworkflows/local/annotate_cadd/tests/main.nf.test +++ b/subworkflows/local/annotate_cadd/tests/main.nf.test @@ -23,7 +23,7 @@ nextflow_workflow { file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz.tbi', checkIfExists: true) ]) input[1] = 'GRCh37' - input[2] = channel.of([ [id:'genome'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true)]).collect() + input[2] = channel.fromPath(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true).map {it -> [[id:it.simpleName], it] }.collect() input[3] = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() input[4] = channel.from("\$PWD").map { dir -> [ [ id: 'cadd_resources' ], dir ] } input[5] = channel.from("\$PWD").map { dir -> [ [ id: 
'cadd_prescored_indels' ], dir ] } diff --git a/tests/default.nf.test b/tests/default.nf.test index 8af67e6..dc67666 100644 --- a/tests/default.nf.test +++ b/tests/default.nf.test @@ -9,7 +9,6 @@ nextflow_pipeline { when { params { outdir = "$outputDir" - //skip_tools = ".*:ANNOTATE_CADD:.*" } } diff --git a/workflows/oncorefiner.nf b/workflows/oncorefiner.nf index c854f55..5503288 100644 --- a/workflows/oncorefiner.nf +++ b/workflows/oncorefiner.nf @@ -62,7 +62,7 @@ workflow ONCOREFINER { ch_vcfanno_toml // channel: [optional] [path(toml_file)] ch_vep_cache // channel: [optional] [vep_cache_files] ch_vep_extra_files // channel: [optional] [path(plugin_file1), path(plugin_file2), ...] - val_cadd_resources // string: [optional] path to CADD resources directory + val_cadd_resources // string: [optional] path to CADD resources directory val_genome // string: [optional] genome assembly (e.g. "GRCh38") val_species // string: [optional] species (e.g. "homo_sapiens") val_vep_cache_version // string: [optional] version of vep cache to use (e.g. "107") @@ -99,14 +99,13 @@ workflow ONCOREFINER { tuple(meta, vcf, tbi) } .set { ch_research_filtering_in } - RESEARCH_FILTERING(ch_research_filtering_in, [], [], []) RESEARCH_FILTERING.out.vcf .map { meta, vcf -> tuple(meta, vcf, []) } - .set {ch_vep_snv} + .set { ch_vep_snv } // ANNOTATE WITH CADD - currently depends on val_cadd_resources - could be improved? 
if (val_cadd_resources) { @@ -129,6 +128,7 @@ workflow ONCOREFINER { ANNOTATE_CADD.out.vcf .join(ANNOTATE_CADD.out.tbi) .set { ch_vep_snv } + } // VEP From 217fd8d7ea93f5dfc6bc381627b32ca761302011 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Tue, 7 Apr 2026 09:37:36 +0200 Subject: [PATCH 10/23] fix --- workflows/oncorefiner.nf | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/oncorefiner.nf b/workflows/oncorefiner.nf index 5503288..80534dc 100644 --- a/workflows/oncorefiner.nf +++ b/workflows/oncorefiner.nf @@ -114,7 +114,6 @@ workflow ONCOREFINER { RESEARCH_FILTERING.out.vcf .join(TABIX_RESEARCH_FILTERING.out.index, failOnMismatch:true, failOnDuplicate:true) - .view() .set{ ch_cadd_in } ANNOTATE_CADD ( From a2c21741a72aef3de9d39ef6c6bc0130bf75087d Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Fri, 10 Apr 2026 16:35:26 +0200 Subject: [PATCH 11/23] add publishdir --- conf/subworkflows/annotate_cadd.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/conf/subworkflows/annotate_cadd.config b/conf/subworkflows/annotate_cadd.config index 0ce8b02..c97f424 100644 --- a/conf/subworkflows/annotate_cadd.config +++ b/conf/subworkflows/annotate_cadd.config @@ -45,5 +45,8 @@ process { withName: '.*:ANNOTATE_CADD:ANNOTATE_INDELS' { ext.args = { "--columns Chrom,Pos,Ref,Alt,-,CADD --output-type z --write-index=tbi" } ext.prefix = { "${input.simpleName}_ann" } + publishDir = [ + path: { "${params.outdir}/annotations" }, + mode: params.publish_dir_mode ] } } From fd0522c9e7feb210939eb7f0c3d4309ef3368032 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Mon, 13 Apr 2026 10:23:27 +0200 Subject: [PATCH 12/23] fix --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index d47a5ba..1c14b11 100644 --- a/main.nf +++ b/main.nf @@ -56,7 +56,7 @@ workflow CLINICALGENOMICS_ONCOREFINER { : channel.value([[],[]]) PREPARE_REFERENCES ( - val_vep_cache + params.vep_cache ) // From 342ea0c8be34d3a6c08a8d218453105ce9fb1faa 
Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Mon, 13 Apr 2026 11:31:47 +0200 Subject: [PATCH 13/23] fix merge bug --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index c2da2cd..7e4835a 100644 --- a/main.nf +++ b/main.nf @@ -33,10 +33,10 @@ workflow CLINICALGENOMICS_ONCOREFINER { samplesheet // channel: [mandatory] samplesheet read in from --input val_bam_normal // string: [optional] path to BAM file for the normal sample val_bai_normal // string: [optional] path to BAI file for the normal sample - val_cadd_resources // string: [optional] path to CADD resources directory - val_cadd_prescored_indels // string: [optional] path to CADD prescored indels file val_bam_tumor // string: [optional] path to BAM file for the tumor sample val_bai_tumor // string: [optional] path to BAI file for the tumor sample + val_cadd_prescored_indels // string: [optional] path to CADD prescored indels file + val_cadd_resources // string: [optional] path to CADD resources directory val_genome // string: [optional] genome assembly (e.g. 
"GRCh38") val_genome_fasta // string: [optional] path to genome fasta file val_genome_fai // string: [optional] path to genome fasta index file From 1a8e7f9bab0bc0a96e72460d00b7f3eba67de454 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Mon, 13 Apr 2026 14:03:45 +0200 Subject: [PATCH 14/23] improving CHANGELOG comment --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index dad423d..83d271f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,7 +23,7 @@ Initial release of Clinical-Genomics/oncorefiner, created with the [nf-core](htt - Added `sex` parameter [#62](https://github.com/Clinical-Genomics/oncorefiner/pull/62) - Added `SAMTOOLS/VIEW` for bam to cram conversion in the main.nf [#70](https://github.com/Clinical-Genomics/oncorefiner/pull/70) - Added `GENERATE_CYTOSURE_FILES` subworkflow and necessary nf-core modules `TIDDIT_COV` and `VCF2CYTOSURE` [#60](https://github.com/Clinical-Genomics/oncorefiner/pull/60) -- Added CADD scoring for InDels, and a test for the subworkflow [#59](https://github.com/Clinical-Genomics/oncorefiner/pull/59) +- Added CADD scoring for InDels in the subworkflow `ANNOTATE_CADD`, with a subworkflow test (stub only) [#59](https://github.com/Clinical-Genomics/oncorefiner/pull/59) ### Changed From 99819f017f4d7bd8baedd0f00b92e133412c7bc1 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Mon, 13 Apr 2026 14:11:34 +0200 Subject: [PATCH 15/23] update citations --- CITATIONS.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CITATIONS.md b/CITATIONS.md index 2b87728..07c4e21 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -10,6 +10,12 @@ ## Pipeline tools +- [CADD1](https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-021-00835-9), [2](https://academic.oup.com/nar/article/47/D1/D886/5146191) + + > Rentzsch P, Schubach M, Shendure J, Kircher M. CADD-Splice—improving genome-wide variant effect prediction using deep learning-derived splice scores. Genome Med. 
2021;13(1):31. doi:10.1186/s13073-021-00835-9 + + > Rentzsch P, Witten D, Cooper GM, Shendure J, Kircher M. CADD: predicting the deleteriousness of variants throughout the human genome. Nucleic Acids Research. 2019;47(D1):D886-D894. doi:10.1093/nar/gky1016 + - [MultiQC](https://pubmed.ncbi.nlm.nih.gov/27312411/) > Ewels P, Magnusson M, Lundin S, Käller M. MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics. 2016 Oct 1;32(19):3047-8. doi: 10.1093/bioinformatics/btw354. Epub 2016 Jun 16. PubMed PMID: 27312411; PubMed Central PMCID: PMC5039924. From 1a6e8573f8a68c09743fe900de1da814b1578f92 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Fri, 17 Apr 2026 11:35:52 +0200 Subject: [PATCH 16/23] moving tabix to subworkflow --- subworkflows/local/annotate_cadd/main.nf | 31 +++++++++++++++--------- subworkflows/local/process_snvs/main.nf | 5 +--- 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/subworkflows/local/annotate_cadd/main.nf b/subworkflows/local/annotate_cadd/main.nf index 5194330..9930318 100644 --- a/subworkflows/local/annotate_cadd/main.nf +++ b/subworkflows/local/annotate_cadd/main.nf @@ -1,5 +1,5 @@ // -// A subworkflow to annotate cadd +// A subworkflow to annotate indels with CADD scores // include { BCFTOOLS_ANNOTATE as BCFTOOLS_RENAME_CHR_CADD } from '../../../modules/nf-core/bcftools/annotate/main' @@ -9,22 +9,29 @@ include { CADD } from '../../../modules include { GAWK as GAWK_REF_TO_CADD_CHRNAMES } from '../../../modules/nf-core/gawk/main' include { GAWK as GAWK_CADD_TO_REF_CHRNAMES } from '../../../modules/nf-core/gawk/main' include { TABIX_TABIX as TABIX_CADD } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_INPUT } from '../../../modules/nf-core/tabix/tabix/main' workflow ANNOTATE_CADD { take: - ch_vcf // channel: [mandatory] [ val(meta), path(vcf), path(idx) ] - val_genome // string: [mandatory] GRCh37 or GRCh38 - ch_fai // channel: [mandatory] [ 
val(meta), path(fai) ] - ch_header // channel: [mandatory] [ path(txt) ] - ch_cadd_resources // channel: [mandatory] [ val(meta), path(dir) ] - ch_cadd_prescored_indels // channel: [mandatory] [ val(meta), path(dir) ] + ch_vcf // channel: [mandatory] [val(meta), path(vcf)] + val_genome // string: [mandatory] GRCh37 or GRCh38 + ch_fai // channel: [mandatory] [val(meta), path(fai)] + ch_header // channel: [mandatory] [path(txt)] + ch_cadd_resources // channel: [mandatory] [val(meta), path(dir)] + ch_cadd_prescored_indels // channel: [mandatory] [val(meta), path(dir)] main: ch_rename_chrs_ref = channel.value([[]]) + TABIX_INPUT(ch_vcf) //Subworkflow needs tabix index + + ch_vcf + .join(TABIX_INPUT.out.index, failOnMismatch:true, failOnDuplicate:true) + .set { ch_vcf_tbi } + // Create files and rename chromosomes if reference is GRCh38 if (val_genome.equals('GRCh38')) { @@ -39,7 +46,7 @@ workflow ANNOTATE_CADD { GAWK_CADD_TO_REF_CHRNAMES.out.output.map { _meta, txt -> txt } .set { ch_rename_chrs_ref } - ch_vcf + ch_vcf_tbi .combine(ch_chrnames_cadd) .map { meta, vcf, tbi, txt -> tuple( meta, vcf, tbi, [], [], [], [], txt ) } .set {rename_chrnames_in} @@ -49,11 +56,11 @@ workflow ANNOTATE_CADD { BCFTOOLS_RENAME_CHR_CADD.out.vcf .map {meta, vcf -> tuple( meta , vcf, [] )} - .set { ch_vcf } + .set { ch_vcf_tbi } } // Filter to extract indels - BCFTOOLS_VIEW(ch_vcf, [], [], []) + BCFTOOLS_VIEW(ch_vcf_tbi, [], [], []) // CADD CADD(BCFTOOLS_VIEW.out.vcf, ch_cadd_resources, ch_cadd_prescored_indels) @@ -62,12 +69,12 @@ workflow ANNOTATE_CADD { TABIX_CADD(CADD.out.tsv) // Change chr names back to desired naming and annotate original vcf with cadd results - ch_vcf + ch_vcf_tbi .join(CADD.out.tsv, failOnMismatch: true, failOnDuplicate: true) .join(TABIX_CADD.out.index, failOnMismatch: true, failOnDuplicate: true) .combine( ch_header ) .combine( ch_rename_chrs_ref ) - .map { meta, vcf, tbi, annotations, annotations_index, header, txt -> tuple( meta, vcf, [], annotations, 
annotations_index, [], header, txt ) } + .map { meta, vcf, tbi, annotations, annotations_index, header, txt -> tuple( meta, vcf, [], annotations, annotations_index, [], header, txt ) } //THERE IS A TBI? .set { ch_annotate } diff --git a/subworkflows/local/process_snvs/main.nf b/subworkflows/local/process_snvs/main.nf index a8054dc..061bbd0 100644 --- a/subworkflows/local/process_snvs/main.nf +++ b/subworkflows/local/process_snvs/main.nf @@ -12,7 +12,6 @@ include { ENSEMBLVEP_VEP } from '../../../modules/nf-co include { VCFANNO } from '../../../modules/nf-core/vcfanno/main' include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_RESEARCH } from '../../../modules/nf-core/bcftools/view/main' include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_CLINICAL } from '../../../modules/nf-core/bcftools/view/main' -include { TABIX_TABIX as TABIX_RESEARCH_FILTERING } from '../../../modules/nf-core/tabix/tabix/main' include { ANNOTATE_CADD } from '../../../subworkflows/local/annotate_cadd' /* @@ -74,10 +73,7 @@ workflow PROCESS_SNVS { // ANNOTATE WITH CADD - currently depends on val_cadd_resources - could be improved? 
if (val_cadd_resources) { - TABIX_RESEARCH_FILTERING(BCFTOOLS_VIEW_RESEARCH.out.vcf) //Subworkflow needs tabix index - BCFTOOLS_VIEW_RESEARCH.out.vcf - .join(TABIX_RESEARCH_FILTERING.out.index, failOnMismatch:true, failOnDuplicate:true) .set{ ch_cadd_in } ANNOTATE_CADD ( @@ -88,6 +84,7 @@ workflow PROCESS_SNVS { ch_cadd_resources, ch_cadd_prescored_indels ) + ANNOTATE_CADD.out.vcf .join(ANNOTATE_CADD.out.tbi) .set { ch_vep_snv } From dcb9352a60a8c99e4bc06976dad687afd558cce5 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Fri, 17 Apr 2026 13:13:09 +0200 Subject: [PATCH 17/23] review changes and update of snapshot --- conf/subworkflows/annotate_cadd.config | 8 +- main.nf | 4 +- subworkflows/local/annotate_cadd/main.nf | 2 +- .../local/annotate_cadd/tests/main.nf.test | 13 +-- .../annotate_cadd/tests/main.nf.test.snap | 100 ++++++------------ .../local/annotate_cadd/tests/nextflow.config | 8 +- 6 files changed, 48 insertions(+), 87 deletions(-) diff --git a/conf/subworkflows/annotate_cadd.config b/conf/subworkflows/annotate_cadd.config index c97f424..3feeece 100644 --- a/conf/subworkflows/annotate_cadd.config +++ b/conf/subworkflows/annotate_cadd.config @@ -11,7 +11,7 @@ process { ] } - withName: '.*:ANNOTATE_CADD:RENAME_CHR_CADD' { + withName: '.*:ANNOTATE_CADD:BCFTOOLS_RENAME_CHR_CADD' { ext.args = { "--output-type z" } ext.prefix = { "${input.simpleName}_indels" } } @@ -30,19 +30,19 @@ process { ext.args = { "--force --sequence 1 --begin 2 --end 2" } } - withName: '.*:ANNOTATE_CADD:CADD_TO_REFERENCE_CHRNAMES' { + withName: '.*:ANNOTATE_CADD:GAWK_CADD_TO_REF_CHRNAMES' { ext.args2 = '\'{original=$1; sub("chr","",$1); print $1, original}\'' ext.prefix = "cadd_to_reference" ext.suffix = "txt" } - withName: '.*:ANNOTATE_CADD:REFERENCE_TO_CADD_CHRNAMES' { + withName: '.*:ANNOTATE_CADD:GAWK_REF_TO_CADD_CHRNAMES' { ext.args2 = '\'{original=$1; sub("chr","",$1); print original, $1}\'' ext.prefix = "reference_to_cadd" ext.suffix = "txt" } - withName: 
'.*:ANNOTATE_CADD:ANNOTATE_INDELS' { + withName: '.*:ANNOTATE_CADD:BCFTOOLS_ANNOTATE_INDELS' { ext.args = { "--columns Chrom,Pos,Ref,Alt,-,CADD --output-type z --write-index=tbi" } ext.prefix = { "${input.simpleName}_ann" } publishDir = [ diff --git a/main.nf b/main.nf index e4a4acc..dc7d133 100644 --- a/main.nf +++ b/main.nf @@ -92,8 +92,8 @@ workflow CLINICALGENOMICS_ONCOREFINER { // Reference files ch_genome_fasta = channel.fromPath(val_genome_fasta).map { it -> [[id:it.simpleName], it] }.collect() - ch_genome_fai = channel.fromPath(val_genome_fai).map {it -> [[id:it.simpleName], it] }.collect() - ch_genome_fasta_fai = ch_genome_fasta.join(ch_genome_fai, failOnMismatch: true, failOnDuplicate: true) + ch_genome_fai = channel.fromPath(val_genome_fai).map { it -> [[id:it.simpleName], it] }.collect() + ch_genome_fasta_fai = ch_genome_fasta.join(ch_genome_fai, failOnMismatch: true, failOnDuplicate: true) // CADD input files ch_cadd_header = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() diff --git a/subworkflows/local/annotate_cadd/main.nf b/subworkflows/local/annotate_cadd/main.nf index 9930318..38da64d 100644 --- a/subworkflows/local/annotate_cadd/main.nf +++ b/subworkflows/local/annotate_cadd/main.nf @@ -74,7 +74,7 @@ workflow ANNOTATE_CADD { .join(TABIX_CADD.out.index, failOnMismatch: true, failOnDuplicate: true) .combine( ch_header ) .combine( ch_rename_chrs_ref ) - .map { meta, vcf, tbi, annotations, annotations_index, header, txt -> tuple( meta, vcf, [], annotations, annotations_index, [], header, txt ) } //THERE IS A TBI? + .map { meta, vcf, tbi, annotations, annotations_index, header, txt -> tuple( meta, vcf, tbi, annotations, annotations_index, [], header, txt ) } //THERE IS A TBI? 
.set { ch_annotate } diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test b/subworkflows/local/annotate_cadd/tests/main.nf.test index 6d57147..8b8df8f 100644 --- a/subworkflows/local/annotate_cadd/tests/main.nf.test +++ b/subworkflows/local/annotate_cadd/tests/main.nf.test @@ -19,8 +19,7 @@ nextflow_workflow { """ input[0] = channel.of([ [id:'test'], - file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz', checkIfExists: true), - file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz.tbi', checkIfExists: true) + file(params.pipelines_testdata_base_path + 'testdata/tumor_normal/subject_a.tumor.purple.somatic.vcf.gz', checkIfExists: true) ]) input[1] = 'GRCh37' input[2] = channel.fromPath(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true).map {it -> [[id:it.simpleName], it] }.collect() @@ -34,13 +33,13 @@ nextflow_workflow { then { assertAll( { assert workflow.success }, - { assert snapshot(workflow.out).match() } + { assert snapshot(workflow.out.vcf, workflow.out.tbi).match() } ) } } test("ANNOTATE_CADD - GRCh38, stub") { - + // TODO update test data to GRCh38 options "-stub" when { @@ -51,8 +50,7 @@ nextflow_workflow { """ input[0] = channel.of([ [id:'test'], - file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz', checkIfExists: true), - file(params.pipelines_testdata_base_path + 'testdata/SNV.tumor.pave.somatic.37.vcf.gz.tbi', checkIfExists: true) + file(params.pipelines_testdata_base_path + 'testdata/tumor_normal/subject_a.tumor.purple.somatic.vcf.gz', checkIfExists: true) ]) input[1] = 'GRCh38' input[2] = channel.fromPath(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true).map {it -> [[id:it.simpleName], it] }.collect() @@ -66,8 +64,7 @@ nextflow_workflow { then { assertAll( { assert workflow.success }, - { assert snapshot( - workflow.out).match() } + { assert 
snapshot(workflow.out.vcf, workflow.out.tbi).match() } ) } } diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test.snap b/subworkflows/local/annotate_cadd/tests/main.nf.test.snap index f05c43f..0a3c051 100644 --- a/subworkflows/local/annotate_cadd/tests/main.nf.test.snap +++ b/subworkflows/local/annotate_cadd/tests/main.nf.test.snap @@ -1,88 +1,52 @@ { "ANNOTATE_CADD - GRCh37, stub": { "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "SNV_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "1": [ - [ - { - "id": "test" - }, - "SNV_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "tbi": [ - [ - { - "id": "test" - }, - "SNV_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "vcf": [ - [ - { - "id": "test" - }, - "SNV_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] + [ + [ + { + "id": "test" + }, + "subject_a_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" ] - } + ], + [ + [ + { + "id": "test" + }, + "subject_a_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ] ], "meta": { "nf-test": "0.9.3", "nextflow": "25.10.4" }, - "timestamp": "2026-03-27T16:04:52.225642" + "timestamp": "2026-04-17T13:12:12.741306" }, "ANNOTATE_CADD - GRCh38, stub": { "content": [ - { - "0": [ - [ - { - "id": "test" - }, - "SNV_indels_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - "1": [ - [ - { - "id": "test" - }, - "SNV_indels_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "tbi": [ - [ - { - "id": "test" - }, - "SNV_indels_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] - ], - "vcf": [ - [ - { - "id": "test" - }, - "SNV_indels_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] + [ + [ + { + "id": "test" + }, + "subject_a_indels_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + [ + [ + { + "id": "test" + }, + "subject_a_indels_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" ] - } + ] ], "meta": { "nf-test": "0.9.3", "nextflow": "25.10.4" }, - "timestamp": 
"2026-03-30T13:58:31.285282" + "timestamp": "2026-04-17T13:12:21.913504" } } \ No newline at end of file diff --git a/subworkflows/local/annotate_cadd/tests/nextflow.config b/subworkflows/local/annotate_cadd/tests/nextflow.config index 8dfa90f..9c408d6 100644 --- a/subworkflows/local/annotate_cadd/tests/nextflow.config +++ b/subworkflows/local/annotate_cadd/tests/nextflow.config @@ -1,6 +1,6 @@ process { - withName: 'RENAME_CHR_CADD' { + withName: 'BCFTOOLS_RENAME_CHR_CADD' { ext.args = { "--output-type z" } ext.prefix = { "${input.simpleName}_indels" } } @@ -20,19 +20,19 @@ process { ext.args = { "--force --sequence 1 --begin 2 --end 2" } } - withName: 'CADD_TO_REFERENCE_CHRNAMES' { + withName: 'GAWK_CADD_TO_REF_CHRNAMES' { ext.args2 = '\'{original=$1; sub("chr","",$1); print $1, original}\'' ext.prefix = "cadd_to_reference" ext.suffix = "txt" } - withName: 'REFERENCE_TO_CADD_CHRNAMES' { + withName: 'GAWK_REF_TO_CADD_CHRNAMES' { ext.args2 = '\'{original=$1; sub("chr","",$1); print original, $1}\'' ext.prefix = "reference_to_cadd" ext.suffix = "txt" } - withName: 'ANNOTATE_INDELS' { + withName: 'BCFTOOLS_ANNOTATE_INDELS' { ext.args = { "--columns Chrom,Pos,Ref,Alt,-,CADD --output-type z --write-index=tbi" } ext.prefix = { "${input.simpleName}_ann" } } From a83477fa900ba0b1b8663785508aecd5afc1ed06 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Fri, 17 Apr 2026 13:19:31 +0200 Subject: [PATCH 18/23] add citations --- subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf b/subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf index a492c2e..a1d2893 100644 --- a/subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf @@ -180,6 +180,7 @@ def toolCitationText() { def citations_list = [] def vcfanno = "vcfanno (Pedersen et al. 
2016)" def bcftools_view = "bcftools (Danecek et al. 2021)" + def cadd = "CADD (Rentzsch et al. 2019)" def ensemblvep_vep = "Ensembl VEP (McLaren et al. 2016)" def svdb = "svdb" def multiqc = "MultiQC (Ewels et al. 2016)" @@ -189,6 +190,7 @@ def toolCitationText() { citations_list + vcfanno + bcftools_view + + cadd + ensemblvep_vep } @@ -218,6 +220,7 @@ def toolBibliographyText() { def bibliography_list = [] def vcfanno = "
<li>Pedersen BS, Layer RM, Quinlan AR. Vcfanno: fast, flexible annotation of genetic variants. Genome Biol. 2016 Jun 1;17(1):118. doi: 10.1186/s13059-016-0973-5. PMID: 27250555; PMCID: PMC4888505.</li>" def bcftools_view = "
<li>Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590845; PMCID: PMC7898596.</li>" + def cadd = "
<li>Rentzsch P, Witten D, Cooper GM, Shendure J, Kircher M. CADD: predicting the deleteriousness of variants throughout the human genome. Nucleic Acids Res. 2019 Jan 8;47(D1):D886-D894. doi: 10.1093/nar/gky1016. PMID: 30371827; PMCID: PMC6323892.</li>" def ensemblvep_vep = "
<li>McLaren W, Gil L, Hunt SE, Riat HS, Ritchie GR, Thormann A, Flicek P, Cunningham F. The Ensembl Variant Effect Predictor. Genome Biol. 2016 Jun 6;17(1):122. doi: 10.1186/s13059-016-0974-4. PMID: 27268795; PMCID: PMC4893825.</li>" def svdb = "
<li>svdb. https://github.com/J35P312/svdb.</li>" def multiqc = "
<li>Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354</li>" @@ -227,6 +230,7 @@ def toolBibliographyText() { bibliography_list + vcfanno + bcftools_view + + cadd + ensemblvep_vep } From 02d46a0f9d2ce30d320f8810ba93e6b78f387c36 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Fri, 17 Apr 2026 13:44:32 +0200 Subject: [PATCH 19/23] update process_svs test --- .../local/process_snvs/tests/main.nf.test | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/subworkflows/local/process_snvs/tests/main.nf.test b/subworkflows/local/process_snvs/tests/main.nf.test index 9f2ead2..403e0ee 100644 --- a/subworkflows/local/process_snvs/tests/main.nf.test +++ b/subworkflows/local/process_snvs/tests/main.nf.test @@ -33,27 +33,31 @@ nextflow_workflow { [id:'reference'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta', checkIfExists: true) ]) - input[1] = channel.of([ + input[1] = channel.fromPath(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true).map {it -> [[id:it.simpleName], it] }.collect() + input[2] = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() + input[3] = null + input[4] = null + input[5] = channel.of([ [id:'SNV'], file(params.pipelines_testdata_base_path + 'testdata/tumor_normal/subject_a.tumor.purple.somatic.vcf.gz', checkIfExists: true) ]) - input[2] = channel.of([ + input[6] = channel.of([ [id:'SNV'], file(params.pipelines_testdata_base_path + 'testdata/tumor_normal/subject_a.tumor.purple.sv.vcf.gz', checkIfExists: true) ]) - input[3] = [] - input[4] = channel.of([ + input[7] = [] + input[8] = channel.of([ file(params.pipelines_testdata_base_path + 'reference/vcfanno_functions.lua', checkIfExists: true) ]) - input[5] = channel.of([ + input[9] = channel.of([ 
file(params.pipelines_testdata_base_path + 'reference/grch37_gnomad_-r2.1.1-.vcf.gz', checkIfExists: true), file(params.pipelines_testdata_base_path + 'reference/grch37_gnomad_-r2.1.1-.vcf.gz.tbi', checkIfExists: true) ]) -
input[6] = channel.of([ + input[10] = channel.of([ file(params.pipelines_testdata_base_path + 'reference/vcfanno.toml', checkIfExists: true) ]) - input[7] = UNTAR_VEP_CACHE.out.untar.map{ _meta, files -> [files]}.collect() - input[8] = channel.of([ + input[11] = UNTAR_VEP_CACHE.out.untar.map{ _meta, files -> [files]}.collect() + input[12] = channel.of([ file(params.pipelines_testdata_base_path + 'reference/LoFtool_scores.txt', checkIfExists: true), file(params.pipelines_testdata_base_path + 'reference/spliceai_21_scores_raw_indel_-v1.3-.vcf.gz', checkIfExists: true), file(params.pipelines_testdata_base_path + 'reference/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz', checkIfExists: true), @@ -61,9 +65,10 @@ nextflow_workflow { file(params.pipelines_testdata_base_path + 'reference/spliceai_21_scores_raw_indel_-v1.3-.vcf.gz.tbi', checkIfExists: true), file(params.pipelines_testdata_base_path + 'reference/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz.tbi', checkIfExists: true) ]) - input[9] = 'GRCh37' - input[10] = 'homo_sapiens' - input[11] = '107' + input[13] = null + input[14] = 'GRCh37' + input[15] = 'homo_sapiens' + input[16] = '107' """ } } From ca5aae7559e72b33ace2fd2212fc9503a5cffde9 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Mon, 20 Apr 2026 13:37:26 +0200 Subject: [PATCH 20/23] updating snapshot --- main.nf | 14 ++--- .../local/annotate_cadd/tests/main.nf.test | 18 ++++++- .../annotate_cadd/tests/main.nf.test.snap | 54 +++++++++---------- .../local/annotate_cadd/tests/nextflow.config | 2 +- 4 files changed, 48 insertions(+), 40 deletions(-) diff --git a/main.nf b/main.nf index dc7d133..343c29a 100644 --- a/main.nf +++ b/main.nf @@ -66,15 +66,15 @@ workflow CLINICALGENOMICS_ONCOREFINER { // // Input channels - ch_snv_vcf = channel.fromPath(val_snv_vcf).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() - ch_snv_vcf_tbi = channel.fromPath(val_snv_vcf + '.tbi', checkIfExists: true).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() - ch_sv_vcf = 
channel.fromPath(val_sv_vcf).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() - ch_sv_vcf_tbi = channel.fromPath(val_sv_vcf + '.tbi', checkIfExists: true).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() - ch_vep_extra_files = channel.empty() - ch_svdb_dbs = channel.empty() + ch_snv_vcf = channel.fromPath(val_snv_vcf).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() + ch_snv_vcf_tbi = channel.fromPath(val_snv_vcf + '.tbi', checkIfExists: true).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() + ch_sv_vcf = channel.fromPath(val_sv_vcf).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() + ch_sv_vcf_tbi = channel.fromPath(val_sv_vcf + '.tbi', checkIfExists: true).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() + ch_vep_extra_files = channel.empty() + ch_svdb_dbs = channel.empty() // Alignment files - ch_bam_bai_normal = channel.empty() + ch_bam_bai_normal = channel.empty() if (val_bam_normal && val_bai_normal) { ch_bam_bai_normal = channel.fromPath(val_bam_normal) diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test b/subworkflows/local/annotate_cadd/tests/main.nf.test index 8b8df8f..2a59695 100644 --- a/subworkflows/local/annotate_cadd/tests/main.nf.test +++ b/subworkflows/local/annotate_cadd/tests/main.nf.test @@ -14,6 +14,7 @@ nextflow_workflow { when { params { genome = "GRCh37" + outdir = "$outputDir" } workflow { """ @@ -31,9 +32,15 @@ nextflow_workflow { } then { + // All directories and files + def output_directories_and_files = getAllFilesFromDir(params.outdir, relative: true, includeDir: true) + assertAll( { assert workflow.success }, - { assert snapshot(workflow.out.vcf, workflow.out.tbi).match() } + { assert snapshot( + // All directories and files + output_directories_and_files + ).match() } ) } } @@ -45,6 +52,7 @@ nextflow_workflow { when { params { genome = "GRCh38" + outdir = "$outputDir" } workflow { """ @@ -62,9 +70,15 @@ nextflow_workflow { } then { + // All directories and files + def output_directories_and_files = 
getAllFilesFromDir(params.outdir, relative: true, includeDir: true) + assertAll( { assert workflow.success }, - { assert snapshot(workflow.out.vcf, workflow.out.tbi).match() } + { assert snapshot( + // All directories and files + output_directories_and_files + ).match() } ) } } diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test.snap b/subworkflows/local/annotate_cadd/tests/main.nf.test.snap index 0a3c051..0f5186b 100644 --- a/subworkflows/local/annotate_cadd/tests/main.nf.test.snap +++ b/subworkflows/local/annotate_cadd/tests/main.nf.test.snap @@ -2,51 +2,45 @@ "ANNOTATE_CADD - GRCh37, stub": { "content": [ [ - [ - { - "id": "test" - }, - "subject_a_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - [ - [ - { - "id": "test" - }, - "subject_a_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] + "bcftools", + "bcftools/subject_a_ann.vcf.gz", + "bcftools/subject_a_ann.vcf.gz.tbi", + "bcftools/subject_a_indels.vcf.gz", + "cadd", + "cadd/subject_a_indels_cadd.tsv.gz", + "tabix", + "tabix/subject_a.tumor.purple.somatic.vcf.gz.tbi", + "tabix/subject_a_indels_cadd.tsv.gz.tbi" ] ], "meta": { "nf-test": "0.9.3", "nextflow": "25.10.4" }, - "timestamp": "2026-04-17T13:12:12.741306" + "timestamp": "2026-04-20T13:35:58.691949" }, "ANNOTATE_CADD - GRCh38, stub": { "content": [ [ - [ - { - "id": "test" - }, - "subject_a_indels_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" - ] - ], - [ - [ - { - "id": "test" - }, - "subject_a_indels_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" - ] + "bcftools", + "bcftools/subject_a_renamed.vcf.gz", + "bcftools/subject_a_renamed_ann.vcf.gz", + "bcftools/subject_a_renamed_ann.vcf.gz.tbi", + "bcftools/subject_a_renamed_indels.vcf.gz", + "cadd", + "cadd/subject_a_renamed_indels_cadd.tsv.gz", + "gawk", + "gawk/cadd_to_reference.txt", + "gawk/reference_to_cadd.txt", + "tabix", + "tabix/subject_a.tumor.purple.somatic.vcf.gz.tbi", + "tabix/subject_a_renamed_indels_cadd.tsv.gz.tbi" ] ], "meta": { "nf-test": "0.9.3", 
"nextflow": "25.10.4" }, - "timestamp": "2026-04-17T13:12:21.913504" + "timestamp": "2026-04-20T13:36:09.683491" } } \ No newline at end of file diff --git a/subworkflows/local/annotate_cadd/tests/nextflow.config b/subworkflows/local/annotate_cadd/tests/nextflow.config index 9c408d6..dd99056 100644 --- a/subworkflows/local/annotate_cadd/tests/nextflow.config +++ b/subworkflows/local/annotate_cadd/tests/nextflow.config @@ -2,7 +2,7 @@ process { withName: 'BCFTOOLS_RENAME_CHR_CADD' { ext.args = { "--output-type z" } - ext.prefix = { "${input.simpleName}_indels" } + ext.prefix = { "${input.simpleName}_renamed" } } withName: 'BCFTOOLS_VIEW' { From 82bcd85b20d31b6534ee22e3b8f69c81b80ea684 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Mon, 20 Apr 2026 16:45:16 +0200 Subject: [PATCH 21/23] Review implementations --- CHANGELOG.md | 2 +- conf/subworkflows/annotate_cadd.config | 15 ++++++++++++++- nextflow.config | 1 - subworkflows/local/annotate_cadd/main.nf | 16 ++++++---------- subworkflows/local/process_snvs/main.nf | 4 +--- 5 files changed, 22 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c435dfb..24fb675 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,7 +24,7 @@ Initial release of Clinical-Genomics/oncorefiner, created with the [nf-core](htt - [#60](https://github.com/Clinical-Genomics/oncorefiner/pull/60) Added `GENERATE_CYTOSURE_FILES` subworkflow and necessary nf-core modules `TIDDIT_COV` and `VCF2CYTOSURE`. - [#70](https://github.com/Clinical-Genomics/oncorefiner/pull/70) Added `SAMTOOLS/VIEW` for bam to cram conversion in the `main.nf`. - [#66](https://github.com/Clinical-Genomics/oncorefiner/pull/66) Added `PROCESS_SNVS` subworkflow. 
-- [#59](https://github.com/Clinical-Genomics/oncorefiner/pull/59) Added CADD scoring for InDels in the subworkflow `ANNOTATE_CADD`, with a subworkflow test (stub only) +- [#59](https://github.com/Clinical-Genomics/oncorefiner/pull/59) Added the `ANNOTATE_CADD` subworkflow, with an accompanying test (stub only), for CADD scoring of InDels; used in `PROCESS_SNVS`. ### `Changed` diff --git a/conf/subworkflows/annotate_cadd.config b/conf/subworkflows/annotate_cadd.config index 3feeece..301f91c 100644 --- a/conf/subworkflows/annotate_cadd.config +++ b/conf/subworkflows/annotate_cadd.config @@ -1,7 +1,20 @@ /* -Annotate with CADD +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. 
+ ext.when = Conditional clause +---------------------------------------------------------------------------------------- */ +// +// Annotate with CADD +// + process { diff --git a/nextflow.config b/nextflow.config index 5ed7f32..3a31a88 100644 --- a/nextflow.config +++ b/nextflow.config @@ -27,7 +27,6 @@ params { cadd_resources = null cadd_prescored_indels = null - // Vep vep_cache_version = 112 vep_plugin_files = null diff --git a/subworkflows/local/annotate_cadd/main.nf b/subworkflows/local/annotate_cadd/main.nf index 38da64d..e6d851e 100644 --- a/subworkflows/local/annotate_cadd/main.nf +++ b/subworkflows/local/annotate_cadd/main.nf @@ -28,9 +28,8 @@ workflow ANNOTATE_CADD { TABIX_INPUT(ch_vcf) //Subworkflow needs tabix index - ch_vcf + ch_vcf_tbi = ch_vcf .join(TABIX_INPUT.out.index, failOnMismatch:true, failOnDuplicate:true) - .set { ch_vcf_tbi } // Create files and rename chromosomes if reference is GRCh38 if (val_genome.equals('GRCh38')) { @@ -46,17 +45,15 @@ workflow ANNOTATE_CADD { GAWK_CADD_TO_REF_CHRNAMES.out.output.map { _meta, txt -> txt } .set { ch_rename_chrs_ref } - ch_vcf_tbi + rename_chrnames_in = ch_vcf_tbi .combine(ch_chrnames_cadd) .map { meta, vcf, tbi, txt -> tuple( meta, vcf, tbi, [], [], [], [], txt ) } - .set {rename_chrnames_in} // Change chr names to CADD compatible names BCFTOOLS_RENAME_CHR_CADD( rename_chrnames_in ) - BCFTOOLS_RENAME_CHR_CADD.out.vcf + ch_vcf_tbi = BCFTOOLS_RENAME_CHR_CADD.out.vcf .map {meta, vcf -> tuple( meta , vcf, [] )} - .set { ch_vcf_tbi } } // Filter to extract indels @@ -69,18 +66,17 @@ workflow ANNOTATE_CADD { TABIX_CADD(CADD.out.tsv) // Change chr names back to desired naming and annotate original vcf with cadd results - ch_vcf_tbi + ch_annotate = ch_vcf_tbi .join(CADD.out.tsv, failOnMismatch: true, failOnDuplicate: true) .join(TABIX_CADD.out.index, failOnMismatch: true, failOnDuplicate: true) .combine( ch_header ) .combine( ch_rename_chrs_ref ) .map { meta, vcf, tbi, annotations, annotations_index, header, 
txt -> tuple( meta, vcf, tbi, annotations, annotations_index, [], header, txt ) } //THERE IS A TBI? - .set { ch_annotate } BCFTOOLS_ANNOTATE_INDELS( ch_annotate ) emit: - vcf = BCFTOOLS_ANNOTATE_INDELS.out.vcf // channel: [ val(meta), path(vcf) ] - tbi = BCFTOOLS_ANNOTATE_INDELS.out.tbi // channel: [ val(meta), path(tbi) ] + vcf = BCFTOOLS_ANNOTATE_INDELS.out.vcf // channel: [val(meta), path(vcf)] + tbi = BCFTOOLS_ANNOTATE_INDELS.out.tbi // channel: [val(meta), path(tbi)] } diff --git a/subworkflows/local/process_snvs/main.nf b/subworkflows/local/process_snvs/main.nf index 061bbd0..442fef5 100644 --- a/subworkflows/local/process_snvs/main.nf +++ b/subworkflows/local/process_snvs/main.nf @@ -73,8 +73,7 @@ workflow PROCESS_SNVS { // ANNOTATE WITH CADD - currently depends on val_cadd_resources - could be improved? if (val_cadd_resources) { - BCFTOOLS_VIEW_RESEARCH.out.vcf - .set{ ch_cadd_in } + ch_cadd_in = BCFTOOLS_VIEW_RESEARCH.out.vcf ANNOTATE_CADD ( ch_cadd_in, @@ -88,7 +87,6 @@ workflow PROCESS_SNVS { ANNOTATE_CADD.out.vcf .join(ANNOTATE_CADD.out.tbi) .set { ch_vep_snv } - } ENSEMBLVEP_VEP ( From 3c5e89aee1b1c2d9aa6acfb2aa8d4a5e30fb698a Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Mon, 20 Apr 2026 17:12:15 +0200 Subject: [PATCH 22/23] update citations --- .../local/utils_nfcore_oncorefiner_pipeline/main.nf | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf b/subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf index a1d2893..b276648 100644 --- a/subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf @@ -190,8 +190,10 @@ def toolCitationText() { citations_list + vcfanno + bcftools_view + - cadd + ensemblvep_vep + if (params.cadd_resources) { + citations_list = citations_list + cadd + } } if (params.sv_vcf) { @@ -230,8 +232,10 @@ def toolBibliographyText() { bibliography_list + vcfanno + 
bcftools_view + - cadd + ensemblvep_vep + if (params.cadd_resources) { + bibliography_list = bibliography_list + cadd + } } if (params.sv_vcf) { From 4bafb21765dcfd476623c2c68c71d6a5cfda3309 Mon Sep 17 00:00:00 2001 From: kristinebilgrav Date: Tue, 21 Apr 2026 14:51:24 +0200 Subject: [PATCH 23/23] update snapshot --- .../local/annotate_cadd/tests/main.nf.test | 14 +-- .../annotate_cadd/tests/main.nf.test.snap | 98 +++++++++++++------ 2 files changed, 72 insertions(+), 40 deletions(-) diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test b/subworkflows/local/annotate_cadd/tests/main.nf.test index 2a59695..2691e46 100644 --- a/subworkflows/local/annotate_cadd/tests/main.nf.test +++ b/subworkflows/local/annotate_cadd/tests/main.nf.test @@ -32,15 +32,10 @@ nextflow_workflow { } then { - // All directories and files - def output_directories_and_files = getAllFilesFromDir(params.outdir, relative: true, includeDir: true) assertAll( { assert workflow.success }, - { assert snapshot( - // All directories and files - output_directories_and_files - ).match() } + { assert snapshot(workflow.out).match() } ) } } @@ -70,15 +65,10 @@ nextflow_workflow { } then { - // All directories and files - def output_directories_and_files = getAllFilesFromDir(params.outdir, relative: true, includeDir: true) assertAll( { assert workflow.success }, - { assert snapshot( - // All directories and files - output_directories_and_files - ).match() } + { assert snapshot(workflow.out).match() } ) } } diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test.snap b/subworkflows/local/annotate_cadd/tests/main.nf.test.snap index 0f5186b..c9dcc48 100644 --- a/subworkflows/local/annotate_cadd/tests/main.nf.test.snap +++ b/subworkflows/local/annotate_cadd/tests/main.nf.test.snap @@ -1,46 +1,88 @@ { "ANNOTATE_CADD - GRCh37, stub": { "content": [ - [ - "bcftools", - "bcftools/subject_a_ann.vcf.gz", - "bcftools/subject_a_ann.vcf.gz.tbi", - "bcftools/subject_a_indels.vcf.gz", - "cadd", - 
"cadd/subject_a_indels_cadd.tsv.gz", - "tabix", - "tabix/subject_a.tumor.purple.somatic.vcf.gz.tbi", - "tabix/subject_a_indels_cadd.tsv.gz.tbi" - ] + { + "0": [ + [ + { + "id": "test" + }, + "subject_a_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "subject_a_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tbi": [ + [ + { + "id": "test" + }, + "subject_a_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "vcf": [ + [ + { + "id": "test" + }, + "subject_a_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + } ], "meta": { "nf-test": "0.9.3", "nextflow": "25.10.4" }, - "timestamp": "2026-04-20T13:35:58.691949" + "timestamp": "2026-04-21T14:50:16.980447" }, "ANNOTATE_CADD - GRCh38, stub": { "content": [ - [ - "bcftools", - "bcftools/subject_a_renamed.vcf.gz", - "bcftools/subject_a_renamed_ann.vcf.gz", - "bcftools/subject_a_renamed_ann.vcf.gz.tbi", - "bcftools/subject_a_renamed_indels.vcf.gz", - "cadd", - "cadd/subject_a_renamed_indels_cadd.tsv.gz", - "gawk", - "gawk/cadd_to_reference.txt", - "gawk/reference_to_cadd.txt", - "tabix", - "tabix/subject_a.tumor.purple.somatic.vcf.gz.tbi", - "tabix/subject_a_renamed_indels_cadd.tsv.gz.tbi" - ] + { + "0": [ + [ + { + "id": "test" + }, + "subject_a_renamed_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "subject_a_renamed_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tbi": [ + [ + { + "id": "test" + }, + "subject_a_renamed_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "vcf": [ + [ + { + "id": "test" + }, + "subject_a_renamed_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + } ], "meta": { "nf-test": "0.9.3", "nextflow": "25.10.4" }, - "timestamp": "2026-04-20T13:36:09.683491" + "timestamp": "2026-04-21T14:50:26.527943" } } \ No newline at end of file