diff --git a/CHANGELOG.md b/CHANGELOG.md index fa72bd6..298c9a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ Initial release of Clinical-Genomics/oncorefiner, created with the [nf-core](htt - [#60](https://github.com/Clinical-Genomics/oncorefiner/pull/60) Added `GENERATE_CYTOSURE_FILES` subworkflow and necessary nf-core modules `TIDDIT_COV` and `VCF2CYTOSURE`. - [#70](https://github.com/Clinical-Genomics/oncorefiner/pull/70) Added `SAMTOOLS/VIEW` for bam to cram conversion in the `main.nf`. - [#66](https://github.com/Clinical-Genomics/oncorefiner/pull/66) Added `PROCESS_SNVS` subworkflow. +- [#59](https://github.com/Clinical-Genomics/oncorefiner/pull/59) Added `ANNOTATE_CADD` subworkflow (with an accompanying stub-only test) for CADD scoring of indels, used in `PROCESS_SNVS`. ### `Changed` diff --git a/CITATIONS.md b/CITATIONS.md index d75c30f..801daaf 100644 --- a/CITATIONS.md +++ b/CITATIONS.md @@ -12,6 +12,12 @@ ## Pipeline tools +- [CADD1](https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-021-00835-9), [2](https://academic.oup.com/nar/article/47/D1/D886/5146191) + + > Rentzsch P, Schubach M, Shendure J, Kircher M. CADD-Splice—improving genome-wide variant effect prediction using deep learning-derived splice scores. Genome Med. 2021;13(1):31. doi:10.1186/s13073-021-00835-9 + + > Rentzsch P, Witten D, Cooper GM, Shendure J, Kircher M. CADD: predicting the deleteriousness of variants throughout the human genome. Nucleic Acids Research. 2019;47(D1):D886-D894. doi:10.1093/nar/gky1016 + - [`bcftools`](https://github.com/samtools/bcftools) > Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Jan 29;10(2):giab008. doi: 10.1093/gigascience/giab008. PubMed PMID: 33590861; PubMed Central PMCID: PMC7931819.
diff --git a/assets/cadd_to_vcf_header.txt b/assets/cadd_to_vcf_header.txt new file mode 100644 index 0000000..8deee48 --- /dev/null +++ b/assets/cadd_to_vcf_header.txt @@ -0,0 +1 @@ +##INFO=
diff --git a/conf/subworkflows/annotate_cadd.config b/conf/subworkflows/annotate_cadd.config
new file mode 100644
index 0000000..301f91c
--- /dev/null
+++ b/conf/subworkflows/annotate_cadd.config
@@ -0,0 +1,73 @@
+/*
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Config file for defining DSL2 per module options and publishing paths
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    Available keys to override module options:
+        ext.args   = Additional arguments appended to command in module.
+        ext.args2  = Second set of arguments appended to command in module (multi-tool modules).
+        ext.args3  = Third set of arguments appended to command in module (multi-tool modules).
+        ext.prefix = File name prefix for output files.
+        ext.when   = Conditional clause
+----------------------------------------------------------------------------------------
+*/
+
+//
+// Annotate with CADD
+//
+
+
+process {
+
+    // Publishing is switched off for every process of the subworkflow by this selector.
+    withName: '.*:ANNOTATE_CADD:.*' {
+        publishDir = [
+            enabled: false
+        ]
+    }
+
+    withName: '.*:ANNOTATE_CADD:BCFTOOLS_RENAME_CHR_CADD' {
+        ext.args   = { "--output-type z" }
+        ext.prefix = { "${input.simpleName}_indels" }
+    }
+
+    // Keep only records of type "indels" or "other" for scoring.
+    withName: '.*:ANNOTATE_CADD:BCFTOOLS_VIEW' {
+        ext.args   = { "--output-type z --types indels,other" }
+        ext.prefix = { "${vcf.simpleName}_indels" }
+    }
+
+    withName: '.*:ANNOTATE_CADD:CADD' {
+        ext.args   = { "-g ${params.genome}" }
+        ext.prefix = { "${vcf.simpleName}_cadd" }
+    }
+
+    // Index the CADD table: sequence name in column 1, start and end both in column 2.
+    withName: '.*:ANNOTATE_CADD:TABIX_CADD' {
+        ext.args = { "--force --sequence 1 --begin 2 --end 2" }
+    }
+
+    // Prints "<name without chr> <original name>" per input line.
+    // The pattern is anchored so only a leading "chr" is removed, never a "chr"
+    // that happens to occur inside a contig name.
+    withName: '.*:ANNOTATE_CADD:GAWK_CADD_TO_REF_CHRNAMES' {
+        ext.args2  = '\'{original=$1; sub(/^chr/,"",$1); print $1, original}\''
+        ext.prefix = "cadd_to_reference"
+        ext.suffix = "txt"
+    }
+
+    // Prints "<original name> <name without chr>" per input line (same anchoring).
+    withName: '.*:ANNOTATE_CADD:GAWK_REF_TO_CADD_CHRNAMES' {
+        ext.args2  = '\'{original=$1; sub(/^chr/,"",$1); print original, $1}\''
+        ext.prefix = "reference_to_cadd"
+        ext.suffix = "txt"
+    }
+
+    // The annotated VCF is the one output of the subworkflow with a publish path.
+    withName: '.*:ANNOTATE_CADD:BCFTOOLS_ANNOTATE_INDELS' {
+        ext.args   = { "--columns Chrom,Pos,Ref,Alt,-,CADD --output-type z --write-index=tbi" }
+        ext.prefix = { "${input.simpleName}_ann" }
+        publishDir = [
+            path: { "${params.outdir}/annotations" },
+            mode: params.publish_dir_mode
+        ]
+    }
+}
diff --git a/conf/test.config b/conf/test.config index 7eed828..5bf10a6 100644 --- a/conf/test.config +++ b/conf/test.config @@ -51,4 +51,5 @@ params { svdb_query_dbs = params.pipelines_testdata_base_path + 'reference/svdb_querydb_files.csv' + // TODO make/insert mock input for CADD } diff --git a/docs/parameters.md b/docs/parameters.md index ed78e4c..3931d97 100644 --- a/docs/parameters.md +++ b/docs/parameters.md @@ -29,6 +29,8 @@ Reference genome related files and options required for the workflow. | `genome` | Name of the genome reference. (accepted: `GRCh38`\|`GRCh37`)
HelpUse this parameter to specify the ID for the reference genome used. This is then used to annotate the SV and SNV files e.g. `--genome GRCh38`.
| `string` | GRCh38 | | | | `fasta` | Path to FASTA genome file.
HelpIf you don't have a BWA index available this will be generated for you automatically. Combine with `--save_reference` to save BWA index for future runs.
| `string` | | | | | `fai` | Path to FASTA genome index file.
HelpIf none provided, will be generated automatically from the FASTA reference
| `string` | | | | +| `cadd_prescored_indels` | Path to a directory containing prescored indels for CADD.
HelpThis folder contains the compressed files and indexes that would otherwise be in data/prescored folder as described in https://github.com/kircherlab/CADD-scripts/#manual-installation.
| `string` | | | | +| `cadd_resources` | Path to the directory containing cadd annotations.
HelpThis folder contains the uncompressed files that would otherwise be in data/annotation folder as described in https://github.com/kircherlab/CADD-scripts/#manual-installation.
| `string` | | | | | `species` | Species of the reference genome. E.g. `--species homo_sapiens`. (accepted: `homo_sapiens`) | `string` | homo_sapiens | | | ## Annotation options diff --git a/main.nf b/main.nf index afdb260..343c29a 100644 --- a/main.nf +++ b/main.nf @@ -35,6 +35,8 @@ workflow CLINICALGENOMICS_ONCOREFINER { val_bai_normal // string: [optional] path to BAI file for the normal sample val_bam_tumor // string: [optional] path to BAM file for the tumor sample val_bai_tumor // string: [optional] path to BAI file for the tumor sample + val_cadd_prescored_indels // string: [optional] path to CADD prescored indels file + val_cadd_resources // string: [optional] path to CADD resources directory val_genome // string: [optional] genome assembly (e.g. "GRCh38") val_genome_fasta // string: [optional] path to genome fasta file val_genome_fai // string: [optional] path to genome fasta index file @@ -64,15 +66,15 @@ workflow CLINICALGENOMICS_ONCOREFINER { // // Input channels - ch_snv_vcf = channel.fromPath(val_snv_vcf).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() - ch_snv_vcf_tbi = channel.fromPath(val_snv_vcf + '.tbi', checkIfExists: true).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() - ch_sv_vcf = channel.fromPath(val_sv_vcf).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() - ch_sv_vcf_tbi = channel.fromPath(val_sv_vcf + '.tbi', checkIfExists: true).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() - ch_vep_extra_files = channel.empty() - ch_svdb_dbs = channel.empty() + ch_snv_vcf = channel.fromPath(val_snv_vcf).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() + ch_snv_vcf_tbi = channel.fromPath(val_snv_vcf + '.tbi', checkIfExists: true).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() + ch_sv_vcf = channel.fromPath(val_sv_vcf).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() + ch_sv_vcf_tbi = channel.fromPath(val_sv_vcf + '.tbi', checkIfExists: true).map { vcf -> [[id:vcf.simpleName], vcf] }.collect() + ch_vep_extra_files = 
channel.empty() + ch_svdb_dbs = channel.empty() // Alignment files - ch_bam_bai_normal = channel.empty() + ch_bam_bai_normal = channel.empty() if (val_bam_normal && val_bai_normal) { ch_bam_bai_normal = channel.fromPath(val_bam_normal) @@ -89,9 +91,17 @@ workflow CLINICALGENOMICS_ONCOREFINER { } // Reference files - ch_genome_fasta = channel.fromPath(val_genome_fasta).map { it -> [[id:it.simpleName], it] }.collect() - ch_genome_fai = channel.fromPath(val_genome_fai).map { it -> [[id:it.simpleName], it] }.collect() - ch_genome_fasta_fai = ch_genome_fasta.join(ch_genome_fai, failOnMismatch: true, failOnDuplicate: true) + ch_genome_fasta = channel.fromPath(val_genome_fasta).map { it -> [[id:it.simpleName], it] }.collect() + ch_genome_fai = channel.fromPath(val_genome_fai).map { it -> [[id:it.simpleName], it] }.collect() + ch_genome_fasta_fai = ch_genome_fasta.join(ch_genome_fai, failOnMismatch: true, failOnDuplicate: true) + + // CADD input files + ch_cadd_header = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() + ch_cadd_resources = val_cadd_resources ? channel.fromPath(val_cadd_resources).map { it -> [[id:'cadd_resources'], it] }.collect() + : channel.value([]) + + ch_cadd_prescored_indels = val_cadd_prescored_indels ? channel.fromPath(val_cadd_prescored_indels).map { it -> [[id:'cadd_prescored_indels'], it] }.collect() + : channel.value([]) // Input for VEP ch_vep_extra_files_unsplit = val_vep_plugin_files ? 
channel.fromPath(val_vep_plugin_files).collect() : channel.value([]) @@ -129,7 +139,11 @@ workflow CLINICALGENOMICS_ONCOREFINER { samplesheet, ch_bam_bai_normal, ch_bam_bai_tumor, + ch_cadd_header, + ch_cadd_prescored_indels, + ch_cadd_resources, ch_genome_fasta, + ch_genome_fai, ch_snv_vcf, ch_snv_vcf_tbi, ch_sv_dbs, @@ -141,6 +155,7 @@ workflow CLINICALGENOMICS_ONCOREFINER { ch_vcfanno_toml, PREPARE_REFERENCES.out.vep_resources, ch_vep_extra_files, + val_cadd_resources, val_genome, val_species, val_vep_cache_version @@ -194,6 +209,8 @@ workflow { params.bai_normal, params.bam_tumor, params.bai_tumor, + params.cadd_prescored_indels, + params.cadd_resources, params.genome, params.fasta, params.fai, diff --git a/modules.json b/modules.json index ace8340..cfc8ee9 100644 --- a/modules.json +++ b/modules.json @@ -25,11 +25,21 @@ "git_sha": "6383d8fe58f9498eecd5aa303e71a4a932d1e9f6", "installed_by": ["modules"] }, + "cadd": { + "branch": "master", + "git_sha": "64ab14a6905e5c9d649f61e2757a1e600dbdb8e0", + "installed_by": ["modules"] + }, "ensemblvep/vep": { "branch": "master", "git_sha": "34505e1fc5e9f4fd641210ca440acff6bd33b842", "installed_by": ["modules"] }, + "gawk": { + "branch": "master", + "git_sha": "c0da8f3a26835d663873001382a708f75766fec6", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "2c73cc8fa92cf48de3da0b643fdf357a8a290b36", diff --git a/modules/nf-core/cadd/environment.yml b/modules/nf-core/cadd/environment.yml new file mode 100644 index 0000000..39701b4 --- /dev/null +++ b/modules/nf-core/cadd/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - bioconda::cadd-scripts=1.7.3 diff --git a/modules/nf-core/cadd/main.nf b/modules/nf-core/cadd/main.nf new file mode 100644 index 0000000..771d144 --- /dev/null +++ b/modules/nf-core/cadd/main.nf @@ -0,0 +1,53 @@ 
+process CADD {
+    tag "${meta.id}"
+    label 'process_medium'
+
+    conda "${moduleDir}/environment.yml"
+    container 'docker.io/clinicalgenomics/cadd-with-scripts:1.7.3'
+
+    // Mount the annotation directory and, when one is given, the prescored directory
+    // under /cadd-scripts/data inside the container.
+    // Singularity and Apptainer get the `-B` bind flag; any other engine gets `-v`.
+    // The individual mounts are joined with single spaces into one option string.
+    containerOptions {
+        def bind_flag = ['singularity', 'apptainer'].contains(workflow.containerEngine) ? '-B' : '-v'
+        def mounts = ["${bind_flag} ${annotation_dir}:/cadd-scripts/data/annotations"]
+        if (prescored_dir) {
+            mounts << "${bind_flag} ${prescored_dir}:/cadd-scripts/data/prescored"
+        }
+        mounts.join(' ')
+    }
+
+    input:
+    tuple val(meta), path(vcf)
+    tuple val(meta2), val(annotation_dir)
+    tuple val(meta3), val(prescored_dir)
+
+    output:
+    tuple val(meta), path("${prefix}.tsv.gz"), emit: tsv
+    tuple val("${task.process}"), val("cadd"), val("1.7.3"), emit: versions_cadd, topic: versions
+    // WARN: The tool does not report its version on the CLI. Update the hard-coded version string above when bumping the container.
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    def args = task.ext.args ?: ''
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    export XDG_CACHE_HOME=\$PWD/snakemake_cache
+    export MPLCONFIGDIR=.
+    mkdir -p \$XDG_CACHE_HOME
+
+    CADD.sh \\
+        -m \\
+        -o ${prefix}.tsv.gz \\
+        ${args} \\
+        ${vcf}
+    """
+
+    stub:
+    prefix = task.ext.prefix ?: "${meta.id}"
+    """
+    echo "" | gzip > ${prefix}.tsv.gz
+    """
+}
diff --git a/modules/nf-core/cadd/meta.yml b/modules/nf-core/cadd/meta.yml new file mode 100644 index 0000000..1efaa94 --- /dev/null +++ b/modules/nf-core/cadd/meta.yml @@ -0,0 +1,94 @@ +name: "cadd" +description: CADD is a tool for scoring the deleteriousness of single nucleotide + variants as well as insertion/deletions variants in the human genome.
+keywords: + - cadd + - annotate + - variants +tools: + - "cadd": + description: "CADD scripts release for offline scoring" + homepage: "https://cadd.gs.washington.edu/" + documentation: "https://github.com/kircherlab/CADD-scripts/blob/master/README.md" + tool_dev_url: "https://github.com/kircherlab/CADD-scripts/" + doi: "10.1093/nar/gky1016" + licence: + - Restricted. Free for non-commercial users. + identifier: biotools:cadd_phred +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - vcf: + type: file + description: Input file for annotation in vcf or vcf.gz format + pattern: "*.{vcf,vcf.gz}" + ontologies: [] + - - meta2: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - annotation_dir: + type: directory + description: | + Path to folder containing the vcf files with precomputed CADD scores. + This folder contains the uncompressed files that would otherwise be in data/annotation folder as described in https://github.com/kircherlab/CADD-scripts/#manual-installation. + - - meta3: + type: map + description: | + Groovy Map containing reference information + e.g. [ id:'test' ] + - prescored_dir: + type: directory + description: | + Path to folder containing prescored CADD score files. + Expected structure mirrors data/prescored/ from the CADD-scripts installation: + / + GRCh38_v1.7/ + incl_anno/ # *.tsv.gz + *.tsv.gz.tbi (scores with annotations) + no_anno/ # *.tsv.gz + *.tsv.gz.tbi (scores only) + GRCh37_v1.7/ + incl_anno/ + no_anno/ + See https://github.com/kircherlab/CADD-scripts/#manual-installation for details. +output: + tsv: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
[ id:'test', single_end:false ] + - ${prefix}.tsv.gz: + type: file + description: Annotated tsv file + pattern: "*.{tsv,tsv.gz}" + ontologies: + - edam: http://edamontology.org/format_3475 + versions_cadd: + - - ${task.process}: + type: string + description: The name of the process + - cadd: + type: string + description: The name of the tool + - 1.7.3: + type: string + description: The expression to obtain the version of the tool +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - cadd: + type: string + description: The name of the tool + - 1.7.3: + type: string + description: The expression to obtain the version of the tool +authors: + - "@ramprasadn" +maintainers: + - "@ramprasadn" diff --git a/modules/nf-core/cadd/tests/main.nf.test b/modules/nf-core/cadd/tests/main.nf.test new file mode 100644 index 0000000..c328790 --- /dev/null +++ b/modules/nf-core/cadd/tests/main.nf.test @@ -0,0 +1,39 @@ +nextflow_process { + + name "Test Process CADD" + + script "../main.nf" + process "CADD" + + tag "modules" + tag "modules_nfcore" + tag "cadd" + + config "./nextflow.config" + + test("test_cadd - stub") { + options '-stub' + when { + + process { + """ + input[0] = [ + [id:'test',single_end:false],// meta map + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/gvcf/test.genome.vcf',checkIfExists:true) + ] + input[1] = Channel.from("\$PWD").map { dir -> [ [ id: dir ], dir ] } + input[2] = Channel.from("/").map { dir -> [ [ id: dir ], dir ] } + """ + } + } + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out, + process.out.versions.collect{ path(it).yaml } + ).match() } + ) + } + } +} diff --git a/modules/nf-core/cadd/tests/main.nf.test.snap b/modules/nf-core/cadd/tests/main.nf.test.snap new file mode 100644 index 0000000..5e38eea --- /dev/null +++ b/modules/nf-core/cadd/tests/main.nf.test.snap @@ -0,0 +1,48 @@ +{ + "test_cadd - stub": { + "content": [ + { + "0": [ + [ + 
{ + "id": "test", + "single_end": false + }, + "test.tsv.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + "CADD", + "cadd", + "1.7.3" + ] + ], + "tsv": [ + [ + { + "id": "test", + "single_end": false + }, + "test.tsv.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "versions_cadd": [ + [ + "CADD", + "cadd", + "1.7.3" + ] + ] + }, + [ + + ] + ], + "timestamp": "2026-03-01T12:08:37.372500636", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/cadd/tests/nextflow.config b/modules/nf-core/cadd/tests/nextflow.config new file mode 100644 index 0000000..bd24d9f --- /dev/null +++ b/modules/nf-core/cadd/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'CADD' { + container = "nf-core/ubuntu:22.04" //Using an basic container because v1.7.3 is too big for CI. + } +} diff --git a/modules/nf-core/gawk/environment.yml b/modules/nf-core/gawk/environment.yml new file mode 100644 index 0000000..185a0f5 --- /dev/null +++ b/modules/nf-core/gawk/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - conda-forge::gawk=5.3.1
diff --git a/modules/nf-core/gawk/main.nf b/modules/nf-core/gawk/main.nf
new file mode 100644
index 0000000..33dd24c
--- /dev/null
+++ b/modules/nf-core/gawk/main.nf
@@ -0,0 +1,71 @@
+process GAWK {
+    tag "$meta.id"
+    label 'process_single'
+
+    conda "${moduleDir}/environment.yml"
+    container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ?
+        'https://community-cr-prod.seqera.io/docker/registry/v2/blobs/sha256/a1/a125c778baf3865331101a104b60d249ee15fe1dca13bdafd888926cc5490a34/data' :
+        'community.wave.seqera.io/library/gawk:5.3.1--e09efb5dfc4b8156' }"
+
+    input:
+    tuple val(meta), path(input, arity: '0..*')
+    path(program_file)
+    val(disable_redirect_output)
+
+    output:
+    tuple val(meta), path("*.${suffix}"), emit: output
+    tuple val("${task.process}"), val('gawk'), eval("awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//'"), topic: versions, emit: versions_gawk
+
+    when:
+    task.ext.when == null || task.ext.when
+
+    script:
+    // ext.args carries the main awk arguments; ext.args2 carries the inline program
+    // that is used only when no program file has been given.
+    def args  = task.ext.args  ?: ''
+    def args2 = task.ext.args2 ?: ''
+    def strip_gz = { name -> name - ~/\.gz$/ }
+
+    prefix = task.ext.prefix ?: "${meta.id}"
+    // Without ext.suffix, fall back to the extension of the first input file.
+    suffix = task.ext.suffix ?: "${input.collect{ staged -> staged.getExtension()}.get(0)}"
+
+    program = program_file ? "-f ${program_file}" : "${args2}"
+
+    // Inputs whose extension ends in "gz" are gunzipped up front, awk is pointed at
+    // the names without ".gz", and those decompressed files are removed afterwards.
+    lst_gz    = input.findAll { staged -> staged.getExtension().endsWith("gz") }.collect { staged -> staged.toString() }
+    unzip     = lst_gz ? "gunzip -q -f ${lst_gz.join(" ")}" : ""
+    input_cmd = input.collect { staged -> strip_gz(staged.toString()) }.join(" ")
+    cleanup   = lst_gz ? "rm ${lst_gz.collect{ gz_name -> strip_gz(gz_name) }.join(" ")}" : ""
+
+    // Shell redirection (through gzip for a "gz" suffix) unless the caller disabled it.
+    output_cmd = suffix.endsWith("gz") ? "| gzip > ${prefix}.${suffix}" : "> ${prefix}.${suffix}"
+    output     = disable_redirect_output ? "" : output_cmd
+
+    // Fail early when an input file has the same name as the output file.
+    input.each { staged ->
+        assert staged.name != "${prefix}.${suffix}" : "Input and output names are the same, set prefix in module configuration to disambiguate!"
+    }
+
+    """
+    ${unzip}
+
+    awk \\
+        ${args} \\
+        ${program} \\
+        ${input_cmd} \\
+        ${output}
+
+    ${cleanup}
+    """
+
+    stub:
+    prefix = task.ext.prefix ?: "${meta.id}"
+    suffix = task.ext.suffix ?: "${input.collect{ staged -> staged.getExtension()}.get(0)}"
+    def create_cmd = suffix.endsWith("gz") ? "echo '' | gzip >" : "touch"
+
+    """
+    ${create_cmd} ${prefix}.${suffix}
+    """
+}
diff --git a/modules/nf-core/gawk/meta.yml b/modules/nf-core/gawk/meta.yml new file mode 100644 index 0000000..96cd0c7 --- /dev/null +++ b/modules/nf-core/gawk/meta.yml @@ -0,0 +1,84 @@ +name: "gawk" +description: | + If you are like many computer users, you would frequently like to make changes in various text files + wherever certain patterns appear, or extract data from parts of certain lines while discarding the rest. + The job is easy with awk, especially the GNU implementation gawk. +keywords: + - gawk + - awk + - txt + - text + - file parsing +tools: + - "gawk": + description: "GNU awk" + homepage: "https://www.gnu.org/software/gawk/" + documentation: "https://www.gnu.org/software/gawk/manual/" + tool_dev_url: "https://www.gnu.org/prep/ftp.html" + licence: + - "GPL v3" + identifier: "" +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. [ id:'test', single_end:false ] + - input: + type: file + description: The input file - Specify the logic that needs to be executed + on this file on the `ext.args2` or in the program file. If the files + have a `.gz` extension, they will be unzipped using `zcat`. + pattern: "*" + ontologies: [] + - program_file: + type: file + description: Optional file containing logic for awk to execute. If you don't + wish to use a file, you can use `ext.args2` to specify the logic. + pattern: "*" + ontologies: [] + - disable_redirect_output: + type: boolean + description: Disable the redirection of awk output to a given file. This is + useful if you want to use awk's built-in redirect to write files instead + of the shell's redirect. +output: + output: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g.
[ id:'test', single_end:false ] + - "*.${suffix}": + type: file + description: The output file - if using shell redirection, specify the + name of this file using `ext.prefix` and the extension using + `ext.suffix`. Otherwise, ensure the awk program produces files with + the extension in `ext.suffix`. + pattern: "*" + ontologies: [] + versions_gawk: + - - ${task.process}: + type: string + description: The name of the process + - gawk: + type: string + description: The name of the tool + - awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//': + type: eval + description: The expression to obtain the version of the tool +topics: + versions: + - - ${task.process}: + type: string + description: The name of the process + - gawk: + type: string + description: The name of the tool + - awk -Wversion | sed '1!d; s/.*Awk //; s/,.*//': + type: eval + description: The expression to obtain the version of the tool +authors: + - "@nvnieuwk" +maintainers: + - "@nvnieuwk" diff --git a/modules/nf-core/gawk/tests/main.nf.test b/modules/nf-core/gawk/tests/main.nf.test new file mode 100644 index 0000000..3bd0a43 --- /dev/null +++ b/modules/nf-core/gawk/tests/main.nf.test @@ -0,0 +1,211 @@ +nextflow_process { + + name "Test Process GAWK" + script "../main.nf" + process "GAWK" + + tag "modules" + tag "modules_nfcore" + tag "gawk" + + config "./nextflow.config" + + test("Convert fasta to bed") { + when { + params { + gawk_suffix = "bed" + gawk_args2 = '\'BEGIN { FS = OFS = "\t"}; { print \$1, "0", \$2 }\'' + } + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = [] + input[2] = false + """ + } + } + + then { + assert process.success + assert snapshot(sanitizeOutput(process.out)).match() + } + } + + test("Convert fasta to bed - stub") { + + options "-stub" + + when { + params { + gawk_suffix = "bed" + gawk_args2 = '\'BEGIN { FS = OFS = "\t"}; { print \$1, "0", \$2 }\'' + 
} + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = [] + input[2] = false + """ + } + } + + then { + assert process.success + assert snapshot(process.out).match() + } + } + + test("Convert fasta to bed with program file") { + when { + params { + gawk_suffix = "bed" + gawk_args2 = "" + } + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = Channel.of('BEGIN { FS = OFS = "\t"}; { print \$1, "0", \$2 }').collectFile(name:"program.awk") + input[2] = false + """ + } + } + + then { + assert process.success + assert snapshot(sanitizeOutput(process.out)).match() + } + } + + test("Convert fasta to bed using awk redirect instead of shell redirect") { + when { + params { + gawk_suffix = "bed" + gawk_args2 = '\'BEGIN { FS = OFS = "\t"}; { print \$1, "0", \$2 > "test.bed" }\'' + } + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = [] + input[2] = true + """ + } + } + + then { + assert process.success + assert snapshot(sanitizeOutput(process.out)).match() + } + } + + test("Extract first column from multiple files") { + when { + params { + gawk_suffix = "bed" + gawk_args2 = "" + } + process { + """ + input[0] = [ + [ id:'test' ], + [file(params.modules_testdata_base_path + 'generic/txt/hello.txt', checkIfExists: true), + file(params.modules_testdata_base_path + 'generic/txt/species_names.txt', checkIfExists: true)] + ] + input[1] = Channel.of('BEGIN {FS=" "}; {print \$1}').collectFile(name:"program.awk") + input[2] = false + """ + } + } + + then { + assert process.success + assert snapshot(sanitizeOutput(process.out)).match() + } + } + + test("Unzip files before processing") { + when { + params { + 
gawk_suffix = "bed" + gawk_args2 = "" + } + process { + """ + input[0] = [ + [ id:'test' ], + [file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/vcf/NA12878_chrM.vcf.gz', checkIfExists: true), + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/illumina/vcf/NA24385_sv.vcf.gz', checkIfExists: true)] + ] + input[1] = Channel.of('/^#CHROM/ { print \$1, \$10 }').collectFile(name:"column_header.awk") + input[2] = false + """ + } + } + + then { + assert process.success + assert snapshot(sanitizeOutput(process.out)).match() + } + } + + test("Compress after processing") { + when { + params { + gawk_suffix = "txt.gz" + gawk_args2 = '\'BEGIN { FS = OFS = "\t"}; { print \$1, "0", \$2 }\'' + } + process { + """ + input[0] = [ + [ id:'test' ], + file(params.modules_testdata_base_path + 'genomics/homo_sapiens/genome/genome.fasta.fai', checkIfExists: true) + ] + input[1] = [] + input[2] = false + """ + } + } + + then { + assert process.success + assert snapshot(sanitizeOutput(process.out)).match() + } + } + + test("Input and output files are similar") { + when { + params { + gawk_suffix = "txt" + gawk_args = "" + gawk_args2 = "" + } + process { + """ + input[0] = [ + [ id:'hello' ], + [file(params.modules_testdata_base_path + 'generic/txt/hello.txt', checkIfExists: true), + file(params.modules_testdata_base_path + 'generic/txt/species_names.txt', checkIfExists: true)] + ] + input[1] = Channel.of('BEGIN {FS=" "}; {print \$1}').collectFile(name:"program.awk") + input[2] = false + """ + } + } + + then { + assert process.failed + assert process.errorReport.contains("Input and output names are the same, set prefix in module configuration to disambiguate!") + } + } +} diff --git a/modules/nf-core/gawk/tests/main.nf.test.snap b/modules/nf-core/gawk/tests/main.nf.test.snap new file mode 100644 index 0000000..9d6a369 --- /dev/null +++ b/modules/nf-core/gawk/tests/main.nf.test.snap @@ -0,0 +1,199 @@ +{ + "Compress after processing": { + "content": [ + { 
+ "output": [ + [ + { + "id": "test" + }, + "test.txt.gz:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:31:50.761549948", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Convert fasta to bed": { + "content": [ + { + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:30:50.804933797", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Convert fasta to bed with program file": { + "content": [ + { + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:31:10.838989113", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Convert fasta to bed - stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "test.bed:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "1": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ], + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:31:00.182649403", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Extract first column from multiple files": { + "content": [ + { + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,566c51674bd643227bb2d83e0963376d" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:31:30.796772884", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Unzip files before processing": { + "content": [ + { + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,1e31ebd4a060aab5433bbbd9ab24e403" + ] + ], + "versions_gawk": 
[ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:31:40.72259289", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + }, + "Convert fasta to bed using awk redirect instead of shell redirect": { + "content": [ + { + "output": [ + [ + { + "id": "test" + }, + "test.bed:md5,87a15eb9c2ff20ccd5cd8735a28708f7" + ] + ], + "versions_gawk": [ + [ + "GAWK", + "gawk", + "5.3.1" + ] + ] + } + ], + "timestamp": "2026-03-04T11:31:20.33222004", + "meta": { + "nf-test": "0.9.4", + "nextflow": "25.10.4" + } + } +} \ No newline at end of file diff --git a/modules/nf-core/gawk/tests/nextflow.config b/modules/nf-core/gawk/tests/nextflow.config new file mode 100644 index 0000000..895709a --- /dev/null +++ b/modules/nf-core/gawk/tests/nextflow.config @@ -0,0 +1,6 @@ +process { + withName: GAWK { + ext.suffix = params.gawk_suffix + ext.args2 = params.gawk_args2 + } +} diff --git a/nextflow.config b/nextflow.config index 6472c1e..3a31a88 100644 --- a/nextflow.config +++ b/nextflow.config @@ -23,6 +23,10 @@ params { bai_normal = null sex = null + // CADD + cadd_resources = null + cadd_prescored_indels = null + // Vep vep_cache_version = 112 vep_plugin_files = null @@ -292,6 +296,7 @@ validation { // Load modules.config for DSL2 module specific options includeConfig 'conf/modules.config' +includeConfig 'conf/subworkflows/annotate_cadd.config' includeConfig 'conf/subworkflows/prepare_references.config' includeConfig 'conf/subworkflows/process_snvs.config' includeConfig 'conf/subworkflows/generate_cytosure_files.config' diff --git a/nextflow_schema.json b/nextflow_schema.json index f531dbd..96bdc59 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -129,6 +129,22 @@ "pattern": "^\\S+\\.fn?a(sta)?\\.fai$", "fa_icon": "fas fa-file" }, + "cadd_prescored_indels": { + "type": "string", + "exists": true, + "format": "directory-path", + "fa_icon": "fas fa-file", + "description": "Path to a directory containing prescored indels for CADD.", + 
"help_text": "This folder contains the compressed files and indexes that would otherwise be in data/prescored folder as described in https://github.com/kircherlab/CADD-scripts/#manual-installation." + }, + "cadd_resources": { + "type": "string", + "exists": true, + "format": "directory-path", + "fa_icon": "fas fa-file", + "description": "Path to the directory containing cadd annotations.", + "help_text": "This folder contains the uncompressed files that would otherwise be in data/annotation folder as described in https://github.com/kircherlab/CADD-scripts/#manual-installation." + }, "species": { "type": "string", "default": "homo_sapiens", diff --git a/subworkflows/local/annotate_cadd/main.nf b/subworkflows/local/annotate_cadd/main.nf new file mode 100644 index 0000000..e6d851e --- /dev/null +++ b/subworkflows/local/annotate_cadd/main.nf @@ -0,0 +1,82 @@ +// +// A subworkflow to annotate indels with CADD scores +// + +include { BCFTOOLS_ANNOTATE as BCFTOOLS_RENAME_CHR_CADD } from '../../../modules/nf-core/bcftools/annotate/main' +include { BCFTOOLS_ANNOTATE as BCFTOOLS_ANNOTATE_INDELS } from '../../../modules/nf-core/bcftools/annotate/main' +include { BCFTOOLS_VIEW } from '../../../modules/nf-core/bcftools/view/main' +include { CADD } from '../../../modules/nf-core/cadd/main' +include { GAWK as GAWK_REF_TO_CADD_CHRNAMES } from '../../../modules/nf-core/gawk/main' +include { GAWK as GAWK_CADD_TO_REF_CHRNAMES } from '../../../modules/nf-core/gawk/main' +include { TABIX_TABIX as TABIX_CADD } from '../../../modules/nf-core/tabix/tabix/main' +include { TABIX_TABIX as TABIX_INPUT } from '../../../modules/nf-core/tabix/tabix/main' + + +workflow ANNOTATE_CADD { + + take: + ch_vcf // channel: [mandatory] [val(meta), path(vcf)] + val_genome // string: [mandatory] GRCh37 or GRCh38 + ch_fai // channel: [mandatory] [val(meta), path(fai)] + ch_header // channel: [mandatory] [path(txt)] + ch_cadd_resources // channel: [mandatory] [val(meta), path(dir)] + ch_cadd_prescored_indels 
// channel: [mandatory] [val(meta), path(dir)] + + main: + + ch_rename_chrs_ref = channel.value([[]]) + + TABIX_INPUT(ch_vcf) //Subworkflow needs tabix index + + ch_vcf_tbi = ch_vcf + .join(TABIX_INPUT.out.index, failOnMismatch:true, failOnDuplicate:true) + + // Create files and rename chromosomes if reference is GRCh38 + if (val_genome.equals('GRCh38')) { + + // Create txt files for changing of chromosomes + GAWK_REF_TO_CADD_CHRNAMES ( ch_fai , [], false ) + + GAWK_REF_TO_CADD_CHRNAMES.out.output.map { _meta, txt -> txt } + .set {ch_chrnames_cadd} + + GAWK_CADD_TO_REF_CHRNAMES ( ch_fai , [], false ) + + GAWK_CADD_TO_REF_CHRNAMES.out.output.map { _meta, txt -> txt } + .set { ch_rename_chrs_ref } + + rename_chrnames_in = ch_vcf_tbi + .combine(ch_chrnames_cadd) + .map { meta, vcf, tbi, txt -> tuple( meta, vcf, tbi, [], [], [], [], txt ) } + + // Change chr names to CADD compatible names + BCFTOOLS_RENAME_CHR_CADD( rename_chrnames_in ) + + ch_vcf_tbi = BCFTOOLS_RENAME_CHR_CADD.out.vcf + .map {meta, vcf -> tuple( meta , vcf, [] )} + } + + // Filter to extract indels + BCFTOOLS_VIEW(ch_vcf_tbi, [], [], []) + + // CADD + CADD(BCFTOOLS_VIEW.out.vcf, ch_cadd_resources, ch_cadd_prescored_indels) + + // Index CADD + TABIX_CADD(CADD.out.tsv) + + // Change chr names back to desired naming and annotate original vcf with cadd results + ch_annotate = ch_vcf_tbi + .join(CADD.out.tsv, failOnMismatch: true, failOnDuplicate: true) + .join(TABIX_CADD.out.index, failOnMismatch: true, failOnDuplicate: true) + .combine( ch_header ) + .combine( ch_rename_chrs_ref ) + .map { meta, vcf, tbi, annotations, annotations_index, header, txt -> tuple( meta, vcf, tbi, annotations, annotations_index, [], header, txt ) } // tbi is [] when the VCF was renamed for GRCh38, otherwise the TABIX_INPUT index 
+ + + BCFTOOLS_ANNOTATE_INDELS( ch_annotate ) + + emit: + vcf = BCFTOOLS_ANNOTATE_INDELS.out.vcf // channel: [val(meta), path(vcf)] + tbi = BCFTOOLS_ANNOTATE_INDELS.out.tbi // channel: [val(meta), path(tbi)] +} diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test b/subworkflows/local/annotate_cadd/tests/main.nf.test new file mode 100644 index 0000000..2691e46 --- /dev/null +++ b/subworkflows/local/annotate_cadd/tests/main.nf.test @@ -0,0 +1,76 @@ +nextflow_workflow { + + name "Test Subworkflow ANNOTATE_CADD" + script "../main.nf" + workflow "ANNOTATE_CADD" + tag "subworkflows" + tag "annotate_cadd" + config "./nextflow.config" + + test("ANNOTATE_CADD - GRCh37, stub") { + + options "-stub" + + when { + params { + genome = "GRCh37" + outdir = "$outputDir" + } + workflow { + """ + input[0] = channel.of([ + [id:'test'], + file(params.pipelines_testdata_base_path + 'testdata/tumor_normal/subject_a.tumor.purple.somatic.vcf.gz', checkIfExists: true) + ]) + input[1] = 'GRCh37' + input[2] = channel.fromPath(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true).map {it -> [[id:it.simpleName], it] }.collect() + input[3] = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() + input[4] = channel.from("\$PWD").map { dir -> [ [ id: 'cadd_resources' ], dir ] } + input[5] = channel.from("\$PWD").map { dir -> [ [ id: 'cadd_prescored_indels' ], dir ] } + """ + } + } + + then { + + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + } + + test("ANNOTATE_CADD - GRCh38, stub") { + // TODO update test data to GRCh38 + options "-stub" + + when { + params { + genome = "GRCh38" + outdir = "$outputDir" + } + workflow { + """ + input[0] = channel.of([ + [id:'test'], + file(params.pipelines_testdata_base_path + 'testdata/tumor_normal/subject_a.tumor.purple.somatic.vcf.gz', checkIfExists: true) + ]) + input[1] = 'GRCh38' + input[2] = 
channel.fromPath(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true).map {it -> [[id:it.simpleName], it] }.collect() + input[3] = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() + input[4] = channel.from("\$PWD").map { dir -> [ [ id: 'cadd_resources' ], dir ] } + input[5] = channel.from("\$PWD").map { dir -> [ [ id: 'cadd_prescored_indels' ], dir ] } + """ + } + } + + then { + + assertAll( + { assert workflow.success }, + { assert snapshot(workflow.out).match() } + ) + } + } + +} diff --git a/subworkflows/local/annotate_cadd/tests/main.nf.test.snap b/subworkflows/local/annotate_cadd/tests/main.nf.test.snap new file mode 100644 index 0000000..c9dcc48 --- /dev/null +++ b/subworkflows/local/annotate_cadd/tests/main.nf.test.snap @@ -0,0 +1,88 @@ +{ + "ANNOTATE_CADD - GRCh37, stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "subject_a_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "subject_a_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tbi": [ + [ + { + "id": "test" + }, + "subject_a_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "vcf": [ + [ + { + "id": "test" + }, + "subject_a_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + } + ], + "meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-21T14:50:16.980447" + }, + "ANNOTATE_CADD - GRCh38, stub": { + "content": [ + { + "0": [ + [ + { + "id": "test" + }, + "subject_a_renamed_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ], + "1": [ + [ + { + "id": "test" + }, + "subject_a_renamed_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "tbi": [ + [ + { + "id": "test" + }, + "subject_a_renamed_ann.vcf.gz.tbi:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + "vcf": [ + [ + { + "id": "test" + }, + "subject_a_renamed_ann.vcf.gz:md5,68b329da9893e34099c7d8ad5cb9c940" + ] + ] + } + ], + 
"meta": { + "nf-test": "0.9.3", + "nextflow": "25.10.4" + }, + "timestamp": "2026-04-21T14:50:26.527943" + } +} \ No newline at end of file diff --git a/subworkflows/local/annotate_cadd/tests/nextflow.config b/subworkflows/local/annotate_cadd/tests/nextflow.config new file mode 100644 index 0000000..dd99056 --- /dev/null +++ b/subworkflows/local/annotate_cadd/tests/nextflow.config @@ -0,0 +1,40 @@ +process { + + withName: 'BCFTOOLS_RENAME_CHR_CADD' { + ext.args = { "--output-type z" } + ext.prefix = { "${input.simpleName}_renamed" } + } + + withName: 'BCFTOOLS_VIEW' { + ext.args = { "--output-type z --types indels,other" } + ext.prefix = { "${vcf.simpleName}_indels" } + } + + withName: 'CADD' { + container = "nf-core/ubuntu:22.04" //Using an basic container because v1.7.3 is too big for CI. + ext.args = { "-g ${params.genome}" } + ext.prefix = { "${vcf.simpleName}_cadd" } + } + + withName: 'TABIX_CADD' { + ext.args = { "--force --sequence 1 --begin 2 --end 2" } + } + + withName: 'GAWK_CADD_TO_REF_CHRNAMES' { + ext.args2 = '\'{original=$1; sub("chr","",$1); print $1, original}\'' + ext.prefix = "cadd_to_reference" + ext.suffix = "txt" + } + + withName: 'GAWK_REF_TO_CADD_CHRNAMES' { + ext.args2 = '\'{original=$1; sub("chr","",$1); print original, $1}\'' + ext.prefix = "reference_to_cadd" + ext.suffix = "txt" + } + + withName: 'BCFTOOLS_ANNOTATE_INDELS' { + ext.args = { "--columns Chrom,Pos,Ref,Alt,-,CADD --output-type z --write-index=tbi" } + ext.prefix = { "${input.simpleName}_ann" } + } + +} diff --git a/subworkflows/local/process_snvs/main.nf b/subworkflows/local/process_snvs/main.nf index 2fb8613..442fef5 100644 --- a/subworkflows/local/process_snvs/main.nf +++ b/subworkflows/local/process_snvs/main.nf @@ -12,6 +12,7 @@ include { ENSEMBLVEP_VEP } from '../../../modules/nf-co include { VCFANNO } from '../../../modules/nf-core/vcfanno/main' include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_RESEARCH } from '../../../modules/nf-core/bcftools/view/main' include { BCFTOOLS_VIEW 
as BCFTOOLS_VIEW_CLINICAL } from '../../../modules/nf-core/bcftools/view/main' +include { ANNOTATE_CADD } from '../../../subworkflows/local/annotate_cadd' /* ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -22,18 +23,23 @@ include { BCFTOOLS_VIEW as BCFTOOLS_VIEW_CLINICAL } from '../../../modules/nf-co workflow PROCESS_SNVS { take: - ch_genome_fasta // channel: [optional] [val(meta), path(fasta)] - ch_snv_vcf // channel: [optional] [val(meta), path(vcf)] - ch_snv_vcf_tbi // channel: [optional] [val(meta), path(vcf.tbi)] - ch_vcfanno_extra // channel: [optional] [path(extra_file1), path(extra_file2), ...] - ch_vcfanno_lua // channel: [optional] [path(lua_file)] - ch_vcfanno_resources // channel: [optional] [path(resource_file1), path(resource_file2), ...] - ch_vcfanno_toml // channel: [optional] [path(toml_file)] - ch_vep_cache // channel: [optional] [path(vep_cache)] - ch_vep_extra_files // channel: [optional] [path(plugin_file1), path(plugin_file2), ...] - val_genome // string: [optional] genome assembly (e.g. "GRCh38") - val_species // string: [optional] species (e.g. "homo_sapiens") - val_vep_cache_version // string: [optional] version of vep cache to use (e.g. "107") + ch_genome_fasta // channel: [optional] [val(meta), path(fasta)] + ch_genome_fai // channel: [optional] [val(meta), path(fai)] + ch_cadd_header // channel: [optional] [path(header_file)] + ch_cadd_prescored_indels // channel: [optional] [val(meta), path(dir)] + ch_cadd_resources // channel: [optional] [val(meta), path(dir)] + ch_snv_vcf // channel: [optional] [val(meta), path(vcf)] + ch_snv_vcf_tbi // channel: [optional] [val(meta), path(vcf.tbi)] + ch_vcfanno_extra // channel: [optional] [path(extra_file1), path(extra_file2), ...] + ch_vcfanno_lua // channel: [optional] [path(lua_file)] + ch_vcfanno_resources // channel: [optional] [path(resource_file1), path(resource_file2), ...] 
+ ch_vcfanno_toml // channel: [optional] [path(toml_file)] + ch_vep_cache // channel: [optional] [path(vep_cache)] + ch_vep_extra_files // channel: [optional] [path(plugin_file1), path(plugin_file2), ...] + val_cadd_resources // string: [optional] path to CADD resources directory + val_genome // string: [optional] genome assembly (e.g. "GRCh38") + val_species // string: [optional] species (e.g. "homo_sapiens") + val_vep_cache_version // string: [optional] version of vep cache to use (e.g. "107") main: // Annotate with custom databases @@ -64,6 +70,25 @@ workflow PROCESS_SNVS { } .set { ch_vep_snv } + // ANNOTATE WITH CADD - currently depends on val_cadd_resources - could be improved? + if (val_cadd_resources) { + + ch_cadd_in = BCFTOOLS_VIEW_RESEARCH.out.vcf + + ANNOTATE_CADD ( + ch_cadd_in, + val_genome, + ch_genome_fai, + ch_cadd_header, + ch_cadd_resources, + ch_cadd_prescored_indels + ) + + ANNOTATE_CADD.out.vcf + .join(ANNOTATE_CADD.out.tbi) + .set { ch_vep_snv } + } + ENSEMBLVEP_VEP ( ch_vep_snv, val_genome, diff --git a/subworkflows/local/process_snvs/tests/main.nf.test b/subworkflows/local/process_snvs/tests/main.nf.test index 9f2ead2..403e0ee 100644 --- a/subworkflows/local/process_snvs/tests/main.nf.test +++ b/subworkflows/local/process_snvs/tests/main.nf.test @@ -33,27 +33,31 @@ nextflow_workflow { [id:'reference'], file(params.pipelines_testdata_base_path + 'reference/reference.fasta', checkIfExists: true) ]) - input[1] = channel.of([ + input[1] = channel.fromPath(params.pipelines_testdata_base_path + 'reference/reference.fasta.fai', checkIfExists: true).map {it -> [[id:it.simpleName], it] }.collect() + input[2] = channel.fromPath("$projectDir/assets/cadd_to_vcf_header.txt", checkIfExists: true).collect() + input[3] = null + input[4] = null + input[5] = channel.of([ [id:'SNV'], file(params.pipelines_testdata_base_path + 'testdata/tumor_normal/subject_a.tumor.purple.somatic.vcf.gz', checkIfExists: true) ]) - input[2] = channel.of([ + input[6] = 
channel.of([ [id:'SNV'], file(params.pipelines_testdata_base_path + 'testdata/tumor_normal/subject_a.tumor.purple.sv.vcf.gz', checkIfExists: true) ]) - input[3] = [] - input[4] = channel.of([ + input[7] = [] + input[8] = channel.of([ file(params.pipelines_testdata_base_path + 'reference/vcfanno_functions.lua', checkIfExists: true) ]) - input[5] = channel.of([ + input[9] = channel.of([ file(params.pipelines_testdata_base_path + 'reference/grch37_gnomad_-r2.1.1-.vcf.gz', checkIfExists: true), file(params.pipelines_testdata_base_path + 'reference/grch37_gnomad_-r2.1.1-.vcf.gz.tbi', checkIfExists: true) ]) - input[6] = channel.of([ + input[10] = channel.of([ file(params.pipelines_testdata_base_path + 'reference/vcfanno.toml', checkIfExists: true) ]) - input[7] = UNTAR_VEP_CACHE.out.untar.map{ _meta, files -> [files]}.collect() - input[8] = channel.of([ + input[11] = UNTAR_VEP_CACHE.out.untar.map{ _meta, files -> [files]}.collect() + input[12] = channel.of([ file(params.pipelines_testdata_base_path + 'reference/LoFtool_scores.txt', checkIfExists: true), file(params.pipelines_testdata_base_path + 'reference/spliceai_21_scores_raw_indel_-v1.3-.vcf.gz', checkIfExists: true), file(params.pipelines_testdata_base_path + 'reference/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz', checkIfExists: true), @@ -61,9 +65,10 @@ nextflow_workflow { file(params.pipelines_testdata_base_path + 'reference/spliceai_21_scores_raw_indel_-v1.3-.vcf.gz.tbi', checkIfExists: true), file(params.pipelines_testdata_base_path + 'reference/spliceai_21_scores_raw_snv_-v1.3-.vcf.gz.tbi', checkIfExists: true) ]) - input[9] = 'GRCh37' - input[10] = 'homo_sapiens' - input[11] = '107' + input[13] = null + input[14] = 'GRCh37' + input[15] = 'homo_sapiens' + input[16] = '107' """ } } diff --git a/subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf b/subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf index a492c2e..b276648 100644 --- a/subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf 
+++ b/subworkflows/local/utils_nfcore_oncorefiner_pipeline/main.nf @@ -180,6 +180,7 @@ def toolCitationText() { def citations_list = [] def vcfanno = "vcfanno (Pedersen et al. 2016)" def bcftools_view = "bcftools (Danecek et al. 2021)" + def cadd = "CADD (Rentzsch et al. 2019)" def ensemblvep_vep = "Ensembl VEP (McLaren et al. 2016)" def svdb = "svdb" def multiqc = "MultiQC (Ewels et al. 2016)" @@ -190,6 +191,9 @@ def toolCitationText() { vcfanno + bcftools_view + ensemblvep_vep + if (params.cadd_resources) { + citations_list = citations_list + cadd + } } if (params.sv_vcf) { @@ -218,6 +222,7 @@ def toolBibliographyText() { def bibliography_list = [] def vcfanno = "
  • Pedersen BS, Layer RM, Quinlan AR. Vcfanno: fast, flexible annotation of genetic variants. Genome Biol. 2016 Jun 1;17(1):118. doi: 10.1186/s13059-016-0973-5. PMID: 27250555; PMCID: PMC4888505.
  • " def bcftools_view = "
  • Danecek P, Bonfield JK, Liddle J, Marshall J, Ohan V, Pollard MO, Whitwham A, Keane T, McCarthy SA, Davies RM, Li H. Twelve years of SAMtools and BCFtools. Gigascience. 2021 Feb 16;10(2):giab008. doi: 10.1093/gigascience/giab008. PMID: 33590845; PMCID: PMC7898596.
  • " + def cadd = "
  • Rentzsch P, Witten D, Cooper GM, Shendure J, Kircher M. CADD: predicting the deleteriousness of variants throughout the human genome. Nucleic Acids Res. 2019 Jan 8;47(D1):D886-D894. doi: 10.1093/nar/gky1016. PMID: 30371827; PMCID: PMC6323892.
  • " def ensemblvep_vep = "
  • McLaren W, Gil L, Hunt SE, Riat HS, Ritchie GR, Thormann A, Flicek P, Cunningham F. The Ensembl Variant Effect Predictor. Genome Biol. 2016 Jun 6;17(1):122. doi: 10.1186/s13059-016-0974-4. PMID: 27268795; PMCID: PMC4893825.
  • " def svdb = "
  • svdb. https://github.com/J35P312/svdb.
  • " def multiqc = "
  • Ewels, P., Magnusson, M., Lundin, S., & Käller, M. (2016). MultiQC: summarize analysis results for multiple tools and samples in a single report. Bioinformatics , 32(19), 3047–3048. doi: /10.1093/bioinformatics/btw354
  • " @@ -228,6 +233,9 @@ def toolBibliographyText() { vcfanno + bcftools_view + ensemblvep_vep + if (params.cadd_resources) { + bibliography_list = bibliography_list + cadd + } } if (params.sv_vcf) { diff --git a/workflows/oncorefiner.nf b/workflows/oncorefiner.nf index 51cd6ab..fe4cd97 100644 --- a/workflows/oncorefiner.nf +++ b/workflows/oncorefiner.nf @@ -45,24 +45,29 @@ include { PROCESS_SNVS } from '../subworkflows/local/process_snvs/main.nf' workflow ONCOREFINER { take: - ch_samplesheet // channel: [mandatory] samplesheet read in from --input - ch_bam_bai_normal // channel: [optional] [val(meta), path(bam), path(bai)] - ch_bam_bai_tumor // channel: [mandatory] [val(meta), path(bam), path(bai)] - ch_genome_fasta // channel: [optional] [val(meta), path(fasta)] - ch_snv_vcf // channel: [optional] [val(meta), path(vcf)] - ch_snv_vcf_tbi // channel: [optional] [val(meta), path(vcf.tbi)] - ch_sv_dbs // channel: [optional] [path(csv)] - ch_sv_vcf // channel: [optional] [val(meta), path(vcf)] - ch_sv_vcf_tbi // channel: [optional] [val(meta), path(vcf.tbi)] - ch_vcfanno_extra // channel: [optional] [path(extra_file1), path(extra_file2), ...] - ch_vcfanno_lua // channel: [optional] [path(lua_file)] - ch_vcfanno_resources // channel: [optional] [path(resource_file1), path(resource_file2), ...] - ch_vcfanno_toml // channel: [optional] [path(toml_file)] - ch_vep_cache // channel: [optional] [vep_cache_files] - ch_vep_extra_files // channel: [optional] [path(plugin_file1), path(plugin_file2), ...] - val_genome // string: [optional] genome assembly (e.g. "GRCh38") - val_species // string: [optional] species (e.g. "homo_sapiens") - val_vep_cache_version // string: [optional] version of vep cache to use (e.g. 
"107") + ch_samplesheet // channel: [mandatory] samplesheet read in from --input + ch_bam_bai_normal // channel: [optional] [val(meta), path(bam), path(bai)] + ch_bam_bai_tumor // channel: [mandatory] [val(meta), path(bam), path(bai)] + ch_cadd_header // channel: [mandatory] [path(txt)] + ch_cadd_prescored_indels // channel: [optional] [val(meta), path(dir)] + ch_cadd_resources // channel: [optional] [val(meta), path(dir)] + ch_genome_fasta // channel: [optional] [val(meta), path(fasta)] + ch_genome_fai // channel: [optional] [val(meta), path(fai)] + ch_snv_vcf // channel: [optional] [val(meta), path(vcf)] + ch_snv_vcf_tbi // channel: [optional] [val(meta), path(vcf.tbi)] + ch_sv_dbs // channel: [optional] [path(csv)] + ch_sv_vcf // channel: [optional] [val(meta), path(vcf)] + ch_sv_vcf_tbi // channel: [optional] [val(meta), path(vcf.tbi)] + ch_vcfanno_extra // channel: [optional] [path(extra_file1), path(extra_file2), ...] + ch_vcfanno_lua // channel: [optional] [path(lua_file)] + ch_vcfanno_resources // channel: [optional] [path(resource_file1), path(resource_file2), ...] + ch_vcfanno_toml // channel: [optional] [path(toml_file)] + ch_vep_cache // channel: [optional] [vep_cache_files] + ch_vep_extra_files // channel: [optional] [path(plugin_file1), path(plugin_file2), ...] + val_cadd_resources // string: [optional] path to CADD resources directory + val_genome // string: [optional] genome assembly (e.g. "GRCh38") + val_species // string: [optional] species (e.g. "homo_sapiens") + val_vep_cache_version // string: [optional] version of vep cache to use (e.g. "107") main: @@ -73,6 +78,10 @@ workflow ONCOREFINER { // Process SNV VCF files PROCESS_SNVS ( ch_genome_fasta, + ch_genome_fai, + ch_cadd_header, + ch_cadd_prescored_indels, + ch_cadd_resources, ch_snv_vcf, ch_snv_vcf_tbi, ch_vcfanno_extra, @@ -81,6 +90,7 @@ workflow ONCOREFINER { ch_vcfanno_toml, ch_vep_cache, ch_vep_extra_files, + val_cadd_resources, val_genome, val_species, val_vep_cache_version