diff --git a/CHANGELOG.md b/CHANGELOG.md index 34ea1d25..cbba6938 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - [#181](https://github.com/nf-core/phaseimpute/pull/181) - Add nf-co2footprint plugin to the config file. - [#184](https://github.com/nf-core/phaseimpute/pull/184) - Add support `.csi` index for `.bam` files. - [#188](https://github.com/nf-core/phaseimpute/pull/188) - Add documentation for all subworkflows. +- [#204](https://github.com/nf-core/phaseimpute/pull/204) - Add MINIMAC4 support for genotype imputation. ### `Changed` @@ -44,6 +45,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 | `r-stitch` | 1.6.10 | 1.7.3 | | `shapeit5` | 1.0.0 | 5.1.1 | | `vcflib` | 1.0.3 | 1.0.14 | +| `minimac4` | | 4.1.6 | + +### `Contributors` + +[Gaspard Ichas](https://github.com/gichas) ## v1.0.0 - Black Labrador [2024-12-09] diff --git a/README.md b/README.md index 27d1137b..828fa5e3 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ The whole pipeline consists of five main steps, each of which can be run separat - **Position Extraction** for targeted imputation sites. 4. **Imputation (`--impute`)**: This is the primary step, where genotypes in the target dataset are imputed using the prepared reference panel. The main steps are: - - **Imputation** of the target dataset using tools like [Glimpse1](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html), [Glimpse2](https://odelaneau.github.io/GLIMPSE/), [Stitch](https://github.com/rwdavies/stitch), or [Quilt](https://github.com/rwdavies/QUILT). + - **Imputation** of the target dataset using tools like [Glimpse1](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html), [Glimpse2](https://odelaneau.github.io/GLIMPSE/), [Stitch](https://github.com/rwdavies/stitch), [Quilt](https://github.com/rwdavies/QUILT) or [Minimac4](https://github.com/statgen/Minimac4). 
- **Ligation** of imputed chunks to produce a final VCF file per sample, with all chromosomes unified. 5. **Validation (`--validate`)**: Assesses imputation accuracy by comparing the imputed dataset to a truth dataset. This step leverages the [Glimpse2](https://odelaneau.github.io/GLIMPSE/) concordance process to summarize differences between two VCF files. diff --git a/conf/steps/imputation_minimac4.config b/conf/steps/imputation_minimac4.config new file mode 100644 index 00000000..f87e9b11 --- /dev/null +++ b/conf/steps/imputation_minimac4.config @@ -0,0 +1,50 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Config file for defining DSL2 per module options and publishing paths +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Available keys to override module options: + ext.args = Additional arguments appended to command in module. + ext.args2 = Second set of arguments appended to command in module (multi-tool modules). + ext.args3 = Third set of arguments appended to command in module (multi-tool modules). + ext.prefix = File name prefix for output files. 
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +*/ + +process { + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_IMPUTE_MINIMAC4:.*' { + publishDir = [enabled: false] + tag = { "${meta.id} ${meta.chr}" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_IMPUTE_MINIMAC4:MINIMAC4_COMPRESSREF' { + ext.args = '' + ext.prefix = { "${meta.id}.${meta.chr}.minimac4" } + publishDir = [enabled: false] + tag = { "${meta.id} ${meta.chr}" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_IMPUTE_MINIMAC4:MINIMAC4_IMPUTE' { + ext.args = { "--output-format vcf.gz" } + ext.prefix = { "${meta.id}.${meta.chr}.minimac4" } + tag = { "${meta.id} ${meta.chr}" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:VCF_IMPUTE_MINIMAC4:BCFTOOLS_INDEX' { + ext.args = '' + publishDir = [enabled: false] + tag = { "${meta.id} ${meta.chr}" } + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_MINIMAC4:.*' { + publishDir = [ + path: { "${params.outdir}/imputation/minimac4/concat" }, + mode: params.publish_dir_mode, + saveAs: { filename -> filename.equals('versions.yml') ? null : filename } + ] + } + + withName: 'NFCORE_PHASEIMPUTE:PHASEIMPUTE:CONCAT_MINIMAC4:BCFTOOLS_CONCAT' { + ext.args = ["--ligate", "--output-type z", "--write-index=tbi"].join(' ') + ext.prefix = { "${meta.id}.minimac4" } + } +} diff --git a/conf/test_minimac4.config b/conf/test_minimac4.config new file mode 100644 index 00000000..5df2cb74 --- /dev/null +++ b/conf/test_minimac4.config @@ -0,0 +1,46 @@ +/* +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Nextflow config file for running minimal tests +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + Defines input files and everything required to run a fast and simple pipeline test. 
+ + Use as follows: + nextflow run nf-core/phaseimpute -profile test_minimac4,<docker/singularity> --outdir <OUTDIR> + +---------------------------------------------------------------------------------------- +*/ + +process { + resourceLimits = [ + cpus: 4, + memory: '4.GB', + time: '1.h' + ] +} + +params { + config_profile_name = 'Test profile' + config_profile_description = 'Minimal test dataset to check pipeline function with MINIMAC4' + + // Input data + input = "${projectDir}/tests/csv/sample_vcf.csv" + + // Genome references + fasta = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz" + panel = "${projectDir}/tests/csv/panel.csv" + + // Region file + input_region = "${projectDir}/tests/csv/region.csv" + + // Map file + map = "${projectDir}/tests/csv/map.csv" + + // Position file + posfile = "${projectDir}/tests/csv/posfile.csv" + + // Pipeline steps + steps = "impute" + + // Impute tools + tools = "minimac4" +} diff --git a/conf/test_validate.config b/conf/test_validate.config index 29d703da..4e317e3f 100644 --- a/conf/test_validate.config +++ b/conf/test_validate.config @@ -30,7 +30,7 @@ params { // Genome references fasta = params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38.s.fa.gz" posfile = "${projectDir}/tests/csv/posfile_vcf_index.csv" - map = "${projectDir}/tests/csv/map.csv" + map = "${projectDir}/tests/csv/map_glimpse.csv" // Pipeline steps steps = "validate" diff --git a/docs/usage.md b/docs/usage.md index da2fe29a..f746602e 100644 --- a/docs/usage.md +++ b/docs/usage.md @@ -294,7 +294,7 @@ For starting from the imputation steps, the required flags are: - `--steps impute` - `--input input.csv`: The samplesheet containing the input sample files in `bam`, `cram` or `vcf`, `bcf` format. - `--genome` or `--fasta`: The reference genome of the samples. -- `--tools [glimpse1, quilt, stitch]`: A selection of one or more of the available imputation tools. Each imputation tool has their own set of specific flags and input files. 
These required files are produced by `--steps panelprep` and used as input in: +- `--tools [glimpse1, quilt, stitch, minimac4]`: A selection of one or more of the available imputation tools. Each imputation tool has their own set of specific flags and input files. These required files are produced by `--steps panelprep` and used as input in: - `--chunks chunks.csv`: A samplesheet containing chunks per chromosome. These are produced by `--steps panelprep` using `GLIMPSE1`. - `--posfile posfile.csv`: A samplesheet containing a `.legend.gz` file with the list of positions to genotype per chromosome. These are required by tools ( QUILT/STITCH/GLIMPSE1). It can also contain the `hap.gz` files (required by QUILT). The posfile can be generated with `--steps panelprep`. - `--panel panel.csv`: A samplesheet containing the post-processed reference panel VCF (required by GLIMPSE1, GLIMPSE2). These files can be obtained with `--steps panelprep`. @@ -307,6 +307,7 @@ For starting from the imputation steps, the required flags are: | `GLIMPSE2` | ✅ | ✅ ¹ | ✅ | ✅ | ✅ | ❌ | | `QUILT` | ✅ | ✅ ² | ✅ | ❌ | ✅ | ✅ ⁴ | | `STITCH` | ✅ | ✅ ² | ✅ | ❌ | ❌ | ✅ ³ | +| `MINIMAC4` | ✅ | ✅ ¹ | ✅ | ✅ | ❌ | ❌ | > ¹ Alignment files as well as variant calling format (i.e. BAM, CRAM, VCF or BCF) > ² Alignment files only (i.e. BAM or CRAM) @@ -332,12 +333,12 @@ When the number of samples exceeds the batch size, the pipeline will split the s To summarize: -- If you have Variant Calling Format (VCF) files, join them into a single file and choose either GLIMPSE1 or GLIMPSE2. +- If you have Variant Calling Format (VCF) files, join them into a single file and choose either GLIMPSE1, GLIMPSE2 or MINIMAC4. - If you have alignment files (e.g., BAM or CRAM), all tools are available, and processing will occur in `batch_size`: - GLIMPSE1 and STITCH may induce batch effects, so all samples need to be imputed together. - GLIMPSE2 and QUILT can process samples in separate batches. 
-## Imputation tools `--steps impute --tools [glimpse1, glimpse2, quilt, stitch]` +## Imputation tools `--steps impute --tools [glimpse1, glimpse2, quilt, stitch, minimac4]` You can choose different software to perform the imputation. In the following sections, the typical commands for running the pipeline with each software are included. Multiple tools can be selected by separating them with a comma (eg. `--tools glimpse1,quilt`). @@ -477,6 +478,26 @@ nextflow run nf-core/phaseimpute \ Make sure the CSV file with the input panel is the output from `--step panelprep` or has been previously prepared. +### MINIMAC4 + +[MINIMAC4](https://github.com/statgen/Minimac4) is a low memory, computationally efficient implementation of the MaCH algorithm for genotype imputation. It is designed to work on phased haplotypes and can handle very large reference panels. + +```bash +nextflow run nf-core/phaseimpute \ + --input samplesheet.csv \ + --panel samplesheet_reference.csv \ + --steps impute \ + --tools minimac4 \ + --outdir results \ + --genome GRCh37 \ + -profile docker \ + --posfile posfile.csv +``` + +The CSV file provided in `--panel` must be prepared with `--steps panelprep` and must contain four columns [panel, chr, vcf, index]. + +MINIMAC4 works only with variant calling format files (VCF or BCF) as input. 
+ ## Start with validation `--steps validate` concordance_metro diff --git a/modules.json b/modules.json index 401ad857..6066f3fa 100644 --- a/modules.json +++ b/modules.json @@ -114,6 +114,16 @@ "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", "installed_by": ["modules"] }, + "minimac4/compressref": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, + "minimac4/impute": { + "branch": "master", + "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", + "installed_by": ["modules"] + }, "multiqc": { "branch": "master", "git_sha": "41dfa3f7c0ffabb96a6a813fe321c6d1cc5b6e46", diff --git a/modules/nf-core/minimac4/compressref/environment.yml b/modules/nf-core/minimac4/compressref/environment.yml new file mode 100644 index 00000000..81c63b8c --- /dev/null +++ b/modules/nf-core/minimac4/compressref/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - "bioconda::minimac4=4.1.6" diff --git a/modules/nf-core/minimac4/compressref/main.nf b/modules/nf-core/minimac4/compressref/main.nf new file mode 100644 index 00000000..da3c6b95 --- /dev/null +++ b/modules/nf-core/minimac4/compressref/main.nf @@ -0,0 +1,47 @@ +process MINIMAC4_COMPRESSREF { + tag "$meta.id" + label 'process_low' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? 
+ 'https://depot.galaxyproject.org/singularity/minimac4:4.1.6--hcb620b3_1': + 'biocontainers/minimac4:4.1.6--hcb620b3_1' }" + + input: + tuple val(meta), path(ref), path(ref_index) // Reference index is autodetected from reference file name + + output: + tuple val(meta), path("*.msav"), emit: msav + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' // extra minimac4 options from task.ext.args; empty string when unset + def prefix = task.ext.prefix ?: "${meta.id}" // output basename; falls back to meta.id + """ + minimac4 \\ + --compress-reference $ref \\ + $args \\ + --threads $task.cpus \\ + -o ${prefix}.msav + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimac4: \$(minimac4 --version |& sed '1!d ; s/minimac v//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' // not used by the stub command below + def prefix = task.ext.prefix ?: "${meta.id}" // same basename rule as in script: + """ + touch ${prefix}.msav + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimac4: \$(minimac4 --version |& sed '1!d ; s/minimac v//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimac4/compressref/meta.yml b/modules/nf-core/minimac4/compressref/meta.yml new file mode 100644 index 00000000..0b6a5fd6 --- /dev/null +++ b/modules/nf-core/minimac4/compressref/meta.yml @@ -0,0 +1,62 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "minimac4_compressref" +description: Compression of a reference panel for genotype imputation to `.msav` format +keywords: + - haplotypes + - reference compression + - genomics +tools: + - "minimac4": + description: "Computationally efficient genotype imputation" + homepage: "https://github.com/statgen/Minimac4" + documentation: "https://github.com/statgen/Minimac4" + tool_dev_url: "https://github.com/statgen/Minimac4" + doi: "10.1038/ng.3656" + licence: ["GPL v3-or-later"] + identifier: biotools:minimac4 + +input: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - ref: + type: file + description: Variant reference panel file + pattern: "*.{vcf,vcf.gz,bcf}" + ontologies: + - edam: "http://edamontology.org/format_3016" + - edam: "http://edamontology.org/format_3020" + - ref_index: + type: file + description: Index file for the reference panel + pattern: "*.{tbi,csi}" + ontologies: + - edam: "http://edamontology.org/format_3700" + +output: + msav: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - "*.msav": + type: file + description: Multi-sample variant compressed file + pattern: "*.{msav}" + + ontologies: [] + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@LouisLeNezet" +maintainers: + - "@LouisLeNezet" diff --git a/modules/nf-core/minimac4/compressref/tests/main.nf.test b/modules/nf-core/minimac4/compressref/tests/main.nf.test new file mode 100644 index 00000000..19ad8ab6 --- /dev/null +++ b/modules/nf-core/minimac4/compressref/tests/main.nf.test @@ -0,0 +1,65 @@ +nextflow_process { + + name "Test Process MINIMAC4_COMPRESSREF" + script "../main.nf" + process "MINIMAC4_COMPRESSREF" + + tag "modules" + tag "modules_nfcore" + tag "minimac4" + tag "minimac4/compressref" + + test("homo sapiens - vcf") { + when { + process { + """ + input[0] = Channel.of([ + [ id:'input', single_end:false ], // meta map + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.vcf.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.vcf.gz.csi", checkIfExists: true), + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.msav.collect{ file(it[1]).name }, + ["versions.yml": path(process.out.versions.get(0)).yaml] + ).match() } + ) + } + + 
} + + test("homo sapiens - vcf - stub") { + + options "-stub" + + when { + process { + """ + input[0] = Channel.of([ + [ id:'input', single_end:false ], // meta map + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.vcf.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.vcf.gz.csi", checkIfExists: true), + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.msav, + ["versions.yml": path(process.out.versions.get(0)).yaml] + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/minimac4/compressref/tests/main.nf.test.snap b/modules/nf-core/minimac4/compressref/tests/main.nf.test.snap new file mode 100644 index 00000000..1581522d --- /dev/null +++ b/modules/nf-core/minimac4/compressref/tests/main.nf.test.snap @@ -0,0 +1,46 @@ +{ + "homo sapiens - vcf": { + "content": [ + [ + "input.msav" + ], + { + "versions.yml": { + "MINIMAC4_COMPRESSREF": { + "minimac4": "4.1.6" + } + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.4" + }, + "timestamp": "2025-03-07T20:17:00.324096449" + }, + "homo sapiens - vcf - stub": { + "content": [ + [ + [ + { + "id": "input", + "single_end": false + }, + "input.msav:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + { + "versions.yml": { + "MINIMAC4_COMPRESSREF": { + "minimac4": "4.1.6" + } + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.4" + }, + "timestamp": "2025-03-07T20:09:48.459570451" + } +} \ No newline at end of file diff --git a/modules/nf-core/minimac4/impute/environment.yml b/modules/nf-core/minimac4/impute/environment.yml new file mode 100644 index 00000000..81c63b8c --- /dev/null +++ b/modules/nf-core/minimac4/impute/environment.yml @@ -0,0 +1,7 @@ +--- +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/environment-schema.json +channels: + - conda-forge + - bioconda +dependencies: + - 
"bioconda::minimac4=4.1.6" diff --git a/modules/nf-core/minimac4/impute/main.nf b/modules/nf-core/minimac4/impute/main.nf new file mode 100644 index 00000000..1881c529 --- /dev/null +++ b/modules/nf-core/minimac4/impute/main.nf @@ -0,0 +1,67 @@ +process MINIMAC4_IMPUTE { + tag "$meta.id" + label 'process_medium' + + conda "${moduleDir}/environment.yml" + container "${ workflow.containerEngine == 'singularity' && !task.ext.singularity_pull_docker_container ? + 'https://depot.galaxyproject.org/singularity/minimac4:4.1.6--hcb620b3_1': + 'biocontainers/minimac4:4.1.6--hcb620b3_1' }" + + input: + tuple val(meta), path(target_vcf), path(target_index), path(ref_msav), path(sites_vcf), path(sites_index), path(map) + + output: + tuple val(meta), path("*.{bcf,sav,vcf.gz,vcf,ubcf,usav}"), emit: vcf + path "versions.yml" , emit: versions + + when: + task.ext.when == null || task.ext.when + + script: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-format bcf") || args.contains("-O bcf") ? "bcf" : + args.contains("--output-format sav") || args.contains("-O sav") ? "sav" : + args.contains("--output-format vcf.gz") || args.contains("-O vcf.gz") ? "vcf.gz" : + args.contains("--output-format vcf") || args.contains("-O vcf") ? "vcf" : + args.contains("--output-format ubcf") || args.contains("-O ubcf") ? "ubcf" : + args.contains("--output-format usav") || args.contains("-O usav") ? "usav" : + "vcf.gz" + def sites_cmd = sites_vcf ? "--sites $sites_vcf" : "" + def map_cmd = map ? 
"--map $map" : "" + """ + minimac4 \\ + $ref_msav \\ + $target_vcf \\ + $args \\ + $sites_cmd \\ + $map_cmd \\ + --threads $task.cpus \\ + -o ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimac4: \$(minimac4 --version |& sed '1!d ; s/minimac v//') + END_VERSIONS + """ + + stub: + def args = task.ext.args ?: '' + def prefix = task.ext.prefix ?: "${meta.id}" + def extension = args.contains("--output-format bcf") || args.contains("-O bcf") ? "bcf" : + args.contains("--output-format sav") || args.contains("-O sav") ? "sav" : + args.contains("--output-format vcf.gz") || args.contains("-O vcf.gz") ? "vcf.gz" : + args.contains("--output-format vcf") || args.contains("-O vcf") ? "vcf" : + args.contains("--output-format ubcf") || args.contains("-O ubcf") ? "ubcf" : + args.contains("--output-format usav") || args.contains("-O usav") ? "usav" : + "vcf.gz" + def create_cmd = extension.endsWith(".gz") ? "echo '' | gzip >" : "touch" + """ + ${create_cmd} ${prefix}.${extension} + + cat <<-END_VERSIONS > versions.yml + "${task.process}": + minimac4: \$(minimac4 --version |& sed '1!d ; s/minimac v//') + END_VERSIONS + """ +} diff --git a/modules/nf-core/minimac4/impute/meta.yml b/modules/nf-core/minimac4/impute/meta.yml new file mode 100644 index 00000000..6c7cbf7b --- /dev/null +++ b/modules/nf-core/minimac4/impute/meta.yml @@ -0,0 +1,88 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/modules/meta-schema.json +name: "minimac4_impute" +description: Imputation of genotypes using a reference panel +keywords: + - impute + - haploype + - genomics +tools: + - "minimac4": + description: "Computationally efficient genotype imputation" + homepage: "https://github.com/statgen/Minimac4" + documentation: "https://github.com/statgen/Minimac4" + tool_dev_url: "https://github.com/statgen/Minimac4" + doi: "10.1038/ng.3656" + licence: ["GPL v3-or-later"] + identifier: biotools:minimac4 + +input: + - - meta: + 
type: map + description: | + Groovy Map containing sample information + e.g. `[ id:'sample1', single_end:false ]` + - target_vcf: + type: file + description: Target VCF/BCF file + pattern: "*.{vcf,bcf,vcf.gz}" + ontologies: + - edam: "http://edamontology.org/format_3016" + - edam: "http://edamontology.org/format_3020" + - target_index: + type: file + description: Target VCF/BCF file index + pattern: "*.{csi,tbi}" + ontologies: + - edam: "http://edamontology.org/format_3700" + - ref_msav: + type: file + description: Reference compressed MSAV file obtained through `minimac4 --compress-reference` + pattern: "*.{msav}" + ontologies: [] + - sites_vcf: + type: file + description: Sites VCF/BCF file containing the sites to impute + pattern: "*.{vcf,bcf,vcf.gz}" + ontologies: + - edam: "http://edamontology.org/format_3016" + - edam: "http://edamontology.org/format_3020" + - sites_index: + type: file + description: Sites VCF/BCF file index + pattern: "*.{csi,tbi}" + ontologies: + - edam: "http://edamontology.org/format_3700" + - map: + type: file + description: Genetic map file + pattern: "*.map" + ontologies: + - edam: "http://edamontology.org/data_1278" + +output: + vcf: + - - meta: + type: map + description: | + Groovy Map containing sample information + e.g. 
`[ id:'sample1', single_end:false ]` + - "*.{bcf,sav,vcf.gz,vcf,ubcf,usav}": + type: file + description: Imputed variants file + pattern: "*.{bcf,sav,vcf.gz,vcf,ubcf,usav}" + ontologies: + - edam: "http://edamontology.org/format_3016" + - edam: "http://edamontology.org/format_3020" + + versions: + - versions.yml: + type: file + description: File containing software versions + pattern: "versions.yml" + + ontologies: + - edam: http://edamontology.org/format_3750 # YAML +authors: + - "@LouisLeNezet" +maintainers: + - "@LouisLeNezet" diff --git a/modules/nf-core/minimac4/impute/tests/main.nf.test b/modules/nf-core/minimac4/impute/tests/main.nf.test new file mode 100644 index 00000000..ce02cfc1 --- /dev/null +++ b/modules/nf-core/minimac4/impute/tests/main.nf.test @@ -0,0 +1,133 @@ +nextflow_process { + + name "Test Process MINIMAC4_IMPUTE" + script "../main.nf" + process "MINIMAC4_IMPUTE" + + tag "modules" + tag "modules_nfcore" + tag "minimac4" + tag "minimac4/impute" + tag "minimac4/compressref" + + config "./nextflow.config" + + setup { + run("MINIMAC4_COMPRESSREF") { + script "../../compressref/main.nf" + process { + """ + input[0] = Channel.of([ + [ id:'input' ], // meta map + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.vcf.gz", checkIfExists: true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.vcf.gz.csi", checkIfExists: true), + ]) + """ + } + } + } + + test("homo sapiens - vcf, ref, sites, map") { + when { + params { + minimac4_args = "--chunk 1000000" + } + process { + """ + input[0] = Channel.of([ + [id: "NA12878", chr: "chr22"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878.chr22.1X.vcf.gz",checkIfExists:true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878.chr22.1X.vcf.gz.csi",checkIfExists:true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.sites.vcf.gz", 
checkIfExists:true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/popgen/1000GP.chr22.sites.vcf.gz.csi", checkIfExists:true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/genome/genome.GRCh38.chr22.map.gz", checkIfExists:true) + ]) + .combine(MINIMAC4_COMPRESSREF.out.msav) + .map{ + meta, target_vcf, target_index, sites_vcf, sites_index, map, metaRef, ref -> [ + meta, target_vcf, target_index, ref, sites_vcf, sites_index, map + ] + } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf.collect{ [file(it[1]).name, path(it[1]).vcf.summary, path(it[1]).vcf.variantsMD5] }, + ["versions.yml": path(process.out.versions.get(0)).yaml] + ).match() } + ) + } + + } + + test("homo sapiens - vcf, ref, [], []") { + when { + params { + minimac4_args = "--output-format vcf" + } + process { + """ + input[0] = Channel.of([ + [id: "NA12878", chr: "chr22"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878.chr22.1X.vcf.gz",checkIfExists:true), + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878.chr22.1X.vcf.gz.csi",checkIfExists:true) + ]) + .combine(MINIMAC4_COMPRESSREF.out.msav) + .map{ + meta, target_vcf, target_index, metaRef, ref -> [ + meta, target_vcf, target_index, ref, [], [], [] + ] + } + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf.collect{ [file(it[1]).name, path(it[1]).vcf.summary, path(it[1]).vcf.variantsMD5] }, + ["versions.yml": path(process.out.versions.get(0)).yaml] + ).match() } + ) + } + + } + + test("homo sapiens - vcf, ref, sites, map - stub") { + + options "-stub" + + when { + params { + minimac4_args = "-O usav" + } + process { + """ + input[0] = Channel.of([ + [id: "NA12878", chr: "chr22"], + file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878.chr22.1X.vcf.gz",checkIfExists:true), + 
file(params.modules_testdata_base_path + "genomics/homo_sapiens/illumina/vcf/NA12878.chr22.1X.vcf.gz.csi",checkIfExists:true), + [], [], [], [] + ]) + """ + } + } + + then { + assertAll( + { assert process.success }, + { assert snapshot( + process.out.vcf, + ["versions.yml": path(process.out.versions.get(0)).yaml] + ).match() } + ) + } + + } + +} diff --git a/modules/nf-core/minimac4/impute/tests/main.nf.test.snap b/modules/nf-core/minimac4/impute/tests/main.nf.test.snap new file mode 100644 index 00000000..790e07c0 --- /dev/null +++ b/modules/nf-core/minimac4/impute/tests/main.nf.test.snap @@ -0,0 +1,73 @@ +{ + "homo sapiens - vcf, ref, sites, map": { + "content": [ + [ + [ + "NA12878.vcf.gz", + "VcfFile [chromosomes=[chr22], sampleCount=1, variantCount=903, phased=true, phasedAutodetect=true]", + "ab57a5554b8c01ae0494318233d1b1dd" + ] + ], + { + "versions.yml": { + "MINIMAC4_IMPUTE": { + "minimac4": "4.1.6" + } + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.4" + }, + "timestamp": "2025-03-07T20:08:36.288449293" + }, + "homo sapiens - vcf, ref, [], []": { + "content": [ + [ + [ + "NA12878.vcf", + "VcfFile [chromosomes=[chr22], sampleCount=1, variantCount=903, phased=true, phasedAutodetect=true]", + "6f3379a8c1f75935a9932abf92b6393b" + ] + ], + { + "versions.yml": { + "MINIMAC4_IMPUTE": { + "minimac4": "4.1.6" + } + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.4" + }, + "timestamp": "2025-03-07T20:08:44.845881828" + }, + "homo sapiens - vcf, ref, sites, map - stub": { + "content": [ + [ + [ + { + "id": "NA12878", + "chr": "chr22" + }, + "NA12878.usav:md5,d41d8cd98f00b204e9800998ecf8427e" + ] + ], + { + "versions.yml": { + "MINIMAC4_IMPUTE": { + "minimac4": "4.1.6" + } + } + } + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "24.10.4" + }, + "timestamp": "2025-03-07T20:08:51.967898824" + } +} \ No newline at end of file diff --git a/modules/nf-core/minimac4/impute/tests/nextflow.config 
b/modules/nf-core/minimac4/impute/tests/nextflow.config new file mode 100644 index 00000000..79d5afd8 --- /dev/null +++ b/modules/nf-core/minimac4/impute/tests/nextflow.config @@ -0,0 +1,5 @@ +process { + withName: 'MINIMAC4_IMPUTE' { + ext.args = {"${params.minimac4_args}"} + } +} diff --git a/nextflow.config b/nextflow.config index d557e862..847b72be 100644 --- a/nextflow.config +++ b/nextflow.config @@ -122,6 +122,7 @@ includeConfig 'conf/steps/imputation_glimpse1.config' includeConfig 'conf/steps/imputation_quilt.config' includeConfig 'conf/steps/imputation_stitch.config' includeConfig 'conf/steps/imputation_glimpse2.config' +includeConfig 'conf/steps/imputation_minimac4.config' // validation step includeConfig 'conf/steps/validation.config' @@ -246,6 +247,7 @@ profiles { test_quilt { includeConfig 'conf/test_quilt.config' } test_stitch { includeConfig 'conf/test_stitch.config' } test_glimpse2 { includeConfig 'conf/test_glimpse2.config' } + test_minimac4 { includeConfig 'conf/test_minimac4.config' } test_panelprep { includeConfig 'conf/test_panelprep.config' } test_dog { includeConfig 'conf/test_dog.config' } test_batch { includeConfig 'conf/test_batch.config' } diff --git a/nextflow_schema.json b/nextflow_schema.json index 82e0ecfc..2c7530fc 100644 --- a/nextflow_schema.json +++ b/nextflow_schema.json @@ -74,7 +74,7 @@ "description": "Imputation tool to use.", "help_text": "Multiple tools separated with commas. 
Used when starting from `--steps impute` or `--steps all`.", "fa_icon": "fas fa-step-forward", - "pattern": "^((glimpse1|glimpse2|quilt|stitch)?,?)*(?\n \n \n \"nf-core/phaseimpute\"\n \n\n\n[![GitHub Actions CI Status](https://github.com/nf-core/phaseimpute/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/phaseimpute/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/phaseimpute/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/phaseimpute/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/phaseimpute/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.14329225-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.14329225)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/phaseimpute)\n\n[![Get help on 
Slack](http://img.shields.io/badge/slack-nf--core%20%23phaseimpute-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/phaseimpute)[![Follow on Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/phaseimpute** is a bioinformatics pipeline to phase and impute genetic data.\n\n\n \n \"metromap\"/\n\n\nThe whole pipeline consists of five main steps, each of which can be run separately and independently. Users are not required to run all steps sequentially and can select specific steps based on their needs:\n\n1. **QC: Chromosome Name Check**: Ensures compatibility by validating that all expected contigs are present in the variant and alignment files.\n\n2. **Simulation (`--simulate`)**: Generates artificial datasets by downsampling high-density data to simulate low-pass genetic information. This enables the comparison of imputation results against a high-quality dataset (truth set). Simulations may include:\n - **Low-pass data generation** by downsampling BAM or CRAM files with [`samtools view -s`](https://www.htslib.org/doc/samtools-view.html) at different depths.\n\n3. **Panel Preparation (`--panelprep`)**: Prepares the reference panel through phasing, quality control, variant filtering, and annotation. Key processes include:\n - **Normalization** of the reference panel to retain essential variants.\n - **Phasing** of haplotypes in the reference panel using [Shapeit5](https://odelaneau.github.io/shapeit5/).\n - **Chunking** of the reference panel into specific regions across chromosomes.\n - **Position Extraction** for targeted imputation sites.\n\n4. 
**Imputation (`--impute`)**: This is the primary step, where genotypes in the target dataset are imputed using the prepared reference panel. The main steps are:\n - **Imputation** of the target dataset using tools like [Glimpse1](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html), [Glimpse2](https://odelaneau.github.io/GLIMPSE/), [Stitch](https://github.com/rwdavies/stitch), or [Quilt](https://github.com/rwdavies/QUILT).\n - **Ligation** of imputed chunks to produce a final VCF file per sample, with all chromosomes unified.\n\n5. **Validation (`--validate`)**: Assesses imputation accuracy by comparing the imputed dataset to a truth dataset. This step leverages the [Glimpse2](https://odelaneau.github.io/GLIMPSE/) concordance process to summarize differences between two VCF files.\n\nFor more detailed instructions, please refer to the [usage documentation](https://nf-co.re/phaseimpute/usage).\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nThe primary function of this pipeline is to impute a target dataset based on a phased panel. Begin by preparing a samplesheet with your input data, formatted as follows:\n\n```csv title=\"samplesheet.csv\"\nsample,file,index\nSAMPLE_1X,/path/to/.,/path/to/.\n```\n\nEach row represents either a bam or a cram file along with its corresponding index file. Ensure that all input files have consistent file extensions.\n\nFor certain tools and steps within the pipeline, you will also need to provide a samplesheet for the reference panel. 
Here's an example of what a final samplesheet for a reference panel might look like, covering three chromosomes:\n\n```csv title=\"panel.csv\"\npanel,chr,vcf,index\nPhase3,1,ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.csi\nPhase3,2,ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.csi\nPhase3,3,ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.csi\n```\n\n## Running the pipeline\n\nRun one of the steps of the pipeline (imputation with glimpse1) using the following command and test profile:\n\n```bash\nnextflow run nf-core/phaseimpute \\\n -profile test, \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/phaseimpute/usage) and the [parameter documentation](https://nf-co.re/phaseimpute/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/phaseimpute/results) tab on the nf-core website pipeline page.\nFor more details on the output files and reports, please refer to the [output documentation](https://nf-co.re/phaseimpute/output).\n\n## Credits\n\nnf-core/phaseimpute was originally written by Louis Le N\u00e9zet & Anabella Trigila.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- Saul Pierotti\n- Eugenia Fontecha\n- 
Matias Romero Victorica\n- Hemanoel Passarelli\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). Further development tips can be found in the [development documentation](docs/development.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#phaseimpute` channel](https://nfcore.slack.com/channels/phaseimpute) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\nIf you use nf-core/phaseimpute for your analysis, please cite it using the following doi: [10.5281/zenodo.14329225](https://doi.org/10.5281/zenodo.14329225)\n\nAn extensive list of references for the tools used by the pipeline, including QUILT, GLIMPSE, and STITCH, can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", + "description": "

\n \n \n \"nf-core/phaseimpute\"\n \n

\n\n[![GitHub Actions CI Status](https://github.com/nf-core/phaseimpute/actions/workflows/nf-test.yml/badge.svg)](https://github.com/nf-core/phaseimpute/actions/workflows/nf-test.yml)\n[![GitHub Actions Linting Status](https://github.com/nf-core/phaseimpute/actions/workflows/linting.yml/badge.svg)](https://github.com/nf-core/phaseimpute/actions/workflows/linting.yml)[![AWS CI](https://img.shields.io/badge/CI%20tests-full%20size-FF9900?labelColor=000000&logo=Amazon%20AWS)](https://nf-co.re/phaseimpute/results)[![Cite with Zenodo](http://img.shields.io/badge/DOI-10.5281/zenodo.14329225-1073c8?labelColor=000000)](https://doi.org/10.5281/zenodo.14329225)\n[![nf-test](https://img.shields.io/badge/unit_tests-nf--test-337ab7.svg)](https://www.nf-test.com)\n\n[![Nextflow](https://img.shields.io/badge/version-%E2%89%A524.10.5-green?style=flat&logo=nextflow&logoColor=white&color=%230DC09D&link=https%3A%2F%2Fnextflow.io)](https://www.nextflow.io/)\n[![nf-core template version](https://img.shields.io/badge/nf--core_template-3.3.2-green?style=flat&logo=nfcore&logoColor=white&color=%2324B064&link=https%3A%2F%2Fnf-co.re)](https://github.com/nf-core/tools/releases/tag/3.3.2)\n[![run with conda](http://img.shields.io/badge/run%20with-conda-3EB049?labelColor=000000&logo=anaconda)](https://docs.conda.io/en/latest/)\n[![run with docker](https://img.shields.io/badge/run%20with-docker-0db7ed?labelColor=000000&logo=docker)](https://www.docker.com/)\n[![run with singularity](https://img.shields.io/badge/run%20with-singularity-1d355c.svg?labelColor=000000)](https://sylabs.io/docs/)\n[![Launch on Seqera Platform](https://img.shields.io/badge/Launch%20%F0%9F%9A%80-Seqera%20Platform-%234256e7)](https://cloud.seqera.io/launch?pipeline=https://github.com/nf-core/phaseimpute)\n\n[![Get help on Slack](http://img.shields.io/badge/slack-nf--core%20%23phaseimpute-4A154B?labelColor=000000&logo=slack)](https://nfcore.slack.com/channels/phaseimpute)[![Follow on 
Bluesky](https://img.shields.io/badge/bluesky-%40nf__core-1185fe?labelColor=000000&logo=bluesky)](https://bsky.app/profile/nf-co.re)[![Follow on Mastodon](https://img.shields.io/badge/mastodon-nf__core-6364ff?labelColor=FFFFFF&logo=mastodon)](https://mstdn.science/@nf_core)[![Watch on YouTube](http://img.shields.io/badge/youtube-nf--core-FF0000?labelColor=000000&logo=youtube)](https://www.youtube.com/c/nf-core)\n\n## Introduction\n\n**nf-core/phaseimpute** is a bioinformatics pipeline to phase and impute genetic data.\n\n\n \n \"metromap\"/\n\n\nThe whole pipeline consists of five main steps, each of which can be run separately and independently. Users are not required to run all steps sequentially and can select specific steps based on their needs:\n\n1. **QC: Chromosome Name Check**: Ensures compatibility by validating that all expected contigs are present in the variant and alignment files.\n\n2. **Simulation (`--simulate`)**: Generates artificial datasets by downsampling high-density data to simulate low-pass genetic information. This enables the comparison of imputation results against a high-quality dataset (truth set). Simulations may include:\n - **Low-pass data generation** by downsampling BAM or CRAM files with [`samtools view -s`](https://www.htslib.org/doc/samtools-view.html) at different depths.\n\n3. **Panel Preparation (`--panelprep`)**: Prepares the reference panel through phasing, quality control, variant filtering, and annotation. Key processes include:\n - **Normalization** of the reference panel to retain essential variants.\n - **Phasing** of haplotypes in the reference panel using [Shapeit5](https://odelaneau.github.io/shapeit5/).\n - **Chunking** of the reference panel into specific regions across chromosomes.\n - **Position Extraction** for targeted imputation sites.\n\n4. **Imputation (`--impute`)**: This is the primary step, where genotypes in the target dataset are imputed using the prepared reference panel. 
The main steps are:\n - **Imputation** of the target dataset using tools like [Glimpse1](https://odelaneau.github.io/GLIMPSE/glimpse1/index.html), [Glimpse2](https://odelaneau.github.io/GLIMPSE/), [Stitch](https://github.com/rwdavies/stitch), [Quilt](https://github.com/rwdavies/QUILT) or [Minimac4](https://github.com/statgen/Minimac4).\n - **Ligation** of imputed chunks to produce a final VCF file per sample, with all chromosomes unified.\n\n5. **Validation (`--validate`)**: Assesses imputation accuracy by comparing the imputed dataset to a truth dataset. This step leverages the [Glimpse2](https://odelaneau.github.io/GLIMPSE/) concordance process to summarize differences between two VCF files.\n\nFor more detailed instructions, please refer to the [usage documentation](https://nf-co.re/phaseimpute/usage).\n\n## Usage\n\n> [!NOTE]\n> If you are new to Nextflow and nf-core, please refer to [this page](https://nf-co.re/docs/usage/installation) on how to set-up Nextflow. Make sure to [test your setup](https://nf-co.re/docs/usage/introduction#how-to-run-a-pipeline) with `-profile test` before running the workflow on actual data.\n\nThe primary function of this pipeline is to impute a target dataset based on a phased panel. Begin by preparing a samplesheet with your input data, formatted as follows:\n\n```csv title=\"samplesheet.csv\"\nsample,file,index\nSAMPLE_1X,/path/to/.,/path/to/.\n```\n\nEach row represents either a bam or a cram file along with its corresponding index file. Ensure that all input files have consistent file extensions.\n\nFor certain tools and steps within the pipeline, you will also need to provide a samplesheet for the reference panel. 
Here's an example of what a final samplesheet for a reference panel might look like, covering three chromosomes:\n\n```csv title=\"panel.csv\"\npanel,chr,vcf,index\nPhase3,1,ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,ALL.chr1.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.csi\nPhase3,2,ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,ALL.chr2.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.csi\nPhase3,3,ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz,ALL.chr3.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz.csi\n```\n\n## Running the pipeline\n\nRun one of the steps of the pipeline (imputation with glimpse1) using the following command and test profile:\n\n```bash\nnextflow run nf-core/phaseimpute \\\n -profile test, \\\n --outdir \n```\n\n> [!WARNING]\n> Please provide pipeline parameters via the CLI or Nextflow `-params-file` option. Custom config files including those provided by the `-c` Nextflow option can be used to provide any configuration _**except for parameters**_; see [docs](https://nf-co.re/docs/usage/getting_started/configuration#custom-configuration-files).\n\nFor more details and further functionality, please refer to the [usage documentation](https://nf-co.re/phaseimpute/usage) and the [parameter documentation](https://nf-co.re/phaseimpute/parameters).\n\n## Pipeline output\n\nTo see the results of an example test run with a full size dataset refer to the [results](https://nf-co.re/phaseimpute/results) tab on the nf-core website pipeline page.\nFor more details on the output files and reports, please refer to the [output documentation](https://nf-co.re/phaseimpute/output).\n\n## Credits\n\nnf-core/phaseimpute was originally written by Louis Le N\u00e9zet & Anabella Trigila.\n\nWe thank the following people for their extensive assistance in the development of this pipeline:\n\n- Saul Pierotti\n- Eugenia Fontecha\n- 
Matias Romero Victorica\n- Hemanoel Passarelli\n\n## Contributions and Support\n\nIf you would like to contribute to this pipeline, please see the [contributing guidelines](.github/CONTRIBUTING.md). Further development tips can be found in the [development documentation](docs/development.md).\n\nFor further information or help, don't hesitate to get in touch on the [Slack `#phaseimpute` channel](https://nfcore.slack.com/channels/phaseimpute) (you can join with [this invite](https://nf-co.re/join/slack)).\n\n## Citations\n\nIf you use nf-core/phaseimpute for your analysis, please cite it using the following doi: [10.5281/zenodo.14329225](https://doi.org/10.5281/zenodo.14329225)\n\nAn extensive list of references for the tools used by the pipeline, including QUILT, GLIMPSE, and STITCH, can be found in the [`CITATIONS.md`](CITATIONS.md) file.\n\nYou can cite the `nf-core` publication as follows:\n\n> **The nf-core framework for community-curated bioinformatics pipelines.**\n>\n> Philip Ewels, Alexander Peltzer, Sven Fillinger, Harshil Patel, Johannes Alneberg, Andreas Wilm, Maxime Ulysse Garcia, Paolo Di Tommaso & Sven Nahnsen.\n>\n> _Nat Biotechnol._ 2020 Feb 13. doi: [10.1038/s41587-020-0439-x](https://dx.doi.org/10.1038/s41587-020-0439-x).\n", "hasPart": [ { "@id": "main.nf" diff --git a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf index 43a628e2..2a31c799 100644 --- a/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf +++ b/subworkflows/local/utils_nfcore_phaseimpute_pipeline/main.nf @@ -406,13 +406,13 @@ def validateInputParameters() { // Check that posfile and chunks are provided when running impute only. Steps with panelprep generate those files. 
if (params.steps.split(',').contains("impute") && !params.steps.split(',').find { it in ["all", "panelprep"] }) { - // Required by all tools except glimpse2 - if (!params.tools.split(',').find { it in ["glimpse2"] }) { + // Required by all tools except glimpse2 and minimac4 + if (!params.tools.split(',').find { it in ["glimpse2", "minimac4"] }) { assert params.posfile : "No --posfile provided for --steps impute" } - // Required by all tools except STITCH - if (params.tools != "stitch") { - assert params.chunks : "No --chunks provided for --steps impute" + // Required by all tools except stitch and minimac4 + if (!params.tools.split(',').any { it in ['stitch', 'minimac4'] }) { + assert params.chunks : "No --chunks provided for --steps impute" } // Required by GLIMPSE1 and GLIMPSE2 only if (params.tools.split(',').contains("glimpse")) { diff --git a/subworkflows/local/vcf_impute_minimac4/main.nf b/subworkflows/local/vcf_impute_minimac4/main.nf new file mode 100644 index 00000000..f86e0f61 --- /dev/null +++ b/subworkflows/local/vcf_impute_minimac4/main.nf @@ -0,0 +1,67 @@ +include { MINIMAC4_COMPRESSREF } from '../../../modules/nf-core/minimac4/compressref/main' +include { MINIMAC4_IMPUTE } from '../../../modules/nf-core/minimac4/impute/main' +include { BCFTOOLS_INDEX } from '../../../modules/nf-core/bcftools/index/main' + +workflow VCF_IMPUTE_MINIMAC4 { + + take: + ch_input // channel: [ [id, chr], vcf, tbi ] + ch_panel // channel: [ [id, chr], vcf, tbi ] + ch_map // channel: [ [chr], map] + ch_posfile // channel: [ [chr], sites_vcf, sites_index, hap, legend ] + + main: + + ch_versions = Channel.empty() + + ch_posfile_minimac4 = ch_posfile + .map { meta, sites_vcf, sites_index, hap, legend -> + [meta, sites_vcf, sites_index] + } + + // Compress reference panel to MSAV format + MINIMAC4_COMPRESSREF(ch_panel) + ch_versions = ch_versions.mix(MINIMAC4_COMPRESSREF.out.versions.first()) + + // Prepare input channels for MINIMAC4 + ch_minimac4_input = ch_input + .map { 
meta, vcf, tbi -> [meta.chr, meta, vcf, tbi] } + .combine( + MINIMAC4_COMPRESSREF.out.msav.map { meta, msav -> [meta.chr, meta.id, msav] }, + by: 0 + ) + .combine( + ch_map.map { meta, map -> [meta.chr, map] }, + by: 0 + ) + .combine( + ch_posfile_minimac4.map { meta, sites_vcf, sites_index -> + [meta.chr, sites_vcf, sites_index] + }, + by: 0 + ) + .map { chr, target_meta, target_vcf, target_tbi, panel_id, ref_msav, map, sites_vcf, sites_index -> + [target_meta + [panel: panel_id], target_vcf, target_tbi, ref_msav, sites_vcf, sites_index, map] + } + // Perform imputation + MINIMAC4_IMPUTE(ch_minimac4_input) + ch_versions = ch_versions.mix(MINIMAC4_IMPUTE.out.versions.first()) + + // Index the output VCF file + BCFTOOLS_INDEX( + MINIMAC4_IMPUTE.out.vcf + ) + ch_versions = ch_versions.mix(BCFTOOLS_INDEX.out.versions.first()) + + // Join imputed and index files + ch_vcf_index = MINIMAC4_IMPUTE.out.vcf + .join( + BCFTOOLS_INDEX.out.tbi + .mix(BCFTOOLS_INDEX.out.csi) + ) + .map{ meta, vcf, index -> [meta + [tools: "minimac4"], vcf, index] } + + emit: + vcf_index = ch_vcf_index // channel: [ [id, chr, tools], vcf, index ] + versions = ch_versions // channel: [ versions.yml ] + } diff --git a/subworkflows/local/vcf_impute_minimac4/meta.yml b/subworkflows/local/vcf_impute_minimac4/meta.yml new file mode 100644 index 00000000..62f7334f --- /dev/null +++ b/subworkflows/local/vcf_impute_minimac4/meta.yml @@ -0,0 +1,92 @@ +# yaml-language-server: $schema=https://raw.githubusercontent.com/nf-core/modules/master/subworkflows/yaml-schema.json +name: "VCF_IMPUTE_MINIMAC4" +description: | + Subworkflow to impute VCF files using MINIMAC4 software. The subworkflow + takes VCF files, phased reference panel, and genetic maps to perform imputation + and outputs phased and imputed VCF files. MINIMAC4 requires reference panels + to be compressed into MSAV format before imputation. 
+keywords: + - VCF + - imputation + - minimac4 + - phasing + - MSAV +components: + - minimac4/compressref + - minimac4/impute + - bcftools/index +input: + - ch_input: + description: Channel with input data + structure: + - meta: + type: map + description: | + Metadata map containing sample information + Need to have "id" as sample name and "chr" as chromosome name + - vcf: + type: file + description: Input VCF files + pattern: "*.{vcf,bcf,vcf.gz}" + - index: + type: file + description: Input index file + pattern: "*.{tbi,csi}" + - ch_panel: + description: Channel with phased reference panel data + structure: + - meta: + type: map + description: | + Metadata map that will be combined with the input data map + Need to have the "chr" as chromosome name and "id" as panel name + - vcf: + type: file + description: Reference panel VCF files by chromosomes + pattern: "*.{vcf,bcf,vcf.gz}" + - index: + type: file + description: Reference panel VCF index files + pattern: "*.{tbi,csi}" + - ch_map: + description: Channel with genetic map data + structure: + - meta: + type: map + description: | + Metadata map containing chromosome information + Need to have "chr" as chromosome name + - map: + type: file + description: PLINK format genetic map files + pattern: "*.map" +output: + - vcf_index: + description: Channel with imputed and phased VCF files + structure: + - meta: + type: map + description: | + Metadata map combined with the input data map. + It contains the "id" as sample name and "chr" as chromosome name. + {"tools" : "minimac4"} is added. 
+ - vcf: + type: file + description: VCF imputed and phased file by sample + pattern: "*.{vcf,bcf,vcf.gz}" + - index: + type: file + description: VCF index file + pattern: "*.{tbi,csi}" + - versions: + description: Channel containing software versions file + structure: + - versions.yml: + type: file + description: File containing versions of the software used +authors: + - "@LouisLeNezet" + - "@gichas" +maintainers: + - "@LouisLeNezet" + - "@gichas" diff --git a/subworkflows/local/vcf_impute_minimac4/tests/main.nf.test b/subworkflows/local/vcf_impute_minimac4/tests/main.nf.test new file mode 100644 index 00000000..11efeba9 --- /dev/null +++ b/subworkflows/local/vcf_impute_minimac4/tests/main.nf.test @@ -0,0 +1,190 @@ +nextflow_workflow { + + name "Test Subworkflow VCF_IMPUTE_MINIMAC4" + script "../main.nf" + + config "nextflow.config" + + workflow "VCF_IMPUTE_MINIMAC4" + + tag "subworkflows" + tag "subworkflows_local" + tag "subworkflows/vcf_impute_minimac4" + tag "vcf_impute_minimac4" + + tag "minimac4" + tag "minimac4/compressref" + tag "minimac4/impute" + + test("Impute with minimac4 one vcf") { + + when { + workflow { + """ + input[0] = Channel.of([ + [id: "NA12878", panel: "1000GP", chr: "chr22"], + file(params.pipelines_testdata_base_path + "hum_data/individuals/NA12878/NA12878.s.bcf", checkIfExist:true), + file(params.pipelines_testdata_base_path + "hum_data/individuals/NA12878/NA12878.s.bcf.csi", checkIfExist:true) + ]) + input[1] = Channel.of([ + [id: "1000GP", chr: "chr22"], + file(params.pipelines_testdata_base_path + "hum_data/panel/chr22/1000GP.chr22.s.norel.vcf.gz", checkIfExist:true), + file(params.pipelines_testdata_base_path + "hum_data/panel/chr22/1000GP.chr22.s.norel.vcf.gz.csi", checkIfExist:true) + ]) + input[2] = Channel.of([[chr: "chr22"], []]) + input[3] = Channel.of([[chr: "chr22"], [], [], [], []]) + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.versions, + 
workflow.out.vcf_index.collect{[ + it[0], + file(it[1]).name, + file(it[2]).name + ] }, + workflow.out.vcf_index.collect{ path(it[1]).vcf.summary }, + workflow.out.vcf_index.collect{ path(it[1]).md5 } + ).match() } + ) + } + } + + test("Impute with minimac4 one vcf with map") { + + when { + workflow { + """ + input[0] = Channel.of([ + [id: "NA12878", panel: "1000GP", chr: "chr22"], + file(params.pipelines_testdata_base_path + "hum_data/individuals/NA12878/NA12878.s.bcf", checkIfExist:true), + file(params.pipelines_testdata_base_path + "hum_data/individuals/NA12878/NA12878.s.bcf.csi", checkIfExist:true) + ]) + input[1] = Channel.of([ + [id: "1000GP", chr: "chr22"], + file(params.pipelines_testdata_base_path + "hum_data/panel/chr22/1000GP.chr22.s.norel.vcf.gz", checkIfExist:true), + file(params.pipelines_testdata_base_path + "hum_data/panel/chr22/1000GP.chr22.s.norel.vcf.gz.csi", checkIfExist:true) + ]) + input[2] = Channel.of([ + [chr: "chr22"], + file(params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38_chr22.glimpse.map", checkIfExist:true) + ]) + input[3] = Channel.of([[chr: "chr22"], [], [], [], []]) + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.versions, + workflow.out.vcf_index.collect{[ + it[0], + file(it[1]).name, + file(it[2]).name + ] }, + workflow.out.vcf_index.collect{ path(it[1]).vcf.summary }, + workflow.out.vcf_index.collect{ path(it[1]).md5 } + ).match() } + ) + } + } + + test("Impute with minimac4 two vcf") { + + when { + workflow { + """ + input[0] = Channel.of( + [ + [id: "NA12878", panel: "1000GP", chr: "chr22"], + file(params.pipelines_testdata_base_path + "hum_data/individuals/NA12878/NA12878.s.bcf", checkIfExist:true), + file(params.pipelines_testdata_base_path + "hum_data/individuals/NA12878/NA12878.s.bcf.csi", checkIfExist:true) + ], + [ + [id: "NA19401", panel: "1000GP", chr: "chr22"], + file(params.pipelines_testdata_base_path + 
"hum_data/individuals/NA19401/NA19401.s.bcf", checkIfExist:true), + file(params.pipelines_testdata_base_path + "hum_data/individuals/NA19401/NA19401.s.bcf.csi", checkIfExist:true) + ] + ) + input[1] = Channel.of([ + [id: "1000GP", chr: "chr22"], + file(params.pipelines_testdata_base_path + "hum_data/panel/chr22/1000GP.chr22.s.norel.vcf.gz", checkIfExist:true), + file(params.pipelines_testdata_base_path + "hum_data/panel/chr22/1000GP.chr22.s.norel.vcf.gz.csi", checkIfExist:true) + ]) + input[2] = Channel.of([[chr: "chr22"], []]) + input[3] = Channel.of([[chr: "chr22"], [], [], [], []]) + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.versions, + workflow.out.vcf_index.collect{[ + it[0], + file(it[1]).name, + file(it[2]).name + ] }, + workflow.out.vcf_index.collect{ path(it[1]).vcf.summary }, + workflow.out.vcf_index.collect{ path(it[1]).md5 } + ).match() } + ) + } + } + + test("Impute with minimac4 one vcf with sites") { + + when { + workflow { + """ + input[0] = Channel.of([ + [id: "NA12878", panel: "1000GP", chr: "chr22"], + file(params.pipelines_testdata_base_path + "hum_data/individuals/NA12878/NA12878.s.bcf", checkIfExist:true), + file(params.pipelines_testdata_base_path + "hum_data/individuals/NA12878/NA12878.s.bcf.csi", checkIfExist:true) + ]) + input[1] = Channel.of([ + [id: "1000GP", chr: "chr22"], + file(params.pipelines_testdata_base_path + "hum_data/panel/chr22/1000GP.chr22.s.norel.vcf.gz", checkIfExist:true), + file(params.pipelines_testdata_base_path + "hum_data/panel/chr22/1000GP.chr22.s.norel.vcf.gz.csi", checkIfExist:true) + ]) + input[2] = Channel.of([ + [chr: "chr22"], + file(params.pipelines_testdata_base_path + "hum_data/reference_genome/GRCh38_chr22.glimpse.map", checkIfExist:true) + ]) + input[3] = Channel.of([ + [chr: "chr22"], + file(params.pipelines_testdata_base_path + "hum_data/panel/chr22/1000GP.chr22.sites.vcf.gz", checkIfExist:true), + file(params.pipelines_testdata_base_path + 
"hum_data/panel/chr22/1000GP.chr22.sites.vcf.gz.csi", checkIfExist:true), + [], + [] + ]) + """ + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + workflow.out.versions, + workflow.out.vcf_index.collect{[ + it[0], + file(it[1]).name, + file(it[2]).name + ] }, + workflow.out.vcf_index.collect{ path(it[1]).vcf.summary }, + workflow.out.vcf_index.collect{ path(it[1]).md5 } + ).match() } + ) + } + } + +} diff --git a/subworkflows/local/vcf_impute_minimac4/tests/main.nf.test.snap b/subworkflows/local/vcf_impute_minimac4/tests/main.nf.test.snap new file mode 100644 index 00000000..81e2b94a --- /dev/null +++ b/subworkflows/local/vcf_impute_minimac4/tests/main.nf.test.snap @@ -0,0 +1,142 @@ +{ + "Impute with minimac4 two vcf": { + "content": [ + [ + "versions.yml:md5,373fe12da367d3891473f487a9536d04", + "versions.yml:md5,99b8cb34e87147cec893f4bcf7a9b244", + "versions.yml:md5,a4842af8fedc3522b989d4ee28f08d80" + ], + [ + [ + { + "id": "NA12878", + "panel": "1000GP", + "chr": "chr22", + "tools": "minimac4" + }, + "NA12878.vcf.gz", + "NA12878.vcf.gz.csi" + ], + [ + { + "id": "NA19401", + "panel": "1000GP", + "chr": "chr22", + "tools": "minimac4" + }, + "NA19401.vcf.gz", + "NA19401.vcf.gz.csi" + ] + ], + [ + "VcfFile [chromosomes=[chr22], sampleCount=1, variantCount=903, phased=true, phasedAutodetect=true]", + "VcfFile [chromosomes=[chr22], sampleCount=1, variantCount=903, phased=true, phasedAutodetect=true]" + ], + [ + "76a94fd1ad228e1cd9010312ec2a18dd", + "098833b0617876411ea68dd0c8709986" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.4" + }, + "timestamp": "2025-07-22T11:52:33.317647396" + }, + "Impute with minimac4 one vcf": { + "content": [ + [ + "versions.yml:md5,373fe12da367d3891473f487a9536d04", + "versions.yml:md5,99b8cb34e87147cec893f4bcf7a9b244", + "versions.yml:md5,a4842af8fedc3522b989d4ee28f08d80" + ], + [ + [ + { + "id": "NA12878", + "panel": "1000GP", + "chr": "chr22", + "tools": "minimac4" + }, + 
"NA12878.vcf.gz", + "NA12878.vcf.gz.csi" + ] + ], + [ + "VcfFile [chromosomes=[chr22], sampleCount=1, variantCount=903, phased=true, phasedAutodetect=true]" + ], + [ + "76a94fd1ad228e1cd9010312ec2a18dd" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.4" + }, + "timestamp": "2025-07-22T11:51:38.719726645" + }, + "Impute with minimac4 one vcf with sites": { + "content": [ + [ + "versions.yml:md5,373fe12da367d3891473f487a9536d04", + "versions.yml:md5,99b8cb34e87147cec893f4bcf7a9b244", + "versions.yml:md5,a4842af8fedc3522b989d4ee28f08d80" + ], + [ + [ + { + "id": "NA12878", + "panel": "1000GP", + "chr": "chr22", + "tools": "minimac4" + }, + "NA12878.vcf.gz", + "NA12878.vcf.gz.csi" + ] + ], + [ + "VcfFile [chromosomes=[chr22], sampleCount=1, variantCount=903, phased=true, phasedAutodetect=true]" + ], + [ + "2eac9b4d102fb38fd75b7efd19bfb837" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.4" + }, + "timestamp": "2025-07-22T11:53:00.151658756" + }, + "Impute with minimac4 one vcf with map": { + "content": [ + [ + "versions.yml:md5,373fe12da367d3891473f487a9536d04", + "versions.yml:md5,99b8cb34e87147cec893f4bcf7a9b244", + "versions.yml:md5,a4842af8fedc3522b989d4ee28f08d80" + ], + [ + [ + { + "id": "NA12878", + "panel": "1000GP", + "chr": "chr22", + "tools": "minimac4" + }, + "NA12878.vcf.gz", + "NA12878.vcf.gz.csi" + ] + ], + [ + "VcfFile [chromosomes=[chr22], sampleCount=1, variantCount=903, phased=true, phasedAutodetect=true]" + ], + [ + "2eac9b4d102fb38fd75b7efd19bfb837" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.4" + }, + "timestamp": "2025-07-22T11:52:07.324270788" + } +} \ No newline at end of file diff --git a/subworkflows/local/vcf_impute_minimac4/tests/nextflow.config b/subworkflows/local/vcf_impute_minimac4/tests/nextflow.config new file mode 100644 index 00000000..d5c07e6a --- /dev/null +++ b/subworkflows/local/vcf_impute_minimac4/tests/nextflow.config @@ -0,0 +1,8 @@ +process { + withName: 
'MINIMAC4_COMPRESSREF' { + ext.args = '' + } + withName: 'MINIMAC4_IMPUTE' { + ext.args = '--output-format vcf.gz' + } +} diff --git a/subworkflows/local/vcf_impute_minimac4/tests/tags.yml b/subworkflows/local/vcf_impute_minimac4/tests/tags.yml new file mode 100644 index 00000000..5fdf6cb1 --- /dev/null +++ b/subworkflows/local/vcf_impute_minimac4/tests/tags.yml @@ -0,0 +1,2 @@ +subworkflows/vcf_impute_minimac4: + - subworkflows/local/vcf_impute_minimac4/** diff --git a/tests/csv/map.csv b/tests/csv/map.csv index 96257bff..830f4d20 100644 --- a/tests/csv/map.csv +++ b/tests/csv/map.csv @@ -1,3 +1,3 @@ chr,map -chr21,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/reference_genome/GRCh38_21.map -chr22,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/reference_genome/GRCh38_22.map +chr21,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/reference_genome/GRCh38_chr21.glimpse.map +chr22,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/reference_genome/GRCh38_chr22.glimpse.map diff --git a/tests/csv/sample_vcf.csv b/tests/csv/sample_vcf.csv index e9ccb044..d0edfe94 100644 --- a/tests/csv/sample_vcf.csv +++ b/tests/csv/sample_vcf.csv @@ -1,4 +1,4 @@ sample,file,index -NA12878,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA12878/NA12878.s.1x.bcf,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA12878/NA12878.s.1x.bcf.csi -NA19401,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA19401/NA19401.s.1x.bcf,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA19401/NA19401.s.1x.bcf.csi 
-NA20359,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA20359/NA20359.s.1x.bcf,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA20359/NA20359.s.1x.bcf.csi +NA12878,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA12878/NA12878.s.snp.vcf.gz,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA12878/NA12878.s.snp.vcf.gz.csi +NA19401,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA19401/NA19401.s.snp.vcf.gz,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA19401/NA19401.s.snp.vcf.gz.csi +NA20359,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA20359/NA20359.s.snp.vcf.gz,https://raw.githubusercontent.com/nf-core/test-datasets/phaseimpute/hum_data/individuals/NA20359/NA20359.s.snp.vcf.gz.csi diff --git a/workflows/phaseimpute/main.nf b/workflows/phaseimpute/main.nf index 8e9f2375..8f5fe58a 100644 --- a/workflows/phaseimpute/main.nf +++ b/workflows/phaseimpute/main.nf @@ -60,6 +60,10 @@ include { VCF_CONCATENATE_BCFTOOLS as CONCAT_QUILT } from '../../subworkflows/ include { BAM_IMPUTE_STITCH } from '../../subworkflows/local/bam_impute_stitch' include { VCF_CONCATENATE_BCFTOOLS as CONCAT_STITCH } from '../../subworkflows/local/vcf_concatenate_bcftools' +// MINIMAC4 subworkflows +include { VCF_IMPUTE_MINIMAC4 } from '../../subworkflows/local/vcf_impute_minimac4' +include { VCF_CONCATENATE_BCFTOOLS as CONCAT_MINIMAC4} from '../../subworkflows/local/vcf_concatenate_bcftools' + // Imputation stats include { BCFTOOLS_STATS as BCFTOOLS_STATS_TOOLS } from '../../modules/nf-core/bcftools/stats' @@ -398,6 +402,33 @@ workflow PHASEIMPUTE { ch_input_validate = ch_input_validate.mix(CONCAT_QUILT.out.vcf_tbi) } + if (params.tools.split(',').contains("minimac4")) { + log.info("Impute with MINIMAC4") 
+ + // Create input channel combining VCF with regions + ch_input_minimac4 = ch_input_type.vcf + .combine(ch_region) + .map { meta_vcf, vcf, index, meta_region, region -> + [meta_vcf + meta_region, vcf, index] + } + + // Run imputation with MINIMAC4 + VCF_IMPUTE_MINIMAC4( + ch_input_minimac4, + ch_panel_phased, + ch_map, + ch_posfile + ) + ch_versions = ch_versions.mix(VCF_IMPUTE_MINIMAC4.out.versions) + + // Concatenate by chromosomes + CONCAT_MINIMAC4(VCF_IMPUTE_MINIMAC4.out.vcf_index) + ch_versions = ch_versions.mix(CONCAT_MINIMAC4.out.versions) + + // Add results to input validate + ch_input_validate = ch_input_validate.mix(CONCAT_MINIMAC4.out.vcf_tbi) + } + // Prepare renaming file BCFTOOLS_QUERY_IMPUTED(ch_input_validate, [], [], []) GAWK_IMPUTED(BCFTOOLS_QUERY_IMPUTED.out.output, [], false) diff --git a/workflows/phaseimpute/tests/main.nf.test b/workflows/phaseimpute/tests/main.nf.test index b15d2dee..30142cce 100644 --- a/workflows/phaseimpute/tests/main.nf.test +++ b/workflows/phaseimpute/tests/main.nf.test @@ -88,6 +88,33 @@ nextflow_pipeline { } + test("Check test_minimac4") { + tag "test_minimac4" + config "../../../conf/test_minimac4.config" + when { + params { + publish_dir_mode = "copy" + pipelines_testdata_base_path = 'https://raw.githubusercontent.com/nf-core/test-datasets/refs/heads/phaseimpute/' + outdir = "$outputDir" + } + } + + then { + assertAll( + { assert workflow.success }, + { assert snapshot( + path("${outputDir}/imputation/") + .list() + .collect { getRecursiveFileNames(it, outputDir) } + .flatten(), + path("$outputDir/imputation/minimac4/concat/NA12878.minimac4.vcf.gz").vcf.summary.replaceAll(", phasedAutodetect=(false|true)", ""), + path("$outputDir/imputation/minimac4/concat/NA12878.minimac4.vcf.gz").vcf.header.getGenotypeSamples().sort() + ).match() + } + ) + } + } + test("Check test_sim") { tag "test_sim" config "../../../conf/test_sim.config" @@ -107,7 +134,8 @@ nextflow_pipeline { .list() .collect { getRecursiveFileNames(it, 
outputDir) } .flatten(), - ).match() } + ).match() + } ) } } diff --git a/workflows/phaseimpute/tests/main.nf.test.snap b/workflows/phaseimpute/tests/main.nf.test.snap index 3e73a7a7..2873642a 100644 --- a/workflows/phaseimpute/tests/main.nf.test.snap +++ b/workflows/phaseimpute/tests/main.nf.test.snap @@ -31,6 +31,37 @@ }, "timestamp": "2025-06-18T18:23:01.818626445" }, + "Check test_minimac4": { + "content": [ + [ + "imputation/csv/impute.csv", + "imputation/minimac4/concat/NA12878.minimac4.vcf.gz", + "imputation/minimac4/concat/NA12878.minimac4.vcf.gz.tbi", + "imputation/minimac4/concat/NA19401.minimac4.vcf.gz", + "imputation/minimac4/concat/NA19401.minimac4.vcf.gz.tbi", + "imputation/minimac4/concat/NA20359.minimac4.vcf.gz", + "imputation/minimac4/concat/NA20359.minimac4.vcf.gz.tbi", + "imputation/minimac4/samples/NA12878.minimac4.vcf.gz", + "imputation/minimac4/samples/NA12878.minimac4.vcf.gz.tbi", + "imputation/minimac4/samples/NA19401.minimac4.vcf.gz", + "imputation/minimac4/samples/NA19401.minimac4.vcf.gz.tbi", + "imputation/minimac4/samples/NA20359.minimac4.vcf.gz", + "imputation/minimac4/samples/NA20359.minimac4.vcf.gz.tbi", + "imputation/stats/NA12878.minimac4.bcftools_stats.txt", + "imputation/stats/NA19401.minimac4.bcftools_stats.txt", + "imputation/stats/NA20359.minimac4.bcftools_stats.txt" + ], + "VcfFile [chromosomes=[chr21, chr22], sampleCount=1, variantCount=1739, phased=true]", + [ + "NA12878" + ] + ], + "meta": { + "nf-test": "0.9.2", + "nextflow": "25.04.4" + }, + "timestamp": "2025-07-24T09:25:16.978538151" + }, "Check test_all": { "content": [ [