Skip to content

Commit c0facc3

Browse files
committed
feat: pipeline accepts vep.tar.gz or vep/ dir
1 parent 65969ea commit c0facc3

4 files changed

Lines changed: 45 additions & 9 deletions

File tree

docs/variant_annotation.md

Lines changed: 25 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,15 +12,37 @@ The CIViCpy annotation cache is small and is downloaded on each pipeline run. If
1212

1313
## Ensembl Variant Effect Predictor (VEP) cache
1414

15-
1. The quickest way to download the VEP cache is with `wget` and `tar`:
15+
The pipeline accepts either a pre-extracted cache directory or a `.tar.gz` archive.
16+
17+
If a tarball is supplied, it will be automatically extracted before VEP runs.
18+
19+
1. The quickest way to download the VEP cache is with `wget`:
1620

1721
```console
1822
wget https://ftp.ensembl.org/pub/release-114/variation/indexed_vep_cache/homo_sapiens_vep_114_GRCh38.tar.gz
19-
tar -xzf homo_sapiens_vep_114_GRCh38.tar.gz
2023
```
2124

2225
The version and build you choose should match the `--ensemblvep_cache_version` and `--annotation_genome_version` arguments provided to the pipeline, respectively.
2326

27+
You can pass the tarball directly to the pipeline:
28+
29+
```console
30+
nextflow run twistcgp/main.nf \
31+
-profile <docker/singularity/conda> \
32+
--fasta hg38.fa \
33+
--input samplesheet.csv \
34+
--baits baits.bed \
35+
--targets targets.bed \
36+
--ensemblvep_cache homo_sapiens_vep_114_GRCh38.tar.gz \
37+
--outdir <OUTDIR>
38+
```
39+
40+
Or extract it first and pass the extracted cache directory to `--ensemblvep_cache`:
41+
42+
```console
43+
tar -xzf homo_sapiens_vep_114_GRCh38.tar.gz
44+
```
45+
2446
2. Alternatively, install Ensembl VEP, which is available directly from [github.com/Ensembl/ensembl-vep](https://github.com/Ensembl/ensembl-vep.git) or via mamba/conda as [bioconda::ensembl-vep](https://anaconda.org/bioconda/ensembl-vep). If using conda, activate your environment.
2547

2648
3. Download the cache with Ensembl VEP, making sure that the genome version and database version match the pipeline parameters.
@@ -31,7 +53,7 @@ Please note that this download is rate-limited, and will take much longer than `
3153
vep_install -a cf -s homo_sapiens -y GRCh38 -c ~/vep --CONVERT
3254
```
3355

34-
3. Pass the cache to the pipeline:
56+
4. Pass the cache directory to the pipeline:
3557

3658
```console
3759
nextflow run twistcgp/main.nf \

main.nf

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_twistcgp
1919
include { PREPARE_GENOME } from './subworkflows/local/prepare_genome'
2020
include { PREPARE_ANNOTATION_DB } from './subworkflows/local/prepare_annotation_db'
2121
include { PREPARE_INDICES } from './subworkflows/local/prepare_indices'
22+
include { UNTAR as UNTAR_VEP_CACHE } from './modules/nf-core/untar/main'
2223
/*
2324
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
2425
RUN MAIN WORKFLOW
@@ -166,9 +167,18 @@ workflow FULCRUMGENOMICS_TWISTCGP {
166167
ch_snpeff_cache = params.snpeff_cache
167168
? Channel.fromPath(params.snpeff_cache).map { it -> [[id: 'snpeff_cache'], it] }.collect()
168169
: PREPARE_ANNOTATION_DB.out.snpeff_cache
169-
ch_vep_cache = params.ensemblvep_cache
170-
? Channel.fromPath(params.ensemblvep_cache).map { it -> [[id: 'vep_cache'], it] }.collect()
171-
: PREPARE_ANNOTATION_DB.out.ensemblvep_cache
170+
if (params.ensemblvep_cache && params.ensemblvep_cache.endsWith('.tar.gz')) {
171+
UNTAR_VEP_CACHE(
172+
Channel.fromPath(params.ensemblvep_cache)
173+
.map { it -> [[id: 'vep_cache'], it] }
174+
.collect()
175+
)
176+
ch_vep_cache = UNTAR_VEP_CACHE.out.untar.collect()
177+
} else {
178+
ch_vep_cache = params.ensemblvep_cache
179+
? Channel.fromPath(params.ensemblvep_cache).map { it -> [[id: 'vep_cache'], it] }.collect()
180+
: PREPARE_ANNOTATION_DB.out.ensemblvep_cache
181+
}
172182
ch_msi_scan = params.msisensor_scan
173183
? Channel.fromPath(params.msisensor_scan).map { it -> [[id: 'scan'], it] }.collect()
174184
: (params.use_msisensor_pro_licensed ? PREPARE_GENOME.out.msi_scan : Channel.value([[id: 'scan'], []]))

modules.json

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,11 @@
133133
"branch": "master",
134134
"git_sha": "a96f42708b631dfc736f48980dbf2be418dd784b",
135135
"installed_by": ["modules", "vcf_annotate_ensemblvep"]
136+
},
137+
"untar": {
138+
"branch": "master",
139+
"git_sha": "6d46786420b4d7bc88eba026eb389c0c5535d120",
140+
"installed_by": ["modules"]
136141
}
137142
}
138143
},

nextflow_schema.json

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -123,10 +123,9 @@
123123
},
124124
"ensemblvep_cache": {
125125
"type": "string",
126-
"format": "directory-path",
127126
"fa_icon": "fas fa-cloud-download-alt",
128-
"description": "Path to VEP cache which should contain the relevant species, genome and build directories. You have to use absolute paths to storage on Cloud infrastructure (i.e., s3://my-reference-data/cache/vep_cache/).",
129-
"help_text": "The VEP cache stores information about regulatory regions, attributes about different variants and transcripts, as well as predictions and scores for SIFT and PolyPhen-2."
127+
"description": "Path to VEP cache directory, or a .tar.gz archive of the cache. You have to use absolute paths to storage on Cloud infrastructure (i.e., s3://my-reference-data/cache/vep_cache/).",
128+
"help_text": "The VEP cache stores information about regulatory regions, attributes about different variants and transcripts, as well as predictions and scores for SIFT and PolyPhen-2. A pre-built .tar.gz archive will be automatically extracted before use."
130129
},
131130
"ensemblvep_cache_version": {
132131
"type": "string",

0 commit comments

Comments
 (0)