feat: pipeline accepts vep.tar.gz or vep/ dir

emmcauley · emmcauley · commit c0facc3fd52a · 2026-05-07T11:38:27.000-04:00
diff --git a/docs/variant_annotation.md b/docs/variant_annotation.md
@@ -12,15 +12,37 @@ The CIViCpy annotation cache is small and is downloaded on each pipeline run. If
 
 ## Ensembl Variant Effect Predictor (VEP) cache
 
-1. The quickest way to download the VEP cache is with `wget` and `tar`:
+The pipeline accepts either a pre-extracted cache directory or a `.tar.gz` archive.
+
+If the supplied path ends in `.tar.gz`, it will be automatically extracted before VEP runs; any other path is treated as an already-extracted cache directory.
+
+1. The quickest way to download the VEP cache is with `wget`:
 
 ```console
 wget https://ftp.ensembl.org/pub/release-114/variation/indexed_vep_cache/homo_sapiens_vep_114_GRCh38.tar.gz
-tar -xzf homo_sapiens_vep_114_GRCh38.tar.gz
 ```
 
 The version and build you choose should match the `--ensemblvep_cache_version` and `--annotation_genome_version` arguments provided to the pipeline, respectively.
 
+You can pass the tarball directly to the pipeline:
+
+```console
+nextflow run twistcgp/main.nf \
+   -profile <docker/singularity/conda> \
+   --fasta hg38.fa \
+   --input samplesheet.csv \
+   --baits baits.bed \
+   --targets targets.bed \
+   --ensemblvep_cache homo_sapiens_vep_114_GRCh38.tar.gz \
+   --outdir <OUTDIR>
+```
+
+Or extract it first and pass the extracted cache directory to `--ensemblvep_cache` instead:
+
+```console
+tar -xzf homo_sapiens_vep_114_GRCh38.tar.gz
+```
+
 2. Alternatively, install Ensembl VEP which is available directly from [github.com/ensembl-vep](https://github.com/Ensembl/ensembl-vep.git) or install with mamba/conda, [bioconda::ensembl-vep](https://anaconda.org/bioconda/ensembl-vep). If using conda, activate your environment.
 
 3. Download the cache with Ensembl VEP, making sure that the genome version and database version match the pipeline parameters.
@@ -31,7 +53,7 @@ Please note that this download is rate-limited, and will take much longer than `
 vep_install -a cf -s homo_sapiens -y GRCh38 -c ~/vep --CONVERT
 ```
 
-3. Pass the cache to the pipeline:
+4. Pass the cache directory to the pipeline:
 
 ```console
 nextflow run twistcgp/main.nf \
diff --git a/main.nf b/main.nf
@@ -19,6 +19,7 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_twistcgp
 include { PREPARE_GENOME } from './subworkflows/local/prepare_genome'
 include { PREPARE_ANNOTATION_DB } from './subworkflows/local/prepare_annotation_db'
 include { PREPARE_INDICES } from './subworkflows/local/prepare_indices'
+include { UNTAR as UNTAR_VEP_CACHE } from './modules/nf-core/untar/main'
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
@@ -166,9 +167,18 @@ workflow FULCRUMGENOMICS_TWISTCGP {
     ch_snpeff_cache = params.snpeff_cache
         ? Channel.fromPath(params.snpeff_cache).map { it -> [[id: 'snpeff_cache'], it] }.collect()
         : PREPARE_ANNOTATION_DB.out.snpeff_cache
-    ch_vep_cache = params.ensemblvep_cache
-        ? Channel.fromPath(params.ensemblvep_cache).map { it -> [[id: 'vep_cache'], it] }.collect()
-        : PREPARE_ANNOTATION_DB.out.ensemblvep_cache
+    if (params.ensemblvep_cache && params.ensemblvep_cache.endsWith('.tar.gz')) {
+        UNTAR_VEP_CACHE(
+            Channel.fromPath(params.ensemblvep_cache)
+                .map { it -> [[id: 'vep_cache'], it] }
+                .collect()
+        )
+        ch_vep_cache = UNTAR_VEP_CACHE.out.untar.collect()
+    } else {
+        ch_vep_cache = params.ensemblvep_cache
+            ? Channel.fromPath(params.ensemblvep_cache).map { it -> [[id: 'vep_cache'], it] }.collect()
+            : PREPARE_ANNOTATION_DB.out.ensemblvep_cache
+    }
     ch_msi_scan = params.msisensor_scan
         ? Channel.fromPath(params.msisensor_scan).map { it -> [[id: 'scan'], it] }.collect()
         : (params.use_msisensor_pro_licensed ? PREPARE_GENOME.out.msi_scan : Channel.value([[id: 'scan'], []]))
diff --git a/modules.json b/modules.json
@@ -133,6 +133,11 @@
                         "branch": "master",
                         "git_sha": "a96f42708b631dfc736f48980dbf2be418dd784b",
                         "installed_by": ["modules", "vcf_annotate_ensemblvep"]
+                    },
+                    "untar": {
+                        "branch": "master",
+                        "git_sha": "6d46786420b4d7bc88eba026eb389c0c5535d120",
+                        "installed_by": ["modules"]
                     }
                 }
             },
diff --git a/nextflow_schema.json b/nextflow_schema.json
@@ -123,10 +123,9 @@
                 },
                 "ensemblvep_cache": {
                     "type": "string",
-                    "format": "directory-path",
                     "fa_icon": "fas fa-cloud-download-alt",
-                    "description": "Path to VEP cache which should contain the relevant species, genome and build directories. You have to use absolute paths to storage on Cloud infrastructure (i.e., s3://my-reference-data/cache/vep_cache/).",
-                    "help_text": "The VEP cache stores information about regulatory regions, attributes about different variants and transcripts, as well as predictions and scores for SIFT and PolyPhen-2."
+                    "description": "Path to VEP cache directory, or a .tar.gz archive of the cache. You have to use absolute paths to storage on Cloud infrastructure (e.g., s3://my-reference-data/cache/vep_cache/).",
+                    "help_text": "The VEP cache stores information about regulatory regions, attributes about different variants and transcripts, as well as predictions and scores for SIFT and PolyPhen-2. A pre-built .tar.gz archive will be automatically extracted before use."
                 },
                 "ensemblvep_cache_version": {
                     "type": "string",

Original file line number	Diff line number	Diff line change
`@@ -133,6 +133,11 @@`
`133`	`133`	`"branch": "master",`
`134`	`134`	`"git_sha": "a96f42708b631dfc736f48980dbf2be418dd784b",`
`135`	`135`	`"installed_by": ["modules", "vcf_annotate_ensemblvep"]`
	`136`	`+ },`
	`137`	`+ "untar": {`
	`138`	`+ "branch": "master",`
	`139`	`+ "git_sha": "6d46786420b4d7bc88eba026eb389c0c5535d120",`
	`140`	`+ "installed_by": ["modules"]`
`136`	`141`	`}`
`137`	`142`	`}`
`138`	`143`	`},`