fulcrumgenomics · emmcauley · Mar 19, 2026 · Apr 1, 2026 · May 7, 2026 · Jun 8, 2026
diff --git a/.github/workflows/linting.yml b/.github/workflows/linting.yml
@@ -34,7 +34,7 @@ jobs:
         uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4
 
       - name: Install Nextflow
-        uses: nf-core/setup-nextflow@v2
+        uses: nf-core/setup-nextflow@6c2e22b4d901f0c42ca66c5069f8026df026d165 # v2
 
       - uses: actions/setup-python@a26af69be951a213d495a4c3e4e4022e16d87065 # v5
         with:

diff --git a/.github/workflows/twistgp_ci.yml b/.github/workflows/twistgp_ci.yml
@@ -38,13 +38,13 @@ jobs:
           fetch-depth: 0
 
       - name: Set up Nextflow
-        uses: nf-core/setup-nextflow@v2
+        uses: nf-core/setup-nextflow@6c2e22b4d901f0c42ca66c5069f8026df026d165 # v2
         with:
           version: "${{ matrix.NXF_VER }}"
 
       - name: Set up Apptainer
         if: matrix.profile == 'singularity'
-        uses: eWaterCycle/setup-apptainer@main
+        uses: eWaterCycle/setup-apptainer@3f706d898c9db585b1d741b4692e66755f3a1b40 # main
 
       - name: Set up Singularity
         if: matrix.profile == 'singularity'

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,9 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ### Added
 
+- `--ensemblvep_cache` now accepts either a pre-extracted cache directory or a `.tar.gz` archive; tarballs are automatically extracted before VEP runs
 - Added GATK4 FilterMutectCalls after Mutect2 variant calling
 - Added BCFTOOLS_VIEW pre-filtering step prior to TMB calculation
 - Added `--tmb_popaf_cutoff` and `--tmb_vaf_cutoff` parameters
+- Added `--skip_cnv`, `--skip_msi`, and `--skip_tmb` parameters to allow skipping CNV calling, MSI analysis, and TMB calculation respectively
 
 ## 1.1.0dev
 

diff --git a/README.md b/README.md
@@ -183,10 +183,35 @@ Prior to TMB calculation, annotated variants are pre-filtered using `bcftools vi
 The following parameters control these thresholds:
 
 - `--tmb_popaf_cutoff` (default: `3.0`): Minimum POPAF value (negative log10 of population allele frequency) to include a variant. The default of `3.0` corresponds to a population allele frequency of &le; 0.001 (0.1%), excluding common germline variants that are unlikely to be somatic. This value is derived from the Mutect2 `POPAF` INFO field.
-- `--tmb_vaf_cutoff` (default: `0.05`): Minimum variant allele frequency (FORMAT/AF) to include a variant. The default of `0.05` (5%) excludes very low frequency variants that may represent sequencing artifacts or sub-clonal noise, consistent with the [Friends of Cancer Research TMB Harmonization Project](https://friendsofcancerresearch.org/publication/in-silico-assessment-of-variation-in-tmb-quantification-across-diagnostic-platforms-phase-1-of-the-friends-of-cancer-research-harmonization-project/) recommendations.
+- `--tmb_vaf_cutoff` (default: `0.10`): Minimum variant allele frequency (FORMAT/AF) to include a variant. The [Friends of Cancer Research TMB Harmonization Project](https://friendsofcancerresearch.org/publication/in-silico-assessment-of-variation-in-tmb-quantification-across-diagnostic-platforms-phase-1-of-the-friends-of-cancer-research-harmonization-project/) recommends a minimum of 0.05 (5%). The default of 0.10 (10%) provides additional stringency to reduce sub-clonal noise in tumor-only analyses.
 
 </details>
 
+### Skipping Analysis Steps
+
+Individual analysis steps can be skipped using the following parameters:
+
+| Parameter        | Description                                                         |
+| ---------------- | ------------------------------------------------------------------- |
+| `--skip_cnv`     | Skip CNV calling with CNVkit                                        |
+| `--skip_msi`     | Skip microsatellite instability analysis (MSIsensor2/MSIsensor-pro) |
+| `--skip_tmb`     | Skip tumor mutational burden calculation (pyTMB)                    |
+| `--skip_civicpy` | Skip CIViCpy variant annotation                                     |
+
+For example, to run the pipeline without MSI and TMB:
+
+```console
+nextflow run twistcgp/main.nf \
+   -profile docker \
+   --input samplesheet.csv \
+   --fasta hg38_giab.fa \
+   --baits baits.bed \
+   --targets targets.bed \
+   --outdir results \
+   --skip_msi \
+   --skip_tmb
+```
+
 ### Variant Filtering with FilterMutectCalls
 
 Following variant calling with Mutect2, this pipeline applies [`FilterMutectCalls`](https://gatk.broadinstitute.org/hc/en-us/articles/360036856831-FilterMutectCalls) to annotate variant quality, consistent with [GATK Best Practices for somatic variant discovery](https://www.biorxiv.org/content/biorxiv/early/2019/12/02/861054/DC1/embed/media-1.pdf?download=true) (Benjamin et al., 2019).

diff --git a/assets/pytmb_vep.yml b/assets/pytmb_vep.yml
@@ -89,12 +89,13 @@ polymDb:
   gnomad:
     - gnomADe_AF
     - gnomADe_AFR_AF
-    - gnomADe_AMR_AF
+    - gnomAD_AMR_AF
     - gnomADe_ASJ_AF
     - gnomADe_EAS_AF
     - gnomADe_FIN_AF
     - gnomADe_MID_AF
     - gnomADe_NFE_AF
+    - gnomADe_REMAINING_AF
     - gnomADe_SAS_AF
     - gnomADg_AF
     - gnomADg_AFR_AF

diff --git a/conf/modules.config b/conf/modules.config
@@ -27,6 +27,7 @@ process {
     }
 
     withName: BCFTOOLS_VIEW {
+        ext.when = { params.skip_tmb == false }
         // NB: ext.args must include --write-index=tbi for downstream processes to run successfully
         ext.args = {
             [
@@ -53,8 +54,15 @@ process {
             saveAs: { filename -> filename != 'versions.yml' && params.save_reference ? filename : null },
         ]
     }
-    withName: CIVICPY {
-        ext.when = { params.skip_civicpy == false }
+
+    withName: CIVICPY_UPDATE_CACHE {
+        publishDir = [
+            path: { params.save_reference ? "${params.outdir}/reference" : params.outdir },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename != 'versions.yml' && params.save_reference ? filename : null },
+        ]
+    }
+    withName: CIVICPY_ANNOTATE_VCF {
         publishDir = [
             path: { "${params.outdir}/${meta.id}" },
             mode: params.publish_dir_mode,
@@ -63,6 +71,7 @@ process {
     }
 
     withName: CNVKIT_BATCH {
+        ext.when = { params.skip_cnv == false }
         // hmm-tumor performs well for relatively pure, high coverage tumor samples
         // alternative segmentation methods can be explored here:
         // https://cnvkit.readthedocs.io/en/stable/pipeline.html#segmentation-methods
@@ -74,6 +83,14 @@ process {
         ]
     }
 
+    withName: UNTAR_VEP_CACHE { // publishing rules for the extracted VEP cache
+        publishDir = [
+            path: { "${params.outdir}/reference/ensemblvep_cache" },
+            mode: params.publish_dir_mode,
+            saveAs: { filename -> filename != 'versions.yml' && params.save_reference ? filename : null }, // null => not published; only publishes when params.save_reference is set, and never versions.yml
+        ]
+    }
+
     withName: ENSEMBLVEP_DOWNLOAD {
         ext.when = { !params.ensemblvep_cache }
         ext.args = { '--AUTO c --CONVERT --NO_BIOPERL --NO_HTSLIB --NO_TEST --NO_UPDATE' }
@@ -85,13 +102,13 @@ process {
     }
 
     withName: 'ENSEMBLVEP_VEP' {
-        ext.args = "--vcf --af_gnomade --af_1kg" + (params.cosmic_vcf ? " --custom ${params.cosmic_vcf},COSMIC,vcf,exact,0,ID" : "")
+        ext.args = "--format vcf --vcf --af_gnomade --af_1kg" + (params.cosmic_vcf ? " --custom ${params.cosmic_vcf},COSMIC,vcf,exact,0,ID" : "")
         ext.prefix = { "${meta.id}.vep" }
         publishDir = [
             [
                 mode: params.publish_dir_mode,
                 path: { "${params.outdir}/${meta.id}/" },
-                pattern: "*{gz,html}",
+                pattern: "*{gz,gz.tbi,html}",
             ]
         ]
     }
@@ -106,8 +123,9 @@ process {
     }
 
     withName: GATK4_FILTERMUTECTCALLS {
+        ext.prefix = { "${meta.id}.labeled" }
         publishDir = [
-            path: { "${params.outdir}/${meta.id}.labeled" },
+            path: { "${params.outdir}/${meta.id}" },
             mode: params.publish_dir_mode,
             pattern: "*{vcf.gz,vcf.gz.tbi,filteringStats.tsv}",
         ]
@@ -148,6 +166,7 @@ process {
     }
 
     withName: MSISENSOR2_MSI {
+        ext.when = { params.skip_msi == false }
         // NB: The module outputs the summary file as "${prefix}"
         ext.prefix = { "${meta.id}.msi" }
         publishDir = [
@@ -158,6 +177,7 @@ process {
     }
 
     withName: MSISENSORPRO_PRO {
+        ext.when = { params.skip_msi == false }
         // NB: The module outputs the summary file as "${prefix}"
         ext.prefix = { "${meta.id}.msi" }
         publishDir = [
@@ -266,8 +286,11 @@ process {
     }
 
     withName: TMB {
+        ext.when = { params.skip_tmb == false }
+        errorStrategy = { task.attempt <= task.maxRetries ? 'retry' : 'finish' }
+        maxRetries = 3
         // VAF, minDepth, and minAltDepth recommended by: https://friendsofcancerresearch.org/publication/in-silico-assessment-of-variation-in-tmb-quantification-across-diagnostic-platforms-phase-1-of-the-friends-of-cancer-research-harmonization-project/
-        ext.args = "--polymDb gnomad --filterPolym --vaf 0.05 --minDepth 25 --minAltDepth 3 --filterLowQual --filterIndels --filterNonCoding --filterSyn --maf 0.01"
+        // Closure form defers evaluation to task time so that params.tmb_vaf_cutoff overrides from later-loaded configs are honoured
+        ext.args = { "--polymDb gnomad --filterPolym --vaf ${params.tmb_vaf_cutoff} --minDepth 25 --minAltDepth 3 --filterLowQual --filterIndels --filterNonCoding --filterSyn --maf 0.01" }
         publishDir = [
             path: { "${params.outdir}/${meta.id}" },
             mode: params.publish_dir_mode,
@@ -284,7 +307,15 @@ process {
         ]
     }
 
-    withName: TABIX_BGZIPTABIX {
+    withName: '.*VCF_ANNOTATE_ENSEMBLVEP:TABIX_TABIX' {
+        publishDir = [
+            mode: params.publish_dir_mode,
+            path: { "${params.outdir}/${meta.id}/" },
+            pattern: "*.tbi",
+        ]
+    }
+
+    withName: '.*VCF_ANNOTATE_SNPEFF:TABIX_BGZIPTABIX' {
         ext.prefix = { "${meta.id}.snpeff" }
         publishDir = [
             mode: params.publish_dir_mode,
@@ -293,8 +324,8 @@ process {
         ]
     }
 
-    withName: TABIX_TABIX {
-        ext.prefix = { "${meta.id}.snpeff" }
+    withName: 'TWISTCGP:TABIX_BGZIPTABIX' {
+        ext.prefix = { "${meta.id}.civic" }
         publishDir = [
             mode: params.publish_dir_mode,
             path: { "${params.outdir}/${meta.id}/" },

diff --git a/conf/test.config b/conf/test.config
@@ -42,22 +42,15 @@ process {
         // FilterMutectCalls fails with the minimal test data
         ext.when = { false }
     }
-    withName: "TMB" {
-        // This process runs after SNPEFF_SNPEFF, which we are also skipping
-        ext.when = { false }
-    }
-    withName: "MSISENSOR2_MSI" {
-        // TODO: Create small test data to use in integration testing
-        ext.when = { false }
-    }
-    withName: "MSISENSORPRO_PRO" {
-        // This process fails with the empty output from MSISENSOR2_SCAN
-        // TODO: Create small test data to use in integration testing
+    withName: "CIVICPY_UPDATE_CACHE" {
+        // Downloading the CIViC cache is too slow for CI
         ext.when = { false }
     }
 }
 
 params {
+    skip_tmb = true
+    skip_msi = true
     config_profile_name = 'Test profile'
     config_profile_description = 'Minimal test dataset to check pipeline function'
 

diff --git a/docs/variant_annotation.md b/docs/variant_annotation.md
@@ -12,26 +12,61 @@ The CIViCpy annotation cache is small and is downloaded on each pipeline run. If
 
 ## Ensembl Variant Effect Predictor (VEP) cache
 
-1. The quickest way to download the VEP cache is with `wget` and `tar`:
+The pipeline accepts either a pre-extracted cache directory or a `.tar.gz` archive via `--ensemblvep_cache`.
+If a tarball is supplied, it will be automatically extracted before VEP runs.
+Only archives whose file name ends in `.tar.gz` are recognized as tarballs; other extensions (`.tgz`, `.tar.bz2`) are not supported.
+
+The version and build of the cache must match the `--ensemblvep_cache_version` and `--annotation_genome_version` parameters provided to the pipeline.
+
+### Option 1: Download with wget (recommended)
+
+1. Download the cache tarball:
 
 ```console
 wget https://ftp.ensembl.org/pub/release-114/variation/indexed_vep_cache/homo_sapiens_vep_114_GRCh38.tar.gz
+```
+
+2. Pass the tarball directly to the pipeline:
+
+```console
+nextflow run twistcgp/main.nf \
+   -profile <docker/singularity/conda> \
+   --fasta hg38.fa \
+   --input samplesheet.csv \
+   --baits baits.bed \
+   --targets targets.bed \
+   --ensemblvep_cache homo_sapiens_vep_114_GRCh38.tar.gz \
+   --outdir <OUTDIR>
+```
+
+Or extract it first and pass the directory that contains the extracted `homo_sapiens/` subdirectory:
+
+```console
-tar -xzf homo_sapiens_vep_114_GRCh38.tar.gz
+mkdir -p vep_cache
+tar -xzf homo_sapiens_vep_114_GRCh38.tar.gz -C vep_cache
+
+nextflow run twistcgp/main.nf \
+   -profile <docker/singularity/conda> \
+   --fasta hg38.fa \
+   --input samplesheet.csv \
+   --baits baits.bed \
+   --targets targets.bed \
+   --ensemblvep_cache vep_cache/ \
+   --outdir <OUTDIR>
 ```
 
-The version and build you choose should match the `--ensemblvep_cache_version` and `--annotation_genome_version` arguments provided to the pipeline, respectively.
+### Option 2: Download with the VEP installer
 
-2. Alternatively, install Ensembl VEP which is available directly from [github.com/ensembl-vep](https://github.com/Ensembl/ensembl-vep.git) or install with mamba/conda, [bioconda::ensembl-vep](https://anaconda.org/bioconda/ensembl-vep). If using conda, activate your environment.
+1. Install Ensembl VEP, available directly from [github.com/ensembl-vep](https://github.com/Ensembl/ensembl-vep.git) or via mamba/conda ([bioconda::ensembl-vep](https://anaconda.org/bioconda/ensembl-vep)). If using conda, activate your environment.
 
-3. Download the cache with Ensembl VEP, making sure that the genome version and database version match the pipeline parameters.
+2. Download the cache, making sure the genome version and database version match the pipeline parameters.
 
-Please note that this download is rate-limited, and will take much longer than `wget`.
+Please note that this download is rate-limited and will take much longer than `wget`.
 
 ```console
 vep_install -a cf -s homo_sapiens -y GRCh38 -c ~/vep --CONVERT
 ```
 
-3. Pass the cache to the pipeline:
+3. Pass the cache directory to the pipeline:
 
 ```console
 nextflow run twistcgp/main.nf \
@@ -44,8 +79,7 @@ nextflow run twistcgp/main.nf \
    --outdir <OUTDIR>
 ```
 
-Note that the path provided to `--ensemblvep_cache` should be the parent directory of the parent directory of the cache files.
-In this example, it would be `--ensemblvep_cache ~/vep/`:
+Note that `--ensemblvep_cache` should point to the directory containing the `homo_sapiens/` subdirectory:
 
 ```console
 $ tree -L 1 ~/vep/

diff --git a/main.nf b/main.nf
@@ -19,6 +19,7 @@ include { PIPELINE_COMPLETION } from './subworkflows/local/utils_nfcore_twistcgp
 include { PREPARE_GENOME } from './subworkflows/local/prepare_genome'
 include { PREPARE_ANNOTATION_DB } from './subworkflows/local/prepare_annotation_db'
 include { PREPARE_INDICES } from './subworkflows/local/prepare_indices'
+include { UNTAR_VEP_CACHE } from './modules/local/untar_vep_cache/main'
 /*
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
     RUN MAIN WORKFLOW
@@ -69,8 +70,6 @@ workflow {
             params.ensemblvep_cache_version,
         )
     )
-    ensemblvep_cache = params.ensemblvep_cache ? file(params.ensemblvep_cache) : []
-
     ch_cosmic_vcf = Channel.value(
         tuple([id: 'cosmic_vcf'], params.cosmic_vcf ? file(params.cosmic_vcf) : [])
     )
@@ -93,7 +92,6 @@ workflow {
         snpeff_cache,
         tmb_mutect2_config,
         tmb_vep_config,
-        ensemblvep_cache,
         ch_cosmic_vcf,
         ch_gnomad_vcf,
     )
@@ -130,7 +128,6 @@ workflow FULCRUMGENOMICS_TWISTCGP {
     snpeff_cache // channel: path(snpeff_cache)
     tmb_mutect2_config // required path to variant calling config file
     tmb_vep_config // required path to variant annotation config file
-    ensemblvep_cache // channel: path(ensemblvep_cache)
     ch_cosmic_vcf // optional val(reference meta), path(cosmic VCF)
     ch_gnomad_vcf // optional val(reference meta), path(gnomAD VCF)
 
@@ -166,9 +163,17 @@ workflow FULCRUMGENOMICS_TWISTCGP {
     ch_snpeff_cache = params.snpeff_cache
         ? Channel.fromPath(params.snpeff_cache).map { it -> [[id: 'snpeff_cache'], it] }.collect()
         : PREPARE_ANNOTATION_DB.out.snpeff_cache
-    ch_vep_cache = params.ensemblvep_cache
-        ? Channel.fromPath(params.ensemblvep_cache).map { it -> [[id: 'vep_cache'], it] }.collect()
-        : PREPARE_ANNOTATION_DB.out.ensemblvep_cache
+    if (params.ensemblvep_cache && params.ensemblvep_cache.endsWith('.tar.gz')) { // a value ending in .tar.gz is routed through UNTAR_VEP_CACHE
+        UNTAR_VEP_CACHE(
+            channel.value(file(params.ensemblvep_cache))
+                .map { it -> [[id: 'vep_cache'], it] } // pair the path with a fixed meta map
+        )
+        ch_vep_cache = UNTAR_VEP_CACHE.out.cache.first() // take a single emission from the module's `cache` output
+    } else { // any other value is used as a path as-is; with no value, fall back to the PREPARE_ANNOTATION_DB output
+        ch_vep_cache = params.ensemblvep_cache
+            ? channel.fromPath(params.ensemblvep_cache).collect { it -> [[id: 'vep_cache'], it] }
+            : PREPARE_ANNOTATION_DB.out.ensemblvep_cache
+    }
     ch_msi_scan = params.msisensor_scan
         ? Channel.fromPath(params.msisensor_scan).map { it -> [[id: 'scan'], it] }.collect()
         : (params.use_msisensor_pro_licensed ? PREPARE_GENOME.out.msi_scan : Channel.value([[id: 'scan'], []]))

diff --git a/modules/local/civicpy/environment.yml → ...es/local/civicpy/annotate/environment.yml b/modules/local/civicpy/environment.yml → ...es/local/civicpy/annotate/environment.yml