diff --git a/.claude/skills/prepare-edit-inputs/SKILL.md b/.claude/skills/prepare-edit-inputs/SKILL.md index 253c66b7..58d9c1cf 100644 --- a/.claude/skills/prepare-edit-inputs/SKILL.md +++ b/.claude/skills/prepare-edit-inputs/SKILL.md @@ -10,13 +10,16 @@ Goal: produce a valid `input_config.json` (+ per-sample expected-edit JSONs) tha ## 0. Preflight: is the reference data present? -Before anything else, confirm `scripts/setup.sh` has been run (see AGENTS.md guardrail 1): -- Check `hifi-wdl-resources-v3.1.0/` exists and the paths inside +Before anything else, confirm the resource bundle has been fetched (see AGENTS.md guardrail 1). +`scripts/setup.sh` (or `scripts/fetch_resources.sh` directly) pulls the frozen Zenodo bundle +into `editing-qc-resources-v3.1.0/` and writes populated map files +`GRCh38.{ref,tertiary,somatic}_map.v3p1p0.template.tsv` at the repo root: +- Check `editing-qc-resources-v3.1.0/` exists and the absolute paths inside `GRCh38.ref_map.v3p1p0.template.tsv` resolve to real files. -- **`` placeholders:** unmodified templates contain `/...` paths. - `setup.sh` strips these (`sed "s/\///g"`). If you still see `` in a map - file, the template was never populated — tell the user to run `scripts/setup.sh`. Do not - hand-edit placeholders into guessed absolute paths. +- **`` placeholders:** the in-bundle `*.template.tsv` files contain `/...` + paths; `fetch_resources.sh` substitutes the absolute bundle path when writing the populated + `*.template.tsv`. If you still see `` in a populated map file, the fetch never ran — tell the + user to run `scripts/setup.sh`. Do not hand-edit placeholders into guessed absolute paths. ## Interpreting a sample request into samples @@ -73,15 +76,17 @@ Resolution rules that recur regardless of format: ## 1. dbNSFP licensing (do this once, flag every time) -`setup.sh` auto-downloads **dbNSFP v4.9a** as a *fallback only*. v4.x omits columns that are -license-gated for commercial use. **Academic users should register at -https://www.dbnsfp.org/download and obtain dbNSFP v5.3+**, then: -- place the indexed file alongside the other references, -- update `DBNSFP_VERSION` in `setup.sh` (or the download step) and -- update the `dbnsfp` entry in `GRCh38.somatic_map.v3p1p0.template.tsv` to point at it. +dbNSFP is **license-gated and deliberately NOT in the frozen bundle** (academic-free / +commercial-licensed). The pinned version lives in `resources/manifest.tsv` (`dbnsfp_ver`). +**Academic users register at https://www.dbnsfp.org/download, obtain the pinned version**, +build the bgzip+tabix `_grch38.gz` file, then re-run the fetch step pointing at it: +- `./scripts/fetch_resources.sh --dbnsfp /path/to/dbNSFP_grch38.gz` (`.tbi` alongside) — + this patches the `dbnsfp_file`/`dbnsfp_file_index` entries in the populated + `GRCh38.somatic_map.v3p1p0.template.tsv`. +- To change the pinned version, edit `dbnsfp_ver` in `resources/manifest.tsv`. The agent cannot download a registered file — surface this to the user and confirm which -dbNSFP version their somatic map points at. +dbNSFP version their somatic map points at. Until provided, those entries are placeholders. ## 2. Expected-edit JSON from a Benchling GenBank file diff --git a/.gitignore b/.gitignore index 650eab7c..33381b5b 100644 --- a/.gitignore +++ b/.gitignore @@ -45,3 +45,5 @@ workflows/snakemake .env .venv .env +editing-qc-resources-*/ +GRCh38.*_map.*.template.tsv diff --git a/AGENTS.md b/AGENTS.md index 05c5f248..93a6bdd6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -16,10 +16,17 @@ cluster**. ## Hard guardrails (must hold even when no skill is invoked) 1. **Verify prerequisite reference data before doing anything that runs the workflow.** - `scripts/setup.sh` downloads ~200 GB into `hifi-wdl-resources-v3.1.0/` and populates the - map templates (`GRCh38.{ref,tertiary,somatic}_map.v3p1p0.template.tsv`). If the bundle dir - or the paths referenced inside the map files are **missing, STOP and tell the user to run - `scripts/setup.sh`** — never fabricate or guess reference paths. + `scripts/setup.sh` (→ `scripts/fetch_resources.sh`) pulls a **single frozen Zenodo bundle** + (resumable, checksum-verified) into `editing-qc-resources-v3.1.0/` and writes populated map + files `GRCh38.{ref,tertiary,somatic}_map.v3p1p0.template.tsv` at the repo root with absolute + paths (same filename as before, now holding substituted absolute paths). + If the bundle dir or the paths referenced inside the map files are **missing, STOP and tell + the user to run `scripts/setup.sh`** — never fabricate or guess reference paths. + - Versions are pinned in `resources/manifest.tsv` (the single source of truth). + - **dbNSFP is license-gated and not in the bundle** — the user supplies it via + `fetch_resources.sh --dbnsfp`; until then its somatic-map entries are placeholders. + - Rebuilding the bundle for a new version is a **maintainer** task via + `scripts/build_resource_bundle.sh` (the slow upstream pull) — never run that during setup. 2. **Never read pipeline logs in full.** `workflow.log` and task stderr are enormous and highly repetitive. Always `grep -E "ERROR|FAILED|failed"`, `tail`, or scope to one @@ -89,7 +96,10 @@ cluster**. later, which gives cleaner variant-filtering statistics for publication. It should have no effect on the actual final reported variants. - `genbank_to_crispr_json.py` — Benchling GenBank → expected-edit JSON converter. -- `scripts/setup.sh` — one-time reference + container download. +- `resources/manifest.tsv` — **single source of truth** for resource versions/URLs/checksums. +- `scripts/setup.sh` — one-time setup: fetch frozen bundle (delegates to fetch_resources.sh) + build knock-knock container. +- `scripts/fetch_resources.sh` — user-facing: resumable, checksum-verified pull of the frozen Zenodo bundle. +- `scripts/build_resource_bundle.sh` — **maintainer-only**: slow upstream pull → frozen bundle tarball for Zenodo. - `scripts/launch.sh` — stages inputs and (optionally `--run`) launches the workflow. - `scripts/process_input_config.py` — BAM merge/strip/stage, called by launch.sh. - `scripts/create_image_manifest.sh` / `populate_miniwdl_singularity_cache.sh` — container prepull. diff --git a/GRCh38.somatic_map.v3p1p0.template.tsv b/GRCh38.somatic_map.v3p1p0.template.tsv index f451da76..bb88f6d3 100644 --- a/GRCh38.somatic_map.v3p1p0.template.tsv +++ b/GRCh38.somatic_map.v3p1p0.template.tsv @@ -1,9 +1,4 @@ -trf_bed hifisomatic_resources/human_GRCh38_no_alt_analysis_set.trf.bed -ref_bed hifisomatic_resources/chr.bed ref_gff hifisomatic_resources/Homo_sapiens.GRCh38.112.chr.reformatted.gff3 -control_vcf hifisomatic_resources/severus.jasmine.AN10.AC4.nosample.vcf.gz -control_vcf_index hifisomatic_resources/severus.jasmine.AN10.AC4.nosample.vcf.gz.tbi -severus_pon_tsv PoN_1000G_hg38_extended.tsv.gz vep_cache homo_sapiens_refseq_vep_115_GRCh38.tar.gz annotsv_cache annotsv_cache.tar.gz dbnsfp_file dbNSFP5.3.1a_grch38.gz diff --git a/docs/backend-hpc.md b/docs/backend-hpc.md index 9278ce6c..6a0bfff9 100644 --- a/docs/backend-hpc.md +++ b/docs/backend-hpc.md @@ -40,7 +40,7 @@ Cromwell supports a number of different HPC backends; see [Cromwell's documentat ### Filling out workflow inputs -Create an input configuration JSON describing your samples and their relationships. See [example_input_config.json](../example_input_config.json) as a template. After downloading reference data with `./scripts/setup.sh`, the template map files at the repository root will be populated with local paths. +Create an input configuration JSON describing your samples and their relationships. See [example_input_config.json](../example_input_config.json) as a template. After downloading reference data with `./scripts/setup.sh`, the populated map files (`GRCh38.{ref,tertiary,somatic}_map.v3p1p0.template.tsv`) at the repository root will hold absolute local paths into the frozen bundle. See [family.md](./family.md) for input structure details, or [biohub-setup.md](./biohub-setup.md) for biohub-specific instructions. @@ -80,12 +80,44 @@ cromwell run workflows/family.wdl --input ## Reference data bundle -[10.5281/zenodo.17086906](https://zenodo.org/records/17086906) +[10.5281/zenodo.20856447](https://zenodo.org/records/20856447) -Reference data is hosted on Zenodo. Use the provided setup script to download and configure: +Reference data (~46 GB) is hosted on Zenodo. The download is resumable and checksum-verified; it typically takes 15–30 minutes depending on network speed. + +Before running setup, collect two prerequisites: + +**1. Set `SINGULARITY_CACHEDIR`** to a location with sufficient space (the knock-knock container +is several GB). If unset, `setup.sh` skips the knock-knock build with a warning. A good default +is a directory under the repo root on scratch: ```bash +export SINGULARITY_CACHEDIR="$(pwd)/miniwdl_cache/singularity_cache" +mkdir -p "${SINGULARITY_CACHEDIR}" +``` + +**2. Obtain dbNSFP** (license-gated, not in bundle). dbNSFP is required for somatic variant +annotation. Request a licensed copy from [https://www.dbnsfp.org/download](https://www.dbnsfp.org/download). +You need the **GRCh38 BGZF files** listed under _"dbNSFP variants in BGZF format for VEP and +SnpEff annotation programs (sorted by GRCh38 and GRCh37 coordinates)"_: + +- `dbNSFP5.3.1a_grch38.gz` +- `dbNSFP5.3.1a_grch38.gz.tbi` + +Have the path ready before running setup — passing it via `--dbnsfp` avoids re-running the +full download a second time. If you cannot obtain dbNSFP yet, setup will complete with +placeholders in the somatic map. + +Then download the bundle (~46 GB, resumable, typically 15–30 min) and build the knock-knock container: + +```bash +# Recommended: pass dbNSFP in the same invocation +./scripts/setup.sh --dbnsfp /path/to/dbNSFP5.3.1a_grch38.gz + +# Without dbNSFP (somatic map will have placeholders): ./scripts/setup.sh + +# Resources only, no knock-knock build: +./scripts/fetch_resources.sh --dbnsfp /path/to/dbNSFP5.3.1a_grch38.gz ``` -This downloads ~200GB of reference files and updates the template map files with local paths. The process takes several hours. +All commands extract the bundle under the repo root and write ready-to-use map files (`GRCh38.{ref,tertiary,somatic}_map.v3p1p0.template.tsv`) with absolute paths. If a download is interrupted, re-run — it resumes from where it stopped. diff --git a/docs/biohub-setup.md b/docs/biohub-setup.md index 16eb366e..98d5f6d5 100644 --- a/docs/biohub-setup.md +++ b/docs/biohub-setup.md @@ -29,15 +29,49 @@ conda activate hifi-wdl Create or edit `~/.config/miniwdl.cfg` for your HPC environment. See [docs/backend-hpc.md](backend-hpc.md) for SLURM-specific configuration. -### 4. Download reference data +### 4. Download reference data and build containers -This downloads ~200GB of reference files and will take several hours: +Before running setup, collect two prerequisites: + +#### 4a. Set container cache location + +`setup.sh` builds the knock-knock Singularity image into `$SINGULARITY_CACHEDIR`. On Biohub +HPC the default home-directory cache is too small — point it at scratch storage first: ```bash -./scripts/setup.sh +export SINGULARITY_CACHEDIR="$(pwd)/miniwdl_cache/singularity_cache" +mkdir -p "${SINGULARITY_CACHEDIR}" ``` -This populates the reference map template files at the repository root with local paths. +If `SINGULARITY_CACHEDIR` is unset, `setup.sh` will skip the knock-knock build with a warning +and you will need to re-run it later with the variable set. + +#### 4b. Obtain dbNSFP (license-gated) + +dbNSFP is required for somatic variant annotation but is not in the bundle due to licensing. +Request a licensed copy from [https://www.dbnsfp.org/download](https://www.dbnsfp.org/download). +You need the **GRCh38 BGZF files** listed under _"dbNSFP variants in BGZF format for VEP and +SnpEff annotation programs (sorted by GRCh38 and GRCh37 coordinates)"_: + +- `dbNSFP5.3.1a_grch38.gz` +- `dbNSFP5.3.1a_grch38.gz.tbi` + +Have the path to `dbNSFP5.3.1a_grch38.gz` ready before running setup — passing it via +`--dbnsfp` avoids having to re-run the full download a second time. If you cannot obtain +dbNSFP yet, setup will complete with placeholders in the somatic map. + +#### 4c. Run setup + +Downloads ~46 GB from Zenodo (resumable; typically 15–30 min), writes map files with +absolute paths, and builds the knock-knock container: + +```bash +# Recommended: pass dbNSFP in the same invocation +./scripts/setup.sh --dbnsfp /path/to/dbNSFP5.3.1a_grch38.gz + +# Without dbNSFP (somatic map will have placeholders — patch later with fetch_resources.sh --dbnsfp): +./scripts/setup.sh +``` ## Preparing Input Files diff --git a/docs/tools_containers.md b/docs/tools_containers.md index d74bf909..b6ce756e 100644 --- a/docs/tools_containers.md +++ b/docs/tools_containers.md @@ -50,6 +50,5 @@ This fork adds specialized tools for CRISPR editing validation and somatic varia | --------: | ------------------- | :---: | | ensembl-vep |
  • VEP (variant annotation)
| [ensembl-vep@sha256:e7612ab7c2923f2b9a78592b939e74874cd29f7494d70ee7135c8303841b03a8](https://hub.docker.com/r/ensemblorg/ensembl-vep) | | annotsv |
  • AnnotSV (SV annotation)
| [annotsv@sha256:0c73fef5fa529b11e10bea0355480f01b56d0feb21af54cb9bbbd1f9f4c862a7](https://quay.io/repository/biocontainers/annotsv) | -| severus |
  • Severus (phased SV calling)
| [severus@sha256:fb4471e0504d564de78215ae15c081a1bb2022ad51e993eba92bc6fa5052a05d](https://quay.io/repository/biocontainers/severus) | | somatic_general_tools |
  • Somatic analysis utilities
| [somatic_general_tools@sha256:a25a2e62b88c73fa3c18a0297654420a4675224eb0cf39fa4192f8a1e92b30d6](https://quay.io/repository/pacbio/somatic_general_tools) | | chord |
  • CHORD (HRD prediction)
| [chord@sha256:9f6aa44ffefe3f736e66a0e2d7941d4f3e1cc6d848a9a11a17e85a6525e63a77](https://hub.docker.com/r/scwatts/chord) | diff --git a/resources/manifest.tsv b/resources/manifest.tsv new file mode 100644 index 00000000..2320724e --- /dev/null +++ b/resources/manifest.tsv @@ -0,0 +1,42 @@ +# manifest.tsv — single source of truth for resource versions/provenance. +# +# This pins every upstream resource that build_resource_bundle.sh pulls and +# freezes into the bundle tarball. Bump a version HERE, re-run +# build_resource_bundle.sh, re-upload to Zenodo, update BUNDLE_* below. Users +# never read this file — they only run fetch_resources.sh against the frozen +# Zenodo record. +# +# Columns: keyvalue +# +# --- bundle identity (what fetch_resources.sh downloads) ---------------------- +bundle_version v3.1.0 +bundle_tar editing-qc-resources-v3.1.0.tar +# Zenodo record holding the frozen bundle. +bundle_zenodo_record 20856447 +# sha256 of the bundle tar; written by build_resource_bundle.sh, checked by fetch. +bundle_sha256 2fb5f71f6af9c69cfdf34edc4e75a208fb6db8fdfe48a8bfffa504e938a77763 +# +# --- redistributable upstreams (frozen INTO the bundle) ----------------------- +# key value +hifi_wdl_resources_ver v3.1.0 +hifi_wdl_resources_url https://zenodo.org/records/17086906/files/hifi-wdl-resources-v3.1.0.tar +hg002_fasta_url https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/HG002/assemblies/hg002v1.1.fasta.gz +hg002_chain_url https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/HG002/assemblies/changes/hg002v1.1_to_GRCh38.chain.gz +# hifisomatic suite — only the reformatted Ensembl GFF3 (ref_gff) is extracted into +# the bundle; the rest belonged to the removed Severus somatic-calling path. +hifisomatic_resources_url https://zenodo.org/record/14847828/files/hifisomatic_resources.tar.gz +# AnnotSV: install-script commit AND the (annotation_ver, exomiser_ver) args are +# pinned together so the cache always matches the AnnotSV container in the pipeline. +annotsv_install_commit b270de3f6db45e4c4ad6b32e7fc868f2369b62c3 +annotsv_annotation_ver 3.5 +annotsv_exomiser_ver 2406 +vep_ver 115 +vep_cache_url https://ftp.ensembl.org/pub/release-115/variation/indexed_vep_cache/homo_sapiens_refseq_vep_115_GRCh38.tar.gz +clinvar_vcf_url https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz +clinvar_tbi_url https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi +# +# --- license-gated, NOT in the bundle (user obtains separately) --------------- +# dbNSFP: academic-free / commercial-licensed. fetch_resources.sh prints +# instructions and patches the somatic map; the file is never re-hosted. +dbnsfp_ver 5.3.1a +dbnsfp_download_page https://www.dbnsfp.org/download diff --git a/scripts/README.md b/scripts/README.md index 3fc3cbcc..116dd477 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -186,10 +186,26 @@ ls -la outputs/ 4. **Input Files**: Ensure all file paths in your input configuration JSON are accessible -5. **Reference Data**: Download the reference bundle from Zenodo if using local paths +5. **Reference Data**: Download the frozen reference bundle (~46 GB) from Zenodo (resumable, + checksum-verified, typically 15–30 min) and build the knock-knock container. + `setup.sh` reads `$SINGULARITY_CACHEDIR` for the container build — set it to a path with + sufficient space before running (unset = knock-knock build is skipped with a warning): ```bash - ./scripts/setup.sh + export SINGULARITY_CACHEDIR="$(pwd)/miniwdl_cache/singularity_cache" + mkdir -p "${SINGULARITY_CACHEDIR}" + ./scripts/setup.sh # bundle + knock-knock image + # or just the resources: ./scripts/fetch_resources.sh [--dbnsfp FILE] ``` + Versions are pinned in `resources/manifest.tsv`. **dbNSFP is license-gated and not in the + bundle.** Obtain `dbNSFP5.3.1a_grch38.gz` + `dbNSFP5.3.1a_grch38.gz.tbi` from + [https://www.dbnsfp.org/download](https://www.dbnsfp.org/download) (look for the GRCh38 + BGZF files under _"dbNSFP variants in BGZF format for VEP and SnpEff"_), then pass + `--dbnsfp` to either `setup.sh` or `fetch_resources.sh`: + ```bash + ./scripts/setup.sh --dbnsfp /path/to/dbNSFP5.3.1a_grch38.gz + ``` + Until then, `dbnsfp_file` entries in the somatic map are left as placeholders. Maintainers + rebuild the bundle for a new version with `./scripts/build_resource_bundle.sh`. ### Advanced Usage Examples diff --git a/scripts/build_resource_bundle.sh b/scripts/build_resource_bundle.sh new file mode 100755 index 00000000..5837d3db --- /dev/null +++ b/scripts/build_resource_bundle.sh @@ -0,0 +1,228 @@ +#!/bin/bash +# build_resource_bundle.sh — MAINTAINER-ONLY. Pull every redistributable resource +# from its (slow, flaky) upstream, transform it, lay it out under ONE tree, and +# tar it into the frozen bundle that gets uploaded to Zenodo. Users never run +# this — they run scripts/fetch_resources.sh against the frozen record. +# +# Run this only when bumping versions. Edit resources/manifest.tsv first. +# +# ./scripts/build_resource_bundle.sh [--out-dir DIR] [--keep-extracted] +# +# Produces: /editing-qc-resources-/ (the tree) +# /editing-qc-resources-.tar (upload THIS to Zenodo) +# and prints the bundle sha256 to paste into manifest.tsv. +# +# dbNSFP is license-gated and is deliberately NOT included — see manifest.tsv. +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +MANIFEST="${REPO_ROOT}/resources/manifest.tsv" +OUT_DIR="${REPO_ROOT}" +KEEP_EXTRACTED=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --out-dir) OUT_DIR="$2"; shift 2;; + --keep-extracted) KEEP_EXTRACTED=1; shift;; + -h|--help) sed -n '2,17p' "$0"; exit 0;; + *) echo "Unknown arg: $1" >&2; exit 1;; + esac +done + +# --- tiny manifest reader ----------------------------------------------------- +m() { # m KEY -> value (first match), errors if missing + local v + v=$(grep -vE '^[[:space:]]*#' "$MANIFEST" | awk -F'\t' -v k="$1" '$1==k {print $2; exit}') + [[ -n "$v" ]] || { echo "manifest: key '$1' not found in $MANIFEST" >&2; exit 1; } + printf '%s' "$v" +} + +VER=$(m bundle_version) +VERp=${VER//./p} +TREE_NAME="editing-qc-resources-${VER}" +TREE="${OUT_DIR}/${TREE_NAME}" +TAR="${OUT_DIR}/${TREE_NAME}.tar" + +echo ">>> Building ${TREE_NAME} into ${OUT_DIR}" +mkdir -p "${TREE}" +cd "${TREE}" + +# Resumable fetch helper: aria2c (multi-conn) if present, else wget -c. +fetch() { # fetch URL [output_path] + local url="$1" out="${2:-}" + if command -v aria2c >/dev/null 2>&1; then + if [[ -n "$out" ]]; then aria2c -x8 -s8 -c -o "$out" "$url" + else aria2c -x8 -s8 -c "$url"; fi + else + if [[ -n "$out" ]]; then wget -c -O "$out" "$url" + else wget -c "$url"; fi + fi +} + +# Each step below guards on its final output so the build is idempotent: if it +# aborts partway (or you re-run after fixing one upstream), completed steps are +# skipped and you resume from the failure — no re-downloading the 200 GB of inputs. + +# --- 1. core reference suite (brings in the map templates + GRCh38 + slivar) -- +if [[ -d "hifi-wdl-resources-${VER}/GRCh38" ]]; then + echo ">>> hifi-wdl-resources ${VER} already extracted — skipping" +else + echo ">>> hifi-wdl-resources ${VER}" + fetch "$(m hifi_wdl_resources_url)" "hifi-wdl-resources-${VER}.tar" + tar -xf "hifi-wdl-resources-${VER}.tar" + rm -f "hifi-wdl-resources-${VER}.tar" +fi + +# --- 2. hg002 assembly + chain, indexed ------------------------------------- +if [[ -f "hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz.gzi" ]]; then + echo ">>> hg002 assembly already indexed — skipping" +else + echo ">>> hg002 assembly" + mkdir -p "hifi-wdl-resources-${VER}/hg002" + fetch "$(m hg002_chain_url)" "hifi-wdl-resources-${VER}/hg002/hg002v1.1_to_GRCh38.chain.gz" + fetch "$(m hg002_fasta_url)" "hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz" + module load samtools 2>/dev/null || true + samtools faidx "hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz" + bgzip -r "hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz" +fi + +# --- 3. somatic annotation reference ----------------------------------------- +# The family workflow's somatic ANNOTATION path consumes only one file from the +# hifisomatic suite — the reformatted Ensembl GFF3 (somatic_map "ref_gff"). The +# rest of the hifisomatic tarball (the 3 GB GRC-exclusions FASTA, refFlat, TRF +# bed, control VCF, chr.bed) and the Severus 1000G PoN belonged to the somatic +# SV-CALLING (Severus) path, which has been removed from the workflow — so we +# extract the GFF3 only and drop ~3.2 GB of dead weight from the bundle. +SOM_GFF="hifisomatic_resources/Homo_sapiens.GRCh38.112.chr.reformatted.gff3" +if [[ -f "$SOM_GFF" ]]; then + echo ">>> hifisomatic GFF3 already present — skipping" +else + echo ">>> hifisomatic GFF3 (ref_gff) — extracting only the file the workflow uses" + mkdir -p hifisomatic_resources + fetch "$(m hifisomatic_resources_url)" hifisomatic_resources.tar.gz + GFF_BASE="$(basename "$SOM_GFF")" + # extract just the GFF3 wherever it lives in the archive (GNU tar --wildcards) + tar -xzf hifisomatic_resources.tar.gz -C hifisomatic_resources --wildcards "*${GFF_BASE}" + rm -f hifisomatic_resources.tar.gz + # flatten to the expected path if the archive nested it in a subdir + [[ -f "$SOM_GFF" ]] || find hifisomatic_resources -name "$GFF_BASE" -exec mv {} "$SOM_GFF" \; + [[ -f "$SOM_GFF" ]] || { echo "ERROR: GFF3 not found in hifisomatic tarball" >&2; exit 1; } +fi + +# --- 4. AnnotSV annotation cache (pinned commit + version args) -------------- +if [[ -f annotsv_cache.tar.gz ]]; then + echo ">>> annotsv_cache.tar.gz already built — skipping" +else + echo ">>> AnnotSV annotations (commit $(m annotsv_install_commit))" + EXO_VER=$(m annotsv_exomiser_ver) + # Re-running INSTALL_annotations.sh re-downloads the ~12 GB Exomiser zip even + # when the cache is already on disk, so only run it if the cache is incomplete. + if [[ ! -d "AnnotSV_annotations/Annotations_Human" || ! -d "AnnotSV_annotations/Annotations_Exomiser/${EXO_VER}" ]]; then + fetch "https://raw.githubusercontent.com/lgmgeo/AnnotSV/$(m annotsv_install_commit)/bin/INSTALL_annotations.sh" INSTALL_annotations.sh + sed -i 's/rm -r AnnotSV/rm -rf AnnotSV/' INSTALL_annotations.sh + # Upstream bug in this pinned commit: the Exomiser download block references the + # variable ${EXOMISERVERSION} (no underscore) while the version is stored in + # ${EXOMISER_VERSION}. The empty expansion requests ".../data/_phenotype.zip" + # (404, a 162-byte error page) and unzip fails. Fix the typo'd token in place. + sed -i 's/EXOMISERVERSION/EXOMISER_VERSION/g' INSTALL_annotations.sh + chmod +x INSTALL_annotations.sh + bash INSTALL_annotations.sh "$(m annotsv_annotation_ver)" "${EXO_VER}" + else + echo " AnnotSV_annotations already complete — packaging existing cache" + fi + tar -czf annotsv_cache.tar.gz AnnotSV_annotations + rm -rf AnnotSV_annotations INSTALL_annotations.sh +fi + +# --- 5. VEP cache ------------------------------------------------------------ +echo ">>> VEP cache (release $(m vep_ver))" +[[ -f homo_sapiens_refseq_vep_$(m vep_ver)_GRCh38.tar.gz ]] || fetch "$(m vep_cache_url)" + +# --- 6. ClinVar -------------------------------------------------------------- +echo ">>> ClinVar" +[[ -f clinvar.vcf.gz ]] || fetch "$(m clinvar_vcf_url)" +[[ -f clinvar.vcf.gz.tbi ]] || fetch "$(m clinvar_tbi_url)" + +# --- 7. stage + normalize map templates to a uniform / scheme -------- +# Template sources differ: +# ref/tertiary ship INSIDE the hifi-wdl-resources-/ subdir (the tarball), +# already using "/hifi-wdl-resources-/..." paths. +# somatic is a maintained, git-tracked repo file with bare paths (it is +# NOT in the tarball). We read it from git HEAD so a prior +# fetch_resources.sh run (which writes a populated copy to the +# repo root under the same name) can't poison the build. +# We stage all three at the tree root, add the hg002 block to ref, and rewrite +# somatic's bare paths to "/...". fetch_resources.sh then swaps +# for the absolute extracted-tree path. dbNSFP entries stay as a manual placeholder. +TAB=$'\t' +SUBDIR="hifi-wdl-resources-${VER}" +REF_T="GRCh38.ref_map.${VERp}.template.tsv" +SOM_T="GRCh38.somatic_map.${VERp}.template.tsv" + +for t in ref tertiary; do + f="GRCh38.${t}_map.${VERp}.template.tsv" + if [[ ! -f "$f" ]]; then + [[ -f "${SUBDIR}/$f" ]] || { echo "missing $f (expected in ${SUBDIR}/ from the reference tarball)" >&2; exit 1; } + cp "${SUBDIR}/$f" "$f" + fi +done +if [[ ! -f "$SOM_T" ]]; then + git -C "${REPO_ROOT}" show "HEAD:${SOM_T}" > "$SOM_T" 2>/dev/null \ + || { echo "missing $SOM_T (maintained repo file, not in tarball; not found in git HEAD)" >&2; exit 1; } +fi + +# append hg002 entries (prefix-rooted) if not already present +if ! grep -q '^hg002_name' "$REF_T"; then + cat >> "$REF_T" </hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz +hg002_fasta_index${TAB}/hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz.fai +hg002_fasta_bgzf_index${TAB}/hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz.gzi +hg002_chain${TAB}/hifi-wdl-resources-${VER}/hg002/hg002v1.1_to_GRCh38.chain.gz +EOF +fi + +# somatic: prepend / to file-path values; leave scalars and dbNSFP alone +python3 - "$SOM_T" <<'PYEOF' +import sys +path_keys = {"ref_gff","vep_cache","annotsv_cache","clinvar_vcf","clinvar_vcf_index"} +f = sys.argv[1] +out = [] +for line in open(f): + if "\t" in line and not line.startswith("#"): + k, v = line.rstrip("\n").split("\t", 1) + if k in path_keys and not v.startswith("/"): + v = "/" + v + out.append(f"{k}\t{v}\n") + else: + out.append(line) +open(f, "w").writelines(out) +PYEOF + +cp "$MANIFEST" manifest.tsv + +# --- 8. checksums + tar ------------------------------------------------------- +echo ">>> Writing SHA256SUMS" +find . -type f ! -name SHA256SUMS -print0 | sort -z | xargs -0 sha256sum > SHA256SUMS + +cd "${OUT_DIR}" +echo ">>> Creating ${TAR}" +tar -cf "${TAR}" "${TREE_NAME}" +BUNDLE_SHA=$(sha256sum "${TAR}" | awk '{print $1}') + +if [[ ${KEEP_EXTRACTED} -eq 0 ]]; then rm -rf "${TREE}"; fi + +cat <${BUNDLE_SHA} + 2. Upload ${TREE_NAME}.tar to your Zenodo record, then set + bundle_zenodo_record to the new record id. + 3. Commit manifest.tsv. +dbNSFP is NOT in this bundle (license-gated); users obtain it via fetch_resources.sh. +================================================================================ +EOF diff --git a/scripts/fetch_resources.sh b/scripts/fetch_resources.sh new file mode 100755 index 00000000..fef098c0 --- /dev/null +++ b/scripts/fetch_resources.sh @@ -0,0 +1,109 @@ +#!/bin/bash +# fetch_resources.sh — USER-FACING. Pull the frozen resource bundle from Zenodo +# (resumable, checksum-verified), extract it under one tree, and write ready-to-use +# map files with absolute paths. No upstream servers, no transforms — all the slow, +# flaky work was done once by build_resource_bundle.sh. +# +# ./scripts/fetch_resources.sh [--dest DIR] [--dbnsfp FILE] [--keep-tar] +# +# --dest DIR where to extract the bundle (default: repo root) +# --dbnsfp FILE path to your licensed dbNSFP bgzip (+ .tbi alongside); the +# somatic map is patched to point at it. Omit to leave a clearly +# marked placeholder you fill in later. +# +# If a download is interrupted, just re-run — it resumes from where it stopped. +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +MANIFEST="${REPO_ROOT}/resources/manifest.tsv" +DEST="${REPO_ROOT}" +DBNSFP="" +KEEP_TAR=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --dest) DEST="$2"; shift 2;; + --dbnsfp) DBNSFP="$2"; shift 2;; + --keep-tar) KEEP_TAR=1; shift;; + -h|--help) sed -n '2,17p' "$0"; exit 0;; + *) echo "Unknown arg: $1" >&2; exit 1;; + esac +done + +m() { + local v + v=$(grep -vE '^[[:space:]]*#' "$MANIFEST" | awk -F'\t' -v k="$1" '$1==k {print $2; exit}') + [[ -n "$v" ]] || { echo "manifest: key '$1' not found in $MANIFEST" >&2; exit 1; } + printf '%s' "$v" +} + +VER=$(m bundle_version) +VERp=${VER//./p} +TAR_NAME=$(m bundle_tar) +RECORD=$(m bundle_zenodo_record) +WANT_SHA=$(m bundle_sha256) +TREE_NAME="editing-qc-resources-${VER}" +URL="https://zenodo.org/records/${RECORD}/files/${TAR_NAME}" + +mkdir -p "${DEST}" +cd "${DEST}" + +echo ">>> Fetching ${TAR_NAME} from Zenodo record ${RECORD} (resumable)" +if command -v aria2c >/dev/null 2>&1; then + aria2c -x8 -s8 -c -o "${TAR_NAME}" "${URL}" +else + echo " (install aria2c for faster multi-connection downloads; using wget -c)" + wget -c -O "${TAR_NAME}" "${URL}" +fi + +if [[ "${WANT_SHA}" != "TBD" ]]; then + echo ">>> Verifying bundle sha256" + echo "${WANT_SHA} ${TAR_NAME}" | sha256sum -c - || { + echo "ERROR: checksum mismatch. Delete ${DEST}/${TAR_NAME} and re-run." >&2; exit 1; } +else + echo "WARNING: manifest bundle_sha256 is TBD — skipping bundle verification." >&2 +fi + +echo ">>> Extracting" +tar -xf "${TAR_NAME}" +[[ ${KEEP_TAR} -eq 1 ]] || rm -f "${TAR_NAME}" + +TREE="$(cd "${TREE_NAME}" && pwd)" # absolute path + +if [[ -f "${TREE}/SHA256SUMS" ]]; then + echo ">>> Verifying extracted files against SHA256SUMS" + ( cd "${TREE}" && sha256sum -c --quiet SHA256SUMS ) +fi + +# --- write ready-to-use map files with absolute paths ------------------------ +# Output keeps the historical *.template.tsv name (now holding absolute, populated +# paths) so existing input configs and docs that reference it keep working — the +# in-bundle copy under ${TREE} is the pristine version. +echo ">>> Writing map files (absolute paths -> ${TREE})" +for kind in ref tertiary somatic; do + src="${TREE}/GRCh38.${kind}_map.${VERp}.template.tsv" + dst="${DEST}/GRCh38.${kind}_map.${VERp}.template.tsv" + [[ -f "$src" ]] || { echo "missing template $src in bundle" >&2; exit 1; } + sed "s||${TREE}|g" "$src" > "$dst" + echo " ${dst}" +done + +# --- dbNSFP (license-gated, not in bundle) ----------------------------------- +SOM="${DEST}/GRCh38.somatic_map.${VERp}.template.tsv" +if [[ -n "${DBNSFP}" ]]; then + DBN_ABS="$(cd "$(dirname "${DBNSFP}")" && pwd)/$(basename "${DBNSFP}")" + [[ -f "${DBN_ABS}" ]] || { echo "ERROR: --dbnsfp file not found: ${DBN_ABS}" >&2; exit 1; } + [[ -f "${DBN_ABS}.tbi" ]] || echo "WARNING: ${DBN_ABS}.tbi not found next to dbNSFP file" >&2 + sed -i -E "s|^(dbnsfp_file)\t.*|\1\t${DBN_ABS}|; s|^(dbnsfp_file_index)\t.*|\1\t${DBN_ABS}.tbi|" "${SOM}" + echo ">>> Patched dbNSFP path into ${SOM}" +else + cat >&2 <>> dbNSFP is license-gated and is NOT in the bundle. + Obtain dbNSFP $(m dbnsfp_ver) from $(m dbnsfp_download_page), build the + bgzip+tabix grch38 file, then re-run with: --dbnsfp /path/to/dbNSFP_grch38.gz + Until then, the dbnsfp_file entries in ${SOM} are placeholders. +EOF +fi + +echo ">>> Done. Map files are in ${DEST}; resources under ${TREE}" diff --git a/scripts/setup.sh b/scripts/setup.sh index d8998466..3d1e5ca7 100755 --- a/scripts/setup.sh +++ b/scripts/setup.sh @@ -1,111 +1,33 @@ #!/bin/bash - - -VER="v3.1.0" -VERp=${VER//./p} -FETCH_EXTRA=1 -ZENODO_BASE="https://zenodo.org/records/17086906" - -#download reference tarball for singleton -echo "Starting Reference suite download" -wget "${ZENODO_BASE}/files/hifi-wdl-resources-${VER}.tar" -echo "Reference Tarball Sucessfully downloaded" -echo "Extracting tarball" -tar -xvf hifi-wdl-resources-${VER}.tar -echo "Finished Extracting tarball" - -mkdir -p hifi-wdl-resources-${VER}/hg002 -wget -P hifi-wdl-resources-${VER}/hg002/ https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/HG002/assemblies/changes/hg002v1.1_to_GRCh38.chain.gz -wget -P hifi-wdl-resources-${VER}/hg002/ https://s3-us-west-2.amazonaws.com/human-pangenomics/T2T/HG002/assemblies/hg002v1.1.fasta.gz -mv hifi-wdl-resources-${VER}/hg002/hg002v1.1.fasta.gz hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz -module load samtools -samtools faidx hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz -bgzip -r hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz -if [[ ! -e GRCh38.ref_map.${VERp}.template.tsv.bak ]]; then - cp GRCh38.ref_map.${VERp}.template.tsv GRCh38.ref_map.${VERp}.template.tsv.bak - cp GRCh38.tertiary_map.${VERp}.template.tsv GRCh38.tertiary_map.${VERp}.template.tsv.bak - cp GRCh38.somatic_map.${VERp}.template.tsv GRCh38.somatic_map.${VERp}.template.tsv.bak -else - cp GRCh38.ref_map.${VERp}.template.tsv.bak GRCh38.ref_map.${VERp}.template.tsv - cp GRCh38.tertiary_map.${VERp}.template.tsv.bak GRCh38.tertiary_map.${VERp}.template.tsv - cp GRCh38.somatic_map.${VERp}.template.tsv.bak GRCh38.somatic_map.${VERp}.template.tsv -fi -sed -i "s/\///g" GRCh38.ref_map.${VERp}.template.tsv -sed -i "s/\///g" GRCh38.tertiary_map.${VERp}.template.tsv -sed -i "s/\///g" GRCh38.somatic_map.${VERp}.template.tsv -TAB=$'\t' -echo "hg002_name${TAB}HG002 -hg002_fasta${TAB}hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz -hg002_fasta_index${TAB}hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz.fai -hg002_fasta_bgzf_index${TAB}hifi-wdl-resources-${VER}/hg002/hg002v1.1.fa.gz.gzi -hg002_chain${TAB}hifi-wdl-resources-${VER}/hg002/hg002v1.1_to_GRCh38.chain.gz" >> GRCh38.ref_map.${VERp}.template.tsv -DBNSFP_VERSION="4.9a" - - -# Build knock-knock Singularity image (no pre-built container available) -if [[ -z "${SINGULARITY_CACHEDIR}" ]]; then - echo "WARNING: SINGULARITY_CACHEDIR is not set; skipping knock-knock container build" +# setup.sh — one-time setup entry point. +# +# 1. fetches the FROZEN resource bundle from Zenodo (resumable, checksummed) +# via scripts/fetch_resources.sh, and +# 2. builds the knock-knock Singularity image (no usable prebuilt container). +# +# Pass-through args (e.g. --dest DIR, --dbnsfp FILE) go to fetch_resources.sh. +# +# Maintainers rebuilding the bundle for a new resource version want +# scripts/build_resource_bundle.sh instead, not this script. +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/.." && pwd)" + +# --- 1. resources ------------------------------------------------------------ +"${SCRIPT_DIR}/fetch_resources.sh" "$@" + +# --- 2. knock-knock container ------------------------------------------------ +if [[ -z "${SINGULARITY_CACHEDIR:-}" ]]; then + echo "WARNING: SINGULARITY_CACHEDIR is not set; skipping knock-knock container build" >&2 else KK_SIF="${SINGULARITY_CACHEDIR}/docker___knock-knock.sif" if [[ ! -f "${KK_SIF}" ]]; then - echo "Building knock-knock Singularity image..." + echo ">>> Building knock-knock Singularity image..." mkdir -p "${SINGULARITY_CACHEDIR}" - apptainer build --fakeroot "${KK_SIF}" knock-knock.def + apptainer build --fakeroot "${KK_SIF}" "${REPO_ROOT}/knock-knock.def" echo "knock-knock image built: ${KK_SIF}" else echo "knock-knock Singularity image already exists: ${KK_SIF}" fi fi - -if [[ $FETCH_EXTRA -eq 1 ]]; then - echo "Starting somatic suite download" - wget https://zenodo.org/record/14847828/files/hifisomatic_resources.tar.gz - echo "Reference Tarball Sucessfully downloaded" - echo "Extracting tarball" - mkdir -p hifisomatic_resources - tar -xvzf hifisomatic_resources.tar.gz -C hifisomatic_resources - echo "Finished Extracting tarball" - - echo "Starting 1000G panel of normal from Severus GitHub" - wget https://github.com/KolmogorovLab/Severus/raw/refs/heads/main/pon/PoN_1000G_hg38_extended.tsv.gz - echo "Reference Tarball Sucessfully downloaded" - - - echo "Starting AnnotSV suite download" - wget https://raw.githubusercontent.com/lgmgeo/AnnotSV/b270de3f6db45e4c4ad6b32e7fc868f2369b62c3/bin/INSTALL_annotations.sh - echo "Install Script Sucessfully downloaded" - echo "Starting AnnotSV cache download" - sed -i 's/rm -r AnnotSV/rm -rf AnnotSV/' INSTALL_annotations.sh - chmod +x INSTALL_annotations.sh - bash INSTALL_annotations.sh "3.5" "2406" - echo "Finished download cache" - tar -czvf annotsv_cache.tar.gz AnnotSV_annotations - - echo "Starting VEP suite download" - wget https://ftp.ensembl.org/pub/release-115/variation/indexed_vep_cache/homo_sapiens_refseq_vep_115_GRCh38.tar.gz - echo "VEP cache successfully downloaded" - - echo "Starting dbNSFP download (BayesDel, REVEL, AlphaMissense, CADD)" - # dbNSFP requires registration and license agreement; academic use is free but commercial - # use requires a separate license. See: https://www.dbnsfp.org/download - # v4.9a is the most recent version available for automated download without registration. - # Newer versions (recommended) must be downloaded manually after obtaining access — update - # DBNSFP_VERSION and the dbnsfp_file entries in your somatic map accordingly. - if [[ "${DBNSFP_VERSION}" == "4.9a" ]]; then - echo "WARNING: downloading dbNSFP v4.9a (automated baseline). A newer version is available at https://www.dbnsfp.org/download — download it manually and update DBNSFP_VERSION and your somatic map." >&2 - fi - wget https://dbnsfp.s3.amazonaws.com/dbNSFP${DBNSFP_VERSION}.zip - unzip dbNSFP${DBNSFP_VERSION}.zip "dbNSFP${DBNSFP_VERSION}_variant.chr*.gz" "dbNSFP${DBNSFP_VERSION}_variant.chr*.gz.tbi" - # Concatenate chromosomes into single bgzipped file expected by VEP plugin - (zcat dbNSFP${DBNSFP_VERSION}_variant.chr1.gz | head -1; \ - for f in dbNSFP${DBNSFP_VERSION}_variant.chr*.gz; do zcat "$f" | tail -n +2; done) \ - | bgzip -c > dbNSFP${DBNSFP_VERSION}_grch38.gz - tabix -s 1 -b 2 -e 2 dbNSFP${DBNSFP_VERSION}_grch38.gz - rm -f dbNSFP${DBNSFP_VERSION}_variant.chr*.gz dbNSFP${DBNSFP_VERSION}_variant.chr*.gz.tbi dbNSFP${DBNSFP_VERSION}.zip - echo "dbNSFP successfully downloaded and indexed" - - echo "Starting ClinVar download" - wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz - wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi - echo "ClinVar successfully downloaded" -fi diff --git a/workflows/family.wdl b/workflows/family.wdl index 3976223a..3ec807e7 100755 --- a/workflows/family.wdl +++ b/workflows/family.wdl @@ -20,7 +20,6 @@ import "edit-qc/offtarget.wdl" as OffTarget import "edit-qc/knock_knock.wdl" as KnockKnock import "edit-qc/loh.wdl" as LOH import "somatic_ports/somatic_annotation.wdl" as Somatic_annotation -import "somatic_ports/somatic_calling.wdl" as Somatic_calling workflow humanwgs_family { meta { @@ -937,17 +936,6 @@ workflow humanwgs_family { Array[String]? stat_concerning_SNV_count = count_concerning_snv.row_count Array[String]? stat_concerning_SV_count = count_concerning_sv.row_count - # Somatic SV calling - # Array[File] Severus_somatic_vcf = select_all(select_first([phased_severus.output_vcf])) - # Array[File] Severus_all_vcf = select_all(select_first([phased_severus.output_all_vcf])) - # Array[File] Severus_breakpoint_cluster = select_all(select_first([phased_severus.output_breakpoint_clusters])) - # Array[File] Severus_breakpoint_cluster_all = select_all(select_first([phased_severus.output_breakpoint_clusters_all])) - # Array[File] Severus_cluster_plots = select_all(select_first([phased_severus.output_somatic_sv_plots])) - - # Array[File] Severus_tabix_vcf = select_first([tabix_vcf.output_vcf]) - # Array[File] Severus_filtered_vcf = select_first([recover_mate_bnd.output_vcf]) - # Array[File] Severus_filtered_vcf_index = select_first([recover_mate_bnd.output_vcf_index]) - # annotation analysis outputs File? merged_vep_annotated_vcf = annotateSpliceAI.annotated_vcf @@ -962,10 +950,6 @@ workflow humanwgs_family { Array[File]? small_variant_annotated_ranked_tsv = prioritizeSomatic.vep_annotated_ranked_tsv Array[File]? small_variant_annotated_filtered_tsv = prioritizeSomatic.vep_annotated_filtered_tsv - # Array[File] Severus_annotated_tsv = select_first([annotateSeverusSVfiltered.annotated_tsv]) - # Array[File] Severus_annotated_ranked_tsv = select_first([prioritize_Severus.annotSV_ranked_tsv]) - # Array[File] Severus_annotated_ccg_tsv = select_first([prioritize_Severus.annotSV_ccg_tsv]) - # Array[File] Severus_annotated_ccg_ranked_tsv = select_first([prioritize_Severus.annotSV_ccg_ranked_tsv]) # tertiary analysis outputs File? tertiary_small_variant_filtered_vcf = tertiary_analysis.small_variant_filtered_vcf File? tertiary_small_variant_filtered_vcf_index = tertiary_analysis.small_variant_filtered_vcf_index diff --git a/workflows/somatic_ports/somatic_calling.wdl b/workflows/somatic_ports/somatic_calling.wdl deleted file mode 100755 index cec9359b..00000000 --- a/workflows/somatic_ports/somatic_calling.wdl +++ /dev/null @@ -1,245 +0,0 @@ -version 1.1 - -# Use Severus to call SV -task Severus_sv { - input { - File wt_bam - File wt_bam_index - File? parental_bam - File? parental_bam_index - File trf_bed - File? phased_vcf - File? PON_tsv - String pname = "Sample_1_test" - Int threads - Int min_supp_reads - } - - Float file_size = ceil(size(wt_bam, "GB") + size(parental_bam, "GB") + size(phased_vcf, "GB") + 10) - command <<< - set -euxo pipefail - - echo "Running Severus for ~{pname}" - - severus --version - - severus \ - --target-bam ~{wt_bam} \ - ~{if defined(parental_bam) then "--control-bam " + parental_bam else ""} \ - ~{"--phasing-vcf " + phased_vcf} \ - ~{"--PON " + PON_tsv} \ - --out-dir ~{pname + "_severus"} \ - -t ~{threads} \ - --vntr-bed ~{trf_bed} \ - --min-support ~{min_supp_reads} \ - --resolve-overlaps \ - --between-junction-ins \ - --single-bp - - - - # Compress SVs plots HTML inside somatic_SVs/plots directory - # Check if the directory exists first - if [[ -d ~{pname + "_severus/somatic_SVs/plots"} ]] - then tar -czvf ~{pname + "_severus/somatic_SVs/plots.tar.gz"} ~{pname + "_severus/somatic_SVs/plots"} - fi - >>> - - output { - File? output_vcf = pname + "_severus/somatic_SVs/severus_somatic" + ".vcf" - File? output_all_vcf = pname + "_severus/all_SVs/severus_all.vcf" - File? output_breakpoint_clusters = pname + "_severus/somatic_SVs/" + "breakpoint_clusters_list.tsv" - File? output_breakpoint_clusters_all = pname + "_severus/all_SVs/" + "breakpoint_clusters_list.tsv" - File? output_somatic_sv_plots = pname + "_severus/somatic_SVs/plots.tar.gz" - } - - runtime { - docker: "quay.io/biocontainers/severus@sha256:fb4471e0504d564de78215ae15c081a1bb2022ad51e993eba92bc6fa5052a05d" - cpu: threads - memory: "~{threads * 4} GB" - disk: file_size + " GB" - maxRetries: 2 - preemptible: 1 - } -} - - -task tabix_vcf { - input { - File vcf - File contig_bed - Int threads = 2 - } - - Float file_size = ceil(size(vcf, "GB") + size(contig_bed, "GB") + 10) - - command <<< - set -euxo pipefail - echo "indexing ~{vcf}" - # Get rid of SVLEN=0 and change to INS - # to avoid Truvari error problem - # sed 's/SVLEN=0;//g' ~{vcf} > tmp.vcf - # sed -i 's//INS/g' tmp.vcf - - bcftools --version - - # sort first in case input is not sorted - bcftools sort \ - -Oz -o tmp.vcf.gz \ - ~{vcf} - - tabix -p vcf tmp.vcf.gz - - # Filter out contigs that are not in the contig bed file - bcftools view \ - -R ~{contig_bed} \ - -Oz -o ~{basename(vcf) + ".gz"} \ - tmp.vcf.gz - - tabix -p vcf ~{basename(vcf) + ".gz"} - rm -f tmp.vcf.gz tmp.vcf.gz.tbi - >>> - - output { - File output_vcf = basename(vcf) + ".gz" - File output_vcf_index = basename(vcf) + ".gz.tbi" - } - - runtime { - docker: "quay.io/biocontainers/bcftools:1.17--h3cc50cf_1" - cpu: threads - memory: "~{threads * 4} GB" - disk: file_size + " GB" - maxRetries: 2 - preemptible: 1 - } -} - -# Use svpack filtering instead of truvari, copied from -# human-wgs-wdl -task svpack_filter_annotated { - input { - File sv_vcf - - Array[File] population_vcfs - Array[File] population_vcf_indices - - Float? svlen - - File gff - } - - String sv_vcf_basename = basename(sv_vcf, ".vcf.gz") - Int file_size = ceil(size(sv_vcf, "GB") * 2 + 20) - - command <<< - set -euo pipefail - - echo "svpack version:" - cat /opt/svpack/.git/HEAD - - svpack \ - filter \ - --pass-only \ - ~{"--min-svlen " + svlen} \ - ~{sv_vcf} \ - ~{sep=' ' prefix('| svpack match -v - ', population_vcfs)} \ - | svpack \ - consequence \ - - \ - ~{gff} \ - | svpack \ - tagzygosity \ - - \ - > ~{sv_vcf_basename}.svpack.vcf - - bgzip --version - - bgzip ~{sv_vcf_basename}.svpack.vcf - - tabix --version - - tabix -p vcf ~{sv_vcf_basename}.svpack.vcf.gz - >>> - - output { - File output_vcf = "~{sv_vcf_basename}.svpack.vcf.gz" - File output_vcf_index = "~{sv_vcf_basename}.svpack.vcf.gz.tbi" - } - - runtime { - docker: "quay.io/pacbio/svpack@sha256:a680421cb517e1fa4a3097838719a13a6bd655a5e6980ace1b03af9dd707dd75" - cpu: 4 - memory: "16 GB" - disk: file_size + " GB" - maxRetries: 2 - preemptible: 1 - } -} - -# Use bcftools to add missing BND mate back after svpack filtering -task recover_mate_bnd { - input { - File sv_vcf_original - File sv_svpack_filtered - } - - Float file_size = ceil(size(sv_vcf_original, "GB") + size(sv_svpack_filtered, "GB") + 10) - - command <<< - set -euxo pipefail - - bcftools --version - - # Copy the VCF here and index them - cp ~{sv_svpack_filtered} ~{basename(sv_svpack_filtered)} - tabix ~{basename(sv_svpack_filtered)} - - # For original VCF, bgzip if not already, then index - cp ~{sv_vcf_original} ~{basename(sv_vcf_original)} - if [[ ~{basename(sv_vcf_original)} == *.vcf ]]; - then - bgzip ~{sv_vcf_original} - tabix ~{sv_vcf_original}.gz - else - tabix ~{basename(sv_vcf_original)} - fi - - comm -13 \ - <(bcftools query -f '%ID\t%MATE_ID\n' ~{basename(sv_svpack_filtered)} | rg -v '\.' | cut -f1 | sort) \ - <(bcftools query -f '%ID\t%MATE_ID\n' ~{basename(sv_svpack_filtered)} | rg -v '\.' | cut -f2 | sort) \ - > missing_mate.txt - - # Extract the missing BNDs - bcftools view \ - -i 'ID=@missing_mate.txt' \ - ~{basename(sv_vcf_original)} |\ - bcftools sort -Oz -o missing_mate.vcf.gz - - # Add the missing BNDs back to the filtered VCF - bcftools concat \ - ~{basename(sv_svpack_filtered)} \ - missing_mate.vcf.gz |\ - bcftools sort -Oz -o tmp.vcf.gz - - mv tmp.vcf.gz ~{basename(sv_svpack_filtered)} - rm -f ~{basename(sv_svpack_filtered)}.tbi - - tabix -p vcf ~{basename(sv_svpack_filtered)} - >>> - - output { - File output_vcf = basename(sv_svpack_filtered) - File output_vcf_index = basename(sv_svpack_filtered) + ".tbi" - } - - runtime { - docker: "quay.io/biocontainers/bcftools:1.17--h3cc50cf_1" - cpu: 2 - memory: "8 GB" - disk: file_size + " GB" - maxRetries: 2 - preemptible: 1 - } -} -