diff --git a/.dockstore.yml b/.dockstore.yml index d9902316f..5aa060cd9 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -108,3 +108,91 @@ workflows: subclass: wdl primaryDescriptorPath: /wdl/PBMASIsoSeqDemultiplex.wdl testParameterFiles: +- name: SRBamToFq + subclass: wdl + primaryDescriptorPath: /wdl/SRBamToFq.wdl + testParameterFiles: +- name: SRIndexBam + subclass: wdl + primaryDescriptorPath: /wdl/SRIndexBam.wdl + testParameterFiles: +- name: SRWholeGenome + subclass: wdl + primaryDescriptorPath: /wdl/SRWholeGenome.wdl + testParameterFiles: +- name: SRFlowcell + subclass: wdl + primaryDescriptorPath: /wdl/SRFlowcell.wdl + testParameterFiles: +- name: SRJointCallGVCFsWithGenomicsDB + subclass: wdl + primaryDescriptorPath: /wdl/SRJointCallGVCFsWithGenomicsDB.wdl + testParameterFiles: +- name: LRJointCallGVCFsWithGenomicsDB + subclass: wdl + primaryDescriptorPath: /wdl/LRJointCallGVCFsWithGenomicsDB.wdl + testParameterFiles: +- name: LRJointCallGVCFs + subclass: wdl + primaryDescriptorPath: /wdl/LRJointCallGVCFs.wdl + testParameterFiles: +- name: ConvertToHailMT + subclass: wdl + primaryDescriptorPath: /wdl/ConvertToHailMT.wdl + testParameterFiles: +- name: ConvertToZarrStore + subclass: wdl + primaryDescriptorPath: /wdl/ConvertToZarrStore.wdl + testParameterFiles: +- name: LRConvertBCF + subclass: wdl + primaryDescriptorPath: /wdl/LRConvertBCF.wdl + testParameterFiles: +- name: BenchmarkVCFs + subclass: wdl + primaryDescriptorPath: /wdl/BenchmarkVCFs.wdl + testParameterFiles: +- name: CompareVcfBenchmarks + subclass: wdl + primaryDescriptorPath: /wdl/CompareVcfBenchmarks.wdl + testParameterFiles: +- name: PanelProcessMalariaBarcodesForRh + subclass: wdl + primaryDescriptorPath: /wdl/PanelProcessMalariaBarcodesForRh.wdl + testParameterFiles: +- name: ProcessMalariaBarcodesDemo + subclass: wdl + primaryDescriptorPath: /wdl/ProcessMalariaBarcodesDemo.wdl + testParameterFiles: +- name: ExtractRegionsFromBam + subclass: wdl + primaryDescriptorPath: 
/wdl/ExtractRegionsFromBam.wdl + testParameterFiles: +- name: PfalciparumDrugResistanceSummary + subclass: wdl + primaryDescriptorPath: /wdl/PfalciparumDrugResistanceSummary.wdl + testParameterFiles: +- name: SRWholeGenome_Pf_Niare_VQSR + subclass: wdl + primaryDescriptorPath: /wdl/SRWholeGenome_Pf_Niare_VQSR.wdl + testParameterFiles: +- name: SRWholeGenome_Pf_Niare_VETS + subclass: wdl + primaryDescriptorPath: /wdl/SRWholeGenome_Pf_Niare_VETS.wdl + testParameterFiles: +- name: SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR + subclass: wdl + primaryDescriptorPath: /wdl/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR.wdl + testParameterFiles: +- name: SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS + subclass: wdl + primaryDescriptorPath: /wdl/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS.wdl + testParameterFiles: +- name: ExpandedDrugResistanceMarkerExtraction + subclass: wdl + primaryDescriptorPath: /wdl/ExpandedDrugResistanceMarkerExtraction.wdl + testParameterFiles: +- name: ExpandedDrugResistanceMarkerAggregation + subclass: wdl + primaryDescriptorPath: /wdl/ExpandedDrugResistanceMarkerAggregation.wdl + testParameterFiles: \ No newline at end of file diff --git a/BRANCH_IS_DEPRECATED.txt b/BRANCH_IS_DEPRECATED.txt new file mode 100644 index 000000000..16a2e5bd5 --- /dev/null +++ b/BRANCH_IS_DEPRECATED.txt @@ -0,0 +1,7 @@ +THIS BRANCH IS DEPRECATED. + +AS OF 2024 Jan 09, ALL CODE IN THIS BRANCH HAS BEEN MERGED TO MAIN. + +THIS BRANCH SHALL CONTINUE TO EXIST AS A REFERENCE UNTIL +JONN IS BACK FROM PATERNITY LEAVE. 
+ diff --git a/README.md b/README.md index 512934441..8f517bd6f 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -[![Generic badge](https://img.shields.io/badge/version-3.0.57-blue.svg)](https://shields.io/) +[![Generic badge](https://img.shields.io/badge/version-3.0.59-blue.svg)](https://shields.io/) ![CI/CD](https://github.com/broadinstitute/long-read-pipelines/workflows/CI/CD/badge.svg) ![Nightly](https://github.com/broadinstitute/long-read-pipelines/workflows/Nightly/badge.svg) diff --git a/VERSION b/VERSION index d0ae6b362..43d663807 100644 --- a/VERSION +++ b/VERSION @@ -1,2 +1,2 @@ -version=3.0.57 +version=3.0.59 diff --git a/docker/lr-functional-annotation/Dockerfile b/docker/lr-functional-annotation/Dockerfile new file mode 100644 index 000000000..fbbc2a850 --- /dev/null +++ b/docker/lr-functional-annotation/Dockerfile @@ -0,0 +1,70 @@ +# Start with a good base python3 image: +FROM ubuntu:20.04 +MAINTAINER Jonn Smith + +# Make sure we don't need to interact with any package installations: +ARG DEBIAN_FRONTEND=noninteractive + +# Set the working directory to / +WORKDIR / + +################################################################################ +# Install system packages: + +# Make sure we can actually talk to package repos: +RUN apt-get update +RUN apt-get install -y apt-utils +RUN apt-get -y upgrade + +RUN apt-get update && apt-get -y upgrade + +# Development / prereqs for building software: +# Utilities / tools: +# Get libcurses: +# install gsutil requirements: +RUN apt-get -y install make gcc g++ autoconf sudo && \ + apt-get -y install git bash vim time bc sed perl wget curl bzip2 man unzip && \ + apt-get -y install liblzma-dev libbz2-dev libncurses5-dev libncursesw5-dev && \ + apt-get --allow-releaseinfo-change update && \ + apt install -y curl git-lfs time datamash + +# Install python: +RUN apt-get install -y python python3 python3-pip + +# Setup crcmodc for gsutil: +RUN apt-get install -y gcc python3-dev python3-setuptools && \ + pip3 
uninstall -y crcmod && \ + pip3 install --no-cache-dir -U crcmod + +# install gsutil: +RUN curl https://sdk.cloud.google.com | bash + +# Get Java: +# Install OpenJDK-8 +RUN apt-get update && \ + apt-get install -y openjdk-11-jdk && \ + apt-get install -y ant && \ + apt-get clean + +# Fix certificate issues +RUN apt-get update && \ + apt-get install ca-certificates-java && \ + apt-get clean && \ + update-ca-certificates -f + +# Setup JAVA_HOME -- useful for docker commandline +ENV JAVA_HOME /usr/lib/jvm/java-11-openjdk-amd64/ +RUN export JAVA_HOME + +# Get snPEff +RUN wget https://snpeff.blob.core.windows.net/versions/snpEff_latest_core.zip && \ + unzip snpEff_latest_core.zip + +# Get tabix (for bgzip): +RUN apt-get install -y tabix + +################################################################################ +# Final runtime configuration: +# Let's start at the root: +WORKDIR / + diff --git a/docker/lr-functional-annotation/Makefile b/docker/lr-functional-annotation/Makefile new file mode 100644 index 000000000..0d0aa4764 --- /dev/null +++ b/docker/lr-functional-annotation/Makefile @@ -0,0 +1,17 @@ +IMAGE_NAME = lr-functional-annotation +VERSION = 0.0.1 + +TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest + +all: | build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +build_no_cache: + docker build --no-cache -t $(TAG1) -t $(TAG2) . 
+ +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-malaria/Dockerfile b/docker/lr-malaria/Dockerfile new file mode 100644 index 000000000..83a75ff4e --- /dev/null +++ b/docker/lr-malaria/Dockerfile @@ -0,0 +1,27 @@ +FROM continuumio/miniconda3:22.11.1 + +MAINTAINER Jonn Smith + +# copy other resources +COPY ./environment.yml / + +# install conda packages +RUN conda env create -f /environment.yml && conda clean -a +ENV PATH=/root/google-cloud-sdk/bin/:${PATH} + +# install gsutil +RUN apt-get --allow-releaseinfo-change update +RUN apt install -y curl git-lfs time datamash +RUN curl https://sdk.cloud.google.com | bash + +# Setup crcmodc for gsutil: +RUN apt-get install -y gcc python3-dev python3-setuptools && \ + pip3 uninstall -y crcmod && \ + pip3 install --no-cache-dir -U crcmod + +# copy python scripts +COPY python/* /python_scripts/ +RUN chmod +x /python_scripts/* + +# activate conda environment +RUN echo "source activate lr-malaria" > ~/.bashrc diff --git a/docker/lr-malaria/Makefile b/docker/lr-malaria/Makefile new file mode 100644 index 000000000..673b8d967 --- /dev/null +++ b/docker/lr-malaria/Makefile @@ -0,0 +1,17 @@ +IMAGE_NAME = lr-malaria +VERSION = 0.0.1 + +TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest + +all: | build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +build_no_cache: + docker build --no-cache -t $(TAG1) -t $(TAG2) . 
+ +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-malaria/environment.yml b/docker/lr-malaria/environment.yml new file mode 100644 index 000000000..339c1706a --- /dev/null +++ b/docker/lr-malaria/environment.yml @@ -0,0 +1,22 @@ +name: lr-malaria +channels: + - default + - bioconda + - anaconda +dependencies: + - python=3.6 + - cython + - pip + - pip: + - numpy + - scipy + - pandas + - seaborn + - statsmodels + - networkx + - matplotlib + - patsy + - openpyxl + - scikit-learn + - numpyencoder + - tqdm diff --git a/docker/lr-malaria/python/process_barcode_data.py b/docker/lr-malaria/python/process_barcode_data.py new file mode 100755 index 000000000..81c8ea2df --- /dev/null +++ b/docker/lr-malaria/python/process_barcode_data.py @@ -0,0 +1,1639 @@ +#!/usr/bin/env python +# coding: utf-8 + +# Processes barcode data for P. falciparum samples +# Based on work from the following publication: https://doi.org/10.1093/pnasnexus/pgac187 +# Orignal Author: Wesley Wong +# Modified by: Jonn Smith + +import random +import copy +import itertools +import sys +import json +import argparse + +import scipy +import pandas as pd +import numpy as np +import seaborn as sns +import numpy.polynomial.polynomial as poly +import statsmodels.api as sm +import networkx as nx +import matplotlib.pyplot as plt + +from pandas import DataFrame, read_csv, read_excel +from collections import defaultdict, Counter +from matplotlib.patches import Patch +from patsy import dmatrices +from sklearn import linear_model +from numpyencoder import NumpyEncoder +from sklearn.neighbors import KernelDensity + +def remove_duplicate_df(df, field="Haplotype"): + duplicate_rows = pd.DataFrame.duplicated(df, field, keep="first") + return df[~duplicate_rows] + + +def inverse_var_weight(p_array, var_array): + var_weighted_num = np.nansum(np.divide(p_array, var_array, where=var_array != 0)) + var_weighted_denom = np.nansum( + np.divide([1 for _ in var_array], var_array, where=var_array != 0) + ) + 
weighted_mean = var_weighted_num / var_weighted_denom + + weighted_var = 1 / np.sum(1 / var_array) + weighted_std = np.sqrt(weighted_var) + weighted_ci = ( + weighted_mean - 1.96 * weighted_std, + weighted_mean + 1.96 * weighted_std, + ) + + return weighted_mean, weighted_var, weighted_ci + + +def is_unique(s): + a = s.to_numpy() # s.values (pandas<0.24) + return (a[0] == a).all() # True = homozygous, #False = Heterozygous + + +def return_stats(s, convert_numpy=True): + if convert_numpy: + a = s.to_numpy() # s.values (pandas<0.24) + else: + s = a + missing_screen = a != "X" + poly_screen = a != "N" + + if missing_screen.all() == False: + return np.NaN # missing data present in this comparison, skip + elif poly_screen.all() == True: + return 1.0 - (a[0] == a).all() # 0 = homozygous, #1 = Heterozygous + + else: # N present in one of the comparisons, skip for now + return np.NaN + + +def quantify_het(barcode_comparison, f_true=0, axis=0): + null_het = np.nansum(barcode_comparison, axis=axis) * (1 - f_true) + return null_het + + +def quantify_total(barcode_comparison, axis=0): + return np.sum(~np.isnan(barcode_comparison), axis=axis) + + +def run_fn_tests(): + test_het_calculator() + test_unique() + + +def test_het_calculator(): + expected_het, expected_total = (12, 16) + test_barcode1 = ( + 6 * ["A"] + 6 * ["T"] + 6 * ["C"] + 6 * ["G"] + 6 * ["X"] + 6 * ["N"] + ) + test_barcode2 = 6 * ["A", "T", "C", "G", "X", "N"] + test_df = DataFrame([test_barcode1, test_barcode2]) + b = test_df.apply(return_stats, axis=0).to_list() + heterozygotes = np.sum(quantify_het(b), axis=0) + total = np.sum(quantify_total(b), axis=0) + assert ( + heterozygotes == expected_het + ), "Heterozygote sites should be {expected_het}, identified {x}".format( + expected_het=expected_het, x=heterozygotes + ) + assert ( + total == 16 + ), "Total usable sites should be {expected_total}, identified {x}".format( + x=total, expected_total=expected_total + ) + + +def test_unique(): + test_df1 = 
DataFrame([["A"], ["B"]]) + test_df2 = DataFrame(["A"], ["A"]) + assert ( + is_unique(DataFrame(test_df1)) == False + ), "Failed unique test, misidentified [A,B] as being composed of a single unique entity" + assert ( + is_unique(DataFrame(test_df2)) == True + ), "Failed unique test, misidentified [A,A] as being composed of a multiple unique entities" + + +def check_combinations(combo_barcodes): + arr_2d = combo_barcodes.to_numpy() + assert ( + np.array_equal(arr_2d[0], arr_2d[1]) == False + ), "Duplicate barcodes were sampled" + + +def calculate_RH_sample(h_mono, h_poly): + if h_mono > h_poly: + rh = (h_mono - h_poly) / h_mono + else: + rh = (h_mono - h_poly) / h_mono + return rh + + +def bootstrap(array, iterations): + resample = np.random.choice(array, (iterations, len(array)), replace=True) + return np.mean(resample, axis=1) + + +def rh_classifier(rh): + rh = float(rh) + if rh > 0.4: + return "cotx" + elif rh > 0.3: + return "cotx_probable" + elif rh > -0.1: + return "coi=2" + elif rh > -0.2: + return "coi=2_probable" + elif rh > -0.4: + return "coi=3" + elif rh > -0.6: + return "coi=3_probable" + elif rh > -0.8: + return "coi=4_probable" + else: + return "coi=4" + + +def calculate_shannons_index(p_array): + h = -np.sum([p_array * np.log(p_array)]) + return h + + +def calculate_evenness(h_array, k): + e = h_array / np.log(k) + return e + + +def calculate_h12(p_array): + p_array = sorted(p_array) + p1 = p_array[0] + p2 = p_array[1] + other_p = np.asarray(p_array[2:]) + sum_rest = np.sum(other_p**2) + h12 = (p1 + p2) ** 2 + sum_rest + return h12 + + +def interpret_cotx_simbinbarcode(sim, cotx_event=1): + """wokrks only for coi =2""" + sim = sim[str(cotx_event)] + converted_barcodes = [] + for binbarcode in sim: + if binbarcode == 0: + barcode = 24 * [0] + else: + barcode = [int(i) for i in list("{0:0b}".format(binbarcode))] + while len(barcode) < 24: + barcode = [ + 0 + ] + barcode # the conversion does not preserve leading zeros + 
converted_barcodes.append(barcode) + return np.asarray(converted_barcodes) + + +def load_cotx_simulations(): + cotx_simulation_data = defaultdict( + lambda: defaultdict(list) + ) # dict[initial_coi][cotx_event] + + cotx_file = "cotx_barcodes_2.txt" + cotx_data = json.load(open(cotx_file)) + for simulation in cotx_data: + for n_cotx_event in [1, 2, 3]: + if len(simulation[str(n_cotx_event)]) > 1: + cotx_simulation_data[2]["cotx_event{n}".format(n=n_cotx_event)].append( + interpret_cotx_simbinbarcode(simulation, cotx_event=n_cotx_event) + ) + + for initial_coi in [3, 4, 5]: + cotx_file = "cotx_barcodes_{x}.txt".format(x=initial_coi) + cotx_data = json.load(open(cotx_file)) + for simulation in cotx_data: + for n_cotx in [1, 2, 3]: + if len(simulation[str(n_cotx)]) > 1: + cotx_simulation_data[initial_coi][ + "cotx_event{x}".format(x=n_cotx) + ].append(np.asarray(simulation[str(n_cotx)])) + return cotx_simulation_data + + +def wilson(p, n, z=1.96): + denominator = 1 + z**2 / n + centre_adjusted_probability = p + z * z / (2 * n) + adjusted_standard_deviation = np.sqrt((p * (1 - p) + z * z / (4 * n)) / n) + + lower_bound = ( + centre_adjusted_probability - z * adjusted_standard_deviation + ) / denominator + upper_bound = ( + centre_adjusted_probability + z * adjusted_standard_deviation + ) / denominator + return lower_bound, upper_bound + + +class BarcodeStats: + cpalette_converter = { + "green": "viridis", + "blue": "mako", + "cornflowerblue": "mako", + "crimson": "rocket", + "orange": "Oranges", + "purple": "Purples_r", + } + + # Set up field names from sheet: + multi_poly_field = "M_P" + sample_name_field = "Sample_Name" + + def __init__(self, input_file, ISO3, barcode_file_path, sheet_name=None, adjusted_n=False): + + self.ISO3 = ISO3 + self.barcode_file_path = barcode_file_path + + # Ingest the input file: + if input_file.endswith(".xls") or input_file.endswith(".xlsx"): + if not sheet_name: + self.master_df = DataFrame(read_excel(input_file, sheet_name=ISO3)) + 
else: + self.master_df = DataFrame(read_excel(input_file, sheet_name=sheet_name)) + else: + sep = "\t" if input_file.endswith(".tsv") else "," + self.master_df = DataFrame(read_csv(input_file, sep=sep, header=0)) + + # Ingest the barcode file: + tmp_barcode_def_df = DataFrame(read_csv(barcode_file_path, sep="\t")) + self.loci_position_names = list(tmp_barcode_def_df["name"].values) + self.barcode_pos_dict = {name: f"{contig}:{pos}" for name, contig, pos in + zip(tmp_barcode_def_df['name'], tmp_barcode_def_df['chr'], tmp_barcode_def_df['pos'])} + # We don't have to, but we should clean up our mess: + del tmp_barcode_def_df + + # Define all the fields we're going to create: + self.poly_het_dict = None + self.total_mono = None + self.repeat_haps = None + self.unique_mono_count = None + self.popgen_stats = None + self.haplotype_counts = None + self.loci_allele_dict = None + self.n_total = None + self.n_poly = None + self.n_singles = None + self.poly_barcode_year_dfs = None + self.mono_barcode_year_dfs = None + self.barcode_year_dfs = None + self.chrono_years = None + self.mono_barcode_df = None + self.barcodes_df = None + self.poly_het_timeseries = None + self.poly_barcode_het_dist = None + self.poly_barcode_het_avg = None + self.poly_samples = None + self.RH_barcode_dict = None + self.observed_RH = None + self.H_mono_barcodes = None + self.RH_df = None + self.poly_df = None + self.RH_yearly_averages = None + self.RH_yearly_variances = None + self.RH_average = None + self.RH_weighted_var = None + self.RH_ci = None + self.model_expectations = None + self.cotx_simulation_data = None + + # Now process our data: + self.extract_region(ISO3) + self.extract_mono_poly_stats() + self.calc_stats() + self.calculate_polyhet_timeseries() + self.quantify_observed_poly_het(adjusted_n) + self.calculate_RH() + + def label_haplotypes(self): + barcode_haplotypes_dict = {} + unique_barcode_haplotype = 0 + barcode_haplotypes = [] + for row in 
self.barcodes_df[self.barcodes_df.columns[7:31]].to_numpy(): + barcode = "".join(row) + if barcode not in barcode_haplotypes_dict: + barcode_haplotypes_dict[barcode] = unique_barcode_haplotype + unique_barcode_haplotype += 1 + barcode_haplotypes.append(barcode_haplotypes_dict[barcode]) + self.barcodes_df["Haplotype"] = barcode_haplotypes + + def extract_region(self, ISO3): + tmp_df = self.master_df[self.master_df["ISO3"] == ISO3] + for position in self.loci_position_names: + tmp_df[position] = [ + x.strip().upper() for x in tmp_df[position] + ] # noticed a tab formatting in these columns + + tmp_df[BarcodeStats.multi_poly_field] = [x.strip() for x in tmp_df[BarcodeStats.multi_poly_field]] + + control_samples = [ + "3D7", + "3D7-1", + "3D7-2", + "3d7", + "Dd2-MOD", + "Dd2-Mod", + "Dd2/Mod", + "Dd2_MOD", + "DD2", + "Dd2", + ] + + self.barcodes_df = tmp_df[ + (tmp_df["X"] <= 2) & (~tmp_df[BarcodeStats.sample_name_field].isin(control_samples)) + ] + self.mono_barcode_df = self.barcodes_df[ + (self.barcodes_df["X"] <= 2) + & (self.barcodes_df[BarcodeStats.multi_poly_field] == "M") + & (~self.barcodes_df[BarcodeStats.sample_name_field].isin(control_samples)) + ] + self.chrono_years = sorted(Counter(self.barcodes_df["Year"]).keys()) + self.label_haplotypes() + + def extract_mono_poly_stats(self, fail_threshold=2): + self.barcode_year_dfs = {} + self.mono_barcode_year_dfs = {} + self.poly_barcode_year_dfs = {} + for year in self.chrono_years: + self.barcode_year_dfs[year] = self.barcodes_df[ + (self.barcodes_df["Year"] == year) + & (self.barcodes_df["X"] <= fail_threshold) + ] + + self.mono_barcode_year_dfs[year] = self.barcodes_df[ + (self.barcodes_df["Year"] == year) + & (self.barcodes_df["X"] <= fail_threshold) + & (self.barcodes_df[BarcodeStats.multi_poly_field] == "M") + ] + + self.poly_barcode_year_dfs[year] = self.barcodes_df[ + (self.barcodes_df["Year"] == year) + & (self.barcodes_df["X"] <= fail_threshold) + & (self.barcodes_df[BarcodeStats.multi_poly_field] == 
"P") + ] + self.mono_barcode_year_dfs[year].reset_index(drop=True, inplace=True) + self.poly_barcode_year_dfs[year].reset_index(drop=True, inplace=True) + self.n_singles = np.asarray( + [len(self.mono_barcode_year_dfs[year]) for year in self.chrono_years] + ) + self.n_poly = np.asarray( + [len(self.poly_barcode_year_dfs[year]) for year in self.chrono_years] + ) + + self.n_total = np.asarray( + [len(self.barcode_year_dfs[year]) for year in self.chrono_years] + ) + + self.loci_allele_dict = {} + for column in self.mono_barcode_df.columns[7:31]: + counts = Counter(self.mono_barcode_df[column].to_numpy()) + counts.pop("X", 0) + counts.pop("N", 0) + major_allele = max(counts, key=counts.get) + counts.pop(major_allele, 0) + try: + minor_allele = max(counts, key=counts.get) + except: + minor_allele = None + self.loci_allele_dict[column] = (major_allele, minor_allele) + + self.haplotype_counts = {} + for year in self.chrono_years: + counts = Counter( + self.barcode_year_dfs[year][self.barcode_year_dfs[year][BarcodeStats.multi_poly_field] == "M"][ + "Haplotype" + ] + ) + self.haplotype_counts[year] = counts + + def calc_stats(self): + self.popgen_stats = defaultdict(list) + n = self.n_singles + self.n_poly + p = self.n_poly / n + q = 1.0 - p + variances = p * q / n + wilson_interval = [wilson(prop, n_samples) for prop, n_samples in zip(p, n)] + + self.popgen_stats["p_poly_fract"] = list(p) + self.popgen_stats["var_poly_fract"] = list(variances) + weighted_mean, weighted_var, weighted_ci = inverse_var_weight(p, p * q / n) + self.popgen_stats["poly_fract_inv_var"] = ( + weighted_mean, + weighted_var, + weighted_ci, + ) + self.popgen_stats["poly_wilson"] = wilson_interval + x = np.asarray(self.chrono_years) + X = sm.add_constant(x) + y = np.array(p) + model = sm.OLS(y, X).fit() + self.popgen_stats["poly_fract_model"] = model + + # print('Unique Mono Fract') + self.unique_mono_count = {} + self.repeat_haps = defaultdict(lambda: defaultdict(lambda: 0)) + for year in 
self.chrono_years: + sorted_hid = sorted( + self.haplotype_counts[year], + key=lambda x: self.haplotype_counts[year][x], + reverse=True, + ) + for hid in sorted_hid: + if self.haplotype_counts[year][hid] != 1: + self.repeat_haps[year][hid] = self.haplotype_counts[year][hid] + else: + self.repeat_haps[year]["unique"] += 1 + self.unique_mono_count[year] = self.repeat_haps[year]["unique"] + + self.total_mono = np.asarray( + [ + np.sum(list(self.haplotype_counts[year].values())) + for year in self.chrono_years + ] + ) + p_mono_unique = ( + np.asarray([self.unique_mono_count[year] for year in self.chrono_years]) + / self.total_mono + ) + p_mono_clonal = 1.0 - p_mono_unique + var_mono_unique = (p_mono_unique * (1.0 - p_mono_unique)) / self.total_mono + self.popgen_stats["p_mono_unique"] = list(p_mono_unique) + self.popgen_stats["var_mono_unique"] = list(var_mono_unique) + self.popgen_stats["wilson_mono_unique"] = [ + wilson(p, n) for p, n in zip(p_mono_unique, self.total_mono) + ] + x = np.asarray(self.chrono_years) + X = sm.add_constant(x) + y = np.array(p_mono_unique) + model = sm.OLS(y, X).fit() + self.popgen_stats["mono_unique_model"] = model + weighted_mean, weighted_var, weighted_ci = inverse_var_weight( + p_mono_unique, var_mono_unique + ) + self.popgen_stats["mono_unique_inv_var"] = ( + weighted_mean, + weighted_var, + weighted_ci, + ) + + self.popgen_stats["p_mono_clonal"] = list(p_mono_clonal) + self.popgen_stats["var_mono_clonal"] = list( + var_mono_unique + ) # it's the same because it is the inverse + self.popgen_stats["wilson_mono_clonal"] = [ + wilson(p, n) for p, n in zip(p_mono_clonal, self.total_mono) + ] + x = np.asarray(self.chrono_years) + X = sm.add_constant(x) + y = np.array(p_mono_clonal) + model = sm.OLS(y, X).fit() + self.popgen_stats["mono_clonal_model"] = model + weighted_mean, weighted_var, weighted_ci = inverse_var_weight( + p_mono_clonal, var_mono_unique + ) + self.popgen_stats["mono_clonal_inv_var"] = ( + weighted_mean, + weighted_var, + 
weighted_ci, + ) + + # calculate mono diversity - all + for year in self.chrono_years: + hap_ids = np.asarray(list(self.haplotype_counts[year].keys())) + hap_counts = np.asarray(list(self.haplotype_counts[year].values())) + hap_freqs = hap_counts / np.sum(hap_counts) + sampling_idxes = np.random.choice(hap_ids, p=hap_freqs, size=(200, 200)) + shannon_idxes, evenness_scores, H12_scores = [], [], [] + for sampling_idx in sampling_idxes: + sampled_counts = Counter(sampling_idx) + sampled_freqs = np.asarray(list(sampled_counts.values())) / 200 + + H12 = calculate_h12(sampled_freqs) + shannon_idx = calculate_shannons_index(sampled_freqs) + shannon_idxes.append(shannon_idx) + evenness = calculate_evenness( + shannon_idx, len(list(sampled_counts.keys())) + ) + evenness_scores.append(evenness) + H12_scores.append(H12) + + self.popgen_stats["shannon_idx_mean"].append(np.mean(shannon_idxes)) + self.popgen_stats["evenness_mean"].append(np.mean(evenness_scores)) + self.popgen_stats["H12_mean"].append(np.mean(H12_scores)) + + self.popgen_stats["shannon_idx_var"].append(np.var(shannon_idxes)) + self.popgen_stats["evenness_var"].append(np.var(evenness_scores)) + self.popgen_stats["H12_var"].append(np.var(H12_scores)) + + self.popgen_stats["shannon_idx_mean"] = np.array( + self.popgen_stats["shannon_idx_mean"] + ) + model = sm.OLS(self.popgen_stats["shannon_idx_mean"], X).fit() + self.popgen_stats["shannon_idx_model"] = model + + self.popgen_stats["evenness_mean"] = np.array( + self.popgen_stats["evenness_mean"] + ) + model = sm.OLS(self.popgen_stats["evenness_mean"], X).fit() + self.popgen_stats["evenness_model"] = model + + self.popgen_stats["H12_mean"] = np.array(self.popgen_stats["H12_mean"]) + model = sm.OLS(self.popgen_stats["H12_mean"], X).fit() + self.popgen_stats["H12_model"] = model + + if "mccoil_median" in self.barcodes_df.columns: + for year, df in self.barcodes_df.groupby("Year"): + self.popgen_stats["mccoil_coi"].append(np.mean(df["mccoil_median"])) + 
self.popgen_stats["mccoil_coi_std"].append(np.std(df["mccoil_median"])) + for year, df in self.barcodes_df.groupby("Year"): + self.popgen_stats["mccoil_coi_poly"].append( + np.mean([x for x in df["mccoil_median"] if x >= 2]) + ) + self.popgen_stats["mccoil_coi_poly_std"].append( + np.std([x for x in df["mccoil_median"] if x >= 2]) + ) + + def calculate_polyhet_timeseries(self): + self.poly_het_dict = defaultdict(dict) + for year in self.chrono_years: + for position in self.loci_position_names: + counts = Counter(self.poly_barcode_year_dfs[year][position].to_list()) + n_missing = counts.pop("X", 0) + total = np.sum(list(counts.values())) + n_het = counts.pop("N", 0) + p_het = n_het / total + # print(year, position, p_het, n_missing, n_missing/total, total) + self.poly_het_dict[year][position] = (p_het, total) + + self.poly_het_timeseries = defaultdict(list) + for loci in self.loci_position_names: + for year in self.chrono_years: + self.poly_het_timeseries[loci].append(self.poly_het_dict[year][loci][0]) + + def quantify_observed_poly_het(self, adjustedN=False): + self.poly_barcode_het_dist = defaultdict(list) + self.poly_barcode_het_avg = {} + self.poly_samples = {} + if not adjustedN: + # counting N from barcode + for year in self.chrono_years: + for i, row in enumerate( + self.poly_barcode_year_dfs[year][ + self.poly_barcode_year_dfs[year].columns[7:31] + ].to_numpy() + ): + barcode = "".join(row) + barcode_counts = Counter(barcode) + barcode_counts.pop("X", 0) + total = np.sum(list(barcode_counts.values())) + het = barcode_counts.pop("N") + H_poly_barcode = het / total + self.poly_barcode_het_dist[year].append(H_poly_barcode) + self.poly_samples[year] = list( + self.poly_barcode_year_dfs[year][BarcodeStats.sample_name_field] + ) + self.poly_barcode_het_avg[year] = np.mean( + self.poly_barcode_het_dist[year] + ) + else: + for year in self.chrono_years: + total = 24.0 - np.asarray(self.poly_barcode_year_dfs[year]["X"]) + het = ( + 
np.asarray(self.poly_barcode_year_dfs[year]["Adjusted_Het"]) / total + ) + self.poly_barcode_het_dist[year] = het + self.poly_samples[year] = list( + self.poly_barcode_year_dfs[year][BarcodeStats.sample_name_field] + ) + self.poly_barcode_het_avg[year] = np.mean(het) + + def sample_poly_barcodes(self, year, coi=2, samples=100): + combinations = np.random.randint( + self.mono_barcode_year_dfs[year].index[0], + self.mono_barcode_year_dfs[year].index[-1], + size=(samples, coi), + ) + sampled_poly_barcodes = [] + for i, combo in enumerate(combinations): + sampled_haplotypes = self.mono_barcode_year_dfs[year].loc[ + combo, ["Haplotype"] + ] + flag = len(np.unique(list(sampled_haplotypes["Haplotype"]))) != coi + while flag: + combo = np.random.randint( + self.mono_barcode_year_dfs[year].index[0], + self.mono_barcode_year_dfs[year].index[-1], + coi, + ) + combinations[i] = combo + sampled_haplotypes = self.mono_barcode_year_dfs[year].loc[ + combo, ["Haplotype"] + ] + flag = len(np.unique(list(sampled_haplotypes["Haplotype"]))) != coi + combo_barcodes = self.mono_barcode_year_dfs[year].loc[ + combo, self.loci_position_names + ] + check_combinations(combo_barcodes) + sampled_poly_barcodes.append( + combo_barcodes.apply(lambda x: return_stats(x), axis=0).to_list() + ) + + sampled_poly_barcodes = np.asarray(sampled_poly_barcodes) + + return sampled_poly_barcodes + + def simulator(self, n_poly, n_iterations=1000, f_true=0, axis=0, coi=2): + run_fn_tests() + poly_simulations = defaultdict(list) + for year, n in zip(self.chrono_years, n_poly): + for n_rep in range(n_iterations): + attempt_count = 1 + if n_rep % 100 == 0: + print(year, n, n_rep) + b = self.sample_poly_barcodes(year, coi=coi, samples=n) + heterozygotes = quantify_het(b, f_true, axis=axis) + total = quantify_total(b, axis=axis) + p_het = heterozygotes / total + while ( + np.isfinite(p_het).all() == False + ): # if zero is found in the total + assert attempt_count <= 3, "maximum number of attempts reached" + 
print("attempting resample {x}".format(x=attempt_count)) + b = self.sample_poly_barcodes(year, samples=n) + heterozygotes = quantify_het(b, f_true, axis=axis) + total = quantify_total(b, axis=axis) + p_het = heterozygotes / total + attempt_count += 1 + poly_simulations[year].append(p_het) + poly_simulations[year] = np.asarray(poly_simulations[year]) + return poly_simulations + + def sample_cotx_barcodes_from_mono( + self, year, coi=2, samples=100, initial_coi=2, cotx_event=1 + ): + # cotx_sim stats + cotx_sim_filtered_data = [ + x + for x in self.cotx_simulation_data[initial_coi][ + "cotx_event{x}".format(x=cotx_event) + ] + if len(x) >= coi + ] + random_sim_idxes = np.asarray( + random.sample(range(len(cotx_sim_filtered_data)), samples) + ) + sampled_cotx_barcodes = np.asarray(cotx_sim_filtered_data, dtype="object")[ + random_sim_idxes + ] + + # superinfection layer + combinations = np.random.randint( + self.mono_barcode_year_dfs[year].index[0], + self.mono_barcode_year_dfs[year].index[-1], + size=(samples, initial_coi), + ) + sampled_poly_barcodes = [] + for i, combo in enumerate(combinations): + sampled_haplotypes = self.mono_barcode_year_dfs[year].loc[ + combo, ["Haplotype"] + ] + flag = len(np.unique(list(sampled_haplotypes["Haplotype"]))) != initial_coi + while flag: + combo = np.random.randint( + self.mono_barcode_year_dfs[year].index[0], + self.mono_barcode_year_dfs[year].index[-1], + initial_coi, + ) + combinations[i] = combo + sampled_haplotypes = self.mono_barcode_year_dfs[year].loc[ + combo, ["Haplotype"] + ] + flag = ( + len(np.unique(list(sampled_haplotypes["Haplotype"]))) != initial_coi + ) + combo_barcodes = self.mono_barcode_year_dfs[year].loc[ + combo, self.loci_position_names + ] + check_combinations(combo_barcodes) + + relatedness_maps = [] + for n in range(coi): + relatedness_maps.append(sampled_cotx_barcodes[i][n]) + + combo_barcodes = combo_barcodes.to_numpy() + cotx_strains = [] + for relatedness_map in relatedness_maps: + tmp = [ + 
combo_barcodes[strain_choice][position] + for position, strain_choice in enumerate(relatedness_map) + ] + cotx_strains.append(tmp) + + df = DataFrame(cotx_strains) # [cotx_strain1, cotx_strain2]) + sampled_poly_barcodes.append( + df.apply(lambda x: return_stats(x), axis=0).to_list() + ) + + sampled_poly_barcodes = np.asarray(sampled_poly_barcodes) + + return sampled_poly_barcodes + + def simulator_cotx( + self, n_poly, n_iterations=1000, axis=0, coi=2, initial_coi=2, cotx_event=1 + ): + run_fn_tests() + poly_simulations = defaultdict(list) + for year, n in zip(self.chrono_years, n_poly): + for n_rep in range(n_iterations): + attempt_count = 1 + if n_rep % 100 == 0: + print(year, n, n_rep) + b = self.sample_cotx_barcodes_from_mono( + year, + coi=coi, + samples=n, + initial_coi=initial_coi, + cotx_event=cotx_event, + ) + heterozygotes = quantify_het(b, Ftrue=0, axis=axis) + total = quantify_total(b, axis=axis) + p_het = heterozygotes / total + while ( + np.isfinite(p_het).all() == False + ): # if zero is found in the total + assert attempt_count <= 3, "maximum number of attempts reached" + print("attempting resample {x}".format(x=attempt_count)) + b = self.sample_poly_barcodes(year, samples=n) + heterozygotes = quantify_het(b, Ftrue, axis=axis) + total = quantify_total(b, axis=axis) + p_het = heterozygotes / total + attempt_count += 1 + poly_simulations[year].append(p_het) + poly_simulations[year] = np.asarray(poly_simulations[year]) + return poly_simulations + + def calculate_RH(self, n_poly_per_year=20, n_iter=20): + self.RH_barcode_dict = {} + self.observed_RH = defaultdict(list) + + print("Simulating mono barcode sampling for RH") + H_mono_barcodes = self.simulator( + [n_poly_per_year for x in self.n_poly], n_iter, 0, axis=1 + ) + self.calculate_RH_barcode_distribution(H_mono_barcodes) + self.calculate_RHyear_distribution(H_mono_barcodes) + self.H_mono_barcodes = H_mono_barcodes + + # actual samples + minimum_sample = np.min(self.n_poly) + for year in 
self.chrono_years: + for i, H_poly_barcode in enumerate(self.poly_barcode_het_dist[year]): + RH_sample_dist = [ + calculate_RH_sample(np.mean(sim_trace), H_poly_barcode) + for sim_trace in self.H_mono_barcodes[year] + ] + self.observed_RH[year].append(RH_sample_dist) + + X = sm.add_constant(self.chrono_years) + y = np.array( + [np.mean(self.RH_barcode_dict[year]) for year in self.chrono_years] + ) + model1 = sm.OLS(y, X).fit() + self.RH_barcode_dict["model"] = model1 + + data, Rh_sample_averages, cotx_averages = ( + [], + defaultdict(list), + defaultdict(list), + ) + for year in self.chrono_years: + for sample_name, RH_sample_dist in zip( + self.poly_samples[year], self.observed_RH[year] + ): + data.append([sample_name, year, np.mean(RH_sample_dist)]) + self.RH_df = DataFrame(data) + self.RH_df.columns = ["Sample", "Year", "RH"] + self.RH_df["classification"] = self.RH_df["RH"].apply(rh_classifier) + self.poly_df = pd.merge( + self.master_df, self.RH_df, left_on=BarcodeStats.sample_name_field, right_on="Sample" + ) + + for year, df in self.RH_df.groupby("Year"): + total_sample = np.asarray(df["RH"]) + cotx_total_samples = np.asarray(df["classification"]) + + sampling_idxes = np.random.randint(0, len(total_sample), size=(100, 200)) + for sampling_idx in sampling_idxes: + RH_sample = total_sample[sampling_idx] + Rh_sample_averages[year].append(np.mean(RH_sample)) + + cotx_samples = cotx_total_samples[sampling_idx] + cotx_counts = Counter(cotx_samples) + p_cotx = (cotx_counts["cotx"] + cotx_counts["cotx_probable"]) / len( + sampling_idx + ) + cotx_averages[year].append(p_cotx) + averages = np.asarray( + [np.mean(Rh_sample_averages[year]) for year in self.chrono_years] + ) + variances = np.asarray( + [np.var(Rh_sample_averages[year]) for year in self.chrono_years] + ) + weighted_mean, weighted_var, weighted_ci = inverse_var_weight( + averages, variances + ) + + self.RH_yearly_averages = averages + self.RH_yearly_variances = variances + self.RH_average = weighted_mean + 
self.RH_weighted_var = weighted_var + self.RH_ci = weighted_ci + + def calculate_RH_barcode_distribution(self, poly_simulations): + def distance_RH_trace(RH, H_mono_trace, H_poly_trace): + H_mono_trace = np.asarray(H_mono_trace) + H_poly_trace = np.asarray(H_poly_trace) + distance = np.sum((H_mono_trace - (H_mono_trace * RH) - H_poly_trace) ** 2) + return distance + + H_poly_trace = list(self.poly_barcode_het_avg.values()) + n_reps = len(poly_simulations[list(poly_simulations.keys())[0]]) + barcode_het_timetraces = [] + for irep in range(n_reps): + timetrace = [] + for year in self.chrono_years: + timetrace.append(np.mean(poly_simulations[year][irep])) + barcode_het_timetraces.append(timetrace) + + RH_barcode_distribution = [] + for itrace in range(n_reps): + output = scipy.optimize.minimize( + lambda RH: distance_RH_trace( + RH, barcode_het_timetraces[itrace], H_poly_trace + ), + x0=[0.5], + bounds=[(-1, 1)], + ) + RH_barcode_distribution.append(output) + self.RH_barcode_dict["total"] = [ + output.x[0] for output in RH_barcode_distribution + ] + + def calculate_RHyear_distribution(self, poly_simulations): + def distance_RHyear_trace( + RH, H_mono_traces_dict, itrace, year, H_poly_dict=self.poly_barcode_het_dist + ): + distance = 0 + H_mono_trace = np.mean(H_mono_traces_dict[year][itrace]) + H_poly_trace = np.mean(H_poly_dict[year]) + distance_array = (H_mono_trace - (H_mono_trace * RH) - H_poly_trace) ** 2 + return np.sum(distance_array) + + for year in self.chrono_years: + n_reps = len(poly_simulations[list(poly_simulations.keys())[0]]) + RH_year_distribution = [] + for itrace in range(n_reps): + output = scipy.optimize.minimize( + lambda RH: distance_RHyear_trace( + RH, poly_simulations, itrace, year + ), + x0=[0.5], + bounds=[(0, 1)], + ) + RH_year_distribution.append(output) + + self.RH_barcode_dict[year] = [ + output.x[0] for output in RH_year_distribution + ] + + def simulate_coi_cotx_sweep(self, n_poly=200, n_iter=200, oocyst_alpha=2.5): + 
self.model_expectations = defaultdict(lambda: defaultdict(list)) + self.cotx_simulation_data = load_cotx_simulations(oocyst_alpha) + + H_mono_barcode_coi = defaultdict(dict) + for coi in [2, 3, 4, 5]: + print("Simulating COI={coi} expectation".format(coi=coi)) + H_mono_barcode_coi[coi] = self.simulator( + [n_poly for x in self.n_poly], n_iter, 0, axis=1, coi=coi + ) + + for year in self.chrono_years: + for coi in [2, 3, 4, 5]: + for sim_iteration in H_mono_barcode_coi[coi][year]: + mean_het = np.mean(self.H_mono_barcodes[year]) + for sim in sim_iteration: + self.model_expectations[ + "coi={x}, oocyst_alpha={alpha}".format( + x=coi, alpha=oocyst_alpha + ) + ][year].append(calculate_RH_sample(mean_het, sim)) + + for initial_coi in [2, 3, 4, 5]: + for cotx_round in [1, 2, 3]: + print( + "Initial COI = {initial_coi}, Simulating cotx round {x}".format( + initial_coi=initial_coi, x=cotx_round + ) + ) + H_mono_barcode_cotx = self.simulator_cotx( + [n_poly for x in self.n_poly], + n_iter, + axis=1, + coi=2, + initial_coi=initial_coi, + cotx_event=cotx_round, + ) + for year in self.chrono_years: + for sim_iteration in H_mono_barcode_cotx[year]: + mean_het = np.mean(self.H_mono_barcodes[year]) + for sim in sim_iteration: + self.model_expectations[ + "cotx_{initial_coi}_{alpha}_{cotx_round}".format( + initial_coi=initial_coi, + alpha=oocyst_alpha, + cotx_round=cotx_round, + ) + ][year].append(calculate_RH_sample(mean_het, sim)) + + def plot_sample_distribution( + self, color, ax=None, x_annotate=-0.2, y_annotate=0.1, legend=True, title=None + ): + if not ax: + fig, ax = plt.subplots() + ax.bar( + [year for year in self.chrono_years], + self.n_singles, + color="grey", + alpha=0.3, + ) + + bar = ax.bar( + [year for year in self.chrono_years], + self.n_poly, + color=color, + bottom=self.n_singles, + ) + for x, y, z in zip(self.chrono_years, self.n_singles, self.n_singles): + x = round(x, 2) + y = round(y, 2) + ax.annotate( + str(z), + (x + x_annotate, y + y_annotate), + 
fontsize=15, + color="black", + fontweight="bold", + ) + + for x, y, z in zip( + self.chrono_years, self.n_singles + self.n_poly, self.n_poly + ): + x = round(x, 2) + y = round(y, 2) + ax.annotate( + str(z), + (x + x_annotate, y + y_annotate), + fontsize=12, + color=color, + fontweight="bold", + ) + + legend_elements = [ + Patch(facecolor="grey", edgecolor="black", label="Monogenomic"), + Patch(facecolor=color, edgecolor="black", label="Polygenomic"), + ] + if legend: + ax.legend(handles=legend_elements) + if not title: + ax.set_title("Sample Distribution", fontsize=20) + else: + ax.set_title(title, fontsize=20, loc="left") + ax.set_xticks(self.chrono_years) + ax.set_xticklabels(self.chrono_years) + ax.tick_params(labelsize=15) + return ax + + def plot_mono_poly_fract( + self, + color, + ax=None, + x_annotate=-0.02, + y_annotate=0.02, + annotate_color="black", + title=None, + ): + if not ax: + fig, ax = plt.subplots() + p_mono = self.n_singles / self.n_total + bar = ax.bar( + [year for year in self.chrono_years], p_mono, color="grey", alpha=0.3 + ) + for b, z in zip(bar, [round(_, 2) for _ in p_mono]): + x, y = b._x0, b._height + # print(x,y) + if str(x) == "nan": + x = 0 + if str(y) == "nan": + y = 0 + x = round(x, 2) + y = round(y, 2) + ax.annotate( + str(z), + (x + x_annotate, y + y_annotate), + fontsize=12, + color=annotate_color, + fontweight="bold", + ) + + ax.bar( + [year for year in self.chrono_years], + self.n_poly / self.n_total, + color=color, + bottom=self.n_singles / self.n_total, + ) + if not title: + ax.set_title("Mono vs Poly Fraction", fontsize=20) + else: + ax.set_title(title, fontsize=20, loc="left") + ax.tick_params(labelsize=15) + ax.set_xlim(self.chrono_years[0] - 1, self.chrono_years[-1] + 1) + ax.set_xticks(self.chrono_years) + ax.set_xticklabels(self.chrono_years) + + def plot_mono_hap_sharing( + self, + color, + ax=None, + annotate_color="black", + x_annotate=0.2, + y_annotate=0.05, + title=None, + ): + if not ax: + fig, ax = 
plt.subplots() + + for year in self.chrono_years: + bottom, unique = 0, 0 + sorted_hid = sorted( + self.haplotype_counts[year], + key=lambda x: self.haplotype_counts[year][x], + reverse=True, + ) + cpalette = sns.color_palette( + BarcodeStats.cpalette_converter[color], len(self.repeat_haps[year]) + ) + total = np.sum(list(self.repeat_haps[year].values())) + for i, hid in enumerate(self.repeat_haps[year]): + if hid != "unique": + height = self.repeat_haps[year][hid] / total + ax.bar( + [year], + height, + bottom=bottom, + color=cpalette[i], + edgecolor="black", + ) + bottom += height + # shared_fracts.append(round(bottom,2)) + bar = ax.bar( + [year], + self.repeat_haps[year]["unique"] / total, + bottom=bottom, + color="grey", + alpha=0.3, + ) + + shared_fracts = [ + round(_, 2) for _ in 1.0 - np.asarray(self.popgen_stats["p_mono_unique"]) + ] + for x, y, z in zip(self.chrono_years, shared_fracts, shared_fracts): + x = round(x, 2) + y = round(y, 2) + ax.annotate( + str(z), + (x - x_annotate, y + y_annotate), + fontsize=12, + color=annotate_color, + fontweight="bold", + ) + + if not title: + ax.set_title("Mono Clonality", fontsize=20) + else: + ax.set_title(title, fontsize=20, loc="left") + ax.tick_params(labelsize=15) + ax.set_xlim(self.chrono_years[0] - 1, self.chrono_years[-1] + 1) + ax.set_xticks(self.chrono_years) + ax.set_xticklabels(self.chrono_years) + + def plot_persistent_clones( + self, color, ax=None, x_annotate=[-0.3, 0.2], y_annotate=0.1, title=None + ): + if not ax: + fig, ax = plt.subplots() + cpalette = sns.color_palette(BarcodeStats.cpalette_converter[color], 10) + hap_stats = defaultdict(dict) + total_clusters = 1 + for year in self.chrono_years: + sorted_hid = sorted( + self.haplotype_counts[year], + key=lambda x: self.haplotype_counts[year][x], + reverse=True, + ) + for hid in sorted_hid: + if self.haplotype_counts[year][hid] != 1: + hap_stats[hid][year] = self.haplotype_counts[year][hid] + total_clusters += 1 + cpalette = sns.color_palette( + 
BarcodeStats.cpalette_converter[color], total_clusters + ) + + count = 1 + flipper = 0 + for haplotype in hap_stats: + x_array, y_array, s_array = [], [], [] + for year in hap_stats[haplotype]: + x_array.append(year) + y_array.append(count) + s_array.append(hap_stats[haplotype][year] * 100) + ax.scatter( + x_array, y_array, s_array, color=cpalette[count - 1], edgecolor="black" + ) + ax.plot(x_array, y_array, color=cpalette[count - 1]) + for i, txt in enumerate(s_array): + ax.annotate( + int(txt / 100.0), + (x_array[i] + x_annotate[flipper], y_array[i] - y_annotate), + fontsize=12, + color="black", + fontweight="bold", + ) + if flipper == 0: + flipper = 1 + else: + flipper = 0 + count += 1 + if not title: + ax.set_title("Mono Clonality", fontsize=20) + else: + ax.set_title(title, fontsize=20, loc="left") + ax.tick_params(labelsize=15) + ax.tick_params(left=False, labelleft=False) + + ax.set_xlim(self.chrono_years[0] - 1, self.chrono_years[-1] + 1) + ax.set_xticks(self.chrono_years) + ax.set_xticklabels(self.chrono_years) + + def plot_longitudinal( + self, field, color="orange", ax=None, inverse_var=False, title=None + ): + if not ax: + fig, ax = plt.subplots() + fields = { + "mono_unique": ( + "p_mono_unique", + "var_mono_unique", + "mono_unique_model", + "mono_unique_inv_var", + ), + "poly_fract": ( + "p_poly_fract", + "var_poly_fract", + "poly_fract_model", + "poly_fract_inv_var", + ), + "cotx": ("cotx_average", "cotx_var", "cotx_inv_var"), + } + + p = self.popgen_stats[fields[field][0]] + variances = self.popgen_stats[fields[field][1]] + + if not inverse_var: + model = self.popgen_stats[fields[field][2]] + x = np.asarray(self.chrono_years) + X = sm.add_constant(x) + ax.scatter(self.chrono_years, p, color=color) + ax.plot(x, model.predict(X), color=color) + ax.fill_between( + self.chrono_years, + p + 2.5 * np.sqrt(variances), + p - 2.5 * np.sqrt(variances), + alpha=0.3, + color=color, + ) + ax.set_ylim(0, 1) + + ax.set_title(field, fontsize=20) + # 
ax.set_xlabel('Year', fontsize = 15) + ax.set_ylabel("Proportion", fontsize=15) + ax.tick_params(labelsize=15) + ax.set_xlim(self.chrono_years[0] - 1, self.chrono_years[-1] + 1) + ax.set_xticks(self.chrono_years) + ax.set_xticklabels(self.chrono_years) + + else: + shifted_x = [x + 1 for x in range(len(self.chrono_years))] + ax.errorbar( + shifted_x, + p, + color=color, + yerr=1.96 * np.sqrt(variances), + markersize=20, + markeredgecolor=color, + markerfacecolor=color, + fmt=".", + ecolor=color, + capsize=10, + ) + + weighted_mean, weighted_var, weighted_coi = self.popgen_stats[ + fields[field][3] + ] + x = [shifted_x[0], shifted_x[-1]] + ax.plot(x, [weighted_mean for _ in x], color=color, linewidth=3) + ax.fill_between( + x, + [weighted_coi[0] for _ in x], + [weighted_coi[1] for _ in x], + color=color, + linewidth=3, + alpha=0.2, + ) + ax.set_ylim(0, 1) + if not title: + ax.set_title(field, fontsize=20) + else: + ax.set_title(title, fontsize=20, loc="left") + # ax.set_xlabel('Year', fontsize = 15) + ax.set_ylabel("Proportion", fontsize=15) + ax.tick_params(labelsize=15) + ax.set_xlim(shifted_x[0] - 0.5, shifted_x[-1] + 0.5) + ax.set_xticks(shifted_x) + ax.set_xticklabels(shifted_x) + + def plot_RH_average_confidence(self, color="orange", ax=None): + if not ax: + fig, ax = plt.subplots() + RH = np.mean(self.RH_barcode_dict["total"]) + RH_ci = ( + np.percentile(self.RH_barcode_dict["total"], 2.5), + np.percentile(self.RH_barcode_dict["total"], 97.5), + ) + ax.hist(self.RH_barcode_dict["total"], color="orange") + + ax.set_xlabel(r"$R_{H}$", fontsize=15) + ax.set_ylabel("Freq", fontsize=15) + + def plot_RHsample_longitudinal_average(self, color="orange", ax=None): + if not ax: + fig, ax = plt.subplots() + y = np.array( + [np.mean(self.RH_barcode_dict[year]) for year in self.chrono_years] + ) + X = sm.add_constant(self.chrono_years) + + ax.scatter(self.chrono_years, y, color=color) + ax.plot( + self.chrono_years, self.RH_barcode_dict["model"].predict(X), color=color + ) 
+ ax.boxplot( + [self.RH_barcode_dict[year] for year in self.chrono_years], + positions=self.chrono_years, + showfliers=False, + notch=True, + patch_artist=True, + boxprops=dict(facecolor=color, color=color), + capprops=dict(color=color), + whiskerprops=dict(color=color), + flierprops=dict(color=color, markeredgecolor=color), + medianprops=dict(color=color), + ) + + ax.set_ylim(0, 0.5) + ax.tick_params(axis="both", labelsize=15) + ax.set_ylabel(r"$R_{H}$", fontsize=20) + ax.set_xlabel("Year", fontsize=20) + + def plot_RHsample_longitudinal(self, color="orange", ax=None): + if not ax: + fig, ax = plt.subplots(figsize=(12, 5)) + b = sns.swarmplot(x="Year", y="RH", data=self.RH_df, color=color, ax=ax) + ax.plot( + [0 - 0.5, len(self.chrono_years) + 0.5], + [self.RH_average, self.RH_average], + color="black", + linewidth=3, + ) + ax.tick_params(axis="both", labelsize=15) + + sns.boxplot( + showmeans=True, + # meanline=True, + meanprops={ + "marker": "s", + "markerfacecolor": "white", + "markeredgecolor": color, + "markersize": "12", + }, + medianprops={"visible": False}, + whiskerprops={"visible": False}, + zorder=10, + x="Year", + y="RH", + data=self.RH_df, + showfliers=False, + showbox=False, + showcaps=False, + ax=ax, + ) + ax.set_xlabel("Year", fontsize=20) + ax.set_ylabel(r"$R_{H}$", fontsize=20) + legend_elements = [ + Patch( + facecolor="black", + edgecolor="black", + label=r"$R_{H}=$" + + str(round(self.RH_average, 2)) + + " " + + "({ci1},{ci2})".format( + ci1=str(round(self.RH_ci[0], 2)), ci2=str(round(self.RH_ci[1], 2)) + ), + ) + ] + + ax.legend(handles=legend_elements, fontsize=15) + ax.set_ylim(-1.1, 1.0) + + def plot_cotx_sweep(self, ax=None): + if not ax: + fig, ax = plt.subplots(figsize=(12, 5)) + + simulation_boxplot_results = [] + for year in self.chrono_years: + for key in self.model_expectations: + for RH in self.model_expectations[key][year]: + simulation_boxplot_results.append([year, RH, key]) + df = DataFrame(simulation_boxplot_results) + 
df.columns = ["Year", "RH", "Condition"] + df_melt = df.melt(id_vars=["Year", "Condition"], value_vars="RH") + cotx_colors = sns.color_palette("rocket", 4) + superinfection_colors = sns.color_palette("mako_r", 3) + order = [ + "cotx_5_2_3", + "cotx_4_2_3", + "cotx_3_2_3", + "cotx_2_2_3", + "cotx_5_2_2", + "cotx_4_2_2", + "cotx_3_2_2", + "cotx_2_2_2", + "cotx_5_2_1", + "cotx_4_2_1", + "cotx_3_2_1", + "cotx_2_2_1", + "coi=2", + "coi=3", + "coi=4", + ] + colors = 3 * list(cotx_colors.as_hex()) + list(superinfection_colors.as_hex()) + custom_pal = {} + for x, y in zip(order, colors): + custom_pal[x] = y + + ax.fill_between([-0.5, 3.5], [1.0, 1.0], [-1.5, -1.5], color="grey", alpha=0.1) + ax.fill_between( + [8 - 0.5, 11.5], [1.0, 1.0], [-1.5, -1.5], color="grey", alpha=0.1 + ) + + b = sns.boxplot( + data=df_melt, + x="Condition", + y="value", + showfliers=False, + ax=ax, + palette=custom_pal, + showmeans=True, + meanprops={ + "marker": "o", + "markerfacecolor": "white", + "markeredgecolor": "black", + "markersize": "10", + }, + order=order, + ) + + ax.tick_params(axis="both", labelsize=15) + ax.set_ylabel(r"$R_{H}$", fontsize=20) + ax.set_xlabel("Condition", fontsize=20) + + line3 = 4 * ["3"] + 4 * ["2"] + 4 * ["1"] + ["COI=2", "COI=3", "COI=4"] + ax.set_xticklabels(line3, rotation=45) + + legend_elements = [ + Patch( + facecolor=cotx_colors[0], edgecolor="black", label=r"$COI_{i,cotx}=5$" + ), + Patch( + facecolor=cotx_colors[1], edgecolor="black", label=r"$COI_{i,cotx}=4$" + ), + Patch( + facecolor=cotx_colors[2], edgecolor="black", label=r"$COI_{i,cotx}=3$" + ), + Patch( + facecolor=cotx_colors[3], edgecolor="black", label=r"$COI_{i,cotx}=2$" + ), + Patch( + facecolor=superinfection_colors[0], + edgecolor="black", + label=r"$COI_{i,super}=2$", + ), + Patch( + facecolor=superinfection_colors[1], + edgecolor="black", + label=r"$COI_{i,super}=3$", + ), + Patch( + facecolor=superinfection_colors[2], + edgecolor="black", + label=r"$COI_{i,super}=4$", + ), + ] + + 
ax.legend(handles=legend_elements, fontsize=12) + + ax.plot([0, 14], [0, 0], color="black", linestyle="--") + ax.plot([0, 14], [0.3, 0.3], linestyle="--", color="crimson") + ax.annotate("Cotx Detection\n Threshold", [12.5, 0.35], color="crimson") + + def plot_RH_classification(self, color="orange", ax=None): + if not ax: + fig, ax = plt.subplots(figsize=(12, 5)) + classification_counts = Counter(self.RH_df["classification"]) + total = np.sum(list(classification_counts.values())) + x_array = np.asarray([0, 1, 2, 3]) + + cat1 = [ + classification_counts[key] / total + for key in ["cotx", "coi=2", "coi=3", "coi=4"] + ] + cat2 = [ + classification_counts[key] / total + for key in [ + "cotx_probable", + "coi=2_probable", + "coi=3_probable", + "coi=4_probable", + ] + ] + ax.bar(x_array, cat1, color=color) + ax.bar(x_array, cat2, bottom=cat1, color=color) + + proportions = np.asarray(cat1) + np.asarray(cat2) + + stdev_array = [] + for p in proportions: + wilson_low, wilson_high = wilson(p, total) + rel_low_boundary = p - wilson_low + rel_high_boundary = wilson_high - p + stdev_array.append((rel_low_boundary, rel_high_boundary)) + stdev_array = np.asarray(stdev_array).T + + ax.errorbar( + x_array, proportions, yerr=stdev_array, fmt=".", capsize=5, color="black" + ) + + ax.set_xticks(x_array) + ax.set_xticklabels( + ["Cotransmission", "COI=2", "COI=3", "COI=4"], fontsize=15, rotation=45 + ) + ax.set_ylabel("Proportion", fontsize=15) + ax.tick_params(labelsize=15) + ax.set_ylim(0, 1) + # ax.set_title('2020', fontsize = 20) + + def generate_summary_report(self, output_file=None): + data_report = {} + data_report["n_singles"] = self.n_singles + data_report["n_poly"] = self.n_poly + data_report["n_total"] = self.n_singles + self.n_poly + + data_report["poly_fract_data"] = [ + (round(p, 2), [round(x, 2) for x in ci]) + for p, ci in zip( + self.popgen_stats["p_poly_fract"], self.popgen_stats["poly_wilson"] + ) + ] + data_report["p_mono_unique"] = [ + (round(p, 2), [round(x, 2) 
for x in ci]) + for p, ci in zip( + self.popgen_stats["p_mono_unique"], + self.popgen_stats["wilson_mono_unique"], + ) + ] + + data_report["realmccoilcoi"] = [ + (round(p, 2), [max(round(p - 1.96 * std, 2), 0), round(p + 1.96 * std, 2)]) + for p, std in zip( + self.popgen_stats["mccoil_coi"], self.popgen_stats["mccoil_coi_std"] + ) + ] + data_report["realmccoilcoi_poly"] = [ + (round(p, 2), [max(round(p - 1.96 * std, 2), 0), round(p + 1.96 * std, 2)]) + for p, std in zip( + self.popgen_stats["mccoil_coi_poly"], + self.popgen_stats["mccoil_coi_poly_std"], + ) + ] + + data_report["RH_array"] = [ + (round(p, 2), [round(p - 1.96 * std, 2), round(p + 1.96 * std, 2)]) + for p, std in zip( + self.RH_yearly_averages, np.sqrt(self.RH_yearly_variances) + ) + ] + data_report_df = pd.DataFrame.from_dict(data_report).T + data_report_df.columns = self.chrono_years + if output_file: + data_report_df.to_csv(output_file) + return data_report_df + + +def show_stats(ISO3, color): + fig = plt.figure(figsize=(20, 15)) + axes = [fig.add_subplot(3, 3, i + 1) for i in range(0, 9)] + + BS[ISO3].plot_sample_distribution(color, axes[0]) + BS[ISO3].plot_mono_poly_fract(color, axes[1], x_annotate=0.0) + BS[ISO3].plot_longitudinal("poly_fract", color, axes[2]) + axes[2].set_title("Poly Fraction", fontsize=15) + + BS[ISO3].plot_mono_hap_sharing(color, axes[3], x_annotate=0.35, y_annotate=0.03) + BS[ISO3].plot_persistent_clones(color, ax=axes[4], x_annotate=[-0.5, 0.5]) + BS[ISO3].plot_longitudinal("mono_unique", color, axes[5]) + axes[5].set_title("Unique Mono Fraction", fontsize=15) + + BS[ISO3].plot_RHsample_longitudinal(ax=axes[6], color=color) + BS[ISO3].plot_RH_classification(ax=axes[7], color=color) + + # Calculate some stats: + counts = Counter(BS[ISO3].RH_df["classification"]) + total = np.sum(list(counts.values())) + p = (counts["cotx"] + counts["cotx_probable"]) / total + stdev = np.sqrt(p * (1 - p) / total) + + # We should save both a CSV and a TSV: + base_name = ISO3.replace(":", 
".") # We have to replace ':' here for ease of use with filenames. + fig.savefig(f"{base_name}_summary_figure.svg") + fig.savefig(f"{base_name}_summary_figure.png") + + +if __name__ == "__main__": + + # Set up our CLI args: + parser = argparse.ArgumentParser( + description=f"Processes P. falciparum data from a spreadsheet into actionable information (e.g. CoI estimates)." + ) + + requiredNamed = parser.add_argument_group('required named arguments') + requiredNamed.add_argument('-f', '--input-file', + help='TSV/CSV/Excel file containing data to process', + required=True) + requiredNamed.add_argument('-s', '--sheet', + help='Sheet name to process.', + required=True) + requiredNamed.add_argument('-b', '--barcodes', + help='Barcode file to use.', + required=True) + args = parser.parse_args() + + # Do some validation here: + if (args.input_file.endswith(".xls") or args.input_file.endswith(".xlsx")) and not args.sheet: + print("ERROR: You must provide a sheet name with an excel file input.", file=sys.stderr) + sys.exit(1) + + # Do the work: + BS = {} + ISO3 = args.sheet.replace(".", ":") # We have to replace '.' with ':' here because of Terra conventions. + sheet_name = ISO3.replace(":", "_") + + BS[ISO3] = BarcodeStats( + args.input_file, ISO3, args.barcodes, sheet_name=sheet_name, adjusted_n=False + ) + + show_stats(ISO3, color="crimson") + + base_name = ISO3.replace(":", ".") # We have to replace ':' here for ease of use with filenames. 
+ BS[ISO3].generate_summary_report(f"{base_name}_summary.csv") + BS[ISO3].mono_barcode_df.to_csv(f"{base_name}_mono_barcodes.csv") + BS[ISO3].poly_df.to_csv(f"{base_name}_poly_barcodes.csv") diff --git a/docker/lr-sgkit/Dockerfile b/docker/lr-sgkit/Dockerfile new file mode 100644 index 000000000..0913be0b9 --- /dev/null +++ b/docker/lr-sgkit/Dockerfile @@ -0,0 +1,21 @@ +FROM continuumio/miniconda3 + +MAINTAINER Kiran V Garimella + +# copy other resources +COPY ./environment.yml / + +# install conda packages +RUN conda env create -f /environment.yml && conda clean -a +RUN echo "source activate lr-sgkit" > ~/.bashrc +ENV PATH=/opt/conda/envs/lr-sgkit/bin/:/root/google-cloud-sdk/bin/:${PATH} + +# install gsutil +RUN apt-get --allow-releaseinfo-change update +RUN apt install -y curl git-lfs parallel +RUN curl https://sdk.cloud.google.com | bash + +# Setup crcmodc for gsutil: +RUN apt-get install -y gcc python3-dev python3-setuptools && \ + pip3 uninstall -y crcmod && \ + pip3 install --no-cache-dir -U crcmod diff --git a/docker/lr-sgkit/Makefile b/docker/lr-sgkit/Makefile new file mode 100644 index 000000000..3e467f819 --- /dev/null +++ b/docker/lr-sgkit/Makefile @@ -0,0 +1,17 @@ +IMAGE_NAME = lr-sgkit +VERSION = 0.5.0 # This should match the version number at https://github.com/pystatgen/sgkit/releases + +TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION) +TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest + +all: | build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +build_no_cache: + docker build --no-cache -t $(TAG1) -t $(TAG2) . 
+ +push: + docker push $(TAG1) + docker push $(TAG2) diff --git a/docker/lr-sgkit/environment.yml b/docker/lr-sgkit/environment.yml new file mode 100644 index 000000000..d5bc55fec --- /dev/null +++ b/docker/lr-sgkit/environment.yml @@ -0,0 +1,86 @@ +name: lr-sgkit +channels: + - conda-forge + - defaults +dependencies: + - _libgcc_mutex=0.1=conda_forge + - _openmp_mutex=4.5=2_gnu + - bzip2=1.0.8=h7f98852_4 + - ca-certificates=2022.12.7=ha878542_0 + - ld_impl_linux-64=2.39=hcc3a1bd_1 + - libffi=3.4.2=h7f98852_5 + - libgcc-ng=12.2.0=h65d4601_19 + - libgomp=12.2.0=h65d4601_19 + - libnsl=2.0.0=h7f98852_0 + - libsqlite=3.40.0=h753d276_0 + - libuuid=2.32.1=h7f98852_1000 + - libzlib=1.2.13=h166bdaf_4 + - ncurses=6.3=h27087fc_1 + - openssl=3.0.7=h0b41bf4_1 + - pip=22.3.1=pyhd8ed1ab_0 + - python=3.9.15=hba424b6_0_cpython + - readline=8.1.2=h0f457ee_0 + - setuptools=65.6.3=pyhd8ed1ab_0 + - tk=8.6.12=h27826a3_0 + - tzdata=2022g=h191b570_0 + - wheel=0.38.4=pyhd8ed1ab_0 + - xz=5.2.6=h166bdaf_0 + - pip: + - aiohttp==3.8.3 + - aiosignal==1.3.1 + - asciitree==0.3.3 + - async-timeout==4.0.2 + - attrs==22.2.0 + - certifi==2022.12.7 + - charset-normalizer==2.1.1 + - click==8.1.3 + - cloudpickle==2.2.0 + - coloredlogs==15.0.1 + - crcmod==1.7 + - cyvcf2==0.30.14 + - dask==2022.1.0 + - dask-glm==0.2.0 + - dask-ml==2022.5.27 + - distributed==2022.1.0 + - entrypoints==0.4 + - fasteners==0.18 + - frozenlist==1.3.3 + - fsspec==2022.11.0 + - heapdict==1.0.1 + - humanfriendly==10.0 + - idna==3.4 + - jinja2==3.1.2 + - joblib==1.2.0 + - llvmlite==0.39.1 + - locket==1.0.0 + - markupsafe==2.1.1 + - msgpack==1.0.4 + - multidict==6.0.4 + - multipledispatch==0.6.0 + - numba==0.56.4 + - numcodecs==0.11.0 + - numpy==1.21.6 + - packaging==23.0 + - pandas==1.3.5 + - partd==1.3.0 + - psutil==5.9.4 + - python-dateutil==2.8.2 + - pytz==2022.7.1 + - pyyaml==6.0 + - requests==2.28.2 + - scikit-learn==1.2.0 + - scipy==1.7.3 + - sgkit==0.5.0 + - six==1.16.0 + - sortedcontainers==2.4.0 + - tblib==1.7.0 + - 
threadpoolctl==3.1.0 + - toolz==0.12.0 + - tornado==6.2 + - typing-extensions==4.4.0 + - urllib3==1.26.14 + - xarray==2022.12.0 + - yarl==1.8.2 + - zarr==2.10.3 + - zict==2.2.0 +prefix: /opt/conda/envs/lr-sgkit diff --git a/docker/sr-malaria-niare-pipeline/Dockerfile b/docker/sr-malaria-niare-pipeline/Dockerfile new file mode 100644 index 000000000..3b08549ad --- /dev/null +++ b/docker/sr-malaria-niare-pipeline/Dockerfile @@ -0,0 +1,153 @@ +# Start with a good base python3 image: +FROM ubuntu:20.04 +MAINTAINER Jonn Smith + +# Make sure we don't need to interact with any package installations: +ARG DEBIAN_FRONTEND=noninteractive + +# Set the working directory to / +WORKDIR / + +######################################################################################################################## +# DEPENDENCIES + +# Install some dependencies for gsutil: +RUN apt-get --allow-releaseinfo-change update +RUN apt install -y curl git-lfs parallel + +# Setup crcmodc for gsutil: +RUN apt-get install -y gcc python3 python3-pip python3-dev python3-setuptools && \ + pip3 uninstall -y crcmod && \ + pip3 install --no-cache-dir -U crcmod + +# Install gsutil: +RUN curl https://sdk.cloud.google.com | bash + +# Dependencies for samtools: +RUN apt-get install -y bzip2 curl gnupg2 libc-dev ncurses-dev libcurl4-openssl-dev libssl-dev libbz2-dev liblzma-dev + +# Google cloud support for samtools: +RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] http://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list \ + && curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add - \ + && apt-get update -y \ + && apt-get install google-cloud-sdk -y + +# Additional Dependencies: +RUN apt install -y curl wget datamash pkg-config zip unzip default-jre python + 
+######################################################################################################################## +# SOFTWARE: + +RUN mkdir -p /usr/local/bin /usr/local/lib /usr/local/etc + +# BWA: +RUN wget https://github.com/lh3/bwa/releases/download/v0.7.15/bwakit-0.7.15_x64-linux.tar.bz2 && \ + tar -xf bwakit-0.7.15_x64-linux.tar.bz2 && \ + cd bwa.kit && \ + mv bwa fermi2 fermi2.pl htsbox k8 ropebwt2 run-HLA run-bwamem run-gen-ref samblaster samtools seqtk trimadap typeHLA.sh typeHLA-selctg.js typeHLA.js bwa-postalt.js /usr/local/bin && \ + mkdir /usr/local/etc/bwa && \ + mv README.md doc resource-GRCh38 resource-human-HLA /usr/local/etc/bwa && \ + cd .. && \ + rm -rf bwakit-0.7.15_x64-linux.tar.bz2 bwa.kit + +# TABIX: +RUN wget https://github.com/samtools/htslib/releases/download/1.18/htslib-1.18.tar.bz2 && \ + tar -xf htslib-1.18.tar.bz2 && \ + cd htslib-1.18 && \ + ./configure && make && make install && \ + cd .. \ + rm -rf htslib-1.18 htslib-1.18.tar.bz2 + +# VCF Tools: +RUN wget https://github.com/vcftools/vcftools/releases/download/v0.1.16/vcftools-0.1.16.tar.gz && \ + tar -xf vcftools-0.1.16.tar.gz && \ + cd vcftools-0.1.16 && \ + ./configure && make && make install && \ + cd .. \ + rm -rf vcftools-0.1.16 vcftools-0.1.16.tar.gz + +# bcftools: +RUN wget https://github.com/samtools/bcftools/releases/download/1.9/bcftools-1.9.tar.bz2 && \ + tar -xf bcftools-1.9.tar.bz2 && \ + cd bcftools-1.9 && \ + ./configure && make && make install && \ + cd .. \ + rm -rf bcftools-1.9 bcftools-1.9.tar.bz2 + +# Samtools: +# Get samtools source: +RUN wget https://github.com/samtools/samtools/releases/download/1.11/samtools-1.11.tar.bz2 && \ + tar -xjf samtools-1.11.tar.bz2 && \ + cd samtools-1.11 && \ + ./configure && make install && \ + cd .. 
&& \ + rm -rf samtools-1.11 samtools-1.11.tar.bz2 + +# gatk/4.2.2.0: +RUN wget https://github.com/broadinstitute/gatk/releases/download/4.2.2.0/gatk-4.2.2.0.zip && \ + unzip gatk-4.2.2.0.zip && \ + cd gatk-4.2.2.0 && \ + mv gatk gatk-completion.sh gatk-package-4.2.2.0-local.jar gatk-package-4.2.2.0-spark.jar /usr/local/bin && \ + mkdir -p /usr/local/etc/gatk && \ + mv GATKConfig.EXAMPLE.properties README.md gatkPythonPackageArchive.zip gatkcondaenv.yml gatkdoc scripts /usr/local/etc/gatk && \ + cd .. && \ + rm -rf gatk-4.2.2.0.zip gatk-4.2.2.0 + +# zlib/1.2.11: +RUN wget https://www.zlib.net/fossils/zlib-1.2.11.tar.gz && \ + tar -xf zlib-1.2.11.tar.gz && \ + cd zlib-1.2.11 && \ + ./configure && make && make install && \ + cd .. && \ + rm -rf zlib-1.2.11 zlib-1.2.11.tar.gz + +# plink/1.90: +RUN mkdir plink && \ + cd plink && \ + wget https://s3.amazonaws.com/plink1-assets/plink_linux_x86_64_20231018.zip && \ + unzip plink_linux_x86_64_20231018.zip && \ + rm plink_linux_x86_64_20231018.zip && \ + mv plink prettify /usr/local/bin && \ + mkdir /usr/local/etc/plink && \ + mv toy.ped toy.map /usr/local/etc/plink/ && \ + cd .. 
&& \ + rm -rf plink + +# trimmomatic/0.36: +RUN wget http://www.usadellab.org/cms/uploads/supplementary/Trimmomatic/Trimmomatic-0.36.zip && \ + mkdir -p /opt && \ + unzip Trimmomatic-0.36.zip && \ + mv Trimmomatic-0.36 /opt/ && \ + echo "#!/usr/bin/env bash\n\njava -jar /opt/Trimmomatic-0.36/trimmomatic-0.36.jar PE $@\n\n" > /usr/local/bin/TrimmomaticPE && \ + echo "#!/usr/bin/env bash\n\njava -jar /opt/Trimmomatic-0.36/trimmomatic-0.36.jar SE $@\n\n" > /usr/local/bin/TrimmomaticSE && \ + chmod +x /usr/local/bin/Trimmomatic*E && \ + rm Trimmomatic-0.36.zip + +######################################################################################################################## +######################################################################################################################## +######################################################################################################################## + +# Other utilities: +RUN apt-get clean +RUN apt install -y vim emacs nano +RUN apt-get clean + +######################################################################################################################## +######################################################################################################################## +######################################################################################################################## + +# Might not need these: + +# gsl/2.7.1: + +# RAiSD/2.8: + +# R/4.1.0: + +# sratoolkit/2.8.2-1: +# BROKEN +#RUN wget https://github.com/ncbi/sra-tools/archive/refs/tags/2.8.2-1.tar.gz && \ +# tar -xf 2.8.2-1.tar.gz && \ +# cd sra-tools-2.8.2-1 && \ + + diff --git a/docker/sr-malaria-niare-pipeline/Makefile b/docker/sr-malaria-niare-pipeline/Makefile new file mode 100644 index 000000000..45c266872 --- /dev/null +++ b/docker/sr-malaria-niare-pipeline/Makefile @@ -0,0 +1,18 @@ +IMAGE_NAME = sr-malaria-niare-pipeline +VERSION = 0.0.1 + +TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION) +TAG2 = 
us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest + +all: | build push + +build: + docker build -t $(TAG1) -t $(TAG2) . + +build_no_cache: + docker build --no-cache -t $(TAG1) -t $(TAG2) . + +push: + docker push $(TAG1) + docker push $(TAG2) + diff --git a/docker/sr-utils/Dockerfile b/docker/sr-utils/Dockerfile index f2c68da1d..cc0adc19c 100644 --- a/docker/sr-utils/Dockerfile +++ b/docker/sr-utils/Dockerfile @@ -9,5 +9,15 @@ COPY ./environment.yml / RUN conda env create -f /environment.yml && conda clean -a ENV PATH=/opt/conda/envs/sr-utils/bin/:/root/google-cloud-sdk/bin/:${PATH} +# Install BWA-MEM2: +RUN wget https://github.com/bwa-mem2/bwa-mem2/releases/download/v2.2.1/bwa-mem2-2.2.1_x64-linux.tar.bz2 && \ + tar -xf bwa-mem2-2.2.1_x64-linux.tar.bz2 && \ + mv bwa-mem2-2.2.1_x64-linux /opt/ && \ + for f in $(find /opt/bwa-mem2-2.2.1_x64-linux/ -type f -name \*bwa-mem\* ) ; do ln -s $f /usr/local/bin/$(basename $f) ; done && \ + rm bwa-mem2-2.2.1_x64-linux.tar.bz2 + +COPY ./python /python + # set LD library path ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/conda/envs/sr-utils/lib/ + diff --git a/docker/sr-utils/Makefile b/docker/sr-utils/Makefile index e2911596b..cbfd359fd 100644 --- a/docker/sr-utils/Makefile +++ b/docker/sr-utils/Makefile @@ -1,5 +1,5 @@ IMAGE_NAME = sr-utils -VERSION = 0.1.0 +VERSION = 0.2.1 TAG1 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):$(VERSION) TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest @@ -7,11 +7,12 @@ TAG2 = us.gcr.io/broad-dsp-lrma/$(IMAGE_NAME):latest all: | build push build: - docker build -t $(TAG1) -t $(TAG2) . + docker build -t $(TAG1) -t $(TAG2) . build_no_cache: - docker build --no-cache -t $(TAG1) -t $(TAG2) . + docker build --no-cache -t $(TAG1) -t $(TAG2) . 
push: - docker push $(TAG1) - docker push $(TAG2) + docker push $(TAG1) + docker push $(TAG2) + diff --git a/docker/sr-utils/environment.yml b/docker/sr-utils/environment.yml index 537e00470..ebbc6f0bd 100644 --- a/docker/sr-utils/environment.yml +++ b/docker/sr-utils/environment.yml @@ -6,3 +6,7 @@ channels: dependencies: - samtools - bwa + - pysam + - numpy + - tqdm + diff --git a/docker/sr-utils/python/compute_sr_stats.py b/docker/sr-utils/python/compute_sr_stats.py new file mode 100644 index 000000000..aeb2e9fe9 --- /dev/null +++ b/docker/sr-utils/python/compute_sr_stats.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python3 + +import numpy as np +import pysam +import argparse +from tqdm import tqdm + +def n50(lengths): + all_len = sorted(lengths, reverse=True) + csum = np.cumsum(all_len) + n2 = int(sum(lengths) / 2) + csumn2 = min(csum[csum >= n2]) + ind = np.where(csum == csumn2) + + return all_len[int(ind[0])] + + +def get_bam_stats(bam_file_path, qual_thresh=None): + # Open the file and get ready to iterate: + with pysam.AlignmentFile(bam_file_path, "rb", check_sq=False, require_index=False) as bam_file: + + # Get total number of reads if we have an index: + total_reads = None + if bam_file.has_index(): + idx_stats = bam_file.get_index_statistics() + unaligned_reads = bam_file.nocoordinate + aligned_reads = reduce(lambda a, b: a + b, [x.total for x in idx_stats]) if len(idx_stats) > 0 else 0 + total_reads = unaligned_reads + aligned_reads + + n_reads = 0 if not total_reads else total_reads + read_lengths = [] + quals = [] + total_bases = 0 + + # Iterate through our reads + for read in tqdm(bam_file, desc=f"Collecting Bam Stats" + (f" (rq >= {qual_thresh})" if qual_thresh else ""), + total=total_reads, unit=" read"): + l = len(read.query_sequence) + q = np.mean(read.query_qualities) + + if qual_thresh and q < qual_thresh: + continue + + quals.append(q) + total_bases += l + read_lengths.append(l) + + if not total_reads: + n_reads += 1 + + return n_reads, total_bases, 
np.mean(quals), np.median(quals), np.array(read_lengths) + + +def main(): + parser = argparse.ArgumentParser(description='Compute short read bam file stats', prog='compute_sr_stats') + parser.add_argument('-q', '--qual-threshold', type=int, default=0, help="Phred-scale quality threshold") + parser.add_argument('bam_file_path', type=str, help="Path to bam file") + args = parser.parse_args() + + n_reads, n_bases, mean_qual, median_qual, read_lengths = get_bam_stats(args.bam_file_path, args.qual_threshold) + + print(f"reads\t{n_reads}") + print(f"bases\t{n_bases}") + print(f"mean_qual\t{mean_qual}") + print(f"median_qual\t{median_qual}") + + print(f"read_mean\t{int(np.mean(read_lengths)) if len(read_lengths) > 0 else 0}") + print(f"read_median\t{int(np.median(read_lengths)) if len(read_lengths) > 0 else 0}") + print(f"read_stdev\t{int(np.std(read_lengths)) if len(read_lengths) > 0 else 0}") + print(f"read_n50\t{n50(read_lengths) if len(read_lengths) > 0 else 0}") + + +if __name__ == "__main__": + main() + diff --git a/extra_documentation/lrma_sp_malaria_barcodes/barcode_summary.png b/extra_documentation/lrma_sp_malaria_barcodes/barcode_summary.png new file mode 100644 index 000000000..97a56ccfa Binary files /dev/null and b/extra_documentation/lrma_sp_malaria_barcodes/barcode_summary.png differ diff --git a/extra_documentation/lrma_sp_malaria_barcodes/pipeline_GenEpi_summary.png b/extra_documentation/lrma_sp_malaria_barcodes/pipeline_GenEpi_summary.png new file mode 100644 index 000000000..6153c64af Binary files /dev/null and b/extra_documentation/lrma_sp_malaria_barcodes/pipeline_GenEpi_summary.png differ diff --git a/extra_documentation/lrma_sp_malaria_barcodes/pipeline_QC_summary.png b/extra_documentation/lrma_sp_malaria_barcodes/pipeline_QC_summary.png new file mode 100644 index 000000000..4b2a2a002 Binary files /dev/null and b/extra_documentation/lrma_sp_malaria_barcodes/pipeline_QC_summary.png differ diff --git 
a/extra_documentation/lrma_sp_malaria_barcodes/pipeline_summary.png b/extra_documentation/lrma_sp_malaria_barcodes/pipeline_summary.png new file mode 100644 index 000000000..3803066d2 Binary files /dev/null and b/extra_documentation/lrma_sp_malaria_barcodes/pipeline_summary.png differ diff --git a/extra_documentation/sp_malaria/lrma_sr_malaria_pipeline_diagram.png b/extra_documentation/sp_malaria/lrma_sr_malaria_pipeline_diagram.png new file mode 100644 index 000000000..d41830eda Binary files /dev/null and b/extra_documentation/sp_malaria/lrma_sr_malaria_pipeline_diagram.png differ diff --git a/extra_documentation/sp_malaria/lrma_sr_malaria_pipeline_diagram_high_level.png b/extra_documentation/sp_malaria/lrma_sr_malaria_pipeline_diagram_high_level.png new file mode 100644 index 000000000..708bba568 Binary files /dev/null and b/extra_documentation/sp_malaria/lrma_sr_malaria_pipeline_diagram_high_level.png differ diff --git a/extra_documentation/sp_malaria/workspace_markdown_doc.md b/extra_documentation/sp_malaria/workspace_markdown_doc.md new file mode 100644 index 000000000..cd447058f --- /dev/null +++ b/extra_documentation/sp_malaria/workspace_markdown_doc.md @@ -0,0 +1,36 @@ +# _P. falciparum_ Short Read Whole Genome Workspace +This is the workspace for short read whole genome variant discovery and analysis in _Plasmodium falciparum_. This workspace can call variants in a single-sample, joint call cohorts of samples, and perform various tertiary analyses (e.g. drug resistance screening, rapid diagnostic test evasion screening, etc.). + +While the current focus of this workspace is _P. falciparum_, the processing steps here are generalized and can be adapted to other _Plasmodium_ species. + +## Variant Calling Pipeline + +As part of this workspace there are workflows to call variants on both single samples, and for joint calling across cohorts of samples. 
+ +The main variant calling pipeline has the following high-level structure: + +![LRMA SP Malaria Variant Calling](https://github.com/broadinstitute/long-read-pipelines/raw/jts_kvg_sp_malaria/extra_documentation/sp_malaria/lrma_sr_malaria_pipeline_diagram_high_level.png) + +## Data + +### Datasets + +The following datasets are currently in this workspace: +- [PF7](https://www.malariagen.net/apps/pf7/) +- The MalariaGEN [crosses](https://www.malariagen.net/parasite/p-falciparum-genetic-crosses) +- 2022 data collected in Senegal +- 2019 data collected in Senegal + +### Data Structure + +The data processing is broken down into three levels (similar to other LRMA projects) in the following Terra data tables: +* Sample (flowcell data) +* Sample Set (sample data / single-sample calling) +* Sample Set Set (cohort data for joint calling) + +_Sample / Flowcell_ data consists of reads from a single flowcell. The sample from which these reads have been processed may or may not be represented in other flowcells. + +_Sample Set_ data consists of all data from a specific sample. This may include data from multiple flowcells that belong to the same "participant" (i.e. same strain / clone). + +_Sample Set Set / Cohort_ data consists of data from multiple samples. + diff --git a/scripts/monitor/legacy/vm_local_monitoring_script.sh b/scripts/monitor/legacy/vm_local_monitoring_script.sh index b504fca70..ba154a2b8 100644 --- a/scripts/monitor/legacy/vm_local_monitoring_script.sh +++ b/scripts/monitor/legacy/vm_local_monitoring_script.sh @@ -22,7 +22,7 @@ # cloud storage folder. 
set -Eeuo pipefail -MONITOR_MOUNT_POINT=${MONITOR_MOUNT_POINT:-"/"} +MONITOR_MOUNT_POINT=${MONITOR_MOUNT_POINT:-"/cromwell_root"} SLEEP_TIME=${SLEEP_TIME:-"10"} function getCpuUsage() { diff --git a/wdl/BenchmarkVCFs.wdl b/wdl/BenchmarkVCFs.wdl new file mode 100644 index 000000000..786601738 --- /dev/null +++ b/wdl/BenchmarkVCFs.wdl @@ -0,0 +1,1562 @@ +version 1.0 + +# Borrowed and adapted from the Broad Institute's Hydrogen/Palantir repo, courtesy of Michael Gatzen: +# +# https://github.com/broadinstitute/palantir-workflows/tree/mg_benchmark_compare/BenchmarkVCFs +# +# Permalink: +# https://github.com/broadinstitute/palantir-workflows/blob/0bf48efc6de818364993e46d89591a035cfd80c7/BenchmarkVCFs/BenchmarkVCFs.wdl + +workflow Benchmark { + input{ + + File evalVcf + String evalLabel + File evalVcfIndex + File? evalBam + String? evalBamLabel + + File truthVcf + String truthLabel + File truthVcfIndex + File? truthBam + String? truthBamLabel + + File confidenceInterval + + File ref_map_file + + String? analysisRegion + File? hapMap + + Array[File] stratIntervals = [] + Array[String] stratLabels = [] + Array[String]? jexlVariantSelectors + Array[String]? variantSelectorLabels + + Int? threadsVcfEval = 2 + Boolean doIndelLengthStratification = true + Int? preemptible + String gatkTag="4.0.11.0" + Boolean requireMatchingGenotypes = true + Boolean truthIsSitesOnlyVcf = false + File? gatkJarForAnnotation + Array[String]? annotationNames + Boolean enableRefOverlap = false + Boolean passingOnly = true + String? vcfScoreField + String? dummyInputForTerraCallCaching + } + + meta { + description: "A workflow to calculate sensitivity and precision of a germline variant calling pipeline by comparing a 'call' vcf produced by the pipeline to a gold standard 'truth' vcf. Allows for stratification based on interval lists, bed files, or variant types defined according to GATK SelectVariants." 
+ } + + parameter_meta { + evalVcf: {description: "vcfs to be evaluated"} + evalLabel: {description: "label to identify vcf to be evaluated"} + evalVcfIndex: {description: "vcf index for evalVcf"} + evalBam: {description: "bam file contaning the reads that generated the evalVcf"} + evalBamLabel: {description: "label to use for the evalBam in IGV"} + truthVcf: {description: "truth vcf against which to evaluate"} + truthLabel: {description: "label by which to indentify truth set"} + truthBam: {description: "bam file contaning the reads that generated the truthVcf"} + truthBamLabel: {description: "label to use for the truthBam in IGV"} + confidenceInterval: {description: "confidence interval for truth set (can be bed or picard interval_list)"} + ref_map_file: {description: "table indicating reference sequence and auxillary file locations" } + hapMap: {description: "reference haplotype map for CrosscheckFingerprints"} + stratIntervals: {description: "intervals for stratifiction (can be picard interval_list or bed format)"} + stratLabels: {description: "labels by which to identify stratification intervals (must be same length as stratIntervals)"} + jexlVariantSelectors: {description: "variant types to select over (defined by jexl fed to GATK SelectVariants)"} + variantSelectorLabels: {description: "labels by which to identify variant selectors (must be same length as jexlVariantSelectors)"} + doIndelLengthStratification: {description: "whether or not to perform stratification by indel length"} + requireMatchingGenotypes: {description: "whether to require genotypes to match in order to be a true positive"} + truthIsSitesOnlyVcf: {description: "whether the truth VCF is a sites-only VCF file without any sample information"} + gatkTag: {description: "version of gatk docker to use. 
Defaults to 4.0.11.0"} + analysisRegion: {description: "if provided (gatk format, single interval e.g., 'chr20', or 'chr20:1-10') all the analysis will be performed only within the region."} + passingOnly: {description:"Have vcfEval only consider the passing variants"} + vcfScoreField: {description:"Have vcfEval use this field for making the roc-plot. If this is an info field (like VSQLOD) it should be provided as INFO.VQSLOD, otherewise it is assumed to be a format field."} + gatkJarForAnnotation: {description:"GATK jar that can calculate necessary annotations for jexl Selections when using VCFEval."} + annotationNames: {description:"Annotation arguments to GATK (-A argument, multiple OK)"} + dummyInputForTerraCallCaching: {description:"When running on Terra, use workspace.name as this input to ensure that all tasks will only cache hit to runs in your own workspace. This will prevent call caching from failing with 'Cache Miss (10 failed copy attempts)'. Outside of Terra this can be left empty. 
This dummy input is only needed for tasks that have no inputs specific to the sample being run (such as CreateIntervalList which does not take in any sample data)."} + } + + # Get ref info: + Map[String, String] ref_map = read_map(ref_map_file) + + if (defined(analysisRegion)) { + call CreateIntervalList { + input: + reference = ref_map["fasta"], + reference_index = ref_map["fai"], + reference_dict = ref_map["dict"], + interval_string = select_first([analysisRegion]), + gatkTag = gatkTag, + dummyInputForTerraCallCaching = dummyInputForTerraCallCaching + } + } + + Array[File] actualStratIntervals = flatten([[""], stratIntervals]) + Array[String] actualStratLabels = flatten([[""], stratLabels]) + Array[String] actualSelectorLabels = select_first([variantSelectorLabels,[""]]) + Array[String] actualSelectorJEXL = select_first([jexlVariantSelectors,[""]]) + + #check that lengths of different arrays are compatible + if (length(actualStratLabels)!= length(actualStratIntervals)) { + call ErrorWithMessage as Error6 { + input: + message="Stratification vcf list is length "+length(actualStratIntervals)+" while stratification labels list is length "+length(actualStratLabels) + } + } + + if (length(actualSelectorLabels) != length(actualSelectorJEXL)) { + call ErrorWithMessage as Error7 { + input: + message="Variant selector list is length "+length(actualSelectorJEXL)+" while labels list is "+length(actualSelectorLabels) + } + } + + if (defined(hapMap)) { + call MatchEvalTruth as Match { + input: + evalVcf = evalVcf, + truthVcf = truthVcf, + evalVcfIndex = evalVcfIndex, + truthVcfIndex = truthVcfIndex, + hapMap = select_first([hapMap]), + gatkTag = gatkTag, + preemptible = preemptible + } + } + Array[String] indelLabels=["deletion","insertion","indel_fine_m20","indel_fine_m19","indel_fine_m18","indel_fine_m17","indel_fine_m16","indel_fine_m15", + "indel_fine_m14","indel_fine_m13","indel_fine_m12","indel_fine_m11","indel_fine_m10","indel_fine_m9","indel_fine_m8","indel_fine_m7", 
+ "indel_fine_m6","indel_fine_m5","indel_fine_m4","indel_fine_m3","indel_fine_m2","indel_fine_m1","indel_fine_1","indel_fine_2","indel_fine_3", + "indel_fine_4","indel_fine_5","indel_fine_6","indel_fine_7","indel_fine_8","indel_fine_9","indel_fine_10","indel_fine_11","indel_fine_12", + "indel_fine_13","indel_fine_14","indel_fine_15","indel_fine_16","indel_fine_17","indel_fine_18","indel_fine_19","indel_fine_20","indel_coarse_m30.0", + "indel_coarse_m25.0","indel_coarse_m20.0","indel_coarse_m15.0","indel_coarse_m10.0","indel_coarse_m5.0","indel_coarse_0.0","indel_coarse_5.0", + "indel_coarse_10.0","indel_coarse_15.0","indel_coarse_20.0","indel_coarse_25.0","indel_coarse_30.0"] + + Array[String] indelJexl=["vc.isSimpleIndel() && vc.getIndelLengths().0<0","vc.isSimpleIndel() && vc.getIndelLengths().0>0","vc.isSimpleIndel() && vc.getIndelLengths().0==-20", + "vc.isSimpleIndel() && vc.getIndelLengths().0==-19","vc.isSimpleIndel() && vc.getIndelLengths().0==-18","vc.isSimpleIndel() && vc.getIndelLengths().0==-17", + "vc.isSimpleIndel() && vc.getIndelLengths().0==-16","vc.isSimpleIndel() && vc.getIndelLengths().0==-15","vc.isSimpleIndel() && vc.getIndelLengths().0==-14", + "vc.isSimpleIndel() && vc.getIndelLengths().0==-13","vc.isSimpleIndel() && vc.getIndelLengths().0==-12","vc.isSimpleIndel() && vc.getIndelLengths().0==-11", + "vc.isSimpleIndel() && vc.getIndelLengths().0==-10","vc.isSimpleIndel() && vc.getIndelLengths().0==-9","vc.isSimpleIndel() && vc.getIndelLengths().0==-8", + "vc.isSimpleIndel() && vc.getIndelLengths().0==-7","vc.isSimpleIndel() && vc.getIndelLengths().0==-6","vc.isSimpleIndel() && vc.getIndelLengths().0==-5", + "vc.isSimpleIndel() && vc.getIndelLengths().0==-4","vc.isSimpleIndel() && vc.getIndelLengths().0==-3","vc.isSimpleIndel() && vc.getIndelLengths().0==-2", + "vc.isSimpleIndel() && vc.getIndelLengths().0==-1","vc.isSimpleIndel() && vc.getIndelLengths().0==1","vc.isSimpleIndel() && vc.getIndelLengths().0==2", + "vc.isSimpleIndel() && 
vc.getIndelLengths().0==3","vc.isSimpleIndel() && vc.getIndelLengths().0==4","vc.isSimpleIndel() && vc.getIndelLengths().0==5", + "vc.isSimpleIndel() && vc.getIndelLengths().0==6","vc.isSimpleIndel() && vc.getIndelLengths().0==7","vc.isSimpleIndel() && vc.getIndelLengths().0==8", + "vc.isSimpleIndel() && vc.getIndelLengths().0==9","vc.isSimpleIndel() && vc.getIndelLengths().0==10","vc.isSimpleIndel() && vc.getIndelLengths().0==11", + "vc.isSimpleIndel() && vc.getIndelLengths().0==12","vc.isSimpleIndel() && vc.getIndelLengths().0==13","vc.isSimpleIndel() && vc.getIndelLengths().0==14", + "vc.isSimpleIndel() && vc.getIndelLengths().0==15","vc.isSimpleIndel() && vc.getIndelLengths().0==16","vc.isSimpleIndel() && vc.getIndelLengths().0==17", + "vc.isSimpleIndel() && vc.getIndelLengths().0==18","vc.isSimpleIndel() && vc.getIndelLengths().0==19","vc.isSimpleIndel() && vc.getIndelLengths().0==20", + "vc.isSimpleIndel() && vc.getIndelLengths().0<-27.5 && vc.getIndelLengths().0>-32.5","vc.isSimpleIndel() && vc.getIndelLengths().0<-22.5 && vc.getIndelLengths().0>-27.5", + "vc.isSimpleIndel() && vc.getIndelLengths().0<-17.5 && vc.getIndelLengths().0>-22.5","vc.isSimpleIndel() && vc.getIndelLengths().0<-12.5 && vc.getIndelLengths().0>-17.5", + "vc.isSimpleIndel() && vc.getIndelLengths().0<-7.5 && vc.getIndelLengths().0>-12.5","vc.isSimpleIndel() && vc.getIndelLengths().0<-2.5 && vc.getIndelLengths().0>-7.5", + "vc.isSimpleIndel() && vc.getIndelLengths().0<2.5 && vc.getIndelLengths().0>-2.5","vc.isSimpleIndel() && vc.getIndelLengths().0<7.5 && vc.getIndelLengths().0>2.5", + "vc.isSimpleIndel() && vc.getIndelLengths().0<12.5 && vc.getIndelLengths().0>7.5","vc.isSimpleIndel() && vc.getIndelLengths().0<17.5 && vc.getIndelLengths().0>12.5", + "vc.isSimpleIndel() && vc.getIndelLengths().0<22.5 && vc.getIndelLengths().0>17.5","vc.isSimpleIndel() && vc.getIndelLengths().0<27.5 && vc.getIndelLengths().0>22.5", + "vc.isSimpleIndel() && vc.getIndelLengths().0<32.5 && 
vc.getIndelLengths().0>27.5"] + + scatter (indel in zip(indelLabels,indelJexl)) { + VariantSelector indelSelectors = object{ jexl : indel.right, + label : indel.left + } + } + + if (defined(jexlVariantSelectors)) { + scatter (select in zip(actualSelectorLabels,actualSelectorJEXL)) { + VariantSelector variantSelectors = object{ jexl: select.right, + label : select.left + } + } + } + + Array[VariantSelector] defaultVS = [object{ jexl: "vc.isIndel() && vc.getHetCount() == 1", label: "HetIndel" }, + object{jexl: "vc.isIndel() && vc.getHomVarCount() == 1", label: "HomVarIndel"}, + object{jexl: "vc.isSNP() && vc.getHetCount() == 1", label: "HetSNP"}, + object{jexl: "vc.isSNP() && vc.getHomVarCount() == 1", label: "HomVarSNP"}] + Array[VariantSelector] actualVariantSelectors = flatten(select_all([defaultVS,variantSelectors])) + + if (defined(stratIntervals)) { + scatter (stratIL in actualStratIntervals) { + if(stratIL!="") { + call ConvertIntervals as StratConvertIntervals { + input: + inputIntervals = stratIL, + refDict = ref_map["dict"], + gatkTag = gatkTag, + subset_interval = CreateIntervalList.interval_list, + preemptible = preemptible, + dummyInputForTerraCallCaching = dummyInputForTerraCallCaching + + } + } + } + } + Array[File] stratBeds = select_all(flatten(select_all([[""],StratConvertIntervals.bed]))) + Array[File] stratILs = select_all(flatten(select_all([[""],StratConvertIntervals.intervalList]))) + + scatter (strat in zip(zip(stratILs,stratBeds),actualStratLabels)) { + Stratifier stratifiers = object {intervalList : strat.left.left, + bed : strat.left.right, + label : strat.right + } + } + + call ConvertIntervals as ConfidenceConvertIntervals { + input: + inputIntervals = confidenceInterval, + refDict = ref_map["dict"], + gatkTag = gatkTag, + preemptible = preemptible, + subset_interval = CreateIntervalList.interval_list, + dummyInputForTerraCallCaching = dummyInputForTerraCallCaching + } + + scatter (stratifier in stratifiers) { + + if (stratifier.label != 
"" && stratifier.intervalList != "") { + String stratLabel = select_first([stratifier.label,""]) + File stratIL = select_first([stratifier.intervalList,""]) + File stratBed = select_first([stratifier.bed,""]) + String outputPreStrat = evalLabel+"_"+truthLabel+"_"+stratLabel + } + String outputPrefix = select_first([outputPreStrat,evalLabel+"_"+truthLabel]) + + call CheckForVariants as CheckForVariantsEval { + input: + vcf = evalVcf, + vcfIndex = evalVcfIndex, + confidenceIL = ConfidenceConvertIntervals.intervalList, + stratIL = stratIL, + gatkTag = gatkTag, + preemptible = preemptible + } + + call CheckForVariants as CheckForVariantsTruth { + input: + vcf = truthVcf, + vcfIndex = truthVcfIndex, + confidenceIL = ConfidenceConvertIntervals.intervalList, + stratIL = stratIL, + gatkTag = gatkTag, + preemptible = preemptible + } + + if (CheckForVariantsTruth.variantsFound && CheckForVariantsEval.variantsFound) { + call VcfEval as StandardVcfEval { + input: + truthVCF = truthVcf, + truthVCFIndex = truthVcfIndex, + evalVCF = evalVcf, + evalVCFIndex = evalVcfIndex, + confidenceBed = ConfidenceConvertIntervals.bed, + stratBed = stratBed, + ref = ref_map["fasta"], + refDict = ref_map["dict"], + refIndex = ref_map["fai"], + outputPre = outputPrefix+"_vcfeval", + threads = threadsVcfEval, + preemptible = preemptible, + requireMatchingGenotypes = requireMatchingGenotypes, + truthIsSitesOnlyVcf = truthIsSitesOnlyVcf, + passingOnly = passingOnly, + vcfScoreField = vcfScoreField, + enableRefOverlap = enableRefOverlap + } + + call WriteXMLfile as VcfEvalWriteXMLfile { + input: + input_files = flatten([select_all([StandardVcfEval.outVcf,ConfidenceConvertIntervals.bed,stratifier.bed]), select_all([evalBam, truthBam])]), + input_names = flatten([select_all([outputPrefix+"_vcfeval","confidence_intervals",stratifier.label]), select_all([evalBamLabel, truthBamLabel])]), + reference_version = ref_map["fasta"], + file_name = outputPrefix+"_vcfeval" + } + + call CountUNKVcfEval { + input: + 
vcf = StandardVcfEval.outVcf, + vcfIndex = StandardVcfEval.outVcfIndex, + gatkTag = gatkTag, + preemptible = preemptible + } + } + + String areVariants = if(CheckForVariantsTruth.variantsFound && CheckForVariantsEval.variantsFound) then "yes" else "no" + call SummariseVcfEval { + input: + evalLabel = evalLabel, + truthLabel = truthLabel, + stratLabel = stratLabel, + summaryFile = StandardVcfEval.outSummary, + igvSession = VcfEvalWriteXMLfile.igv_session, + areVariants = areVariants, + unkSNP = CountUNKVcfEval.UNK_SNP, + unkINDEL = CountUNKVcfEval.UNK_INDEL, + preemptible = preemptible + } + } + + scatter ( i in range(length(stratifiers)) ) { + AnnotatedVcfs annotatedVcfsList = object{vcfVcfEval : StandardVcfEval.outVcf[i], + vcfVcfEvalIndex : StandardVcfEval.outVcfIndex[i], + stratLabel : stratifiers[i].label, + evalLabel : evalLabel, + truthLabel : truthLabel, + stratBed : stratBed[i], + confidenceBed : ConfidenceConvertIntervals.bed, + namePrefix : outputPrefix[i] + } + } + + + scatter (indelCombo in cross(annotatedVcfsList,indelSelectors)) { + EvalStratSelectorCombo evalStratIndelCombos = object{annotatedVcfs : indelCombo.left, + variantSelector : indelCombo.right + } + } + + scatter (evalStratIndelCombo in evalStratIndelCombos) { + String jexl = evalStratIndelCombo.variantSelector.jexl + File? vcfVcfEval = evalStratIndelCombo.annotatedVcfs.vcfVcfEval + File? vcfVcfEvalIndex = evalStratIndelCombo.annotatedVcfs.vcfVcfEvalIndex + String evalIndelLabel = evalStratIndelCombo.annotatedVcfs.evalLabel + String truthIndelLabel = evalStratIndelCombo.annotatedVcfs.truthLabel + String? stratIndelLabel = evalStratIndelCombo.annotatedVcfs.stratLabel + String indelLabel = evalStratIndelCombo.variantSelector.label + File? stratIndelBed = evalStratIndelCombo.annotatedVcfs.stratBed + File? 
confidenceBed = evalStratIndelCombo.annotatedVcfs.confidenceBed + String namePrefix = evalStratIndelCombo.annotatedVcfs.namePrefix+"_"+indelLabel + + if (defined(vcfVcfEval) && defined(vcfVcfEvalIndex) && doIndelLengthStratification) { + call EvalForVariantSelection as EvalIndelLengthVcfEval { + input: + vcf = vcfVcfEval, + vcfIndex = vcfVcfEvalIndex, + jexl = jexl, + engine="VcfEval", + selectTPCall="CALL == 'TP'", + selectTPBase="BASE == 'TP'", + selectFN="(BASE == 'FN' || BASE == 'FN_CA')", + selectFP="(CALL == 'FP' || CALL == 'FP_CA')", + sampleCall="CALLS", + sampleBase="BASELINE", + gatkTag = gatkTag, + preemptible = preemptible, + gatkJarForAnnotation = gatkJarForAnnotation, + annotationNames = annotationNames, + reference = ref_map["fasta"], + refDict = ref_map["dict"], + refIndex = ref_map["fai"] + } + + call WriteXMLfile as VcfEvalIndelWriteXMLfile { + input: + input_files = flatten([select_all([EvalIndelLengthVcfEval.selectedTPCall,EvalIndelLengthVcfEval.selectedTPBase,EvalIndelLengthVcfEval.selectedFP,EvalIndelLengthVcfEval.selectedFN,vcfVcfEval,confidenceBed,stratIndelBed]), select_all([evalBam, truthBam])]), + input_names = flatten([select_all(["TP_Eval","TP_Base","FP","FN","All_Variants","confidence_intervals",stratIndelLabel]), select_all([evalBamLabel, truthBamLabel])]), + reference_version = ref_map["fasta"], + file_name = namePrefix+"_vcfeval" + } + + call SummariseForIndelSelection as VcfEvalSummariseForIndelSelection { + input: + evalLabel = evalIndelLabel, + truthLabel = truthIndelLabel, + stratLabel = stratIndelLabel, + indelLabel = indelLabel, + engine="VcfEval", + igvSession = VcfEvalIndelWriteXMLfile.igv_session, + TP_CALL = EvalIndelLengthVcfEval.TP_CALL, + TP_BASE = EvalIndelLengthVcfEval.TP_BASE, + FP = EvalIndelLengthVcfEval.FP, + FN = EvalIndelLengthVcfEval.FN, + preemptible = preemptible + } + } + } + + + + + scatter (selectorCombo in cross(annotatedVcfsList,actualVariantSelectors)) { + EvalStratSelectorCombo evalStratSelectorCombos 
= object{annotatedVcfs : selectorCombo.left, + variantSelector : selectorCombo.right + } + } + scatter (evalStratSelectorCombo in evalStratSelectorCombos) { + if (defined(evalStratSelectorCombo.annotatedVcfs.vcfVcfEval) && defined(evalStratSelectorCombo.annotatedVcfs.vcfVcfEvalIndex)) { + call EvalForVariantSelection as EvalSelectorVcfEval { + input: + vcf = evalStratSelectorCombo.annotatedVcfs.vcfVcfEval, + vcfIndex = evalStratSelectorCombo.annotatedVcfs.vcfVcfEvalIndex, + jexl = evalStratSelectorCombo.variantSelector.jexl, + engine="VcfEval", + selectTPCall="CALL == 'TP'", + selectTPBase="BASE == 'TP'", + selectFN="(BASE == 'FN' || BASE == 'FN_CA')", + selectFP="(CALL == 'FP' || CALL == 'FP_CA')", + sampleCall="CALLS", + sampleBase="BASELINE", + gatkTag = gatkTag, + preemptible = preemptible, + gatkJarForAnnotation = gatkJarForAnnotation, + annotationNames = annotationNames, + reference = ref_map["fasta"], + refDict = ref_map["dict"], + refIndex = ref_map["fai"] + } + call WriteXMLfile as VcfEvalSelectorWriteXMLfile { + input: + input_files = flatten([select_all([EvalSelectorVcfEval.selectedTPCall,EvalSelectorVcfEval.selectedTPBase,EvalSelectorVcfEval.selectedFP,EvalSelectorVcfEval.selectedFN, + evalStratSelectorCombo.annotatedVcfs.vcfVcfEval,evalStratSelectorCombo.annotatedVcfs.confidenceBed,evalStratSelectorCombo.annotatedVcfs.stratBed]), select_all([evalBam, truthBam])]), + input_names = flatten([select_all(["TP_Eval","TP_Base","FP","FN","All_Variants","confidence_intervals",evalStratSelectorCombo.annotatedVcfs.stratLabel]), select_all([evalBamLabel, truthBamLabel])]), + reference_version = ref_map["fasta"], + file_name = evalStratSelectorCombo.annotatedVcfs.namePrefix+"_"+evalStratSelectorCombo.variantSelector.label+"_vcfeval" + } + + call SummariseForVariantSelection as VcfEvalSummariseForVariantSelection { + input: + evalLabel = evalStratSelectorCombo.annotatedVcfs.evalLabel, + truthLabel = evalStratSelectorCombo.annotatedVcfs.truthLabel, + stratLabel = 
evalStratSelectorCombo.annotatedVcfs.stratLabel, + variantLabel = evalStratSelectorCombo.variantSelector.label, + engine="VcfEval", + igvSession = VcfEvalSelectorWriteXMLfile.igv_session, + TP_CALL = EvalSelectorVcfEval.TP_CALL, + TP_BASE = EvalSelectorVcfEval.TP_BASE, + FP = EvalSelectorVcfEval.FP, + FN = EvalSelectorVcfEval.FN, + preemptible = preemptible + } + } + } + + Array[File] summaries = flatten([SummariseVcfEval.summaryOut,select_all(VcfEvalSummariseForVariantSelection.summaryOut), + select_all(VcfEvalSummariseForIndelSelection.summaryOut)]) + + call CombineSummaries { + input: + summaries = summaries, + preemptible = preemptible + } + + ################################################################################ + + output { + File summary = CombineSummaries.summaryOut + Float snpPrecision = SummariseVcfEval.snpPrecision[0] + Float indelPrecision = SummariseVcfEval.indelPrecision[0] + Float snpRecall = SummariseVcfEval.snpRecall[0] + Float indelRecall = SummariseVcfEval.indelRecall[0] + Float snpF1Score = SummariseVcfEval.snpF1Score[0] + Float indelF1Score = SummariseVcfEval.indelF1Score[0] + Array[File?] snpRocs = StandardVcfEval.outSnpRoc + Array[File?] nonSnpRocs = StandardVcfEval.outNonSnpRoc + } +} + +################################################################################ +################################################################################ +################################################################################ + +struct EvalTruthMatch { + File truthVcf + File truthVcfIndex + File confidenceIntervals + String truthLabel + File evalVcf + File evalVcfIndex + String evalLabel +} + +struct VariantSelector { + String jexl + String label +} + +struct Stratifier { + File? intervalList + File? bed + String? label +} + +struct EvalStratCombo { + EvalTruthMatch evalTruthMatch + Stratifier stratifier +} + +struct AnnotatedVcfs { + File? vcfVcfEval + File? vcfVcfEvalIndex + String? 
stratLabel + String evalLabel + String truthLabel + File? stratBed + File? confidenceBed + String namePrefix +} +struct EvalStratSelectorCombo { + AnnotatedVcfs annotatedVcfs + VariantSelector variantSelector +} + +#Check to see if there are variants in the given vcf which overlap the confidence and stratification intervals +task CheckForVariants { + input{ + File vcf + File vcfIndex + File confidenceIL + File? stratIL + Int? preemptible + Int? memoryMaybe + String gatkTag + } + Int memoryDefault = 16 + Int memoryJava = select_first([memoryMaybe,memoryDefault]) + Int memoryRam = memoryJava+2 + + Int disk_size = 10 + ceil(size(vcf, "GB") + size(vcfIndex, "GB") + size(confidenceIL, "GB") + size(stratIL, "GB")) + + command <<< + set -xeuo pipefail + + nVariants="$(gatk --java-options "-Xmx~{memoryJava}G" CountVariants -V ~{vcf} -L ~{confidenceIL} ~{"-L " + stratIL} -isr INTERSECTION | tail -1)" + if [ "$nVariants" -gt "0" ]; then echo "true" > outBool.txt; else echo "false" > outBool.txt; fi + >>> + + runtime { + docker: "us.gcr.io/broad-gatk/gatk:"+gatkTag + preemptible: select_first([preemptible,0]) + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: "16" + memory: memoryRam + " GB" + } + + output { + Boolean variantsFound = read_boolean("outBool.txt") + } +} + +#Evaluate evalVCF against truthVCF using vcfeval +task VcfEval { + input{ + File truthVCF + File truthVCFIndex + + File evalVCF + File evalVCFIndex + + File confidenceBed + + File? stratBed + + File ref + File refDict + File refIndex + + String outputPre + Boolean passingOnly + + String? vcfScoreField + Int? preemptible + String? memUser + Int? 
threads + + Boolean requireMatchingGenotypes + Boolean truthIsSitesOnlyVcf + Boolean enableRefOverlap = false + } + String memDefault="16 GB" + String mem = select_first([memUser,memDefault]) + + Int cpu = select_first([threads,1]) + Int disk_size = 50 + ceil(size(truthVCF, "GB") + size(truthVCFIndex, "GB") + 2.2 * size(evalVCF, "GB") + size(evalVCFIndex, "GB") + size(confidenceBed, "GB") + size(stratBed, "GB") + size(ref, "GB") + size(refDict, "GB") + size(refIndex, "GB")) + + command <<< + set -xeuo pipefail + + /bin/rtg-tools/rtg format -o rtg_ref ~{ref} + /bin/rtg-tools/rtg vcfeval \ + ~{false="--all-records" true="" passingOnly} \ + ~{"--vcf-score-field=" + vcfScoreField} \ + ~{false="--squash-ploidy" true="" requireMatchingGenotypes} \ + ~{false="" true="--sample ALT" truthIsSitesOnlyVcf} \ + ~{true="--ref-overlap" false="" enableRefOverlap} \ + -b ~{truthVCF} -c ~{evalVCF} \ + -e ~{confidenceBed} ~{"--bed-regions " + stratBed} \ + ~{false="--output-mode combine" true="" truthIsSitesOnlyVcf} \ + --decompose -t rtg_ref \ + ~{"--threads "+threads} -o output_dir + + for f in output_dir/*; do + mv $f ~{outputPre}_"$(basename "$f")"; + done + + /bin/rtg-tools/rtg rocplot --precision-sensitivity --title="~{outputPre} SNP" --svg=~{outputPre}.snp.svg ~{outputPre}_snp_roc.tsv.gz + /bin/rtg-tools/rtg rocplot --precision-sensitivity --title="~{outputPre} INDEL" --svg=~{outputPre}.indel.svg ~{outputPre}_non_snp_roc.tsv.gz + + python3 -<<"EOF" ~{outputPre}_snp_roc.tsv.gz ~{outputPre}_non_snp_roc.tsv.gz ~{outputPre}_summary.csv + import gzip + import sys + + indel_sensitivity = 0 + indel_precision = 0 + indel_fscore = 0 + indel_TP_Base = 0 + indel_TP_Eval = 0 + indel_FP = 0 + indel_FN = 0 + + snp_sensitivity = 0 + snp_precision = 0 + snp_fscore = 0 + snp_TP_Base = 0 + snp_TP_Eval = 0 + snp_FP = 0 + snp_FN = 0 + + with gzip.open(sys.argv[1],"rt") as f_snp: + for line in f_snp: + try: + snp_sensitivity = float(line.split()[6]) + snp_precision = float(line.split()[5]) + 
snp_fscore = float(line.split()[7]) + snp_TP_Eval = float(line.split()[3]) + snp_TP_Base = float(line.split()[1]) + snp_FP = float(line.split()[2]) + snp_FN = float(line.split()[4]) + except ValueError: + continue + except IndexError: + continue + f_snp.close() + with gzip.open(sys.argv[2],"rt") as f_indel: + for line in f_indel: + try: + indel_sensitivity = float(line.split()[6]) + indel_precision = float(line.split()[5]) + indel_fscore = float(line.split()[7]) + indel_TP_Eval = float(line.split()[3]) + indel_TP_Base = float(line.split()[1]) + indel_FP = float(line.split()[2]) + indel_FN = float(line.split()[4]) + except ValueError: + continue + except IndexError: + continue + f_indel.close() + + str_indel_sensitivity = str(indel_sensitivity) + str_indel_precision = str(indel_precision) + str_indel_fscore = str(indel_fscore) + str_snp_sensitivity = str(snp_sensitivity) + str_snp_precision = str(snp_precision) + str_snp_fscore = str(snp_fscore) + + + if indel_TP_Eval+indel_FP==0: + str_indel_precision="NA" + if indel_TP_Base+indel_FN==0: + str_indel_sensitivity="NA" + if str_indel_sensitivity=="NA" or str_indel_precision=="NA": + str_indel_fscore="NA" + + if snp_TP_Eval+snp_FP==0: + str_snp_precision="NA" + if snp_TP_Base+snp_FN==0: + str_snp_sensitivity="NA" + if str_snp_sensitivity=="NA" or str_snp_precision=="NA": + str_snp_fscore="NA" + + + + with open(sys.argv[3],"wt") as f_out: + f_out.write(",".join(["Type","Precision","Recall","F1_Score","TP_Eval","TP_Base","FP","FN"])+"\n") + f_out.write(",".join(["SNP",str_snp_precision,str_snp_sensitivity,str_snp_fscore,str(snp_TP_Eval),str(snp_TP_Base),str(snp_FP),str(snp_FN)])+"\n") + f_out.write(",".join(["INDEL",str_indel_precision,str_indel_sensitivity,str_indel_fscore,str(indel_TP_Eval),str(indel_TP_Base),str(indel_FP),str(indel_FN)])+"\n") + f_out.close() + EOF + >>> + + runtime { + docker: "ckachulis/rtg-tools:0.1" + preemptible: select_first([preemptible,0]) + memory: mem + cpu: cpu + disks: "local-disk " + 
disk_size + " HDD" + } + + output { + Array[File] outs = glob("${outputPre}_*") + File outSummary="${outputPre}_summary.csv" + File outVcf="${outputPre}_output.vcf.gz" + File outVcfIndex="${outputPre}_output.vcf.gz.tbi" + File outSnpRocPlot="~{outputPre}.snp.svg" + File outNonRocPlot="~{outputPre}.indel.svg" + File outSnpRoc="${outputPre}_snp_roc.tsv.gz" + File outNonSnpRoc="${outputPre}_non_snp_roc.tsv.gz" + } +} + +#Evaluate evalVCF against truthVCF using hap.py +task EvalHappy { + input{ + File truthVCF + File truthVCFIndex + File evalVCF + File confidenceBed + File? stratBed + File ref + File refDict + File refIndex + String outputPre + String? memUser + Int? preemptible + Int? threads + String happyTag + } + String memDefault="16 GB" + String mem = select_first([memUser,memDefault]) + + Int cpu = select_first([threads,1]) + Int disk_size = 10 + ceil(size(truthVCF, "GB") + size(truthVCFIndex, "GB") + 2.2 * size(evalVCF, "GB") + size(confidenceBed, "GB") + size(stratBed, "GB") + size(ref, "GB") + size(refDict, "GB") + size(refIndex, "GB")) + + + command <<< + /opt/hap.py/bin/hap.py ~{truthVCF} ~{evalVCF} -f ~{confidenceBed} -r ~{ref} -V ~{"-T " + stratBed} -L --preprocess-truth ~{"--threads "+threads} -o ~{outputPre} + >>> + + runtime { + docker: "pkrusche/hap.py:"+happyTag + memory: mem + preemptible: select_first([preemptible,0]) + cpu: cpu + disks: "local-disk " + disk_size + " HDD" + } + + output { + Array[File] outs = glob("${outputPre}*") + File outSummary="${outputPre}.summary.csv" + File outVcf="${outputPre}.vcf.gz" + File outVcfIndex="${outputPre}.vcf.gz.tbi" + } +} + +#Evaluate evalVCF against truthVCF using picard GenotypeConcordance +task EvalGATKGC { + input{ + File truthVCF + File truthVCFIndex + File evalVCF + File evalVCFIndex + File intervalList + File? stratIL + File ref + File refDict + File refIndex + String outputPre + Int? preemptible + Int? 
memoryMaybe + String gatkTag + } + Int memoryDefault = 16 + Int memoryJava = select_first([memoryMaybe,memoryDefault]) + Int memoryRam = memoryJava+2 + Int disk_size = 10 + ceil(size(truthVCF, "GB") + size(truthVCFIndex, "GB") + 2.2 * size(evalVCF, "GB") + size(intervalList, "GB") + size(stratIL, "GB") + size(evalVCFIndex, "GB") + size(ref, "GB") + size(refDict, "GB") + size(refIndex, "GB")) + + + command <<< + gatk --java-options "-Xmx~{memoryJava}G" GenotypeConcordance -TV ~{truthVCF} -CV ~{evalVCF} -R ~{ref} -INTERVALS ~{intervalList} ~{"-INTERVALS " + stratIL} -USE_VCF_INDEX -OUTPUT_VCF -O ~{outputPre} + >>> + + runtime { + docker: "us.gcr.io/broad-gatk/gatk:"+gatkTag + preemptible: select_first([preemptible,0]) + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: "16" + memory: memoryRam + " GB" + } + + output { + Array[File] out = glob("${outputPre}*") + File outSummary="${outputPre}.genotype_concordance_summary_metrics" + File outCounts="${outputPre}.genotype_concordance_contingency_metrics" + File outVcf="${outputPre}.genotype_concordance.vcf.gz" + File outVcfIndex="${outputPre}.genotype_concordance.vcf.gz.tbi" + } +} + +#takes in either a .bed or .intervallist and returns both a .bed and .intervallist version of the input +task ConvertIntervals { + input { + File inputIntervals + File? subset_interval + Int? preemptible + Int? memoryMaybe + File refDict + String gatkTag + String? 
dummyInputForTerraCallCaching + } + + Int memoryDefault = 16 + Int memoryJava = select_first([memoryMaybe,memoryDefault]) + Int memoryRam = memoryJava+2 + Int disk_size = 10 + ceil(3 * size(inputIntervals, "GB") + size(refDict, "GB")) + + command <<< + set -xeuo pipefail + + # convert bed to interval_list, or copy interval_list + if [[ ~{inputIntervals} == *.bed || ~{inputIntervals} == *.bed.gz ]]; then + gatk --java-options "-Xmx~{memoryJava}G" \ + BedToIntervalList \ + -I ~{inputIntervals} \ + -O initial_intervals.interval_list \ + -SD ~{refDict} + else + cp ~{inputIntervals} initial_intervals.interval_list + fi + + # optionally intersect interval_list with subset_interval + + if [ ! -z ~{subset_interval} ]; then + gatk --java-options "-Xmx~{memoryJava}G" \ + IntervalListTools \ + -I initial_intervals.interval_list \ + -I ~{subset_interval} \ + -ACTION INTERSECT \ + -O intervals.interval_list + else + mv initial_intervals.interval_list intervals.interval_list + fi + + # convert result to BED + gatk --java-options "-Xmx~{memoryJava}G" \ + IntervalListToBed \ + -I intervals.interval_list \ + -O intervals.bed + >>> + + runtime { + docker: "us.gcr.io/broad-gatk/gatk:"+gatkTag + preemptible: select_first([preemptible,0]) + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: "16" + memory: memoryRam + " GB" + } + + output { + File bed="intervals.bed" + File intervalList="intervals.interval_list" + } +} + +#For now, due to a bug, the ouput annotated VCF from hap.py has a hardcoded HG19 header, so the VCF header must be fixed to represent the correct reference. +#Additionally, due to a separate bug, UpdateVCFSequenceDictionary crashes in this case when trying to index a bgzipped vcf. So for now have to perform indexing in a separate command. +task FixVcfHeader { + input { + File vcf + File vcfIndex + File ref + File refDict + File refIndex + Int? preemptible + Int? 
memoryMaybe + String gatkTag + } + Int memoryDefault = 16 + Int memoryJava = select_first([memoryMaybe,memoryDefault]) + Int memoryRam = memoryJava+2 + Int disk_size = 10 + ceil(2.2 * size(vcf, "GB") + 2.2 * size(vcfIndex, "GB") + size(ref, "GB") + size(refDict, "GB") + size(refIndex, "GB")) + + command <<< + set -xeuo pipefail + + gatk --java-options "-Xmx~{memoryJava}G" UpdateVCFSequenceDictionary -V ~{vcf} -O fixed.vcf.gz --source-dictionary ~{refDict} --replace --create-output-variant-index false + gatk --java-options "-Xmx~{memoryJava}G" IndexFeatureFile -F fixed.vcf.gz + >>> + + runtime { + docker: "us.gcr.io/broad-gatk/gatk:"+gatkTag + preemptible: select_first([preemptible,0]) + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: "16" + memory: memoryRam + " GB" + } + + output { + File outVcf="fixed.vcf.gz" + File outVcfIndex="fixed.vcf.gz.tbi" + } + +} + +#Count number of variants which were outside confidence region based on vcfeval annotated vcf +task CountUNKVcfEval { + input { + File? vcf="" + File? vcfIndex="" + Int? preemptible + Int? 
memoryMaybe + String gatkTag + } + Int memoryDefault = 16 + Int memoryJava = select_first([memoryMaybe,memoryDefault]) + Int memoryRam = memoryJava+2 + Int disk_size = 10 + ceil(size(vcf, "GB") + size(vcfIndex, "GB")) + + command <<< + set -xeuo pipefail + + gatk --java-options "-Xmx~{memoryJava}G" SelectVariants -V ~{vcf} -O selected.unk.snp.vcf -select "(CALL == 'OUT')" --select-type-to-include SNP + gatk --java-options "-Xmx~{memoryJava}G" SelectVariants -V ~{vcf} -O selected.unk.indel.vcf -select "(CALL == 'OUT')" --select-type-to-include INDEL + + UNK_SNP="$(gatk --java-options "-Xmx~{memoryJava}G" CountVariants -V selected.unk.snp.vcf | tail -1)" + UNK_INDEL="$(gatk --java-options "-Xmx~{memoryJava}G" CountVariants -V selected.unk.indel.vcf | tail -1)" + + echo "$UNK_SNP" > unk_snp.txt + echo "$UNK_INDEL" > unk_indel.txt + >>> + runtime { + docker: "us.gcr.io/broad-gatk/gatk:"+gatkTag + preemptible: select_first([preemptible,0]) + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: "16" + memory: memoryRam + " GB" + } + output { + Int UNK_SNP = read_int("unk_snp.txt") + Int UNK_INDEL = read_int("unk_indel.txt") + } +} + +#Count number of variants which were outside confidence region based on GenotypeConcordance annotated vcf +task CountUNKGC { + input { + File? vcfAnnotated="" + File? vcfIndexAnnotated="" + File vcfOrig + File vcfIndexOrig + Int? preemptible + Int? memoryMaybe + File? 
stratIL + String gatkTag + } + Int memoryDefault = 16 + Int memoryJava = select_first([memoryMaybe,memoryDefault]) + Int memoryRam = memoryJava+2 + Int disk_size = 10 + ceil(size(vcfAnnotated, "GB") + size(vcfIndexAnnotated, "GB") + 2 * size(vcfOrig, "GB") + size(vcfIndexOrig, "GB") + size(stratIL, "GB")) + + command <<< + set -xeuo pipefail + + gatk --java-options "-Xmx~{memoryJava}G" SelectVariants -V ~{vcfOrig} -O selected.unk.snp.vcf.gz ~{"-L "+stratIL} --select-type-to-include SNP --discordance ~{vcfAnnotated} + gatk --java-options "-Xmx~{memoryJava}G" SelectVariants -V ~{vcfOrig} -O selected.unk.indel.vcf.gz ~{"-L "+stratIL} --select-type-to-include INDEL --discordance ~{vcfAnnotated} + + UNK_SNP="$(gatk --java-options "-Xmx~{memoryJava}G" CountVariants -V selected.unk.snp.vcf.gz | tail -1)" + UNK_INDEL="$(gatk --java-options "-Xmx~{memoryJava}G" CountVariants -V selected.unk.indel.vcf.gz | tail -1)" + + echo "$UNK_SNP" > unk_snp.txt + echo "$UNK_INDEL" > unk_indel.txt + >>> + + + runtime { + docker: "us.gcr.io/broad-gatk/gatk:"+gatkTag + preemptible: select_first([preemptible,0]) + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: "16" + memory: memoryRam + " GB" + } + + output { + Int UNK_SNP = read_int("unk_snp.txt") + Int UNK_INDEL = read_int("unk_indel.txt") + } +} + +#Count TP,FP,FN for a particular selection of variants given by jexl +task EvalForVariantSelection { + input { + File? vcf="" + File? vcfIndex="" + String jexl + Int? preemptible + Int? memoryMaybe + String engine + String selectTPCall + String selectTPBase + String selectFN + String selectFP + String sampleCall + String sampleBase + String gatkTag + + Array[String] annotationNames=[] + File? 
gatkJarForAnnotation + File reference + File refDict + File refIndex + } + + Int memoryDefault = 16 + Int memoryJava = select_first([memoryMaybe,memoryDefault]) + Int memoryRam = memoryJava+2 + + String selectionTPCall = jexl + " && " + selectTPCall + String selectionTPBase = jexl + " && " + selectTPBase + String selectionFN = jexl + " && " + selectFN + String selectionFP = jexl + " && " + selectFP + + Int disk_size = 10 + ceil(4.2 * size(vcf, "GB") + 2.2 * size(vcfIndex, "GB") + size(reference, "GB")) + + command <<< + set -xeuo pipefail + + VCF=~{vcf} + if [[ ! -z "~{gatkJarForAnnotation}" ]]; then + java -jar ~{gatkJarForAnnotation} VariantAnnotator -V ~{vcf} -O annotated.vcf.gz ~{true="-A" false="" length(annotationNames)>0} ~{sep=" -A " annotationNames} -R ~{reference} + VCF=annotated.vcf.gz + else + touch annotated.vcf.gz + fi + + gatk --java-options "-Xmx~{memoryJava}G" SelectVariants -V $VCF -O selected.TP_CALL.vcf.gz -select "~{selectionTPCall}" -sn ~{sampleCall} + gatk --java-options "-Xmx~{memoryJava}G" SelectVariants -V $VCF -O selected.TP_BASE.vcf.gz -select "~{selectionTPBase}" -sn ~{sampleBase} + gatk --java-options "-Xmx~{memoryJava}G" SelectVariants -V $VCF -O selected.FN.vcf.gz -select "~{selectionFN}" -sn ~{sampleBase} + gatk --java-options "-Xmx~{memoryJava}G" SelectVariants -V $VCF -O selected.FP.vcf.gz -select "~{selectionFP}" -sn ~{sampleCall} + + TP_CALL="$(gatk --java-options "-Xmx~{memoryJava}G" CountVariants -V selected.TP_CALL.vcf.gz | tail -1)" + TP_BASE="$(gatk --java-options "-Xmx~{memoryJava}G" CountVariants -V selected.TP_BASE.vcf.gz | tail -1)" + FN="$(gatk --java-options "-Xmx~{memoryJava}G" CountVariants -V selected.FN.vcf.gz | tail -1)" + FP="$(gatk --java-options "-Xmx~{memoryJava}G" CountVariants -V selected.FP.vcf.gz | tail -1)" + + echo "$TP_CALL" > tp_call.txt + echo "$TP_BASE" > tp_base.txt + echo "$FN" > fn.txt + echo "$FP" > fp.txt + >>> + + runtime { + docker: "us.gcr.io/broad-gatk/gatk:"+gatkTag + preemptible: 
select_first([preemptible,0]) + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: "16" + memory: memoryRam + " GB" + } + + output { + Int TP_CALL = read_int("tp_call.txt") + Int TP_BASE = read_int("tp_base.txt") + Int FP = read_int("fp.txt") + Int FN = read_int("fn.txt") + File selectedTPCall="selected.TP_CALL.vcf.gz" + File selectedTPBase="selected.TP_BASE.vcf.gz" + File selectedFP="selected.FP.vcf.gz" + File selectedFN="selected.FN.vcf.gz" + + File annotated="annotated.vcf.gz" + File selectedTPCallIndex="selected.TP_CALL.vcf.gz.tbi" + File selectedTPBaseIndex="selected.TP_BASE.vcf.gz.tbi" + File selectedFPIndex="selected.FP.vcf.gz.tbi" + File selectedFNIndex="selected.FN.vcf.gz.tbi" + } + +} + +#create csv file of statistics based on TP,FP,FN +task SummariseForIndelSelection { + input { + String evalLabel + String truthLabel + String? stratLabel + String indelLabel + String engine + String igvSession + Int TP_CALL + Int TP_BASE + Int FP + Int FN + Int? preemptible + + } + + command <<< + set -xeuo pipefail + + Rscript -<<"EOF" ~{TP_CALL} ~{TP_BASE} ~{FN} ~{FP} ~{evalLabel} ~{truthLabel} ~{indelLabel} ~{engine} ~{default="" stratLabel} ~{igvSession} + GetSelectionValue<-function(name, target) { + + if(target=="insertion" || target=="deletion") { + return(NA) + } + pos_start<-regexpr(target,name) + sub = substring(name,pos_start+attr(pos_start,"match.length")+1,nchar(name)) + split_sub = strsplit(sub,"_") + val = if(grepl("^m",split_sub[[1]][[1]])) -as.double(gsub("m","",split_sub[[1]][[1]])) else as.double(split_sub[[1]][[1]]) + } + + args <-commandArgs(trailingOnly = TRUE) + indel_options <-c("deletion","insertion","indel_fine","indel_coarse") + indel_type <- mapply(grepl,indel_options,args[7]) + indel_type <- indel_options[indel_type[indel_options]] + indel_length <- GetSelectionValue(args[7],indel_type) + if (length(args)<10) { + stratifier <- NA + } else { + stratifier <- args[9] + } + table <- data.frame("Name"=args[5], 
"Truth_Set"=args[6],"Comparison_Engine"=args[8],"Stratifier"=stratifier, + "IndelLength"= indel_length, + "Recall"=as.numeric(args[2])/(as.numeric(args[2])+as.numeric(args[3])),"Precision"=as.numeric(args[1])/(as.numeric(args[1])+as.numeric(args[4])),"TP_Base"=as.numeric(args[2]),"TP_Eval"=as.numeric(args[1]), + "FP"=as.numeric(args[4]),"FN"=as.numeric(args[3]),"IGV_Session"=args[length(args)],"Summary_Type"=indel_type) + table$F1_Score <- 2*table$Precision*table$Recall/(table$Precision+table$Recall) + write.csv(table,paste(args[8],".",indel_type,".summary.csv",sep=""),row.names = FALSE) + EOF + >>> + + runtime { + docker: "rocker/tidyverse" + preemptible: select_first([preemptible,0]) + disks: "local-disk 10 HDD" + } + + output { + File summaryOut = glob("*.summary.csv")[0] + } + +} + +#create csv file of statistics based on TP,FP,FN +task SummariseForVariantSelection { + input { + String evalLabel + String truthLabel + String? stratLabel + String variantLabel + String engine + String igvSession + Int TP_CALL + Int TP_BASE + Int FP + Int FN + Int? 
preemptible + + } + + command <<< + set -xeuo pipefail + + Rscript -<<"EOF" ~{TP_CALL} ~{TP_BASE} ~{FN} ~{FP} ~{evalLabel} ~{truthLabel} ~{variantLabel} ~{engine} ~{default="" stratLabel} ~{igvSession} + args <-commandArgs(trailingOnly = TRUE) + if (length(args)<10) { + stratifier <- NA + } else { + stratifier <- args[9] + } + table <- data.frame("Name"=args[5], "Truth_Set"=args[6],"Comparison_Engine"=args[8],"Stratifier"=stratifier, + "IndelLength"= NA, + "Recall"=as.numeric(args[2])/(as.numeric(args[2])+as.numeric(args[3])),"Precision"=as.numeric(args[1])/(as.numeric(args[1])+as.numeric(args[4])),"TP_Base"=as.numeric(args[2]),"TP_Eval"=as.numeric(args[1]), + "FP"=as.numeric(args[4]),"FN"=as.numeric(args[3]),"IGV_Session"=args[length(args)],"Summary_Type"="summary","Type"=args[7]) + table$F1_Score <- 2*table$Precision*table$Recall/(table$Precision+table$Recall) + write.csv(table,paste(args[8],".",args[7],".summary.csv",sep=""),row.names = FALSE) + EOF + >>> + + runtime { + docker: "rocker/tidyverse" + preemptible: select_first([preemptible,0]) + disks: "local-disk 10 HDD" + } + + output { + File summaryOut = glob("*.summary.csv")[0] + } + +} + +#Convert vcfeval output statistics to final output format +task SummariseVcfEval { + input { + String evalLabel + String truthLabel + String areVariants + String? igvSession + String? stratLabel + File? summaryFile + Int? unkSNP + Int? unkINDEL + Int? 
preemptible + } + Int disk_size = 10 + ceil(2.2 * size(summaryFile, "GB")) + + command <<< + set -xeuo pipefail + + Rscript -<<"EOF" ~{evalLabel} ~{truthLabel} ~{default="" summaryFile} ~{default="" stratLabel} ~{default="" igvSession} ~{default="" unkSNP} ~{default="" unkINDEL} ~{areVariants} + args <- commandArgs(trailingOnly = TRUE) + if (args[length(args)]=="yes") { + table_vcfeval <- read.csv(args[3]) + if (length(args)==7) { + table_vcfeval$Stratifier <- NA + table_vcfeval$IGV_Session <- args[4] + table_vcfeval$UNK[table_vcfeval$Type=="SNP"]=args[5] + table_vcfeval$UNK[table_vcfeval$Type=="INDEL"]=args[6] + } else { + table_vcfeval$Stratifier <- args[4] + table_vcfeval$IGV_Session <- args[5] + table_vcfeval$UNK[table_vcfeval$Type=="SNP"]=args[6] + table_vcfeval$UNK[table_vcfeval$Type=="INDEL"]=args[7] + } + write(ifelse(is.na(table_vcfeval[table_vcfeval$Type=="SNP", ]$Precision), 0, table_vcfeval[table_vcfeval$Type=="SNP", ]$Precision), "snpPrecision.txt") + write(ifelse(is.na(table_vcfeval[table_vcfeval$Type=="INDEL",]$Precision), 0, table_vcfeval[table_vcfeval$Type=="INDEL",]$Precision), "indelPrecision.txt") + write(ifelse(is.na(table_vcfeval[table_vcfeval$Type=="SNP", ]$Recall), 0, table_vcfeval[table_vcfeval$Type=="SNP", ]$Recall), "snpRecall.txt") + write(ifelse(is.na(table_vcfeval[table_vcfeval$Type=="INDEL",]$Recall), 0, table_vcfeval[table_vcfeval$Type=="INDEL",]$Recall), "indelRecall.txt") + write(ifelse(is.na(table_vcfeval[table_vcfeval$Type=="SNP", ]$F1_Score), 0, table_vcfeval[table_vcfeval$Type=="SNP", ]$F1_Score), "snpF1Score.txt") + write(ifelse(is.na(table_vcfeval[table_vcfeval$Type=="INDEL",]$F1_Score), 0, table_vcfeval[table_vcfeval$Type=="INDEL",]$F1_Score), "indelF1Score.txt") + } else { + types <- c("INDEL","SNP") + recall <- c(NA,NA) + precision <- c(NA,NA) + f1_score <- c(NA,NA) + tp <- c(0,0) + fp <- c(0,0) + fn <- c(0,0) + unk <- c(0,0) + igv_session <- c(NA,NA) + table_vcfeval <- 
data.frame("Type"=types,"Recall"=recall,"Precision"=precision,"F1_Score"=f1_score,"TP_Base"=tp,"TP_Eval"=tp,"FP"=fp,"FN"=fn,"UNK"=unk,"IGV_Session"=igv_session) + if (length(args)==3) { + table_vcfeval$Stratifier <- NA + } else { + table_vcfeval$Stratifier <- args[3] + } + write(0,"snpPrecision.txt") + write(0,"indelPrecision.txt") + write(0,"snpRecall.txt") + write(0,"indelRecall.txt") + write(0,"snpF1Score.txt") + write(0,"indelF1Score.txt") + } + table_vcfeval$Name <- args[1] + table_vcfeval$Truth_Set <- args[2] + table_vcfeval$Summary_Type <- "summary" + table_vcfeval$Comparison_Engine <-"VcfEval" + write.csv(table_vcfeval,"vcfeval.summary.csv",row.names = FALSE) + EOF + >>> + + runtime { + docker: "rocker/tidyverse" + preemptible: select_first([preemptible,0]) + disks: "local-disk " + disk_size + " HDD" + } + + output{ + File summaryOut="vcfeval.summary.csv" + Float snpPrecision = read_float("snpPrecision.txt") + Float indelPrecision = read_float("indelPrecision.txt") + Float snpRecall = read_float("snpRecall.txt") + Float indelRecall = read_float("indelRecall.txt") + Float snpF1Score = read_float("snpF1Score.txt") + Float indelF1Score = read_float("indelF1Score.txt") + } +} + +#Convert hap.py output statistics to final output format +task SummariseHappy { + input { + String evalLabel + String truthLabel + String areVariants + String? stratLabel + File? summaryFile + Int? preemptible + String? 
igvSession + } + + Int disk_size = 10 + ceil(2.2 * size(summaryFile, "GB")) + + command <<< + Rscript -<<"EOF" ~{evalLabel} ~{truthLabel} ~{default="" summaryFile} ~{default="" stratLabel} ~{default="" igvSession} ~{areVariants} + args <-commandArgs(trailingOnly = TRUE) + if (args[length(args)]=="yes") { + table_happy <- read.csv(args[3]) + colnames(table_happy)[10]<-"Recall" + colnames(table_happy)[11]<-"Precision" + colnames(table_happy)[13]<-"F1_Score" + colnames(table_happy)[4]<-"TP_Base" + table_happy$TP_Eval = table_happy$TP_Base + colnames(table_happy)[5]<-"FN" + colnames(table_happy)[7]<-"FP" + colnames(table_happy)[8]<-"UNK" + if (length(args)==5) { + table_happy$Stratifier <- NA + table_happy$IGV_Session <- args[4] + } else { + table_happy$Stratifier <-args[4] + table_happy$IGV_Session <- args[5] + } + + + } else { + types <- c("INDEL","SNP") + recall <- c(NA,NA) + precision <- c(NA,NA) + f1_score <- c(NA,NA) + igv_session <- c(NA,NA) + tp <- c(0,0) + fp <- c(0,0) + fn <- c(0,0) + unk <- c(0,0) + filters <- c("PASS","PASS") + table_happy <- data.frame("Type"=types,"Recall"=recall,"Precision"=precision,"F1_Score"=f1_score,"TP_Base"=tp,"TP_Eval"=tp,"FP"=fp,"FN"=fn,"UNK"=unk,"Filter"=filters,"IGV_Session"=igv_session) + if (length(args)==3) { + table_happy$Stratifier <- NA + } else { + table_happy$Stratifier <-args[3] + } + + } + table_happy$Name <- args[1] + table_happy$Truth_Set <- args[2] + table_happy$Comparison_Engine <-"Happy" + table_happy$Summary_Type <- "summary" + table_happy<-table_happy[table_happy$Filter=="PASS",c("Type","Recall","Precision","Name","Truth_Set","Comparison_Engine","F1_Score","TP_Eval","TP_Base","FP","FN","UNK","Stratifier","IGV_Session","Summary_Type")] + if (nrow(subset(table_happy,Type=="INDEL"))==0) { + table_add <- 
data.frame("Type"="INDEL","Recall"=NA,"Precision"=NA,"F1_Score"=NA,"TP_Eval"=0,"TP_Base"=0,"FP"=0,"FN"=0,"UNK"=0,"Name"=args[1],"Truth_Set"=args[2],"Comparison_Engine"="Happy","Stratifier"=table_happy$Stratifier[1],"IGV_Session"=NA,"Summary_Type"="summary") + table_happy <- rbind(table_happy,table_add) + } + write.csv(table_happy,"happy.summary.csv",row.names = FALSE) + EOF + >>> + + runtime { + docker: "rocker/tidyverse" + preemptible: select_first([preemptible,0]) + disks: "local-disk " + disk_size + " HDD" + } + + output{ + File summaryOut="happy.summary.csv" + } +} + +#Convert GenotypeConcordance output statistics to final output format +task SummariseGATKGC { + input { + String evalLabel + String truthLabel + String areVariants + String? stratLabel + File? summaryFile + File? summaryCounts + Int? unkSNP + Int? unkINDEL + Int? preemptible + String? igvSession + } + + Int disk_size = 10 + ceil(2.2 * size(summaryFile, "GB") + 2.2 * size(summaryCounts, "GB")) + + command <<< + set -xeuo pipefail + + Rscript -<<"EOF" ~{evalLabel} ~{truthLabel} ~{default="" summaryFile} ~{default="" summaryCounts} ~{default="" stratLabel} ~{default="" igvSession} ~{default="" unkSNP} ~{default="" unkINDEL} ~{areVariants} + args <- commandArgs(trailingOnly = TRUE) + if (args[length(args)]=="yes") { + table_GC <- read.table(args[3],skip = 6, header = TRUE,sep="\t",na.strings="?") + table_counts_GC <- read.table(args[4],skip = 6, header = TRUE,sep="\t",na.strings="?") + table_GC$F1_Score <- 2*(table_GC$VAR_PPV*table_GC$VAR_SENSITIVITY)/(table_GC$VAR_PPV+table_GC$VAR_SENSITIVITY) + table_GC$TP_Eval <- table_counts_GC$TP_COUNT + table_GC$TP_Base <- table_counts_GC$TP_COUNT + table_GC$FP <- table_counts_GC$FP_COUNT + table_GC$FN <- table_counts_GC$FN_COUNT + colnames(table_GC)[10]<-"Recall" + colnames(table_GC)[11]<-"Precision" + colnames(table_GC)[1]<-"Type" + if (length(args)==8) { + table_GC$Stratifier <- NA + table_GC$IGV_Session <- args[5] + table_GC$UNK[table_GC$Type=="SNP"]=args[6] 
+ table_GC$UNK[table_GC$Type=="INDEL"]=args[7] + + } else { + table_GC$Stratifier <- args[5] + table_GC$IGV_Session <- args[6] + table_GC$UNK[table_GC$Type=="SNP"]=args[7] + table_GC$UNK[table_GC$Type=="INDEL"]=args[8] + } + } else { + types <- c("INDEL","SNP") + recall <- c(NA,NA) + precision <- c(NA,NA) + f1_score <- c(NA,NA) + tp <- c(0,0) + fp <- c(0,0) + fn <- c(0,0) + unk <- c(0,0) + igv_session <- c(NA,NA) + table_GC <- data.frame("Type"=types,"Recall"=recall,"Precision"=precision,"F1_Score"=f1_score,"TP_Eval"=tp,"TP_Base"=tp,"FP"=fp,"FN"=fn,"UNK"=unk,"IGV_Session"=igv_session) + if (length(args)==3) { + table_GC$Stratifier <- NA + } else { + table_GC$Stratifier <- args[3] + } + } + table_GC$Name <- args[1] + table_GC$Truth_Set <- args[2] + table_GC$Comparison_Engine <-"GATK_GC" + table_GC$Summary_Type <- "summary" + table_GC <- table_GC[,c("Type","Recall","Precision","Name","Truth_Set","Comparison_Engine","F1_Score","TP_Eval","TP_Base","FP","FN","UNK","Stratifier","IGV_Session","Summary_Type")] + write.csv(table_GC,"gatkgc.summary.csv",row.names = FALSE) + EOF + >>> + + runtime { + docker: "rocker/tidyverse" + preemptible: select_first([preemptible,0]) + disks: "local-disk " + disk_size + " HDD" + } + + output{ + File summaryOut="gatkgc.summary.csv" + } +} + +#Combine summaries from multiple csv into a single csv +task CombineSummaries { + input { + Array[File] summaries + Int? 
preemptible + } + String dollar="$" + + Int disk_size = 10 + ceil(2 * size(summaries, "GB")) + command <<< + set -xeuo pipefail + + Rscript -<<"EOF" + library(readr) + library(dplyr) + library(purrr) + summary_files <- read_csv("~{write_lines(summaries)}", col_names = FALSE) + merged<- as.list(summary_files$X1) %>% map(read_csv) %>% reduce(bind_rows) + write.csv(merged,"summary.csv",row.names = FALSE) + EOF + >>> + + runtime { + docker: "rocker/tidyverse" + preemptible: select_first([preemptible,0]) + disks: "local-disk " + disk_size + " HDD" + } + + output{ + File summaryOut="summary.csv" + } +} + +#Use CrosscheckFingerprints to match evaluation vcfs to appropriate truth vcfs +task MatchEvalTruth { + input{ + File evalVcf + File truthVcf + File evalVcfIndex + File truthVcfIndex + File hapMap + Int? preemptible + Int? memoryMaybe + String gatkTag + } + Int memoryDefault = 16 + Int memoryJava = select_first([memoryMaybe,memoryDefault]) + Int memoryRam = memoryJava+2 + Int disk_size = 10 + ceil(size(hapMap, "GB") + size(evalVcf, "GB") + size(evalVcfIndex, "GB") + size(truthVcf, "GB") + size(truthVcfIndex, "GB")) + + command <<< + gatk --java-options "-Xmx~{memoryJava}G" CrosscheckFingerprints -I ~{evalVcf} -SI ~{truthVcf} -H ~{hapMap} --CROSSCHECK_MODE CHECK_ALL_OTHERS --CROSSCHECK_BY FILE --EXPECT_ALL_GROUPS_TO_MATCH + >>> + + runtime { + docker: "us.gcr.io/broad-gatk/gatk:"+gatkTag + preemptible: select_first([preemptible,0]) + disks: "local-disk " + disk_size + " HDD" + bootDiskSizeGb: "16" + memory: memoryRam + " GB" + } +} + +# creates an IGV session +# given a list of IGV compatible file paths +task WriteXMLfile { + input { + Array[String] input_files + String reference_version + String file_name + + Array[String]? 
input_names="" + } + + Array[String] input_names_prefix = if defined(input_names) then prefix('-n ', select_first([input_names])) else [] + + command <<< + set -euxo pipefail + + # because of some nonsense above, we need to play some bash tricks here to make sure that + # the inputs are labeled correctly: + echo '~{sep=" " input_names_prefix}' | sed -e 's#-n[ \t]*-n#-n#g' -e 's#-n[ \t]*$##' > labels.txt + + bash /usr/writeIGV.sh ~{reference_version} ~{sep=" " input_files} $(cat labels.txt) > "~{file_name}.xml" + >>> + runtime { + docker: "quay.io/mduran/generate-igv-session_2:v1.0" + } + output { + File igv_session = "${file_name}.xml" + } +} + +task CreateIntervalList{ + input { + File reference + File reference_index + File reference_dict + String interval_string + String gatkTag + String? dummyInputForTerraCallCaching + } + command { + gatk PreprocessIntervals \ + -R ~{reference} \ + -L ~{interval_string} \ + -O output.interval_list \ + --bin-length 0 \ + -imr OVERLAPPING_ONLY \ + -padding 0 + } + output { + File interval_list = "output.interval_list" + } + runtime { + preemptible: 3 + docker: "us.gcr.io/broad-gatk/gatk:"+gatkTag + disks: "local-disk 100 HDD" + memory: "4 GB" + } +} + +#Print given message to stderr and return an error +task ErrorWithMessage{ + input { + String message + } + command <<< + >&2 echo "Error: ~{message}" + exit 1 + >>> + + runtime { + docker: "ubuntu" + } +} \ No newline at end of file diff --git a/wdl/CleanupIntermediate.wdl b/wdl/CleanupIntermediate.wdl index 2aad460b5..e52de533e 100644 --- a/wdl/CleanupIntermediate.wdl +++ b/wdl/CleanupIntermediate.wdl @@ -3,7 +3,7 @@ version 1.0 workflow CleanupIntermediate { # Ironicaly, this generates intermeidate files too, but they are tiny. meta { - description: "A workflow to clean up intermediate files from running workflows. Use at your own risk." + description: "A workflow to clean up intermediate files from running workflows on Terra. Use at your own risk." 
} input { @@ -31,17 +31,15 @@ task CleanupAFolder { } command <<< - echo "started" - gsutil -q rm -rf gs://~{bucket_name}/~{submission_id} + timeout 23h gsutil -q rm -rf gs://~{bucket_name}/submissions/~{submission_id} || echo "Timed out. Please try again." >>> runtime { cpu: 1 memory: "4 GiB" - disks: "local-disk 50 HDD" - bootDiskSizeGb: 10 + disks: "local-disk 10 HDD" preemptible_tries: 1 max_retries: 1 - docker:"google/cloud-sdk:latest" + docker:"us.gcr.io/google.com/cloudsdktool/google-cloud-cli:alpine" } -} \ No newline at end of file +} diff --git a/wdl/CompareVcfBenchmarks.wdl b/wdl/CompareVcfBenchmarks.wdl new file mode 100644 index 000000000..e87de45d2 --- /dev/null +++ b/wdl/CompareVcfBenchmarks.wdl @@ -0,0 +1,450 @@ +version 1.0 + +# Adapted from the palantir / Hydrogen team repository (https://github.com/broadinstitute/palantir-workflows/tree/mg_benchmark_compare/BenchmarkVCFs) +# Original Author: Michael Gatzen +# Documentation: https://github.com/broadinstitute/palantir-workflows/blob/mg_benchmark_compare/BenchmarkVCFs/README_CompareBenchmarks.md + +workflow CompareVcfBenchmarks { + input { + Array[String] sample_ids + Array[String] configurations + Array[File] benchmark_summaries + Array[String]? stratifiers + + Boolean include_counts = true + Boolean generate_gc_plots = false + + Array[String]? order_of_samples + Array[String]? order_of_configurations + Array[Int]? deltas + + Int? mem_gb + Int? 
preemptible + } + + call CompareBenchmarksTask { + input: + sample_ids = sample_ids, + configurations = configurations, + benchmark_summaries = benchmark_summaries, + stratifiers = stratifiers, + include_counts = include_counts, + order_of_samples = order_of_samples, + order_of_configurations = order_of_configurations, + deltas = deltas, + mem_gb = mem_gb, + preemptible = preemptible + } + + if (generate_gc_plots) { + call CreateGCPlotsTask { + input: + sample_ids = sample_ids, + configurations = configurations, + benchmark_summaries = benchmark_summaries, + order_of_samples = order_of_samples, + order_of_configurations = order_of_configurations, + mem_gb = mem_gb, + preemptible = preemptible + } + } + + output { + File comparison_csv = CompareBenchmarksTask.comparison_csv + File raw_data = CompareBenchmarksTask.raw_data + Array[File]? gc_plots = CreateGCPlotsTask.gc_plots + } +} + +task CreateGCPlotsTask { + input { + Array[String] sample_ids + Array[String] configurations + Array[File] benchmark_summaries + + Array[String]? order_of_samples + Array[String]? 
order_of_configurations + + Int mem_gb = 4 + Int preemptible = 0 + } + + String order_of_samples_arg = if !defined(order_of_samples) then "" else "--order-of-samples" + Array[String] order_of_samples_or_empty = select_first([order_of_samples, []]) + String order_of_configurations_arg = if !defined(order_of_configurations) then "" else "--order-of-configurations" + Array[String] order_of_configurations_or_empty = select_first([order_of_configurations, []]) + + command <<< + set -xeuo pipefail + + source activate compare_benchmarks + + cat <<'EOF' > script.py +import argparse +import numpy as np +import pandas as pd +import matplotlib +import matplotlib.pyplot as plt + +matplotlib.rcParams['text.usetex'] = False +matplotlib.rcParams['mathtext.default'] = 'regular' +matplotlib.rcParams['font.family'] = 'serif' + +def get_value_from_table(data, sample_id, configuration, stratifier, var_type, column): + return data.query('sample_id == @sample_id and configuration == @configuration and Stratifier == @stratifier and Type == @var_type').iloc[0][column] + +def calculate_metrics(data, unique_sample_ids, unique_configurations, stratifiers): + recalculated_data = pd.DataFrame(columns=['sample_id', 'configuration', 'Stratifier', 'Type', 'TP', 'FP', 'FN', 'Precision', 'Sensitivity', 'F-Measure']) + for sample_id in unique_sample_ids: + for configuration in unique_configurations: + for stratifier in stratifiers: + tp, fp, fn = dict(), dict(), dict() + tp['SNP'] = get_value_from_table(data, sample_id, configuration, stratifier, 'SNP', 'TP_Base') + fp['SNP'] = get_value_from_table(data, sample_id, configuration, stratifier, 'SNP', 'FP') + fn['SNP'] = get_value_from_table(data, sample_id, configuration, stratifier, 'SNP', 'FN') + tp['INDEL'] = get_value_from_table(data, sample_id, configuration, stratifier, 'INDEL', 'TP_Base') + fp['INDEL'] = get_value_from_table(data, sample_id, configuration, stratifier, 'INDEL', 'FP') + fn['INDEL'] = get_value_from_table(data, sample_id, 
configuration, stratifier, 'INDEL', 'FN') + tp['all'] = tp['SNP'] + tp['INDEL'] + fp['all'] = fp['SNP'] + fp['INDEL'] + fn['all'] = fn['SNP'] + fn['INDEL'] + + for var_type in ['SNP', 'INDEL', 'all']: + recalculated_data = recalculated_data.append({ + 'sample_id': sample_id, + 'configuration': configuration, + 'Stratifier': stratifier, + 'Type': var_type, + 'TP': tp[var_type], + 'FP': fp[var_type], + 'FN': fn[var_type], + 'Precision': tp[var_type]/(tp[var_type] + fp[var_type]) if tp[var_type] + fp[var_type] > 0 else np.nan, + 'Sensitivity': tp[var_type]/(tp[var_type] + fn[var_type]) if tp[var_type] + fn[var_type] > 0 else np.nan, + 'F-Measure': tp[var_type]/(tp[var_type] + 0.5*(fp[var_type] + fn[var_type])) if tp[var_type] + fp[var_type] + fn[var_type] > 0 else np.nan + }, ignore_index=True) + return recalculated_data + +def plot_sample(data, i_sample, sample_id, unique_configurations, stratifiers): + fig, axes = plt.subplots(2, 2, figsize=(10, 8)) + for column, var_type in enumerate(['SNP', 'INDEL']): + for row, metric in enumerate(['Sensitivity', 'Precision']): + ax = axes[row, column] + X = np.arange(len(stratifiers)) + for configuration in unique_configurations: + ax.plot(X, [get_value_from_table(data, sample_id, configuration, stratifier, var_type, metric) for stratifier in stratifiers], label=configuration) + + ax.set_xticks(X) + ax.set_xticklabels([stratifier.replace('gc', '') for stratifier in stratifiers], rotation=45) + ax.set_xlabel('GC bin') + ax.set_ylabel(metric) + ax.set_title(var_type, zorder=0) + axes[0, 0].legend() + fig.suptitle(sample_id) + plt.tight_layout() + + fig.savefig(f'gc_plot_{i_sample}_{sample_id}.png', dpi=100) + + +def main(sample_ids, configurations, summaries, order_of_samples, order_of_configurations): + if len(sample_ids) != len(configurations) or len(sample_ids) != len(summaries): + raise RuntimeError('The number of sample_ids, configurations, and summaries tables must be equal.') + + samples_data = [] + for i in 
range(len(sample_ids)): + sample_data = pd.read_csv(summaries[i]) + + # Filter out everything other than SNP or INDEL rows, and stratifiers starting with "gc" + sample_data = sample_data.loc[((sample_data['Type'] == 'SNP') | (sample_data['Type'] == 'INDEL')) & (sample_data['Stratifier'].str.startswith("gc"))] + + # Add sample_id and configuration names + sample_data['sample_id'] = sample_ids[i] + sample_data['configuration'] = configurations[i] + samples_data.append(sample_data) + data = pd.concat(samples_data) + + if order_of_samples is None: + unique_sample_ids = data['sample_id'].unique() + else: + unique_sample_ids = order_of_samples + + if order_of_configurations is None: + unique_configurations = data['configuration'].unique() + else: + unique_configurations = order_of_configurations + + gc_stratifiers = data['Stratifier'].unique().astype(str) + gc_stratifiers.sort() + + data = calculate_metrics(data, unique_sample_ids, unique_configurations, gc_stratifiers) + + for i_sample_id, sample_id in enumerate(unique_sample_ids): + plot_sample(data, i_sample_id, sample_id, unique_configurations, gc_stratifiers) + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Create a table to compare output of BenchmarkVCFs.') + parser.add_argument('--order-of-samples', type=str, nargs='+', help='Order of samples. If not specified, the order will be the same as the supplied inputs.') + parser.add_argument('--order-of-configurations', type=str, nargs='+', help='Order of configurations. 
If not specified, the order will be the same as the supplied inputs.') + required_named = parser.add_argument_group('Required named arguments') + required_named.add_argument('--sample-ids', required=True, type=str, nargs='+') + required_named.add_argument('--configurations', required=True, type=str, nargs='+') + required_named.add_argument('--summaries', required=True, type=str, nargs='+') + args = parser.parse_args() + main(args.sample_ids, args.configurations, args.summaries, args.order_of_samples, args.order_of_configurations) +EOF + python script.py --sample-ids ~{sep=' ' sample_ids} --configurations ~{sep=' ' configurations} --summaries ~{sep=' ' benchmark_summaries} ~{order_of_samples_arg} ~{sep=' ' order_of_samples_or_empty} ~{order_of_configurations_arg} ~{sep=' ' order_of_configurations_or_empty} + >>> + + runtime { + docker: "us.gcr.io/broad-dsde-methods/benchmark_vcfs/compare_benchmarks:1.1.0" + preemptible: preemptible + memory: mem_gb + " GB" + disks: "local-disk 20 HDD" + } + + output { + Array[File] gc_plots = glob("gc_plot_*.png") + } +} + + + +task CompareBenchmarksTask { + input { + Array[String] sample_ids + Array[String] configurations + Array[File] benchmark_summaries + Array[String]? stratifiers + + Boolean include_counts + + Array[String]? order_of_samples + Array[String]? order_of_configurations + Array[Int]? 
deltas + + Int mem_gb = 4 + Int preemptible = 0 + } + + String stratifiers_arg = if !defined(stratifiers) then "" else "--stratifiers" + Array[String] stratifiers_or_empty = select_first([stratifiers, []]) + + String order_of_samples_arg = if !defined(order_of_samples) then "" else "--order-of-samples" + Array[String] order_of_samples_or_empty = select_first([order_of_samples, []]) + String order_of_configurations_arg = if !defined(order_of_configurations) then "" else "--order-of-configurations" + Array[String] order_of_configurations_or_empty = select_first([order_of_configurations, []]) + + String deltas_arg = if !defined(deltas) then "" else "--deltas" + Array[Int] deltas_or_empty = select_first([deltas, []]) + + command <<< + set -xeuo pipefail + + source activate compare_benchmarks + + cat <<'EOF' > script.py +import argparse +import numpy as np +import pandas as pd + +# The purpose of this class is to significantly simplify writing a TSV (or otherwise separated) table. +# For example, when passing an array of strings to cells(arr), the elements will be written separated +# by the separator sep, which is equivalent to file.write(sep.join(arr)). However, the methods of this class allow +# for chaining of these write commands to easily concatenate multiple calls, which can be helpful for writing +# custom-formatted tables, e.g. output.cells(arr1).sep().cells(arr2).new_line() instead of having to issue multiple +# file.write() calls. +class ChainableOutput: + def __init__(self, file, sep:str): + self.file = file + self.separator = sep + + def cells(self, cells:list): + self.file.write(self.separator.join([str(cell) for cell in cells])) + return self + def sep(self): + self.file.write(self.separator) + return self + def new_line(self): + self.file.write('\n') + return self + +def write_header(output:ChainableOutput, unique_sample_ids, unique_configurations, deltas, include_counts): + # This function will write the header for the comparison table. 
In general, the layout looks like this: + # | | | | Precision | Sensitivity | ... + # | | | | sample1 | sample2 | sample1 | sample2 | ... + # | | | | config1 | config2 | config1 | config2 | config1 | config2 | config1 | config2 | ... + + # If include_counts is True then TP, FP and FN will be written before Precision, Sensitivity and F-Measure + + # Line 1 + output.cells([''] * 4).sep() + if include_counts: + output.cells(['TP'] + [''] * (len(unique_sample_ids) * (len(unique_configurations) + len(deltas)) - 1)).sep() + output.cells(['FP'] + [''] * (len(unique_sample_ids) * (len(unique_configurations) + len(deltas)) - 1)).sep() + output.cells(['FN'] + [''] * (len(unique_sample_ids) * (len(unique_configurations) + len(deltas)) - 1)).sep() + output.cells(['Precision'] + [''] * (len(unique_sample_ids) * (len(unique_configurations) + len(deltas)) - 1)).sep() + output.cells(['Sensitivity'] + [''] * (len(unique_sample_ids) * (len(unique_configurations) + len(deltas)) - 1)).sep() + output.cells(['F-Measure'] + [''] * (len(unique_sample_ids) * (len(unique_configurations) + len(deltas)) - 1)).sep() + output.cells([''] * 2).new_line() + + # Line 2 + output.cells([''] * 4) + for _ in (['TP', 'FP', 'FN'] if include_counts else []) + ['Precision', 'Sensitivity', 'F-Measure']: + for sample_id in unique_sample_ids: + output.sep().cells([sample_id] + [''] * (len(unique_configurations) + len(deltas) - 1)) + output.sep().cells([''] * 2).new_line() + + # Line 3 + output.cells([''] * 4) + for metric in (['TP', 'FP', 'FN'] if include_counts else []) + ['Precision', 'Sensitivity', 'F-Measure']: + for sample_id in unique_sample_ids: + # Write configurations themselves + for configuration in unique_configurations: + output.sep().cells([configuration]) + # Write deltas + for delta_pair in deltas: + if metric in ['TP', 'FP', 'FN']: + output.sep().cells([f'delta({unique_configurations[delta_pair[1]]}-{unique_configurations[delta_pair[0]]})']) + else: + 
output.sep().cells([f'delta%({unique_configurations[delta_pair[1]]}-{unique_configurations[delta_pair[0]]})']) + output.sep().cells([''] * 2).new_line() + +def get_value_from_table(data, sample_id, configuration, stratifier, var_type, column): + try: + return data.query('sample_id == @sample_id and configuration == @configuration and Stratifier == @stratifier and Type == @var_type').iloc[0][column] + except IndexError as e: + raise RuntimeError(f'Failed querying table for sample_id: {sample_id}, configuration: {configuration}, stratifier {stratifier}, var_type: {var_type}, column: {column}. Make sure that the set of stratifiers and samples in the order_of_ arguments exactly matches the set of stratifiers and samples used for BenchmarkVCFs.') + +def write_stratifier(output:ChainableOutput, stratifier:str, data:pd.DataFrame, unique_sample_ids:list, unique_configurations:list, deltas:list, include_counts): + for var_type in ['SNP', 'INDEL', 'all']: + # Placeholder to print percentage of genome for each stratification + output.cells(['', '']).sep() + # Only print stratifier name in the first row + output.cells([stratifier if var_type == 'SNP' else '', var_type]) + for metric in (['TP', 'FP', 'FN'] if include_counts else []) + ['Precision', 'Sensitivity', 'F-Measure']: + for sample_id in unique_sample_ids: + for configuration in unique_configurations: + output.sep().cells(['{:.5f}'.format(get_value_from_table(data, sample_id, configuration, stratifier, var_type, metric))]) + for delta_pair in deltas: + base_value = get_value_from_table(data, sample_id, unique_configurations[delta_pair[0]], stratifier, var_type, metric) + current_value = get_value_from_table(data, sample_id, unique_configurations[delta_pair[1]], stratifier, var_type, metric) + if metric in ['TP', 'FP', 'FN']: + delta = int(current_value) - int(base_value) + output.sep().cells(['{}'.format(delta)]) + else: + delta_pct = (current_value - base_value) / base_value + 
output.sep().cells(['{:.2%}'.format(delta_pct)]) + output.sep().cells([var_type, stratifier if var_type == 'SNP' else '']).new_line() + +def calculate_metrics(data, unique_sample_ids, unique_configurations, stratifiers): + recalculated_data = pd.DataFrame(columns=['sample_id', 'configuration', 'Stratifier', 'Type', 'TP', 'FP', 'FN', 'Precision', 'Sensitivity', 'F-Measure']) + for sample_id in unique_sample_ids: + for configuration in unique_configurations: + for stratifier in stratifiers: + tp, fp, fn = dict(), dict(), dict() + tp['SNP'] = get_value_from_table(data, sample_id, configuration, stratifier, 'SNP', 'TP_Base') + fp['SNP'] = get_value_from_table(data, sample_id, configuration, stratifier, 'SNP', 'FP') + fn['SNP'] = get_value_from_table(data, sample_id, configuration, stratifier, 'SNP', 'FN') + tp['INDEL'] = get_value_from_table(data, sample_id, configuration, stratifier, 'INDEL', 'TP_Base') + fp['INDEL'] = get_value_from_table(data, sample_id, configuration, stratifier, 'INDEL', 'FP') + fn['INDEL'] = get_value_from_table(data, sample_id, configuration, stratifier, 'INDEL', 'FN') + tp['all'] = tp['SNP'] + tp['INDEL'] + fp['all'] = fp['SNP'] + fp['INDEL'] + fn['all'] = fn['SNP'] + fn['INDEL'] + + for var_type in ['SNP', 'INDEL', 'all']: + recalculated_data = recalculated_data.append({ + 'sample_id': sample_id, + 'configuration': configuration, + 'Stratifier': stratifier, + 'Type': var_type, + 'TP': tp[var_type], + 'FP': fp[var_type], + 'FN': fn[var_type], + 'Precision': tp[var_type]/(tp[var_type] + fp[var_type]) if tp[var_type] + fp[var_type] > 0 else np.nan, + 'Sensitivity': tp[var_type]/(tp[var_type] + fn[var_type]) if tp[var_type] + fn[var_type] > 0 else np.nan, + 'F-Measure': tp[var_type]/(tp[var_type] + 0.5*(fp[var_type] + fn[var_type])) if tp[var_type] + fp[var_type] + fn[var_type] > 0 else np.nan + }, ignore_index=True) + return recalculated_data + + + +def main(sample_ids, configurations, summaries, stratifiers, order_of_samples, 
order_of_configurations, deltas_array, include_counts): + if len(sample_ids) != len(configurations) or len(sample_ids) != len(summaries): + raise RuntimeError('The number of sample_id, configurations, and summary tables must be equal.') + if deltas_array is None: + deltas_array = [] + if len(deltas_array) % 2 != 0: + raise RuntimeError('The number of indices in the delta argument must be even. Please use --help or check the documentation on how to use this argument.') + + deltas = [(int(deltas_array[i]), int(deltas_array[i+1])) for i in range(0, len(deltas_array), 2)] + + samples_data = [] + for i in range(len(sample_ids)): + sample_data = pd.read_csv(summaries[i]) + + # Filter out everything other than SNP or INDEL rows + sample_data = sample_data.loc[(sample_data['Type'] == 'SNP') | (sample_data['Type'] == 'INDEL')] + + # Add sample_id and configuration names + sample_data['sample_id'] = sample_ids[i] + sample_data['configuration'] = configurations[i] + samples_data.append(sample_data) + data = pd.concat(samples_data) + data = data.fillna({'Stratifier': 'all'}) + + if order_of_samples is None: + unique_sample_ids = data['sample_id'].unique() + else: + unique_sample_ids = order_of_samples + + if order_of_configurations is None: + unique_configurations = data['configuration'].unique() + else: + unique_configurations = order_of_configurations + + if stratifiers is None: + stratifiers = data['Stratifier'].unique() + else: + stratifiers = ['all'] + stratifiers + + data = calculate_metrics(data, unique_sample_ids, unique_configurations, stratifiers) + + data.to_csv('raw_data.csv') + + with open('comparison.csv', 'w') as output_file: + chainable_output = ChainableOutput(output_file, ',') + write_header(chainable_output, unique_sample_ids, unique_configurations, deltas, include_counts) + for stratifier in stratifiers: + write_stratifier(chainable_output, stratifier, data, unique_sample_ids, unique_configurations, deltas, include_counts) + + +if __name__ == '__main__': + 
parser = argparse.ArgumentParser(description='Create a table to compare output of BenchmarkVCFs.') + parser.add_argument('--stratifiers', type=str, nargs='*', help='Explicitly specify the stratifiers that have to be present in all samples. "all" will automatically be added. If not specified, the stratifiers will be inferred. If specified, this argument also defines the order of the stratifiers.') + parser.add_argument('--order-of-samples', type=str, nargs='+', help='Order of samples. If not specified, the order will be the same as the supplied inputs.') + parser.add_argument('--order-of-configurations', type=str, nargs='+', help='Order of configurations. If not specified, the order will be the same as the supplied inputs.') + parser.add_argument('--deltas', type=str, nargs='+', help='A list of configuration (zero-based) indices to compare. E.g. for comparing configurations 0 to 1 and 0 to 2, pass the values 0 1 0 2.') + parser.add_argument('--include-counts', action='store_true', help='If set, include the TP/FP/FN counts in the output table.') + required_named = parser.add_argument_group('Required named arguments') + required_named.add_argument('--sample-ids', required=True, type=str, nargs='+') + required_named.add_argument('--configurations', required=True, type=str, nargs='+') + required_named.add_argument('--summaries', required=True, type=str, nargs='+') + args = parser.parse_args() + main(args.sample_ids, args.configurations, args.summaries, args.stratifiers, args.order_of_samples, args.order_of_configurations, args.deltas, args.include_counts) +EOF + python script.py --sample-ids ~{sep=' ' sample_ids} --configurations ~{sep=' ' configurations} --summaries ~{sep=' ' benchmark_summaries} ~{stratifiers_arg} ~{sep=' ' stratifiers_or_empty} ~{order_of_samples_arg} ~{sep=' ' order_of_samples_or_empty} ~{order_of_configurations_arg} ~{sep=' ' order_of_configurations_or_empty} ~{deltas_arg} ~{sep=' ' deltas_or_empty} ~{true="--include-counts" false="" 
include_counts} + >>> + + runtime { + docker: "us.gcr.io/broad-dsde-methods/benchmark_vcfs/compare_benchmarks:1.1.0" + preemptible: preemptible + memory: mem_gb + " GB" + disks: "local-disk 20 HDD" + } + + output { + File comparison_csv = "comparison.csv" + File raw_data = "raw_data.csv" + } +} \ No newline at end of file diff --git a/wdl/ConvertToHailMT.wdl b/wdl/ConvertToHailMT.wdl new file mode 100644 index 000000000..1304d35e5 --- /dev/null +++ b/wdl/ConvertToHailMT.wdl @@ -0,0 +1,45 @@ +version 1.0 + +############################################################################################ +## A workflow that performs joint calling on gVCFs (usually from DeepVariant) using GLNexus. +############################################################################################ + +import "tasks/GLNexus.wdl" as GLNexus +import "tasks/Hail.wdl" as Hail +import "tasks/Finalize.wdl" as FF + +workflow ConvertToHailMT { + input { + File joint_gvcf + File joint_gvcf_tbi + String prefix + + String gcs_out_root_dir + } + + parameter_meta { + joint_gvcf: "joint-called gVCF file" + joint_gvcf_tbi: ".tbi index for joint-called gVCF file" + prefix: "prefix for output Hail MatrixTable" + gcs_out_root_dir: "GCS bucket in which to store the Hail MatrixTable" + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/JointCallGVCFs/~{prefix}" + + # Gather across multiple input gVCFs + call Hail.ConvertToHailMT { + input: + gvcf = joint_gvcf, + tbi = joint_gvcf_tbi, + prefix = prefix, + outdir = outdir + } + + ########## + # store the results into designated bucket + ########## + + output { + String joint_mt = ConvertToHailMT.gcs_path + } +} diff --git a/wdl/ConvertToZarrStore.wdl b/wdl/ConvertToZarrStore.wdl new file mode 100644 index 000000000..23633fb79 --- /dev/null +++ b/wdl/ConvertToZarrStore.wdl @@ -0,0 +1,45 @@ +version 1.0 + +################################################################# +## A workflow that converts a joint-called gVCF to a Zarr store. 
+################################################################# + +import "tasks/GLNexus.wdl" as GLNexus +import "tasks/SGKit.wdl" as SGKit +import "tasks/Finalize.wdl" as FF + +workflow ConvertToZarrStore { + input { + File joint_gvcf + File joint_gvcf_tbi + String prefix + + String gcs_out_root_dir + } + + parameter_meta { + joint_gvcf: "joint-called gVCF file" + joint_gvcf_tbi: ".tbi index for joint-called gVCF file" + prefix: "prefix for output Zarr store" + gcs_out_root_dir: "GCS bucket in which to store the Zarr store" + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/JointCallGVCFs/~{prefix}" + + # Gather across multiple input gVCFs + call SGKit.ConvertToZarrStore { + input: + gvcf = joint_gvcf, + tbi = joint_gvcf_tbi, + prefix = prefix, + outdir = outdir + } + + ########## + # store the results into designated bucket + ########## + + output { + String joint_zarr = ConvertToZarrStore.gcs_path + } +} \ No newline at end of file diff --git a/wdl/ExpandedDrugResistanceMarkerAggregation.wdl b/wdl/ExpandedDrugResistanceMarkerAggregation.wdl new file mode 100644 index 000000000..2885537b7 --- /dev/null +++ b/wdl/ExpandedDrugResistanceMarkerAggregation.wdl @@ -0,0 +1,160 @@ +version 1.0 + +import "tasks/Structs.wdl" +import "tasks/Finalize.wdl" as FF + +workflow ExpandedDrugResistanceMarkerExtraction { + input { + Array[String] sample_names + Array[File] expanded_drug_res_markers + + String out_file_prefix + + String dir_prefix + String gcs_out_root_dir + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/ExpandedDrugResistanceMarkerAggregation/~{dir_prefix}" + + call CombineExpandedDrugResistanceMarkers { + input: + expanded_drug_res_markers = expanded_drug_res_markers, + sample_names = sample_names, + prefix = out_file_prefix, + } + + + # Finalize data + String dir = outdir + "/reports" + + call FF.FinalizeToFile as FinalizeDRReportAllMarkers { input: outdir = dir, file = CombineExpandedDrugResistanceMarkers.combined_report } + + output { + 
File combined_expanded_markers = FinalizeDRReportAllMarkers.gcs_path + } +} + +task CombineExpandedDrugResistanceMarkers { + input { + Array[String] sample_names + Array[File] expanded_drug_res_markers + + String prefix + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + expanded_drug_res_markers: { localization_optional: true } + } + + Int disk_size = 10 + 10*ceil(size(expanded_drug_res_markers, "GB")) + + command <<< + set -euxo pipefail + + # Copy the files from the cloud to this machine here so it goes faster: + mkdir -p expanded_drug_reports + cd expanded_drug_reports + remote_sample_files=~{write_lines(expanded_drug_res_markers)} + cat ${remote_sample_files} | gsutil -m cp -I . + + # Create local file list: + cat ${remote_sample_files} | sed -e 's@^.*/@@' -e "s@^@$(pwd)/@g" > local_sample_files.txt + + cd .. + + python3 <>> + + output { + File combined_report = "~{prefix}.expanded_drug_report_combined.tsv.gz" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 10, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/ExpandedDrugResistanceMarkerExtraction.wdl b/wdl/ExpandedDrugResistanceMarkerExtraction.wdl new file mode 100644 index 
000000000..28949dddd --- /dev/null +++ b/wdl/ExpandedDrugResistanceMarkerExtraction.wdl @@ -0,0 +1,215 @@ +version 1.0 + +import "tasks/Structs.wdl" +import "tasks/FunctionalAnnotation.wdl" as FUNK +import "tasks/Finalize.wdl" as FF + +workflow ExpandedDrugResistanceMarkerExtraction { + input { + String sample_name + + File vcf + File snpeff_db + File protein_drug_resistance_list + File gene_drug_resistance_list + + String dir_prefix + String gcs_out_root_dir + + Boolean do_functional_annotation = true + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/ExpandedDrugResistanceMarkerExtraction/~{dir_prefix}" + + if (do_functional_annotation) { + call FUNK.FunctionallyAnnotateVariants { input: vcf = vcf, snpeff_db = snpeff_db } + } + + call CallDrugResistanceMutations { + input: + vcf = select_first([FunctionallyAnnotateVariants.annotated_vcf, vcf]), + protein_drug_resistance_list = protein_drug_resistance_list, + gene_drug_resistance_list = gene_drug_resistance_list, + prefix = sample_name + } + + # Finalize data + String dir = outdir + "/reports" + + call FF.FinalizeToFile as FinalizeDRReportAllMarkers { input: outdir = dir, file = CallDrugResistanceMutations.all_markers } + call FF.FinalizeToFile as FinalizeDRReportProteinMarkers { input: outdir = dir, file = CallDrugResistanceMutations.protein_coding_markers } + + if (do_functional_annotation) { + call FF.FinalizeToFile as FinalizeAnnotatedVCF { input: outdir = dir, file = select_first([FunctionallyAnnotateVariants.annotated_vcf]) } + call FF.FinalizeToFile as FinalizeAnnotatedVCFIndex { input: outdir = dir, file = select_first([FunctionallyAnnotateVariants.annotated_vcf_index]) } + call FF.FinalizeToFile as FinalizeSnpEffSummary { input: outdir = dir, file = select_first([FunctionallyAnnotateVariants.snpEff_summary]) } + call FF.FinalizeToFile as FinalizeSnpEffGenes { input: outdir = dir, file = select_first([FunctionallyAnnotateVariants.snpEff_genes]) } + } + + output { + File drug_res_report_all = 
FinalizeDRReportAllMarkers.gcs_path + File drug_res_report_prot_only = FinalizeDRReportProteinMarkers.gcs_path + + File? annotated_vcf = FinalizeAnnotatedVCF.gcs_path + File? annotated_vcf_index = FinalizeAnnotatedVCFIndex.gcs_path + File? snpEff_summary = FinalizeSnpEffSummary.gcs_path + File? snpEff_genes = FinalizeSnpEffGenes.gcs_path + } +} + +task CallDrugResistanceMutations { + input { + File vcf + File protein_drug_resistance_list + File gene_drug_resistance_list + + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 10*ceil(size([vcf, protein_drug_resistance_list, gene_drug_resistance_list], "GB")) + + command <<< + set -euxo pipefail + + python3 <') + annotation_fields = line[i1+len(needle):i2].replace("'", "").split(" | ") + continue + elif line.startswith("#"): + continue + chrom, pos, idd, ref, alt, qual, flt, info, gt_f, gt_d = line.strip().split("\t") + infos = {} + for i in info.split(";"): + if "=" in i: + k, v = i.split("=") + infos[k] = v + ann_dicts = make_ann_dict(infos["ANN"], annotation_fields) + + # Get genotype info: + gt_f = gt_f.split(":") + gt_d = gt_d.split(":") + + i = gt_f.index("GT") + gt = gt_d[i] + + base_out_data = [chrom, pos, ref, alt, gt] + + # Now check if our Gene level drug markers: + gene_annotations = [] + for gene_name, gene_id in gene_info: + for ann_dict in ann_dicts: + if gene_id in ann_dict["Gene_Name"] or gene_id in ann_dict["Gene_ID"]: + gene_annotations.append(tuple(base_out_data + [ann_dict[a] for a in annotation_fields])) + + for g in gene_annotations: + annotations.append(g) + + # Now check for our protein change string drug markers: + for gene_name, gene_id, prot_change in p_change_marker_info: + for ann_dict in ann_dicts: + if gene_id in ann_dict["Gene_Name"] or gene_id in ann_dict["Gene_ID"]: + if len(ann_dict["HGVS.p"]) > 0 and ann_dict["HGVS.p"] == prot_change: + annotations.append(tuple(base_out_data + [ann_dict[a] for a in annotation_fields])) + + header = 
"Chrom\tPos\tRef\tAlt\tGT\t" + "\t".join(annotation_fields) + with open(gene_drug_report_all, 'w') as f: + f.write(f"{header}\n") + for a in annotations: + f.write("\t".join(a)) + f.write("\n") + + with open(gene_drug_report_prot, 'w') as f: + f.write(f"{header}\n") + for a in annotations: + f.write("\t".join(a)) + f.write("\n") + + print('Done') + CODE + + >>> + + output { + File all_markers = "~{prefix}.expanded_drug_report.ALL.tsv" + File protein_coding_markers = "~{prefix}.expanded_drug_report.PROTEIN_CHANGES_ONLY.tsv" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "quay.io/biocontainers/snpeff:5.1d--hdfd78af_0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/ExtractRegionsFromBam.wdl b/wdl/ExtractRegionsFromBam.wdl new file mode 100644 index 000000000..181441260 --- /dev/null +++ b/wdl/ExtractRegionsFromBam.wdl @@ -0,0 +1,62 @@ +version 1.0 + +import "tasks/Utils.wdl" as Utils +import "tasks/SRUtils.wdl" as SRUtils +import "tasks/Finalize.wdl" as FF + +workflow ExtractRegionsFromBam { + meta { + desciption: "Extract reads from the given bam file which overlap the regions in the given bed file." 
+ } + + input { + String gcs_bam_path + File regions_bed + + String participant_name + String extraction_comment + + String gcs_out_root_dir + } + + parameter_meta { + gcs_bam_path: "GCS URL to bam file from which to extract reads." + regions_bed: "Bed file containing regions for which to extract reads." + participant_name: "Participant (or sample) name for the given bam file." + extraction_comment: "Comment to add to the end of the output filename." + gcs_out_root_dir: "Output folder into which to place the results of this workflow." + } + + # First clean the extraction comment: + String clean_comment = sub(extraction_comment, "[!$\t\n\r/<>:\"\\|?*&%#@'`~\[\]\{\}]", "_") + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/ExtractRegionsFromBam/~{participant_name}_~{extraction_comment}" + + call Utils.GetReadsInBedFileRegions as GetReadsInBedFileRegions { + input: + gcs_bam_path = gcs_bam_path, + regions_bed = regions_bed, + prefix = "~{participant_name}_~{extraction_comment}", + } + + call SRUtils.BamToFq as Bam2Fastq { + input: + bam = GetReadsInBedFileRegions.bam, + prefix = "~{participant_name}_~{extraction_comment}" + } + + call FF.FinalizeToFile as FinalizeBam { input: outdir = outdir, file = GetReadsInBedFileRegions.bam } + call FF.FinalizeToFile as FinalizeBai { input: outdir = outdir, file = GetReadsInBedFileRegions.bai } + call FF.FinalizeToFile as FinalizeFqEnd1 { input: outdir = outdir, file = Bam2Fastq.fq_end1 } + call FF.FinalizeToFile as FinalizeFqEnd2 { input: outdir = outdir, file = Bam2Fastq.fq_end2 } + call FF.FinalizeToFile as FinalizeFqUnpaired { input: outdir = outdir, file = Bam2Fastq.fq_unpaired } + + output { + File bam = FinalizeBam.gcs_path + File bai = FinalizeBai.gcs_path + File fq_end1 = FinalizeFqEnd1.gcs_path + File fq_end2 = FinalizeFqEnd2.gcs_path + File fq_unpaired = FinalizeFqUnpaired.gcs_path + + } +} \ No newline at end of file diff --git a/wdl/LRConvertBCF.wdl b/wdl/LRConvertBCF.wdl new file mode 100644 index 
000000000..6eed49e9e --- /dev/null +++ b/wdl/LRConvertBCF.wdl @@ -0,0 +1,46 @@ +version 1.0 + +############################################################################################ +## A workflow that converts a BCF file into a .vcf.gz file. Meant to temporarily handle some +## transient issues stemming from the LRJointCallGVCFs workflow. Should be removed eventually. +############################################################################################ + +import "tasks/GLNexus.wdl" as GLNexus +import "tasks/Hail.wdl" as Hail +import "tasks/Finalize.wdl" as FF + +workflow LRConvertBCF { + input { + File joint_bcf + String prefix + + String gcs_out_root_dir + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/JointCallGVCFs/~{prefix}" + + # Convert the joint .bcf callset into a single joint .vcf.bgz callset + call GLNexus.ConcatBCFs { input: bcfs = [ joint_bcf ], prefix = prefix } + + call Hail.ConvertToHailMT { + input: + gvcf = ConcatBCFs.joint_gvcf, + tbi = ConcatBCFs.joint_gvcf_tbi, + prefix = prefix, + outdir = outdir + } + + # Finalize + call FF.FinalizeToFile as FinalizeGVCF { input: outdir = outdir, file = ConcatBCFs.joint_gvcf } + call FF.FinalizeToFile as FinalizeTBI { input: outdir = outdir, file = ConcatBCFs.joint_gvcf_tbi } + + ########## + # store the results into designated bucket + ########## + + output { + File joint_gvcf = FinalizeGVCF.gcs_path + File joint_gvcf_tbi = FinalizeTBI.gcs_path + String joint_mt = ConvertToHailMT.gcs_path + } +} diff --git a/wdl/LRJointCallGVCFs.wdl b/wdl/LRJointCallGVCFs.wdl new file mode 100644 index 000000000..143f4c2ce --- /dev/null +++ b/wdl/LRJointCallGVCFs.wdl @@ -0,0 +1,64 @@ +version 1.0 + +############################################################################################ +## A workflow that performs joint calling on gVCFs (usually from DeepVariant) using GLNexus. 
+############################################################################################ + +import "tasks/GLNexus.wdl" as GLNexus +import "tasks/Hail.wdl" as Hail +import "tasks/Finalize.wdl" as FF + +workflow LRJointCallGVCFs { + input { + Array[File] gvcfs + Array[File] tbis + File ref_map_file + + String prefix + + String gcs_out_root_dir + } + + parameter_meta { + gvcfs: "GCS paths to gVCF files" + tbis: "GCS paths to gVCF tbi files" + ref_map_file: "table indicating reference sequence and auxillary file locations" + prefix: "prefix for output joint-called gVCF and tabix index" + gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files" + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/JointCallGVCFs/~{prefix}" + + Map[String, String] ref_map = read_map(ref_map_file) + + # Gather across multiple input gVCFs + call GLNexus.JointCall { + input: + gvcfs = gvcfs, + tbis = tbis, + dict = ref_map['dict'], + prefix = prefix + } + + call Hail.ConvertToHailMT { + input: + gvcf = JointCall.joint_gvcf, + tbi = JointCall.joint_gvcf_tbi, + prefix = prefix, + outdir = outdir + } + + # Finalize + call FF.FinalizeToFile as FinalizeGVCF { input: outdir = outdir, file = JointCall.joint_gvcf } + call FF.FinalizeToFile as FinalizeTBI { input: outdir = outdir, file = JointCall.joint_gvcf_tbi } + + ########## + # store the results into designated bucket + ########## + + output { + File joint_gvcf = FinalizeGVCF.gcs_path + File joint_gvcf_tbi = FinalizeTBI.gcs_path + String joint_mt = ConvertToHailMT.gcs_path + } +} diff --git a/wdl/LRJointCallGVCFsWithGenomicsDB.wdl b/wdl/LRJointCallGVCFsWithGenomicsDB.wdl new file mode 100644 index 000000000..3323fce79 --- /dev/null +++ b/wdl/LRJointCallGVCFsWithGenomicsDB.wdl @@ -0,0 +1,269 @@ +version 1.0 + +############################################################################################################# +## A workflow that performs joint calling on single-sample gVCFs from GATK4 HaplotypeCaller 
using GenomicsDB. +############################################################################################################# + +import "tasks/SRJointGenotyping.wdl" as SRJOINT +import "tasks/VariantUtils.wdl" as VARUTIL +import "tasks/Hail.wdl" as Hail +import "tasks/Finalize.wdl" as FF + +workflow LRJointCallGVCFsWithGenomicsDB { + input { + Array[File] gvcfs + Array[File] gvcf_indices + + File ref_map_file + + File interval_list + + Float snp_filter_level = 99.7 + Array[String] snp_recalibration_annotation_values = ["QD", "FS", "SOR", "MQRankSum", "ReadPosRankSum"] + Array[Float] snp_recalibration_tranche_values = [100.0, 99.95, 99.9, 99.8, 99.6, 99.5, 99.4, 99.3, 99.0, 98.0, 97.0, 90.0 ] + + Array[File] snp_known_reference_variants + Array[File] snp_known_reference_variants_index + Array[File] snp_known_reference_variants_identifier + Array[Boolean] snp_is_known + Array[Boolean] snp_is_training + Array[Boolean] snp_is_truth + Array[Float] snp_prior + Int snp_max_gaussians = 8 + + Float indel_filter_level = 99.0 + Array[String] indel_recalibration_annotation_values = ["QD", "FS", "SOR", "MQRankSum", "ReadPosRankSum"] + Array[Float] indel_recalibration_tranche_values = [100.0, 99.95, 99.9, 99.5, 99.0, 97.0, 96.0, 95.0, 94.0, 93.5, 93.0, 92.0, 91.0, 90.0] + + Array[File] indel_known_reference_variants + Array[File] indel_known_reference_variants_index + Array[File] indel_known_reference_variants_identifier + Array[Boolean] indel_is_known + Array[Boolean] indel_is_training + Array[Boolean] indel_is_truth + Array[Float] indel_prior + Int indel_max_gaussians = 8 + + Array[File]? annotation_bed_files + Array[File]? annotation_bed_file_indexes + Array[String]? 
annotation_bed_file_annotation_names + + String prefix + + String gcs_out_root_dir + } + + parameter_meta { + gvcfs: "GCS paths to gVCF files" + gvcf_indices: "GCS paths to gVCF tbi files" + ref_map_file: "table indicating reference sequence and auxillary file locations" + prefix: "prefix for output joint-called gVCF and tabix index" + gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files" + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/LRJointCallGVCFsWithGenomicsDB/~{prefix}" + + Map[String, String] ref_map = read_map(ref_map_file) + + # From WARP: + # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. + # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. + # We allow overriding this default behavior for testing / special requests. + Boolean is_small_callset = length(gvcfs) <= 1000 + + # Create sample-name map: + call SRJOINT.CreateSampleNameMap as CreateSampleNameMap { + input: + gvcfs = gvcfs, + prefix = prefix + } + + # Import our data into GenomicsDB: + call SRJOINT.ImportGVCFs as ImportGVCFsIntoGenomicsDB { + input: + sample_name_map = CreateSampleNameMap.sample_name_map, + interval_list = interval_list, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + prefix = prefix, + batch_size = 50, + # We need to override this because we're not actually sending the GVCF over (just a list) + # ALSO, we're currently tarring the genomicsDB, so we need at least double the space here, plus some slop: + runtime_attr_override = object {disk_gb: 10 + (3 * CreateSampleNameMap.total_gvcf_size_gb) + (2 * ceil(size(ref_map['fasta'], "GB"))), preemptible_tries: 0} + } + + # Joint call + call SRJOINT.GenotypeGVCFs as JointCallGVCFs { + input: + input_gvcf_data = ImportGVCFsIntoGenomicsDB.output_genomicsdb, + interval_list = interval_list, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + 
ref_dict = ref_map['dict'], + dbsnp_vcf = ref_map["known_sites_vcf"], + prefix = prefix, + runtime_attr_override = object {preemptible_tries: 0}, # Disable preemption for prototype. + } + + # First make a sites-only VCF for recal (smaller file, easier to work with): + call VARUTIL.MakeSitesOnlyVcf as MakeSitesOnlyGVCF { + input: + vcf = JointCallGVCFs.output_vcf, + vcf_index = JointCallGVCFs.output_vcf_index, + prefix = prefix + } + + # Now we run VariantRecalibrator for indels and snps: + call VARUTIL.IndelsVariantRecalibrator as TrainVQSROnHCIndelVariants { + input: + vcfs = [MakeSitesOnlyGVCF.sites_only_vcf], + vcf_indices = [MakeSitesOnlyGVCF.sites_only_vcf_index], + prefix = prefix + ".indels", + recalibration_tranche_values = indel_recalibration_tranche_values, + recalibration_annotation_values = indel_recalibration_annotation_values, +# known_reference_variants = [ref_map["known_sites_vcf"]], +# known_reference_variants_index = [ref_map["known_sites_index"]], +# known_reference_variants_identifier = ["pfcrosses"], +# is_known = [true], +# is_training = [true], +# is_truth = [true], +# prior = [15], + known_reference_variants = indel_known_reference_variants, + known_reference_variants_index = indel_known_reference_variants_index, + known_reference_variants_identifier = indel_known_reference_variants_identifier, + is_known = indel_is_known, + is_training = indel_is_training, + is_truth = indel_is_truth, + prior = indel_prior, + use_allele_specific_annotations = false, + max_gaussians = indel_max_gaussians, + } + + call VARUTIL.SNPsVariantRecalibratorCreateModel as TrainVQSROnHCSnpVariants { + input: + vcfs = [MakeSitesOnlyGVCF.sites_only_vcf], + vcf_indices = [MakeSitesOnlyGVCF.sites_only_vcf_index], + prefix = prefix + ".snps", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, +# known_reference_variants = [ref_map["known_sites_vcf"]], +# known_reference_variants_index 
= [ref_map["known_sites_index"]], +# known_reference_variants_identifier = ["pfcrosses"], +# is_known = [true], +# is_training = [true], +# is_truth = [true], +# prior = [15], + known_reference_variants = snp_known_reference_variants, + known_reference_variants_index = snp_known_reference_variants_index, + known_reference_variants_identifier = snp_known_reference_variants_identifier, + is_known = snp_is_known, + is_training = snp_is_training, + is_truth = snp_is_truth, + prior = snp_prior, + use_allele_specific_annotations = false, + max_gaussians = snp_max_gaussians, + } + + call VARUTIL.ApplyVqsr as ApplyVqsr { + input: + vcf = JointCallGVCFs.output_vcf, + vcf_index = JointCallGVCFs.output_vcf_index, + + prefix = prefix + ".vqsr_filtered", + + snps_recalibration = TrainVQSROnHCSnpVariants.recalibration, + snps_recalibration_index = TrainVQSROnHCSnpVariants.recalibration_index, + snps_tranches = TrainVQSROnHCSnpVariants.tranches, + snp_filter_level = snp_filter_level, + + indels_recalibration = TrainVQSROnHCIndelVariants.recalibration, + indels_recalibration_index = TrainVQSROnHCIndelVariants.recalibration_index, + indels_tranches = TrainVQSROnHCIndelVariants.tranches, + indel_filter_level = indel_filter_level, + + use_allele_specific_annotations = false, + } + + # Now we need to annotate our variants by region: + if (defined(annotation_bed_files)) { + call VARUTIL.AnnotateVcfWithBedRegions as AnnotateVcfRegions { + input: + vcf = ApplyVqsr.recalibrated_vcf, + vcf_index = ApplyVqsr.recalibrated_vcf_index, + bed_files = select_first([annotation_bed_files]), + bed_file_indexes = select_first([annotation_bed_file_indexes]), + bed_file_annotation_names = select_first([annotation_bed_file_annotation_names]), + prefix = prefix + ".region_annotated" + } + } + + # Finally convert the output to a HAIL Matrix Table: + call Hail.ConvertToHailMT as CreateHailMatrixTable { + input: + gvcf = select_first([AnnotateVcfRegions.annotated_vcf, ApplyVqsr.recalibrated_vcf]), + tbi = 
select_first([AnnotateVcfRegions.annotated_vcf_index, ApplyVqsr.recalibrated_vcf_index]), + reference = sub(sub(ref_map["fasta"], "^.*/", ""), "\.[fasta]*$", ""), + ref_fasta = ref_map["fasta"], + ref_fai = ref_map["fai"], + prefix = prefix, + outdir = outdir + } + + # Finalize: + File keyfile = select_first([AnnotateVcfRegions.annotated_vcf_index, ApplyVqsr.recalibrated_vcf_index]) + + call FF.FinalizeToFile as FinalizeGenomicsDB { input: outdir = outdir, keyfile = keyfile, file = ImportGVCFsIntoGenomicsDB.output_genomicsdb } + + call FF.FinalizeToFile as FinalizeRawVCF { input: outdir = outdir, keyfile = keyfile, file = JointCallGVCFs.output_vcf } + call FF.FinalizeToFile as FinalizeRawTBI { input: outdir = outdir, keyfile = keyfile, file = JointCallGVCFs.output_vcf_index } + + call FF.FinalizeToFile as FinalizeIndelRecalFile { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCIndelVariants.recalibration } + call FF.FinalizeToFile as FinalizeIndelRecalIndex { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCIndelVariants.recalibration_index } + call FF.FinalizeToFile as FinalizeIndelRecalTranches { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCIndelVariants.tranches } + call FF.FinalizeToFile as FinalizeIndelRecalModelReport { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCIndelVariants.model_report } + + call FF.FinalizeToFile as FinalizeSnpRecalFile { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCSnpVariants.recalibration } + call FF.FinalizeToFile as FinalizeSnpRecalIndex { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCSnpVariants.recalibration_index } + call FF.FinalizeToFile as FinalizeSnpRecalTranches { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCSnpVariants.tranches } + call FF.FinalizeToFile as FinalizeSnpRecalModelReport { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCSnpVariants.model_report } + + call FF.FinalizeToFile as 
FinalizeVQSRVCF { input: outdir = outdir, keyfile = keyfile, file = ApplyVqsr.recalibrated_vcf } + call FF.FinalizeToFile as FinalizeVQSRTBI { input: outdir = outdir, keyfile = keyfile, file = ApplyVqsr.recalibrated_vcf_index } + + if (defined(annotation_bed_files)) { + call FF.FinalizeToFile as FinalizeRegionAnnotatedVcf { input: outdir = outdir, keyfile = keyfile, file = select_first([AnnotateVcfRegions.annotated_vcf]) } + call FF.FinalizeToFile as FinalizeRegionAnnotatedVcfIndex { input: outdir = outdir, keyfile = keyfile, file = select_first([AnnotateVcfRegions.annotated_vcf_index]) } + } + + ########## + # store the results into designated bucket + ########## + + output { + File genomicsDB = FinalizeGenomicsDB.gcs_path + + File raw_joint_vcf = FinalizeRawVCF.gcs_path + File raw_joint_vcf_tbi = FinalizeRawTBI.gcs_path + + File? vqsr_indel_recal_file = FinalizeIndelRecalFile.gcs_path + File? vqsr_indel_recal_file_index = FinalizeIndelRecalIndex.gcs_path + File? vqsr_indel_recal_tranches = FinalizeIndelRecalTranches.gcs_path + File? vqsr_indel_recal_model_report = FinalizeIndelRecalModelReport.gcs_path + + File? vqsr_snp_recal_file = FinalizeSnpRecalFile.gcs_path + File? vqsr_snp_recal_file_index = FinalizeSnpRecalIndex.gcs_path + File? vqsr_snp_recal_tranches = FinalizeSnpRecalTranches.gcs_path + File? vqsr_snp_recal_model_report = FinalizeSnpRecalModelReport.gcs_path + + File joint_recalibrated_vcf = FinalizeVQSRVCF.gcs_path + File joint_recalibrated_vcf_tbi = FinalizeVQSRTBI.gcs_path + + File? annotated_joint_vcf = AnnotateVcfRegions.annotated_vcf + File? 
annotated_joint_vcf_tbi = AnnotateVcfRegions.annotated_vcf_index + + File joint_mt = CreateHailMatrixTable.gcs_path + } +} + + diff --git a/wdl/LRJointCallGVCFsWithGenomicsDB_monolithic.wdl b/wdl/LRJointCallGVCFsWithGenomicsDB_monolithic.wdl new file mode 100644 index 000000000..0aa8e6bb9 --- /dev/null +++ b/wdl/LRJointCallGVCFsWithGenomicsDB_monolithic.wdl @@ -0,0 +1,1197 @@ +version 1.0 + +############################################################################################################# +## A workflow that performs joint calling on single-sample gVCFs from GATK4 HaplotypeCaller using GenomicsDB. +############################################################################################################# + +struct RuntimeAttr { + Float? mem_gb + Int? cpu_cores + Int? disk_gb + Int? boot_disk_gb + Int? preemptible_tries + Int? max_retries + String? docker +} + +struct DataTypeParameters { + Int num_shards + String map_preset +} + +workflow LRJointCallGVCFsWithGenomicsDB { + input { + Array[File] gvcfs + Array[File] gvcf_indices + + File ref_map_file + + File interval_list + + Float snp_filter_level = 99.7 + Array[String] snp_recalibration_annotation_values = ["QD", "FS", "SOR", "MQRankSum", "ReadPosRankSum"] + Array[Float] snp_recalibration_tranche_values = [100.0, 99.95, 99.9, 99.8, 99.6, 99.5, 99.4, 99.3, 99.0, 98.0, 97.0, 90.0 ] + + Float indel_filter_level = 99.0 + Array[String] indel_recalibration_annotation_values = ["QD", "FS", "SOR", "MQRankSum", "ReadPosRankSum"] + Array[Float] indel_recalibration_tranche_values = [100.0, 99.95, 99.9, 99.5, 99.0, 97.0, 96.0, 95.0, 94.0, 93.5, 93.0, 92.0, 91.0, 90.0] + + Array[File]? annotation_bed_files + Array[File]? annotation_bed_file_indexes + Array[String]? 
annotation_bed_file_annotation_names + + String prefix + + String gcs_out_root_dir + } + + parameter_meta { + gvcfs: "GCS paths to gVCF files" + gvcf_indices: "GCS paths to gVCF tbi files" + ref_map_file: "table indicating reference sequence and auxillary file locations" + prefix: "prefix for output joint-called gVCF and tabix index" + gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files" + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/LRJointCallGVCFsWithGenomicsDB/~{prefix}" + + Map[String, String] ref_map = read_map(ref_map_file) + + # From WARP: + # For small callsets (fewer than 1000 samples) we can gather the VCF shards and collect metrics directly. + # For anything larger, we need to keep the VCF sharded and gather metrics collected from them. + # We allow overriding this default behavior for testing / special requests. + Boolean is_small_callset = length(gvcfs) <= 1000 + + # Create sample-name map: + call CreateSampleNameMap as CreateSampleNameMap { + input: + gvcfs = gvcfs, + prefix = prefix + } + + # Import our data into GenomicsDB: + call ImportGVCFs as ImportGVCFsIntoGenomicsDB { + input: + sample_name_map = CreateSampleNameMap.sample_name_map, + interval_list = interval_list, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + prefix = prefix, + batch_size = 50, + # We need to override this because we're not actually sending the GVCF over (just a list) + # ALSO, we're currently tarring the genomicsDB, so we need at least double the space here, plus some slop: + runtime_attr_override = object {disk_gb: 10 + (3 * CreateSampleNameMap.total_gvcf_size_gb) + (2 * ceil(size(ref_map['fasta'], "GB"))), preemptible_tries: 0} + } + + # Joint call + call GenotypeGVCFs as JointCallGVCFs { + input: + input_gvcf_data = ImportGVCFsIntoGenomicsDB.output_genomicsdb, + interval_list = interval_list, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = 
ref_map['dict'], + dbsnp_vcf = ref_map["known_sites_vcf"], + prefix = prefix, + runtime_attr_override = object {preemptible_tries: 0}, # Disable preemption for prototype. + } + + # First make a sites-only VCF for recal (smaller file, easier to work with): + call MakeSitesOnlyVcf as MakeSitesOnlyGVCF { + input: + vcf = JointCallGVCFs.output_vcf, + vcf_index = JointCallGVCFs.output_vcf_index, + prefix = prefix, + runtime_attr_override = object {preemptible_tries: 0}, # Disable preemption for prototype. + } + + # Now we run VariantRecalibrator for indels and snps: + call IndelsVariantRecalibrator as TrainVQSROnHCIndelVariants { + input: + vcf = MakeSitesOnlyGVCF.sites_only_vcf, + vcf_index = MakeSitesOnlyGVCF.sites_only_vcf_index, + prefix = prefix + ".indels", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + known_reference_variants = [ref_map["known_sites_vcf"]], + known_reference_variants_index = [ref_map["known_sites_index"]], + known_reference_variants_identifier = ["pfcrosses"], + is_known = [true], + is_training = [true], + is_truth = [true], + prior = [15], + use_allele_specific_annotations = true, + max_gaussians = 8, + runtime_attr_override = object {preemptible_tries: 0}, # Disable preemption for prototype. 
+ } + + call SNPsVariantRecalibratorCreateModel as TrainVQSROnHCSnpVariants { + input: + vcf = MakeSitesOnlyGVCF.sites_only_vcf, + vcf_index = MakeSitesOnlyGVCF.sites_only_vcf_index, + prefix = prefix + ".snps", + recalibration_tranche_values = snp_recalibration_tranche_values, + recalibration_annotation_values = snp_recalibration_annotation_values, + known_reference_variants = [ref_map["known_sites_vcf"]], + known_reference_variants_index = [ref_map["known_sites_index"]], + known_reference_variants_identifier = ["pfcrosses"], + is_known = [true], + is_training = [true], + is_truth = [true], + prior = [15], + use_allele_specific_annotations = true, + max_gaussians = 8, + runtime_attr_override = object {preemptible_tries: 0}, # Disable preemption for prototype. + } + + call ApplyVqsr as ApplyVqsr { + input: + vcf = JointCallGVCFs.output_vcf, + vcf_index = JointCallGVCFs.output_vcf_index, + + prefix = prefix + ".vqsr_filtered", + + snps_recalibration = TrainVQSROnHCSnpVariants.recalibration, + snps_recalibration_index = TrainVQSROnHCSnpVariants.recalibration_index, + snps_tranches = TrainVQSROnHCSnpVariants.tranches, + snp_filter_level = snp_filter_level, + + indels_recalibration = TrainVQSROnHCIndelVariants.recalibration, + indels_recalibration_index = TrainVQSROnHCIndelVariants.recalibration_index, + indels_tranches = TrainVQSROnHCIndelVariants.tranches, + indel_filter_level = indel_filter_level, + + use_allele_specific_annotations = true, + + runtime_attr_override = object {preemptible_tries: 0}, # Disable preemption for prototype. 
+ } + + # Now we need to annotate our variants by region: + if (defined(annotation_bed_files)) { + call AnnotateVcfWithBedRegions as AnnotateVcfRegions { + input: + vcf = ApplyVqsr.recalibrated_vcf, + vcf_index = ApplyVqsr.recalibrated_vcf_index, + bed_files = select_first([annotation_bed_files]), + bed_file_indexes = select_first([annotation_bed_file_indexes]), + bed_file_annotation_names = select_first([annotation_bed_file_annotation_names]), + prefix = prefix + ".region_annotated", + + runtime_attr_override = object {preemptible_tries: 0}, # Disable preemption for prototype. + } + } + + # Finally convert the output to a HAIL Matrix Table: + call ConvertToHailMT as CreateHailMatrixTable { + input: + gvcf = select_first([AnnotateVcfRegions.annotated_vcf, ApplyVqsr.recalibrated_vcf]), + tbi = select_first([AnnotateVcfRegions.annotated_vcf_index, ApplyVqsr.recalibrated_vcf_index]), + reference = sub(sub(ref_map["fasta"], "^.*/", ""), "\.[fasta]*$", ""), + ref_fasta = ref_map["fasta"], + ref_fai = ref_map["fai"], + prefix = prefix, + outdir = outdir, + + runtime_attr_override = object {preemptible_tries: 0}, # Disable preemption for prototype. 
+ } + + # Finalize: + File keyfile = select_first([AnnotateVcfRegions.annotated_vcf_index, ApplyVqsr.recalibrated_vcf_index]) + + call FinalizeToFile as FinalizeGenomicsDB { input: outdir = outdir, keyfile = keyfile, file = ImportGVCFsIntoGenomicsDB.output_genomicsdb } + + call FinalizeToFile as FinalizeRawVCF { input: outdir = outdir, keyfile = keyfile, file = JointCallGVCFs.output_vcf } + call FinalizeToFile as FinalizeRawTBI { input: outdir = outdir, keyfile = keyfile, file = JointCallGVCFs.output_vcf_index } + + call FinalizeToFile as FinalizeIndelRecalFile { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCIndelVariants.recalibration } + call FinalizeToFile as FinalizeIndelRecalIndex { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCIndelVariants.recalibration_index } + call FinalizeToFile as FinalizeIndelRecalTranches { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCIndelVariants.tranches } + call FinalizeToFile as FinalizeIndelRecalModelReport { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCIndelVariants.model_report } + + call FinalizeToFile as FinalizeSnpRecalFile { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCSnpVariants.recalibration } + call FinalizeToFile as FinalizeSnpRecalIndex { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCSnpVariants.recalibration_index } + call FinalizeToFile as FinalizeSnpRecalTranches { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCSnpVariants.tranches } + call FinalizeToFile as FinalizeSnpRecalModelReport { input: outdir = outdir, keyfile = keyfile, file = TrainVQSROnHCSnpVariants.model_report } + + call FinalizeToFile as FinalizeVQSRVCF { input: outdir = outdir, keyfile = keyfile, file = ApplyVqsr.recalibrated_vcf } + call FinalizeToFile as FinalizeVQSRTBI { input: outdir = outdir, keyfile = keyfile, file = ApplyVqsr.recalibrated_vcf_index } + + if (defined(annotation_bed_files)) { + call FinalizeToFile as 
FinalizeRegionAnnotatedVcf { input: outdir = outdir, keyfile = keyfile, file = select_first([AnnotateVcfRegions.annotated_vcf]) } + call FinalizeToFile as FinalizeRegionAnnotatedVcfIndex { input: outdir = outdir, keyfile = keyfile, file = select_first([AnnotateVcfRegions.annotated_vcf_index]) } + } + + ########## + # store the results into designated bucket + ########## + + output { + File genomicsDB = FinalizeGenomicsDB.gcs_path + + File raw_joint_vcf = FinalizeRawVCF.gcs_path + File raw_joint_vcf_tbi = FinalizeRawTBI.gcs_path + + File? vqsr_indel_recal_file = FinalizeIndelRecalFile.gcs_path + File? vqsr_indel_recal_file_index = FinalizeIndelRecalIndex.gcs_path + File? vqsr_indel_recal_tranches = FinalizeIndelRecalTranches.gcs_path + File? vqsr_indel_recal_model_report = FinalizeIndelRecalModelReport.gcs_path + + File? vqsr_snp_recal_file = FinalizeSnpRecalFile.gcs_path + File? vqsr_snp_recal_file_index = FinalizeSnpRecalIndex.gcs_path + File? vqsr_snp_recal_tranches = FinalizeSnpRecalTranches.gcs_path + File? vqsr_snp_recal_model_report = FinalizeSnpRecalModelReport.gcs_path + + File joint_recalibrated_vcf = FinalizeVQSRVCF.gcs_path + File joint_recalibrated_vcf_tbi = FinalizeVQSRTBI.gcs_path + + File? annotated_joint_vcf = AnnotateVcfRegions.annotated_vcf + File? annotated_joint_vcf_tbi = AnnotateVcfRegions.annotated_vcf_index + + File joint_mt = CreateHailMatrixTable.gcs_path + } +} + + +task CreateSampleNameMap { + + meta { + description: "Creates the sample / name-map file of the GVCFs for ingest into ImportGVCFs. NOTE: Some of this functionality is duplicated from Utils.InferSampleName. This is intentional - we don't want to localize all these files or shard over potentially thousands of input GVCFs." + } + + input { + Array[File] gvcfs + String prefix + + RuntimeAttr? 
runtime_attr_override
+    }
+
+    parameter_meta {
+        gvcfs: {
+            help: "Array of single-sample GVCF files.",
+            localization_optional: true
+        }
+    }
+
+    Int disk_size_gb = 20
+
+    # Output file names.  NOTE: `size_file_gb` is the *name* of the file that will
+    # hold the total size (in GB), not the size itself.
+    String outfile_name = "~{prefix}.sample_name_map.tsv"
+    String size_file_gb = "~{prefix}.total_gvcf_file_size_gb.txt"
+
+    # Every so often we should reauthorize so `bcftools` can continue to access our data:
+    Int re_auth_interval = 50
+
+    command <<<
+        set -euxo pipefail
+
+        # Put our gvcfs into a file we can iterate over:
+        gvcf_file_list=~{write_lines(gvcfs)}
+
+        # Initialize a file for the sample names:
+        [ -e ~{outfile_name} ] && rm -rf ~{outfile_name}
+
+        # Set our access token:
+        export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
+
+        # Create a temporary file to store file sizes in:
+        size_file=$(mktemp)
+
+        let i=1
+        while read file_path ; do
+
+            # Get our sample list from our file:
+            bcftools query -l ${file_path} > sample_names.txt
+
+            # Make sure we only have one sample name:
+            [[ $(wc -l sample_names.txt | awk '{print $1}') -ne 1 ]] && echo "Incorrect number of sample names found in GVCF (there can be only one!): ${file_path}" && exit 1
+
+            # Make sure the samplename has an actual name.
+            # BUGFIX: this used to be `[ $(grep -iq ...) ]`, but `grep -q` prints
+            # nothing, so the `[ ]` test was always false and the check could never
+            # fire.  Test grep's exit status directly instead:
+            if grep -iq "unnamedsample" sample_names.txt ; then
+                echo "Sample name found to be unnamedsample in GVCF: ${file_path}"
+                exit 1
+            fi
+
+            # Add the sample name and GVCF path to the sample name file:
+            echo -e "$(cat sample_names.txt)\t${file_path}" >> ~{outfile_name}
+
+            # Add the file size to the size file:
+            gsutil du -sac ${file_path} | tail -n1 | awk '{print $1}' >> ${size_file}
+
+            let i=$i+1
+            if [[ $i -gt ~{re_auth_interval} ]] ; then
+                # Periodically we should update the token so we don't have problems with long file lists:
+                export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
+                i=0
+            fi
+        done < ${gvcf_file_list}
+
+        # Now calculate the final file size in GB:
+        # We include an additional GB in case we have a very small dataset:
+        awk '{s += $1}END{print int(1+s/(1024*1024*1024))}' ${size_file} > ~{size_file_gb}
+    >>>
+
+    #########################
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 2,
+        disk_gb: disk_size_gb,
+        boot_disk_gb: 10,
+        preemptible_tries: 1,
+        max_retries: 1,
+        docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1"
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: select_first([runtime_attr.docker, default_attr.docker])
+    }
+
+    output {
+        File sample_name_map = outfile_name
+        Int total_gvcf_size_gb = read_int("~{size_file_gb}")
+    }
+}
+
+task ImportGVCFs {
+
+    input {
+        File sample_name_map
+
+        File interval_list
+
+        File ref_fasta
+        File ref_fasta_fai
+        File ref_dict
+
+        String prefix
+
+        Int batch_size = 50
+
+        RuntimeAttr?
runtime_attr_override + } + + Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_fasta_fai, "GB") + size(ref_dict, "GB")) + + Int disk_size = 8192 + 4*ref_size + + command <<< + set -euxo pipefail + + # Make sure that the output directory does not exist: + [ -e ~{prefix} ] && rm -rf ~{prefix} + + # + # Notes from WARP Team: + # + # We've seen some GenomicsDB performance regressions related to intervals, so we're going to pretend we only have a single interval + # using the --merge-input-intervals arg + # There's no data in between since we didn't run HaplotypeCaller over those loci so we're not wasting any compute + + # The memory setting here is very important and must be several GiB lower + # than the total memory allocated to the VM because this tool uses + # a significant amount of non-heap memory for native libraries. + # Also, testing has shown that the multithreaded reader initialization + # does not scale well beyond 5 threads, so don't increase beyond that. + gatk --java-options "-Xms8000m -Xmx25000m" \ + GenomicsDBImport \ + --genomicsdb-workspace-path ~{prefix}.genomicsDB \ + --batch-size ~{batch_size} \ + -L ~{interval_list} \ + --sample-name-map ~{sample_name_map} \ + --reader-threads 5 \ + --merge-input-intervals \ + --consolidate + + tar -cf ~{prefix}.genomicsDB.tar ~{prefix}.genomicsDB + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: 
select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File output_genomicsdb = "~{prefix}.genomicsDB.tar" + } +} + +task GenotypeGVCFs { + + input { + File input_gvcf_data + File? input_gvcf_index # Required if passing a VCF file. + + File interval_list + + File ref_fasta + File ref_fasta_fai + File ref_dict + + String dbsnp_vcf + + String prefix + + Boolean keep_combined_raw_annotations = false + RuntimeAttr? runtime_attr_override + } + + Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_fasta_fai, "GB") + size(ref_dict, "GB")) + Int db_snp_size = ceil(size(dbsnp_vcf, "GB")) + + Int disk_size = 1 + 4*ceil(size(input_gvcf_data, "GB")) + ref_size + db_snp_size + + parameter_meta { + input_gvcf_data: { help: "Either a single GVCF file or a GenomicsDB Tar file." } + interval_list: { + localization_optional: true + } + } + + command <<< + set -euxo pipefail + + # We must determine if our input variants are in a genomicsdb file or in a VCF. 
+ # The easiest way is to see if the input is a .tar file: + + is_genomics_db=true + filename=$(basename -- "~{input_gvcf_data}") + extension="${filename##*.}" + if [[ "${extension}" != "tar" ]] ; then + is_genomics_db=false + fi + + if $is_genomics_db ; then + tar -xf ~{input_gvcf_data} + INPUT_FILE="gendb://$(basename ~{input_gvcf_data} .tar)" + else + INPUT_FILE=~{input_gvcf_data} + fi + + gatk --java-options "-Xms8000m -Xmx25000m" \ + GenotypeGVCFs \ + -R ~{ref_fasta} \ + -O ~{prefix}.vcf.gz \ + -D ~{dbsnp_vcf} \ + -G StandardAnnotation -G AS_StandardAnnotation \ + --only-output-calls-starting-in-intervals \ + -V ${INPUT_FILE} \ + -L ~{interval_list} \ + ~{true='--keep-combined-raw-annotations' false='' keep_combined_raw_annotations} \ + --merge-input-intervals + >>> + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 26, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File output_vcf = "~{prefix}.vcf.gz" + File output_vcf_index = "~{prefix}.vcf.gz.tbi" + } +} + + +task HardFilterVcf { + + input { + File vcf + File vcf_index + + String prefix + + # From WARP: + # ExcessHet is a phred-scaled p-value. 
We want a cutoff of anything more extreme
+        # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69
+        Float excess_het_threshold = 54.69
+
+        RuntimeAttr? runtime_attr_override
+    }
+
+    Int disk_size = 1 + 4*ceil(size([vcf, vcf_index], "GB"))
+
+    command <<<
+        set -euo pipefail
+
+        # Get amount of memory to use:
+        mem_available=$(free -m | grep '^Mem' | awk '{print $2}')
+        let mem_start=${mem_available}-1000
+        let mem_max=${mem_available}-750
+
+        # Annotate (not remove) sites whose ExcessHet exceeds the threshold with an
+        # `ExcessHet` FILTER entry:
+        gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \
+            VariantFiltration \
+            --filter-expression "ExcessHet > ~{excess_het_threshold}" \
+            --filter-name ExcessHet \
+            -V ~{vcf} \
+            -O ~{prefix}.hard_filtered.vcf.gz
+    >>>
+
+    #########################
+    RuntimeAttr default_attr = object {
+        cpu_cores: 1,
+        mem_gb: 4,
+        disk_gb: disk_size,
+        boot_disk_gb: 15,
+        preemptible_tries: 1,
+        max_retries: 1,
+        docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0"
+    }
+    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
+    runtime {
+        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
+        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
+        # NOTE(review): "LOCAL" requests local SSD; sizes are rounded up by the
+        # backend, so the computed disk_size is a lower bound — confirm intended.
+        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL"
+        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
+        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
+        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
+        docker: select_first([runtime_attr.docker, default_attr.docker])
+    }
+
+    output {
+        File variant_filtered_vcf = "~{prefix}.hard_filtered.vcf.gz"
+        File variant_filtered_vcf_index = "~{prefix}.hard_filtered.vcf.gz.tbi"
+    }
+}
+
+task MakeSitesOnlyVcf {
+
+    input {
+        File vcf
+        File vcf_index
+
+        String prefix
+
+        RuntimeAttr?
runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size([vcf, vcf_index], "GB")) + + command <<< + set -euo pipefail + + # Get amount of memory to use: + mem_available=$(free -m | grep '^Mem' | awk '{print $2}') + let mem_start=${mem_available}-1000 + let mem_max=${mem_available}-750 + + gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \ + MakeSitesOnlyVcf \ + -I ~{vcf} \ + -O ~{prefix}.sites_only.vcf.gz + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File sites_only_vcf = "~{prefix}.sites_only.vcf.gz" + File sites_only_vcf_index = "~{prefix}.sites_only.vcf.gz.tbi" + } +} + +task AnnotateVcfWithBedRegions { + input { + File vcf + File vcf_index + + Array[File] bed_files + Array[File] bed_file_indexes + Array[String] bed_file_annotation_names + + String prefix + + RuntimeAttr? 
runtime_attr_override
+    }
+
+    # BUGFIX: `size()` cannot take an array literal mixing `File` and `Array[File]`
+    # members (no common element type); flatten the BED arrays together with the
+    # VCF inputs first so the disk estimate type-checks.
+    Int disk_size = 1 + 4*ceil(size(flatten([[vcf, vcf_index], bed_files, bed_file_indexes]), "GB"))
+
+    command <<<
+        set -euxo pipefail
+
+        # Get amount of memory to use:
+        mem_available=$(free -m | grep '^Mem' | awk '{print $2}')
+        let mem_start=${mem_available}-1000
+        let mem_max=${mem_available}-750
+
+        # We need to generate argument strings from the input arrays.
+        # First we check that the arrays are the same length:
+        if [[ ~{length(bed_files)} -ne ~{length(bed_file_indexes)} ]] || \
+           [[ ~{length(bed_files)} -ne ~{length(bed_file_annotation_names)} ]] ; then
+            echo "ERROR: Not all input arrays for known variants contain the same number of elements: " 1>&2
+            echo "    bed_files                 = ~{length(bed_files)}" 1>&2
+            echo "    bed_file_indices          = ~{length(bed_file_indexes)}" 1>&2
+            echo "    bed_file_annotation_names = ~{length(bed_file_annotation_names)}" 1>&2
+            false
+        fi
+
+        # Now we can write out the arrays into a TSV file and add them line by line to the execution:
+        # Create the TSV:
+        options_tsv=~{write_tsv(transpose([bed_files, bed_file_annotation_names]))}
+
+        # Now we have to run `VariantFiltration` multiple times on its own output so that it can
+        # annotate each region in the file:
+        # NOTE: This is dumb, but must be done because the `--mask` and `--mask-name` inputs are not arrays.
+ + input_vcf=~{vcf} + output_vcf=~{prefix}.intermediate.vcf.gz + while read mask_options ; do + + bed_file=$(echo "${mask_options}" | awk -F'\t' '{print $1}') + mask_name=$(echo "${mask_options}" | awk -F'\t' '{print $2}') + + echo -e "RUNNING GATK ON NEW MASK: ${mask_name}\t${bed_file}" + + gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \ + VariantFiltration \ + -V ${input_vcf} \ + -O ${output_vcf} \ + --mask ${bed_file} \ + --mask-name ${mask_name} + + mv ${output_vcf} ~{prefix}.new_input.vcf.gz + mv ${output_vcf}.tbi ~{prefix}.new_input.vcf.gz.tbi + input_vcf=~{prefix}.new_input.vcf.gz + done < ${options_tsv} + + # Because of the `mv` at the end of the loop we need to move the "new_input" files here: + mv ~{prefix}.new_input.vcf.gz ~{prefix}.vcf.gz + mv ~{prefix}.new_input.vcf.gz.tbi ~{prefix}.vcf.gz.tbi + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File annotated_vcf = "~{prefix}.vcf.gz" + File annotated_vcf_index = "~{prefix}.vcf.gz.tbi" + } +} + +task IndelsVariantRecalibrator { + + input { + File vcf + File vcf_index + + String prefix + + Array[String] recalibration_tranche_values + 
Array[String] recalibration_annotation_values
+
+        Array[File] known_reference_variants
+        Array[File] known_reference_variants_index
+        Array[String] known_reference_variants_identifier
+        Array[Boolean] is_known
+        Array[Boolean] is_training
+        Array[Boolean] is_truth
+        Array[Int] prior
+
+        Boolean use_allele_specific_annotations
+        Int max_gaussians = 4
+
+        RuntimeAttr? runtime_attr_override
+    }
+
+    parameter_meta {
+        vcf: "Sites only VCF. Can be pre-filtered using hard-filters."
+        vcf_index: "Tribble Index for sites only VCF."
+        known_reference_variants: "Array of known reference VCF files. For humans, dbSNP is one example."
+        known_reference_variants_index: "Array of index files for known reference VCF files."
+        known_reference_variants_identifier: "Array of identifiers / names for the known_reference_variant file at the same array position. Must be the same length as `known_reference_variants`."
+        is_known: "Array of boolean values indicating if the known_reference_variant file at the same array position contains known variants. Must be the same length as `known_reference_variants`."
+        is_training: "Array of boolean values indicating if the known_reference_variant file at the same array position contains training data. Must be the same length as `known_reference_variants`."
+        is_truth: "Array of boolean values indicating if the known_reference_variant file at the same array position contains truth data. Must be the same length as `known_reference_variants`."
+        prior: "Array of integer values indicating the priors for the known_reference_variant file at the same array position. Must be the same length as `known_reference_variants`."
+    }
+
+    Int disk_size = 10 + ceil(size(known_reference_variants, "GB"))
+                  + 4*ceil(size(vcf, "GB"))
+                  + 2*ceil(size(vcf_index, "GB"))
+
+    command <<<
+        set -euxo pipefail
+
+        # We need to generate resource strings from the input arrays.
+ # First we check that the arrays are the same length: + if [[ ~{length(known_reference_variants)} -ne ~{length(known_reference_variants_identifier)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(known_reference_variants_index)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(is_known)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(is_training)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(is_truth)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(prior)} ]] ; then + echo "ERROR: Not all input arrays for known variants contain the same number of elements: " 1>&2 + echo " known_reference_variants = ~{length(known_reference_variants)}" 1>&2 + echo " known_reference_variants = ~{length(known_reference_variants_index)}" 1>&2 + echo " known_reference_variants_identifier = ~{length(known_reference_variants_identifier)}" 1>&2 + echo " is_known = ~{length(is_known)}" 1>&2 + echo " is_training = ~{length(is_training)}" 1>&2 + echo " is_truth = ~{length(is_truth)}" 1>&2 + echo " prior = ~{length(prior)}" 1>&2 + false + fi + + # Now we can write out the arrays into a TSV file and add them line by line to the execution: + # Create the TSV: + options_tsv=~{write_tsv(transpose([known_reference_variants_identifier, is_known, is_training, is_truth, prior, known_reference_variants]))} + + # Now read them into a string: + resource_flags=$(awk '{printf("--resource:%s,known=%s,training=%s,truth=%s,prior=%d %s ", $1, $2, $3, $4, $5, $6)}' ${options_tsv}) + + # Get amount of memory to use: + mem_available=$(free -g | grep '^Mem' | awk '{print $2}') + let mem_start=${mem_available}-2 + let mem_max=${mem_available}-1 + + gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \ + VariantRecalibrator \ + -V ~{vcf} \ + -O ~{prefix}.recal \ + --tranches-file ~{prefix}.tranches \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' 
recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode INDEL \ + --output-model ~{prefix}.model.report \ + --max-gaussians ~{max_gaussians} \ + ${resource_flags} + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 26, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File recalibration = "~{prefix}.recal" + File recalibration_index = "~{prefix}.recal.idx" + File tranches = "~{prefix}.tranches" + File model_report = "~{prefix}.model.report" + } +} + +task SNPsVariantRecalibratorCreateModel { + + input { + File vcf + File vcf_index + + String prefix + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + Array[File] known_reference_variants + Array[File] known_reference_variants_index + Array[String] known_reference_variants_identifier + Array[Boolean] is_known + Array[Boolean] is_training + Array[Boolean] is_truth + Array[Int] prior + + Int? downsampleFactor + + Boolean use_allele_specific_annotations + Int max_gaussians = 6 + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + vcf: "Sites only VCF. 
Can be pre-filtered using hard-filters."
+        vcf_index: "Tribble Index for sites only VCF."
+        known_reference_variants: "Array of known reference VCF files. For humans, dbSNP is one example."
+        known_reference_variants_index: "Array of index files for known reference VCF files."
+        known_reference_variants_identifier: "Array of identifiers / names for the known_reference_variant file at the same array position. Must be the same length as `known_reference_variants`."
+        is_known: "Array of boolean values indicating if the known_reference_variant file at the same array position contains known variants. Must be the same length as `known_reference_variants`."
+        is_training: "Array of boolean values indicating if the known_reference_variant file at the same array position contains training data. Must be the same length as `known_reference_variants`."
+        is_truth: "Array of boolean values indicating if the known_reference_variant file at the same array position contains truth data. Must be the same length as `known_reference_variants`."
+        prior: "Array of integer values indicating the priors for the known_reference_variant file at the same array position. Must be the same length as `known_reference_variants`."
+    }
+
+    Int disk_size = 10 + ceil(size(known_reference_variants, "GB"))
+                  + 4*ceil(size(vcf, "GB"))
+                  + 2*ceil(size(vcf_index, "GB"))
+
+    # Flag text (with surrounding spaces) emitted only when a downsample factor is
+    # supplied; the factor value itself is appended in the command block.
+    String downsample_factor_arg = if defined(downsampleFactor) then " --sample-every-Nth-variant " else ""
+
+    command <<<
+        set -euxo pipefail
+
+        # We need to generate resource strings from the input arrays.
+ # First we check that the arrays are the same length: + if [[ ~{length(known_reference_variants)} -ne ~{length(known_reference_variants_identifier)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(known_reference_variants_index)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(is_known)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(is_training)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(is_truth)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(prior)} ]] ; then + echo "ERROR: Not all input arrays for known variants contain the same number of elements: " 1>&2 + echo " known_reference_variants = ~{length(known_reference_variants)}" 1>&2 + echo " known_reference_variants = ~{length(known_reference_variants_index)}" 1>&2 + echo " known_reference_variants_identifier = ~{length(known_reference_variants_identifier)}" 1>&2 + echo " is_known = ~{length(is_known)}" 1>&2 + echo " is_training = ~{length(is_training)}" 1>&2 + echo " is_truth = ~{length(is_truth)}" 1>&2 + echo " prior = ~{length(prior)}" 1>&2 + false + fi + + # Now we can write out the arrays into a TSV file and add them line by line to the execution: + # Create the TSV: + options_tsv=~{write_tsv(transpose([known_reference_variants_identifier, is_known, is_training, is_truth, prior, known_reference_variants]))} + + # Now read them into a string: + resource_flags=$(awk '{printf("--resource:%s,known=%s,training=%s,truth=%s,prior=%d %s ", $1, $2, $3, $4, $5, $6)}' ${options_tsv}) + + # Get amount of memory to use: + mem_available=$(free -g | grep '^Mem' | awk '{print $2}') + let mem_start=${mem_available}-2 + let mem_max=${mem_available}-1 + + gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \ + VariantRecalibrator \ + -V ~{vcf} \ + -O ~{prefix}.recal \ + --tranches-file ~{prefix}.tranches \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' 
recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode SNP \ + ~{downsample_factor_arg}~{default="" sep=" --sample-every-Nth-variant " downsampleFactor} \ + --output-model ~{prefix}.model.report \ + --max-gaussians ~{max_gaussians} \ + ${resource_flags} + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 64, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File recalibration = "~{prefix}.recal" + File recalibration_index = "~{prefix}.recal.idx" + File tranches = "~{prefix}.tranches" + File model_report = "~{prefix}.model.report" + } +} + +task ApplyVqsr { + + input { + File vcf + File vcf_index + + String prefix + + File snps_recalibration + File snps_recalibration_index + File snps_tranches + Float snp_filter_level + + File indels_recalibration + File indels_recalibration_index + File indels_tranches + Float indel_filter_level + + Boolean use_allele_specific_annotations + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 10 + ceil(size([vcf, vcf_index], "GB")) + + 2*ceil(size([snps_recalibration, snps_recalibration_index, snps_tranches], "GB")) + + 2*ceil(size([indels_recalibration, indels_recalibration_index, indels_tranches], "GB")) + + command <<< + set -euxo pipefail + + # Get amount of memory to use: + mem_available=$(free -m | grep '^Mem' | awk '{print $2}') + let mem_start=${mem_available}-2000 + let mem_max=${mem_available}-500 + + gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \ + ApplyVQSR \ + -V ~{vcf} \ + -O tmp.indel.recalibrated.vcf.gz \ + --recal-file ~{indels_recalibration} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + --tranches-file ~{indels_tranches} \ + --truth-sensitivity-filter-level ~{indel_filter_level} \ + --create-output-variant-index true \ + -mode INDEL + + gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \ + ApplyVQSR \ + -V tmp.indel.recalibrated.vcf.gz \ + -O ~{prefix}.recalibrated.vcf.gz \ + --recal-file ~{snps_recalibration} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + --tranches-file ~{snps_tranches} \ + --truth-sensitivity-filter-level ~{snp_filter_level} \ + --create-output-variant-index true \ + -mode SNP + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 7, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: 
select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File recalibrated_vcf = "~{prefix}.recalibrated.vcf.gz" + File recalibrated_vcf_index = "~{prefix}.recalibrated.vcf.gz.tbi" + } +} + + +task ConvertToHailMT { + meta { + description: "Convert a .vcf.bgz file to a Hail MatrixTable and copy it to a final gs:// URL." + } + + input { + File gvcf + File tbi + + String reference = "GRCh38" + String? ref_fasta + String? ref_fai + String prefix = "out" + + String outdir + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 3*ceil(size(gvcf, "GB")) + + command <<< + set -x + + python3 <>> + + output { + String gcs_path = "~{outdir}/~{prefix}.mt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 64, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "hailgenetics/hail:0.2.105" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FinalizeToFile { + input { + File file + String outdir + String? name + + File? keyfile + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + file: { + description: "file to finalize", + localization_optional: true + } + keyfile : "[optional] File used to key this finaliation. Finalization will not take place until the KeyFile exists. This can be used to force the finaliation to wait until a certain point in a workflow. NOTE: The latest WDL development spec includes the `after` keyword which will obviate this." + outdir: "directory to which files should be uploaded" + name: "name to set for uploaded file" + } + + String gcs_output_dir = sub(outdir, "/+$", "") + String gcs_output_file = gcs_output_dir + "/" + select_first([name, basename(file)]) + + command <<< + set -euxo pipefail + + gsutil -m cp "~{file}" "~{gcs_output_file}" + >>> + + output { + String gcs_path = gcs_output_file + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 10, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/lr-finalize:0.1.2" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + diff --git a/wdl/ONTPfTypeDrugResistanceMarkers.wdl b/wdl/ONTPfTypeDrugResistanceMarkers.wdl index 1cb96d224..ed4dadd16 100644 --- a/wdl/ONTPfTypeDrugResistanceMarkers.wdl +++ b/wdl/ONTPfTypeDrugResistanceMarkers.wdl @@ -1,105 +1,89 @@ version 1.0 import "tasks/Structs.wdl" +import 
"tasks/FunctionalAnnotation.wdl" as FUNK import "tasks/Finalize.wdl" as FF workflow ONTPfTypeDrugResistanceMarkers { input { File vcf + File snpeff_db + File drug_resistance_list String dir_prefix String gcs_out_root_dir + + Boolean do_functional_annotation = true } String outdir = sub(gcs_out_root_dir, "/$", "") + "/ONTPfTypeDrugResistanceMarkers/~{dir_prefix}" - call AnnotateEffectsOfSelectedVariants { input: vcf = vcf } + if (do_functional_annotation) { + call FUNK.FunctionallyAnnotateVariants { input: vcf = vcf, snpeff_db = snpeff_db } + } + + call CallDrugResistanceMutations { + input: + vcf = select_first([FunctionallyAnnotateVariants.annotated_vcf, vcf]), + drug_resistance_list = drug_resistance_list + } # Finalize data String dir = outdir + "/reports" - call FF.FinalizeToFile as FinalizeDRReport { input: outdir = dir, file = AnnotateEffectsOfSelectedVariants.report } + call FF.FinalizeToFile as FinalizeDRReport { input: outdir = dir, file = CallDrugResistanceMutations.report } + + if (do_functional_annotation) { + call FF.FinalizeToFile as FinalizeAnnotatedVCF { input: outdir = dir, file = select_first([FunctionallyAnnotateVariants.annotated_vcf]) } + call FF.FinalizeToFile as FinalizeAnnotatedVCFIndex { input: outdir = dir, file = select_first([FunctionallyAnnotateVariants.annotated_vcf_index]) } + call FF.FinalizeToFile as FinalizeSnpEffSummary { input: outdir = dir, file = select_first([FunctionallyAnnotateVariants.snpEff_summary]) } + call FF.FinalizeToFile as FinalizeSnpEffGenes { input: outdir = dir, file = select_first([FunctionallyAnnotateVariants.snpEff_genes]) } + } output { File drug_res_report = FinalizeDRReport.gcs_path + + File? annotated_vcf = FinalizeAnnotatedVCF.gcs_path + File? annotated_vcf_index = FinalizeAnnotatedVCFIndex.gcs_path + File? snpEff_summary = FinalizeSnpEffSummary.gcs_path + File? 
snpEff_genes = FinalizeSnpEffGenes.gcs_path } } -task AnnotateEffectsOfSelectedVariants { +task CallDrugResistanceMutations { input { File vcf + File drug_resistance_list RuntimeAttr? runtime_attr_override } - Int disk_size = 1 + 2*ceil(size(vcf, "GB")) - String base = basename(vcf, ".vcf.gz") + Int disk_size = 1 + 2*ceil(size([vcf, drug_resistance_list], "GB")) + String prefix = basename(basename(vcf, ".gz"), ".vcf") command <<< set -x - zcat ~{vcf} | \ - sed 's/^Pf3D7_0//' | \ - sed 's/^Pf3D7_1/1/' | \ - sed 's/_v3\t/\t/' | \ - awk '{ if ($0 ~ "^#" || (length($4) == 1 && length($5) == 1 && $7 == "PASS")) print $0 }' \ - > reformatted.vcf - - /usr/local/bin/snpEff ann -v Plasmodium_falciparum reformatted.vcf \ - > ann.vcf - - grep PF3D7_0417200 ann.vcf | grep p.Cys50Arg | wc -l | awk '{ if ($1 > 0) print "pfdhfr\tPF3D7_0417200\tp.Cys50Arg\t+"; else print "pfdhfr\tPF3D7_0417200\tp.Cys50Arg\t-" }' > drug_resistance_report.txt - grep PF3D7_0417200 ann.vcf | grep p.Asn51Ile | wc -l | awk '{ if ($1 > 0) print "pfdhfr\tPF3D7_0417200\tp.Asn51Ile\t+"; else print "pfdhfr\tPF3D7_0417200\tp.Asn51Ile\t-" }' >> drug_resistance_report.txt - grep PF3D7_0417200 ann.vcf | grep p.Cys59Arg | wc -l | awk '{ if ($1 > 0) print "pfdhfr\tPF3D7_0417200\tp.Cys59Arg\t+"; else print "pfdhfr\tPF3D7_0417200\tp.Cys59Arg\t-" }' >> drug_resistance_report.txt - grep PF3D7_0417200 ann.vcf | grep p.Ser108Asn | wc -l | awk '{ if ($1 > 0) print "pfdhfr\tPF3D7_0417200\tp.Ser108Asn\t+"; else print "pfdhfr\tPF3D7_0417200\tp.Ser108Asn\t-" }' >> drug_resistance_report.txt - grep PF3D7_0417200 ann.vcf | grep p.Ile164Lys | wc -l | awk '{ if ($1 > 0) print "pfdhfr\tPF3D7_0417200\tp.Ile164Lys\t+"; else print "pfdhfr\tPF3D7_0417200\tp.Ile164Lys\t-" }' >> drug_resistance_report.txt - - grep PF3D7_0523000 ann.vcf | grep p.Asn86Tyr | wc -l | awk '{ if ($1 > 0) print "pfmdr1\tPF3D7_0523000\tp.Asn86Tyr\t+"; else print "pfmdr1\tPF3D7_0523000\tp.Asn86Tyr\t-" }' >> drug_resistance_report.txt - grep PF3D7_0523000 
ann.vcf | grep p.Tyr184Phe | wc -l | awk '{ if ($1 > 0) print "pfmdr1\tPF3D7_0523000\tp.Tyr184Phe\t+"; else print "pfmdr1\tPF3D7_0523000\tp.Tyr184Phe\t-" }' >> drug_resistance_report.txt - grep PF3D7_0523000 ann.vcf | grep p.Ser1034Cys | wc -l | awk '{ if ($1 > 0) print "pfmdr1\tPF3D7_0523000\tp.Ser1034Cys\t+"; else print "pfmdr1\tPF3D7_0523000\tp.Ser1034Cys\t-" }' >> drug_resistance_report.txt - grep PF3D7_0523000 ann.vcf | grep p.Asn1024Asp | wc -l | awk '{ if ($1 > 0) print "pfmdr1\tPF3D7_0523000\tp.Asn1024Asp\t+"; else print "pfmdr1\tPF3D7_0523000\tp.Asn1024Asp\t-" }' >> drug_resistance_report.txt - grep PF3D7_0523000 ann.vcf | grep p.Asp1246Tyr | wc -l | awk '{ if ($1 > 0) print "pfmdr1\tPF3D7_0523000\tp.Asp1246Tyr\t+"; else print "pfmdr1\tPF3D7_0523000\tp.Asp1246Tyr\t-" }' >> drug_resistance_report.txt - - grep PF3D7_0709000 ann.vcf | grep p.Lys76Thr | wc -l | awk '{ if ($1 > 0) print "pfcrt\tPF3D7_0709000\tp.Lys76Thr\t+"; else print "pfcrt\tPF3D7_0709000\tp.Lys76Thr\t-" }' >> drug_resistance_report.txt - grep PF3D7_0709000 ann.vcf | grep p.Met74Ile | wc -l | awk '{ if ($1 > 0) print "pfcrt\tPF3D7_0709000\tp.Met74Ile\t+"; else print "pfcrt\tPF3D7_0709000\tp.Met74Ile\t-" }' >> drug_resistance_report.txt - grep PF3D7_0709000 ann.vcf | grep p.Asn75Glu | wc -l | awk '{ if ($1 > 0) print "pfcrt\tPF3D7_0709000\tp.Asn75Glu\t+"; else print "pfcrt\tPF3D7_0709000\tp.Asn75Glu\t-" }' >> drug_resistance_report.txt - grep PF3D7_0709000 ann.vcf | grep p.Cys72Ser | wc -l | awk '{ if ($1 > 0) print "pfcrt\tPF3D7_0709000\tp.Lys76Thr\t+"; else print "pfcrt\tPF3D7_0709000\tp.Lys76Thr\t-" }' >> drug_resistance_report.txt - grep PF3D7_0709000 ann.vcf | grep p.His97Tyr | wc -l | awk '{ if ($1 > 0) print "pfcrt\tPF3D7_0709000\tp.His97Tyr\t+"; else print "pfcrt\tPF3D7_0709000\tp.His97Tyr\t-" }' >> drug_resistance_report.txt - grep PF3D7_0709000 ann.vcf | grep p.Cys101Phe | wc -l | awk '{ if ($1 > 0) print "pfcrt\tPF3D7_0709000\tp.Cys101Phe\t+"; else print 
"pfcrt\tPF3D7_0709000\tp.Cys101Phe\t-" }' >> drug_resistance_report.txt - grep PF3D7_0709000 ann.vcf | grep p.Phe145Ile | wc -l | awk '{ if ($1 > 0) print "pfcrt\tPF3D7_0709000\tp.Phe145Ile\t+"; else print "pfcrt\tPF3D7_0709000\tp.Phe145Ile\t-" }' >> drug_resistance_report.txt - grep PF3D7_0709000 ann.vcf | grep p.Met343Leu | wc -l | awk '{ if ($1 > 0) print "pfcrt\tPF3D7_0709000\tp.Met343Leu\t+"; else print "pfcrt\tPF3D7_0709000\tp.Met343Leu\t-" }' >> drug_resistance_report.txt - grep PF3D7_0709000 ann.vcf | grep p.Ser350Arg | wc -l | awk '{ if ($1 > 0) print "pfcrt\tPF3D7_0709000\tp.Ser350Arg\t+"; else print "pfcrt\tPF3D7_0709000\tp.Ser350Arg\t-" }' >> drug_resistance_report.txt - grep PF3D7_0709000 ann.vcf | grep p.Gly353Val | wc -l | awk '{ if ($1 > 0) print "pfcrt\tPF3D7_0709000\tp.Glu353Val\t+"; else print "pfcrt\tPF3D7_0709000\tp.Glu353Val\t-" }' >> drug_resistance_report.txt - - grep PF3D7_0810800 ann.vcf | grep p.Ser436Ala | wc -l | awk '{ if ($1 > 0) print "pfdhps\tPF3D7_0810800\tp.Ser436Ala\t+"; else print "pfdhps\tPF3D7_0810800\tp.Ser436Ala\t-" }' >> drug_resistance_report.txt - grep PF3D7_0810800 ann.vcf | grep p.Lys437Gly | wc -l | awk '{ if ($1 > 0) print "pfdhps\tPF3D7_0810800\tp.Lys437Gly\t+"; else print "pfdhps\tPF3D7_0810800\tp.Lys437Gly\t-" }' >> drug_resistance_report.txt - grep PF3D7_0810800 ann.vcf | grep p.Lys540Glu | wc -l | awk '{ if ($1 > 0) print "pfdhps\tPF3D7_0810800\tp.Lys540Glu\t+"; else print "pfdhps\tPF3D7_0810800\tp.Lys540Glu\t-" }' >> drug_resistance_report.txt - grep PF3D7_0810800 ann.vcf | grep p.Ala581Gly | wc -l | awk '{ if ($1 > 0) print "pfdhps\tPF3D7_0810800\tp.Ala581Gly\t+"; else print "pfdhps\tPF3D7_0810800\tp.Ala581Gly\t-" }' >> drug_resistance_report.txt - grep PF3D7_0810800 ann.vcf | grep p.Ala613Thr | wc -l | awk '{ if ($1 > 0) print "pfdhps\tPF3D7_0810800\tp.Ala613Thr\t+"; else print "pfdhps\tPF3D7_0810800\tp.Ala613Thr\t-" }' >> drug_resistance_report.txt - grep PF3D7_0810800 ann.vcf | grep p.Ala613Ser | wc -l | awk 
'{ if ($1 > 0) print "pfdhps\tPF3D7_0810800\tp.Ala613Ser\t+"; else print "pfdhps\tPF3D7_0810800\tp.Ala613Ser\t-" }' >> drug_resistance_report.txt - - grep PF3D7_1343700 ann.vcf | grep p.Tyr493His | wc -l | awk '{ if ($1 > 0) print "pfkelch13\tPF3D7_1343700\tp.Tyr493His\t+"; else print "pfkelch13\tPF3D7_1343700\tp.Tyr493His\t-" }' >> drug_resistance_report.txt - grep PF3D7_1343700 ann.vcf | grep p.Arg539Thr | wc -l | awk '{ if ($1 > 0) print "pfkelch13\tPF3D7_1343700\tp.Arg539Thr\t+"; else print "pfkelch13\tPF3D7_1343700\tp.Arg539Thr\t-" }' >> drug_resistance_report.txt - grep PF3D7_1343700 ann.vcf | grep p.Ile543Thr | wc -l | awk '{ if ($1 > 0) print "pfkelch13\tPF3D7_1343700\tp.Ile543Thr\t+"; else print "pfkelch13\tPF3D7_1343700\tp.Ile543Thr\t-" }' >> drug_resistance_report.txt - grep PF3D7_1343700 ann.vcf | grep p.Arg561His | wc -l | awk '{ if ($1 > 0) print "pfkelch13\tPF3D7_1343700\tp.Arg561His\t+"; else print "pfkelch13\tPF3D7_1343700\tp.Arg561His\t-" }' >> drug_resistance_report.txt - grep PF3D7_1343700 ann.vcf | grep p.Cys580Tyr | wc -l | awk '{ if ($1 > 0) print "pfkelch13\tPF3D7_1343700\tp.Cys580Tyr\t+"; else print "pfkelch13\tPF3D7_1343700\tp.Cys580Tyr\t-" }' >> drug_resistance_report.txt - grep PF3D7_1343700 ann.vcf | grep p.Ala675Val | wc -l | awk '{ if ($1 > 0) print "pfkelch13\tPF3D7_1343700\tp.Ala675Val\t+"; else print "pfkelch13\tPF3D7_1343700\tp.Ala675Val\t-" }' >> drug_resistance_report.txt - grep PF3D7_1343700 ann.vcf | grep p.Phe446Ile | wc -l | awk '{ if ($1 > 0) print "pfkelch13\tPF3D7_1343700\tp.Phe446Ile\t+"; else print "pfkelch13\tPF3D7_1343700\tp.Phe446Ile\t-" }' >> drug_resistance_report.txt - grep PF3D7_1343700 ann.vcf | grep p.Met476Ile | wc -l | awk '{ if ($1 > 0) print "pfkelch13\tPF3D7_1343700\tp.Met476Ile\t+"; else print "pfkelch13\tPF3D7_1343700\tp.Met476Ile\t-" }' >> drug_resistance_report.txt - grep PF3D7_1343700 ann.vcf | grep p.Asn458Tyr | wc -l | awk '{ if ($1 > 0) print "pfkelch13\tPF3D7_1343700\tp.Asn458Tyr\t+"; else print 
"pfkelch13\tPF3D7_1343700\tp.Asn458Tyr\t-" }' >> drug_resistance_report.txt - grep PF3D7_1343700 ann.vcf | grep p.Phe553Leu | wc -l | awk '{ if ($1 > 0) print "pfkelch13\tPF3D7_1343700\tp.Phe553Leu\t+"; else print "pfkelch13\tPF3D7_1343700\tp.Phe553Leu\t-" }' >> drug_resistance_report.txt - grep PF3D7_1343700 ann.vcf | grep p.Phe574Leu | wc -l | awk '{ if ($1 > 0) print "pfkelch13\tPF3D7_1343700\tp.Phe574Leu\t+"; else print "pfkelch13\tPF3D7_1343700\tp.Phe574Leu\t-" }' >> drug_resistance_report.txt - grep PF3D7_1343700 ann.vcf | grep p.Arg633Ile | wc -l | awk '{ if ($1 > 0) print "pfkelch13\tPF3D7_1343700\tp.Arg633Ile\t+"; else print "pfkelch13\tPF3D7_1343700\tp.Arg633Ile\t-" }' >> drug_resistance_report.txt + while read LINE; do + GENE_NAME=$(echo $LINE | awk '{print $1}') + GENE_ID=$(echo $LINE | awk '{print $2}') + MUTATION=$(echo $LINE | awk '{print $3}') + + zcat ~{vcf} | grep $GENE_ID | grep $MUTATION | wc -l | \ + awk -v gene_name=$GENE_NAME -v gene_id=$GENE_ID -v mutation=$MUTATION \ + '{ print gene_name, gene_id, mutation, ($1 > 0) ? "present" : "absent" }' | \ + tee -a ~{prefix}.drug_resistance_report.txt + done <~{drug_resistance_list} >>> output { - File report = "drug_resistance_report.txt" + File report = "~{prefix}.drug_resistance_report.txt" } ######################### RuntimeAttr default_attr = object { - cpu_cores: 2, - mem_gb: 4, + cpu_cores: 1, + mem_gb: 1, disk_gb: disk_size, boot_disk_gb: 10, preemptible_tries: 2, diff --git a/wdl/PanelProcessMalariaBarcodesForRh.wdl b/wdl/PanelProcessMalariaBarcodesForRh.wdl new file mode 100644 index 000000000..473f114fd --- /dev/null +++ b/wdl/PanelProcessMalariaBarcodesForRh.wdl @@ -0,0 +1,328 @@ +version 1.0 + +########################################################################################## +## A workflow that processes P. falciparum SNP panels (read: barcodes) and calculates several +## metrics that are relevant to studying the epidemiology of the disease. 
+## +## This WDL calls a script written by Wes Wong and based on the following paper: +## https://doi.org/10.1093/pnasnexus/pgac187 +########################################################################################## + +import "tasks/Structs.wdl" +import "tasks/Utils.wdl" as Utils +import "tasks/Finalize.wdl" as FF + +workflow PanelProcessMalariaBarcodesForRh { + input { + + # Unfortunately the easiest way to make this work would be to pass a spreadsheet into the script. + # Because of how Terra is structured, this isn't really possible. + # Instead, we pass each column and construct a spreadsheet in the task. + # This way we can have one big table of the values in Terra and we don't have to make the data hard + # to visualize. + + # High-level required info: + String location_code + File barcode_def_tsv + + # Spreadsheet data: + Array[String] cc + Array[String] ISO3 + Array[String] Year + Array[String] Number_Text + Array[String] Sample_Name + Array[String] Raw_Name + Array[String] Barcode_String + Array[String] A1 + Array[String] B1 + Array[String] A2 + Array[String] B2 + Array[String] A3 + Array[String] B3 + Array[String] A4 + Array[String] B4 + Array[String] A5 + Array[String] B5 + Array[String] A6 + Array[String] B6 + Array[String] A7 + Array[String] B7 + Array[String] A8 + Array[String] B8 + Array[String] A9 + Array[String] B9 + Array[String] A10 + Array[String] B10 + Array[String] A11 + Array[String] B11 + Array[String] A12 + Array[String] B12 + Array[String] X + Array[String] N + Array[String] M_P + Array[String] Delta_CT_Threshold + Array[String] Adjusted_Het + Array[String] mccoil_median + + String dir_prefix + String gcs_out_root_dir + + Boolean DEBUG_MODE = false + } + + parameter_meta { + location_code: "Location code of the sample. Should correspond to the ISO3 value." 
+ barcode_def_tsv: "TSV file containing the definition of the SNP barcode sites with the columns: Name, Contig, Position" + + cc: "" + ISO3: "" + Year: "Year this dataset was collected." + Number_Text: "" + Sample_Name: "" + Raw_Name: "" + Barcode_String: "Nucleotide sequence of all barcode SNPs in genomic order." + A1: "Nucleotide at the A1 barcode position." + B1: "Nucleotide at the B1 barcode position." + A2: "Nucleotide at the A2 barcode position." + B2: "Nucleotide at the B2 barcode position." + A3: "Nucleotide at the A3 barcode position." + B3: "Nucleotide at the B3 barcode position." + A4: "Nucleotide at the A4 barcode position." + B4: "Nucleotide at the B4 barcode position." + A5: "Nucleotide at the A5 barcode position." + B5: "Nucleotide at the B5 barcode position." + A6: "Nucleotide at the A6 barcode position." + B6: "Nucleotide at the B6 barcode position." + A7: "Nucleotide at the A7 barcode position." + B7: "Nucleotide at the B7 barcode position." + A8: "Nucleotide at the A8 barcode position." + B8: "Nucleotide at the B8 barcode position." + A9: "Nucleotide at the A9 barcode position." + B9: "Nucleotide at the B9 barcode position." + A10:"Nucleotide at the A10 barcode position." + B10:"Nucleotide at the B10 barcode position." + A11:"Nucleotide at the A11 barcode position." + B11:"Nucleotide at the B11 barcode position." + A12:"Nucleotide at the A12 barcode position." + B12:"Nucleotide at the B12 barcode position." + X: "" + N: "" + M_P: "Mono- / Poly-clonal indicator." 
+ Delta_CT_Threshold: "" + Adjusted_Het: "" + mccoil_median: "" + + dir_prefix: "directory prefix for output files" + gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files" + } + + #################################### + # _____ _ + # |_ _|_ _ ___| | _____ + # | |/ _` / __| |/ / __| + # | | (_| \__ \ <\__ \ + # |_|\__,_|___/_|\_\___/ + # + #################################### + + # Call our timestamp so we can store outputs without clobbering previous runs: + call Utils.GetCurrentTimestampString as t_001_WdlExecutionStartTimestamp { input: } + + # Create an outdir: + String outdir = if DEBUG_MODE then sub(gcs_out_root_dir, "/$", "") + "/PanelProcessMalariaBarcodesForRh/~{dir_prefix}/" + t_001_WdlExecutionStartTimestamp.timestamp_string else sub(gcs_out_root_dir, "/$", "") + "/PanelProcessMalariaBarcodesForRh/~{dir_prefix}" + + call ProcessBarcodeSpreadsheet as t_002_ProcessBarcodeSpreadsheet { + input: + location_code = location_code, + barcode_def_tsv = barcode_def_tsv, + cc = cc, + ISO3 = ISO3, + Year = Year, + Number_Text = Number_Text, + Sample_Name = Sample_Name, + Raw_Name = Raw_Name, + Barcode_String = Barcode_String, + A1 = A1, + B1 = B1, + A2 = A2, + B2 = B2, + A3 = A3, + B3 = B3, + A4 = A4, + B4 = B4, + A5 = A5, + B5 = B5, + A6 = A6, + B6 = B6, + A7 = A7, + B7 = B7, + A8 = A8, + B8 = B8, + A9 = A9, + B9 = B9, + A10 = A10, + B10 = B10, + A11 = A11, + B11 = B11, + A12 = A12, + B12 = B12, + X = X, + N = N, + M_P = M_P, + Delta_CT_Threshold = Delta_CT_Threshold, + Adjusted_Het = Adjusted_Het, + mccoil_median = mccoil_median + } + + ############################################ + # _____ _ _ _ + # | ___(_)_ __ __ _| (_)_______ + # | |_ | | '_ \ / _` | | |_ / _ \ + # | _| | | | | | (_| | | |/ / __/ + # |_| |_|_| |_|\__,_|_|_/___\___| + # + ############################################ + File keyfile = t_002_ProcessBarcodeSpreadsheet.summary_stats + + # Finalize our outputs. 
We don't have many so let's do them all together: + call FF.FinalizeToDir as t_003_FinalizeOutputs { + input: + outdir = outdir, + files = + [ + t_002_ProcessBarcodeSpreadsheet.summary_figure_svg, + t_002_ProcessBarcodeSpreadsheet.summary_figure_png, + t_002_ProcessBarcodeSpreadsheet.summary_stats, + t_002_ProcessBarcodeSpreadsheet.mono_barcode_stats, + t_002_ProcessBarcodeSpreadsheet.poly_barcode_stats, + t_002_ProcessBarcodeSpreadsheet.input_tsv, + ], + keyfile = keyfile + } + + ############################################ + # ___ _ _ + # / _ \ _ _| |_ _ __ _ _| |_ + # | | | | | | | __| '_ \| | | | __| + # | |_| | |_| | |_| |_) | |_| | |_ + # \___/ \__,_|\__| .__/ \__,_|\__| + # |_| + ############################################ + + output { + File summary_figure_svg = t_002_ProcessBarcodeSpreadsheet.summary_figure_svg + File summary_figure_png = t_002_ProcessBarcodeSpreadsheet.summary_figure_png + File summary_stats = t_002_ProcessBarcodeSpreadsheet.summary_stats + File mono_barcode_stats = t_002_ProcessBarcodeSpreadsheet.mono_barcode_stats + File poly_barcode_stats = t_002_ProcessBarcodeSpreadsheet.poly_barcode_stats + + File input_tsv = t_002_ProcessBarcodeSpreadsheet.input_tsv + } +} + +################################################################################ +################################################################################ +################################################################################ + +task ProcessBarcodeSpreadsheet { + input { + # High-level required info: + String location_code + File barcode_def_tsv + + # Spreadsheet data: + Array[String] cc + Array[String] ISO3 + Array[String] Year + Array[String] Number_Text + Array[String] Sample_Name + Array[String] Raw_Name + Array[String] Barcode_String + Array[String] A1 + Array[String] B1 + Array[String] A2 + Array[String] B2 + Array[String] A3 + Array[String] B3 + Array[String] A4 + Array[String] B4 + Array[String] A5 + Array[String] B5 + Array[String] A6 + 
Array[String] B6 + Array[String] A7 + Array[String] B7 + Array[String] A8 + Array[String] B8 + Array[String] A9 + Array[String] B9 + Array[String] A10 + Array[String] B10 + Array[String] A11 + Array[String] B11 + Array[String] A12 + Array[String] B12 + Array[String] X + Array[String] N + Array[String] M_P + Array[String] Delta_CT_Threshold + Array[String] Adjusted_Het + Array[String] mccoil_median + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 10 + + # Create a header for the inputs so we can generate a TSV: + Array[String] header = ["cc", "ISO3", "Year", "Number_Text", "Sample_Name", "Raw_Name", "Barcode_String", "A1", "B1", "A2", "B2", "A3", "B3", "A4", "B4", "A5", "B5", "A6", "B6", "A7", "B7", "A8", "B8", "A9", "B9", "A10", "B10", "A11", "B11", "A12", "B12", "X", "N", "M_P", "Delta_CT_Threshold", "Adjusted_Het", "mccoil_median"] + + String out_base_name = sub(location_code, ":", ".") + String input_tsv_path = "~{out_base_name}.reconstructed_input.tsv" + + command <<< + source activate lr-malaria + + set -euxo pipefail + + ## Generate the input TSV: + tmp_header_tsv=~{write_tsv([header])} + tmp_data_tsv=~{write_tsv(transpose([cc, ISO3, Year, Number_Text, Sample_Name, Raw_Name, Barcode_String, A1, B1, A2, B2, A3, B3, A4, B4, A5, B5, A6, B6, A7, B7, A8, B8, A9, B9, A10, B10, A11, B11, A12, B12, X, N, M_P, Delta_CT_Threshold, Adjusted_Het, mccoil_median]))} + + cat ${tmp_header_tsv} ${tmp_data_tsv} > ~{input_tsv_path} + + ## Run the script: + /python_scripts/process_barcode_data.py -b ~{barcode_def_tsv} -s ~{location_code} -f ~{input_tsv_path} + >>> + + output { + File summary_figure_svg = "~{out_base_name}_summary_figure.svg" + File summary_figure_png = "~{out_base_name}_summary_figure.png" + File summary_stats = "~{out_base_name}_summary.csv" + File mono_barcode_stats = "~{out_base_name}_mono_barcodes.csv" + File poly_barcode_stats = "~{out_base_name}_poly_barcodes.csv" + + File input_tsv = input_tsv_path + } + + ######################### + 
RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-malaria:0.0.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/PfalciparumDrugResistanceSummary.wdl b/wdl/PfalciparumDrugResistanceSummary.wdl new file mode 100644 index 000000000..110d63630 --- /dev/null +++ b/wdl/PfalciparumDrugResistanceSummary.wdl @@ -0,0 +1,267 @@ +version 1.0 + +import "tasks/Finalize.wdl" as FF + +workflow PfalciparumDrugResistanceSummary { + meta { + description: "Create a drug resistance report based on the given raw drug resistance loci report." + } + + input { + File raw_drug_resistance_report + + String participant_name + + String gcs_out_root_dir + } + + parameter_meta { + raw_drug_resistance_report: "File containing a raw drug resistance report to use to determine drug resistance." + participant_name: "Participant (or sample) name for the given bam file." + gcs_out_root_dir: "Output folder into which to place the results of this workflow."
+ } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/PfalciparumDrugResistanceSummary/~{participant_name}" + + call CreateDrugResistanceSummary as CreateDrugResistanceSummary { + input: + raw_drug_resistance_report = raw_drug_resistance_report, + prefix = participant_name + } + + call FF.FinalizeToFile as FinalizeDrugResistanceSummary { input: outdir = outdir, file = CreateDrugResistanceSummary.resistance_summary } + + output { + File drug_resistance_summary = FinalizeDrugResistanceSummary.gcs_path + } +} + +task CreateDrugResistanceSummary { + meta { + description: "Create a drug resistance report based on the given raw drug resistance loci report." + } + + input { + File raw_drug_resistance_report + String prefix + RuntimeAttr? runtime_attr_override + } + + String outfile_name = "~{prefix}.drug_resistance_summary.txt" + + Int disk_size = 1 + 4*ceil(size(raw_drug_resistance_report, "GB")) + + command <<< + python3 <>> + output { + File resistance_summary = "~{outfile_name}" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/ProcessMalariaBarcodesDemo.wdl
b/wdl/ProcessMalariaBarcodesDemo.wdl new file mode 100644 index 000000000..63b5c1253 --- /dev/null +++ b/wdl/ProcessMalariaBarcodesDemo.wdl @@ -0,0 +1,292 @@ +version 1.0 + +########################################################################################## +## A workflow that processes P. falciparum SNP panels (read: barcodes) and calculates several +## metrics that are relevant to studying the epidemiology of the disease. +## +## This WDL calls a script written by Wes Wong and based on the following paper: +## https://doi.org/10.1093/pnasnexus/pgac187 +########################################################################################## + +import "tasks/Structs.wdl" +import "tasks/Utils.wdl" as Utils + +workflow ProcessMalariaBarcodesDemo { + input { + + # Unfortunately the easiest way to make this work would be to pass a spreadsheet into the script. + # Because of how Terra is structured, this isn't really possible. + # Instead, we pass each column and construct a spreadsheet in the task. + # This way we can have one big table of the values in Terra and we don't have to make the data hard + # to visualize. 
+ + # High-level required info: + String location_code + File barcode_def_tsv + + # Spreadsheet data: + Array[String] cc + Array[String] ISO3 + Array[String] Year + Array[String] Number_Text + Array[String] Sample_Name + Array[String] Raw_Name + Array[String] Barcode_String + Array[String] A1 + Array[String] B1 + Array[String] A2 + Array[String] B2 + Array[String] A3 + Array[String] B3 + Array[String] A4 + Array[String] B4 + Array[String] A5 + Array[String] B5 + Array[String] A6 + Array[String] B6 + Array[String] A7 + Array[String] B7 + Array[String] A8 + Array[String] B8 + Array[String] A9 + Array[String] B9 + Array[String] A10 + Array[String] B10 + Array[String] A11 + Array[String] B11 + Array[String] A12 + Array[String] B12 + Array[String] X + Array[String] N + Array[String] M_P + Array[String] Delta_CT_Threshold + Array[String] Adjusted_Het + Array[String] mccoil_median + + Boolean DEBUG_MODE = false + } + + parameter_meta { + location_code: "Location code of the sample. Should correspond to the ISO3 value." + barcode_def_tsv: "TSV file containing the definition of the SNP barcode sites with the columns: Name, Contig, Position" + + cc: "Country Code" + ISO3: "ISO3 formatted location code." + Year: "Year this dataset was collected." + Number_Text: "" + Sample_Name: "" + Raw_Name: "" + Barcode_String: "Nucleotide sequence of all barcode SNPs in genomic order." + A1: "Nucleotide at the A1 barcode position." + B1: "Nucleotide at the B1 barcode position." + A2: "Nucleotide at the A2 barcode position." + B2: "Nucleotide at the B2 barcode position." + A3: "Nucleotide at the A3 barcode position." + B3: "Nucleotide at the B3 barcode position." + A4: "Nucleotide at the A4 barcode position." + B4: "Nucleotide at the B4 barcode position." + A5: "Nucleotide at the A5 barcode position." + B5: "Nucleotide at the B5 barcode position." + A6: "Nucleotide at the A6 barcode position." + B6: "Nucleotide at the B6 barcode position." + A7: "Nucleotide at the A7 barcode position." 
+ B7: "Nucleotide at the B7 barcode position." + A8: "Nucleotide at the A8 barcode position." + B8: "Nucleotide at the B8 barcode position." + A9: "Nucleotide at the A9 barcode position." + B9: "Nucleotide at the B9 barcode position." + A10:"Nucleotide at the A10 barcode position." + B10:"Nucleotide at the B10 barcode position." + A11:"Nucleotide at the A11 barcode position." + B11:"Nucleotide at the B11 barcode position." + A12:"Nucleotide at the A12 barcode position." + B12:"Nucleotide at the B12 barcode position." + X: "" + N: "" + M_P: "Mono- / Poly-clonal indicator." + Delta_CT_Threshold: "" + Adjusted_Het: "" + mccoil_median: "" + } + + #################################### + # _____ _ + # |_ _|_ _ ___| | _____ + # | |/ _` / __| |/ / __| + # | | (_| \__ \ <\__ \ + # |_|\__,_|___/_|\_\___/ + # + #################################### + + # Call our timestamp so we can store outputs without clobbering previous runs: + call Utils.GetCurrentTimestampString as t_001_WdlExecutionStartTimestamp { input: } + + call ProcessBarcodeSpreadsheet as t_002_ProcessBarcodeSpreadsheet { + input: + location_code = location_code, + barcode_def_tsv = barcode_def_tsv, + cc = cc, + ISO3 = ISO3, + Year = Year, + Number_Text = Number_Text, + Sample_Name = Sample_Name, + Raw_Name = Raw_Name, + Barcode_String = Barcode_String, + A1 = A1, + B1 = B1, + A2 = A2, + B2 = B2, + A3 = A3, + B3 = B3, + A4 = A4, + B4 = B4, + A5 = A5, + B5 = B5, + A6 = A6, + B6 = B6, + A7 = A7, + B7 = B7, + A8 = A8, + B8 = B8, + A9 = A9, + B9 = B9, + A10 = A10, + B10 = B10, + A11 = A11, + B11 = B11, + A12 = A12, + B12 = B12, + X = X, + N = N, + M_P = M_P, + Delta_CT_Threshold = Delta_CT_Threshold, + Adjusted_Het = Adjusted_Het, + mccoil_median = mccoil_median + } + + ############################################ + # ___ _ _ + # / _ \ _ _| |_ _ __ _ _| |_ + # | | | | | | | __| '_ \| | | | __| + # | |_| | |_| | |_| |_) | |_| | |_ + # \___/ \__,_|\__| .__/ \__,_|\__| + # |_| + 
############################################ + + output { + File summary_figure_svg = t_002_ProcessBarcodeSpreadsheet.summary_figure_svg + File summary_figure_png = t_002_ProcessBarcodeSpreadsheet.summary_figure_png + File summary_stats = t_002_ProcessBarcodeSpreadsheet.summary_stats + File mono_barcode_stats = t_002_ProcessBarcodeSpreadsheet.mono_barcode_stats + File poly_barcode_stats = t_002_ProcessBarcodeSpreadsheet.poly_barcode_stats + + File input_tsv = t_002_ProcessBarcodeSpreadsheet.input_tsv + } +} + +################################################################################ +################################################################################ +################################################################################ + +task ProcessBarcodeSpreadsheet { + input { + # High-level required info: + String location_code + File barcode_def_tsv + + # Spreadsheet data: + Array[String] cc + Array[String] ISO3 + Array[String] Year + Array[String] Number_Text + Array[String] Sample_Name + Array[String] Raw_Name + Array[String] Barcode_String + Array[String] A1 + Array[String] B1 + Array[String] A2 + Array[String] B2 + Array[String] A3 + Array[String] B3 + Array[String] A4 + Array[String] B4 + Array[String] A5 + Array[String] B5 + Array[String] A6 + Array[String] B6 + Array[String] A7 + Array[String] B7 + Array[String] A8 + Array[String] B8 + Array[String] A9 + Array[String] B9 + Array[String] A10 + Array[String] B10 + Array[String] A11 + Array[String] B11 + Array[String] A12 + Array[String] B12 + Array[String] X + Array[String] N + Array[String] M_P + Array[String] Delta_CT_Threshold + Array[String] Adjusted_Het + Array[String] mccoil_median + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 10 + + # Create a header for the inputs so we can generate a TSV: + Array[String] header = ["cc", "ISO3", "Year", "Number_Text", "Sample_Name", "Raw_Name", "Barcode_String", "A1", "B1", "A2", "B2", "A3", "B3", "A4", "B4", "A5", "B5", "A6", "B6", "A7", "B7", "A8", "B8", "A9", "B9", "A10", "B10", "A11", "B11", "A12", "B12", "X", "N", "M_P", "Delta_CT_Threshold", "Adjusted_Het", "mccoil_median"] + + String out_base_name = sub(location_code, ":", ".") + String input_tsv_path = "~{out_base_name}.reconstructed_input.tsv" + + command <<< + source activate lr-malaria + + set -euxo pipefail + + ## Generate the input TSV: + tmp_header_tsv=~{write_tsv([header])} + tmp_data_tsv=~{write_tsv(transpose([cc, ISO3, Year, Number_Text, Sample_Name, Raw_Name, Barcode_String, A1, B1, A2, B2, A3, B3, A4, B4, A5, B5, A6, B6, A7, B7, A8, B8, A9, B9, A10, B10, A11, B11, A12, B12, X, N, M_P, Delta_CT_Threshold, Adjusted_Het, mccoil_median]))} + + cat ${tmp_header_tsv} ${tmp_data_tsv} > ~{input_tsv_path} + + ## Run the script: + /python_scripts/process_barcode_data.py -b ~{barcode_def_tsv} -s ~{location_code} -f ~{input_tsv_path} + >>> + + output { + File summary_figure_svg = "~{out_base_name}_summary_figure.svg" + File summary_figure_png = "~{out_base_name}_summary_figure.png" + File summary_stats = "~{out_base_name}_summary.csv" + File mono_barcode_stats = "~{out_base_name}_mono_barcodes.csv" + File poly_barcode_stats = "~{out_base_name}_poly_barcodes.csv" + + File input_tsv = input_tsv_path + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-malaria:0.0.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, 
default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/SRBamToFq.wdl b/wdl/SRBamToFq.wdl new file mode 100644 index 000000000..6727f0b1d --- /dev/null +++ b/wdl/SRBamToFq.wdl @@ -0,0 +1,27 @@ +version 1.0 + +import "tasks/SRUtils.wdl" as SRUtils +import "tasks/Finalize.wdl" as FF + +workflow SRBamToFq { + input { + File bam + String participant_name + + String gcs_out_root_dir + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/SRBamToFq/~{participant_name}" + + call SRUtils.BamToFq { input: bam = bam, prefix = participant_name } + + call FF.FinalizeToFile as FinalizeFqEnd1 { input: outdir = outdir, file = BamToFq.fq_end1 } + call FF.FinalizeToFile as FinalizeFqEnd2 { input: outdir = outdir, file = BamToFq.fq_end2 } + call FF.FinalizeToFile as FinalizeFqUnpaired { input: outdir = outdir, file = BamToFq.fq_unpaired } + + output { + File fq_end1 = FinalizeFqEnd1.gcs_path + File fq_end2 = FinalizeFqEnd2.gcs_path + File fq_unpaired = FinalizeFqUnpaired.gcs_path + } +} \ No newline at end of file diff --git a/wdl/SRCallGermlineCNVs.wdl b/wdl/SRCallGermlineCNVs.wdl new file mode 100644 index 000000000..7897e10be --- /dev/null +++ b/wdl/SRCallGermlineCNVs.wdl @@ -0,0 +1,40 @@ +version 1.0 + +########################################################################## +## A workflow that performs germline CNV calling using GATK4's GCNV tools. 
+########################################################################## + +import "tasks/SRGermlineCNVs.wdl" as SRJOINT +import "tasks/Finalize.wdl" as FF + +workflow SRCallGermlineCNVs { + input { + Array[File] gvcfs + Array[File] gvcf_indices + + File ref_map_file + + File interval_list + + String prefix + + String gcs_out_root_dir + } + + parameter_meta { + gvcfs: "GCS paths to gVCF files" + gvcf_indices: "GCS paths to gVCF tbi files" + ref_map_file: "table indicating reference sequence and auxillary file locations" + prefix: "prefix for output joint-called gVCF and tabix index" + gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files" + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/SRJointCallGVCFsWithGenomicsDB/~{prefix}" + + Map[String, String] ref_map = read_map(ref_map_file) + + + output { + + } +} diff --git a/wdl/SRFlowcell.wdl b/wdl/SRFlowcell.wdl new file mode 100644 index 000000000..8c63c9675 --- /dev/null +++ b/wdl/SRFlowcell.wdl @@ -0,0 +1,421 @@ +version 1.0 + +########################################################################################## +## A workflow that preprocesses short read flowcell data in preparation for variant calling. +## This workflow contains the following steps: +## 1) Sam -> Fastq (if necessary) +## 2) Alignment to reference with bwa-mem2 (https://github.com/bwa-mem2/bwa-mem2) +## 3) Mark Duplicate reads +## 4) Recalibrate base quality scores. +########################################################################################## + +import "tasks/SRUtils.wdl" as SRUTIL +import "tasks/Utils.wdl" as Utils +import "tasks/AlignedMetrics.wdl" as AM +import "tasks/FastQC.wdl" as FastQC +import "tasks/RemoveSingleOrganismContamination.wdl" as DECONTAMINATE +import "tasks/Finalize.wdl" as FF + +workflow SRFlowcell { + input { + File? bam + File? bai + + File? fq_end1 + File? fq_end2 + + String SM + String LB + + File ref_map_file + String? contaminant_ref_name + File? 
contaminant_ref_map_file + + String dir_prefix + + String gcs_out_root_dir + + Boolean perform_BQSR = true + + Boolean DEBUG_MODE = false + + String platform = "illumina" + } + + parameter_meta { + bam: "GCS path to unmapped bam" + bai: "GCS path to bai index for unmapped bam" + + fq_end1: "GCS path to end1 of paired-end fastq" + fq_end2: "GCS path to end2 of paired-end fastq" + + ref_map_file: "table indicating reference sequence and auxillary file locations" + contaminant_ref_name: "Name of the contaminant genome to be used in output files." + contaminant_ref_map_file: "table indicating reference sequence and auxillary file locations for a single-organism contaminant" + + SM: "the value to place in the BAM read group's SM field" + LB: "the value to place in the BAM read group's LB (library) field" + + num_shards: "number of shards into which fastq files should be batched" + dir_prefix: "directory prefix for output files" + + DEBUG_MODE: "[default valued] enables debugging tasks / subworkflows (default: false)" + + gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files" + } + + #################################### + # _____ _ + # |_ _|_ _ ___| | _____ + # | |/ _` / __| |/ / __| + # | | (_| \__ \ <\__ \ + # |_|\__,_|___/_|\_\___/ + # + #################################### + + # Get ref info: + Map[String, String] ref_map = read_map(ref_map_file) + + # Call our timestamp so we can store outputs without clobbering previous runs: + call Utils.GetCurrentTimestampString as t_001_WdlExecutionStartTimestamp { input: } + + # Create an outdir: + String outdir = if DEBUG_MODE then sub(gcs_out_root_dir, "/$", "") + "/SRFlowcell/~{dir_prefix}/" + t_001_WdlExecutionStartTimestamp.timestamp_string else sub(gcs_out_root_dir, "/$", "") + "/SRFlowcell/~{dir_prefix}" + String reads_dir = outdir + "/reads" + String unaligned_reads_dir = outdir + "/reads/unaligned" + String aligned_reads_dir = outdir + "/reads/aligned" + String metrics_dir = outdir + "/metrics" 
+ + if (defined(bam)) { + # Convert the given bam to a uBAM (needed for previous aligned data): + call SRUTIL.RevertSam as t_002_RevertSam { + input: + input_bam = select_first([bam]), + prefix = SM + ".revertSam" + } + + # Convert input SAM/BAM to FASTQ: + call SRUTIL.BamToFq as t_003_Bam2Fastq { + input: + bam = t_002_RevertSam.bam, + prefix = SM + } + + call Utils.GetRawReadGroup as t_004_GetRawReadGroup { input: gcs_bam_path = select_first([bam]) } + } + + # OK, this is inefficient, but let's NOW extract our contaminated reads if we have the info. + # TODO: Move this into the sections above to make it more efficient. Specifically where we convert bam -> fastq. + # TODO: Re-enable this section after decontamination is fixed. The alignment based method with BWA-MEM doesn't work. Not clear why, but this does seem somewhat inadequate (simplistic alignment-based strategies). + if (false && defined(contaminant_ref_map_file)) { + + # Call our sub-workflow for decontamination: + # NOTE: We don't need to be too concerned with the finalization info. + # This will be partially filled in by the WDL itself, so we can pass the same inputs for + # these things here (e.g. 
`dir_prefix`): + call DECONTAMINATE.RemoveSingleOrganismContamination as DecontaminateSample { + input: + fq_end1 = select_first([fq_end1, t_003_Bam2Fastq.fq_end1]), + fq_end2 = select_first([fq_end2, t_003_Bam2Fastq.fq_end2]), + + SM = SM, + LB = LB, + platform = platform, + + contaminant_ref_name = select_first([contaminant_ref_name]), + contaminant_ref_map_file = select_first([contaminant_ref_map_file]), + + dir_prefix = dir_prefix, + gcs_out_root_dir = gcs_out_root_dir + } + } + + File fq_e1 = select_first([DecontaminateSample.decontaminated_fq1, fq_end1, t_003_Bam2Fastq.fq_end1]) + File fq_e2 = select_first([DecontaminateSample.decontaminated_fq2, fq_end2, t_003_Bam2Fastq.fq_end2]) + + String RG = select_first([t_004_GetRawReadGroup.rg, "@RG\tID:" + SM + "_" + LB + "\tPL:" + platform + "\tLB:" + LB + "\tSM:" + SM]) + + # Align reads to reference with BWA-MEM2: + call SRUTIL.BwaMem2 as t_005_AlignReads { + input: + fq_end1 = fq_e1, + fq_end2 = fq_e2, + ref_fasta = ref_map["fasta"], + ref_fasta_index = ref_map["fai"], + ref_dict = ref_map["dict"], + ref_0123 = ref_map["0123"], + ref_amb = ref_map["amb"], + ref_ann = ref_map["ann"], + ref_bwt = ref_map["bwt"], + ref_pac = ref_map["pac"], + mark_short_splits_as_secondary = true, + read_group = RG, + prefix = SM + ".aligned" + } + + if (defined(bam)) { + # Merge aligned reads and unaligned reads: + call SRUTIL.MergeBamAlignment as t_006_MergeBamAlignment { + input: + aligned_bam = t_005_AlignReads.bam, + unaligned_bam = select_first([t_002_RevertSam.bam]), + ref_fasta = ref_map["fasta"], + ref_fasta_index = ref_map["fai"], + ref_dict = ref_map["dict"], + prefix = SM + ".aligned.merged" + } + } + + File merged_bam = select_first([t_005_AlignReads.bam, t_006_MergeBamAlignment.bam]) + + # Mark Duplicates + call SRUTIL.MarkDuplicates as t_007_MarkDuplicates { + input: + input_bam = merged_bam, + prefix = SM + ".aligned.merged.markDuplicates" + } + + # Sort Duplicate Marked Bam: + call Utils.SortSam as 
t_008_SortAlignedDuplicateMarkedBam { + input: + input_bam = t_007_MarkDuplicates.bam, + output_bam_basename = SM + ".aligned.merged.markDuplicates.sorted", + compression_level = 2 + } + +# TODO: Add Fingerprinting? + + if (perform_BQSR) { + # Recalibrate Base Scores: + call SRUTIL.BaseRecalibrator as t_009_BaseRecalibrator { + input: + input_bam = t_008_SortAlignedDuplicateMarkedBam.output_bam, + input_bam_index = t_008_SortAlignedDuplicateMarkedBam.output_bam_index, + + ref_fasta = ref_map["fasta"], + ref_fasta_index = ref_map["fai"], + ref_dict = ref_map["dict"], + + known_sites_vcf = ref_map["known_sites_vcf"], + known_sites_index = ref_map["known_sites_index"], + + prefix = SM + ".baseRecalibratorReport" + } + + call SRUTIL.ApplyBQSR as t_010_ApplyBQSR { + input: + input_bam = t_008_SortAlignedDuplicateMarkedBam.output_bam, + input_bam_index = t_008_SortAlignedDuplicateMarkedBam.output_bam_index, + + ref_fasta = ref_map["fasta"], + ref_fasta_index = ref_map["fai"], + ref_dict = ref_map["dict"], + + recalibration_report = t_009_BaseRecalibrator.recalibration_report, + + prefix = SM + ".aligned.merged.markDuplicates.sorted.BQSR" + } + } + + File final_bam = select_first([t_010_ApplyBQSR.recalibrated_bam, t_008_SortAlignedDuplicateMarkedBam.output_bam]) + File final_bai = select_first([t_010_ApplyBQSR.recalibrated_bai, t_008_SortAlignedDuplicateMarkedBam.output_bam_index]) + + ############################################# + # __ __ _ _ + # | \/ | ___| |_ _ __(_) ___ ___ + # | |\/| |/ _ \ __| '__| |/ __/ __| + # | | | | __/ |_| | | | (__\__ \ + # |_| |_|\___|\__|_| |_|\___|___/ + # + ############################################# + + call AM.SamStatsMap as t_011_SamStats { + input: + bam = final_bam + } + + call FastQC.FastQC as t_012_FastQC { input: bam = final_bam, bai = final_bai } + call Utils.ComputeGenomeLength as t_013_ComputeGenomeLength { input: fasta = ref_map['fasta'] } + call SRUTIL.ComputeBamStats as t_014_ComputeBamStats { input: bam_file = final_bam 
} + + # Collect stats on aligned reads: + call SRUTIL.ComputeBamStats as t_015_ComputeBamStatsQ5 { input: bam_file = final_bam, qual_threshold = 5 } + call SRUTIL.ComputeBamStats as t_016_ComputeBamStatsQ7 { input: bam_file = final_bam, qual_threshold = 7 } + call SRUTIL.ComputeBamStats as t_017_ComputeBamStatsQ10 { input: bam_file = final_bam, qual_threshold = 10 } + call SRUTIL.ComputeBamStats as t_018_ComputeBamStatsQ12 { input: bam_file = final_bam, qual_threshold = 12 } + call SRUTIL.ComputeBamStats as t_019_ComputeBamStatsQ15 { input: bam_file = final_bam, qual_threshold = 15 } + + call AM.AlignedMetrics as PerFlowcellMetrics { + input: + aligned_bam = final_bam, + aligned_bai = final_bai, + ref_fasta = ref_map['fasta'], + ref_dict = ref_map['dict'], + gcs_output_dir = metrics_dir + } + + ############################################ + # _____ _ _ _ + # | ___(_)_ __ __ _| (_)_______ + # | |_ | | '_ \ / _` | | |_ / _ \ + # | _| | | | | | (_| | | |/ / __/ + # |_| |_|_| |_|\__,_|_|_/___\___| + # + ############################################ + File keyfile = t_014_ComputeBamStats.results_file + + # Finalize our unaligned reads first: + call FF.FinalizeToDir as t_020_FinalizeUnalignedFastqReads { + input: + outdir = unaligned_reads_dir, + files = + [ + fq_e1, + fq_e2, + ], + keyfile = keyfile + } + if (defined(bam)) { + call FF.FinalizeToDir as t_021_FinalizeUnalignedReadsFromBam { + input: + outdir = unaligned_reads_dir, + files = select_all( + [ + bam, + bai, + t_003_Bam2Fastq.fq_unpaired, + ]), + keyfile = keyfile + } + } + + call FF.FinalizeToDir as t_022_FinalizeAlignedReads { + input: + outdir = aligned_reads_dir, + files = + [ + t_005_AlignReads.bam, + merged_bam, + t_007_MarkDuplicates.bam, + t_008_SortAlignedDuplicateMarkedBam.output_bam, + t_008_SortAlignedDuplicateMarkedBam.output_bam_index, + ], + keyfile = keyfile + } + + call FF.FinalizeToFile as t_023_FinalizeAlignedBam { + input: + outdir = aligned_reads_dir, + file = final_bam, + keyfile = keyfile 
+ } + + call FF.FinalizeToFile as t_024_FinalizeAlignedBai { + input: + outdir = aligned_reads_dir, + file = final_bai, + keyfile = keyfile + } + + # Finalize our metrics: + call FF.FinalizeToDir as t_025_FinalizeMetrics { + input: + outdir = metrics_dir, + files = + [ + t_007_MarkDuplicates.metrics, + t_011_SamStats.sam_stats, + t_014_ComputeBamStats.results_file, + t_015_ComputeBamStatsQ5.results_file, + t_016_ComputeBamStatsQ7.results_file, + t_017_ComputeBamStatsQ10.results_file, + t_018_ComputeBamStatsQ12.results_file, + t_019_ComputeBamStatsQ15.results_file, + ], + keyfile = keyfile + } + + # Finalize BQSR Metrics if it was run: + if (perform_BQSR) { + call FF.FinalizeToDir as t_026_FinalizeBQSRMetrics { + input: + outdir = metrics_dir, + files = select_all([t_009_BaseRecalibrator.recalibration_report]), + keyfile = keyfile + } + + } + + call FF.FinalizeToFile as t_027_FinalizeFastQCReport { + input: + outdir = metrics_dir, + file = t_012_FastQC.report + } + + # Prep a few files for output: + File fq1_o = unaligned_reads_dir + "/" + basename(fq_e1) + File fq2_o = unaligned_reads_dir + "/" + basename(fq_e2) + if (defined(bam)) { + File unaligned_bam_o = unaligned_reads_dir + "/" + basename(select_first([bam])) + File unaligned_bai_o = unaligned_reads_dir + "/" + basename(select_first([bai])) + File fqboup = unaligned_reads_dir + "/" + basename(select_first([DecontaminateSample.decontaminated_unpaired, t_003_Bam2Fastq.fq_unpaired])) + } + + ############################################ + # ___ _ _ + # / _ \ _ _| |_ _ __ _ _| |_ + # | | | | | | | __| '_ \| | | | __| + # | |_| | |_| | |_| |_) | |_| | |_ + # \___/ \__,_|\__| .__/ \__,_|\__| + # |_| + ############################################ + output { + # Unaligned reads + File fq1 = fq1_o + File fq2 = fq2_o + File? fq_unpaired = fqboup + + # Unaligned BAM file + File? unaligned_bam = unaligned_bam_o + File? unaligned_bai = unaligned_bai_o + + # Contaminated BAM file: + File? 
contaminated_bam = DecontaminateSample.contaminated_bam + + # Aligned BAM file + File aligned_bam = t_023_FinalizeAlignedBam.gcs_path + File aligned_bai = t_024_FinalizeAlignedBai.gcs_path + + # Unaligned read stats + Float num_reads = t_014_ComputeBamStats.results['reads'] + Float num_bases = t_014_ComputeBamStats.results['bases'] + Float raw_est_fold_cov = t_014_ComputeBamStats.results['bases']/t_013_ComputeGenomeLength.length + + Float read_length = t_014_ComputeBamStats.results['read_mean'] + + Float read_qual_mean = t_014_ComputeBamStats.results['mean_qual'] + Float read_qual_median = t_014_ComputeBamStats.results['median_qual'] + + Float num_reads_Q5 = t_015_ComputeBamStatsQ5.results['reads'] + Float num_reads_Q7 = t_016_ComputeBamStatsQ7.results['reads'] + Float num_reads_Q10 = t_017_ComputeBamStatsQ10.results['reads'] + Float num_reads_Q12 = t_018_ComputeBamStatsQ12.results['reads'] + Float num_reads_Q15 = t_019_ComputeBamStatsQ15.results['reads'] + + # Aligned read stats + Float aligned_num_reads = t_012_FastQC.stats_map['number_of_reads'] + Float aligned_num_bases = t_011_SamStats.stats_map['bases_mapped'] + Float aligned_frac_bases = t_011_SamStats.stats_map['bases_mapped']/t_011_SamStats.stats_map['total_length'] + Float aligned_est_fold_cov = t_011_SamStats.stats_map['bases_mapped']/t_013_ComputeGenomeLength.length + + Float aligned_read_length = t_012_FastQC.stats_map['read_length'] + + Float insert_size_average = t_011_SamStats.stats_map['insert_size_average'] + Float insert_size_standard_deviation = t_011_SamStats.stats_map['insert_size_standard_deviation'] + Float pct_properly_paired_reads = t_011_SamStats.stats_map['percentage_of_properly_paired_reads_%'] + + Float average_identity = 100.0 - (100.0*t_011_SamStats.stats_map['mismatches']/t_011_SamStats.stats_map['bases_mapped']) + + File fastqc_report = t_027_FinalizeFastQCReport.gcs_path + } +} diff --git a/wdl/SRIndexBam.wdl b/wdl/SRIndexBam.wdl new file mode 100644 index 000000000..1c16e17d9 --- 
/dev/null +++ b/wdl/SRIndexBam.wdl @@ -0,0 +1,29 @@ +version 1.0 + +import "tasks/Utils.wdl" as Utils +import "tasks/Finalize.wdl" as FF + +workflow SRIndexBam { + input { + File bam + String? outdir # Make outdir optional + } + + call Utils.Index { input: bam = bam } + # If outdir is not provided, use the directory of the bam file as outdir. + # This uses a ternary conditional operator (if-else shorthand) to check if outdir is provided. + # If outdir is provided, it uses it; otherwise, it extracts the directory from the bam file path. + String finalOutdir = select_first([outdir, sub(bam, "/[^/]+$", "")]) + + # Call the FinalizeToFile task with the finalOutdir and the index file generated by the Utils.Index task + call FF.FinalizeToFile as FinalizeBamIndex { + input: + outdir = finalOutdir, + file = Index.bai + } + # call FF.FinalizeToFile as FinalizeBamIndex { input: outdir = outdir, file = Index.bai } + + output { + File bai = FinalizeBamIndex.gcs_path + } +} \ No newline at end of file diff --git a/wdl/SRJointCallGVCFsWithGenomicsDB.wdl b/wdl/SRJointCallGVCFsWithGenomicsDB.wdl new file mode 100644 index 000000000..6964c5315 --- /dev/null +++ b/wdl/SRJointCallGVCFsWithGenomicsDB.wdl @@ -0,0 +1,431 @@ +version 1.0 + +############################################################################################################# +## A workflow that performs joint calling on single-sample gVCFs from GATK4 HaplotypeCaller using GenomicsDB. 
+############################################################################################################# + +import "tasks/SRJointGenotyping.wdl" as SRJOINT +import "tasks/VariantUtils.wdl" as VARUTIL +import "tasks/Utils.wdl" as UTILS +import "tasks/Hail.wdl" as Hail +import "tasks/FunctionalAnnotation.wdl" as FUNK +import "tasks/SGKit.wdl" as SGKit +import "tasks/Finalize.wdl" as FF + +workflow SRJointCallGVCFsWithGenomicsDB { + input { + Array[File] gvcfs + Array[File] gvcf_indices + + File ref_map_file + + File interval_list + + Float snp_calibration_sensitivity = 0.99 + Int snp_max_unlabeled_variants = 0 + Array[String] snp_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ] + + Array[File] snp_known_reference_variants + Array[File] snp_known_reference_variants_index + Array[File] snp_known_reference_variants_identifier + Array[Boolean] snp_is_training + Array[Boolean] snp_is_calibration + + Float indel_calibration_sensitivity = 0.99 + Int indel_max_unlabeled_variants = 0 + Array[String] indel_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ] + + Array[File] indel_known_reference_variants + Array[File] indel_known_reference_variants_index + Array[File] indel_known_reference_variants_identifier + Array[Boolean] indel_is_training + Array[Boolean] indel_is_calibration + + Array[File]? annotation_bed_files + Array[File]? annotation_bed_file_indexes + Array[String]? annotation_bed_file_annotation_names + + File? 
snpeff_db + + String prefix + + String gcs_out_root_dir + } + + parameter_meta { + gvcfs: "GCS paths to gVCF files" + gvcf_indices: "GCS paths to gVCF tbi files" + ref_map_file: "table indicating reference sequence and auxillary file locations" + prefix: "prefix for output joint-called gVCF and tabix index" + gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files" + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/SRJointCallGVCFsWithGenomicsDB/~{prefix}" + + Map[String, String] ref_map = read_map(ref_map_file) + + # Create interval list over which to shard the processing: + call UTILS.MakeChrIntervalList as MakeChrIntervalList { + input: + ref_dict = ref_map['dict'], + } + + # Create sample-name map: + call SRJOINT.CreateSampleNameMap as CreateSampleNameMap { + input: + gvcfs = gvcfs, + prefix = prefix + } + + # Shard by contig for speed: + scatter (idx_1 in range(length(MakeChrIntervalList.contig_interval_list_files))) { + + String contig = MakeChrIntervalList.chrs[idx_1][0] + File contig_interval_list = MakeChrIntervalList.contig_interval_list_files[idx_1] + + # Import our data into GenomicsDB: + call SRJOINT.ImportGVCFs as ImportGVCFsIntoGenomicsDB { + input: + sample_name_map = CreateSampleNameMap.sample_name_map, + interval_list = contig_interval_list, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + prefix = prefix + "." 
+ contig, + batch_size = 50, + # We need to override this because we're not actually sending the GVCF over (just a list) + # ALSO, we're currently tarring the genomicsDB, so we need at least double the space here, plus some slop: + runtime_attr_override = object {disk_gb: 10 + (3 * CreateSampleNameMap.total_gvcf_size_gb) + (2 * ceil(size(ref_map['fasta'], "GB"))), preemptible_tries: 0} + } + + # Joint call + call SRJOINT.GenotypeGVCFs as JointCallGVCFs { + input: + input_gvcf_data = ImportGVCFsIntoGenomicsDB.output_genomicsdb, + interval_list = contig_interval_list, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + dbsnp_vcf = ref_map["known_sites_vcf"], + prefix = prefix + "." + contig + ".raw", + runtime_attr_override = object {preemptible_tries: 0}, # Disable preemption for prototype. + } + + # First make a sites-only VCF for recal (smaller file, easier to work with): + call VARUTIL.MakeSitesOnlyVcf as MakeSitesOnlyVCF { + input: + vcf = JointCallGVCFs.output_vcf, + vcf_index = JointCallGVCFs.output_vcf_index, + prefix = prefix + "." 
+ contig + ".sites_only" + } + } + + # Merge all sites-only VCFs + call VARUTIL.GatherVcfs as MergeSitesOnlyVCFs { + input: + input_vcfs = MakeSitesOnlyVCF.sites_only_vcf, + input_vcf_indices = MakeSitesOnlyVCF.sites_only_vcf_index, + prefix = prefix + ".sites_only" + } + + ######################################################################## + # Call VETS / VQSR-lite: + call VARUTIL.ExtractVariantAnnotations as ExtractIndelVariantAnnotations { + input: + vcf = MergeSitesOnlyVCFs.output_vcf, + vcf_index = MergeSitesOnlyVCFs.output_vcf_index, + + prefix = prefix, + mode = "INDEL", + + recalibration_annotation_values = indel_recalibration_annotation_values, + + known_reference_variants = indel_known_reference_variants, + known_reference_variants_index = indel_known_reference_variants_index, + known_reference_variants_identifier = indel_known_reference_variants_identifier, + is_training = indel_is_training, + is_calibration = indel_is_calibration, + + max_unlabeled_variants = indel_max_unlabeled_variants, + } + + call VARUTIL.ExtractVariantAnnotations as ExtractSnpVariantAnnotations { + input: + vcf = MergeSitesOnlyVCFs.output_vcf, + vcf_index = MergeSitesOnlyVCFs.output_vcf_index, + + prefix = prefix, + mode = "SNP", + + recalibration_annotation_values = snp_recalibration_annotation_values, + + known_reference_variants = snp_known_reference_variants, + known_reference_variants_index = snp_known_reference_variants_index, + known_reference_variants_identifier = snp_known_reference_variants_identifier, + is_training = snp_is_training, + is_calibration = snp_is_calibration, + + max_unlabeled_variants = snp_max_unlabeled_variants, + } + + call VARUTIL.TrainVariantAnnotationsModel as TrainIndelVariantAnnotationsModel { + input: + annotation_hdf5 = ExtractIndelVariantAnnotations.annotation_hdf5, + mode = "INDEL", + prefix = prefix, + } + + call VARUTIL.TrainVariantAnnotationsModel as TrainSnpVariantAnnotationsModel { + input: + annotation_hdf5 = 
ExtractSnpVariantAnnotations.annotation_hdf5, + mode = "SNP", + prefix = prefix, + } + + # Shard by contig for speed: + scatter (idx_2 in range(length(JointCallGVCFs.output_vcf))) { + + String contig_2 = MakeChrIntervalList.chrs[idx_2][0] + File joint_called_vcf = JointCallGVCFs.output_vcf[idx_2] + File joint_called_vcf_index = JointCallGVCFs.output_vcf_index[idx_2] + + call VARUTIL.ScoreVariantAnnotations as ScoreSnpVariantAnnotations { + input: + vcf = joint_called_vcf, + vcf_index = joint_called_vcf_index, + + sites_only_extracted_vcf = ExtractSnpVariantAnnotations.sites_only_vcf, + sites_only_extracted_vcf_index = ExtractSnpVariantAnnotations.sites_only_vcf_index, + + model_prefix = prefix + "_train_SNP", + model_files = flatten([[TrainSnpVariantAnnotationsModel.training_scores, TrainSnpVariantAnnotationsModel.positive_model_scorer_pickle], select_all([ + TrainSnpVariantAnnotationsModel.unlabeled_positive_model_scores, + TrainSnpVariantAnnotationsModel.calibration_set_scores, + TrainSnpVariantAnnotationsModel.negative_model_scorer_pickle + ])]), + prefix = prefix + "_SNP_" + contig_2, + mode = "SNP", + + calibration_sensitivity_threshold = snp_calibration_sensitivity, + + recalibration_annotation_values = snp_recalibration_annotation_values, + + known_reference_variants = snp_known_reference_variants, + known_reference_variants_index = snp_known_reference_variants_index, + known_reference_variants_identifier = snp_known_reference_variants_identifier, + is_training = snp_is_training, + is_calibration = snp_is_calibration, + } + + call VARUTIL.ScoreVariantAnnotations as ScoreIndelVariantAnnotations { + input: + vcf = ScoreSnpVariantAnnotations.scored_vcf, + vcf_index = ScoreSnpVariantAnnotations.scored_vcf_index, + + sites_only_extracted_vcf = ExtractIndelVariantAnnotations.sites_only_vcf, + sites_only_extracted_vcf_index = ExtractIndelVariantAnnotations.sites_only_vcf_index, + + model_prefix = prefix + "_train_INDEL", + model_files = 
flatten([[TrainIndelVariantAnnotationsModel.training_scores, TrainIndelVariantAnnotationsModel.positive_model_scorer_pickle], select_all([ + TrainIndelVariantAnnotationsModel.unlabeled_positive_model_scores, + TrainIndelVariantAnnotationsModel.calibration_set_scores, + TrainIndelVariantAnnotationsModel.negative_model_scorer_pickle + ])]), + prefix = prefix + "_ALL_" + contig_2, + mode = "INDEL", + + calibration_sensitivity_threshold = indel_calibration_sensitivity, + + recalibration_annotation_values = indel_recalibration_annotation_values, + + known_reference_variants = indel_known_reference_variants, + known_reference_variants_index = indel_known_reference_variants_index, + known_reference_variants_identifier = indel_known_reference_variants_identifier, + is_training = indel_is_training, + is_calibration = indel_is_calibration, + } + + # Now we need to annotate our variants by region: + if (defined(annotation_bed_files)) { + call VARUTIL.AnnotateVcfWithBedRegions as AnnotateVcfRegions { + input: + vcf = ScoreIndelVariantAnnotations.scored_vcf, + vcf_index = ScoreIndelVariantAnnotations.scored_vcf_index, + bed_files = select_first([annotation_bed_files]), + bed_file_indexes = select_first([annotation_bed_file_indexes]), + bed_file_annotation_names = select_first([annotation_bed_file_annotation_names]), + prefix = basename(basename(ScoreIndelVariantAnnotations.scored_vcf, ".vcf.gz"), ".vcf") + ".region_annotated", + } + } + + File recalibrated_vcf = select_first([AnnotateVcfRegions.annotated_vcf, ScoreIndelVariantAnnotations.scored_vcf]) + File recalibrated_vcf_index = select_first([AnnotateVcfRegions.annotated_vcf_index, ScoreIndelVariantAnnotations.scored_vcf_index]) + + # Now functionally annotate each VCF: + if (defined(snpeff_db)) { + call FUNK.FunctionallyAnnotateVariants as FunctionallyAnnotate { + input: + vcf = recalibrated_vcf, + snpeff_db = select_first([snpeff_db]) + } + } + + File vcf_for_merging = select_first([FunctionallyAnnotate.annotated_vcf, 
recalibrated_vcf]) + File vcf_index_for_merging = select_first([FunctionallyAnnotate.annotated_vcf_index, recalibrated_vcf_index]) + } + + # Consolidate files: + call VARUTIL.GatherVcfs as GatherRawVcfs { + input: + input_vcfs = JointCallGVCFs.output_vcf, + input_vcf_indices = JointCallGVCFs.output_vcf_index, + prefix = prefix + ".raw.combined" + } + + # Consolidate files: + call VARUTIL.GatherVcfs as GatherRescoredVcfs { + input: + input_vcfs = vcf_for_merging, + input_vcf_indices = vcf_index_for_merging, + prefix = prefix + ".rescored.combined" + } + + # Convert to Zarr + call SGKit.ConvertToZarrStore as ConvertToZarr { + input: + gvcf = GatherRescoredVcfs.output_vcf, + tbi = GatherRescoredVcfs.output_vcf_index, + prefix = prefix, + outdir = outdir + } + + # Convert the output to a HAIL Matrix Table: + call Hail.ConvertToHailMT as CreateHailMatrixTable { + input: + gvcf = GatherRescoredVcfs.output_vcf, + tbi = GatherRescoredVcfs.output_vcf_index, + reference = sub(sub(ref_map["fasta"], "^.*/", ""), "\.[fasta]*$", ""), + ref_fasta = ref_map["fasta"], + ref_fai = ref_map["fai"], + prefix = prefix, + outdir = outdir + } + + ################################ + # Finalize the regular output files: + ############ + + File keyfile = CreateHailMatrixTable.completion_file + String recalibration_dir = outdir + "/recalibration_files" + String recalibration_model_dir = outdir + "/recalibration_files/model" + String recalibration_results_dir = outdir + "/recalibration_files/results" + String snpeff_results_dir = outdir + "/snpEff_results" + + call FF.FinalizeToDir as FinalizeGenomicsDB { input: outdir = outdir + "/GenomicsDB", keyfile = keyfile, files = ImportGVCFsIntoGenomicsDB.output_genomicsdb } + + call FF.FinalizeToFile as FinalizeRawVCF { input: outdir = outdir, keyfile = keyfile, file = GatherRawVcfs.output_vcf } + call FF.FinalizeToFile as FinalizeRawTBI { input: outdir = outdir, keyfile = keyfile, file = GatherRawVcfs.output_vcf_index } + + call FF.FinalizeToFile as 
FinalizeVETSVCF { input: outdir = outdir, keyfile = keyfile, file = GatherRescoredVcfs.output_vcf } + call FF.FinalizeToFile as FinalizeVETSTBI { input: outdir = outdir, keyfile = keyfile, file = GatherRescoredVcfs.output_vcf_index } + + if (defined(snpeff_db)) { + call FF.FinalizeToDir as FinalizeSnpEffSummary { input: outdir = snpeff_results_dir, keyfile = keyfile, files = select_all(FunctionallyAnnotate.snpEff_summary) } + call FF.FinalizeToDir as FinalizeSnpEffGenes { input: outdir = snpeff_results_dir, keyfile = keyfile, files = select_all(FunctionallyAnnotate.snpEff_genes) } + } + + ################################ + # Finalize the VETS files: + ############ + + # ExtractVariantAnnotations: + call FF.FinalizeToFile as FinalizeSnpExtractedAnnotations { input: outdir = recalibration_model_dir, keyfile = keyfile, file = ExtractSnpVariantAnnotations.annotation_hdf5 } + call FF.FinalizeToFile as FinalizeSnpExtractedSitesOnlyVcf { input: outdir = recalibration_model_dir, keyfile = keyfile, file = ExtractSnpVariantAnnotations.sites_only_vcf } + call FF.FinalizeToFile as FinalizeSnpExtractedSitesOnlyVcfIndex { input: outdir = recalibration_model_dir, keyfile = keyfile, file = ExtractSnpVariantAnnotations.sites_only_vcf_index } + if (defined(ExtractSnpVariantAnnotations.unlabeled_annotation_hdf5)) { + call FF.FinalizeToFile as FinalizeSnpExtractedUnlabeledAnnotations { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([ExtractSnpVariantAnnotations.unlabeled_annotation_hdf5]) } + } + call FF.FinalizeToFile as FinalizeIndelExtractedAnnotations { input: outdir = recalibration_model_dir, keyfile = keyfile, file = ExtractIndelVariantAnnotations.annotation_hdf5 } + call FF.FinalizeToFile as FinalizeIndelExtractedSitesOnlyVcf { input: outdir = recalibration_model_dir, keyfile = keyfile, file = ExtractIndelVariantAnnotations.sites_only_vcf } + call FF.FinalizeToFile as FinalizeIndelExtractedSitesOnlyVcfIndex { input: outdir = 
recalibration_model_dir, keyfile = keyfile, file = ExtractIndelVariantAnnotations.sites_only_vcf_index } + if (defined(ExtractIndelVariantAnnotations.unlabeled_annotation_hdf5)) { + call FF.FinalizeToFile as FinalizeIndelExtractedUnlabeledAnnotations { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([ExtractIndelVariantAnnotations.unlabeled_annotation_hdf5]) } + } + + # TrainVariantAnnotationsModel + call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsTrainingScores { input: outdir = recalibration_model_dir, keyfile = keyfile, file = TrainSnpVariantAnnotationsModel.training_scores } + call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsPositiveModelScorer { input: outdir = recalibration_model_dir, keyfile = keyfile, file = TrainSnpVariantAnnotationsModel.positive_model_scorer_pickle } + if (defined(TrainSnpVariantAnnotationsModel.unlabeled_positive_model_scores)) { + call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsUnlabeledPositiveModelScores { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([TrainSnpVariantAnnotationsModel.unlabeled_positive_model_scores]) } + } + if (defined(TrainSnpVariantAnnotationsModel.calibration_set_scores)) { + call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsCalibrationSetScores { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([TrainSnpVariantAnnotationsModel.calibration_set_scores]) } + } + if (defined(TrainSnpVariantAnnotationsModel.negative_model_scorer_pickle)) { + call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsNegativeModelScorer { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([TrainSnpVariantAnnotationsModel.negative_model_scorer_pickle]) } + } + + call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsTrainingScores { input: outdir = recalibration_model_dir, keyfile = keyfile, file = TrainIndelVariantAnnotationsModel.training_scores } + call 
FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsPositiveModelScorer { input: outdir = recalibration_model_dir, keyfile = keyfile, file = TrainIndelVariantAnnotationsModel.positive_model_scorer_pickle } + if (defined(TrainIndelVariantAnnotationsModel.unlabeled_positive_model_scores)) { + call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsUnlabeledPositiveModelScores { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([TrainIndelVariantAnnotationsModel.unlabeled_positive_model_scores]) } + } + if (defined(TrainIndelVariantAnnotationsModel.calibration_set_scores)) { + call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsCalibrationSetScores { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([TrainIndelVariantAnnotationsModel.calibration_set_scores]) } + } + if (defined(TrainIndelVariantAnnotationsModel.negative_model_scorer_pickle)) { + call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsNegativeModelScorer { input: outdir = recalibration_model_dir, keyfile = keyfile, file = select_first([TrainIndelVariantAnnotationsModel.negative_model_scorer_pickle]) } + } + + # ScoreVariantAnnotations + # This was done per-contig, so we need to finalize per-contig: + scatter (idx_3 in range(length(MakeChrIntervalList.contig_interval_list_files))) { + + String contig_3 = MakeChrIntervalList.chrs[idx_3][0] + + call FF.FinalizeToFile as FinalizeScoreSnpVariantAnnotationsScoredVcf { input: outdir = recalibration_results_dir + "/" + contig_3, keyfile = keyfile, file = ScoreSnpVariantAnnotations.scored_vcf[idx_3] } + call FF.FinalizeToFile as FinalizeScoreSnpVariantAnnotationsScoredVcfIndex { input: outdir = recalibration_results_dir + "/" + contig_3, keyfile = keyfile, file = ScoreSnpVariantAnnotations.scored_vcf_index[idx_3] } + if (defined(ScoreSnpVariantAnnotations.annotations_hdf5)) { + call FF.FinalizeToFile as FinalizeScoreSnpVariantAnnotationsAnnotationsHdf5 { input: outdir = 
recalibration_results_dir + "/" + contig_3, keyfile = keyfile, file = select_first([ScoreSnpVariantAnnotations.annotations_hdf5[idx_3]]) } + } + if (defined(ScoreSnpVariantAnnotations.scores_hdf5)) { + call FF.FinalizeToFile as FinalizeScoreSnpVariantAnnotationsScoresHdf5 { input: outdir = recalibration_results_dir + "/" + contig_3, keyfile = keyfile, file = select_first([ScoreSnpVariantAnnotations.scores_hdf5[idx_3]]) } + } + + call FF.FinalizeToFile as FinalizeScoreIndelVariantAnnotationsScoredVcf { input: outdir = recalibration_results_dir + "/" + contig_3, keyfile = keyfile, file = ScoreIndelVariantAnnotations.scored_vcf[idx_3] } + call FF.FinalizeToFile as FinalizeScoreIndelVariantAnnotationsScoredVcfIndex { input: outdir = recalibration_results_dir + "/" + contig_3, keyfile = keyfile, file = ScoreIndelVariantAnnotations.scored_vcf_index[idx_3] } + if (defined(ScoreIndelVariantAnnotations.annotations_hdf5)) { + call FF.FinalizeToFile as FinalizeScoreIndelVariantAnnotationsAnnotationsHdf5 { input: outdir = recalibration_results_dir + "/" + contig_3, keyfile = keyfile, file = select_first([ScoreIndelVariantAnnotations.annotations_hdf5[idx_3]]) } + } + if (defined(ScoreIndelVariantAnnotations.scores_hdf5)) { + call FF.FinalizeToFile as FinalizeScoreIndelVariantAnnotationsScoresHdf5 { input: outdir = recalibration_results_dir + "/" + contig_3, keyfile = keyfile, file = select_first([ScoreIndelVariantAnnotations.scores_hdf5[idx_3]]) } + } + } + + # Make an alias for the functionally annotated data: + if (defined(snpeff_db)) { + File annotated_vcf = FinalizeVETSVCF.gcs_path + File annotated_vcf_tbi = FinalizeVETSTBI.gcs_path + } + + output { + String genomicsDB = FinalizeGenomicsDB.gcs_dir + + File raw_joint_vcf = FinalizeRawVCF.gcs_path + File raw_joint_vcf_tbi = FinalizeRawTBI.gcs_path + + File joint_recalibrated_vcf = FinalizeVETSVCF.gcs_path + File joint_recalibrated_vcf_tbi = FinalizeVETSTBI.gcs_path + + File joint_mt = CreateHailMatrixTable.gcs_path + File 
joint_zarr = ConvertToZarr.gcs_path + + File? annotated_joint_vcf = annotated_vcf + File? annotated_joint_vcf_tbi = annotated_vcf_tbi + + String? snpEff_summary = FinalizeSnpEffSummary.gcs_dir + String? snpEff_genes = FinalizeSnpEffGenes.gcs_dir + } +} + + diff --git a/wdl/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS.wdl b/wdl/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS.wdl new file mode 100644 index 000000000..baaaea841 --- /dev/null +++ b/wdl/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS.wdl @@ -0,0 +1,293 @@ +version 1.0 + +############################################################################################################# +## A workflow that performs joint calling on single-sample gVCFs from GATK4 HaplotypeCaller using GenomicsDB. +############################################################################################################# + +import "tasks/SRJointGenotyping.wdl" as SRJOINT +import "tasks/VariantUtils.wdl" as VARUTIL +import "tasks/Utils.wdl" as UTILS +import "tasks/Finalize.wdl" as FF +import "tasks/Pf_Niare_HaplotypeCaller.wdl" as Niare_HC + +workflow SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS { + input { + Array[File] gvcfs + Array[File] gvcf_indices + + File ref_map_file + + Float snp_calibration_sensitivity = 0.99 + Int snp_max_unlabeled_variants = 0 + Array[String] snp_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ] + + Array[File] snp_known_reference_variants + Array[File] snp_known_reference_variants_index + Array[File] snp_known_reference_variants_identifier + Array[Boolean] snp_is_training + Array[Boolean] snp_is_calibration + + Float indel_calibration_sensitivity = 0.99 + Int indel_max_unlabeled_variants = 0 + Array[String] indel_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ] + + Array[File] 
indel_known_reference_variants + Array[File] indel_known_reference_variants_index + Array[File] indel_known_reference_variants_identifier + Array[Boolean] indel_is_training + Array[Boolean] indel_is_calibration + + Array[File]? annotation_bed_files + Array[File]? annotation_bed_file_indexes + Array[String]? annotation_bed_file_annotation_names + + String prefix + + String gcs_out_root_dir + } + + parameter_meta { + gvcfs: "GCS paths to gVCF files" + gvcf_indices: "GCS paths to gVCF tbi files" + ref_map_file: "table indicating reference sequence and auxillary file locations" + prefix: "prefix for output joint-called gVCF and tabix index" + gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files" + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VETS/~{prefix}" + + Map[String, String] ref_map = read_map(ref_map_file) + + # Create interval list over which to shard the processing: + call UTILS.MakeChrIntervalList as MakeChrIntervalList { + input: + ref_dict = ref_map['dict'], + } + + # Create sample-name map: + call SRJOINT.CreateSampleNameMap as CreateSampleNameMap { + input: + gvcfs = gvcfs, + prefix = prefix + } + + # Shard by contig for speed: + scatter (idx_1 in range(length(MakeChrIntervalList.contig_interval_list_files))) { + + String contig = MakeChrIntervalList.chrs[idx_1][0] + File contig_interval_list = MakeChrIntervalList.contig_interval_list_files[idx_1] + + call Niare_HC.GenomicsDbImport as GenomicsDbImport { + input: + sample_name_map = CreateSampleNameMap.sample_name_map, + interval_list = contig_interval_list, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + prefix = prefix + "." 
+ contig, + } + + # Shard again by contig chunk: + call UTILS.SplitContigToIntervals as SplitContigToIntervals { + input: + ref_dict = ref_map['dict'], + contig = contig + } + + scatter (idx_2 in range(length(SplitContigToIntervals.individual_bed_files))) { + + File genotype_gvcfs_intervals = SplitContigToIntervals.individual_bed_files[idx_2] + + call Niare_HC.GenotypeGVCFs as GenotypeGVCFs { + input: + input_gvcf_data = GenomicsDbImport.output_genomicsdb, + interval_list = genotype_gvcfs_intervals, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + prefix = prefix + "." + contig + ".raw", + } + } + + # Merge all raw VCFs: + call VARUTIL.GatherVcfs as GatherVcfs { + input: + input_vcfs = GenotypeGVCFs.output_vcf, + input_vcf_indices = GenotypeGVCFs.output_vcf_index, + prefix = prefix + "." + contig + ".raw.merged", + } + + # First make a sites-only VCF for recal (smaller file, easier to work with): + call VARUTIL.MakeSitesOnlyVcf as MakeSitesOnlyVCF { + input: + vcf = GatherVcfs.output_vcf, + vcf_index = GatherVcfs.output_vcf_index, + prefix = prefix + "." 
+ contig + ".sites_only" + } + } + + # Merge all sites-only VCFs + call VARUTIL.GatherVcfs as GatherSitesOnlyVCFs { + input: + input_vcfs = MakeSitesOnlyVCF.sites_only_vcf, + input_vcf_indices = MakeSitesOnlyVCF.sites_only_vcf_index, + prefix = prefix + ".sites_only" + } + + ######################################################################## + # Call VETS / VQSR-lite: + call VARUTIL.ExtractVariantAnnotations as ExtractIndelVariantAnnotations { + input: + vcf = GatherSitesOnlyVCFs.output_vcf, + vcf_index = GatherSitesOnlyVCFs.output_vcf_index, + + prefix = prefix, + mode = "INDEL", + + recalibration_annotation_values = indel_recalibration_annotation_values, + + known_reference_variants = indel_known_reference_variants, + known_reference_variants_index = indel_known_reference_variants_index, + known_reference_variants_identifier = indel_known_reference_variants_identifier, + is_training = indel_is_training, + is_calibration = indel_is_calibration, + + max_unlabeled_variants = indel_max_unlabeled_variants, + } + + call VARUTIL.ExtractVariantAnnotations as ExtractSnpVariantAnnotations { + input: + vcf = GatherSitesOnlyVCFs.output_vcf, + vcf_index = GatherSitesOnlyVCFs.output_vcf_index, + + prefix = prefix, + mode = "SNP", + + recalibration_annotation_values = snp_recalibration_annotation_values, + + known_reference_variants = snp_known_reference_variants, + known_reference_variants_index = snp_known_reference_variants_index, + known_reference_variants_identifier = snp_known_reference_variants_identifier, + is_training = snp_is_training, + is_calibration = snp_is_calibration, + + max_unlabeled_variants = snp_max_unlabeled_variants, + } + + call VARUTIL.TrainVariantAnnotationsModel as TrainIndelVariantAnnotationsModel { + input: + annotation_hdf5 = ExtractIndelVariantAnnotations.annotation_hdf5, + mode = "INDEL", + prefix = prefix, + } + + call VARUTIL.TrainVariantAnnotationsModel as TrainSnpVariantAnnotationsModel { + input: + annotation_hdf5 = 
ExtractSnpVariantAnnotations.annotation_hdf5, + mode = "SNP", + prefix = prefix, + } + + # Shard by contig for speed: + scatter (idx_3 in range(length(GatherVcfs.output_vcf))) { + + String contig_2 = MakeChrIntervalList.chrs[idx_3][0] + File joint_called_vcf = GatherVcfs.output_vcf[idx_3] + File joint_called_vcf_index = GatherVcfs.output_vcf_index[idx_3] + + call VARUTIL.ScoreVariantAnnotations as ScoreSnpVariantAnnotations { + input: + vcf = joint_called_vcf, + vcf_index = joint_called_vcf_index, + + sites_only_extracted_vcf = ExtractSnpVariantAnnotations.sites_only_vcf, + sites_only_extracted_vcf_index = ExtractSnpVariantAnnotations.sites_only_vcf_index, + + model_prefix = prefix + "_train_SNP", + model_files = flatten([[TrainSnpVariantAnnotationsModel.training_scores, TrainSnpVariantAnnotationsModel.positive_model_scorer_pickle], select_all([ + TrainSnpVariantAnnotationsModel.unlabeled_positive_model_scores, + TrainSnpVariantAnnotationsModel.calibration_set_scores, + TrainSnpVariantAnnotationsModel.negative_model_scorer_pickle + ])]), + prefix = prefix + "_SNP_" + contig_2, + mode = "SNP", + + calibration_sensitivity_threshold = snp_calibration_sensitivity, + + recalibration_annotation_values = snp_recalibration_annotation_values, + + known_reference_variants = snp_known_reference_variants, + known_reference_variants_index = snp_known_reference_variants_index, + known_reference_variants_identifier = snp_known_reference_variants_identifier, + is_training = snp_is_training, + is_calibration = snp_is_calibration, + } + + call VARUTIL.ScoreVariantAnnotations as ScoreIndelVariantAnnotations { + input: + vcf = ScoreSnpVariantAnnotations.scored_vcf, + vcf_index = ScoreSnpVariantAnnotations.scored_vcf_index, + + sites_only_extracted_vcf = ExtractIndelVariantAnnotations.sites_only_vcf, + sites_only_extracted_vcf_index = ExtractIndelVariantAnnotations.sites_only_vcf_index, + + model_prefix = prefix + "_train_INDEL", + model_files = 
flatten([[TrainIndelVariantAnnotationsModel.training_scores, TrainIndelVariantAnnotationsModel.positive_model_scorer_pickle], select_all([ + TrainIndelVariantAnnotationsModel.unlabeled_positive_model_scores, + TrainIndelVariantAnnotationsModel.calibration_set_scores, + TrainIndelVariantAnnotationsModel.negative_model_scorer_pickle + ])]), + prefix = prefix + "_ALL_" + contig_2, + mode = "INDEL", + + calibration_sensitivity_threshold = indel_calibration_sensitivity, + + recalibration_annotation_values = indel_recalibration_annotation_values, + + known_reference_variants = indel_known_reference_variants, + known_reference_variants_index = indel_known_reference_variants_index, + known_reference_variants_identifier = indel_known_reference_variants_identifier, + is_training = indel_is_training, + is_calibration = indel_is_calibration, + } + + # Now we need to annotate our variants by region: + if (defined(annotation_bed_files)) { + call VARUTIL.AnnotateVcfWithBedRegions as AnnotateVcfRegions { + input: + vcf = ScoreIndelVariantAnnotations.scored_vcf, + vcf_index = ScoreIndelVariantAnnotations.scored_vcf_index, + bed_files = select_first([annotation_bed_files]), + bed_file_indexes = select_first([annotation_bed_file_indexes]), + bed_file_annotation_names = select_first([annotation_bed_file_annotation_names]), + prefix = basename(basename(ScoreIndelVariantAnnotations.scored_vcf, ".vcf.gz"), ".vcf") + ".region_annotated", + } + } + + File vcf_for_merging = select_first([AnnotateVcfRegions.annotated_vcf, ScoreIndelVariantAnnotations.scored_vcf]) + File vcf_index_for_merging = select_first([AnnotateVcfRegions.annotated_vcf_index, ScoreIndelVariantAnnotations.scored_vcf_index]) + } + + # Consolidate files: + call VARUTIL.GatherVcfs as GatherRescoredVcfs { + input: + input_vcfs = vcf_for_merging, + input_vcf_indices = vcf_index_for_merging, + prefix = prefix + ".rescored.combined" + } + + ################################ + # Finalize the regular output files: + ############ + 
File keyfile = GatherRescoredVcfs.output_vcf_index + + call FF.FinalizeToFile as FinalizeVETSVCF { input: outdir = outdir, keyfile = keyfile, file = GatherRescoredVcfs.output_vcf } + call FF.FinalizeToFile as FinalizeVETSTBI { input: outdir = outdir, keyfile = keyfile, file = GatherRescoredVcfs.output_vcf_index } + + + output { + File joint_recalibrated_vcf = FinalizeVETSVCF.gcs_path + File joint_recalibrated_vcf_tbi = FinalizeVETSTBI.gcs_path + } +} + diff --git a/wdl/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR.wdl b/wdl/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR.wdl new file mode 100644 index 000000000..3668dec49 --- /dev/null +++ b/wdl/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR.wdl @@ -0,0 +1,189 @@ +version 1.0 + +############################################################################################################# +## A workflow that performs joint calling on single-sample gVCFs from GATK4 HaplotypeCaller using GenomicsDB. +############################################################################################################# + +import "tasks/SRJointGenotyping.wdl" as SRJOINT +import "tasks/VariantUtils.wdl" as VARUTIL +import "tasks/Utils.wdl" as UTILS +import "tasks/Finalize.wdl" as FF +import "tasks/Pf_Niare_HaplotypeCaller.wdl" as Niare_HC + +workflow SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR { + input { + Array[File] gvcfs + Array[File] gvcf_indices + + File ref_map_file + + File vqsr_sites_vcf + File vqsr_sites_vcf_index + + String prefix + + String gcs_out_root_dir + } + + parameter_meta { + gvcfs: "GCS paths to gVCF files" + gvcf_indices: "GCS paths to gVCF tbi files" + ref_map_file: "table indicating reference sequence and auxillary file locations" + prefix: "prefix for output joint-called gVCF and tabix index" + gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files" + } + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/SRJointCallGVCFsWithGenomicsDB_Pf_Niare_VQSR/~{prefix}" + + Map[String, String] 
ref_map = read_map(ref_map_file) + + # Create interval list over which to shard the processing: + call UTILS.MakeChrIntervalList as MakeChrIntervalList { + input: + ref_dict = ref_map['dict'], + } + + # Create sample-name map: + call SRJOINT.CreateSampleNameMap as CreateSampleNameMap { + input: + gvcfs = gvcfs, + prefix = prefix + } + + # Shard by contig for speed: + scatter (idx_1 in range(length(MakeChrIntervalList.contig_interval_list_files))) { + + String contig = MakeChrIntervalList.chrs[idx_1][0] + File contig_interval_list = MakeChrIntervalList.contig_interval_list_files[idx_1] + + call Niare_HC.GenomicsDbImport as GenomicsDbImport { + input: + sample_name_map = CreateSampleNameMap.sample_name_map, + interval_list = contig_interval_list, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + prefix = prefix + "." + contig, + } + + # Shard again by contig chunk: + call UTILS.SplitContigToIntervals as SplitContigToIntervals { + input: + ref_dict = ref_map['dict'], + contig = contig + } + + scatter (idx_2 in range(length(SplitContigToIntervals.individual_bed_files))) { + + File genotype_gvcfs_intervals = SplitContigToIntervals.individual_bed_files[idx_2] + + call Niare_HC.GenotypeGVCFs as GenotypeGVCFs { + input: + input_gvcf_data = GenomicsDbImport.output_genomicsdb, + interval_list = genotype_gvcfs_intervals, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + prefix = prefix + "." + contig + ".raw", + } + } + + # Merge all raw VCFs: + call VARUTIL.GatherVcfs as GatherVcfs { + input: + input_vcfs = GenotypeGVCFs.output_vcf, + input_vcf_indices = GenotypeGVCFs.output_vcf_index, + prefix = prefix + "." 
+ contig + ".raw.merged", + } + + # Normalize variants here: + call Niare_HC.NormalizeVcfSplittingMultiallelics as NormalizeVcfSplittingMultiallelics { + input: + input_vcf = GatherVcfs.output_vcf, + input_vcf_index = GatherVcfs.output_vcf_index, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + prefix = prefix + "." + contig + ".raw.merged.normalized", + } + + # Run variant recalibrator and VQSR: + call Niare_HC.VariantRecalibratorIndel as VariantRecalibratorIndel { + input: + input_vcf = NormalizeVcfSplittingMultiallelics.output_vcf, + input_vcf_index = NormalizeVcfSplittingMultiallelics.output_vcf_index, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + sites_only_vcf = vqsr_sites_vcf, + sites_only_vcf_index = vqsr_sites_vcf_index, + prefix = prefix + "." + contig, + } + + call Niare_HC.ApplyVqsrIndel as ApplyVqsrIndel { + input: + input_vcf = NormalizeVcfSplittingMultiallelics.output_vcf, + input_vcf_index = NormalizeVcfSplittingMultiallelics.output_vcf_index, + recal_file = VariantRecalibratorIndel.recalibration, + recal_file_index = VariantRecalibratorIndel.recalibration_index, + recal_tranches = VariantRecalibratorIndel.tranches, + prefix = prefix + "." + contig + ".raw", + } + + call Niare_HC.VariantRecalibratorSnp as VariantRecalibratorSnp { + input: + input_vcf = ApplyVqsrIndel.output_vcf, + input_vcf_index = ApplyVqsrIndel.output_vcf_index, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + sites_only_vcf = vqsr_sites_vcf, + sites_only_vcf_index = vqsr_sites_vcf_index, + prefix = prefix + "." 
+ contig, + } + + # NOTE(review): this reuses the INDEL ApplyVQSR task for the SNP pass while feeding it SNP recalibration files — confirm against tasks/Pf_Niare_HaplotypeCaller.wdl whether an ApplyVqsrSnp task (SNP mode/tranches) should be called here instead. + call Niare_HC.ApplyVqsrIndel as ApplyVqsrSnp { + input: + input_vcf = ApplyVqsrIndel.output_vcf, + input_vcf_index = ApplyVqsrIndel.output_vcf_index, + recal_file = VariantRecalibratorSnp.recalibration, + recal_file_index = VariantRecalibratorSnp.recalibration_index, + recal_tranches = VariantRecalibratorSnp.tranches, + prefix = prefix + "." + contig + ".raw", + } + + # Merge multi-allelic sites after recalibration: + call Niare_HC.MergeMultiAllelicSitesPostRecalibration as MergeMultiAllelicSitesPostRecalibration { + input: + input_vcf = ApplyVqsrSnp.output_vcf, + input_vcf_index = ApplyVqsrSnp.output_vcf_index, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + prefix = prefix + "." + contig + ".recalibrated", + } + } + + # Consolidate files: + call VARUTIL.GatherVcfs as GatherRescoredVcfs { + input: + input_vcfs = MergeMultiAllelicSitesPostRecalibration.output_vcf, + input_vcf_indices = MergeMultiAllelicSitesPostRecalibration.output_vcf_index, + prefix = prefix + ".rescored.combined" + } + + ################################ + # Finalize the regular output files: + ############ + File keyfile = GatherRescoredVcfs.output_vcf_index + + call FF.FinalizeToFile as FinalizeVETSVCF { input: outdir = outdir, keyfile = keyfile, file = GatherRescoredVcfs.output_vcf } + call FF.FinalizeToFile as FinalizeVETSTBI { input: outdir = outdir, keyfile = keyfile, file = GatherRescoredVcfs.output_vcf_index } + + + output { + File joint_recalibrated_vcf = FinalizeVETSVCF.gcs_path + File joint_recalibrated_vcf_tbi = FinalizeVETSTBI.gcs_path + } +} + diff --git a/wdl/SRWholeGenome.wdl b/wdl/SRWholeGenome.wdl new file mode 100644 index 000000000..97e7141bd --- /dev/null +++ b/wdl/SRWholeGenome.wdl @@ -0,0 +1,481 @@ +version 1.0 + +###################################################################################### +## A workflow that performs single sample variant calling on Illumina reads from +## one or more flow cells. 
The workflow merges multiple samples into a single BAM +## prior to variant calling. +###################################################################################### + +import "tasks/Utils.wdl" as Utils +import "tasks/SRUtils.wdl" as SRUTIL +import "tasks/VariantUtils.wdl" as VARUTIL +import "tasks/CallVariantsIllumina.wdl" as VAR +import "tasks/HaplotypeCaller.wdl" as HC +import "tasks/AlignedMetrics.wdl" as AM +import "tasks/FastQC.wdl" as FastQC +import "tasks/Finalize.wdl" as FF +import "tasks/SampleLevelAlignedMetrics.wdl" as COV + +workflow SRWholeGenome { + input { + Array[File] aligned_bams + Array[File] aligned_bais + + File ref_map_file + + String participant_name + + String gcs_out_root_dir + + Boolean call_small_variants = true + + Boolean run_HC_analysis = true + Boolean run_dv_pepper_analysis = true + + Boolean enable_hc_pileup_mode = true + + Int dvp_threads = 32 + Int dvp_memory = 128 + + Int ploidy = 2 + + Float heterozygosity = 0.001 + Float heterozygosity_stdev = 0.01 + Float indel_heterozygosity = 0.000125 + + + Float snp_calibration_sensitivity = 0.99 + Int snp_max_unlabeled_variants = 0 + Array[String] snp_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ] + + Array[File] snp_known_reference_variants + Array[File] snp_known_reference_variants_index + Array[File] snp_known_reference_variants_identifier + Array[Boolean] snp_is_training + Array[Boolean] snp_is_calibration + + Float indel_calibration_sensitivity = 0.99 + Int indel_max_unlabeled_variants = 0 + Array[String] indel_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ] + + Array[File] indel_known_reference_variants + Array[File] indel_known_reference_variants_index + Array[File] indel_known_reference_variants_identifier + Array[Boolean] indel_is_training + Array[Boolean] 
indel_is_calibration + + File? bed_to_compute_coverage + + File? fingerprint_haploytpe_db_file + + Array[String] contigs_names_to_ignore = ["RANDOM_PLACEHOLDER_VALUE"] ## Required for ignoring any filtering - this is kind of a hack - TODO: fix the task! + } + + Map[String, String] ref_map = read_map(ref_map_file) + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/SRWholeGenome/~{participant_name}" + + String bam_dir = outdir + "/alignments" + String metrics_dir = outdir + "/metrics" + String smalldir = outdir + "/variants/small" + String recalibration_dir = outdir + "/variants/recalibration_files" + + # gather across (potential multiple) input CCS BAMs + if (length(aligned_bams) > 1) { + scatter (pair in zip(aligned_bams, aligned_bais)) { + call Utils.InferSampleName {input: bam = pair.left, bai = pair.right} + } + call Utils.CheckOnSamplenames {input: sample_names = InferSampleName.sample_name} + + call Utils.MergeBams as MergeAllReads { input: bams = aligned_bams, prefix = participant_name } + } + + File bam = select_first([MergeAllReads.merged_bam, aligned_bams[0]]) + File bai = select_first([MergeAllReads.merged_bai, aligned_bais[0]]) + + # Collect sample-level metrics: + call AM.SamStatsMap as SamStats { input: bam = bam } + call FastQC.FastQC as FastQC { input: bam = bam, bai = bai } + call Utils.ComputeGenomeLength as ComputeGenomeLength { input: fasta = ref_map['fasta'] } + call SRUTIL.ComputeBamStats as ComputeBamStats { input: bam_file = bam } + + if (defined(bed_to_compute_coverage)) { + call AM.MosDepthOverBed as MosDepth { + input: + bam = bam, + bai = bai, + bed = select_first([bed_to_compute_coverage]) + } + + call COV.SummarizeDepthOverWholeBed as RegionalCoverage { + input: + mosdepth_output = MosDepth.regions + } + } + + call FF.FinalizeToFile as FinalizeBam { input: outdir = bam_dir, file = bam, name = "~{participant_name}.bam" } + call FF.FinalizeToFile as FinalizeBai { input: outdir = bam_dir, file = bai, name = 
"~{participant_name}.bam.bai" } + + if (defined(bed_to_compute_coverage)) { call FF.FinalizeToFile as FinalizeRegionalCoverage { input: outdir = bam_dir, file = select_first([RegionalCoverage.cov_summary]) } } + + + call FF.FinalizeToFile as FinalizeFastQCReport { + input: + outdir = metrics_dir, + file = FastQC.report + } + + + #################################################################################################### + + # Some input handling: + if ((!run_dv_pepper_analysis) && (!run_HC_analysis)) { + call Utils.StopWorkflow as short_variant_caller_analysis_not_provided { + input: reason = "One of the following must be set to true: run_dv_pepper_analysis(~{run_dv_pepper_analysis}), run_HC_analysis(~{run_HC_analysis})" + } + } + + # Handle DeepVariant First: + if (run_dv_pepper_analysis) { + + # Deep Variant runs better with raw base quals because it has already learned the error modes. + # We need to revert our recalibration before calling variants: + call SRUTIL.RevertBaseQualities as RevertBQSRQuals { + input: + bam = bam, + bai = bai, + prefix = basename(bam, ".bam") + ".reverted_base_quals" + } + + call VAR.CallVariants as CallVariantsWithDeepVariant { + input: + bam = RevertBQSRQuals.bam_out, + bai = RevertBQSRQuals.bai_out, + sample_id = participant_name, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + + prefix = participant_name + ".deep_variant", + + call_small_variants = call_small_variants, + + run_dv_pepper_analysis = run_dv_pepper_analysis, + dvp_threads = dvp_threads, + dvp_memory = dvp_memory, + + mito_contig = ref_map['mt_chr_name'], + contigs_names_to_ignore = contigs_names_to_ignore, + } + + call FF.FinalizeToFile as FinalizeDVPepperVcf { input: outdir = smalldir, file = select_first([CallVariantsWithDeepVariant.dvp_vcf]) } + call FF.FinalizeToFile as FinalizeDVPepperTbi { input: outdir = smalldir, file = select_first([CallVariantsWithDeepVariant.dvp_tbi]) } + call FF.FinalizeToFile as 
FinalizeDVPepperGVcf { input: outdir = smalldir, file = select_first([CallVariantsWithDeepVariant.dvp_g_vcf]) } + call FF.FinalizeToFile as FinalizeDVPepperGTbi { input: outdir = smalldir, file = select_first([CallVariantsWithDeepVariant.dvp_g_tbi]) } + } + + # Now we handle HaplotypeCaller data: + if (run_HC_analysis) { + call HC.CallVariantsWithHaplotypeCaller { + input: + bam = bam, + bai = bai, + sample_id = participant_name, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + dbsnp_vcf = ref_map["known_sites_vcf"], + + ploidy = ploidy, + heterozygosity = heterozygosity, + heterozygosity_stdev = heterozygosity_stdev, + indel_heterozygosity = indel_heterozygosity, + + prefix = participant_name + ".haplotype_caller", + + enable_pileup_mode = enable_hc_pileup_mode, + + mito_contig = ref_map['mt_chr_name'], + contigs_names_to_ignore = contigs_names_to_ignore, + } + + # Make sure our sample name is correct: + call VARUTIL.RenameSingleSampleVcf as RenameRawHcVcf { + input: + vcf = CallVariantsWithHaplotypeCaller.output_vcf, + vcf_index = CallVariantsWithHaplotypeCaller.output_vcf_index, + prefix = participant_name + ".haplotype_caller.renamed", + new_sample_name = participant_name + } + call VARUTIL.RenameSingleSampleVcf as RenameRawHcGvcf { + input: + vcf = CallVariantsWithHaplotypeCaller.output_gvcf, + vcf_index = CallVariantsWithHaplotypeCaller.output_gvcf_index, + prefix = participant_name + ".haplotype_caller.renamed", + is_gvcf = true, + new_sample_name = participant_name + } + + ######################################################################## + # Call VETS / VQSR-lite: + call VARUTIL.ExtractVariantAnnotations as ExtractIndelVariantAnnotations { + input: + vcf = RenameRawHcVcf.new_sample_name_vcf, + vcf_index = RenameRawHcVcf.new_sample_name_vcf_index, + + prefix = participant_name, + mode = "INDEL", + + recalibration_annotation_values = indel_recalibration_annotation_values, + + known_reference_variants = 
indel_known_reference_variants, + known_reference_variants_index = indel_known_reference_variants_index, + known_reference_variants_identifier = indel_known_reference_variants_identifier, + is_training = indel_is_training, + is_calibration = indel_is_calibration, + + max_unlabeled_variants = indel_max_unlabeled_variants, + } + + call VARUTIL.ExtractVariantAnnotations as ExtractSnpVariantAnnotations { + input: + vcf = RenameRawHcVcf.new_sample_name_vcf, + vcf_index = RenameRawHcVcf.new_sample_name_vcf_index, + + prefix = participant_name, + mode = "SNP", + + recalibration_annotation_values = snp_recalibration_annotation_values, + + known_reference_variants = snp_known_reference_variants, + known_reference_variants_index = snp_known_reference_variants_index, + known_reference_variants_identifier = snp_known_reference_variants_identifier, + is_training = snp_is_training, + is_calibration = snp_is_calibration, + + max_unlabeled_variants = snp_max_unlabeled_variants, + } + + call VARUTIL.TrainVariantAnnotationsModel as TrainIndelVariantAnnotationsModel { + input: + annotation_hdf5 = ExtractIndelVariantAnnotations.annotation_hdf5, + mode = "INDEL", + prefix = participant_name, + } + + call VARUTIL.TrainVariantAnnotationsModel as TrainSnpVariantAnnotationsModel { + input: + annotation_hdf5 = ExtractSnpVariantAnnotations.annotation_hdf5, + mode = "SNP", + prefix = participant_name, + } + + call VARUTIL.ScoreVariantAnnotations as ScoreSnpVariantAnnotations { + input: + vcf = RenameRawHcVcf.new_sample_name_vcf, + vcf_index = RenameRawHcVcf.new_sample_name_vcf_index, + + sites_only_extracted_vcf = ExtractSnpVariantAnnotations.sites_only_vcf, + sites_only_extracted_vcf_index = ExtractSnpVariantAnnotations.sites_only_vcf_index, + + model_prefix = participant_name + "_train_SNP", + model_files = flatten([[TrainSnpVariantAnnotationsModel.training_scores, TrainSnpVariantAnnotationsModel.positive_model_scorer_pickle], select_all([ + 
TrainSnpVariantAnnotationsModel.unlabeled_positive_model_scores, + TrainSnpVariantAnnotationsModel.calibration_set_scores, + TrainSnpVariantAnnotationsModel.negative_model_scorer_pickle + ])]), + prefix = participant_name + "_SNP", + mode = "SNP", + + calibration_sensitivity_threshold = snp_calibration_sensitivity, + + recalibration_annotation_values = snp_recalibration_annotation_values, + + known_reference_variants = snp_known_reference_variants, + known_reference_variants_index = snp_known_reference_variants_index, + known_reference_variants_identifier = snp_known_reference_variants_identifier, + is_training = snp_is_training, + is_calibration = snp_is_calibration, + } + + call VARUTIL.ScoreVariantAnnotations as ScoreIndelVariantAnnotations { + input: + vcf = ScoreSnpVariantAnnotations.scored_vcf, + vcf_index = ScoreSnpVariantAnnotations.scored_vcf_index, + + sites_only_extracted_vcf = ExtractIndelVariantAnnotations.sites_only_vcf, + sites_only_extracted_vcf_index = ExtractIndelVariantAnnotations.sites_only_vcf_index, + + model_prefix = participant_name + "_train_INDEL", + model_files = flatten([[TrainIndelVariantAnnotationsModel.training_scores, TrainIndelVariantAnnotationsModel.positive_model_scorer_pickle], select_all([ + TrainIndelVariantAnnotationsModel.unlabeled_positive_model_scores, + TrainIndelVariantAnnotationsModel.calibration_set_scores, + TrainIndelVariantAnnotationsModel.negative_model_scorer_pickle + ])]), + prefix = participant_name + "_ALL", + mode = "INDEL", + + calibration_sensitivity_threshold = indel_calibration_sensitivity, + + recalibration_annotation_values = indel_recalibration_annotation_values, + + known_reference_variants = indel_known_reference_variants, + known_reference_variants_index = indel_known_reference_variants_index, + known_reference_variants_identifier = indel_known_reference_variants_identifier, + is_training = indel_is_training, + is_calibration = indel_is_calibration, + } + 
######################################################################## + + if (defined(fingerprint_haploytpe_db_file)) { + call VARUTIL.ExtractFingerprintAndBarcode as FingerprintAndBarcodeVcf { + input: + vcf = ScoreIndelVariantAnnotations.scored_vcf, + vcf_index = ScoreIndelVariantAnnotations.scored_vcf_index, + haplotype_database_file = select_first([fingerprint_haploytpe_db_file]), + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + prefix = participant_name + } + } + + call VARUTIL.SelectVariants as RemoveFilteredVariants { + input: + vcf = ScoreIndelVariantAnnotations.scored_vcf, + vcf_index = ScoreIndelVariantAnnotations.scored_vcf_index, + prefix = participant_name + ".filtered" + } + + # Create a Keyfile for finalization: + File keyfile = RemoveFilteredVariants.vcf_out_index + + # Finalize the raw Joint Calls: + call FF.FinalizeToFile as FinalizeHCVcf { input: outdir = smalldir, keyfile = keyfile, file = RenameRawHcVcf.new_sample_name_vcf } + call FF.FinalizeToFile as FinalizeHCTbi { input: outdir = smalldir, keyfile = keyfile, file = RenameRawHcVcf.new_sample_name_vcf_index } + call FF.FinalizeToFile as FinalizeHCGVcf { input: outdir = smalldir, keyfile = keyfile, file = RenameRawHcGvcf.new_sample_name_vcf } + call FF.FinalizeToFile as FinalizeHCGTbi { input: outdir = smalldir, keyfile = keyfile, file = RenameRawHcGvcf.new_sample_name_vcf_index } + call FF.FinalizeToFile as FinalizeHCBamOut { input: outdir = smalldir, keyfile = keyfile, file = CallVariantsWithHaplotypeCaller.bamout } + call FF.FinalizeToFile as FinalizeHCBaiOut { input: outdir = smalldir, keyfile = keyfile, file = CallVariantsWithHaplotypeCaller.bamout_index } + + # Finalize the recalibrated / filtered variants: + call FF.FinalizeToFile as FinalizeHCRescoredVcf { input: outdir = smalldir, keyfile = keyfile, file = ScoreIndelVariantAnnotations.scored_vcf } + call FF.FinalizeToFile as FinalizeHCRescoredTbi { input: outdir = smalldir, keyfile = 
keyfile, file = ScoreIndelVariantAnnotations.scored_vcf_index } + call FF.FinalizeToFile as FinalizeHCRescoredFilteredVcf { input: outdir = smalldir, keyfile = keyfile, file = RemoveFilteredVariants.vcf_out } + call FF.FinalizeToFile as FinalizeHCRescoredFilteredTbi { input: outdir = smalldir, keyfile = keyfile, file = RemoveFilteredVariants.vcf_out_index } + + # Finalize other outputs: + if (defined(fingerprint_haploytpe_db_file)) { + call FF.FinalizeToFile as FinalizeFingerprintVcf { input: outdir = smalldir, keyfile = keyfile, file = select_first([FingerprintAndBarcodeVcf.output_vcf]) } + } + + ################################ + # Finalize the VETS files: + ############ + + # ExtractVariantAnnotations: + call FF.FinalizeToFile as FinalizeSnpExtractedAnnotations { input: outdir = recalibration_dir, keyfile = keyfile, file = ExtractSnpVariantAnnotations.annotation_hdf5 } + call FF.FinalizeToFile as FinalizeSnpExtractedSitesOnlyVcf { input: outdir = recalibration_dir, keyfile = keyfile, file = ExtractSnpVariantAnnotations.sites_only_vcf } + call FF.FinalizeToFile as FinalizeSnpExtractedSitesOnlyVcfIndex { input: outdir = recalibration_dir, keyfile = keyfile, file = ExtractSnpVariantAnnotations.sites_only_vcf_index } + if (defined(ExtractSnpVariantAnnotations.unlabeled_annotation_hdf5)) { + call FF.FinalizeToFile as FinalizeSnpExtractedUnlabeledAnnotations { input: outdir = recalibration_dir, keyfile = keyfile, file = select_first([ExtractSnpVariantAnnotations.unlabeled_annotation_hdf5]) } + } + call FF.FinalizeToFile as FinalizeIndelExtractedAnnotations { input: outdir = recalibration_dir, keyfile = keyfile, file = ExtractIndelVariantAnnotations.annotation_hdf5 } + call FF.FinalizeToFile as FinalizeIndelExtractedSitesOnlyVcf { input: outdir = recalibration_dir, keyfile = keyfile, file = ExtractIndelVariantAnnotations.sites_only_vcf } + call FF.FinalizeToFile as FinalizeIndelExtractedSitesOnlyVcfIndex { input: outdir = recalibration_dir, keyfile = keyfile, file = 
ExtractIndelVariantAnnotations.sites_only_vcf_index } + if (defined(ExtractIndelVariantAnnotations.unlabeled_annotation_hdf5)) { + call FF.FinalizeToFile as FinalizeIndelExtractedUnlabeledAnnotations { input: outdir = recalibration_dir, keyfile = keyfile, file = select_first([ExtractIndelVariantAnnotations.unlabeled_annotation_hdf5]) } + } + + # TrainVariantAnnotationsModel + call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsTrainingScores { input: outdir = recalibration_dir, keyfile = keyfile, file = TrainSnpVariantAnnotationsModel.training_scores } + call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsPositiveModelScorer { input: outdir = recalibration_dir, keyfile = keyfile, file = TrainSnpVariantAnnotationsModel.positive_model_scorer_pickle } + if (defined(TrainSnpVariantAnnotationsModel.unlabeled_positive_model_scores)) { + call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsUnlabeledPositiveModelScores { input: outdir = recalibration_dir, keyfile = keyfile, file = select_first([TrainSnpVariantAnnotationsModel.unlabeled_positive_model_scores]) } + } + if (defined(TrainSnpVariantAnnotationsModel.calibration_set_scores)) { + call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsCalibrationSetScores { input: outdir = recalibration_dir, keyfile = keyfile, file = select_first([TrainSnpVariantAnnotationsModel.calibration_set_scores]) } + } + if (defined(TrainSnpVariantAnnotationsModel.negative_model_scorer_pickle)) { + call FF.FinalizeToFile as FinalizeSnpTrainVariantAnnotationsNegativeModelScorer { input: outdir = recalibration_dir, keyfile = keyfile, file = select_first([TrainSnpVariantAnnotationsModel.negative_model_scorer_pickle]) } + } + + call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsTrainingScores { input: outdir = recalibration_dir, keyfile = keyfile, file = TrainIndelVariantAnnotationsModel.training_scores } + call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsPositiveModelScorer { input: outdir = 
recalibration_dir, keyfile = keyfile, file = TrainIndelVariantAnnotationsModel.positive_model_scorer_pickle } + if (defined(TrainIndelVariantAnnotationsModel.unlabeled_positive_model_scores)) { + call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsUnlabeledPositiveModelScores { input: outdir = recalibration_dir, keyfile = keyfile, file = select_first([TrainIndelVariantAnnotationsModel.unlabeled_positive_model_scores]) } + } + if (defined(TrainIndelVariantAnnotationsModel.calibration_set_scores)) { + call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsCalibrationSetScores { input: outdir = recalibration_dir, keyfile = keyfile, file = select_first([TrainIndelVariantAnnotationsModel.calibration_set_scores]) } + } + if (defined(TrainIndelVariantAnnotationsModel.negative_model_scorer_pickle)) { + call FF.FinalizeToFile as FinalizeIndelTrainVariantAnnotationsNegativeModelScorer { input: outdir = recalibration_dir, keyfile = keyfile, file = select_first([TrainIndelVariantAnnotationsModel.negative_model_scorer_pickle]) } + } + + # ScoreVariantAnnotations + call FF.FinalizeToFile as FinalizeScoreSnpVariantAnnotationsScoredVcf { input: outdir = recalibration_dir, keyfile = keyfile, file = ScoreSnpVariantAnnotations.scored_vcf } + call FF.FinalizeToFile as FinalizeScoreSnpVariantAnnotationsScoredVcfIndex { input: outdir = recalibration_dir, keyfile = keyfile, file = ScoreSnpVariantAnnotations.scored_vcf_index } + if (defined(ScoreSnpVariantAnnotations.annotations_hdf5)) { + call FF.FinalizeToFile as FinalizeScoreSnpVariantAnnotationsAnnotationsHdf5 { input: outdir = recalibration_dir, keyfile = keyfile, file = select_first([ScoreSnpVariantAnnotations.annotations_hdf5]) } + } + if (defined(ScoreSnpVariantAnnotations.scores_hdf5)) { + call FF.FinalizeToFile as FinalizeScoreSnpVariantAnnotationsScoresHdf5 { input: outdir = recalibration_dir, keyfile = keyfile, file = select_first([ScoreSnpVariantAnnotations.scores_hdf5]) } + } + + call FF.FinalizeToFile as 
FinalizeScoreIndelVariantAnnotationsScoredVcf { input: outdir = recalibration_dir, keyfile = keyfile, file = ScoreIndelVariantAnnotations.scored_vcf } + call FF.FinalizeToFile as FinalizeScoreIndelVariantAnnotationsScoredVcfIndex { input: outdir = recalibration_dir, keyfile = keyfile, file = ScoreIndelVariantAnnotations.scored_vcf_index } + if (defined(ScoreIndelVariantAnnotations.annotations_hdf5)) { + call FF.FinalizeToFile as FinalizeScoreIndelVariantAnnotationsAnnotationsHdf5 { input: outdir = recalibration_dir, keyfile = keyfile, file = select_first([ScoreIndelVariantAnnotations.annotations_hdf5]) } + } + if (defined(ScoreIndelVariantAnnotations.scores_hdf5)) { + call FF.FinalizeToFile as FinalizeScoreIndelVariantAnnotationsScoresHdf5 { input: outdir = recalibration_dir, keyfile = keyfile, file = select_first([ScoreIndelVariantAnnotations.scores_hdf5]) } + } + } + + output { + File aligned_bam = FinalizeBam.gcs_path + File aligned_bai = FinalizeBai.gcs_path + + Float aligned_num_reads = FastQC.stats_map['number_of_reads'] + Float aligned_num_bases = SamStats.stats_map['bases_mapped'] + Float aligned_frac_bases = SamStats.stats_map['bases_mapped']/SamStats.stats_map['total_length'] + Float aligned_est_fold_cov = SamStats.stats_map['bases_mapped']/ComputeGenomeLength.length + + Float aligned_read_length_mean = FastQC.stats_map['read_length'] + + Float insert_size_average = SamStats.stats_map['insert_size_average'] + Float insert_size_standard_deviation = SamStats.stats_map['insert_size_standard_deviation'] + Float pct_properly_paired_reads = SamStats.stats_map['percentage_of_properly_paired_reads_%'] + + Float average_identity = 100.0 - (100.0*SamStats.stats_map['mismatches']/SamStats.stats_map['bases_mapped']) + + File fastqc_report = FinalizeFastQCReport.gcs_path + + Boolean successfully_processed = true + + File? bed_cov_summary = FinalizeRegionalCoverage.gcs_path + + File? fingerprint_vcf = FinalizeFingerprintVcf.gcs_path + String? 
barcode = FingerprintAndBarcodeVcf.barcode + + ######################################## + + File? dvp_vcf = FinalizeDVPepperVcf.gcs_path + File? dvp_tbi = FinalizeDVPepperTbi.gcs_path + File? dvp_g_vcf = FinalizeDVPepperGVcf.gcs_path + File? dvp_g_tbi = FinalizeDVPepperGTbi.gcs_path + + ######################################## + + File? hc_g_vcf = FinalizeHCGVcf.gcs_path + File? hc_g_tbi = FinalizeHCGTbi.gcs_path + File? hc_bamout = FinalizeHCBamOut.gcs_path + File? hc_baiout = FinalizeHCBaiOut.gcs_path + File? hc_raw_vcf = FinalizeHCVcf.gcs_path + File? hc_raw_tbi = FinalizeHCTbi.gcs_path + File? hc_rescored_vcf = FinalizeHCRescoredVcf.gcs_path + File? hc_rescored_tbi = FinalizeHCRescoredTbi.gcs_path + File? hc_rescored_filtered_vcf = FinalizeHCRescoredFilteredVcf.gcs_path + File? hc_rescored_filtered_tbi = FinalizeHCRescoredFilteredTbi.gcs_path + } +} diff --git a/wdl/SRWholeGenome_Pf_Niare_VETS.wdl b/wdl/SRWholeGenome_Pf_Niare_VETS.wdl new file mode 100644 index 000000000..1b6fb34ce --- /dev/null +++ b/wdl/SRWholeGenome_Pf_Niare_VETS.wdl @@ -0,0 +1,262 @@ +version 1.0 + +###################################################################################### +## A workflow that performs single sample variant calling on Illumina reads from +## one or more flow cells. The workflow merges multiple samples into a single BAM +## prior to variant calling. 
+###################################################################################### + +import "tasks/Utils.wdl" as Utils +import "tasks/SRUtils.wdl" as SRUTIL +import "tasks/Finalize.wdl" as FF +import "tasks/VariantUtils.wdl" as VARUTIL +import "tasks/Pf_Niare_HaplotypeCaller.wdl" as Niare_HC + + +workflow SRWholeGenome_Pf_Niare_VETS { + input { + Array[File] aligned_bams + Array[File] aligned_bais + + File ref_map_file + + String participant_name + + String gcs_out_root_dir + + File vcf_calling_interval_list + File genotype_gvcfs_intervals + + Float snp_calibration_sensitivity = 0.99 + Int snp_max_unlabeled_variants = 0 + Array[String] snp_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ] + + Array[File] snp_known_reference_variants + Array[File] snp_known_reference_variants_index + Array[File] snp_known_reference_variants_identifier + Array[Boolean] snp_is_training + Array[Boolean] snp_is_calibration + + Float indel_calibration_sensitivity = 0.99 + Int indel_max_unlabeled_variants = 0 + Array[String] indel_recalibration_annotation_values = [ "BaseQRankSum", "ExcessHet", "FS", "HAPCOMP", "HAPDOM", "HEC", "MQ", "MQRankSum", "QD", "ReadPosRankSum", "SOR", "DP" ] + + Array[File] indel_known_reference_variants + Array[File] indel_known_reference_variants_index + Array[File] indel_known_reference_variants_identifier + Array[Boolean] indel_is_training + Array[Boolean] indel_is_calibration + + File? bed_to_compute_coverage + + Array[String] contigs_names_to_ignore = ["RANDOM_PLACEHOLDER_VALUE"] ## Required for ignoring any filtering - this is kind of a hack - TODO: fix the task! 
+ } + + Map[String, String] ref_map = read_map(ref_map_file) + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/SRWholeGenome_Pf_Niare_VETS/~{participant_name}" + + String bam_dir = outdir + "/alignments" + String metrics_dir = outdir + "/metrics" + String smalldir = outdir + "/variants/small" + String recalibration_dir = outdir + "/variants/recalibration_files" + + # gather across (potential multiple) input CCS BAMs + if (length(aligned_bams) > 1) { + scatter (pair in zip(aligned_bams, aligned_bais)) { + call Utils.InferSampleName {input: bam = pair.left, bai = pair.right} + } + call Utils.CheckOnSamplenames {input: sample_names = InferSampleName.sample_name} + + call Utils.MergeBams as MergeAllReads { input: bams = aligned_bams, prefix = participant_name } + } + + File bam = select_first([MergeAllReads.merged_bam, aligned_bams[0]]) + File bai = select_first([MergeAllReads.merged_bai, aligned_bais[0]]) + + #################################################################################################### + # HC Call Variants: + + # Now we handle HaplotypeCaller data: + call Niare_HC.CallVariantsWithHaplotypeCaller { + input: + bam = bam, + bai = bai, + sample_id = participant_name, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + + genotype_gvcfs_intervals = genotype_gvcfs_intervals, + + prefix = participant_name + ".haplotype_caller", + + mito_contig = ref_map['mt_chr_name'], + contigs_names_to_ignore = contigs_names_to_ignore, + } + + # Make sure our sample name is correct: + call VARUTIL.RenameSingleSampleVcf as RenameRawHcVcf { + input: + vcf = CallVariantsWithHaplotypeCaller.output_vcf, + vcf_index = CallVariantsWithHaplotypeCaller.output_vcf_index, + prefix = participant_name + ".haplotype_caller.renamed", + new_sample_name = participant_name + } + call VARUTIL.RenameSingleSampleVcf as RenameRawHcGvcf { + input: + vcf = CallVariantsWithHaplotypeCaller.output_gvcf, + vcf_index = 
CallVariantsWithHaplotypeCaller.output_gvcf_index, + prefix = participant_name + ".haplotype_caller.renamed", + is_gvcf = true, + new_sample_name = participant_name + } + + ######################################################################## + # Call VETS / VQSR-lite: + call VARUTIL.ExtractVariantAnnotations as ExtractIndelVariantAnnotations { + input: + vcf = RenameRawHcVcf.new_sample_name_vcf, + vcf_index = RenameRawHcVcf.new_sample_name_vcf_index, + + prefix = participant_name, + mode = "INDEL", + + recalibration_annotation_values = indel_recalibration_annotation_values, + + known_reference_variants = indel_known_reference_variants, + known_reference_variants_index = indel_known_reference_variants_index, + known_reference_variants_identifier = indel_known_reference_variants_identifier, + is_training = indel_is_training, + is_calibration = indel_is_calibration, + + max_unlabeled_variants = indel_max_unlabeled_variants, + } + + call VARUTIL.ExtractVariantAnnotations as ExtractSnpVariantAnnotations { + input: + vcf = RenameRawHcVcf.new_sample_name_vcf, + vcf_index = RenameRawHcVcf.new_sample_name_vcf_index, + + prefix = participant_name, + mode = "SNP", + + recalibration_annotation_values = snp_recalibration_annotation_values, + + known_reference_variants = snp_known_reference_variants, + known_reference_variants_index = snp_known_reference_variants_index, + known_reference_variants_identifier = snp_known_reference_variants_identifier, + is_training = snp_is_training, + is_calibration = snp_is_calibration, + + max_unlabeled_variants = snp_max_unlabeled_variants, + } + + call VARUTIL.TrainVariantAnnotationsModel as TrainIndelVariantAnnotationsModel { + input: + annotation_hdf5 = ExtractIndelVariantAnnotations.annotation_hdf5, + mode = "INDEL", + prefix = participant_name, + } + + call VARUTIL.TrainVariantAnnotationsModel as TrainSnpVariantAnnotationsModel { + input: + annotation_hdf5 = ExtractSnpVariantAnnotations.annotation_hdf5, + mode = "SNP", + prefix = 
participant_name, + } + + call VARUTIL.ScoreVariantAnnotations as ScoreSnpVariantAnnotations { + input: + vcf = RenameRawHcVcf.new_sample_name_vcf, + vcf_index = RenameRawHcVcf.new_sample_name_vcf_index, + + sites_only_extracted_vcf = ExtractSnpVariantAnnotations.sites_only_vcf, + sites_only_extracted_vcf_index = ExtractSnpVariantAnnotations.sites_only_vcf_index, + + model_prefix = participant_name + "_train_SNP", + model_files = flatten([[TrainSnpVariantAnnotationsModel.training_scores, TrainSnpVariantAnnotationsModel.positive_model_scorer_pickle], select_all([ + TrainSnpVariantAnnotationsModel.unlabeled_positive_model_scores, + TrainSnpVariantAnnotationsModel.calibration_set_scores, + TrainSnpVariantAnnotationsModel.negative_model_scorer_pickle + ])]), + prefix = participant_name + "_SNP", + mode = "SNP", + + calibration_sensitivity_threshold = snp_calibration_sensitivity, + + recalibration_annotation_values = snp_recalibration_annotation_values, + + known_reference_variants = snp_known_reference_variants, + known_reference_variants_index = snp_known_reference_variants_index, + known_reference_variants_identifier = snp_known_reference_variants_identifier, + is_training = snp_is_training, + is_calibration = snp_is_calibration, + } + + call VARUTIL.ScoreVariantAnnotations as ScoreIndelVariantAnnotations { + input: + vcf = ScoreSnpVariantAnnotations.scored_vcf, + vcf_index = ScoreSnpVariantAnnotations.scored_vcf_index, + + sites_only_extracted_vcf = ExtractIndelVariantAnnotations.sites_only_vcf, + sites_only_extracted_vcf_index = ExtractIndelVariantAnnotations.sites_only_vcf_index, + + model_prefix = participant_name + "_train_INDEL", + model_files = flatten([[TrainIndelVariantAnnotationsModel.training_scores, TrainIndelVariantAnnotationsModel.positive_model_scorer_pickle], select_all([ + TrainIndelVariantAnnotationsModel.unlabeled_positive_model_scores, + TrainIndelVariantAnnotationsModel.calibration_set_scores, + 
TrainIndelVariantAnnotationsModel.negative_model_scorer_pickle + ])]), + prefix = participant_name + "_ALL", + mode = "INDEL", + + calibration_sensitivity_threshold = indel_calibration_sensitivity, + + recalibration_annotation_values = indel_recalibration_annotation_values, + + known_reference_variants = indel_known_reference_variants, + known_reference_variants_index = indel_known_reference_variants_index, + known_reference_variants_identifier = indel_known_reference_variants_identifier, + is_training = indel_is_training, + is_calibration = indel_is_calibration, + } + ######################################################################## + + call VARUTIL.SelectVariants as RemoveFilteredVariants { + input: + vcf = ScoreIndelVariantAnnotations.scored_vcf, + vcf_index = ScoreIndelVariantAnnotations.scored_vcf_index, + prefix = participant_name + ".filtered" + } + + # Create a Keyfile for finalization: + File keyfile = RemoveFilteredVariants.vcf_out_index + + # Finalize the raw Joint Calls: + call FF.FinalizeToFile as FinalizeRawHCVcf { input: outdir = smalldir, keyfile = keyfile, file = RenameRawHcVcf.new_sample_name_vcf } + call FF.FinalizeToFile as FinalizeRawHCTbi { input: outdir = smalldir, keyfile = keyfile, file = RenameRawHcVcf.new_sample_name_vcf_index } + call FF.FinalizeToFile as FinalizeHCGVcf { input: outdir = smalldir, keyfile = keyfile, file = RenameRawHcGvcf.new_sample_name_vcf } + call FF.FinalizeToFile as FinalizeHCGTbi { input: outdir = smalldir, keyfile = keyfile, file = RenameRawHcGvcf.new_sample_name_vcf_index } + call FF.FinalizeToFile as FinalizeHCBamOut { input: outdir = smalldir, keyfile = keyfile, file = CallVariantsWithHaplotypeCaller.bamout } + call FF.FinalizeToFile as FinalizeHCBaiOut { input: outdir = smalldir, keyfile = keyfile, file = CallVariantsWithHaplotypeCaller.bamout_index } + + # Finalize the recalibrated / filtered variants: + call FF.FinalizeToFile as FinalizeHCRescoredVcf { input: outdir = smalldir, keyfile = keyfile, file 
= ScoreIndelVariantAnnotations.scored_vcf } + call FF.FinalizeToFile as FinalizeHCRescoredTbi { input: outdir = smalldir, keyfile = keyfile, file = ScoreIndelVariantAnnotations.scored_vcf_index } + + output { + Boolean successfully_processed = true + + ######################################## + + File? hc_g_vcf = FinalizeHCGVcf.gcs_path + File? hc_g_tbi = FinalizeHCGTbi.gcs_path + File? hc_bamout = FinalizeHCBamOut.gcs_path + File? hc_baiout = FinalizeHCBaiOut.gcs_path + File? hc_raw_vcf = FinalizeRawHCVcf.gcs_path + File? hc_raw_tbi = FinalizeRawHCTbi.gcs_path + File? hc_rescored_vcf = FinalizeHCRescoredVcf.gcs_path + File? hc_rescored_tbi = FinalizeHCRescoredTbi.gcs_path + } +} diff --git a/wdl/SRWholeGenome_Pf_Niare_VQSR.wdl b/wdl/SRWholeGenome_Pf_Niare_VQSR.wdl new file mode 100644 index 000000000..3b9c96d16 --- /dev/null +++ b/wdl/SRWholeGenome_Pf_Niare_VQSR.wdl @@ -0,0 +1,289 @@ +version 1.0 + +###################################################################################### +## A workflow that performs single sample variant calling on Illumina reads from +## one or more flow cells. The workflow merges multiple samples into a single BAM +## prior to variant calling. 
+###################################################################################### + +import "tasks/Utils.wdl" as Utils +import "tasks/SRUtils.wdl" as SRUTIL +import "tasks/Finalize.wdl" as FF +import "tasks/VariantUtils.wdl" as VARUTIL +import "tasks/Pf_Niare_HaplotypeCaller.wdl" as Niare_HC + +workflow SRWholeGenome_Pf_Niare_VQSR { + input { + Array[File] aligned_bams + Array[File] aligned_bais + + File ref_map_file + + String participant_name + + File vcf_calling_interval_list + File genotype_gvcfs_intervals + + String gcs_out_root_dir + + File vqsr_sites_vcf + File vqsr_sites_vcf_index + + Boolean call_vars_on_mitochondria = false + String mito_contig = "Pf3D7_MIT_v3" + Array[String] contigs_names_to_ignore = ["Pf3D7_API_v3"] ## Required for ignoring any filtering - this is kind of a hack - TODO: fix the task! + } + + Map[String, String] ref_map = read_map(ref_map_file) + + String outdir = sub(gcs_out_root_dir, "/$", "") + "/SRWholeGenome_Pf_Niare_VQSR/~{participant_name}" + + String bam_dir = outdir + "/alignments" + String metrics_dir = outdir + "/metrics" + String smalldir = outdir + "/variants/small" + String recalibration_dir = outdir + "/variants/recalibration_files" + + # gather across (potential multiple) input CCS BAMs + if (length(aligned_bams) > 1) { + scatter (pair in zip(aligned_bams, aligned_bais)) { + call Utils.InferSampleName {input: bam = pair.left, bai = pair.right} + } + call Utils.CheckOnSamplenames {input: sample_names = InferSampleName.sample_name} + + call Utils.MergeBams as MergeAllReads { input: bams = aligned_bams, prefix = participant_name } + } + + File bam = select_first([MergeAllReads.merged_bam, aligned_bams[0]]) + File bai = select_first([MergeAllReads.merged_bai, aligned_bais[0]]) + + #################################################################################################### + # HC Call Variants: + + # Now we handle HaplotypeCaller data: + call Niare_HC.CallVariantsWithHaplotypeCaller { + input: + bam = bam, + bai 
= bai, + sample_id = participant_name, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + + genotype_gvcfs_intervals = genotype_gvcfs_intervals, + + prefix = participant_name + ".haplotype_caller", + + mito_contig = ref_map['mt_chr_name'], + contigs_names_to_ignore = contigs_names_to_ignore, + } + + # Make sure our sample name is correct: + call VARUTIL.RenameSingleSampleVcf as RenameRawHcVcf { + input: + vcf = CallVariantsWithHaplotypeCaller.output_vcf, + vcf_index = CallVariantsWithHaplotypeCaller.output_vcf_index, + prefix = participant_name + ".haplotype_caller.renamed", + new_sample_name = participant_name + } + call VARUTIL.RenameSingleSampleVcf as RenameRawHcGvcf { + input: + vcf = CallVariantsWithHaplotypeCaller.output_gvcf, + vcf_index = CallVariantsWithHaplotypeCaller.output_gvcf_index, + prefix = participant_name + ".haplotype_caller.renamed", + is_gvcf = true, + new_sample_name = participant_name + } + + ################################################################################################ + # VQSR: + + # Scatter by chromosome: + Array[String] use_filter = if (call_vars_on_mitochondria) then contigs_names_to_ignore else flatten([[mito_contig], contigs_names_to_ignore]) + call Utils.MakeChrIntervalList as SmallVariantsScatterPrep { + input: + ref_dict = ref_map['dict'], + filter = use_filter + } + +# # Call over the scattered intervals: +# scatter (c in SmallVariantsScatterPrep.chrs) { +# String contig_for_small_var = c[0] +# +# call VARUTIL.SubsetVCF as GetHcCallsForContig { +# input: +# vcf_gz = RenameRawHcVcf.new_sample_name_vcf, +# vcf_tbi = RenameRawHcVcf.new_sample_name_vcf_index, +# locus = contig_for_small_var, +# prefix = participant_name + "." 
+ contig_for_small_var, +# } +# +# call Niare_HC.NormalizeVcfSplittingMultiallelics as NormalizeVcfPreVqsr { +# input: +# input_vcf = GetHcCallsForContig.subset_vcf, +# input_vcf_index = GetHcCallsForContig.subset_tbi, +# ref_fasta = ref_map['fasta'], +# ref_fasta_fai = ref_map['fai'], +# ref_dict = ref_map['dict'], +# prefix = participant_name + "." + contig_for_small_var + ".norm" +# } +# +# call Niare_HC.VariantRecalibratorIndel as VariantRecalibratorIndel { +# input: +# input_vcf = NormalizeVcfPreVqsr.output_vcf, +# input_vcf_index = NormalizeVcfPreVqsr.output_vcf_index, +# ref_fasta = ref_map['fasta'], +# ref_fasta_fai = ref_map['fai'], +# ref_dict = ref_map['dict'], +# sites_only_vcf = vqsr_sites_vcf, +# sites_only_vcf_index = vqsr_sites_vcf_index, +# prefix = participant_name + "." + contig_for_small_var + ".norm", +# } +# +# call Niare_HC.ApplyVqsrIndel as ApplyVqsrIndel { +# input: +# input_vcf = NormalizeVcfPreVqsr.output_vcf, +# input_vcf_index = NormalizeVcfPreVqsr.output_vcf_index, +# recal_file = VariantRecalibratorIndel.recalibration, +# recal_file_index = VariantRecalibratorIndel.recalibration_index, +# recal_tranches = VariantRecalibratorIndel.tranches, +# prefix = participant_name + "." + contig_for_small_var + ".norm", +# } +# +# call Niare_HC.VariantRecalibratorSnp as VariantRecalibratorSnp { +# input: +# input_vcf = ApplyVqsrIndel.output_vcf, +# input_vcf_index = ApplyVqsrIndel.output_vcf_index, +# ref_fasta = ref_map['fasta'], +# ref_fasta_fai = ref_map['fai'], +# ref_dict = ref_map['dict'], +# sites_only_vcf = vqsr_sites_vcf, +# sites_only_vcf_index = vqsr_sites_vcf_index, +# prefix = participant_name + "." 
+ contig_for_small_var + ".norm", +# } +# +# call Niare_HC.ApplyVqsrSnp as ApplyVqsrSnp { +# input: +# input_vcf = ApplyVqsrIndel.output_vcf, +# input_vcf_index = ApplyVqsrIndel.output_vcf_index, +# recal_file = VariantRecalibratorSnp.recalibration, +# recal_file_index = VariantRecalibratorSnp.recalibration_index, +# recal_tranches = VariantRecalibratorSnp.tranches, +# prefix = participant_name + "." + contig_for_small_var + ".norm.indel_recal", +# } +# +# call Niare_HC.MergeMultiAllelicSitesPostRecalibration as MergeMultiAllelicSitesPostRecalibration { +# input: +# input_vcf = ApplyVqsrSnp.output_vcf, +# input_vcf_index = ApplyVqsrSnp.output_vcf_index, +# ref_fasta = ref_map['fasta'], +# ref_fasta_fai = ref_map['fai'], +# ref_dict = ref_map['dict'], +# prefix = participant_name + "." + contig_for_small_var, +# } +# } +# +# call SRUTIL.MergeVCFs as MergeVCFs { +# input: +# input_vcfs = MergeMultiAllelicSitesPostRecalibration.output_vcf, +# input_vcfs_indexes = MergeMultiAllelicSitesPostRecalibration.output_vcf_index, +# prefix = participant_name + ".recalibrated" +# } + + call Niare_HC.NormalizeVcfSplittingMultiallelics as NormalizeVcfPreVqsr { + input: + input_vcf = RenameRawHcVcf.new_sample_name_vcf, + input_vcf_index = RenameRawHcVcf.new_sample_name_vcf_index, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + prefix = participant_name + ".norm" + } + + call Niare_HC.VariantRecalibratorIndel as VariantRecalibratorIndel { + input: + input_vcf = NormalizeVcfPreVqsr.output_vcf, + input_vcf_index = NormalizeVcfPreVqsr.output_vcf_index, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + sites_only_vcf = vqsr_sites_vcf, + sites_only_vcf_index = vqsr_sites_vcf_index, + prefix = participant_name + ".norm", + } + + call Niare_HC.ApplyVqsrIndel as ApplyVqsrIndel { + input: + input_vcf = NormalizeVcfPreVqsr.output_vcf, + input_vcf_index = NormalizeVcfPreVqsr.output_vcf_index, + 
recal_file = VariantRecalibratorIndel.recalibration, + recal_file_index = VariantRecalibratorIndel.recalibration_index, + recal_tranches = VariantRecalibratorIndel.tranches, + prefix = participant_name + ".norm", + } + + call Niare_HC.VariantRecalibratorSnp as VariantRecalibratorSnp { + input: + input_vcf = ApplyVqsrIndel.output_vcf, + input_vcf_index = ApplyVqsrIndel.output_vcf_index, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + sites_only_vcf = vqsr_sites_vcf, + sites_only_vcf_index = vqsr_sites_vcf_index, + prefix = participant_name + ".norm", + } + + call Niare_HC.ApplyVqsrSnp as ApplyVqsrSnp { + input: + input_vcf = ApplyVqsrIndel.output_vcf, + input_vcf_index = ApplyVqsrIndel.output_vcf_index, + recal_file = VariantRecalibratorSnp.recalibration, + recal_file_index = VariantRecalibratorSnp.recalibration_index, + recal_tranches = VariantRecalibratorSnp.tranches, + prefix = participant_name + ".norm.indel_recal", + } + + call Niare_HC.MergeMultiAllelicSitesPostRecalibration as MergeMultiAllelicSitesPostRecalibration { + input: + input_vcf = ApplyVqsrSnp.output_vcf, + input_vcf_index = ApplyVqsrSnp.output_vcf_index, + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map['fai'], + ref_dict = ref_map['dict'], + prefix = participant_name + ".recalibrated", + } + + ################################################################################################ + + # Create a Keyfile for finalization: + File keyfile = MergeMultiAllelicSitesPostRecalibration.output_vcf_index + + # Finalize the raw Joint Calls: + call FF.FinalizeToFile as FinalizeRawHCVcf { input: outdir = smalldir, keyfile = keyfile, file = RenameRawHcVcf.new_sample_name_vcf } + call FF.FinalizeToFile as FinalizeRawHCTbi { input: outdir = smalldir, keyfile = keyfile, file = RenameRawHcVcf.new_sample_name_vcf_index } + call FF.FinalizeToFile as FinalizeHCGVcf { input: outdir = smalldir, keyfile = keyfile, file = RenameRawHcGvcf.new_sample_name_vcf 
} + call FF.FinalizeToFile as FinalizeHCGTbi { input: outdir = smalldir, keyfile = keyfile, file = RenameRawHcGvcf.new_sample_name_vcf_index } + call FF.FinalizeToFile as FinalizeHCBamOut { input: outdir = smalldir, keyfile = keyfile, file = CallVariantsWithHaplotypeCaller.bamout } + call FF.FinalizeToFile as FinalizeHCBaiOut { input: outdir = smalldir, keyfile = keyfile, file = CallVariantsWithHaplotypeCaller.bamout_index } + call FF.FinalizeToFile as FinalizeRecalibratedVcf { input: outdir = smalldir, keyfile = keyfile, file = MergeMultiAllelicSitesPostRecalibration.output_vcf } + call FF.FinalizeToFile as FinalizeRecalibratedVcfIndex { input: outdir = smalldir, keyfile = keyfile, file = MergeMultiAllelicSitesPostRecalibration.output_vcf_index } + + ################################ + # Finalize the VETS files: + ############ + + output { + Boolean successfully_processed = true + + ######################################## + + File? hc_g_vcf = FinalizeHCGVcf.gcs_path + File? hc_g_tbi = FinalizeHCGTbi.gcs_path + File? hc_bamout = FinalizeHCBamOut.gcs_path + File? hc_baiout = FinalizeHCBaiOut.gcs_path + File? hc_raw_vcf = FinalizeRawHCVcf.gcs_path + File? hc_raw_tbi = FinalizeRawHCTbi.gcs_path + File? hc_rescored_vcf = FinalizeRecalibratedVcf.gcs_path + File? hc_rescored_tbi = FinalizeRecalibratedVcfIndex.gcs_path + } +} diff --git a/wdl/TrainCnnFilters.wdl b/wdl/TrainCnnFilters.wdl new file mode 100644 index 000000000..c991ee9a6 --- /dev/null +++ b/wdl/TrainCnnFilters.wdl @@ -0,0 +1,423 @@ +version 1.0 + +import "tasks/Structs.wdl" as Structs +import "tasks/Utils.wdl" as Utils +import "tasks/Finalize.wdl" as FF + +workflow TrainCnnFilters { + meta { + author: "Jonn Smith" + description: "A workflow for training the 1D and 2D CNN filtration methods in GATK." 
+ } + + input { + Array[File] vcfs + Array[File] vcf_indices + + Array[File] bams + Array[File] bais + + Array[File] truth_vcfs + Array[File] truth_vcf_indices + Array[File] truth_beds + + File ref_map_file + + String prefix = "out" + } + + parameter_meta { + vcfs: "GCS path to VCF files containing called variants on which to train / test / validate the CNN models." + vcf_indices: "GCS path to index files for called variants on which to train / test / validate the CNN models." + + bams: "GCS path to bam files containing the either the mapped reads from which variants were called, or a bam-out from the variant caller that produced the input VCF files." + bais: "GCS path to index files for the bam files containing the either the mapped reads from which variants were called, or a bam-out from the variant caller that produced the input VCF files." + + truth_vcfs: "GCS path to VCF files containing validated variant calls (\"truth\") for the corresponding called variants in `vcfs`." + truth_vcf_indices: "GCS path to index files for VCF files containing validated variant calls (\"truth\") for the corresponding called variants in `vcfs`." 
+ truth_beds: "GCS path to bed files with confident regions for the given `truth_vcfs`" + + ref_map_file: "table indicating reference sequence and auxillary file locations" + } + + # Get ref info: + Map[String, String] ref_map = read_map(ref_map_file) + + # TODO: Validate that lengths of all inputs are the same: + if ((length(vcfs) != length(vcf_indices)) || (length(vcfs) != length(vcf_indices)) || (length(vcfs) != length(vcf_indices)) || (length(vcfs) != length(vcf_indices)) || (length(vcfs) != length(vcf_indices))) { + call Utils.StopWorkflow {input: reason="Not all input arrays have the same length!"} + } + + # First create tensors for the input data: + scatter (idx_1 in range(length(vcfs))) { + # 1D CNN: + call Create1DReferenceTensors { + input: + vcf_input = vcfs[idx_1], + vcf_idx = vcf_indices[idx_1], + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map["fai"], + ref_dict = ref_map['dict'], + truth_vcf = truth_vcfs[idx_1], + truth_vcf_idx = truth_vcf_indices[idx_1], + truth_bed = truth_beds[idx_1], + prefix = prefix + "_shard_" + idx_1 + "reference" + } + # 2D CNN: + call Create2DReadTensors { + input: + bam_input = bams[idx_1], + bai_input = bais[idx_1], + vcf_input = vcfs[idx_1], + vcf_idx = vcf_indices[idx_1], + ref_fasta = ref_map['fasta'], + ref_fasta_fai = ref_map["fai"], + ref_dict = ref_map['dict'], + truth_vcf = truth_vcfs[idx_1], + truth_vcf_idx = truth_vcf_indices[idx_1], + truth_bed = truth_beds[idx_1], + prefix = prefix + "_shard_" + idx_1 + "reference" + } + } + + # Train the models with the created Tensors: + # CNN 1D: + call TrainCnn as TrainCnn1D { + input: + tensor_tars = Create1DReferenceTensors.tensor_dir_tar, + tensor_type = "reference", + epochs = 100, + training_steps = 100, + validation_steps = 6, + prefix = prefix + "_CNN_1D_Model" + } + + # CNN 2D: + call TrainCnn as TrainCnn2D { + input: + tensor_tars = Create2DReadTensors.tensor_dir_tar, + tensor_type = "read_tensor", + epochs = 100, + training_steps = 100, + validation_steps 
= 6, + optimizer_learning_rate = 0.000001, + prefix = prefix + "_CNN_2D_Model" + } + + output { + Array[File] cnn_1d_tensors = Create1DReferenceTensors.tensor_dir_tar + Array[File] cnn_2d_tensors = Create2DReadTensors.tensor_dir_tar + + File cnn_1d_model_json = TrainCnn1D.model_json + File cnn_1d_model_hd5 = TrainCnn1D.model_hd5 + + File cnn_2d_model_json = TrainCnn2D.model_json + File cnn_2d_model_hd5 = TrainCnn2D.model_hd5 + } +} + +task Create1DReferenceTensors { + + meta { + author: "Jonn Smith" + description: "Task to create 1D reference tensors for the 1D CNN." + } + + input { + File vcf_input + File vcf_idx + + File ref_fasta + File ref_fasta_fai + File ref_dict + + File truth_vcf + File truth_vcf_idx + File truth_bed + + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(vcf_input, "GB") + + size(ref_fasta, "GB") + size(ref_fasta_fai, "GB") + size(ref_dict, "GB") + + size(truth_vcf, "GB") + + size(truth_bed, "GB") + ) + + command <<< + set -euxo pipefail + + gatk CNNVariantWriteTensors \ + -R ~{ref_fasta} \ + -V ~{vcf_input} \ + -truth-vcf ~{truth_vcf} \ + -truth-bed ~{truth_bed} \ + -tensor-type reference \ + --downsample-snps 1 \ + --downsample-indels 1 \ + --max-tensors 10000000 \ + -output-tensor-dir ~{prefix}_1D_tensor_dir + + # No need to zip - the files are .hd5 formatted: + tar -cf ~{prefix}_1D_tensor_dir.tar ~{prefix}_1D_tensor_dir + >>> + + output { + File tensor_dir_tar = "~{prefix}_1D_tensor_dir.tar" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + 
select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Create2DReadTensors { + + meta { + author: "Jonn Smith" + description: "Task to create 2D read tensors for the 2D CNN." + } + + input { + File bam_input + File bai_input + + File vcf_input + File vcf_idx + + File ref_fasta + File ref_fasta_fai + File ref_dict + + File truth_vcf + File truth_vcf_idx + File truth_bed + + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 10 + 8*ceil(size(bam_input, "GB") + + size(vcf_input, "GB") + + size(ref_fasta, "GB") + size(ref_fasta_fai, "GB") + size(ref_dict, "GB") + + size(truth_vcf, "GB") + + size(truth_bed, "GB") + ) + + command <<< + set -euxo pipefail + + gatk CNNVariantWriteTensors \ + -R ~{ref_fasta} \ + -V ~{vcf_input} \ + -bam-file ~{bam_input} \ + -truth-vcf ~{truth_vcf} \ + -truth-bed ~{truth_bed} \ + -tensor-type read_tensor \ + --downsample-snps 1 \ + --downsample-indels 1 \ + --max-tensors 10000000 \ + -output-tensor-dir ~{prefix}_2D_tensor_dir + +# Now check if the tensors contain NaN values: +python << CODE + +import os +import h5py + +import numpy as np + +print() + +def find_files(folder_path, ext): + all_files = [] + for root, directories, files in os.walk(folder_path): + for file in files: + if file.endswith(ext): + all_files.append(os.path.join(root, file)) + return all_files + +hd5_files = find_files("~{prefix}_2D_tensor_dir", "hd5") + +print(f"Inspecting {len(hd5_files)} hd5 files... 
") +for f in hd5_files: + with h5py.File(f, 'r') as hd5: + for k in hd5.keys(): + n = np.isnan(np.array(hd5[k])) + if n.sum() > 0: + print(f"File: {f}: Found {n.sum()} NaN(s) in key: {k}") +print("Done.") +print() + +CODE + + # No need to zip - the files are .hd5 formatted: + tar -cf ~{prefix}_2D_tensor_dir.tar ~{prefix}_2D_tensor_dir + >>> + + output { + File tensor_dir_tar = "~{prefix}_2D_tensor_dir.tar" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task TrainCnn { + + meta { + author: "Jonn Smith" + description: "Task to train the CNN." + } + + input { + Array[File] tensor_tars + String tensor_type # Can be either "reference" or "read_tensor" + + Int epochs = 100 + Int training_steps = 100 + Int validation_steps = 6 + + Float optimizer_beta1 = 0.9 + Float optimizer_beta2 = 0.999 + Float optimizer_clipnorm = 1.0 + Float optimizer_epsilon = 0.00000001 # 1.0e-8 + Float optimizer_learning_rate = 0.0001 # 1.0e-4 + + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + # We need a lot of disk space here for the unpacked tensors and final model: + Int disk_size = 1 + 4*ceil(size(tensor_tars, "GB")) + + command <<< + set -euxo pipefail + + # Must pre-process the given tensor_tars into a single folder: + mkdir tensors + cd tensors + + # Let's try to do this multi-threaded: + # NOTE: Yes, I know this is multi-processing, but I'm in a hurry here. + + # Get the max number of threads to use: + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + let max_threads=${np}-1 + if [[ $max_threads -le 0 ]] ; then + max_threads=1 + fi + + # Dispatch some jobs: + num_active_threads=0 + while read f ; do + # If we have reached the maximum number of threads, we should wait for a while: + if [[ $num_active_threads -ge $max_threads ]] ; then + # Wait for the next background process to finish: + wait -n + + # Give ourselves some wiggle room: + sleep 5 + + # Refresh the number of active threads: + num_active_threads=$(jobs | wc -l) + fi + + # Extract our tensors to the `tensors` folder: + tar --strip-components 1 -xf $f & + + # Update the number of active threads: + let num_active_threads=${num_active_threads}+1 + done < ~{write_lines(tensor_tars)} + + # Wait for the rest of our background processes to finish: + wait + + cd ../ + + gatk CNNVariantTrain \ + --verbosity DEBUG \ + -tensor-type reference \ + --epochs ~{epochs} \ + --training-steps ~{training_steps} \ + --validation-steps ~{validation_steps} \ + --optimizer-beta-1 ~{optimizer_beta1} \ + --optimizer-beta-2 ~{optimizer_beta2} \ + --optimizer-clipnorm ~{optimizer_clipnorm} \ + --optimizer-epsilon ~{optimizer_epsilon} \ + --optimizer-learning-rate ~{optimizer_learning_rate} \ + -input-tensor-dir tensors/ \ + -model-name ~{prefix}_CNN_~{tensor_type}_model \ + + ls -la + >>> + + output { + File model_hd5 = "~{prefix}_CNN_~{tensor_type}_model.hd5" + File model_json = "~{prefix}_CNN_~{tensor_type}_model.json" + } + + ######################### + 
RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 1, + docker: "broadinstitute/gatk-nightly:2023-08-18-4.4.0.0-57-g98f63667a-NIGHTLY-SNAPSHOT" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + + # NOTE: We NEED GPUs to train the CNNs, so we don't allow for them to be modified by runtime attributes. + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + gpuType: "nvidia-tesla-t4" + gpuCount: 4 + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/tasks/AlignedMetrics.wdl b/wdl/tasks/AlignedMetrics.wdl index 6c25a031a..2b8e46c82 100644 --- a/wdl/tasks/AlignedMetrics.wdl +++ b/wdl/tasks/AlignedMetrics.wdl @@ -338,6 +338,103 @@ task CoverageTrack { } } +task SamStats { + input { + File bam + + RuntimeAttr? 
runtime_attr_override + } + + String basename = basename(bam, ".bam") + Int disk_size = 2*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + samtools stats -@${np} ~{bam} > ~{basename}.sam_stats.txt + >>> + + output { + File sam_stats = "~{basename}.sam_stats.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SamStatsMap { + input { + File bam + + RuntimeAttr? 
runtime_attr_override + } + + String basename = basename(bam, ".bam") + Int disk_size = 2*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + samtools stats -@${np} ~{bam} > ~{basename}.sam_stats.txt + + grep '^SN' ~{basename}.sam_stats.txt | \ + cut -f 2- | \ + sed 's/://g' | \ + sed 's/ /_/g' | \ + sed 's/[\(\)]//g' | \ + sed 's/[[:space:]]*#.*//' \ + > map.txt + >>> + + output { + File sam_stats = "~{basename}.sam_stats.txt" + Map[String, Float] stats_map = read_map("map.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + task FlagStats { input { File bam diff --git a/wdl/tasks/CallVariantsIllumina.wdl b/wdl/tasks/CallVariantsIllumina.wdl new file mode 100644 index 000000000..347d1840c --- /dev/null +++ b/wdl/tasks/CallVariantsIllumina.wdl @@ -0,0 +1,73 @@ +version 1.0 + +import "Utils.wdl" +import "VariantUtils.wdl" +import "DeepVariant.wdl" as DV + +workflow CallVariants { + meta { + description: "A workflow for calling small variants from an Illumina BAM file." 
+ } + input { + File bam + File bai + + String prefix + String sample_id + + File ref_fasta + File ref_fasta_fai + File ref_dict + + Boolean call_small_variants + Boolean call_small_vars_on_mitochondria = true + + Boolean run_dv_pepper_analysis + Int? dvp_threads + Int? dvp_memory + + String mito_contig = "chrM" + Array[String] contigs_names_to_ignore = ["RANDOM_PLACEHOLDER_VALUE"] ## Required for ignoring any filtering - this is kind of a hack - TODO: fix the task! + } + + ###################################################################### + # Block for small variants handling + ###################################################################### + + call Utils.RandomZoneSpewer as arbitrary {input: num_of_zones = 3} + + # todo: merge the two scattering scheme into a better one + if (call_small_variants) { + # Scatter by chromosome + Array[String] use_filter = if (call_small_vars_on_mitochondria) then contigs_names_to_ignore else flatten([[mito_contig], contigs_names_to_ignore]) + call Utils.MakeChrIntervalList as SmallVariantsScatterPrepp { + input: + ref_dict = ref_dict, + filter = use_filter + } + + # size-balanced scatter + if (run_dv_pepper_analysis) { + call DV.DeepVariant { + input: + bam = bam, + bai = bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + + pepper_threads = select_first([dvp_threads]), + pepper_memory = select_first([dvp_memory]), + dv_threads = select_first([dvp_threads]), + dv_memory = select_first([dvp_memory]), + zones = arbitrary.zones + } + } + } + + output { + File? dvp_g_vcf = DeepVariant.gVCF + File? dvp_g_tbi = DeepVariant.gVCF_tbi + File? dvp_vcf = DeepVariant.VCF + File? dvp_tbi = DeepVariant.VCF_tbi + } +} diff --git a/wdl/tasks/DeepVariant.wdl b/wdl/tasks/DeepVariant.wdl new file mode 100644 index 000000000..c614f051d --- /dev/null +++ b/wdl/tasks/DeepVariant.wdl @@ -0,0 +1,136 @@ +version 1.0 + +####################################################### +# This pipeline calls small variants using DeepVariant. 
+####################################################### + +import "Structs.wdl" + + +workflow DeepVariant { + + meta { + description: "Workflow for getting VCF and gVCF from DeepVariant. Note VCF is un-phased." + } + + input { + File bam + File bai + + File ref_fasta + File ref_fasta_fai + + Int pepper_threads + Int pepper_memory + + Int dv_threads + Int dv_memory + + String zones = "us-central1-b us-central1-c" + } + + parameter_meta { + # when running large scale workflows, we sometimes see errors like the following + # A resource limit has delayed the operation: generic::resource_exhausted: allocating: selecting resources: selecting region and zone: + # no available zones: 2763 LOCAL_SSD_TOTAL_GB (738/30000 available) usage too high + zones: "select which zone (GCP) to run this task" + } + + call DV as deep_variant { + input: + bam = bam, + bai = bai, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + threads = dv_threads, + memory = dv_memory, + zones = zones + } + + output { + File VCF = deep_variant.VCF + File VCF_tbi = deep_variant.VCF_tbi + + File gVCF = deep_variant.gVCF + File gVCF_tbi = deep_variant.gVCF_tbi + } +} + +task DV { + + input { + File bam + File bai + + File ref_fasta + File ref_fasta_fai + + Int threads + Int memory + String zones + + RuntimeAttr? 
runtime_attr_override + } + + String prefix = basename(bam, ".bam") + ".deepvariant" + String output_root = "/cromwell_root/dv_output" + + Int bam_sz = ceil(size(bam, "GB")) + Boolean is_big_bam = bam_sz > 100 + Int inflation_factor = if (is_big_bam) then 10 else 5 + Int minimal_disk = 1000 + Int disk_size = if inflation_factor * bam_sz > minimal_disk then inflation_factor * bam_sz else minimal_disk + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + mkdir -p "~{output_root}" + + /opt/deepvariant/bin/run_deepvariant \ + --model_type=WGS \ + --ref=~{ref_fasta} \ + --reads=~{bam} \ + --output_vcf="~{output_root}/~{prefix}.vcf.gz" \ + --output_gvcf="~{output_root}/~{prefix}.g.vcf.gz" \ + --num_shards="${num_core}" + + find "~{output_root}/" -print | sed -e 's;[^/]*/;|____;g;s;____|; |;g' \ + > "~{output_root}/dir_structure.txt" + >>> + + output { + File output_dir_structure = "~{output_root}/dir_structure.txt" + + File VCF = "~{output_root}/~{prefix}.vcf.gz" + File VCF_tbi = "~{output_root}/~{prefix}.vcf.gz.tbi" + + File gVCF = "~{output_root}/~{prefix}.g.vcf.gz" + File gVCF_tbi = "~{output_root}/~{prefix}.g.vcf.gz.tbi" + + File visual_report_html = "~{output_root}/~{prefix}.visual_report.html" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: threads, + mem_gb: memory, + disk_gb: disk_size, + boot_disk_gb: 100, + preemptible_tries: 3, + max_retries: 0, + docker: "google/deepvariant:1.4.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + zones: zones + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: 
select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/tasks/FastQC.wdl b/wdl/tasks/FastQC.wdl new file mode 100644 index 000000000..52c5a0bf4 --- /dev/null +++ b/wdl/tasks/FastQC.wdl @@ -0,0 +1,80 @@ +version 1.0 + +import "Structs.wdl" + +task FastQC { + input { + File bam + File bai + + Int num_cpus = 4 + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size([bam, bai], "GB")) + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + fastqc -t $num_core --extract ~{bam} + + find . -name 'fastqc_data.txt' -exec mv {} fastqc_data.txt \; + find . -name 'fastqc_report.html' -exec mv {} fastqc_report.html \; + + number_of_reads=$(grep 'Total Sequences' fastqc_data.txt | awk '{ print $3 }') + read_length=$(grep 'Sequence length' fastqc_data.txt | awk '{ print $3 }' | cut -f2 -d'-') + + echo $number_of_reads | awk '{ print "number_of_reads\t" $1 }' >> map.txt + echo $read_length | awk '{ print "read_length\t" $1 }' >> map.txt + echo $number_of_reads $read_length | awk '{ print "number_of_bases\t" $1*$2 }' >> map.txt + + mean_qual=$(sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | \ + grep -v '^#' | \ + grep -v '>>' | \ + awk '{ print $2 }' | \ + awk '{x+=$1; next} END{print x/NR}') + + echo $mean_qual | awk '{ print "mean_qual\t" $1 }' >> map.txt + + median_qual=$(sed -n '/Per base sequence quality/,/END_MODULE/p' fastqc_data.txt | \ + grep -v '^#' | \ + grep -v '>>' | \ + awk '{ print $2 }' | \ + awk '{x+=$1; next} END{print x/NR}' | \ + sort -n | \ + awk '{ a[i++]=$1; } END { x=int((i+1)/2); if (x < (i+1)/2) print (a[x-1]+a[x])/2; else print a[x-1]; }') + + echo $median_qual | awk '{ print "median_qual\t" $1 }' >> map.txt + >>> + + output { + Map[String, Float] stats_map = 
read_map("map.txt") + + File stats = "fastqc_data.txt" + File report = "fastqc_report.html" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "staphb/fastqc:latest" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/tasks/Finalize.wdl b/wdl/tasks/Finalize.wdl index fdebb1998..4402e1e38 100644 --- a/wdl/tasks/Finalize.wdl +++ b/wdl/tasks/Finalize.wdl @@ -22,7 +22,7 @@ task FinalizeToFile { outdir: "directory to which files should be uploaded" name: "name to set for uploaded file" } - + # Remove trailing slashes from a directory path. String gcs_output_dir = sub(outdir, "/+$", "") String gcs_output_file = gcs_output_dir + "/" + select_first([name, basename(file)]) @@ -82,11 +82,27 @@ task FinalizeToDir { command <<< set -euxo pipefail - cat ~{write_lines(files)} | gsutil -m cp -I "~{gcs_output_dir}" + # Only copy files that are not the same as their destinations. + # This is a very far corner case but can come up if you're finalizing input files + # and you're re-running data (e.g. re-running `SRFlowcell`). 
+ + clean_out_dir=$(echo "~{gcs_output_dir}" | sed 's@/[ \t]*$@@') + + while read src_file_path ; do + bn=$(basename ${src_file_path}) + if [[ ${src_file_path} == ${clean_out_dir}/${bn} ]] ; then + echo "Source and destination file paths are the same. Skipping file: ${src_file_path}" 1>&2 + else + echo "${src_file_path}" + fi + done < ~{write_lines(files)} > sanitized_file_list.txt + + cat sanitized_file_list.txt | gsutil -m cp -I "~{gcs_output_dir}" >>> output { String gcs_dir = gcs_output_dir + File copied_files_list = "sanitized_file_list.txt" } ######################### diff --git a/wdl/tasks/FunctionalAnnotation.wdl b/wdl/tasks/FunctionalAnnotation.wdl new file mode 100644 index 000000000..107dea2d6 --- /dev/null +++ b/wdl/tasks/FunctionalAnnotation.wdl @@ -0,0 +1,61 @@ +version 1.0 + +import "Structs.wdl" + +task FunctionallyAnnotateVariants { + input { + File vcf + File snpeff_db + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 5*ceil(size([vcf, snpeff_db], "GB")) + String prefix = basename(basename(vcf, ".gz"), ".vcf") + + command <<< + set -x + + gunzip -c ~{snpeff_db} | tar xvf - + + /snpEff/scripts/snpEff ann -v \ + -c $PWD/snpeff_db/snpEff.config \ + -dataDir $PWD/snpeff_db/data \ + PlasmoDB-61_Pfalciparum3D7_Genome \ + ~{vcf} | bgzip > ~{prefix}.annotated.vcf.gz + + mv snpEff_summary.html ~{prefix}.snpEff_summary.html + mv snpEff_genes.txt ~{prefix}.snpEff_genes.txt + + # Index the output VCF file: + tabix -p vcf ~{prefix}.annotated.vcf.gz + >>> + + output { + File annotated_vcf = "~{prefix}.annotated.vcf.gz" + File annotated_vcf_index = "~{prefix}.annotated.vcf.gz.tbi" + File snpEff_summary = "~{prefix}.snpEff_summary.html" + File snpEff_genes = "~{prefix}.snpEff_genes.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-functional-annotation:0.0.1" + } + 
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/tasks/GLNexus.wdl b/wdl/tasks/GLNexus.wdl new file mode 100644 index 000000000..31d61a90e --- /dev/null +++ b/wdl/tasks/GLNexus.wdl @@ -0,0 +1,363 @@ +version 1.0 + +########################################################################################## +# This pipeline joint-calls GVCFs with GLNexus (https://github.com/dnanexus-rnd/GLnexus). +# It also permits intervals to be specified so that joint calling only takes place on a +# subset of intervals (this can be useful for finding duplicate samples). +########################################################################################## + +import "Utils.wdl" +import "VariantUtils.wdl" + +workflow JointCall { + input { + Array[File] gvcfs + Array[File] tbis + + File dict + File? bed + + String config = "DeepVariantWGS" + Boolean more_PL = false + Boolean squeeze = false + Boolean trim_uncalled_alleles = false + + Int? num_cpus + Int max_cpus = 64 + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + gvcfs: "gVCF files to perform joint calling upon" + tbis: "gVCF index files" + dict: "reference sequence dictionary" + bed: "intervals to which joint calling should be restricted" + + config: "configuration preset name or .yml filename" + more_PL: "include PL from reference bands and other cases omitted by default" + squeeze: "reduce pVCF size by suppressing detail in cells derived from reference bands" + trim_uncalled_alleles: "remove alleles with no output GT calls in postprocessing" + + num_cpus: "number of CPUs to use" + max_cpus: "maximum number of CPUs to allow" + prefix: "output prefix for joined-called BCF and GVCF files" + } + + Int cpus_exp = if defined(num_cpus) then select_first([num_cpus]) else 2*length(gvcfs) + Int cpus_act = if cpus_exp < max_cpus then cpus_exp else max_cpus + + # List all of the contigs in the reference + call GetRanges { input: dict = dict, bed = bed } + + # Shard all gVCFs into per-contig shards + scatter (p in zip(gvcfs, tbis)) { + call ShardVCFByRanges { input: gvcf = p.left, tbi = p.right, ranges = GetRanges.ranges } + } + + # Joint-call in parallel over chromosomes + scatter (i in range(length(ShardVCFByRanges.sharded_gvcfs[0]))) { + Array[File] per_contig_gvcfs = transpose(ShardVCFByRanges.sharded_gvcfs)[i] + + call Call { + input: + gvcfs = per_contig_gvcfs, + + config = config, + more_PL = more_PL, + squeeze = squeeze, + trim_uncalled_alleles = trim_uncalled_alleles, + + num_cpus = cpus_act, + prefix = prefix + } + } + + # Concatenate the contig-sharded joint calls into a single joint callset + call ConcatBCFs { input: bcfs = Call.joint_bcf, prefix = prefix } + + output { + File joint_gvcf = ConcatBCFs.joint_gvcf + File joint_gvcf_tbi = ConcatBCFs.joint_gvcf_tbi + } +} + +task GetRanges { + meta { + description: "Select loci over which to parallelize downstream operations." + } + + input { + File dict + File? bed + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + ceil(size(dict, "GB")) + + command <<< + set -euxo pipefail + + if [[ "~{defined(bed)}" == "true" ]]; then + cat ~{bed} | awk '{ print $1 ":" $2 "-" $3 }' > ranges.txt + else + grep '^@SQ' ~{dict} | \ + awk '{ print $2, $3 }' | \ + sed 's/[SL]N://g' | \ + awk '{ print $1 ":0-" $2 }' \ + > ranges.txt + fi + >>> + + output { + Array[String] ranges = read_lines("ranges.txt") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "ghcr.io/dnanexus-rnd/glnexus:v1.4.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ShardVCFByRanges { + meta { + description: "Split VCF into smaller ranges for parallelization." + } + + input { + File gvcf + File tbi + Array[String] ranges + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(gvcf, "GB")) + + command <<< + set -euxo pipefail + + mkdir per_contig + + INDEX=0 + for RANGE in ~{sep=' ' ranges} + do + PINDEX=$(printf "%06d" $INDEX) + FRANGE=$(echo $RANGE | sed 's/[:-]/___/g') + OUTFILE="per_contig/$PINDEX.~{basename(gvcf, ".g.vcf.gz")}.locus_$FRANGE.g.vcf.gz" + + bcftools view ~{gvcf} $RANGE | bgzip > $OUTFILE + + INDEX=$(($INDEX+1)) + done + >>> + + output { + Array[File] sharded_gvcfs = glob("per_contig/*") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "ghcr.io/dnanexus-rnd/glnexus:v1.4.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Call { + meta { + description: "Joint-call gVCFs with GLNexus." + } + + input { + Array[File] gvcfs + + String config = "DeepVariantWGS" + Boolean more_PL = false + Boolean squeeze = false + Boolean trim_uncalled_alleles = false + + Int num_cpus = 96 + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 5*ceil(size(gvcfs, "GB")) + Int mem = 4*num_cpus + + command <<< + set -x + + # For guidance on performance settings, see https://github.com/dnanexus-rnd/GLnexus/wiki/Performance + ulimit -Sn 65536 + + echo ~{gvcfs[0]} | sed 's/.*locus_//' | sed 's/.g.vcf.bgz//' | sed 's/___/\t/g' > range.bed + + glnexus_cli \ + --config ~{config} \ + --bed range.bed \ + ~{if more_PL then "--more-PL" else ""} \ + ~{if squeeze then "--squeeze" else ""} \ + ~{if trim_uncalled_alleles then "--trim-uncalled-alleles" else ""} \ + --list ~{write_lines(gvcfs)} \ + > ~{prefix}.bcf + >>> + + output { + File joint_bcf = "~{prefix}.bcf" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: mem, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "ghcr.io/dnanexus-rnd/glnexus:v1.4.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task CompressAndIndex { + meta { + description: "Convert a BCF file to a vcf.bgz file and index it." + } + + input { + File joint_bcf + + Int num_cpus = 8 + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 3*ceil(size(joint_bcf, "GB")) + + command <<< + set -x + + bcftools view ~{joint_bcf} | bgzip -@ ~{num_cpus} -c > ~{prefix}.g.vcf.bgz + tabix -p vcf ~{prefix}.g.vcf.bgz + >>> + + output { + File joint_gvcf = "~{prefix}.g.vcf.bgz" + File joint_gvcf_tbi = "~{prefix}.g.vcf.bgz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: 4*num_cpus, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "ghcr.io/dnanexus-rnd/glnexus:v1.4.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ConcatBCFs { + meta { + description: "Concatenate BCFs into a single .vcf.bgz file and index it." + } + + input { + Array[File] bcfs + + Int num_cpus = 4 + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(bcfs, "GB")) + + command <<< + set -euxo pipefail + + bcftools concat -n ~{sep=' ' bcfs} | bcftools view | bgzip -@ ~{num_cpus} -c > ~{prefix}.g.vcf.bgz + tabix -p vcf ~{prefix}.g.vcf.bgz + >>> + + output { + File joint_gvcf = "~{prefix}.g.vcf.bgz" + File joint_gvcf_tbi = "~{prefix}.g.vcf.bgz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: 8, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 0, + docker: "ghcr.io/dnanexus-rnd/glnexus:v1.4.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/tasks/Hail.wdl b/wdl/tasks/Hail.wdl new file mode 100644 index 000000000..2a9d8fc98 --- /dev/null +++ b/wdl/tasks/Hail.wdl @@ -0,0 +1,78 @@ +version 1.0 + +import "Structs.wdl" + +task ConvertToHailMT { + meta { + description: "Convert a .vcf.bgz file to a Hail MatrixTable and copy it to a final gs:// URL." + } + + input { + File gvcf + File tbi + + String reference = "GRCh38" + String? ref_fasta + String? ref_fai + String prefix = "out" + + String outdir + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 3*ceil(size(gvcf, "GB")) + + command <<< + set -x + + python3 <>> + + output { + String gcs_path = "~{outdir}/~{prefix}.mt" + File completion_file = "completion_file" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 64, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "hailgenetics/hail:0.2.105" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/tasks/HaplotypeCaller.wdl b/wdl/tasks/HaplotypeCaller.wdl new file mode 100644 index 000000000..7f237884d --- /dev/null +++ b/wdl/tasks/HaplotypeCaller.wdl @@ -0,0 +1,378 @@ +version 1.0 + +import "Structs.wdl" +import "Utils.wdl" +import "SRUtils.wdl" as SRUTIL +import "SRJointGenotyping.wdl" as SRJOINT + +workflow CallVariantsWithHaplotypeCaller { + meta { + author: "Jonn Smith" + description: "A workflow for calling small variants with GATK HaplotypeCaller from an Illumina BAM file." 
+ } + + input { + File bam + File bai + + String prefix + String sample_id + + File ref_fasta + File ref_fasta_fai + File ref_dict + + File dbsnp_vcf + + Boolean call_vars_on_mitochondria = true + + Int ploidy = 2 + + Float heterozygosity = 0.001 + Float heterozygosity_stdev = 0.01 + Float indel_heterozygosity = 0.000125 + + Boolean enable_pileup_mode = false + + String mito_contig = "chrM" + Array[String] contigs_names_to_ignore = ["RANDOM_PLACEHOLDER_VALUE"] ## Required for ignoring any filtering - this is kind of a hack - TODO: fix the task! + } + + # Scatter by chromosome: + Array[String] use_filter = if (call_vars_on_mitochondria) then contigs_names_to_ignore else flatten([[mito_contig], contigs_names_to_ignore]) + call Utils.MakeChrIntervalList as SmallVariantsScatterPrep { + input: + ref_dict = ref_dict, + filter = use_filter + } + + # Call over the scattered intervals: + scatter (c in SmallVariantsScatterPrep.chrs) { + String contig_for_small_var = c[0] + + call HaplotypeCaller_GATK4_VCF as CallVariantsWithHC { + input: + input_bam = bam, + input_bam_index = bai, + prefix = prefix + "." 
+ contig_for_small_var, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_fai, + ref_dict = ref_dict, + make_gvcf = true, + make_bamout = true, + enable_pileup_mode = enable_pileup_mode, + single_interval = contig_for_small_var, + contamination = 0, + ploidy = ploidy, + heterozygosity = heterozygosity, + heterozygosity_stdev = heterozygosity_stdev, + indel_heterozygosity = indel_heterozygosity, + use_spanning_event_genotyping = true + } + } + + # Merge the output GVCFs: + call SRUTIL.MergeVCFs as MergeGVCFs { + input: + input_vcfs = CallVariantsWithHC.output_vcf, + input_vcfs_indexes = CallVariantsWithHC.output_vcf_index, + prefix = prefix + } + + # Merge the output BAMs: + call MergeBamouts as MergeVariantCalledBamOuts { + input: + bams = CallVariantsWithHC.bamout, + prefix = "~{prefix}.bamout" + } + + # Index the Bamout: + call Utils.Index as IndexBamout { + input: + bam = MergeVariantCalledBamOuts.output_bam + } + +# We're disabling ReblockGVCF for now. +# It's removing some annotations we may need later. 
+ +# # Now reblock the GVCF to combine hom ref blocks and save $ / storage: +# call ReblockGVCF { +# input: +# gvcf = MergeGVCFs.output_vcf, +# gvcf_index = IndexGVCF.index, +# ref_fasta = ref_fasta, +# ref_fasta_fai = ref_fasta_fai, +# ref_dict = ref_dict, +# prefix = prefix +# } + + # Collapse the GVCF into a regular VCF: + call SRJOINT.GenotypeGVCFs as CollapseGVCFtoVCF { + input: + input_gvcf_data = MergeGVCFs.output_vcf, + input_gvcf_index = MergeGVCFs.output_vcf_index, + interval_list = SmallVariantsScatterPrep.interval_list, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + ref_dict = ref_dict, + dbsnp_vcf = dbsnp_vcf, + prefix = prefix, + } + + output { + File output_gvcf = MergeGVCFs.output_vcf + File output_gvcf_index = MergeGVCFs.output_vcf_index + File output_vcf = CollapseGVCFtoVCF.output_vcf + File output_vcf_index = CollapseGVCFtoVCF.output_vcf_index + File bamout = MergeVariantCalledBamOuts.output_bam + File bamout_index = IndexBamout.bai + } +} + +task HaplotypeCaller_GATK4_VCF { + meta { + author: "Jonn Smith" + notes: "Adapted from the WARP pipeline found here: https://github.com/broadinstitute/warp.git" + } + + input { + File input_bam + File input_bam_index + + String prefix + + File ref_dict + File ref_fasta + File ref_fasta_index + + Int ploidy = 2 + + Float heterozygosity = 0.001 + Float heterozygosity_stdev = 0.01 + Float indel_heterozygosity = 0.000125 + + Boolean make_gvcf + Boolean make_bamout + + String? single_interval + File? interval_list + Float? contamination + + Boolean use_spanning_event_genotyping = true + + Boolean enable_pileup_mode = false + + RuntimeAttr? 
runtime_attr_override + } + + String output_suffix = if make_gvcf then ".g.vcf.gz" else ".vcf.gz" + String output_file_name = prefix + output_suffix + + Float ref_size = size(ref_fasta, "GiB") + size(ref_fasta_index, "GiB") + size(ref_dict, "GiB") + Int disk_size = 2*ceil(((size(input_bam, "GiB") + 30)) + ref_size) + 20 + + String bamout_arg = if make_bamout then "-bamout ~{prefix}.bamout.bam" else "" + + String interval_arg = if (defined(interval_list) || defined(single_interval)) then " -L " else "" + String interval_arg_value = if defined(interval_list) then interval_list else if defined(single_interval) then single_interval else "" + + parameter_meta { + input_bam: { localization_optional: true } + } + + command <<< + set -euxo pipefail + + # We need at least 1 GB of available memory outside of the Java heap in order to execute native code, thus, limit + # Java's memory by the total memory minus 1 GB. We need to compute the total memory as it might differ from + # memory_size_gb because of Cromwell's retry with more memory feature. + # Note: In the future this should be done using Cromwell's ${MEM_SIZE} and ${MEM_UNIT} environment variables, + # which do not rely on the output format of the `free` command. 
+ + available_memory_mb=$(free -m | awk '/^Mem/ {print $2}') + let java_memory_size_mb=available_memory_mb-1024 + echo Total available memory: ${available_memory_mb} MB >&2 + echo Memory reserved for Java: ${java_memory_size_mb} MB >&2 + + gatk --java-options "-Xmx${java_memory_size_mb}m -Xms${java_memory_size_mb}m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + HaplotypeCaller \ + -R ~{ref_fasta} \ + -I ~{input_bam} \ + ~{interval_arg}~{default="" sep=" -L " interval_arg_value} \ + -O ~{output_file_name} \ + -contamination ~{default=0 contamination} \ + --sample-ploidy ~{ploidy} \ + --heterozygosity ~{heterozygosity} \ + --heterozygosity-stdev ~{heterozygosity_stdev} \ + --indel-heterozygosity ~{indel_heterozygosity} \ + --linked-de-bruijn-graph \ + ~{true="--pileup-detection --pileup-detection-enable-indel-pileup-calling" false="" enable_pileup_mode} \ + --annotate-with-num-discovered-alleles \ + -GQB 10 -GQB 20 -GQB 30 -GQB 40 -GQB 50 -GQB 60 -GQB 70 -GQB 80 -GQB 90 \ + ~{false="--disable-spanning-event-genotyping" true="" use_spanning_event_genotyping} \ + -G StandardAnnotation -G StandardHCAnnotation \ + -A AssemblyComplexity \ + ~{true="-ERC GVCF" false="" make_gvcf} \ + --smith-waterman FASTEST_AVAILABLE \ + ~{bamout_arg} + + # Removed for now because we need to qualify the pipeline with standard annotations first. + # ~{true="-G AS_StandardAnnotation" false="" make_gvcf} + + # Cromwell doesn't like optional task outputs, so we have to touch this file. 
+ touch ~{prefix}.bamout.bam + >>> + + output { + File output_vcf = "~{output_file_name}" + File output_vcf_index = "~{output_file_name}.tbi" + File bamout = "~{prefix}.bamout.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 16, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + + +# This task is here because merging bamout files using Picard produces an error. +task MergeBamouts { + + input { + Array[File] bams + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = ceil(size(bams, "GiB") * 2) + 10 + + command <<< + + set -euxo pipefail + + # Make sure we use all our processors: + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + ithreads=${np} + + # If the number of processors = 1, then `let` will return 1 here: + # So we need to turn off `set -e` for this command: + set +e + let mthreads=${np}-1 + set -e + + samtools merge -@${mthreads} ~{prefix}.bam ~{sep=" " bams} + samtools index -@${ithreads} ~{prefix}.bam + mv ~{prefix}.bam.bai ~{prefix}.bai + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File output_bam = "~{prefix}.bam" + File output_bam_index = "~{prefix}.bai" + } +} + +task ReblockGVCF { + + input { + File gvcf + File gvcf_index + + File ref_fasta + File ref_fasta_fai + File ref_dict + + String prefix + Float? tree_score_cutoff + + Array[String]? annotations_to_keep + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = ceil((size(gvcf, "GiB") * 4) + size(ref_fasta, "GiB") + size(ref_fasta_fai, "GiB") + size(ref_dict, "GiB") + 10) + + String annotations_to_keep_arg = if defined(annotations_to_keep) then "--annotations-to-keep" else "" + + command { + set -euxo pipefail + + gatk --java-options "-Xms3000m -Xmx3000m" \ + ReblockGVCF \ + -R ~{ref_fasta} \ + -V ~{gvcf} \ + -do-qual-approx \ + --floor-blocks -GQB 20 -GQB 30 -GQB 40 \ + ~{"--tree-score-threshold-to-no-call " + tree_score_cutoff} \ + ~{annotations_to_keep_arg} ~{sep=" --annotations-to-keep " annotations_to_keep} \ + -O ~{prefix}.rb.g.vcf.gz + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File output_gvcf = "~{prefix}.rb.g.vcf.gz" + File output_gvcf_index = "~{prefix}.rb.g.vcf.gz.tbi" + } +} diff --git a/wdl/tasks/Pf_Niare_HaplotypeCaller.wdl b/wdl/tasks/Pf_Niare_HaplotypeCaller.wdl new file mode 100644 index 000000000..42807f27f --- /dev/null +++ b/wdl/tasks/Pf_Niare_HaplotypeCaller.wdl @@ -0,0 +1,932 @@ +version 1.0 + +import "Structs.wdl" +import "Utils.wdl" +import "SRUtils.wdl" as SRUTIL +import "SRJointGenotyping.wdl" 
as SRJOINT + +workflow CallVariantsWithHaplotypeCaller { + meta { + author: "Jonn Smith" + description: "A workflow for calling small variants with GATK HaplotypeCaller from an Illumina BAM file using the methods laid out by Niare et al. (https://doi.org/10.1186/s12936-023-04632-0)." + } + + input { + File bam + File bai + + String prefix + String sample_id + + File ref_fasta + File ref_fasta_fai + File ref_dict + + File genotype_gvcfs_intervals + + Boolean call_vars_on_mitochondria = false + + String mito_contig = "chrM" + Array[String] contigs_names_to_ignore = ["RANDOM_PLACEHOLDER_VALUE"] ## Required for ignoring any filtering - this is kind of a hack - TODO: fix the task! + } + + # Scatter by chromosome: + Array[String] use_filter = if (call_vars_on_mitochondria) then contigs_names_to_ignore else flatten([[mito_contig], contigs_names_to_ignore]) + call Utils.MakeChrIntervalList as SmallVariantsScatterPrep { + input: + ref_dict = ref_dict, + filter = use_filter + } + + # Call over the scattered intervals: + scatter (c in SmallVariantsScatterPrep.chrs) { + String contig_for_small_var = c[0] + + call HaplotypeCaller_NIARE_GATK4_VCF as CallVariantsWithHC { + input: + input_bam = bam, + input_bam_index = bai, + prefix = prefix + "." 
+ contig_for_small_var, + ref_fasta = ref_fasta, + ref_fasta_index = ref_fasta_fai, + ref_dict = ref_dict, + interval_list = contig_for_small_var + } + } + + # Merge the output GVCFs: + call SRUTIL.MergeVCFs as MergeGVCFs { + input: + input_vcfs = CallVariantsWithHC.output_vcf, + input_vcfs_indexes = CallVariantsWithHC.output_vcf_index, + prefix = prefix + } + + # Merge the output BAMs: + call MergeBamouts as MergeVariantCalledBamOuts { + input: + bams = CallVariantsWithHC.bamout, + prefix = "~{prefix}.bamout" + } + + # Index the Bamout: + call Utils.Index as IndexBamout { + input: + bam = MergeVariantCalledBamOuts.output_bam + } + + # Collapse the GVCF into a regular VCF: + call SRJOINT.GenotypeGVCFs as CollapseGVCFtoVCF { + input: + input_gvcf_data = MergeGVCFs.output_vcf, + input_gvcf_index = MergeGVCFs.output_vcf_index, + interval_list = genotype_gvcfs_intervals, + ref_fasta = ref_fasta, + ref_fasta_fai = ref_fasta_fai, + ref_dict = ref_dict, + prefix = prefix, + } + + output { + File output_gvcf = MergeGVCFs.output_vcf + File output_gvcf_index = MergeGVCFs.output_vcf_index + File output_vcf = CollapseGVCFtoVCF.output_vcf + File output_vcf_index = CollapseGVCFtoVCF.output_vcf_index + File bamout = MergeVariantCalledBamOuts.output_bam + File bamout_index = IndexBamout.bai + } +} + +task HaplotypeCaller_NIARE_GATK4_VCF { + meta { + author: "Jonn Smith" + notes: "Reproducing methods laid out by Niare et al. (https://doi.org/10.1186/s12936-023-04632-0) to perform testing." + } + + input { + File input_bam + File input_bam_index + + String prefix + + File ref_dict + File ref_fasta + File ref_fasta_index + + String interval_list + + RuntimeAttr? 
runtime_attr_override + } + + String output_file_name = prefix + ".g.vcf.gz" + + Int disk_size = 2*ceil(size([ref_fasta, ref_fasta_index, ref_dict, input_bam], "GiB") + 50) + + parameter_meta { + input_bam: { localization_optional: true } + } + + command <<< + set -euxo pipefail + # We need at least 1 GB of available memory outside of the Java heap in order to execute native code, thus, limit + # Java's memory by the total memory minus 1 GB. We need to compute the total memory as it might differ from + # memory_size_gb because of Cromwell's retry with more memory feature. + # Note: In the future this should be done using Cromwell's ${MEM_SIZE} and ${MEM_UNIT} environment variables, + # which do not rely on the output format of the `free` command. + + available_memory_mb=$(free -m | awk '/^Mem/ {print $2}') + let java_memory_size_mb=available_memory_mb-1024 + echo Total available memory: ${available_memory_mb} MB >&2 + echo Memory reserved for Java: ${java_memory_size_mb} MB >&2 + +# gatk --java-options "-Xmx${java_memory_size_mb}m -Xms${java_memory_size_mb}m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ +# HaplotypeCaller \ +# -R ~{ref_fasta} \ +# -I ~{input_bam} \ +# -ERC GVCF \ +# -ploidy 2 \ # Default: 2 +# --native-pair-hmm-threads 16 \ # Default: 4 +# -O ~{output_file_name} \ +# --assembly-region-padding 100 \ # Default: 100 +# --max-num-haplotypes-in-population 128 \ # Default: 128 +# --kmer-size 10 \ # Default: 10, 25 +# --kmer-size 25 \ # Default: 10, 25 +# --min-dangling-branch-length 4 \ # default: 4 +# --heterozygosity 0.0029 \ +# --indel-heterozygosity 0.0017 \ +# --min-assembly-region-size 100 \ # default: 50 +# -L ~{interval_list} \ +# -mbq 5 \ # default 10 +# -DF MappingQualityReadFilter \ +# --base-quality-score-threshold 12 \ # 18 +# -bamout ~{prefix}.bamout.bam + + gatk --java-options "-Xmx${java_memory_size_mb}m -Xms${java_memory_size_mb}m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + HaplotypeCaller \ + -R ~{ref_fasta} \ + -I ~{input_bam} \ + 
-ERC GVCF \ + -ploidy 2 \ + --native-pair-hmm-threads 16 \ + -O ~{output_file_name} \ + --assembly-region-padding 100 \ + --max-num-haplotypes-in-population 128 \ + --kmer-size 10 \ + --kmer-size 25 \ + --min-dangling-branch-length 4 \ + --heterozygosity 0.0029 \ + --indel-heterozygosity 0.0017 \ + --min-assembly-region-size 100 \ + -L ~{interval_list} \ + -mbq 5 \ + -DF MappingQualityReadFilter \ + --base-quality-score-threshold 12 \ + -bamout ~{prefix}.bamout.bam + + >>> + + output { + File output_vcf = "~{output_file_name}" + File output_vcf_index = "~{output_file_name}.tbi" + File bamout = "~{prefix}.bamout.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/sr-malaria-niare-pipeline:0.0.1" + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +# This task is here because merging bamout files using Picard produces an error. +task MergeBamouts { + + input { + Array[File] bams + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = ceil(size(bams, "GiB") * 2) + 10 + + command <<< + + set -euxo pipefail + + # Make sure we use all our processors: + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + ithreads=${np} + + # If the number of processors = 1, then `let` will return 1 here: + # So we need to turn off `set -e` for this command: + set +e + let mthreads=${np}-1 + set -e + + samtools merge -@${mthreads} ~{prefix}.bam ~{sep=" " bams} + samtools index -@${ithreads} ~{prefix}.bam + mv ~{prefix}.bam.bai ~{prefix}.bai + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } + + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File output_bam = "~{prefix}.bam" + File output_bam_index = "~{prefix}.bai" + } +} + +task GenomicsDbImport { + meta { + author: "Jonn Smith" + notes: "Reproducing methods laid out by Niare et al. (https://doi.org/10.1186/s12936-023-04632-0) to perform testing." + } + + input { + File sample_name_map + + File interval_list + + File ref_fasta + File ref_fasta_fai + File ref_dict + + String prefix + + Int batch_size = 100 + + RuntimeAttr? 
runtime_attr_override + } + + Int ref_size = ceil(size([sample_name_map, interval_list, ref_fasta, ref_fasta_fai, ref_dict], "GB")) + + Int disk_size = 1 + 4*ref_size + + command <<< + set -euxo pipefail + # We need at least 1 GB of available memory outside of the Java heap in order to execute native code, thus, limit + # Java's memory by the total memory minus 1 GB. We need to compute the total memory as it might differ from + # memory_size_gb because of Cromwell's retry with more memory feature. + # Note: In the future this should be done using Cromwell's ${MEM_SIZE} and ${MEM_UNIT} environment variables, + # which do not rely on the output format of the `free` command. + + available_memory_mb=$(free -m | awk '/^Mem/ {print $2}') + let java_memory_size_mb=available_memory_mb-1024 + echo Total available memory: ${available_memory_mb} MB >&2 + echo Memory reserved for Java: ${java_memory_size_mb} MB >&2 + + gatk --java-options "-Xmx${java_memory_size_mb}m -Xms${java_memory_size_mb}m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + GenomicsDBImport \ + --sample-name-map ~{sample_name_map} \ + --genomicsdb-workspace-path ~{prefix}.genomicsDB \ + --batch-size ~{batch_size} \ + -L ~{interval_list} \ + --genomicsdb-segment-size 8048576 \ + --genomicsdb-vcf-buffer-size 160384 + + tar -cf ~{prefix}.genomicsDB.tar ~{prefix}.genomicsDB + >>> + + output { + File output_genomicsdb = "~{prefix}.genomicsDB.tar" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/sr-malaria-niare-pipeline:0.0.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, 
default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task GenotypeGVCFs { + meta { + author: "Jonn Smith" + notes: "Reproducing methods laid out by Niare et al. (https://doi.org/10.1186/s12936-023-04632-0) to perform testing. Mild adaptations due to infrastructure." + } + + input { + File input_gvcf_data + File? input_gvcf_index # Required if passing a VCF file. + + File interval_list + + File ref_fasta + File ref_fasta_fai + File ref_dict + + String prefix + + Int batch_size = 100 + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + input_gvcf_data: { help: "Either a single GVCF file or a GenomicsDB Tar file." } + interval_list: { + localization_optional: true + } + } + + Int ref_size = ceil(size([input_gvcf_data, input_gvcf_index, ref_fasta, ref_fasta_fai, ref_dict, interval_list], "GB")) + + Int disk_size = 1 + 4*ref_size + + command <<< + set -euxo pipefail + # We need at least 1 GB of available memory outside of the Java heap in order to execute native code, thus, limit + # Java's memory by the total memory minus 1 GB. We need to compute the total memory as it might differ from + # memory_size_gb because of Cromwell's retry with more memory feature. + # Note: In the future this should be done using Cromwell's ${MEM_SIZE} and ${MEM_UNIT} environment variables, + # which do not rely on the output format of the `free` command. + + # We must determine if our input variants are in a genomicsdb file or in a VCF. 
+ # The easiest way is to see if the input is a .tar file: + + is_genomics_db=true + filename=$(basename -- "~{input_gvcf_data}") + extension="${filename##*.}" + if [[ "${extension}" != "tar" ]] ; then + is_genomics_db=false + fi + + if $is_genomics_db ; then + tar -xf ~{input_gvcf_data} + INPUT_FILE="gendb://$(basename ~{input_gvcf_data} .tar)" + else + INPUT_FILE=~{input_gvcf_data} + fi + + available_memory_mb=$(free -m | awk '/^Mem/ {print $2}') + let java_memory_size_mb=available_memory_mb-1024 + echo Total available memory: ${available_memory_mb} MB >&2 + echo Memory reserved for Java: ${java_memory_size_mb} MB >&2 + + gatk --java-options "-Xmx${java_memory_size_mb}m -Xms${java_memory_size_mb}m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + GenotypeGVCFs \ + --genomicsdb-use-bcf-codec true \ + -R ~{ref_fasta} \ + -V ${INPUT_FILE} \ + -L ~{interval_list} \ + --max-genotype-count 1024 \ + -O ~{prefix}.vcf.gz \ + -stand-call-conf 30 + + >>> + + output { + File output_vcf = "~{prefix}.vcf.gz" + File output_vcf_index = "~{prefix}.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/sr-malaria-niare-pipeline:0.0.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task 
NormalizeVcfSplittingMultiallelics { + meta { + author: "Jonn Smith" + notes: "Reproducing methods laid out by Niare et al. (https://doi.org/10.1186/s12936-023-04632-0) to perform testing. Mild adaptations due to infrastructure." + } + + input { + File input_vcf + File input_vcf_index + + File ref_fasta + File ref_fasta_fai + File ref_dict + + String prefix + + Int batch_size = 100 + + RuntimeAttr? runtime_attr_override + } + + Int ref_size = ceil(size([input_vcf, input_vcf_index, ref_fasta, ref_fasta_fai, ref_dict], "GB")) + + Int disk_size = 1 + 4*ref_size + + command <<< + set -euxo pipefail + # We need at least 1 GB of available memory outside of the Java heap in order to execute native code, thus, limit + # Java's memory by the total memory minus 1 GB. We need to compute the total memory as it might differ from + # memory_size_gb because of Cromwell's retry with more memory feature. + # Note: In the future this should be done using Cromwell's ${MEM_SIZE} and ${MEM_UNIT} environment variables, + # which do not rely on the output format of the `free` command. 
+ + + bcftools norm -m-any ~{input_vcf} | \ + bcftools norm --check-ref -w -f ~{ref_fasta} | \ + bcftools annotate \ + -Ob \ + -x 'ID' \ + -I +'%CHROM:%POS:%POS:%REF:%ALT' | \ + bcftools view -i 'AC>0' -Oz -o ~{prefix}.vcf.gz + + tabix -p vcf ~{prefix}.vcf.gz + >>> + + output { + File output_vcf = "~{prefix}.vcf.gz" + File output_vcf_index = "~{prefix}.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/sr-malaria-niare-pipeline:0.0.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task VariantRecalibratorIndel { + meta { + author: "Jonn Smith" + notes: "Reproducing methods laid out by Niare et al. (https://doi.org/10.1186/s12936-023-04632-0) to perform testing. Mild adaptations due to infrastructure." + } + + input { + File input_vcf + File input_vcf_index + + File ref_fasta + File ref_fasta_fai + File ref_dict + + File sites_only_vcf + File sites_only_vcf_index + + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + Int ref_size = ceil(size([input_vcf, input_vcf_index, ref_fasta, ref_fasta_fai, ref_dict, sites_only_vcf, sites_only_vcf_index], "GB")) + + Int disk_size = 1 + 4*ref_size + + command <<< + set -euxo pipefail + # We need at least 1 GB of available memory outside of the Java heap in order to execute native code, thus, limit + # Java's memory by the total memory minus 1 GB. We need to compute the total memory as it might differ from + # memory_size_gb because of Cromwell's retry with more memory feature. + # Note: In the future this should be done using Cromwell's ${MEM_SIZE} and ${MEM_UNIT} environment variables, + # which do not rely on the output format of the `free` command. + + available_memory_mb=$(free -m | awk '/^Mem/ {print $2}') + let java_memory_size_mb=available_memory_mb-1024 + echo Total available memory: ${available_memory_mb} MB >&2 + echo Memory reserved for Java: ${java_memory_size_mb} MB >&2 + + gatk --java-options "-Xmx${java_memory_size_mb}m -Xms${java_memory_size_mb}m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + VariantRecalibrator \ + -R ~{ref_fasta} \ + -V ~{input_vcf} \ + --trust-all-polymorphic \ + -an QD -an DP -an FS -an SOR -an MQ \ + -mode INDEL \ + --max-gaussians 4 \ + -resource:Brown,known=true,training=true,truth=true,prior=15.0 ~{sites_only_vcf} \ + -O ~{prefix}.indel_recal \ + --output-model ~{prefix}.indel.model.report \ + --tranches-file ~{prefix}.raw.indel.tranches \ +# --rscript-file ~{prefix}.raw.indel.plots.R + + >>> + + output { + File recalibration = "~{prefix}.indel_recal" + File recalibration_index = "~{prefix}.indel_recal.idx" + File tranches = "~{prefix}.raw.indel.tranches" + File model_report = "~{prefix}.indel.model.report" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/sr-malaria-niare-pipeline:0.0.1" + } + 
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task VariantRecalibratorSnp { + meta { + author: "Jonn Smith" + notes: "Reproducing methods laid out by Niare et al. (https://doi.org/10.1186/s12936-023-04632-0) to perform testing. Mild adaptations due to infrastructure." + } + + input { + File input_vcf + File input_vcf_index + + File ref_fasta + File ref_fasta_fai + File ref_dict + + File sites_only_vcf + File sites_only_vcf_index + + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int ref_size = ceil(size([input_vcf, input_vcf_index, ref_fasta, ref_fasta_fai, ref_dict, sites_only_vcf, sites_only_vcf_index], "GB")) + + Int disk_size = 1 + 4*ref_size + + command <<< + set -euxo pipefail + # We need at least 1 GB of available memory outside of the Java heap in order to execute native code, thus, limit + # Java's memory by the total memory minus 1 GB. We need to compute the total memory as it might differ from + # memory_size_gb because of Cromwell's retry with more memory feature. + # Note: In the future this should be done using Cromwell's ${MEM_SIZE} and ${MEM_UNIT} environment variables, + # which do not rely on the output format of the `free` command. 
+ + available_memory_mb=$(free -m | awk '/^Mem/ {print $2}') + let java_memory_size_mb=available_memory_mb-1024 + echo Total available memory: ${available_memory_mb} MB >&2 + echo Memory reserved for Java: ${java_memory_size_mb} MB >&2 + + gatk --java-options "-Xmx${java_memory_size_mb}m -Xms${java_memory_size_mb}m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + VariantRecalibrator \ + -R ~{ref_fasta} \ + -V ~{input_vcf} \ + --trust-all-polymorphic \ + -an QD -an DP -an FS -an SOR -an MQ \ + -mode SNP \ + --max-gaussians 4 \ + -resource:Brown,known=true,training=true,truth=true,prior=15.0 ~{sites_only_vcf} \ + -O ~{prefix}.snp_recal \ + --tranches-file ~{prefix}.raw.snp.tranches \ + --output-model ~{prefix}.snp.model.report \ +# --rscript-file ~{prefix}.raw.snp.plots.R + + >>> + + output { + File recalibration = "~{prefix}.snp_recal" + File recalibration_index = "~{prefix}.snp_recal.idx" + File tranches = "~{prefix}.raw.snp.tranches" + File model_report = "~{prefix}.snp.model.report" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/sr-malaria-niare-pipeline:0.0.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ApplyVqsrIndel { + meta { + author: "Jonn Smith" + notes: 
"Reproducing methods laid out by Niare et al. (https://doi.org/10.1186/s12936-023-04632-0) to perform testing. Mild adaptations due to infrastructure." + } + + input { + File input_vcf + File input_vcf_index + + File recal_file + File recal_file_index + File recal_tranches + + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int ref_size = ceil(size(input_vcf, "GB") + size(recal_file, "GB") + size(recal_tranches, "GB")) + + Int disk_size = 1 + 4*ref_size + + command <<< + set -euxo pipefail + # We need at least 1 GB of available memory outside of the Java heap in order to execute native code, thus, limit + # Java's memory by the total memory minus 1 GB. We need to compute the total memory as it might differ from + # memory_size_gb because of Cromwell's retry with more memory feature. + # Note: In the future this should be done using Cromwell's ${MEM_SIZE} and ${MEM_UNIT} environment variables, + # which do not rely on the output format of the `free` command. + + available_memory_mb=$(free -m | awk '/^Mem/ {print $2}') + let java_memory_size_mb=available_memory_mb-1024 + echo Total available memory: ${available_memory_mb} MB >&2 + echo Memory reserved for Java: ${java_memory_size_mb} MB >&2 + + gatk --java-options "-Xmx${java_memory_size_mb}m -Xms${java_memory_size_mb}m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + ApplyVQSR \ + -V ~{input_vcf} \ + --recal-file ~{recal_file} \ + --tranches-file ~{recal_tranches} \ + --create-output-variant-index true \ + --lod-score-cutoff -2.0 \ + --exclude-filtered false \ + -mode INDEL \ + -O ~{prefix}.indel_recal.vcf.gz + + >>> + + output { + File output_vcf = "~{prefix}.indel_recal.vcf.gz" + File output_vcf_index = "~{prefix}.indel_recal.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/sr-malaria-niare-pipeline:0.0.1" + } + 
RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ApplyVqsrSnp { + meta { + author: "Jonn Smith" + notes: "Reproducing methods laid out by Niare et al. (https://doi.org/10.1186/s12936-023-04632-0) to perform testing. Mild adaptations due to infrastructure." + } + + input { + File input_vcf + File input_vcf_index + + File recal_file + File recal_file_index + File recal_tranches + + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int ref_size = ceil(size(input_vcf, "GB") + size(recal_file, "GB") + size(recal_tranches, "GB")) + + Int disk_size = 1 + 4*ref_size + + command <<< + set -euxo pipefail + # We need at least 1 GB of available memory outside of the Java heap in order to execute native code, thus, limit + # Java's memory by the total memory minus 1 GB. We need to compute the total memory as it might differ from + # memory_size_gb because of Cromwell's retry with more memory feature. + # Note: In the future this should be done using Cromwell's ${MEM_SIZE} and ${MEM_UNIT} environment variables, + # which do not rely on the output format of the `free` command. 
+ + available_memory_mb=$(free -m | awk '/^Mem/ {print $2}') + let java_memory_size_mb=available_memory_mb-1024 + echo Total available memory: ${available_memory_mb} MB >&2 + echo Memory reserved for Java: ${java_memory_size_mb} MB >&2 + + gatk --java-options "-Xmx${java_memory_size_mb}m -Xms${java_memory_size_mb}m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + ApplyVQSR \ + -V ~{input_vcf} \ + --recal-file ~{recal_file} \ + --tranches-file ~{recal_tranches} \ + --create-output-variant-index true \ + --lod-score-cutoff 0.0 \ + --exclude-filtered false \ + -mode SNP \ + -O ~{prefix}.snp_recal.vcf.gz + + >>> + + output { + File output_vcf = "~{prefix}.snp_recal.vcf.gz" + File output_vcf_index = "~{prefix}.snp_recal.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/sr-malaria-niare-pipeline:0.0.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MergeMultiAllelicSitesPostRecalibration { + meta { + author: "Jonn Smith" + notes: "Reproducing methods laid out by Niare et al. (https://doi.org/10.1186/s12936-023-04632-0) to perform testing. Mild adaptations due to infrastructure." 
+ } + + input { + File input_vcf + File input_vcf_index + + File ref_fasta + File ref_fasta_fai + File ref_dict + + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int ref_size = ceil(size([input_vcf, input_vcf_index, ref_fasta, ref_fasta_fai, ref_dict], "GB")) + + Int disk_size = 1 + 4*ref_size + + command <<< + set -euxo pipefail + # We need at least 1 GB of available memory outside of the Java heap in order to execute native code, thus, limit + # Java's memory by the total memory minus 1 GB. We need to compute the total memory as it might differ from + # memory_size_gb because of Cromwell's retry with more memory feature. + # Note: In the future this should be done using Cromwell's ${MEM_SIZE} and ${MEM_UNIT} environment variables, + # which do not rely on the output format of the `free` command. + + # Make sure we use all our processors: + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + available_memory_mb=$(free -m | awk '/^Mem/ {print $2}') + let java_memory_size_mb=available_memory_mb-1024 + echo Total available memory: ${available_memory_mb} MB >&2 + echo Memory reserved for Java: ${java_memory_size_mb} MB >&2 + + gatk --java-options "-Xmx${java_memory_size_mb}m -Xms${java_memory_size_mb}m -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10" \ + SelectVariants \ + -R ~{ref_fasta} \ + -V ~{input_vcf} \ + -O ~{prefix}.raw.recal.pass.vcf.gz \ + --exclude-filtered true + + bcftools norm \ + -m+any ~{prefix}.raw.recal.pass.vcf.gz \ + --check-ref -f ~{ref_fasta} \ + -Oz \ + -o ~{prefix}.pass.merged.vcf.gz \ + --threads ${np} + + tabix -p vcf ~{prefix}.pass.merged.vcf.gz + + >>> + + output { + File output_vcf = "~{prefix}.pass.merged.vcf.gz" + File output_vcf_index = "~{prefix}.pass.merged.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 0, + max_retries: 1, + docker: 
"us.gcr.io/broad-dsp-lrma/sr-malaria-niare-pipeline:0.0.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + diff --git a/wdl/tasks/RemoveSingleOrganismContamination.wdl b/wdl/tasks/RemoveSingleOrganismContamination.wdl new file mode 100644 index 000000000..1210cc68c --- /dev/null +++ b/wdl/tasks/RemoveSingleOrganismContamination.wdl @@ -0,0 +1,300 @@ +version 1.0 + +import "Structs.wdl" as Structs +import "SRUtils.wdl" as SRUTIL +import "Utils.wdl" as Utils +import "Finalize.wdl" as FF + +workflow RemoveSingleOrganismContamination { + meta { + author: "Jonn Smith" + description: "A workflow to remove contamination originating from a single organism from a dataset." + } + + input { + File? input_bam + File? input_bai + + File? fq_end1 + File? 
fq_end2 + + String SM + String LB + String platform = "illumina" + + String contaminant_ref_name + File contaminant_ref_map_file + + String dir_prefix + String gcs_out_root_dir + + Boolean DEBUG_MODE = false + } + + parameter_meta { + input_bam: "GCS path to unmapped bam" + input_bai: "GCS path to bai index for unmapped bam" + + fq_end1: "GCS path to end1 of paired-end fastq" + fq_end2: "GCS path to end2 of paired-end fastq" + + SM: "the value to place in the BAM read group's SM field" + LB: "the value to place in the BAM read group's LB (library) field" + platform: "[default valued] the value to place in the BAM read group's PL (platform) field (default: illumina)" + + contaminant_ref_name: "Name of the contaminant genome to be used in output files." + contaminant_ref_map_file: "Table indicating reference sequence and auxillary file locations." + + dir_prefix: "directory prefix for output files" + gcs_out_root_dir: "GCS bucket to store the reads, variants, and metrics files" + + DEBUG_MODE: "[default valued] enables debugging tasks / subworkflows (default: false)" + } + + # Call our timestamp so we can store outputs without clobbering previous runs: + call Utils.GetCurrentTimestampString as t_001_WdlExecutionStartTimestamp { input: } + + # Some basic error handling: + if (!defined(input_bam) && (!defined(fq_end1) || !defined(fq_end2))) { + call Utils.StopWorkflow as t_002_NoInputFileProvidedFailure { + input: reason = "No input file has been provided! You must provide either an input bam or input fastq1/fastq2 files." + } + } + if (defined(input_bam) && (defined(fq_end1) || defined(fq_end2))) { + call Utils.StopWorkflow as t_003_TooManyInputsProvidedFailure { + input: reason = "Too many inputs provided! You must provide EITHER an input bam OR input fastq1/fastq2 files." 
+ } + } + + # Create an outdir: + String outdir = if DEBUG_MODE then sub(gcs_out_root_dir, "/$", "") + "/RemoveSingleOrganismContamination/~{dir_prefix}/" + t_001_WdlExecutionStartTimestamp.timestamp_string else sub(gcs_out_root_dir, "/$", "") + "/RemoveSingleOrganismContamination/~{dir_prefix}" + + # Get ref info: + Map[String, String] ref_map = read_map(contaminant_ref_map_file) + + if (defined(input_bam)) { + # Convert the given bam to a uBAM (needed for previous aligned data): + call SRUTIL.RevertSam as t_004_RevertSam { + input: + input_bam = select_first([input_bam]), + prefix = SM + ".revertSam" + } + + # Convert input SAM/BAM to FASTQ: + call SRUTIL.BamToFq as t_005_Bam2Fastq { + input: + bam = t_004_RevertSam.bam, + prefix = SM + } + call Utils.GetRawReadGroup as t_006_GetRawReadGroup { input: gcs_bam_path = select_first([input_bam]) } + } + + File fq_e1 = select_first([fq_end1, t_005_Bam2Fastq.fq_end1]) + File fq_e2 = select_first([fq_end2, t_005_Bam2Fastq.fq_end2]) + + String RG = select_first([t_006_GetRawReadGroup.rg, "@RG\tID:" + SM + "_" + LB + "\tPL:" + platform + "\tLB:" + LB + "\tSM:" + SM]) + + # Align data to contaminant reference: + call SRUTIL.BwaMem2 as t_007_AlignReads { + input: + fq_end1 = fq_e1, + fq_end2 = fq_e2, + + ref_fasta = ref_map["fasta"], + ref_fasta_index = ref_map["fai"], + ref_dict = ref_map["dict"], + ref_0123 = ref_map["0123"], + ref_amb = ref_map["amb"], + ref_ann = ref_map["ann"], + ref_bwt = ref_map["bwt"], + ref_pac = ref_map["pac"], + + mark_short_splits_as_secondary = true, + + read_group = RG, + prefix = SM + ".contaminant_aligned." 
+ contaminant_ref_name, + + runtime_attr_override = object {mem_gb: 64} # Need a lot of ram to use BWA-MEM2 + } + + call ExtractReadsWithSamtools as t_008_ExtractDecontaminatedReads { + input: + bam = t_007_AlignReads.bam, + sam_flags = "256", + extra_args = " -f 12 ", + prefix = SM + ".decontaminated" + } + + call ExtractReadsWithSamtools as t_009_ExtractContaminatedReads { + input: + bam = t_007_AlignReads.bam, + sam_flags = "12", + prefix = SM + ".contaminated_" + contaminant_ref_name + "_reads" + } + + call SortBamWithoutIndexing as t_010_SortDecontaminatedReads { + input: + input_bam = t_008_ExtractDecontaminatedReads.output_bam, + extra_args = " -n ", + prefix = SM + ".decontaminated.sorted" + } + + call SortBamWithoutIndexing as t_011_SortContaminatedReads { + input: + input_bam = t_009_ExtractContaminatedReads.output_bam, + extra_args = " -n ", + prefix = SM + ".contaminated_" + contaminant_ref_name + "_reads.sorted" + } + + # Convert input SAM/BAM to FASTQ: + call SRUTIL.BamToFq as t_012_CreateFastqFromDecontaminatedReads { + input: + bam = t_010_SortDecontaminatedReads.sorted_bam, + prefix = SM + ".decontaminated" + } + + ############################################ + # _____ _ _ _ + # | ___(_)_ __ __ _| (_)_______ + # | |_ | | '_ \ / _` | | |_ / _ \ + # | _| | | | | | (_| | | |/ / __/ + # |_| |_|_| |_|\__,_|_|_/___\___| + # + ############################################ + + # Chosen because it's a relatively small file. 
+ File keyfile = t_012_CreateFastqFromDecontaminatedReads.fq_unpaired + + call FF.FinalizeToFile as t_013_FinalizeContaminatedBam { input: outdir = outdir, file = t_011_SortContaminatedReads.sorted_bam, keyfile = keyfile } + call FF.FinalizeToFile as t_014_FinalizeDecontaminatedFq1 { input: outdir = outdir, file = t_012_CreateFastqFromDecontaminatedReads.fq_end1, keyfile = keyfile } + call FF.FinalizeToFile as t_015_FinalizeDecontaminatedFq2 { input: outdir = outdir, file = t_012_CreateFastqFromDecontaminatedReads.fq_end2, keyfile = keyfile } + call FF.FinalizeToFile as t_016_FinalizeDecontaminatedUnpaired { input: outdir = outdir, file = t_012_CreateFastqFromDecontaminatedReads.fq_unpaired, keyfile = keyfile } + + ############################################ + # ___ _ _ + # / _ \ _ _| |_ _ __ _ _| |_ + # | | | | | | | __| '_ \| | | | __| + # | |_| | |_| | |_| |_) | |_| | |_ + # \___/ \__,_|\__| .__/ \__,_|\__| + # |_| + ############################################ + + output { + File contaminated_bam = t_013_FinalizeContaminatedBam.gcs_path + + File decontaminated_fq1 = t_014_FinalizeDecontaminatedFq1.gcs_path + File decontaminated_fq2 = t_015_FinalizeDecontaminatedFq2.gcs_path + File decontaminated_unpaired = t_016_FinalizeDecontaminatedUnpaired.gcs_path + } +} + +task ExtractReadsWithSamtools { + meta { + description : "Filter reads based on sam flags. Reads with ANY of the given flags will be removed from the given dataset. Does not sort or index the results." + author : "Jonn Smith" + email : "jonn@broadinstitute.org" + } + + input { + File bam + String sam_flags + + String extra_args = "" + + String prefix = "filtered_reads" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + bam: "BAM file to be filtered." + sam_flags: "Flags for which to remove reads. Reads with ANY of the given flags will be removed from the given dataset." + prefix : "[Optional] Prefix string to name the output file (Default: filtered_reads)." 
+ } + + Int disk_size = 20 + ceil(11 * size(bam, "GiB")) + + command <<< + + # Make sure we use all our proocesors: + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + samtools view -h -b -F ~{sam_flags} -@$np ~{extra_args} ~{bam} > ~{prefix}.bam + >>> + + output { + File output_bam = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-align:0.1.26" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SortBamWithoutIndexing { + input { + File input_bam + String prefix = "sorted" + + String? extra_args = "" + + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + input_bam: "input BAM" + prefix: "[default-valued] prefix for output BAM" + } + + Int disk_size = 10 + 10*ceil(size(input_bam, "GB")) + + command <<< + set -euxo pipefail + + num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) + + samtools sort ~{extra_args} -@$num_core -o ~{prefix}.bam ~{input_bam} + >>> + + output { + File sorted_bam = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/tasks/SGKit.wdl b/wdl/tasks/SGKit.wdl new file mode 100644 index 000000000..d62b635b6 --- /dev/null +++ b/wdl/tasks/SGKit.wdl @@ -0,0 +1,69 @@ +version 1.0 + +import "Structs.wdl" + +task ConvertToZarrStore { + meta { + description: "Convert a .vcf.bgz file to a Zarr store and copy it to a final gs:// URL." + } + + input { + File gvcf + File tbi + + String reference = "GRCh38" + String? ref_fasta + String? ref_fai + String prefix = "out" + + Int num_cpus = 4 + + String outdir + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 3*ceil(size(gvcf, "GB")) + + command <<< + set -x + + python3 <>> + + output { + String gcs_path = "~{outdir}/~{prefix}.zarr" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: num_cpus, + mem_gb: 16, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 0, + max_retries: 0, + docker: "us.gcr.io/broad-dsp-lrma/lr-sgkit:0.5.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/tasks/SRGermlineCNVs.wdl b/wdl/tasks/SRGermlineCNVs.wdl new file mode 100644 index 000000000..933879352 --- /dev/null +++ b/wdl/tasks/SRGermlineCNVs.wdl @@ -0,0 +1,6 @@ +version 1.0 + +################################################################################## +## A collection of tasks related to germline CNV calling using GATK4's GCNV tools. +################################################################################## + diff --git a/wdl/tasks/SRJointGenotyping.wdl b/wdl/tasks/SRJointGenotyping.wdl new file mode 100644 index 000000000..936b79a83 --- /dev/null +++ b/wdl/tasks/SRJointGenotyping.wdl @@ -0,0 +1,347 @@ +version 1.0 + +import "Structs.wdl" +import "Utils.wdl" as Utils + +task CreateSampleNameMap { + + meta { + description: "Creates the sample / name-map file of the GVCFs for ingest into ImportGVCFs. 
NOTE: Some of this functionality is duplicated from Utils.InferSampleName. This is intentional - we don't want to localize all these files or shard over potentially thousands of input GVCFs." + } + + input { + Array[File] gvcfs + String prefix + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + gvcfs: { + help: "Array of single-sample GVCF files.", + localization_optional: true + } + } + + Int disk_size_gb = 20 + + String outfile_name = "~{prefix}.sample_name_map.tsv" + String size_file_gb = "~{prefix}.total_gvcf_file_size.txt" + + # Every so often we should reauthorize so `bcftools` can continue to access our data: + Int re_auth_interval = 50 + + command <<< + set -euxo pipefail + + # Put our gvcfs into a file we can iterate over: + gvcf_file_list=~{write_lines(gvcfs)} + + # Initialize a file for the sample names: + [ -e ~{outfile_name} ] && rm -rf ~{outfile_name} + + # Set our access token: + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + + # Create a temporary file to store file sizes in: + size_file=$(mktemp) + + let i=1 + while read file_path ; do + + # Get our sample list from our file: + bcftools query -l ${file_path} > sample_names.txt + + # Make sure we only have one sample name: + [[ $(wc -l sample_names.txt | awk '{print $1}') -ne 1 ]] && echo "Incorrect number of sample names found in GVCF (there can be only one!): ${file_path}" && exit 1 + + # Make sure the samplename has an actual name: + [ $(grep -iq "unnamedsample" sample_names.txt) ] && echo "Sample name found to be unnamedsample in GVCF: ${file_path}" && exit 1 + + # Add the sample name and GVCF path to the sample name file: + echo -e "$(cat sample_names.txt)\t${file_path}" >> ~{outfile_name} + + # Add the file size to the size file: + gsutil du -sac ${file_path} | tail -n1 | awk '{print $1}' >> ${size_file} + + let i=$i+1 + if [[ $i -gt ~{re_auth_interval} ]] ; then + # Periodically we should update the token so we don't have problems with long file 
lists: + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + i=0 + fi + done < ${gvcf_file_list} + + # Now calculate the final file size in GB: + # We include an additional GB in case we have a very small dataset: + awk '{s += $1}END{print int(1+s/(1024*1024*1024))}' ${size_file} > ~{size_file_gb} + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size_gb, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File sample_name_map = outfile_name + Int total_gvcf_size_gb = read_int("~{size_file_gb}") + } +} + +task ImportGVCFs { + + input { + File sample_name_map + + File interval_list + + File ref_fasta + File ref_fasta_fai + File ref_dict + + String prefix + + Int batch_size = 50 + + RuntimeAttr? 
runtime_attr_override + } + + Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_fasta_fai, "GB") + size(ref_dict, "GB")) + + Int disk_size = 1 + 4*ref_size + + command <<< + set -euxo pipefail + + # Make sure that the output directory does not exist: + [ -e ~{prefix} ] && rm -rf ~{prefix} + + # + # Notes from WARP Team: + # + # We've seen some GenomicsDB performance regressions related to intervals, so we're going to pretend we only have a single interval + # using the --merge-input-intervals arg + # There's no data in between since we didn't run HaplotypeCaller over those loci so we're not wasting any compute + + # The memory setting here is very important and must be several GiB lower + # than the total memory allocated to the VM because this tool uses + # a significant amount of non-heap memory for native libraries. + # Also, testing has shown that the multithreaded reader initialization + # does not scale well beyond 5 threads, so don't increase beyond that. + gatk --java-options "-Xms8000m -Xmx25000m" \ + GenomicsDBImport \ + --genomicsdb-workspace-path ~{prefix}.genomicsDB \ + --batch-size ~{batch_size} \ + -L ~{interval_list} \ + --sample-name-map ~{sample_name_map} \ + --reader-threads 5 \ + --merge-input-intervals \ + --consolidate + + tar -cf ~{prefix}.genomicsDB.tar ~{prefix}.genomicsDB + >>> + + output { + File output_genomicsdb = "~{prefix}.genomicsDB.tar" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: 
select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ReblockGVCF { + + input { + File input_gvcf + File input_gvcf_index + + File ref_fasta + File ref_fasta_fai + File ref_dict + + String prefix + + Array[Int] gq_blocks = [20, 30, 40] + + String? annotations_to_keep_command + Float? tree_score_cutoff + + RuntimeAttr? runtime_attr_override + } + + Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_fasta_fai, "GB") + size(ref_dict, "GB")) + + Int disk_size = 1 + 4*ceil(size(input_gvcf, "GB")) + 4*ceil(size(input_gvcf_index, "GB")) + ref_size + + command <<< + set -euxo pipefail + + gatk --java-options "-Xms3000m -Xmx3000m" \ + ReblockGVCF \ + -R ~{ref_fasta} \ + -V ~{input_gvcf} \ + -do-qual-approx \ + --floor-blocks \ + -GQB ~{sep=" -GQB " gq_blocks} \ + ~{annotations_to_keep_command} \ + ~{"--tree-score-threshold-to-no-call " + tree_score_cutoff} \ + -O ~{prefix}.reblocked.g.vcf.gz + >>> + + output { + File output_gvcf = "~{prefix}.reblocked.g.vcf.gz" + File output_gvcf_index = "~{prefix}.reblocked.g.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 2, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: 
select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task GenotypeGVCFs { + + input { + File input_gvcf_data + File? input_gvcf_index # Required if passing a VCF file. + + File interval_list + + File ref_fasta + File ref_fasta_fai + File ref_dict + + String? dbsnp_vcf + + String prefix + + Boolean keep_combined_raw_annotations = false + RuntimeAttr? runtime_attr_override + } + + Int ref_size = ceil(size(ref_fasta, "GB") + size(ref_fasta_fai, "GB") + size(ref_dict, "GB")) + Int db_snp_size = ceil(size(dbsnp_vcf, "GB")) + + Int disk_size = 1 + 4*ceil(size(input_gvcf_data, "GB")) + ref_size + db_snp_size + + String dbsnp_vcf_arg = if defined(dbsnp_vcf) then "-D ~{dbsnp_vcf} " else "" + + parameter_meta { + input_gvcf_data: { help: "Either a single GVCF file or a GenomicsDB Tar file." } + interval_list: { + localization_optional: true + } + } + + command <<< + set -euxo pipefail + + # We must determine if our input variants are in a genomicsdb file or in a VCF. 
+ # The easiest way is to see if the input is a .tar file: + + is_genomics_db=true + filename=$(basename -- "~{input_gvcf_data}") + extension="${filename##*.}" + if [[ "${extension}" != "tar" ]] ; then + is_genomics_db=false + fi + + if $is_genomics_db ; then + tar -xf ~{input_gvcf_data} + INPUT_FILE="gendb://$(basename ~{input_gvcf_data} .tar)" + else + INPUT_FILE=~{input_gvcf_data} + fi + + gatk --java-options "-Xms8000m -Xmx25000m" \ + GenotypeGVCFs \ + -R ~{ref_fasta} \ + -O ~{prefix}.vcf.gz \ + ~{dbsnp_vcf_arg} \ + -G StandardAnnotation \ + --only-output-calls-starting-in-intervals \ + -V ${INPUT_FILE} \ + -L ~{interval_list} \ + ~{true='--keep-combined-raw-annotations' false='' keep_combined_raw_annotations} \ + --merge-input-intervals + + # Removed for now: + # -G AS_StandardAnnotation + >>> + + output { + File output_vcf = "~{prefix}.vcf.gz" + File output_vcf_index = "~{prefix}.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 26, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/tasks/SRUtils.wdl b/wdl/tasks/SRUtils.wdl new file mode 100644 index 000000000..2862aabbe --- /dev/null +++ b/wdl/tasks/SRUtils.wdl @@ -0,0 
+1,822 @@ +version 1.0 + +import "Structs.wdl" + +task BamToFq { + input { + File bam + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + samtools sort -n ~{bam} | samtools bam2fq \ + -n \ + -s /dev/null \ + -1 ~{prefix}.end1.fq.gz \ + -2 ~{prefix}.end2.fq.gz \ + -0 ~{prefix}.unpaired.fq.gz + >>> + + output { + File fq_end1 = "~{prefix}.end1.fq.gz" + File fq_end2 = "~{prefix}.end2.fq.gz" + File fq_unpaired = "~{prefix}.unpaired.fq.gz" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task FixMate { + input { + File input_bam + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(input_bam, "GB")) + + command <<< + set -euxo pipefail + + samtools fixmate ~{input_bam} ~{prefix}.bam + >>> + + output { + File bam = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 16, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-utils:0.1.8" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task Bam2FqPicard { + input { + File bam + String prefix = "out" + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(bam, "GB")) + + command <<< + set -euxo pipefail + + java -Xms8192m -Xmx30768m -jar /usr/picard/picard.jar \ + SamToFastq \ + INPUT=~{bam} \ + FASTQ=~{prefix}.fastq \ + INTERLEAVE=true \ + NON_PF=true + >>> + + output { + File fastq = "~{prefix}.fastq" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 16, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + +} + +task BwaMem2 { + input { + File fq_end1 + File fq_end2 + + File ref_fasta + File ref_fasta_index + File ref_dict + File ref_0123 + File ref_amb + File ref_ann + File ref_bwt + File ref_pac + + String? read_group + + String prefix = "out" + + Boolean mark_short_splits_as_secondary = false + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(fq_end1, "GB")) + + 4*ceil(size(fq_end2, "GB")) + + 4*ceil(size(ref_fasta, "GB")) + + 4*ceil(size(ref_fasta_index, "GB")) + + 4*ceil(size(ref_dict, "GB")) + + 4*ceil(size(ref_amb, "GB")) + + 4*ceil(size(ref_ann, "GB")) + + 4*ceil(size(ref_bwt, "GB")) + + 4*ceil(size(ref_pac, "GB")) + + 4*ceil(size(ref_0123, "GB")) + + String rg_arg = if defined(read_group) then " -R " else "" + + command <<< + set -euxo pipefail + + # Make sure we use all our proocesors: + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + if [[ ${np} -gt 2 ]] ; then + let np=${np}-1 + fi + + # Breakdown of the arguments: + # -K INT process INT input bases in each batch regardless of nThreads (for reproducibility) [] + # -v INT verbose level: 1=error, 2=warning, 3=message, 4+=debugging [3] + # -t INT number of threads [1] + # -Y use soft clipping for supplementary alignments + # -R STR read group header line such as '@RG\tID:foo\tSM:bar' [null] + # -M mark shorter split hits as secondary + + bwa-mem2 mem \ + -K 100000000 \ + -v 3 \ + -t ${np} \ + -Y \ + ~{rg_arg}'~{default="" sep=" -R " read_group}' \ + ~{true='-M' false="" mark_short_splits_as_secondary} \ + ~{ref_fasta} \ + ~{fq_end1} \ + ~{fq_end2} | \ + samtools view -1 - > ~{prefix}.bam + >>> + + output { + File bam = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 16, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/sr-utils:0.2.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, 
default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MergeBamAlignment { + input { + File aligned_bam + File unaligned_bam + + File ref_fasta + File ref_fasta_index + File ref_dict + + String prefix = "out" + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(aligned_bam, "GB")) + + 4*ceil(size(unaligned_bam, "GB")) + + 4*ceil(size(ref_fasta, "GB")) + + 4*ceil(size(ref_fasta_index, "GB")) + + 4*ceil(size(ref_dict, "GB")) + + command <<< + set -euxo pipefail + + # Make sure we use all our proocesors: + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + let np=${np}-1 + + java -Dsamjdk.compression_level=2 -Xms8192m -Xmx30768m -jar /usr/picard/picard.jar \ + MergeBamAlignment \ + VALIDATION_STRINGENCY=SILENT \ + EXPECTED_ORIENTATIONS=FR \ + ATTRIBUTES_TO_RETAIN=X0 \ + ATTRIBUTES_TO_REMOVE=NM \ + ATTRIBUTES_TO_REMOVE=MD \ + ALIGNED_BAM=~{aligned_bam} \ + UNMAPPED_BAM=~{unaligned_bam} \ + OUTPUT=~{prefix}.bam \ + REFERENCE_SEQUENCE=~{ref_fasta} \ + SORT_ORDER="unsorted" \ + IS_BISULFITE_SEQUENCE=false \ + ALIGNED_READS_ONLY=false \ + CLIP_ADAPTERS=false \ + MAX_RECORDS_IN_RAM=2000000 \ + ADD_MATE_CIGAR=true \ + MAX_INSERTIONS_OR_DELETIONS=-1 \ + PRIMARY_ALIGNMENT_STRATEGY=MostDistant \ + PROGRAM_RECORD_ID="bwa-mem2" \ + PROGRAM_GROUP_VERSION="2.2.1" \ + PROGRAM_GROUP_COMMAND_LINE="bwa-mem2 mem -K 100000000 -p -v 3 -t 15 -Y" \ + PROGRAM_GROUP_NAME="bwa-mem2" \ + UNMAPPED_READ_STRATEGY=COPY_TO_TAG \ + ALIGNER_PROPER_PAIR_FLAGS=true \ + UNMAP_CONTAMINANT_READS=true \ + ADD_PG_TAG_TO_READS=false + >>> + + output { + File bam = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 16, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + 
preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MarkDuplicates { + input { + File input_bam + + String prefix + + # The program default for READ_NAME_REGEX is appropriate in nearly every case. + # Sometimes we wish to supply "null" in order to turn off optical duplicate detection + # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing + String? read_name_regex + + Float? sorting_collection_size_ratio + + RuntimeAttr? runtime_attr_override + } + + Int compression_level = 2 + + Int disk_size = 1 + 4*ceil(size(input_bam, "GB")) + + # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly + # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment. 
+ # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" + + command <<< + tot_mem_mb=$(free -m | grep '^Mem' | awk '{print $2}') + let java_memory_size_mb=${tot_mem_mb}-5120 + + java -Dsamjdk.compression_level=~{compression_level} -Xms${java_memory_size_mb}m -jar /usr/picard/picard.jar \ + MarkDuplicates \ + INPUT=~{input_bam} \ + OUTPUT=~{prefix}.bam \ + METRICS_FILE=~{prefix}.metrics.txt \ + VALIDATION_STRINGENCY=SILENT \ + ~{"READ_NAME_REGEX=" + read_name_regex} \ + ~{"SORTING_COLLECTION_SIZE_RATIO=" + sorting_collection_size_ratio} \ + OPTICAL_DUPLICATE_PIXEL_DISTANCE=2500 \ + ASSUME_SORT_ORDER="queryname" \ + CLEAR_DT="false" \ + ADD_PG_TAG_TO_READS=false + >>> + + output { + File bam = "~{prefix}.bam" + File metrics = "~{prefix}.metrics.txt" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 16, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + + +# Generate Base Quality Score Recalibration (BQSR) model +task BaseRecalibrator { + input { + File input_bam + File input_bam_index + + File ref_dict + File ref_fasta + File ref_fasta_index + + File known_sites_vcf + File known_sites_index + + String 
prefix + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size(input_bam, "GB")) + + 4*ceil(size(input_bam_index, "GB")) + + 2*ceil(size(ref_dict, "GB")) + + 2*ceil(size(ref_fasta, "GB")) + + 2*ceil(size(ref_fasta_index, "GB")) + + 2*ceil(size(known_sites_vcf, "GB")) + + 2*ceil(size(known_sites_index, "GB")) + + parameter_meta { + input_bam: { + localization_optional: true + } + } + + command { + + gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal \ + -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps -XX:+PrintGCDetails \ + -Xloggc:gc_log.log -Xms5000m -Xmx5500m" \ + BaseRecalibrator \ + -R ~{ref_fasta} \ + -I ~{input_bam} \ + --use-original-qualities \ + -O ~{prefix}.txt \ + --known-sites ~{known_sites_vcf} + + } + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 16, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + output { + File recalibration_report = "~{prefix}.txt" + } +} + +task ApplyBQSR { + input { + File input_bam + File input_bam_index + + File ref_dict + File ref_fasta + File ref_fasta_index + + File recalibration_report + + Boolean bin_base_qualities = true + Boolean emit_original_quals = true + + String prefix + + 
RuntimeAttr? runtime_attr_override + } + + Int compression_level = 2 + Int java_memory_size_mb = 30768 + + parameter_meta { + input_bam: { + localization_optional: true + } + } + + Int disk_size = 1 + 4*ceil(size(input_bam, "GB")) + + 4*ceil(size(input_bam_index, "GB")) + + 2*ceil(size(ref_dict, "GB")) + + 2*ceil(size(ref_fasta, "GB")) + + 2*ceil(size(ref_fasta_index, "GB")) + + 2*ceil(size(recalibration_report, "GB")) + + command <<< + + gatk --java-options "-XX:+PrintFlagsFinal -XX:+PrintGCTimeStamps -XX:+PrintGCDateStamps \ + -XX:+PrintGCDetails -Xloggc:gc_log.log \ + -XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -Dsamjdk.compression_level=~{compression_level} -Xms8192m -Xmx~{java_memory_size_mb}m" \ + ApplyBQSR \ + --create-output-bam-md5 \ + --add-output-sam-program-record \ + -R ~{ref_fasta} \ + -I ~{input_bam} \ + --use-original-qualities \ + -O ~{prefix}.bam \ + -bqsr ~{recalibration_report} \ + --emit-original-quals ~{emit_original_quals} \ + ~{true='--static-quantized-quals 10' false='' bin_base_qualities} \ + ~{true='--static-quantized-quals 20' false='' bin_base_qualities} \ + ~{true='--static-quantized-quals 30' false='' bin_base_qualities} \ + + # Make sure we use all our proocesors: + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + samtools index -@${np} ~{prefix}.bam + >>> + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 16, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + 
preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + output { + File recalibrated_bam = "~{prefix}.bam" + File recalibrated_bai = "~{prefix}.bam.bai" + } +} + +task RevertSam { + input { + File input_bam + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int compression_level = 2 + Int java_memory_size_mb = 30768 + + Int disk_size = 1 + 20*ceil(size(input_bam, "GB")) + + # As documented on the GATK website: + # https://gatk.broadinstitute.org/hc/en-us/articles/4403687183515--How-to-Generate-an-unmapped-BAM-from-FASTQ-or-aligned-BAM + command { + + java -Dsamjdk.compression_level=~{compression_level} -Xms~{java_memory_size_mb}m -jar /usr/picard/picard.jar \ + RevertSam \ + INPUT=~{input_bam} \ + OUTPUT=~{prefix}.bam \ + SANITIZE=true \ + MAX_DISCARD_FRACTION=0.005 \ + ATTRIBUTE_TO_CLEAR=XT \ + ATTRIBUTE_TO_CLEAR=XN \ + ATTRIBUTE_TO_CLEAR=AS \ + ATTRIBUTE_TO_CLEAR=OC \ + ATTRIBUTE_TO_CLEAR=OP \ + SORT_ORDER=queryname \ + RESTORE_ORIGINAL_QUALITIES=true \ + REMOVE_DUPLICATE_INFORMATION=true \ + REMOVE_ALIGNMENT_INFORMATION=true + } + + output { + File bam = "~{prefix}.bam" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 16, + mem_gb: 32, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: 
select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ComputeBamStats { + input { + File bam_file + Int? qual_threshold + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 2*ceil(size(bam_file, "GB")) + String qual_thresh_arg = if defined(qual_threshold) then " -q " else "" + + String qual_stats_file_decoration = if defined(qual_threshold) then ".q" + qual_threshold else "" + + String stats_file_name = basename(bam_file, ".bam") + ".stats_map" + qual_stats_file_decoration + ".txt" + + command <<< + set -euxo pipefail + + python3 /python/compute_sr_stats.py \ + ~{qual_thresh_arg}~{default="" sep=" -q " qual_threshold} \ + ~{bam_file} \ + | tee ~{stats_file_name} + + >>> + + output { + Map[String, Float] results = read_map(stats_file_name) + File results_file = stats_file_name + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 16, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 2, + docker: "us.gcr.io/broad-dsp-lrma/sr-utils:0.2.1" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task MergeVCFs { + meta { + description: "Combine multiple VCFs or GVCFs from scattered 
HaplotypeCaller runs" + } + + input { + Array[File] input_vcfs + Array[File] input_vcfs_indexes + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = ceil(size(input_vcfs, "GiB") * 2.5) + 10 + + command <<< + java -Xms2000m -Xmx2500m -jar /usr/picard/picard.jar \ + MergeVcfs \ + INPUT=~{sep=' INPUT=' input_vcfs} \ + OUTPUT=~{prefix}.vcf.gz + >>> + + output { + File output_vcf = "~{prefix}.vcf.gz" + File output_vcf_index = "~{prefix}.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 3, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + + +task IndexFeatureFile { + meta { + description: "Create a Tribble index for a feature file using GATK. Feature files are defined inside GATK and include VCF, BED, GTF, and other files." + } + + input { + File feature_file + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = ceil(size(feature_file, "GiB") * 2) + 10 + + String fname = basename(feature_file) + + command <<< + mv ~{feature_file} ~{fname} + gatk --java-options "-Xmx1500m" \ + IndexFeatureFile \ + -I ~{fname} + >>> + + output { + File index = "~{fname}.idx" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + + +task RevertBaseQualities { + meta { + description: "Replace base qualities in the bam file with those located in the `OQ` tag. If `ApplyBQSR` has not been run on the given bam file, no changes are made and the original file is returned." + } + + input { + File bam + File? bai + + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = ceil(size(bam, "GiB") * 4) + 10 + + command <<< + set -euxo pipefail + + # Check if the input bam has been run through `ApplyBQSR`. + # If not, we can just return the input bam. + samtools view -H ~{bam} | grep '^@PG' > header.pg.txt + + grep -q 'ID:GATK ApplyBQSR' header.pg.txt > applybqsr.pg.txt + rv=$? 
+ + if [[ $rv -eq 0 ]] && grep -q '\-\-emit-original-quals' applybqsr.pg.txt ; then + # OK - our data has had it's base quality scores recalibrated. + # We must revert them: + gatk \ + RevertBaseQualityScores \ + -I ~{bam} \ + -O ~{prefix}.bam + else + # BQSR was not applied. Just copy input -> output + cp ~{bam} ~{prefix}.bam + if [[ ! -e '~{bai}' ]] ; then + samtools index ~{prefix}.bam + else + cp ~{bai} ~{prefix}.bam.bai + fi + fi + >>> + + output { + File bam_out = "~{prefix}.bam" + File bai_out = "~{prefix}.bam.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} \ No newline at end of file diff --git a/wdl/tasks/Utils.wdl b/wdl/tasks/Utils.wdl index 0b67ff67b..6fed2511b 100644 --- a/wdl/tasks/Utils.wdl +++ b/wdl/tasks/Utils.wdl @@ -162,6 +162,8 @@ task SortBam { File input_bam String prefix = "sorted" + String? extra_args = "" + RuntimeAttr? 
runtime_attr_override } @@ -170,12 +172,14 @@ task SortBam { prefix: "[default-valued] prefix for output BAM" } + Int disk_size = 10 + 10*ceil(size(input_bam, "GB")) + command <<< set -euxo pipefail num_core=$(cat /proc/cpuinfo | awk '/^processor/{print $3}' | wc -l) - samtools sort -@$num_core -o ~{prefix}.bam ~{input_bam} + samtools sort ~{extra_args} -@$num_core -o ~{prefix}.bam ~{input_bam} samtools index ~{prefix}.bam >>> @@ -188,7 +192,7 @@ task SortBam { RuntimeAttr default_attr = object { cpu_cores: 2, mem_gb: 4, - disk_gb: 10, + disk_gb: disk_size, boot_disk_gb: 10, preemptible_tries: 3, max_retries: 1, @@ -282,10 +286,21 @@ task MakeChrIntervalList { sed 's/[SL]N://g' | \ grep -v -e '^@HD' ~{true='-e' false='' length(filter) > 0} ~{sep=" -e " filter} | \ tee chrs.txt + + cat chrs.txt | awk '{printf("%s:%d-%d\n", $1,$2,$3)}' > intervalList.intervals + + # Now make another output - a set of individual contig interval list files: + while read line ; do + contig=$(echo "${line}" | awk '{print $1}') + echo "${line}" | awk '{printf("%s:%d-%d\n", $1,$2,$3)}' > contig.${contig}.intervals + done < chrs.txt >>> output { Array[Array[String]] chrs = read_tsv("chrs.txt") + File interval_list = "intervalList.intervals" + Array[String] contig_interval_strings = read_lines("intervalList.intervals") + Array[File] contig_interval_list_files = glob("contig.*.intervals") } ######################### @@ -651,7 +666,7 @@ task DownsampleSam { boot_disk_gb: 10, preemptible_tries: 2, max_retries: 1, - docker: "us.gcr.io/broad-gatk/gatk:4.2.0.0" + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" } RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) runtime { @@ -1204,7 +1219,7 @@ task Index { set -euxo pipefail mv ~{bam} ~{prefix} - samtools index ~{basename(prefix)} + samtools index -@ 2 ~{basename(prefix)} >>> output { @@ -1225,7 +1240,7 @@ task Index { runtime { cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) memory: 
select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " SSD" bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) @@ -2327,6 +2342,67 @@ task GetRawReadGroup { } } +task GetReadsInBedFileRegions { + meta { + desciption: "Get the reads from the given bam path which overlap the regions in the given bed file." + } + + input { + String gcs_bam_path + File regions_bed + + String prefix = "reads" + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + gcs_bam_path: "GCS URL to bam file from which to extract reads." + regions_bed: "Bed file containing regions for which to extract reads." + prefix: "[default-valued] prefix for output BAM" + runtime_attr_override: "Runtime attributes override struct." 
+ } + + Int disk_size = 2 * ceil(size([gcs_bam_path, regions_bed], "GB")) + + command <<< + set -x + export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token` + + # Make sure we use all our proocesors: + np=$(cat /proc/cpuinfo | grep ^processor | tail -n1 | awk '{print $NF+1}') + + samtools view -@${np} -b -h -L ~{regions_bed} ~{gcs_bam_path} | samtools sort - > ~{prefix}.bam + samtools index -@${np} ~{prefix}.bam + >>> + + output { + File bam = "~{prefix}.bam" + File bai = "~{prefix}.bai" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 4, + mem_gb: 16, + disk_gb: disk_size, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-pb:0.1.30" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + task FailWithWarning { input { String warning @@ -2425,3 +2501,113 @@ task MapToTsv { docker: "us.gcr.io/broad-dsp-lrma/lr-basic:0.1.1" } } + +task CreateIGVSession { + meta { + description: "Create an IGV session given a list of IGV compatible file paths. Adapted / borrowed from https://github.com/broadinstitute/palantir-workflows/blob/mg_benchmark_compare/BenchmarkVCFs ." + } + input { + Array[String] input_bams + Array[String] input_vcfs + String reference_short_name + String output_name + + RuntimeAttr? 
runtime_attr_override + } + + Array[String] input_files = flatten([input_bams, input_vcfs]) + + command { + bash /usr/writeIGV.sh ~{reference_short_name} ~{sep=" " input_files} > "~{output_name}.xml" + } + + output { + File igv_session = "${output_name}.xml" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 1, + disk_gb: 50, + boot_disk_gb: 10, + preemptible_tries: 3, + max_retries: 1, + docker: "quay.io/mduran/generate-igv-session_2:v1.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SplitContigToIntervals { + meta { + author: "Jonn Smith" + notes: "Splits the given contig into intervals of the given size." + } + + input { + File ref_dict + String contig + Int size = 200000 + + File ref_fasta + File ref_fasta_fai + File ref_dict + + String prefix + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 2 + + command <<< + set -euxo pipefail + + cat ~{ref_dict} | awk '{print $2,$3}' | grep '^SN' | sed -e 's@SN:@@' -e 's@LN:@@' | tr ' ' '\t' > genome.txt + grep "~{contig}" genome.txt > genome.contig.txt + + bedtools makewindows -g genome.contig.txt -w ~{size} > ~{contig}.~{size}bp_intervals.bed + + # Make individual bed files from each line: + while read line ; do + start=$(echo "${line}" | cut -d $'\t' -f 2) + end=$(echo "${line}" | cut -d $'\t' -f 3) + echo "${line}" > ~{contig}.${start}-${end}.single_interval.bed + done < ~{contig}.~{size}bp_intervals.bed + >>> + + output { + File full_bed_file = "~{contig}.~{size}bp_intervals.bed" + Array[File] individual_bed_files = glob("*.single_interval.bed") + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 2, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 0, + max_retries: 1, + docker: "us.gcr.io/broad-dsp-lrma/lr-metrics:0.1.11" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} diff --git a/wdl/tasks/VariantUtils.wdl b/wdl/tasks/VariantUtils.wdl index 25d3f8a8b..0f392a362 100644 --- a/wdl/tasks/VariantUtils.wdl +++ b/wdl/tasks/VariantUtils.wdl @@ -626,3 +626,1279 @@ task FixSnifflesVCF { docker: select_first([runtime_attr.docker, default_attr.docker]) } } + 
+######################################################################################################################## +######################################################################################################################## +######################################################################################################################## + +task HardFilterVcf { + + input { + File vcf + File vcf_index + + String prefix + + # From WARP: + # ExcessHet is a phred-scaled p-value. We want a cutoff of anything more extreme + # than a z-score of -4.5 which is a p-value of 3.4e-06, which phred-scaled is 54.69 + Float excess_het_threshold = 54.69 + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size([vcf, vcf_index], "GB")) + + command <<< + set -euo pipefail + + # Get amount of memory to use: + mem_available=$(free -m | grep '^Mem' | awk '{print $2}') + let mem_start=${mem_available}-1000 + let mem_max=${mem_available}-750 + + gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \ + VariantFiltration \ + --filter-expression "ExcessHet > ~{excess_het_threshold}" \ + --filter-name ExcessHet \ + -V ~{vcf} \ + -O ~{prefix}.hard_filtered.vcf.gz + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: 
select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File variant_filtered_vcf = "~{prefix}.hard_filtered.vcf.gz" + File variant_filtered_vcf_index = "~{prefix}.hard_filtered.vcf.gz.tbi" + } +} + +task MakeSitesOnlyVcf { + + input { + File vcf + File vcf_index + + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size([vcf, vcf_index], "GB")) + + command <<< + set -euo pipefail + + # Get amount of memory to use: + mem_available=$(free -m | grep '^Mem' | awk '{print $2}') + let mem_start=${mem_available}-1000 + let mem_max=${mem_available}-750 + + gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \ + MakeSitesOnlyVcf \ + -I ~{vcf} \ + -O ~{prefix}.sites_only.vcf.gz + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File sites_only_vcf = "~{prefix}.sites_only.vcf.gz" + File sites_only_vcf_index = "~{prefix}.sites_only.vcf.gz.tbi" + } +} + +task AnnotateVcfWithBedRegions { + input { + File vcf + File vcf_index + + Array[File] bed_files + Array[File] bed_file_indexes + 
Array[String] bed_file_annotation_names + + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size([vcf, vcf_index, bed_files, bed_file_indexes], "GB")) + + command <<< + set -euxo pipefail + + # Get amount of memory to use: + mem_available=$(free -m | grep '^Mem' | awk '{print $2}') + let mem_start=${mem_available}-1000 + let mem_max=${mem_available}-750 + + # We need to generate argument strings from the input arrays. + # First we check that the arrays are the same length: + if [[ ~{length(bed_files)} -ne ~{length(bed_file_indexes)} ]] || \ + [[ ~{length(bed_files)} -ne ~{length(bed_file_annotation_names)} ]] ; then + echo "ERROR: Not all input arrays for known variants contain the same number of elements: " 1>&2 + echo " bed_files = ~{length(bed_files)}" 1>&2 + echo " bed_file_indices = ~{length(bed_file_indexes)}" 1>&2 + echo " bed_file_annotation_names = ~{length(bed_file_annotation_names)}" 1>&2 + false + fi + + # Now we can write out the arrays into a TSV file and add them line by line to the execution: + # Create the TSV: + options_tsv=~{write_tsv(transpose([bed_files, bed_file_annotation_names]))} + + # Now we have to run `VariantFiltration` multiple times on its own output so that it can + # annotate each region in the file: + # NOTE: This is dumb, but must be done because the `--mask` and `--mask-name` inputs are not arrays. 
+ + input_vcf=~{vcf} + output_vcf=~{prefix}.intermediate.vcf.gz + while read mask_options ; do + + bed_file=$(echo "${mask_options}" | awk -F'\t' '{print $1}') + mask_name=$(echo "${mask_options}" | awk -F'\t' '{print $2}') + + echo -e "RUNNING GATK ON NEW MASK: ${mask_name}\t${bed_file}" + + gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \ + VariantFiltration \ + -V ${input_vcf} \ + -O ${output_vcf} \ + --mask ${bed_file} \ + --mask-name ${mask_name} + + mv ${output_vcf} ~{prefix}.new_input.vcf.gz + mv ${output_vcf}.tbi ~{prefix}.new_input.vcf.gz.tbi + input_vcf=~{prefix}.new_input.vcf.gz + done < ${options_tsv} + + # Because of the `mv` at the end of the loop we need to move the "new_input" files here: + mv ~{prefix}.new_input.vcf.gz ~{prefix}.vcf.gz + mv ~{prefix}.new_input.vcf.gz.tbi ~{prefix}.vcf.gz.tbi + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File annotated_vcf = "~{prefix}.vcf.gz" + File annotated_vcf_index = "~{prefix}.vcf.gz.tbi" + } +} + +task IndelsVariantRecalibrator { + + input { + Array[File] vcfs + Array[File] vcf_indices + + String prefix + + Array[String] 
recalibration_tranche_values + Array[String] recalibration_annotation_values + + Array[File] known_reference_variants + Array[File] known_reference_variants_index + Array[String] known_reference_variants_identifier + Array[Boolean] is_known + Array[Boolean] is_training + Array[Boolean] is_truth + Array[Float] prior + + Boolean use_allele_specific_annotations + Int max_gaussians = 4 + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + vcfs: "Sites only VCFs. Can be pre-filtered using hard-filters." + vcf_indices: "Tribble Indexes for sites only VCF." + known_reference_variants: "Array of known reference VCF files. For humans, dbSNP is one example." + known_reference_variants_index: "Array of index files for known reference VCF files." + known_reference_variants_identifier: "Array of boolean values the identifier / name for the known_reference_variant file at the same array position. Must be the same length as `known_reference_variants`." + is_known: "Array of boolean values indicating if the known_reference_variant file at the same array position contains known variants. Must be the same length as `known_reference_variants`." + is_training: "Array of boolean values indicating if the known_reference_variant file at the same array position contains training data. Must be the same length as `known_reference_variants`." + is_truth: "Array of boolean values indicating if the known_reference_variant file at the same array position contains truth data. Must be the same length as `known_reference_variants`." + prior: "Array of integer values indicating the priors for the known_reference_variant file at the same array position. Must be the same length as `known_reference_variants`." + } + + + Int disk_size = 10 + ceil(size(known_reference_variants, "GB")) + + 4*ceil(size(vcfs, "GB")) + + 2*ceil(size(vcf_indices, "GB")) + + command <<< + set -euxo pipefail + + # We need to generate resource strings from the input arrays. 
+ # First we check that the arrays are the same length: + if [[ ~{length(known_reference_variants)} -ne ~{length(known_reference_variants_identifier)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(known_reference_variants_index)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(is_known)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(is_training)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(is_truth)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(prior)} ]] ; then + echo "ERROR: Not all input arrays for known variants contain the same number of elements: " 1>&2 + echo " known_reference_variants = ~{length(known_reference_variants)}" 1>&2 + echo " known_reference_variants = ~{length(known_reference_variants_index)}" 1>&2 + echo " known_reference_variants_identifier = ~{length(known_reference_variants_identifier)}" 1>&2 + echo " is_known = ~{length(is_known)}" 1>&2 + echo " is_training = ~{length(is_training)}" 1>&2 + echo " is_truth = ~{length(is_truth)}" 1>&2 + echo " prior = ~{length(prior)}" 1>&2 + false + fi + + # Now we can write out the arrays into a TSV file and add them line by line to the execution: + # Create the TSV: + options_tsv=~{write_tsv(transpose([known_reference_variants_identifier, is_known, is_training, is_truth, prior, known_reference_variants]))} + + # Now read them into a string: + resource_flags=$(awk '{printf("--resource:%s,known=%s,training=%s,truth=%s,prior=%d %s ", $1, $2, $3, $4, $5, $6)}' ${options_tsv}) + + # Get amount of memory to use: + mem_available=$(free -g | grep '^Mem' | awk '{print $2}') + let mem_start=${mem_available}-2 + let mem_max=${mem_available}-1 + + gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \ + VariantRecalibrator \ + -V ~{sep=' -V ' vcfs} \ + -O ~{prefix}.recal \ + --tranches-file ~{prefix}.tranches \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' 
recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode INDEL \ + --output-model ~{prefix}.model.report \ + --max-gaussians ~{max_gaussians} \ + ${resource_flags} + >>> + + output { + File recalibration = "~{prefix}.recal" + File recalibration_index = "~{prefix}.recal.idx" + File tranches = "~{prefix}.tranches" + File model_report = "~{prefix}.model.report" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 26, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SNPsVariantRecalibratorCreateModel { + + input { + Array[File] vcfs + Array[File] vcf_indices + + String prefix + + Array[String] recalibration_tranche_values + Array[String] recalibration_annotation_values + + Array[File] known_reference_variants + Array[File] known_reference_variants_index + Array[String] known_reference_variants_identifier + Array[Boolean] is_known + Array[Boolean] is_training + Array[Boolean] is_truth + Array[Float] prior + + Int? downsampleFactor + + Boolean use_allele_specific_annotations + Int max_gaussians = 6 + + RuntimeAttr? runtime_attr_override + } + + parameter_meta { + vcfs: "Sites only VCFs. 
Can be pre-filtered using hard-filters." + vcf_indices: "Tribble Indexes for sites only VCF." + known_reference_variants: "Array of known reference VCF files. For humans, dbSNP is one example." + known_reference_variants_index: "Array of index files for known reference VCF files." + known_reference_variants_identifier: "Array of boolean values the identifier / name for the known_reference_variant file at the same array position. Must be the same length as `known_reference_variants`." + is_known: "Array of boolean values indicating if the known_reference_variant file at the same array position contains known variants. Must be the same length as `known_reference_variants`." + is_training: "Array of boolean values indicating if the known_reference_variant file at the same array position contains training data. Must be the same length as `known_reference_variants`." + is_truth: "Array of boolean values indicating if the known_reference_variant file at the same array position contains truth data. Must be the same length as `known_reference_variants`." + prior: "Array of integer values indicating the priors for the known_reference_variant file at the same array position. Must be the same length as `known_reference_variants`." + } + + Int disk_size = 10 + ceil(size(known_reference_variants, "GB")) + + 4*ceil(size(vcfs, "GB")) + + 2*ceil(size(vcf_indices, "GB")) + + String downsample_factor_arg = if defined(downsampleFactor) then " --sample-every-Nth-variant " else "" + + command <<< + set -euxo pipefail + + # We need to generate resource strings from the input arrays. 
+ # First we check that the arrays are the same length: + if [[ ~{length(known_reference_variants)} -ne ~{length(known_reference_variants_identifier)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(known_reference_variants_index)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(is_known)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(is_training)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(is_truth)} ]] || \ + [[ ~{length(known_reference_variants)} -ne ~{length(prior)} ]] ; then + echo "ERROR: Not all input arrays for known variants contain the same number of elements: " 1>&2 + echo " known_reference_variants = ~{length(known_reference_variants)}" 1>&2 + echo " known_reference_variants = ~{length(known_reference_variants_index)}" 1>&2 + echo " known_reference_variants_identifier = ~{length(known_reference_variants_identifier)}" 1>&2 + echo " is_known = ~{length(is_known)}" 1>&2 + echo " is_training = ~{length(is_training)}" 1>&2 + echo " is_truth = ~{length(is_truth)}" 1>&2 + echo " prior = ~{length(prior)}" 1>&2 + false + fi + + # Now we can write out the arrays into a TSV file and add them line by line to the execution: + # Create the TSV: + options_tsv=~{write_tsv(transpose([known_reference_variants_identifier, is_known, is_training, is_truth, prior, known_reference_variants]))} + + # Now read them into a string: + resource_flags=$(awk '{printf("--resource:%s,known=%s,training=%s,truth=%s,prior=%d %s ", $1, $2, $3, $4, $5, $6)}' ${options_tsv}) + + # Get amount of memory to use: + mem_available=$(free -g | grep '^Mem' | awk '{print $2}') + let mem_start=${mem_available}-2 + let mem_max=${mem_available}-1 + + gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \ + VariantRecalibrator \ + -V ~{sep=' -V ' vcfs} \ + -O ~{prefix}.recal \ + --tranches-file ~{prefix}.tranches \ + --trust-all-polymorphic \ + -tranche ~{sep=' -tranche ' recalibration_tranche_values} \ + -an ~{sep=' -an ' 
recalibration_annotation_values} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + -mode SNP \ + ~{downsample_factor_arg}~{default="" sep=" --sample-every-Nth-variant " downsampleFactor} \ + --output-model ~{prefix}.model.report \ + --max-gaussians ~{max_gaussians} \ + ${resource_flags} + >>> + + output { + File recalibration = "~{prefix}.recal" + File recalibration_index = "~{prefix}.recal.idx" + File tranches = "~{prefix}.tranches" + File model_report = "~{prefix}.model.report" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 2, + mem_gb: 64, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task ApplyVqsr { + + input { + File vcf + File vcf_index + + String prefix + + File snps_recalibration + File snps_recalibration_index + File snps_tranches + Float snp_filter_level + + File indels_recalibration + File indels_recalibration_index + File indels_tranches + Float indel_filter_level + + Boolean use_allele_specific_annotations + + RuntimeAttr? 
runtime_attr_override + } + + Int disk_size = 10 + ceil(size([vcf, vcf_index], "GB")) + + 2*ceil(size([snps_recalibration, snps_recalibration_index, snps_tranches], "GB")) + + 2*ceil(size([indels_recalibration, indels_recalibration_index, indels_tranches], "GB")) + + command <<< + set -euxo pipefail + + # Get amount of memory to use: + mem_available=$(free -m | grep '^Mem' | awk '{print $2}') + let mem_start=${mem_available}-2000 + let mem_max=${mem_available}-500 + + gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \ + ApplyVQSR \ + -V ~{vcf} \ + -O tmp.indel.recalibrated.vcf.gz \ + --recal-file ~{indels_recalibration} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + --tranches-file ~{indels_tranches} \ + --truth-sensitivity-filter-level ~{indel_filter_level} \ + --create-output-variant-index true \ + -mode INDEL + + gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \ + ApplyVQSR \ + -V tmp.indel.recalibrated.vcf.gz \ + -O ~{prefix}.recalibrated.vcf.gz \ + --recal-file ~{snps_recalibration} \ + ~{true='--use-allele-specific-annotations' false='' use_allele_specific_annotations} \ + --tranches-file ~{snps_tranches} \ + --truth-sensitivity-filter-level ~{snp_filter_level} \ + --create-output-variant-index true \ + -mode SNP + >>> + + output { + File recalibrated_vcf = "~{prefix}.recalibrated.vcf.gz" + File recalibrated_vcf_index = "~{prefix}.recalibrated.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 7, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, 
default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task SelectVariants { + + input { + File vcf + File vcf_index + + String prefix + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 10 + ceil(size([vcf, vcf_index], "GB")) + + command <<< + set -euxo pipefail + + # Get amount of memory to use: + mem_available=$(free -m | grep '^Mem' | awk '{print $2}') + let mem_start=${mem_available}-2000 + let mem_max=${mem_available}-500 + + gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \ + SelectVariants \ + --exclude-filtered \ + -V ~{vcf} \ + -O ~{prefix}.vcf.gz + >>> + + output { + File vcf_out = "~{prefix}.vcf.gz" + File vcf_out_index = "~{prefix}.vcf.gz.tbi" + } + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 7, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } +} + +task RenameSingleSampleVcf { + + input { + File vcf + File vcf_index + + String prefix + 
+ String new_sample_name + + Boolean is_gvcf = false + + RuntimeAttr? runtime_attr_override + } + + Int disk_size = 1 + 4*ceil(size([vcf, vcf_index], "GB")) + + String suffix = if is_gvcf then "g.vcf.gz" else "vcf.gz" + + command <<< + set -euo pipefail + + # Get amount of memory to use: + mem_available=$(free -m | grep '^Mem' | awk '{print $2}') + let mem_start=${mem_available}-1000 + let mem_max=${mem_available}-750 + + gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \ + RenameSampleInVcf \ + --NEW_SAMPLE_NAME ~{new_sample_name} \ + -I ~{vcf} \ + -O ~{prefix}.~{suffix} + + gatk --java-options "-Xms${mem_start}m -Xmx${mem_max}m" \ + IndexFeatureFile \ + -I ~{prefix}.~{suffix} \ + -O ~{prefix}.~{suffix}.tbi + >>> + + ######################### + RuntimeAttr default_attr = object { + cpu_cores: 1, + mem_gb: 4, + disk_gb: disk_size, + boot_disk_gb: 15, + preemptible_tries: 1, + max_retries: 1, + docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0" + } + RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) + runtime { + cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) + memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" + disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL" + bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) + preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) + maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) + docker: select_first([runtime_attr.docker, default_attr.docker]) + } + + output { + File new_sample_name_vcf = "~{prefix}.~{suffix}" + File new_sample_name_vcf_index = "~{prefix}.~{suffix}.tbi" + } +} + +task GatherVcfs { + + input { + Array[File] input_vcfs + Array[File] input_vcf_indices + String prefix + + RuntimeAttr? 
task GatherVcfs {

    meta {
        description: "Concatenates ordered, non-overlapping shard VCFs into a single VCF with GATK GatherVcfsCloud, then tabix-indexes the result."
    }

    input {
        Array[File] input_vcfs
        Array[File] input_vcf_indices
        String prefix

        RuntimeAttr? runtime_attr_override
    }

    Int disk_size = 10 + 3*ceil(size(input_vcfs, "GB")) + ceil(size(input_vcf_indices, "GB"))

    parameter_meta {
        # GatherVcfsCloud can stream directly from GCS, so the shards need not
        # be localized to the VM:
        input_vcfs: {
            localization_optional: true
        }
    }

    command <<<
        set -euxo pipefail

        # --ignore-safety-checks makes a big performance difference so we include it in our invocation.
        # This argument disables expensive checks that the file headers contain the same set of
        # genotyped samples and that files are in order by position of first record.
        gatk --java-options "-Xms6000m -Xmx6500m" \
            GatherVcfsCloud \
            --ignore-safety-checks \
            --input ~{sep=" --input " input_vcfs} \
            --output ~{prefix}.vcf.gz

        tabix -p vcf ~{prefix}.vcf.gz

        ls -la
    >>>

    output {
        File output_vcf = "~{prefix}.vcf.gz"
        File output_vcf_index = "~{prefix}.vcf.gz.tbi"
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: 2,
        mem_gb: 8,
        disk_gb: disk_size,
        boot_disk_gb: 10,
        preemptible_tries: 2,
        max_retries: 1,
        docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}
task ExtractFingerprint {

    meta {
        description: "Extracts a fingerprint VCF from the given BAM at the sites in the haplotype database (gatk ExtractFingerprint), then renders the fingerprint as a compact allele string (REF if the first PL entry is 0, else ALT, per site)."
    }

    input {
        File bam
        File bai

        File haplotype_database_file

        File ref_fasta
        File ref_index
        File ref_dict

        String prefix = "fingerprint"

        RuntimeAttr? runtime_attr_override
    }

    parameter_meta {
        bam: "BAM file from which to extract the fingerprint."
        bai: "Index for the given BAM file."
        haplotype_database_file: "Haplotype database file (Picard fingerprinting format) defining the fingerprint sites."
        ref_fasta: "Reference FASTA to which the BAM is aligned."
        ref_index: "FASTA index (.fai) for ref_fasta."
        ref_dict: "Sequence dictionary for ref_fasta."
        prefix: "Prefix of the output files."
    }

    Int disk_size = 10 + 2*ceil((size(bam, "GB")) + ceil(size(bai, "GB")) + ceil(size(ref_fasta, "GB")) + ceil(size(ref_index, "GB")) + ceil(size(ref_dict, "GB")))

    command <<<
        set -euxo pipefail

        # Extract the fingerprint with the haplotype file:
        gatk ExtractFingerprint \
            -H ~{haplotype_database_file} \
            -I ~{bam} \
            -R ~{ref_fasta} \
            -O ~{prefix}.vcf

        # Convert the fingerprint to a string.
        # BUGFIX: this previously read `tmp.vcf`, which is never created in this
        # task; it must read the VCF that ExtractFingerprint just wrote.
        # NOTE(review): assumes bcftools is available in the gatk image — confirm.
        bcftools query -f '%REF %ALT [%PL]\n' ~{prefix}.vcf | awk '{split($3,p,","); if (p[1]==0) {printf("%s",$1)} else {printf("%s",$2)}}' > ~{prefix}.string.txt
    >>>

    output {
        File output_vcf = "~{prefix}.vcf"
        # BUGFIX: read_string() yields the fingerprint allele string itself, not
        # a path, so this output must be a String (it was declared File).
        String fingerprint_string = read_string("~{prefix}.string.txt")
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: 2,
        mem_gb: 8,
        disk_gb: disk_size,
        boot_disk_gb: 10,
        preemptible_tries: 2,
        max_retries: 1,
        docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}
task ExtractFingerprintAndBarcode {

    meta {
        description: "Builds a barcode string from a VCF at the sites listed in a Picard-style haplotype database, writing the barcode-site variants to a fingerprint VCF. Sites absent from the VCF contribute the reference base; hets and indels/MNPs contribute 'N'."
    }

    input {
        File vcf
        File vcf_index

        File haplotype_database_file

        File ref_fasta
        File ref_fasta_fai
        File ref_dict

        String prefix = "fingerprint"

        RuntimeAttr? runtime_attr_override
    }

    parameter_meta {
        vcf: "VCF from which to extract the barcode."
        vcf_index: "Index for the given VCF file."
        haplotype_database_file: "Haplotype database file (Picard fingerprinting format) defining the barcode sites."
        ref_fasta: "Reference FASTA used to fill in reference alleles at sites absent from the VCF."
        ref_fasta_fai: "FASTA index (.fai) for ref_fasta."
        ref_dict: "Sequence dictionary for ref_fasta."
        prefix: "Prefix of the output files."
    }

    Int disk_size = 10 + ceil(size(vcf, "GB")) +
                    ceil(size(vcf_index, "GB")) +
                    ceil(size(haplotype_database_file, "GB")) +
                    ceil(size(ref_fasta, "GB")) +
                    ceil(size(ref_fasta_fai, "GB")) +
                    ceil(size(ref_dict, "GB"))

    command <<<
        set -euxo pipefail

python << CODE

import pysam

from collections import defaultdict
from tqdm import tqdm

def read_reference(reference_fasta):

    ref = dict()

    print(f"Ingesting FASTA reference: {reference_fasta}")
    with pysam.FastaFile(reference_fasta) as f:

        # Get all sequence names
        contigs = f.references

        for contig in contigs:
            sequence = f.fetch(contig)
            ref[contig] = sequence

    return ref


def extract_barcode(vcf_file, haplotype_database_file, ref_seq_dict, vcf_out_path):
    '''Extract a barcode from the given VCF file.
    Produces a barcode string as well as a VCF file with any variants from the
    barcode sites in the original file.

    Based on haplotype_database_files used in Picard fingerprinting.
    Example: gs://gcp-public-data--broad-references/hg38/v0/Homo_sapiens_assembly38.haplotype_database.txt
    '''

    # Read in the fingerprint file:
    barcode_site_contig_pos_dict = defaultdict(list)
    with open(haplotype_database_file, 'r') as f:
        for line in f:
            if line.startswith("@") or line.startswith("#"):
                continue
            fields = line.strip().split('\t')
            barcode_site_contig_pos_dict[fields[0]].append(int(fields[1]))

    barcode_alleles = []
    num_sites = sum([len(s) for s in barcode_site_contig_pos_dict.values()])
    num_found = 0

    with pysam.VariantFile(vcf_file, 'r') as vcf:
        with pysam.VariantFile(vcf_out_path, 'w', header=vcf.header) as vcf_out:
            with tqdm(desc="Extracting barcode variants", total=num_sites) as pbar:
                for contig, sites in barcode_site_contig_pos_dict.items():
                    for site in sites:
                        variants = vcf.fetch(contig=contig, start=site-1, stop=site)
                        found = False
                        for variant in variants:
                            # Should only get here if we are at the site.
                            found = True
                            gts = [s['GT'] for s in variant.samples.values()]
                            if len(gts) > 1:
                                # Too many genotypes, set N:
                                bca = "N"
                            elif gts[0] == (0, 0):
                                # Ref:
                                bca = ref_seq_dict[contig][site-1]
                            elif gts[0] == (0, 1) or gts[0] == (1, 0):
                                # Het -> multi-infection -> N
                                bca = "N"
                            # Must be Hom VAR
                            elif len(variant.alts[0]) != 1:
                                # INDEL / MNP -> N
                                bca = "N"
                            else:
                                bca = variant.alts[0]

                            barcode_alleles.append(bca)

                            # in any event we have found our variant and we can stop:
                            break

                        num_found += found
                        if found:
                            # Write the variant to the output file:
                            vcf_out.write(variant)
                        else:
                            # We need to pull from the reference for this site.
                            # Add 1 for genomic coordinates:
                            bca = ref_seq_dict[contig][site-1]

                            # TODO: it is possible that we should instead add an X here (https://doi.org/10.1093%2Fpnasnexus%2Fpgac187). Double-check with Wes.

                            # Add our barcode allele:
                            barcode_alleles.append(bca)

                        pbar.update(1)

    return "".join(barcode_alleles)



# Read the reference file:
ref = read_reference("~{ref_fasta}")

# Calculate the barcode info:
barcode = extract_barcode("~{vcf}", "~{haplotype_database_file}", ref, "~{prefix}.fingerprint.vcf")

print(f"Extracted barcode: {barcode}")

# Write the barcode string to a file:
with open("~{prefix}.barcode.txt", 'w') as f:
    f.write(f"{barcode}\n")

CODE
    >>>

    output {
        File output_vcf = "~{prefix}.fingerprint.vcf"
        String barcode = read_string("~{prefix}.barcode.txt")
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: 2,
        mem_gb: 8,
        disk_gb: disk_size,
        boot_disk_gb: 10,
        preemptible_tries: 2,
        max_retries: 1,
        docker: "us.gcr.io/broad-dsp-lrma/sr-utils:0.2.1"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}
task ExtractVariantAnnotations {

    meta {
        description: "Runs GATK ExtractVariantAnnotations: extracts site-level annotations for labeled (training/calibration) resource sites, producing the HDF5 annotation files consumed by TrainVariantAnnotationsModel."
    }

    input {
        File vcf
        File vcf_index

        String prefix

        String mode

        Array[String] recalibration_annotation_values

        Array[File] known_reference_variants
        Array[File] known_reference_variants_index
        Array[String] known_reference_variants_identifier
        Array[Boolean] is_training
        Array[Boolean] is_calibration

        Int max_unlabeled_variants = 0

        RuntimeAttr? runtime_attr_override
    }

    parameter_meta {
        vcf: "VCF File from which to extract annotations."
        vcf_index: "Index for the given VCF file."
        prefix: "Prefix of the output files."
        mode: "SNP or INDEL"
        known_reference_variants: "Array of known reference VCF files. For humans, dbSNP is one example."
        known_reference_variants_index: "Array of index files for known reference VCF files."
        known_reference_variants_identifier: "Identifier / name for the known_reference_variant file at the same array position. Must be the same length as `known_reference_variants`."
        is_training: "Array of boolean values indicating if the known_reference_variant file at the same array position should be used for 'training' data. Must be the same length as `known_reference_variants`."
        is_calibration: "Array of boolean values indicating if the known_reference_variant file at the same array position should be used for 'calibration' data. Must be the same length as `known_reference_variants`."
        max_unlabeled_variants: "How many sites should be used for unlabeled training data. Setting this to values > 0 will enable a positive-negative training model."
    }

    Int disk_size = 10 + ceil(size(known_reference_variants, "GB")) +
                    4*ceil(size(vcf, "GB")) +
                    2*ceil(size(vcf_index, "GB")) +
                    2*ceil(size(known_reference_variants_index, "GB"))

    command <<<
        set -euxo pipefail

        # We need to generate resource strings from the input arrays.
        # First we check that the arrays are the same length:
        if [[ ~{length(known_reference_variants)} -ne ~{length(known_reference_variants_identifier)} ]] || \
           [[ ~{length(known_reference_variants)} -ne ~{length(known_reference_variants_index)} ]] || \
           [[ ~{length(known_reference_variants)} -ne ~{length(is_training)} ]] || \
           [[ ~{length(known_reference_variants)} -ne ~{length(is_calibration)} ]] ; then
            echo "ERROR: Not all input arrays for known variants contain the same number of elements: " 1>&2
            echo "       known_reference_variants            = ~{length(known_reference_variants)}" 1>&2
            # BUGFIX: this diagnostic line previously repeated the label
            # `known_reference_variants` for the index-array count:
            echo "       known_reference_variants_index      = ~{length(known_reference_variants_index)}" 1>&2
            echo "       known_reference_variants_identifier = ~{length(known_reference_variants_identifier)}" 1>&2
            echo "       is_training                         = ~{length(is_training)}" 1>&2
            echo "       is_calibration                      = ~{length(is_calibration)}" 1>&2
            false
        fi

        # Now we can write out the arrays into a TSV file and add them line by line to the execution:
        # Create the TSV:
        options_tsv=~{write_tsv(transpose([known_reference_variants_identifier, is_training, is_calibration, known_reference_variants]))}

        # Now read them into a string:
        resource_flags=$(awk '{printf("--resource:%s,training=%s,calibration=%s %s ", $1, $2, $3, $4)}' ${options_tsv})

        # Get amount of memory to use:
        mem_available=$(free -g | grep '^Mem' | awk '{print $2}')
        let mem_start=${mem_available}-2
        let mem_max=${mem_available}-2

        gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \
            ExtractVariantAnnotations \
            --verbosity DEBUG \
            -V ~{vcf} \
            -A ~{sep=' -A ' recalibration_annotation_values} \
            --mode ~{mode} \
            --maximum-number-of-unlabeled-variants ~{max_unlabeled_variants} \
            ${resource_flags} \
            -O ~{prefix}_extracted_annotations_~{mode}
    >>>

    output {
        File annotation_hdf5 = "~{prefix}_extracted_annotations_~{mode}.annot.hdf5"
        File sites_only_vcf = "~{prefix}_extracted_annotations_~{mode}.vcf.gz"
        File sites_only_vcf_index = "~{prefix}_extracted_annotations_~{mode}.vcf.gz.tbi"

        # Only produced when max_unlabeled_variants > 0:
        File? unlabeled_annotation_hdf5 = "~{prefix}_extracted_annotations_~{mode}.unlabeled.annot.hdf5"
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: 2,
        mem_gb: 26,
        disk_gb: disk_size,
        boot_disk_gb: 15,
        preemptible_tries: 1,
        max_retries: 1,
        docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}
task TrainVariantAnnotationsModel {

    meta {
        description: "Runs GATK TrainVariantAnnotationsModel on a labeled-annotations HDF5 file (optionally with unlabeled annotations to enable the positive-negative model)."
    }

    input {
        File annotation_hdf5
        String mode
        String prefix

        File? unlabeled_annotation_hdf5
        Float calibration_sensitivity_threshold = 0.95

        RuntimeAttr? runtime_attr_override
    }

    parameter_meta {
        annotation_hdf5: "Labeled-annotations HDF5 file."
        mode: "SNP or INDEL"
        prefix: "Prefix of the output files."
        unlabeled_annotation_hdf5: "Unlabeled-annotations HDF5 file (optional)"
        calibration_sensitivity_threshold: "Calibration-set sensitivity threshold. (optional)"
    }

    Int disk_size = 10 + 4*ceil(size(annotation_hdf5, "GB")) +
                    4*ceil(size(unlabeled_annotation_hdf5, "GB"))

    # The sensitivity threshold only applies when unlabeled annotations are
    # supplied (i.e. when training a negative model as well):
    String cal_sense_arg = if defined(unlabeled_annotation_hdf5) then " --calibration-sensitivity-threshold ~{calibration_sensitivity_threshold}" else ""

    # Needed for output. I think there's a bug in the output naming for this tool.
    String mode_lower = if mode == "SNP" then "snp" else "indel"

    command <<<
        set -euxo pipefail

        # Get amount of memory to use:
        mem_available=$(free -g | grep '^Mem' | awk '{print $2}')
        let mem_start=${mem_available}-2
        let mem_max=${mem_available}-2

        gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \
            TrainVariantAnnotationsModel \
            --verbosity DEBUG \
            --annotations-hdf5 ~{annotation_hdf5} \
            --mode ~{mode} \
            ~{"--unlabeled-annotations-hdf5 " + unlabeled_annotation_hdf5} \
            ~{cal_sense_arg} \
            -O ~{prefix}_train_~{mode}
    >>>

    output {
        File training_scores = "~{prefix}_train_~{mode}.~{mode_lower}.trainingScores.hdf5"
        File positive_model_scorer_pickle = "~{prefix}_train_~{mode}.~{mode_lower}.scorer.pkl"

        # Only produced when an unlabeled-annotations file is supplied:
        File? unlabeled_positive_model_scores = "~{prefix}_train_~{mode}.~{mode_lower}.unlabeledScores.hdf5"
        File? calibration_set_scores = "~{prefix}_train_~{mode}.~{mode_lower}.calibrationScores.hdf5"
        File? negative_model_scorer_pickle = "~{prefix}_train_~{mode}.~{mode_lower}.negative.scorer.pkl"
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: 2,
        mem_gb: 26,
        disk_gb: disk_size,
        boot_disk_gb: 15,
        preemptible_tries: 1,
        max_retries: 1,
        docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}
task ScoreVariantAnnotations {

    meta {
        description: "Runs GATK ScoreVariantAnnotations: scores and filters the input VCF using a previously trained annotations model plus the labeled resources used during extraction."
    }

    input {
        File vcf
        File vcf_index

        File sites_only_extracted_vcf
        File sites_only_extracted_vcf_index

        String model_prefix
        Array[File] model_files

        String prefix

        String mode

        Float calibration_sensitivity_threshold = 0.99

        Array[String] recalibration_annotation_values

        Array[File] known_reference_variants
        Array[File] known_reference_variants_index
        Array[String] known_reference_variants_identifier
        Array[Boolean] is_training
        Array[Boolean] is_calibration

        RuntimeAttr? runtime_attr_override
    }

    parameter_meta {
        vcf: "VCF File from which to extract annotations."
        vcf_index: "Index for the given VCF file."
        prefix: "Prefix of the output files."
        mode: "SNP or INDEL"
        known_reference_variants: "Array of known reference VCF files. For humans, dbSNP is one example."
        known_reference_variants_index: "Array of index files for known reference VCF files."
        known_reference_variants_identifier: "Identifier / name for the known_reference_variant file at the same array position. Must be the same length as `known_reference_variants`."
        is_training: "Array of boolean values indicating if the known_reference_variant file at the same array position should be used for 'training' data. Must be the same length as `known_reference_variants`."
        is_calibration: "Array of boolean values indicating if the known_reference_variant file at the same array position should be used for 'calibration' data. Must be the same length as `known_reference_variants`."
    }

    Int disk_size = 10 + 4*ceil(size(vcf, "GB")) +
                    2*ceil(size(vcf_index, "GB")) +
                    2*ceil(size(sites_only_extracted_vcf, "GB")) +
                    2*ceil(size(sites_only_extracted_vcf_index, "GB")) +
                    2*ceil(size(known_reference_variants, "GB")) +
                    2*ceil(size(known_reference_variants_index, "GB")) +
                    2*ceil(size(model_files, "GB"))

    command <<<
        set -euxo pipefail

        # We need to generate resource strings from the input arrays.
        # First we check that the arrays are the same length:
        if [[ ~{length(known_reference_variants)} -ne ~{length(known_reference_variants_identifier)} ]] || \
           [[ ~{length(known_reference_variants)} -ne ~{length(known_reference_variants_index)} ]] || \
           [[ ~{length(known_reference_variants)} -ne ~{length(is_training)} ]] || \
           [[ ~{length(known_reference_variants)} -ne ~{length(is_calibration)} ]] ; then
            echo "ERROR: Not all input arrays for known variants contain the same number of elements: " 1>&2
            echo "       known_reference_variants            = ~{length(known_reference_variants)}" 1>&2
            # BUGFIX: this diagnostic line previously repeated the label
            # `known_reference_variants` for the index-array count:
            echo "       known_reference_variants_index      = ~{length(known_reference_variants_index)}" 1>&2
            echo "       known_reference_variants_identifier = ~{length(known_reference_variants_identifier)}" 1>&2
            echo "       is_training                         = ~{length(is_training)}" 1>&2
            echo "       is_calibration                      = ~{length(is_calibration)}" 1>&2
            false
        fi

        # Now we can write out the arrays into a TSV file and add them line by line to the execution:
        # Create the TSV:
        options_tsv=~{write_tsv(transpose([known_reference_variants_identifier, is_training, is_calibration, known_reference_variants]))}

        # Now read them into a string:
        resource_flags=$(awk '{printf("--resource:%s,training=%s,calibration=%s %s ", $1, $2, $3, $4)}' ${options_tsv})

        # Get amount of memory to use:
        mem_available=$(free -g | grep '^Mem' | awk '{print $2}')
        let mem_start=${mem_available}-2
        let mem_max=${mem_available}-2

        # The calibration-sensitivity flag is mode-specific (e.g. --snp-calibration-sensitivity-threshold):
        mode_lower=$(echo ~{mode} | tr 'A-Z' 'a-z')

        # Set up model files: the tool expects all model files to share a common
        # prefix in one directory, so symlink each delocalized file into place.
        mkdir model_files
        ln -s ~{sep=" model_files && ln -s " model_files} model_files

        # Debugging:
        find .

        gatk --java-options "-Xms${mem_start}g -Xmx${mem_max}g" \
            ScoreVariantAnnotations \
            --verbosity DEBUG \
            -V ~{vcf} \
            -A ~{sep=' -A ' recalibration_annotation_values} \
            --mode ~{mode} \
            --model-prefix model_files/~{model_prefix} \
            ${resource_flags} \
            --resource:extracted,extracted=true ~{sites_only_extracted_vcf} \
            --${mode_lower}-calibration-sensitivity-threshold ~{calibration_sensitivity_threshold} \
            -O ~{prefix}_scored
    >>>

    output {
        File scored_vcf = "~{prefix}_scored.vcf.gz"
        File scored_vcf_index = "~{prefix}_scored.vcf.gz.tbi"

        # Only produced when the input contains sites to score:
        File? annotations_hdf5 = "~{prefix}_scored.annot.hdf5"
        File? scores_hdf5 = "~{prefix}_scored.scores.hdf5"
    }

    #########################
    RuntimeAttr default_attr = object {
        cpu_cores: 2,
        mem_gb: 26,
        disk_gb: disk_size,
        boot_disk_gb: 15,
        preemptible_tries: 1,
        max_retries: 1,
        docker: "us.gcr.io/broad-gatk/gatk:4.3.0.0"
    }
    RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])
    runtime {
        cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores])
        memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB"
        disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " LOCAL"
        bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb])
        preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries])
        maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries])
        docker: select_first([runtime_attr.docker, default_attr.docker])
    }
}