Skip to content

Commit 5cc9b1d

Browse files
authored
Add a WDL for GangSTR (#218)
* Draft a WDL for GangSTR. * Simplify joining array items by `,`. * Install GangSTR via Bioconda. * Add a short doc about the workflow & a comment. * Update output, remove a comment & unused struct. * remove unused sets & pinned TRTools version & image clean up. * Use the current latest version of samtools. * Draft splitting the WDL in two: GangSTR & GangSTRScatter. * Bug fixes in GangSTR.wdl * Bug fixes in GangSTRScatter.wdl. * add str_ prefix to docker variables in GangSTR WDLs. * Refactor for clarity & add args documentation. * Add user-defined prefix for GangSTR output.
1 parent f7a5544 commit 5cc9b1d

File tree

3 files changed

+216
-25
lines changed

3 files changed

+216
-25
lines changed

dockerfiles/str/Dockerfile

Lines changed: 44 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,16 @@
55
# - ExpansionHunter
66

77
FROM ubuntu:20.04
8+
ARG DEBIAN_FRONTEND=noninteractive
9+
ENV PATH="/root/miniconda3/bin:${PATH}"
10+
ARG CONDA_RELEASE="4.6.14"
11+
ARG CONDA_INSTALL_DIR="/opt/conda"
12+
ARG CONDA_BIN=${CONDA_INSTALL_DIR}/bin
13+
ARG CONDA_CMD=${CONDA_BIN}/conda
14+
ENV PATH=${CONDA_BIN}:$PATH
15+
ENV EH_VERSION=v4.0.2
816

9-
RUN apt-get update && DEBIAN_FRONTEND="noninteractive" apt-get install --no-install-recommends -qqy \
17+
RUN apt-get update && apt-get install --no-install-recommends -qqy \
1018
python3-dev \
1119
python3-pip \
1220
python \
@@ -20,16 +28,24 @@ RUN apt-get update && DEBIAN_FRONTEND="noninteractive" apt-get install --no-inst
2028
pkg-config \
2129
wget \
2230
unzip \
23-
zlib1g-dev
31+
zlib1g-dev \
32+
libcurl4-openssl-dev \
33+
libssl-dev
34+
35+
# Install and configure conda
36+
RUN wget --quiet https://repo.anaconda.com/miniconda/Miniconda3-${CONDA_RELEASE}-Linux-x86_64.sh -O /tmp/miniconda.sh && \
37+
bash /tmp/miniconda.sh -b -p ${CONDA_INSTALL_DIR} && \
38+
${CONDA_CMD} init bash && \
39+
bash -c "source ${HOME}/.bashrc"
2440

25-
RUN pip3 install pybedtools==0.8.2 pyvcf==0.6.8 scipy==1.7.1 numpy==1.21.1
41+
RUN pip install Cython==0.29.24 numpy==1.21.2 pybedtools==0.8.2 scipy==1.7.1
2642

2743
# Install samtools (needed to index reference fasta files)
28-
RUN wget -O samtools-1.9.tar.bz2 https://github.com/samtools/samtools/releases/download/1.9/samtools-1.9.tar.bz2 \
29-
&& tar -xjf samtools-1.9.tar.bz2 \
30-
&& cd samtools-1.9 \
31-
&& ./configure --without-curses && make && make install \
32-
&& cd ..
44+
RUN wget -O samtools-1.13.tar.bz2 https://github.com/samtools/samtools/releases/download/1.13/samtools-1.13.tar.bz2 && \
45+
tar -xjf samtools-1.13.tar.bz2 && \
46+
cd samtools-1.13 && \
47+
./configure --without-curses && make && make install && \
48+
cd ..
3349

3450
# Install bedtools (needed for DumpSTR)
3551
## Option 1: install from source
@@ -41,23 +57,26 @@ WORKDIR ..
4157
## Option 2: install from apt
4258
#RUN apt-get install bedtools
4359

44-
# Download, compile, and install GangSTR
45-
RUN wget -O GangSTR-2.4.tar.gz https://github.com/gymreklab/GangSTR/releases/download/v2.4/GangSTR-2.4.tar.gz \
46-
&& tar -xzvf GangSTR-2.4.tar.gz \
47-
&& cd GangSTR-2.4 \
48-
&& ./install-gangstr.sh \
49-
&& ldconfig \
50-
&& cd ..
51-
5260
# Download and install TRTools
53-
RUN git clone https://github.com/gymreklab/TRTools \
54-
&& cd TRTools \
55-
&& python3 setup.py install \
56-
&& cd ..
61+
RUN git clone https://github.com/gymreklab/TRTools && \
62+
cd TRTools && \
63+
git checkout tags/v4.0.0 -b v4.0.0-branch && \
64+
pip install -r requirements.txt && \
65+
python3 setup.py install && \
66+
cd ..
5767

58-
ENV EH_VERSION=v4.0.2
59-
RUN wget https://github.com/Illumina/ExpansionHunter/releases/download/${EH_VERSION}/ExpansionHunter-${EH_VERSION}-linux_x86_64.tar.gz \
60-
&& tar xzf ExpansionHunter-${EH_VERSION}-linux_x86_64.tar.gz \
61-
&& rm ExpansionHunter-${EH_VERSION}-linux_x86_64.tar.gz \
62-
&& mv /ExpansionHunter-${EH_VERSION}-linux_x86_64 /ExpansionHunter
68+
# Install ExpansionHunter
69+
RUN wget https://github.com/Illumina/ExpansionHunter/releases/download/${EH_VERSION}/ExpansionHunter-${EH_VERSION}-linux_x86_64.tar.gz && \
70+
tar xzf ExpansionHunter-${EH_VERSION}-linux_x86_64.tar.gz && \
71+
rm ExpansionHunter-${EH_VERSION}-linux_x86_64.tar.gz && \
72+
mv /ExpansionHunter-${EH_VERSION}-linux_x86_64 /ExpansionHunter
6373
ENV PATH="/ExpansionHunter/bin/:$PATH"
74+
75+
# Install GangSTR
76+
RUN conda install -c bioconda -c conda-forge gangstr
77+
78+
# Clean up
79+
RUN rm -rf /tmp/* \
80+
/var/tmp/* \
81+
/var/cache/apt/* \
82+
/var/lib/apt/lists/* \

wdl/GangSTR.wdl

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
## Workflow to run GangSTR (https://github.com/gymreklab/GangSTR), a tool
2+
## for computing a genome-wide profile of short tandem repeats (STRs) from
3+
## short reads.
4+
5+
version 1.0
6+
7+
import "Structs.wdl"
8+
9+
workflow GangSTR {

  input {
    File bam_or_cram
    File? bam_or_cram_index
    File reference_fasta
    File? reference_fasta_index
    File target_tr_loci_regions_bed
    String? output_prefix
    String str_docker
    RuntimeAttr? runtime_attr
  }

  parameter_meta {
    bam_or_cram: "Set the path to a sorted and indexed bam or cram file generated by an indel-sensitive aligner to be used as input for GangSTR."
    bam_or_cram_index: "[Optional] Set the path to the index file of the `bam_or_cram` input."
    reference_fasta: "Sets the path to the reference in fasta format."
    reference_fasta_index: "[Optional] Sets the path to the index of reference file."
    target_tr_loci_regions_bed: "Reference set of regions to genotype represented in bed-like format; see GangSTR documentation for the file structure at: https://github.com/gymreklab/GangSTR#tr-regions---regions"
    output_prefix: "[Optional] Set a string to be used as a prefix to the output files. Defaults to the bam_or_cram filename."
    str_docker: "Sets the STR docker image."
    runtime_attr: "[Optional] Override the default runtime attributes for the GangSTR workflow."
  }

  # True when the input file name ends with ".bam". Every other file name is
  # handled as cram by the declarations below.
  Boolean is_bam =
    basename(bam_or_cram, ".bam") + ".bam" == basename(bam_or_cram)

  # Extension that matches the detected input type; it is stripped from the
  # file name to build the default output prefix.
  String alignment_extension = if is_bam then ".bam" else ".cram"

  # Use the caller-supplied index when given; otherwise derive its path by
  # appending ".bai" (bam) or ".crai" (cram) to the full input path.
  File bam_or_cram_index_ = select_first([
    bam_or_cram_index,
    bam_or_cram + (if is_bam then ".bai" else ".crai")])

  # Use the caller-supplied index when given; otherwise "<reference>.fai".
  File reference_fasta_index_ = select_first([
    reference_fasta_index, reference_fasta + ".fai"])

  # Use the caller-supplied prefix when given; otherwise the input file name
  # without its directory and without its .bam/.cram extension.
  String output_prefix_ = select_first([
    output_prefix,
    basename(bam_or_cram, alignment_extension)])

  call CallGangSTR {
    input:
      bam_or_cram = bam_or_cram,
      bam_or_cram_index = bam_or_cram_index_,
      reference_fasta = reference_fasta,
      reference_fasta_index = reference_fasta_index_,
      target_tr_loci_regions_bed = target_tr_loci_regions_bed,
      output_prefix = output_prefix_,
      str_docker = str_docker,
      runtime_attr_override = runtime_attr
  }

  output {
    File output_vcf = CallGangSTR.output_vcf
    File sample_stats = CallGangSTR.sample_stats
    File insdata = CallGangSTR.insdata
  }
}
72+
73+
# Runs GangSTR on a single bam/cram file and collects the three files it
# writes under `output_prefix`: the VCF, the per-sample stats table and the
# insert-size data table.
task CallGangSTR {
  input {
    File bam_or_cram
    # The two index files are not passed on the command line. They are
    # declared as inputs so that they are localized with the data files.
    # NOTE(review): presumably GangSTR locates them next to the bam/cram and
    # the reference by file-name convention; confirm for the backend in use.
    File bam_or_cram_index
    File reference_fasta
    File reference_fasta_index
    File target_tr_loci_regions_bed
    String output_prefix
    String str_docker
    RuntimeAttr? runtime_attr_override
  }

  output {
    File output_vcf = "${output_prefix}.vcf"
    File sample_stats = "${output_prefix}.samplestats.tab"
    File insdata = "${output_prefix}.insdata.tab"
  }

  command <<<
    set -euxo pipefail

    # Arguments are quoted so that paths or prefixes containing spaces or
    # glob characters reach GangSTR as single, unmodified arguments.
    GangSTR \
      --bam "~{bam_or_cram}" \
      --ref "~{reference_fasta}" \
      --regions "~{target_tr_loci_regions_bed}" \
      --out "~{output_prefix}"
  >>>

  # Default resources. The disk request is a fixed 10 GiB of headroom plus
  # the rounded-up total size of every localized input file.
  RuntimeAttr runtime_attr_str_profile_default = object {
    cpu_cores: 1,
    mem_gb: 4,
    boot_disk_gb: 10,
    preemptible_tries: 3,
    max_retries: 1,
    disk_gb: 10 + ceil(size([
      bam_or_cram,
      bam_or_cram_index,
      reference_fasta,
      reference_fasta_index,
      target_tr_loci_regions_bed], "GiB"))
  }
  RuntimeAttr runtime_attr = select_first([
    runtime_attr_override,
    runtime_attr_str_profile_default])

  # Each attribute falls back to its default individually, so an override
  # that sets only some fields does not leave the others undefined.
  # NOTE(review): assumes the RuntimeAttr fields are declared optional in
  # Structs.wdl (not visible here); confirm.
  runtime {
    docker: str_docker
    cpu: select_first([
      runtime_attr.cpu_cores,
      runtime_attr_str_profile_default.cpu_cores])
    memory: select_first([
      runtime_attr.mem_gb,
      runtime_attr_str_profile_default.mem_gb]) + " GiB"
    disks: "local-disk " + select_first([
      runtime_attr.disk_gb,
      runtime_attr_str_profile_default.disk_gb]) + " HDD"
    bootDiskSizeGb: select_first([
      runtime_attr.boot_disk_gb,
      runtime_attr_str_profile_default.boot_disk_gb])
    preemptible: select_first([
      runtime_attr.preemptible_tries,
      runtime_attr_str_profile_default.preemptible_tries])
    maxRetries: select_first([
      runtime_attr.max_retries,
      runtime_attr_str_profile_default.max_retries])
  }
}

wdl/GangSTRScatter.wdl

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
version 1.0
2+
3+
import "Structs.wdl"
4+
import "GangSTR.wdl" as GangSTR
5+
6+
# Runs the GangSTR workflow once per input bam/cram file and gathers the
# per-file outputs into arrays, in the same order as `bams_or_crams`.
workflow GangSTRScatter {

  input {
    Array[File] bams_or_crams
    Array[File]? bams_or_crams_indexes
    File reference_fasta
    File? reference_fasta_index
    File target_tr_loci_regions_bed
    String str_docker
    RuntimeAttr? runtime_attr
  }

  parameter_meta {
    bams_or_crams: "Sets the paths to the sorted and indexed bam or cram files to genotype; GangSTR is run once per file."
    bams_or_crams_indexes: "[Optional] Sets the paths to the index files of `bams_or_crams`, given in the same order. Defaults to each input path with .bai (bam) or .crai (cram) appended."
    reference_fasta: "Sets the path to the reference in fasta format."
    reference_fasta_index: "[Optional] Sets the path to the index of the reference file. Defaults to the reference path with .fai appended."
    target_tr_loci_regions_bed: "Reference set of regions to genotype represented in bed-like format; see GangSTR documentation for the file structure at: https://github.com/gymreklab/GangSTR#tr-regions---regions"
    str_docker: "Sets the STR docker image."
    runtime_attr: "[Optional] Override the default runtime attributes for every GangSTR call."
  }

  # The reference index is the same for every shard, so it is resolved once
  # here instead of inside the scatter.
  File reference_fasta_index_ = select_first([
    reference_fasta_index, reference_fasta + ".fai"])

  scatter (i in range(length(bams_or_crams))) {
    File bam_or_cram_ = bams_or_crams[i]

    # True when this file name ends with ".bam"; anything else is handled
    # as cram when deriving the index path.
    Boolean is_bam =
      basename(bam_or_cram_, ".bam") + ".bam" == basename(bam_or_cram_)

    # Take the i-th supplied index when an index array was given; otherwise
    # derive the path from the input path. The supplied array is indexed
    # positionally and its length is not checked against `bams_or_crams`.
    File bam_or_cram_index_ =
      if defined(bams_or_crams_indexes) then
        select_first([bams_or_crams_indexes])[i]
      else
        bam_or_cram_ + (if is_bam then ".bai" else ".crai")

    call GangSTR.GangSTR as gangSTR {
      input:
        bam_or_cram=bam_or_cram_,
        bam_or_cram_index=bam_or_cram_index_,
        reference_fasta=reference_fasta,
        reference_fasta_index=reference_fasta_index_,
        target_tr_loci_regions_bed=target_tr_loci_regions_bed,
        str_docker=str_docker,
        runtime_attr=runtime_attr
    }
  }

  output {
    Array[File] output_vcfs = gangSTR.output_vcf
    Array[File] samples_stats = gangSTR.sample_stats
    Array[File] insdatas = gangSTR.insdata
  }
}

0 commit comments

Comments
 (0)