Skip to content

Commit e701891

Browse files
committed
Add run_whamg.sh
1 parent eb152b4 commit e701891

File tree

1 file changed

+67
-0
lines changed

1 file changed

+67
-0
lines changed

src/bash_workflows/run_whamg.sh

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
#!/bin/bash
2+
3+
set -Eeuo pipefail
4+
5+
cram_file=$1
6+
cram_index=$2
7+
sample_id=$3
8+
reference_fasta=$4
9+
reference_index=$5
10+
include_bed_file=$6
11+
primary_contigs_list=$7
12+
cpu_cores=${8:-4}
13+
14+
15+
chr_list=$(paste -s -d ',' "${primary_contigs_list}")
16+
17+
# print some info that may be useful for debugging
18+
df -h
19+
echo "whamg $(whamg 2>&1 | grep Version)"
20+
21+
# necessary for getting permission to read from google bucket directly
22+
export GCS_OAUTH_TOKEN=`gcloud auth application-default print-access-token`
23+
24+
echo "Converting cram to bam ..."
25+
# covert cram to bam
26+
samtools view -b1@ ${cpu_cores} -T "${reference_fasta}" "${cram_file}" > sample.bam
27+
echo "Finished converting cram to bam."
28+
29+
# index bam file
30+
samtools index -@ ${cpu_cores} sample.bam
31+
32+
# ensure that index files are present in appropriate locations
33+
ln -s sample.bam.bai sample.bai
34+
if [ ! -e "${reference_fasta}.fai" ]; then
35+
ln -s "${reference_index}" "${reference_fasta}.fai"
36+
fi
37+
38+
# run whamg on all specified intervals
39+
mkdir tmpVcfs
40+
cd tmpVcfs
41+
awk 'BEGIN{FS=OFS="\t"}{printf("%07d\t%s\n",NR,$1":"$2"-"$3)}' ${include_bed_file} |\
42+
while read -r line interval; do
43+
vcfFile="$line.wham.vcf.gz"
44+
whamg \
45+
-c ${chr_list}" \
46+
-x ${cpu_cores} \
47+
-a "${reference_fasta}" \
48+
-f ../sample.bam \
49+
-r $interval \
50+
| bgzip -c > $vcfFile
51+
bcftools index -t $vcfFile
52+
done
53+
54+
# We need to update both the VCF sample ID and the TAGS INFO field in the WHAM output VCFs.
55+
# WHAM uses both to store the sample identifier, and by default uses the SM identifier from the BAM file.
56+
# We need to update both to reflect the potentially-renamed sample identifier used by the pipeline (sample_id) --
57+
# svtk standardize_vcf uses the TAGS field to identify the sample for WHAM VCFs.
58+
59+
ls -1 *.wham.vcf.gz > vcf.list
60+
bcftools concat -a -Ov -f vcf.list | \
61+
sed -e 's/^#CHROM\t.*/#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t${sample_id}/' -e 's/;TAGS=[^;]*;/;TAGS=${sample_id};/' | \
62+
bgzip -c > "../${sample_id}.wham.vcf.gz"
63+
cd ..
64+
bcftools index -t "${sample_id}.wham.vcf.gz"
65+
66+
df -h
67+
ls -l

0 commit comments

Comments
 (0)