Skip to content

Commit 5413794

Browse files
committed
Updates to genomicssb joint calling.
- Renamed to `SRJointGenotypingPopulationScale` - Fixed inputs to include genomicsdb map file for contig -> instance maps. - Renamed published workflow to reflect new name.
1 parent 03c9c6c commit 5413794

File tree

3 files changed

+45
-9
lines changed

3 files changed

+45
-9
lines changed

.dockstore.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,6 @@ workflows:
186186
- name: SRFlowcell_Simplified
187187
subclass: wdl
188188
primaryDescriptorPath: /wdl/pipelines/ILMN/Alignment/SRFlowcell_Simplified.wdl
189-
- name: SRJointCallGVCFsWithGenomicsDB_simplified
189+
- name: SRJointCallGVCFsWithGenomicsDBPopulationScale
190190
subclass: wdl
191-
primaryDescriptorPath: /wdl/pipelines/TechAgnostic/VariantCalling/SRJointCallGVCFsWithGenomicsDB_simplified.wdl
191+
primaryDescriptorPath: /wdl/pipelines/TechAgnostic/VariantCalling/SRJointCallGVCFsWithGenomicsDBPopulationScale.wdl

wdl/pipelines/TechAgnostic/VariantCalling/SRJointCallGVCFsWithGenomicsDB_simplified.wdl renamed to wdl/pipelines/TechAgnostic/VariantCalling/SRJointCallGVCFsWithGenomicsDBPopulationScale.wdl

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,23 @@
11
version 1.0
22

3-
import "../../../tasks/VariantCalling/SRJointGenotyping_simplified.wdl" as SRJOINT
3+
import "../../../tasks/VariantCalling/SRJointGenotypingPopulationScale.wdl" as SRJOINT
44
import "../../../tasks/Utility/VariantUtils.wdl" as VARUTIL
55
import "../../../tasks/Utility/Utils.wdl" as UTILS
66
import "../../../tasks/TertiaryAnalysis/FunctionalAnnotation.wdl" as FUNK
77
import "../../../tasks/Utility/SGKit.wdl" as SGKit
88
import "../../../tasks/Utility/Finalize.wdl" as FF
99

10-
workflow SRJointCallGVCFsWithGenomicsDB_simplified {
10+
workflow SRJointCallGVCFsWithGenomicsDBPopulationScale {
1111

1212
meta {
1313
author: "Jonn Smith"
14-
description: "A workflow that performs joint calling on single-sample gVCFs from GATK4 HaplotypeCaller using GenomicsDB."
14+
description: "A workflow that performs joint calling on single-sample gVCFs from GATK4 HaplotypeCaller using GenomicsDB. This Workflow relies on previously constructed genomicsDB instances to provide population-scale context for joint calling. NOTE: Currently assumes the interval list consists of only whole contigs."
1515
}
1616
parameter_meta {
1717
gvcfs: "Array of GVCF files to use as inputs for joint calling."
1818
gvcf_indices: "Array of gvcf index files for `gvcfs`. Order should correspond to that in `gvcfs`."
19-
ref_map_file: "Reference map file indicating reference sequence and auxillary file locations"
19+
ref_map_file: "Reference map file indicating reference sequence and auxillary file locations"
20+
genomicsdb_tar_contig_map_file: "File containing a map of contigs to GenomicsDB tar files. This file is used to determine which GenomicsDB tar file to use for each contig."
2021

2122
heterozygosity: "Joint Genotyping Parameter - Heterozygosity value used to compute prior likelihoods for any locus. See the GATKDocs for full details on the meaning of this population genetics concept"
2223
heterozygosity_stdev: "Joint Genotyping Parameter - Standard deviation of heterozygosity for SNP and indel calling."
@@ -62,6 +63,8 @@ workflow SRJointCallGVCFsWithGenomicsDB_simplified {
6263

6364
File ref_map_file
6465

66+
File genomicsdb_tar_contig_map_file
67+
6568
Float heterozygosity = 0.001
6669
Float heterozygosity_stdev = 0.01
6770
Float indel_heterozygosity = 0.000125
@@ -97,8 +100,6 @@ workflow SRJointCallGVCFsWithGenomicsDB_simplified {
97100
File? snpeff_db
98101
String? snpeff_db_identifier
99102

100-
File? existing_genomicsdb_tar
101-
102103
File? interval_list
103104

104105
Boolean do_zarr_conversion = false
@@ -114,6 +115,7 @@ workflow SRJointCallGVCFsWithGenomicsDB_simplified {
114115
}
115116

116117
Map[String, String] ref_map = read_map(ref_map_file)
118+
Map[String, File] genomicsdb_tar_contig_map = read_map(genomicsdb_tar_contig_map_file)
117119

118120
# Resolve the db_snp_vcf file, with preference to the db_snp_vcf file if it exists:
119121
call UTILS.ResolveMapKeysInPriorityOrder as ResolveMapKeysInPriorityOrder {
@@ -153,6 +155,8 @@ workflow SRJointCallGVCFsWithGenomicsDB_simplified {
153155

154156
String interval_name = ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_1][0] + "_" + ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_1][1] + "_" + ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_1][2]
155157

158+
File existing_genomicsdb_tar = genomicsdb_tar_contig_map[interval_name]
159+
156160
# To make sure the interval names and the files themselves correspond, we need to make the
157161
# interval list file here:
158162
call UTILS.CreateIntervalListFileFromIntervalInfo as CreateIntervalListFileFromIntervalInfo {

wdl/tasks/VariantCalling/SRJointGenotyping_simplified.wdl renamed to wdl/tasks/VariantCalling/SRJointGenotypingPopulationScale.wdl

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,10 @@ task ImportGVCFs {
248248
echo "" >&2
249249
echo "--------------------------------" >&2
250250
251+
# If we have a gendb input we may need to modify the sample name map.
252+
# Here we'll make a copy of the sample name map for final call to GenomicsDBImport:
253+
cp ~{sample_name_map} SAMPLE_NAME_MAP_FINAL.tsv
254+
251255
if [[ ~{has_existing_genomicsdb_tar} == "true" ]] ; then
252256
t_start=$(date +%s)
253257
date
@@ -256,14 +260,42 @@ task ImportGVCFs {
256260
date
257261
t_end=$(date +%s)
258262
echo "Untarring existing GenomicsDB workspace: ~{existing_genomicsdb_tar} took $((t_end - t_start)) seconds"
263+
264+
GENOMICSDB_DIR=$(basename ~{existing_genomicsdb_tar} .tar)
265+
266+
# We need to check if our input data contains any samples in the genomicsdb instance.
267+
# If it does, we must print a _big_ warning message to the user about each sample and then continue.
268+
# We can continue using the samples in the genomicsdb instance for any duplicates.
269+
grep 'sample_name' "${GENOMICSDB_DIR}/callset.json" | awk '{print $NF}' | tr -d '", ' > genomicsdb_samples.txt
270+
awk '{print $1}' ~{sample_name_map} > input_samples.txt
271+
272+
# Find the intersection of the two files:
273+
comm -12 genomicsdb_samples.txt input_samples.txt > overlapping_samples.txt
274+
275+
if [[ -s overlapping_samples.txt ]] ; then
276+
wc -l overlapping_samples.txt | awk '{print $1}' > overlapping_samples_count.txt
277+
278+
echo "************************************************************"
279+
echo "Warning: Found $(cat overlapping_samples_count.txt) overlapping samples"
280+
echo "Any overlaps will be ingested from genomicsDB and not the input data."
281+
echo "************************************************************"
282+
echo "Overlapping samples:"
283+
cat overlapping_samples.txt
284+
echo "************************************************************"
285+
286+
# Translate the overlapping samples into regular expressions for grep:
287+
sed 's@^@^@' overlapping_samples.txt > overlapping_samples_regex.txt
288+
289+
grep -v -f overlapping_samples_regex.txt ~{sample_name_map} > SAMPLE_NAME_MAP_FINAL.tsv
290+
fi
259291
fi
260292
261293
gatk --java-options "-Xms8192m -Xmx${java_memory_size_mb}m" \
262294
GenomicsDBImport \
263295
~{genomicsdb_arg} \
264296
--batch-size ~{batch_size} \
265297
-L ~{interval_list} \
266-
--sample-name-map ~{sample_name_map} \
298+
--sample-name-map SAMPLE_NAME_MAP_FINAL.tsv \
267299
--reader-threads 5 \
268300
--merge-input-intervals \
269301
--consolidate

0 commit comments

Comments
 (0)