Updates to GenomicsDB joint calling.

jonn-smith · jonn-smith · commit 54137949def1 · 2026-03-06T14:05:10.000-05:00
- Renamed the tasks file to `SRJointGenotypingPopulationScale`
- Fixed inputs to include a GenomicsDB map file providing the contig ->
  instance mapping.
- Renamed published workflow to reflect new name.
diff --git a/.dockstore.yml b/.dockstore.yml
@@ -186,6 +186,6 @@ workflows:
 - name: SRFlowcell_Simplified
   subclass: wdl
   primaryDescriptorPath: /wdl/pipelines/ILMN/Alignment/SRFlowcell_Simplified.wdl
-- name: SRJointCallGVCFsWithGenomicsDB_simplified
+- name: SRJointCallGVCFsWithGenomicsDBPopulationScale
   subclass: wdl
-  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/VariantCalling/SRJointCallGVCFsWithGenomicsDB_simplified.wdl
+  primaryDescriptorPath: /wdl/pipelines/TechAgnostic/VariantCalling/SRJointCallGVCFsWithGenomicsDBPopulationScale.wdl
diff --git a/wdl/pipelines/TechAgnostic/VariantCalling/SRJointCallGVCFsWithGenomicsDBPopulationScale.wdl b/wdl/pipelines/TechAgnostic/VariantCalling/SRJointCallGVCFsWithGenomicsDBPopulationScale.wdl
@@ -1,22 +1,23 @@
 version 1.0
 
-import "../../../tasks/VariantCalling/SRJointGenotyping_simplified.wdl" as SRJOINT
+import "../../../tasks/VariantCalling/SRJointGenotypingPopulationScale.wdl" as SRJOINT
 import "../../../tasks/Utility/VariantUtils.wdl" as VARUTIL
 import "../../../tasks/Utility/Utils.wdl" as UTILS
 import "../../../tasks/TertiaryAnalysis/FunctionalAnnotation.wdl" as FUNK
 import "../../../tasks/Utility/SGKit.wdl" as SGKit
 import "../../../tasks/Utility/Finalize.wdl" as FF
 
-workflow SRJointCallGVCFsWithGenomicsDB_simplified {
+workflow SRJointCallGVCFsWithGenomicsDBPopulationScale {
 
     meta {
         author: "Jonn Smith"
-        description: "A workflow that performs joint calling on single-sample gVCFs from GATK4 HaplotypeCaller using GenomicsDB."
+        description: "A workflow that performs joint calling on single-sample gVCFs from GATK4 HaplotypeCaller using GenomicsDB.  This Workflow relies on previously constructed genomicsDB instances to provide population-scale context for joint calling.  NOTE: Currently assumes the interval list consists of only whole contigs."
     }
     parameter_meta {
         gvcfs:  "Array of GVCF files to use as inputs for joint calling."
         gvcf_indices:   "Array of gvcf index files for `gvcfs`.  Order should correspond to that in `gvcfs`."
-        ref_map_file:  "Reference map file indicating reference sequence and auxillary file locations"
+        ref_map_file:  "Reference map file indicating reference sequence and auxillary file locations" 
+        genomicsdb_tar_contig_map_file: "File containing a map of contigs to GenomicsDB tar files.  This file is used to determine which GenomicsDB tar file to use for each contig."
 
         heterozygosity: "Joint Genotyping Parameter - Heterozygosity value used to compute prior likelihoods for any locus. See the GATKDocs for full details on the meaning of this population genetics concept"
         heterozygosity_stdev: "Joint Genotyping Parameter - Standard deviation of heterozygosity for SNP and indel calling."
@@ -62,6 +63,8 @@ workflow SRJointCallGVCFsWithGenomicsDB_simplified {
 
         File ref_map_file
 
+        File genomicsdb_tar_contig_map_file
+
         Float heterozygosity = 0.001
         Float heterozygosity_stdev = 0.01
         Float indel_heterozygosity = 0.000125
@@ -97,8 +100,6 @@ workflow SRJointCallGVCFsWithGenomicsDB_simplified {
         File? snpeff_db
         String? snpeff_db_identifier
 
-        File? existing_genomicsdb_tar
-
         File? interval_list
 
         Boolean do_zarr_conversion = false
@@ -114,6 +115,7 @@ workflow SRJointCallGVCFsWithGenomicsDB_simplified {
     }
 
     Map[String, String] ref_map = read_map(ref_map_file)
+    Map[String, File] genomicsdb_tar_contig_map = read_map(genomicsdb_tar_contig_map_file)
 
     # Resolve the db_snp_vcf file, with preference to the db_snp_vcf file if it exists:
     call UTILS.ResolveMapKeysInPriorityOrder as ResolveMapKeysInPriorityOrder {
@@ -153,6 +155,8 @@ workflow SRJointCallGVCFsWithGenomicsDB_simplified {
 
         String interval_name = ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_1][0] + "_" + ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_1][1] + "_" + ExtractIntervalNamesFromIntervalOrBamFile.interval_info[idx_1][2]
 
+        File existing_genomicsdb_tar = genomicsdb_tar_contig_map[interval_name]
+
         # To make sure the interval names and the files themselves correspond, we need to make the
         # interval list file here:
         call UTILS.CreateIntervalListFileFromIntervalInfo as CreateIntervalListFileFromIntervalInfo {
diff --git a/wdl/tasks/VariantCalling/SRJointGenotypingPopulationScale.wdl b/wdl/tasks/VariantCalling/SRJointGenotypingPopulationScale.wdl
@@ -248,6 +248,10 @@ task ImportGVCFs {
         echo "" >&2
         echo "--------------------------------" >&2
 
+        # If we have a gendb input we may need to modify the sample name map.
+        # Here we'll make a copy of the sample name map for final call to GenomicsDBImport:
+        cp ~{sample_name_map} SAMPLE_NAME_MAP_FINAL.tsv
+
         if [[ ~{has_existing_genomicsdb_tar} == "true" ]] ; then
             t_start=$(date +%s)
             date
@@ -256,14 +260,42 @@ task ImportGVCFs {
             date
             t_end=$(date +%s)
             echo "Untarring existing GenomicsDB workspace: ~{existing_genomicsdb_tar} took $((t_end - t_start)) seconds"
+
+            GENOMICSDB_DIR=$(basename ~{existing_genomicsdb_tar} .tar)
+
+            # We need to check if our input data contains any samples in the genomicsdb instance.
+            # If it does, we must print a _big_ warning message to the user about each sample and then continue.
+            # We can continue using the samples in the genomicsdb instance for any duplicates.
+            grep 'sample_name' "${GENOMICSDB_DIR}/callset.json" | awk '{print $NF}' | tr -d '", ' > genomicsdb_samples.txt
+            awk '{print $1}' ~{sample_name_map} > input_samples.txt
+
+            # Find the intersection of the two files:
+            comm -12 genomicsdb_samples.txt input_samples.txt > overlapping_samples.txt
+
+            if [[ -s overlapping_samples.txt ]] ; then
+                wc -l overlapping_samples.txt | awk '{print $1}' > overlapping_samples_count.txt
+
+                echo "************************************************************"
+                echo "Warning:  Found $(cat overlapping_samples_count.txt) overlapping samples"
+                echo "Any overlaps will be ingested from genomicsDB and not the input data."
+                echo "************************************************************"
+                echo "Overlapping samples:"
+                cat overlapping_samples.txt
+                echo "************************************************************"
+
+                # Translate the overlapping samples into regular expressions for grep:
+                sed 's@^@^@' overlapping_samples.txt > overlapping_samples_regex.txt
+
+                grep -v -f overlapping_samples_regex.txt ~{sample_name_map} >  SAMPLE_NAME_MAP_FINAL.tsv
+            fi
         fi
 
         gatk --java-options "-Xms8192m -Xmx${java_memory_size_mb}m" \
             GenomicsDBImport \
                 ~{genomicsdb_arg} \
                 --batch-size ~{batch_size} \
                 -L ~{interval_list} \
-                --sample-name-map ~{sample_name_map} \
+                --sample-name-map SAMPLE_NAME_MAP_FINAL.tsv \
                 --reader-threads 5 \
                 --merge-input-intervals \
                 --consolidate