Add a molecule-count safeguard because the GCC BAM processing procedure is unclear

SHuang-Broad · SHuang-Broad · commit e1a0fd642ced · 2026-01-23T11:21:36.000-05:00
diff --git a/wdl/pipelines/ONT/Preprocessing/DeduplicateAndResetONTAlignedBam.wdl b/wdl/pipelines/ONT/Preprocessing/DeduplicateAndResetONTAlignedBam.wdl
@@ -7,7 +7,7 @@ import "../../../tasks/Utility/ONTBamShardResetAndDeduplicate.wdl" as CleanOneSh
 
 workflow DeduplicateAndResetONTAlignedBam {
     meta {
-        desciption: "Removes duplicate records from an aligned ONT bam, while resetting the alignment information."
+        description: "Removes duplicate records from an aligned ONT bam, while resetting the alignment information."
     }
 
     parameter_meta {
@@ -22,6 +22,8 @@ workflow DeduplicateAndResetONTAlignedBam {
 
     output {
         File result = Merge.res
+        Array[Int] deduplicated_molecule_counts_per_shard = flatten([CountDedupMoleculesPerShard.number,
+                                                                     [CountDedupMoleculesUnmapped.number]])
     }
 
     # step 1, split input bam by the T2T size-balanced scheme, then for each (don't forget unmapped) shard
@@ -30,8 +32,10 @@ workflow DeduplicateAndResetONTAlignedBam {
     # for each shard, step 2
     scatter (shard_bam in ShardAlignedBam.split_bams) {
         call CleanOneShard.Work as DeShard { input: shard_bam = shard_bam }
+        call BU.CountMolecules as CountDedupMoleculesPerShard { input: bam = DeShard.clean_bam, localize_bam = true }
     }
     call CleanOneShard.Work as DeUmap { input: shard_bam = ShardAlignedBam.unmapped_reads }
+    call BU.CountMolecules as CountDedupMoleculesUnmapped { input: bam = DeUmap.clean_bam, localize_bam = true }
 
     # step 3 gather
     Array[File] fixed_shards = flatten([[DeUmap.clean_bam], DeShard.clean_bam])
diff --git a/wdl/pipelines/ONT/Preprocessing/RemoveDuplicateFromMergedONTBamAndSplitByReadgroup.wdl b/wdl/pipelines/ONT/Preprocessing/RemoveDuplicateFromMergedONTBamAndSplitByReadgroup.wdl
@@ -7,11 +7,12 @@ import "../../TechAgnostic/Utility/SplitBamByReadgroup.wdl" as Major
 import "../../../tasks/Utility/Utils.wdl"
 import "../../../tasks/Utility/BAMutils.wdl" as BU
 import "../../../tasks/Utility/ONTUtils.wdl" as OU
+import "../../../tasks/Utility/GeneralUtils.wdl" as GU
 
 workflow RemoveDuplicateFromMergedONTBamAndSplitByReadgroup {
 
     meta {
-        desciption: "Remove duplicate records from an ONT alinged BAM, drop alignment information, and split by the bam by read groups."
+        description: "Remove duplicate records from an ONT aligned BAM, drop alignment information, and split the bam by read groups."
     }
     parameter_meta {
         fix_bam_header: "Sometimes, the bam given to us contains a specific error mode. We fix it here."
@@ -36,6 +37,10 @@ workflow RemoveDuplicateFromMergedONTBamAndSplitByReadgroup {
         Map[String, String]? rgid_2_ubam_emptyness = WORKHORSE.rgid_2_ubam_emptyness
         Boolean rgid_2_bam_are_aligned = WORKHORSE.rgid_2_bam_are_aligned
 
+        Map[String, Int] mol_counts = {'Original':     CountGCCMolecules.number,
+                                       'Deduplicated': CountDedupMolecules.sum,
+                                       'SplitSum':     CountFinalMolecules.sum}
+
         String last_processing_date = WORKHORSE.last_processing_date
     }
 
@@ -47,6 +52,7 @@ workflow RemoveDuplicateFromMergedONTBamAndSplitByReadgroup {
     if ('coordinate' != GatherBamMetadata.sort_order) {
         call Utils.StopWorkflow { input: reason = "Input bam isn't coordinate-sorted, but rather sorted by ~{GatherBamMetadata.sort_order}"  }
     }
+    call BU.CountMolecules as CountGCCMolecules { input: bam = input_bam, localize_bam = true }
 
     # reality of life--submitted files sometimes need fixings in their headers
     if (fix_bam_header) {
@@ -56,6 +62,7 @@ workflow RemoveDuplicateFromMergedONTBamAndSplitByReadgroup {
     call FixAndReset.DeduplicateAndResetONTAlignedBam as Dedup { input:
         aligned_bam = select_first([FixParticularBamHeaderIssue.fixed, input_bam]), aligned_bai = input_bai, scatter_scheme = scatter_scheme
     }
+    call GU.AddIntegers as CountDedupMolecules { input: integers = Dedup.deduplicated_molecule_counts_per_shard }
 
     File ok_input_bam = Dedup.result
 
@@ -73,7 +80,9 @@ workflow RemoveDuplicateFromMergedONTBamAndSplitByReadgroup {
             gcs_out_root_dir = gcs_out_root_dir,
             debug_mode = false
     }
-
+    call GU.CoerceMapToArrayOfPairs { input: input_map = WORKHORSE.rgid_2_bam }
+    call GU.Unzip { input: apss = CoerceMapToArrayOfPairs.output_pairs }
+    call GU.AddIntegers as CountFinalMolecules{ input: integers = Unzip.res.right }
     call OU.GetBasecallModel { input: bam = ok_input_bam }
 }
 
diff --git a/wdl/pipelines/PacBio/Utility/SplitMergedPacBioBamByReadgroup.wdl b/wdl/pipelines/PacBio/Utility/SplitMergedPacBioBamByReadgroup.wdl
@@ -5,6 +5,7 @@ import "../../TechAgnostic/Utility/SplitBamByReadgroup.wdl" as Major
 import "../../../tasks/Utility/Utils.wdl"
 import "../../../tasks/Utility/BAMutils.wdl" as BU
 import "../../../tasks/Utility/PBUtils.wdl"
+import "../../../tasks/Utility/GeneralUtils.wdl" as GU
 
 workflow SplitMergedPacBioBamByReadgroup {
     meta {
@@ -29,6 +30,9 @@ workflow SplitMergedPacBioBamByReadgroup {
         Boolean rgid_2_bam_are_aligned = WORKHORSE.rgid_2_bam_are_aligned
         Map[String, String]? rgid_2_fastq = WORKHORSE.rgid_2_fastq
 
+        Map[String, Int] mol_counts = {'Original':     CountGCCMolecules.number,
+                                       'SplitSum':     CountFinalMolecules.sum}
+
         String last_processing_date = WORKHORSE.last_processing_date
     }
 
@@ -49,6 +53,7 @@ workflow SplitMergedPacBioBamByReadgroup {
     if ('coordinate' != GatherBamMetadata.sort_order) {
         call Utils.StopWorkflow { input: reason = "Input bam isn't coordinate-sorted, but rather sorted by ~{GatherBamMetadata.sort_order}"  }
     }
+    call BU.CountMolecules as CountGCCMolecules { input: bam = input_bam, localize_bam = true }
 
     # this guarantees that there are no read groups missing primrose runs
     if (!disable_primrose_check) {
@@ -74,4 +79,7 @@ workflow SplitMergedPacBioBamByReadgroup {
             override_workflow_name = workflow_name,
             debug_mode = false
     }
+    call GU.CoerceMapToArrayOfPairs { input: input_map = WORKHORSE.rgid_2_bam }
+    call GU.Unzip { input: apss = CoerceMapToArrayOfPairs.output_pairs }
+    call GU.AddIntegers as CountFinalMolecules{ input: integers = Unzip.res.right }
 }