Skip to content

Commit e1a0fd6

Browse files
committed
add in a molecule count safeguard because GCC BAM processing process is unclear
1 parent ca856c2 commit e1a0fd6

File tree

3 files changed

+24
-3
lines changed

3 files changed

+24
-3
lines changed

wdl/pipelines/ONT/Preprocessing/DeduplicateAndResetONTAlignedBam.wdl

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ import "../../../tasks/Utility/ONTBamShardResetAndDeduplicate.wdl" as CleanOneSh
77

88
workflow DeduplicateAndResetONTAlignedBam {
99
meta {
10-
desciption: "Removes duplicate records from an aligned ONT bam, while resetting the alignment information."
10+
description: "Removes duplicate records from an aligned ONT bam, while resetting the alignment information."
1111
}
1212

1313
parameter_meta {
@@ -22,6 +22,8 @@ workflow DeduplicateAndResetONTAlignedBam {
2222

2323
output {
2424
File result = Merge.res
25+
Array[Int] deduplicated_molecule_counts_per_shard = flatten([CountDedupMoleculesPerShard.number,
26+
[CountDedupMoleculesUnmapped.number]])
2527
}
2628

2729
# step 1, split input bam by the T2T size-balanced scheme, then for each (don't forget unmapped) shard
@@ -30,8 +32,10 @@ workflow DeduplicateAndResetONTAlignedBam {
3032
# for each shard, step 2
3133
scatter (shard_bam in ShardAlignedBam.split_bams) {
3234
call CleanOneShard.Work as DeShard { input: shard_bam = shard_bam }
35+
call BU.CountMolecules as CountDedupMoleculesPerShard { input: bam = DeShard.clean_bam, localize_bam = true }
3336
}
3437
call CleanOneShard.Work as DeUmap { input: shard_bam = ShardAlignedBam.unmapped_reads }
38+
call BU.CountMolecules as CountDedupMoleculesUnmapped { input: bam = DeUmap.clean_bam, localize_bam = true }
3539
3640
# step 3 gather
3741
Array[File] fixed_shards = flatten([[DeUmap.clean_bam], DeShard.clean_bam])

wdl/pipelines/ONT/Preprocessing/RemoveDuplicateFromMergedONTBamAndSplitByReadgroup.wdl

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,12 @@ import "../../TechAgnostic/Utility/SplitBamByReadgroup.wdl" as Major
77
import "../../../tasks/Utility/Utils.wdl"
88
import "../../../tasks/Utility/BAMutils.wdl" as BU
99
import "../../../tasks/Utility/ONTUtils.wdl" as OU
10+
import "../../../tasks/Utility/GeneralUtils.wdl" as GU
1011

1112
workflow RemoveDuplicateFromMergedONTBamAndSplitByReadgroup {
1213

1314
meta {
14-
desciption: "Remove duplicate records from an ONT alinged BAM, drop alignment information, and split by the bam by read groups."
15+
description: "Remove duplicate records from an ONT alinged BAM, drop alignment information, and split by the bam by read groups."
1516
}
1617
parameter_meta {
1718
fix_bam_header: "Sometimes, the bam given to us contains a specific error mode. We fix it here."
@@ -36,6 +37,10 @@ workflow RemoveDuplicateFromMergedONTBamAndSplitByReadgroup {
3637
Map[String, String]? rgid_2_ubam_emptyness = WORKHORSE.rgid_2_ubam_emptyness
3738
Boolean rgid_2_bam_are_aligned = WORKHORSE.rgid_2_bam_are_aligned
3839

40+
Map[String, Int] mol_counts = {'Original': CountGCCMolecules.number,
41+
'Deduplicated': CountDedupMolecules.sum,
42+
'SplitSum': CountFinalMolecules.sum}
43+
3944
String last_processing_date = WORKHORSE.last_processing_date
4045
}
4146

@@ -47,6 +52,7 @@ workflow RemoveDuplicateFromMergedONTBamAndSplitByReadgroup {
4752
if ('coordinate' != GatherBamMetadata.sort_order) {
4853
call Utils.StopWorkflow { input: reason = "Input bam isn't coordinate-sorted, but rather sorted by ~{GatherBamMetadata.sort_order}" }
4954
}
55+
call BU.CountMolecules as CountGCCMolecules { input: bam = input_bam, localize_bam = true }
5056
5157
# reality of life--submitted files sometimes need fixings in their headers
5258
if (fix_bam_header) {
@@ -56,6 +62,7 @@ workflow RemoveDuplicateFromMergedONTBamAndSplitByReadgroup {
5662
call FixAndReset.DeduplicateAndResetONTAlignedBam as Dedup { input:
5763
aligned_bam = select_first([FixParticularBamHeaderIssue.fixed, input_bam]), aligned_bai = input_bai, scatter_scheme = scatter_scheme
5864
}
65+
call GU.AddIntegers as CountDedupMolecules { input: integers = Dedup.deduplicated_molecule_counts_per_shard }
5966
6067
File ok_input_bam = Dedup.result
6168
@@ -73,7 +80,9 @@ workflow RemoveDuplicateFromMergedONTBamAndSplitByReadgroup {
7380
gcs_out_root_dir = gcs_out_root_dir,
7481
debug_mode = false
7582
}
76-
83+
call GU.CoerceMapToArrayOfPairs { input: input_map = WORKHORSE.rgid_2_bam }
84+
call GU.Unzip { input: apss = CoerceMapToArrayOfPairs.output_pairs }
85+
call GU.AddIntegers as CountFinalMolecules{ input: integers = Unzip.res.right }
7786
call OU.GetBasecallModel { input: bam = ok_input_bam }
7887
}
7988

wdl/pipelines/PacBio/Utility/SplitMergedPacBioBamByReadgroup.wdl

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import "../../TechAgnostic/Utility/SplitBamByReadgroup.wdl" as Major
55
import "../../../tasks/Utility/Utils.wdl"
66
import "../../../tasks/Utility/BAMutils.wdl" as BU
77
import "../../../tasks/Utility/PBUtils.wdl"
8+
import "../../../tasks/Utility/GeneralUtils.wdl" as GU
89

910
workflow SplitMergedPacBioBamByReadgroup {
1011
meta {
@@ -29,6 +30,9 @@ workflow SplitMergedPacBioBamByReadgroup {
2930
Boolean rgid_2_bam_are_aligned = WORKHORSE.rgid_2_bam_are_aligned
3031
Map[String, String]? rgid_2_fastq = WORKHORSE.rgid_2_fastq
3132

33+
Map[String, Int] mol_counts = {'Original': CountGCCMolecules.number,
34+
'SplitSum': CountFinalMolecules.sum}
35+
3236
String last_processing_date = WORKHORSE.last_processing_date
3337
}
3438

@@ -49,6 +53,7 @@ workflow SplitMergedPacBioBamByReadgroup {
4953
if ('coordinate' != GatherBamMetadata.sort_order) {
5054
call Utils.StopWorkflow { input: reason = "Input bam isn't coordinate-sorted, but rather sorted by ~{GatherBamMetadata.sort_order}" }
5155
}
56+
call BU.CountMolecules as CountGCCMolecules { input: bam = input_bam, localize_bam = true }
5257
5358
# this guarantees that there are no read groups missing primrose runs
5459
if (!disable_primrose_check) {
@@ -74,4 +79,7 @@ workflow SplitMergedPacBioBamByReadgroup {
7479
override_workflow_name = workflow_name,
7580
debug_mode = false
7681
}
82+
call GU.CoerceMapToArrayOfPairs { input: input_map = WORKHORSE.rgid_2_bam }
83+
call GU.Unzip { input: apss = CoerceMapToArrayOfPairs.output_pairs }
84+
call GU.AddIntegers as CountFinalMolecules{ input: integers = Unzip.res.right }
7785
}

0 commit comments

Comments
 (0)