Skip to content

Commit 7046e51

Browse files
committed
New subworkflow to align PacBio uBAM
1 parent 0d61935 commit 7046e51

1 file changed

Lines changed: 105 additions & 0 deletions

File tree

Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
version 1.0
2+
3+
import "../Utility/Utils.wdl"
4+
import "../Utility/GeneralUtils.wdl"
5+
import "../Utility/PBUtils.wdl" as PB
6+
import "../Utility/BAMutils.wdl" as BU
7+
8+
workflow AlignHiFiUBAM {
9+
meta {
10+
desciption:
11+
"Align an CCS/HiFi unaligned BAM from a single readgroup."
12+
}
13+
14+
parameter_meta {
15+
application: "Application of the data; accepted values are: [CCS, HIFI, ISOSEQ, MASSEQ]"
16+
library: "If provided, the output aligned BAM will use this value for the LB field in its readgroup header line (@RG)"
17+
drop_per_base_N_pulse_tags: "If false, the per-base and pulse tags will not be stripped away from the input uBAM, if they are available (please think through the storage implications)"
18+
}
19+
20+
input {
21+
File uBAM
22+
File? uPBI
23+
String bam_sample_name
24+
String? library
25+
26+
String application
27+
28+
Boolean drop_per_base_N_pulse_tags = true
29+
File ref_map_file
30+
}
31+
32+
output {
33+
File aligned_bam = aBAM
34+
File aligned_bai = aBAI
35+
File aligned_pbi = IndexAlignedReads.pbi
36+
37+
String movie = movie_name
38+
}
39+
40+
# todo: verify if this is still necessary
41+
Map[String, String] map_presets = {
42+
# 'CLR': 'SUBREAD', # don't support any more
43+
'CCS': 'CCS',
44+
'HIFI': 'HIFI',
45+
'ISOSEQ': 'ISOSEQ',
46+
'MASSEQ': 'SUBREAD',
47+
}
48+
49+
Map[String, String] ref_map = read_map(ref_map_file)
50+
51+
call BU.GetReadGroupInfo as RG {input: bam = uBAM, keys = ['SM', 'LB', 'PU']}
52+
String LB = select_first([library, RG.read_group_info['LB']])
53+
String movie_name = RG.read_group_info['PU']
54+
55+
###################################################################################
56+
Int shard_threshold = 50
57+
58+
if (ceil(size(uBAM, "GiB")) > shard_threshold) {# shard & align
59+
60+
if (! defined(uPBI) ) {
61+
call PB.PBIndex as PBIndex {input: bam = uBAM}
62+
}
63+
64+
Int num_shards = ceil(size(uBAM, "GiB") / shard_threshold )
65+
call Utils.ComputeAllowedLocalSSD as Guess {input: intended_gb = 3*ceil(size(uBAM, "GiB") + size(uPBI, "GiB"))}
66+
call PB.ShardLongReads {
67+
input:
68+
unaligned_bam = uBAM, unaligned_pbi = select_first([uPBI, PBIndex.pbi]),
69+
num_shards = num_shards,
70+
num_ssds = Guess.numb_of_local_ssd
71+
}
72+
73+
scatter (unaligned_bam in ShardLongReads.unmapped_shards) {
74+
# sometimes we see the sharded bams mising EOF marker, which then fails record counts, use this as a checkpoint
75+
call Utils.CountBamRecords as ValidateShard {input: bam = unaligned_bam}
76+
77+
call PB.Align as AlignReads {
78+
input:
79+
bam = unaligned_bam,
80+
sample_name = bam_sample_name,
81+
library = LB,
82+
ref_fasta = ref_map['fasta'],
83+
map_preset = map_presets['CCS'],
84+
drop_per_base_N_pulse_tags = drop_per_base_N_pulse_tags
85+
}
86+
}
87+
88+
call Utils.MergeBams as MergeAlignedReads { input: bams = AlignReads.aligned_bam, prefix = basename(uBAM, ".bam") }
89+
}
90+
if (! (ceil(size(uBAM, "GiB")) > shard_threshold)) {
91+
call PB.Align as AlignReadsTogether {
92+
input:
93+
bam = uBAM,
94+
sample_name = bam_sample_name,
95+
library = LB,
96+
ref_fasta = ref_map['fasta'],
97+
map_preset = map_presets['CCS'],
98+
drop_per_base_N_pulse_tags = drop_per_base_N_pulse_tags
99+
}
100+
}
101+
102+
File aBAM = select_first([MergeAlignedReads.merged_bam, AlignReadsTogether.aligned_bam])
103+
File aBAI = select_first([MergeAlignedReads.merged_bai, AlignReadsTogether.aligned_bai])
104+
call PB.PBIndex as IndexAlignedReads { input: bam = aBAM }
105+
}

0 commit comments

Comments
 (0)