1+ version 1.0
2+
3+ import "../Utility/Utils.wdl"
4+ import "../Utility/GeneralUtils.wdl"
5+ import "../Utility/PBUtils.wdl" as PB
6+ import "../Utility/BAMutils.wdl" as BU
7+
8+ workflow AlignHiFiUBAM {
9+ meta {
10+ desciption :
11+ "Align an CCS/HiFi unaligned BAM from a single readgroup."
12+ }
13+
14+ parameter_meta {
15+ application : "Application of the data; accepted values are: [CCS, HIFI, ISOSEQ, MASSEQ]"
16+ library : "If provided, the output aligned BAM will use this value for the LB field in its readgroup header line (@RG)"
17+ drop_per_base_N_pulse_tags : "If false, the per-base and pulse tags will not be stripped away from the input uBAM, if they are available (please think through the storage implications)"
18+ }
19+
20+ input {
21+ File uBAM
22+ File ? uPBI
23+ String bam_sample_name
24+ String ? library
25+
26+ String application
27+
28+ Boolean drop_per_base_N_pulse_tags = true
29+ File ref_map_file
30+ }
31+
32+ output {
33+ File aligned_bam = aBAM
34+ File aligned_bai = aBAI
35+ File aligned_pbi = IndexAlignedReads .pbi
36+
37+ String movie = movie_name
38+ }
39+
40+ # todo: verify if this is still necessary
41+ Map [String , String ] map_presets = {
42+ # 'CLR': 'SUBREAD', # don't support any more
43+ 'CCS' : 'CCS' ,
44+ 'HIFI' : 'HIFI' ,
45+ 'ISOSEQ' : 'ISOSEQ' ,
46+ 'MASSEQ' : 'SUBREAD' ,
47+ }
48+
49+ Map [String , String ] ref_map = read_map (ref_map_file )
50+
51+ call BU .GetReadGroupInfo as RG {input : bam = uBAM , keys = ['SM' , 'LB' , 'PU' ]}
52+ String LB = select_first ([library , RG .read_group_info ['LB' ]])
53+ String movie_name = RG .read_group_info ['PU' ]
54+
55+ ###################################################################################
56+ Int shard_threshold = 50
57+
58+ if (ceil (size (uBAM , "GiB" )) > shard_threshold ) { # shard & align
59+
60+ if (! defined (uPBI ) ) {
61+ call PB .PBIndex as PBIndex {input : bam = uBAM }
62+ }
63+
64+ Int num_shards = ceil (size (uBAM , "GiB" ) / shard_threshold )
65+ call Utils .ComputeAllowedLocalSSD as Guess {input : intended_gb = 3 *ceil (size (uBAM , "GiB" ) + size (uPBI , "GiB" ))}
66+ call PB .ShardLongReads {
67+ input :
68+ unaligned_bam = uBAM , unaligned_pbi = select_first ([uPBI , PBIndex .pbi ]),
69+ num_shards = num_shards ,
70+ num_ssds = Guess .numb_of_local_ssd
71+ }
72+
73+ scatter (unaligned_bam in ShardLongReads .unmapped_shards ) {
74+ # sometimes we see the sharded bams mising EOF marker, which then fails record counts, use this as a checkpoint
75+ call Utils .CountBamRecords as ValidateShard {input : bam = unaligned_bam }
76+
77+ call PB .Align as AlignReads {
78+ input :
79+ bam = unaligned_bam ,
80+ sample_name = bam_sample_name ,
81+ library = LB ,
82+ ref_fasta = ref_map ['fasta' ],
83+ map_preset = map_presets ['CCS' ],
84+ drop_per_base_N_pulse_tags = drop_per_base_N_pulse_tags
85+ }
86+ }
87+
88+ call Utils .MergeBams as MergeAlignedReads { input : bams = AlignReads .aligned_bam , prefix = basename (uBAM , ".bam" ) }
89+ }
90+ if (! (ceil (size (uBAM , "GiB" )) > shard_threshold )) {
91+ call PB .Align as AlignReadsTogether {
92+ input :
93+ bam = uBAM ,
94+ sample_name = bam_sample_name ,
95+ library = LB ,
96+ ref_fasta = ref_map ['fasta' ],
97+ map_preset = map_presets ['CCS' ],
98+ drop_per_base_N_pulse_tags = drop_per_base_N_pulse_tags
99+ }
100+ }
101+
102+ File aBAM = select_first ([MergeAlignedReads .merged_bam , AlignReadsTogether .aligned_bam ])
103+ File aBAI = select_first ([MergeAlignedReads .merged_bai , AlignReadsTogether .aligned_bai ])
104+ call PB .PBIndex as IndexAlignedReads { input : bam = aBAM }
105+ }
0 commit comments