-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathbam.jl
More file actions
366 lines (331 loc) · 12.6 KB
/
Copy pathbam.jl
File metadata and controls
366 lines (331 loc) · 12.6 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
module BAM
import ..AuxTag, ..AbstractAuxiliary, ..Hex
import ..AUX_NUMBER_TYPES, ..try_auxtag, ..Error, ..Errors
import ..is_printable, ..ELTYPE_DICT, ..load_hex, ..iter_encodings, ..AbstractEncodedIterator
import ..is_printable_char, ..as_bam_aux_value, ..get_type_tag, ..hexencode!, ..AuxException
import ..striptype, ..validate_hex, ..is_well_formed
public Auxiliary, AuxTag, Error, Errors
using MemoryViews: MemoryViews, ImmutableMemoryView, MutableMemoryView, MemoryView
using StringViews: StringView
"""
    BAM.Auxiliary{T <: AbstractVector{UInt8}} <: AbstractDict{AuxTag, Any}

Lazily loaded `AbstractDict` representing the auxiliary data fields of a BAM
record. Immutable aux's can be constructed with `Auxiliary(x)` for any `x`
with `MemoryView(x)` defined.

Mutable aux data is constructed with `Auxiliary(x::Vector{UInt8}, start::Int)`,
where `start` gives the first index of the used data in `x` - all data before
`start` will be ignored and never modified. `start` must be in
`1:length(x) + 1`, otherwise an error is thrown.

# Examples
```jldoctest
julia> immut = BAM.Auxiliary("KJS\\7\\0ABZabc\\0");

julia> immut["KJ"]
0x0007

julia> haskey(immut, "AB")
true
```

# Extended help
Since fields of `Auxiliary` are lazily loaded, auxiliaries may contain invalid
data even after successful construction.
`Auxiliary` operates with two distinct notions of invalid data - malformedness
and invalidness. See [`is_well_formed`](@ref) for their definitions.
"""
struct Auxiliary{T} <: AbstractAuxiliary{T}
    # Backing storage: a `Vector{UInt8}` (mutable) or an
    # `ImmutableMemoryView{UInt8}` (immutable).
    x::T
    # First index of `x` that belongs to the auxiliary data.
    start::Int

    function Auxiliary(v::Vector{UInt8}, i::Integer)
        # `length(v) + 1` is allowed and denotes an empty auxiliary. Anything
        # larger must be rejected, because other methods slice
        # `x[start:end]` with bounds checks disabled.
        (1 <= i <= length(v) + 1) ||
            error("Start index must be in 1:length(vector) + 1")
        new{Vector{UInt8}}(v, Int(i)::Int)
    end

    function Auxiliary(x)
        mem = ImmutableMemoryView(x)
        eltype(mem) == UInt8 || error("Must construct Auxiliary from MemoryView{UInt8}")
        new{ImmutableMemoryView{UInt8}}(mem, 1)
    end
end
# Map any parameterization of `Auxiliary` back to the unparameterized type.
function striptype(::Type{<:Auxiliary})
    return Auxiliary
end

# An empty `Auxiliary` is a mutable one backed by a fresh, empty byte vector.
function Base.empty(::Type{Auxiliary})
    return Auxiliary(UInt8[], 1)
end

# The mutable flavour of `Auxiliary` is the one backed by a `Vector{UInt8}`.
const MutableAuxiliary = Auxiliary{Vector{UInt8}}

# View of the bytes in use, i.e. everything from `x.start` to the end of the
# backing storage.
function MemoryViews.MemoryView(x::Auxiliary)
    return @inbounds MemoryView(x.x)[(x.start):end]
end
# Remove all auxiliary fields by truncating the backing vector so that it ends
# just before `x.start`. Bytes before `x.start` are kept.
function Base.empty!(x::MutableAuxiliary)
    kept = x.start - 1
    resize!(x.x, kept)
    return x
end
# An auxiliary is empty when its start index lies past the end of its storage.
function Base.isempty(x::Auxiliary)
    return length(x.x) < x.start
end
# Iterate the auxiliary as `key => value` pairs.
#
# `state` is an index into the encoded bytes, as used by `EncodedIterator`.
# Throws `AuxException` if the encoding iterator reports an error for the next
# field. If the field is structurally fine but its value fails to load,
# `load_auxvalue` returns an `Error`, and that error is yielded as the *value*
# of the pair. This mirrors `get`, which also returns the `Error` as the value,
# and keeps every iterated element a `Pair` so destructuring such as
# `for (k, v) in aux` keeps working.
function Base.iterate(aux::Auxiliary, state::Int=1)
    it = iter_encodings(aux)
    itval = iterate(it, state)
    itval === nothing && return nothing
    (val, new_state) = itval
    val isa Error && throw(AuxException(val))
    (key, typetag, span) = val
    # `span` was produced by the encoding iterator for `it.mem`.
    value = load_auxvalue(typetag, @inbounds it.mem[span])
    return (key => value, new_state)
end
# Iterator over the raw encoded fields stored in `mem`. Each element is either
# an `Error` or a `(tag, type_tag, span)` tuple, where `span` indexes into `mem`.
struct EncodedIterator <: AbstractEncodedIterator
    mem::ImmutableMemoryView{UInt8}
end

# Build the encoding iterator over the bytes of `aux` that are in use.
function iter_encodings(aux::Auxiliary)
    bytes = ImmutableMemoryView(aux)
    return EncodedIterator(bytes)
end
# Return `true` if every field of `aux` can be located by the encoding iterator
# and its payload passes `validate_encoding`; return `false` at the first
# failure.
function Base.isvalid(aux::Auxiliary)
    encodings = iter_encodings(aux)
    for encoding in encodings
        if encoding isa Error
            return false
        end
        (_, type_tag, span) = encoding
        payload = encodings.mem[span]
        if !validate_encoding(type_tag, payload)
            return false
        end
    end
    return true
end
# Check whether `mem`, the payload of a field with type `type_tag`, holds a
# valid value for that type. Unknown type tags are reported as invalid.
function validate_encoding(type_tag::UInt8, mem::ImmutableMemoryView{UInt8})::Bool
    # A single character: exactly one byte, and it must be printable.
    if type_tag == UInt8('A')
        return length(mem) == 1 && is_printable_char(@inbounds mem[1])
    end

    # Binary numbers, or arrays of binary numbers cannot be invalid
    numeric_tags = (
        UInt8('c'), UInt8('C'), UInt8('s'), UInt8('S'),
        UInt8('I'), UInt8('i'), UInt8('f'), UInt8('B'),
    )
    type_tag in numeric_tags && return true

    # The remaining valid types are null-terminated: hex ('H') and string ('Z').
    is_hex = type_tag == UInt8('H')
    (is_hex || type_tag == UInt8('Z')) || return false

    zeropos = findnext(iszero, mem, 1)
    isnothing(zeropos) && return false
    # Only the bytes before the first null byte are checked.
    content = mem[1:zeropos-1]
    return is_hex ? validate_hex(content) : is_printable(content)
end
# Locate the next encoded field starting at index `state` of `it.mem`.
#
# Returns `nothing` when `state` is past the end of the memory. Otherwise
# returns `(item, next_state)`, where `item` is either
# * `(tag, type_tag, span)`: the two-byte tag, the one-byte type tag, and the
#   index range in `it.mem` of the value bytes (including the null terminator
#   for 'Z'/'H', and the element type and count header for 'B'), or
# * an `Errors` value, in which case `next_state` is `length(mem) + 1` so that
#   the following call ends the iteration.
#
# Only the layout is checked here; the value bytes are not validated.
function Base.iterate(it::EncodedIterator, state::Int=1)
    mem = it.mem
    state > length(mem) && return nothing
    # A field needs at least two tag bytes, one type byte and one value byte.
    state > length(mem) - 3 && return (Errors.TooShortMemory, length(mem) + 1)
    t1 = @inbounds mem[state]
    t2 = @inbounds mem[state + 1]
    tag = @something try_auxtag(t1, t2) return (Errors.InvalidAuxTag, length(mem) + 1)
    type_tag = @inbounds mem[state + 2]
    start = state + 3
    # Number of bytes available from `start`; at least 1 due to the check above.
    data_length = length(mem) - start + 1
    # One byte values
    stop = if type_tag in (UInt8('C'), UInt8('c'), UInt8('A'))
        start
    # Two byte values
    elseif type_tag in (UInt8('S'), UInt8('s'))
        data_length < 2 && return (Errors.TooShortMemory, length(mem) + 1)
        start + 1
    # Four byte values
    elseif type_tag in (UInt8('I'), UInt8('i'), UInt8('f'))
        data_length < 4 && return (Errors.TooShortMemory, length(mem) + 1)
        start + 3
    # Null-terminated values
    elseif type_tag in (UInt8('Z'), UInt8('H'))
        zeropos = findnext(iszero, mem, start)
        isnothing(zeropos) && return (Errors.NoNullByte, length(mem) + 1)
        zeropos
    # Arrays
    elseif type_tag == UInt8('B')
        # Minimum data length for empty array:
        # Array element type byte plus 4 for array length
        data_length < 5 && return (Errors.TooShortMemory, length(mem) + 1)
        eltype_tag = @inbounds mem[start]
        # Don't use dispatch here for efficiency to avoid type instability
        eltype_size = if eltype_tag in (UInt8('C'), UInt8('c'))
            1
        elseif eltype_tag in (UInt8('S'), UInt8('s'))
            2
        elseif eltype_tag in (UInt8('I'), UInt8('i'), UInt8('f'))
            4
        else
            return (Errors.InvalidArrayEltype, length(mem) + 1)
        end
        # Note: All BAM integers are little endian so we can do this
        n_elements = @inbounds begin
            mem[start + 1] % UInt32 |
            (mem[start + 2] % UInt32) << 8 |
            (mem[start + 3] % UInt32) << 16 |
            (mem[start + 4] % UInt32) << 24
        end
        # Compare the element count against the number of elements that fit,
        # instead of multiplying first: `n_elements * eltype_size` is done in
        # UInt32 when Int is 32 bits and can wrap around for a corrupt count.
        n_elements > div(data_length - 5, eltype_size) &&
            return (Errors.TooShortMemory, length(mem) + 1)
        # The check above guarantees the count fits in an Int and the product
        # cannot overflow.
        start + 4 + (n_elements % Int) * eltype_size
    else
        return (Errors.InvalidTypeTag, length(mem) + 1)
    end
    return ((tag, type_tag, start:stop), stop + 1)
end
# Decode the payload of a 'B' (array) field: one element type byte, a four-byte
# little-endian element count, then the element data. Returns
# `Errors.InvalidArray` if the payload is too short to hold the header.
function load_array(mem::ImmutableMemoryView{UInt8})
    # This might not be possible to hit in practise.
    if length(mem) < 5
        return Errors.InvalidArray
    end
    # The correctness of this byte has already been validated in the EncodedIterator
    T = @inbounds ELTYPE_DICT[mem[1]]
    # Assemble the little-endian element count from bytes 2 to 5.
    count = zero(UInt32)
    @inbounds for i in 0:3
        count |= (mem[2 + i] % UInt32) << (8 * i)
    end
    elements = @inbounds mem[6:end]
    return load_array(T, count, elements)
end
# Reinterpret `mem` as elements of type `T`. Returns the reinterpreted array if
# it holds exactly `n_elements` elements, and `Errors.InvalidArray` otherwise.
function load_array(T::Type, n_elements::UInt32, mem::ImmutableMemoryView{UInt8})
    elements = reinterpret(T, mem)
    # Should not be possible, since the number of elements is used to determine
    # the memory size
    return length(elements) == n_elements ? elements : Errors.InvalidArray
end
# Decode the value bytes `mem` of a field with type `type_tag`.
#
# Returns the decoded value, or an `Errors` value when the bytes are not a
# valid value of that type. `mem` is expected to be the span produced by
# `EncodedIterator` for this field; its length is not re-checked here.
function load_auxvalue(type_tag::UInt8, mem::ImmutableMemoryView{UInt8})
    # Single-byte values are read straight from the view.
    if type_tag == UInt8('C')
        return @inbounds mem[1]
    elseif type_tag == UInt8('c')
        return (@inbounds mem[1]) % Int8
    elseif type_tag == UInt8('A')
        byte = @inbounds mem[1]
        return is_printable_char(byte) ? Char(byte) : Errors.InvalidChar
    end

    # Null-terminated values and arrays are handled by view, not by pointer.
    if type_tag == UInt8('Z')
        # Compensate for null terminator byte
        text = @inbounds mem[1:end-1]
        return is_printable(text) ? StringView(text) : Errors.InvalidString
    elseif type_tag == UInt8('H')
        # Compensate for null terminator byte
        return load_hex(@inbounds mem[1:end-1])
    elseif type_tag == UInt8('B')
        return load_array(mem)
    end

    # Fixed-width little-endian numbers are loaded through a raw pointer, so
    # `mem` must be kept alive for the duration of the load.
    GC.@preserve mem begin
        src = pointer(mem)
        if type_tag == UInt8('s')
            ltoh(unsafe_load(Ptr{Int16}(src)))
        elseif type_tag == UInt8('S')
            ltoh(unsafe_load(Ptr{UInt16}(src)))
        elseif type_tag == UInt8('i')
            ltoh(unsafe_load(Ptr{Int32}(src)))
        elseif type_tag == UInt8('I')
            ltoh(unsafe_load(Ptr{UInt32}(src)))
        elseif type_tag == UInt8('f')
            ltoh(unsafe_load(Ptr{Float32}(src)))
        else
            # should be unreachable, has been validated in EncodedIterator
            Errors.InvalidTypeTag
        end
    end
end
# Remove the first field whose tag equals `k` (converted to `AuxTag`), if any,
# and return `aux`. Throws `AuxException` if an encoding error is encountered
# before the key is found.
function Base.delete!(aux::MutableAuxiliary, k)
    target = convert(AuxTag, k)
    # Spans index into the view starting at `aux.start`; this converts them to
    # indices of the backing vector.
    shift = aux.start - 1
    for encoding in iter_encodings(aux)
        encoding isa Error && throw(AuxException(encoding))
        (tag, _, span) = encoding
        tag == target || continue
        # The field starts three bytes before its value: two tag bytes and
        # one type byte.
        first_byte = first(span) - 3 + shift
        last_byte = last(span) + shift
        deleteat!(aux.x, first_byte:last_byte)
        break
    end
    return aux
end
# Number of value bytes (excluding the type tag) used to encode `x`.

# One-byte values. Note that a `Char` is encoded as a single byte.
function bytes_needed(::Union{Int8, UInt8, Char})
    return 1
end

# Two-byte integers.
function bytes_needed(::Union{Int16, UInt16})
    return 2
end

# Four-byte integers and floats.
function bytes_needed(::Union{Int32, UInt32, Float32})
    return 4
end
# Strings take one byte per code unit, plus the terminating null byte.
function bytes_needed(x::AbstractString)
    units = MemoryView(codeunits(x))
    return length(units) + 1 # null byte
end

# Hex values take two bytes per element of `x.x`, plus the terminating null byte.
function bytes_needed(x::Hex)
    n_encoded = 2 * length(x.x)
    return n_encoded + 1 # null byte
end
# Number of value bytes needed to encode the array `x`: one byte for the
# element type, four bytes for the element count, then the elements.
function bytes_needed(x::AbstractVector{<:AUX_NUMBER_TYPES})
    base = 1 + 4
    # The element size is taken from the first element. `first(x)` is used
    # instead of `@inbounds x[1]`, because index 1 is not guaranteed to be
    # valid for an arbitrary `AbstractVector` (e.g. one with offset axes).
    return isempty(x) ? base : base + bytes_needed(first(x)) * length(x)
end
# Convert `x` to the representation that is written to BAM auxiliary data.

# Values that already have one of the number types in `AUX_NUMBER_TYPES` are
# stored as they are.
function as_bam_aux_value(x::AUX_NUMBER_TYPES)
    return x
end

# Other signed integers are stored as `Int32`.
function as_bam_aux_value(x::Signed)
    return Int32(x)
end

# Other unsigned integers are stored as `UInt32`.
function as_bam_aux_value(x::Unsigned)
    return UInt32(x)
end
# Set the value of key `k` (converted to `AuxTag`) to `val` and return `aux`.
#
# If a field with that tag already exists, it is overwritten in place and the
# data after it is shifted left or right when the new value has a different
# encoded size. Otherwise a new field is appended.
# Throws an error if an encoding error is encountered before the key is found.
function Base.setindex!(aux::MutableAuxiliary, val, k)
    key = convert(AuxTag, k)
    for v in iter_encodings(aux)
        v isa Error && error("Cannot set value into invalid Auxiliary")
        (tag, _, span) = v
        tag == key || continue
        bam_val = as_bam_aux_value(val)
        n_bytes_needed = bytes_needed(bam_val)
        if length(span) > n_bytes_needed
            # Shift left: drop the surplus bytes, beginning at the type tag
            # byte. `span` indexes into the view that begins at `aux.start`,
            # whereas `deleteat!` works on the backing vector, so the indices
            # must be offset. Without the offset the wrong bytes, including
            # data before `aux.start`, would be deleted whenever
            # `aux.start > 1`.
            leftshift = length(span) - n_bytes_needed
            offset = aux.start - 1
            first_deleted = first(span) - 1 + offset
            deleteat!(aux.x, first_deleted:(first_deleted + leftshift - 1))
        elseif length(span) < n_bytes_needed
            # Shift right: grow the vector and move everything after the old
            # value towards the end, iterating backwards so that no byte is
            # overwritten before it has been moved.
            rightshift = n_bytes_needed - length(span)
            oldsize = length(aux.x)
            resize!(aux.x, oldsize + rightshift)
            mem = MemoryView(aux)
            @inbounds for i in length(mem):-1:(last(span) + rightshift + 1)
                mem[i] = mem[i - rightshift]
            end
        end
        # The destination covers the type tag byte plus the value bytes.
        mem = @inbounds MemoryView(aux)[(first(span) - 1):(first(span) + n_bytes_needed - 1)]
        write_auxvalue_typetag!(mem, bam_val)
        return aux
    end
    setindex_nonexisting!(aux, val, key)
    return aux
end
# Append a new field with tag `k` and value `val` to the end of `aux`, without
# checking whether the tag is already present. Returns `aux`.
function setindex_nonexisting!(aux::MutableAuxiliary, val, k)
    tag = convert(AuxTag, k)
    encoded = as_bam_aux_value(val)
    payload_size = bytes_needed(encoded)
    buffer = aux.x
    tag_pos = length(buffer) + 1
    # Room for two tag bytes, one type byte and the value bytes.
    resize!(buffer, tag_pos + 2 + payload_size)
    @inbounds buffer[tag_pos] = tag.x[1]
    @inbounds buffer[tag_pos + 1] = tag.x[2]
    # The destination is the tail of the data: the type byte plus the value.
    dest = @inbounds MemoryView(aux)[(end - payload_size):end]
    write_auxvalue_typetag!(dest, encoded)
    return aux
end
# Write the type tag of `bam_val` to `mem[1]`, followed by its encoded value.
# The length of `mem` is not checked here; the callers in this file size it
# as one byte plus `bytes_needed(bam_val)`.
function write_auxvalue_typetag!(mem::MutableMemoryView{UInt8}, bam_val)
    type_tag = get_type_tag(typeof(bam_val))
    @inbounds mem[1] = type_tag
    if type_tag in (UInt8('C'), UInt8('c'))
        @inbounds mem[2] = reinterpret(UInt8, bam_val)
    elseif type_tag == UInt8('A')
        # Store the most significant byte of the Char's 32-bit representation.
        @inbounds mem[2] = (reinterpret(UInt32, bam_val) >> 24) % UInt8
    elseif type_tag == UInt8('Z')
        unsafe_copyto!(mem[2:end], ImmutableMemoryView(codeunits(bam_val)))
        @inbounds mem[end] = 0x00
    elseif type_tag == UInt8('H')
        hex_dest = @inbounds mem[2:end-1]
        hexencode!(hex_dest, bam_val)
        @inbounds mem[end] = 0x00
    else
        # The remaining types are written through raw pointers, so `mem` must
        # be kept alive while they are in use.
        GC.@preserve mem begin
            # Address of `mem[2]`, the first byte after the type tag.
            payload_ptr = pointer(mem) + 1
            if type_tag in (UInt8('S'), UInt8('s'))
                unsafe_store!(Ptr{UInt16}(payload_ptr), htol(reinterpret(UInt16, bam_val)))
            elseif type_tag in (UInt8('i'), UInt8('I'), UInt8('f'))
                unsafe_store!(Ptr{UInt32}(payload_ptr), htol(reinterpret(UInt32, bam_val)))
            elseif type_tag == UInt8('B')
                @inbounds mem[2] = get_type_tag(eltype(bam_val))
                # Pointer addition is in bytes: the element count is stored in
                # the four bytes following the element type byte.
                count_ptr = Ptr{UInt32}(payload_ptr) + 1
                unsafe_store!(count_ptr, htol(length(bam_val) % UInt32))
                # The elements follow the element type byte and the count.
                elem_ptr = Ptr{eltype(bam_val)}(payload_ptr + 5)
                for num in bam_val
                    unsafe_store!(elem_ptr, htol(num))
                    elem_ptr += sizeof(num)
                end
            else
                error("Unreachable")
            end
        end
    end
end
# Return the value of the first field whose tag equals `AuxTag(k)`, or
# `default` if there is no such field. The returned value is whatever
# `load_auxvalue` produces, which may be an `Error`.
# Throws `AuxException` if an encoding error is encountered before the key is
# found.
function Base.get(aux::Auxiliary, k, default)
    wanted = AuxTag(k)
    encodings = iter_encodings(aux)
    for encoding in encodings
        encoding isa Error && throw(AuxException(encoding))
        (tag, type_tag, span) = encoding
        tag == wanted || continue
        return load_auxvalue(type_tag, @inbounds encodings.mem[span])
    end
    return default
end
end # module