# Atomic Functions (B.12)
#
# Low-level intrinsics
#
# TODO:
# - scoped atomics: _system and _block versions (see CUDA programming guide, sm_60+)
#   https://github.com/Microsoft/clang/blob/86d4513d3e0daa4d5a29b0b1de7c854ca15f9fe5/test/CodeGen/builtins-nvptx.c#L293


## LLVM

# all atomic operations have acquire and/or release semantics,
# depending on whether they load or store values (mimics Base)
const atomic_acquire = LLVM.API.LLVMAtomicOrderingAcquire
const atomic_release = LLVM.API.LLVMAtomicOrderingRelease
const atomic_acquire_release = LLVM.API.LLVMAtomicOrderingAcquireRelease
# common arithmetic operations on integers using LLVM instructions
#
# > 8.6.6. atomicrmw Instruction
# >
# > nand is not supported. The other keywords are supported for i32 and i64 types, with the
# > following restrictions.
# >
# > - The pointer must be either a global pointer, a shared pointer, or a generic pointer
# >   that points to either the global address space or the shared address space.

@generated function llvm_atomic_op(::Val{binop}, ptr::LLVMPtr{T,A}, val::T) where {binop, T, A}
    @dispose ctx=Context() begin
        T_val = convert(LLVMType, T)
        T_ptr = convert(LLVMType, ptr)
        T_typed_ptr = LLVM.PointerType(T_val, A)
        llvm_f, _ = create_function(T_val, [T_ptr, T_val])

        @dispose builder=IRBuilder() begin
            entry = BasicBlock(llvm_f, "entry")
            position!(builder, entry)

            typed_ptr = bitcast!(builder, parameters(llvm_f)[1], T_typed_ptr)
            rv = atomic_rmw!(builder, binop,
                             typed_ptr, parameters(llvm_f)[2],
                             atomic_acquire_release, #=single_threaded=# false)
            ret!(builder, rv)
        end

        call_function(llvm_f, T, Tuple{LLVMPtr{T,A}, T}, :ptr, :val)
    end
end

const binops = Dict(
    :xchg => LLVM.API.LLVMAtomicRMWBinOpXchg,
    :add  => LLVM.API.LLVMAtomicRMWBinOpAdd,
    :sub  => LLVM.API.LLVMAtomicRMWBinOpSub,
    :and  => LLVM.API.LLVMAtomicRMWBinOpAnd,
    :or   => LLVM.API.LLVMAtomicRMWBinOpOr,
    :xor  => LLVM.API.LLVMAtomicRMWBinOpXor,
    :max  => LLVM.API.LLVMAtomicRMWBinOpMax,
    :min  => LLVM.API.LLVMAtomicRMWBinOpMin,
    :umax => LLVM.API.LLVMAtomicRMWBinOpUMax,
    :umin => LLVM.API.LLVMAtomicRMWBinOpUMin,
    :fadd => LLVM.API.LLVMAtomicRMWBinOpFAdd,
    :fsub => LLVM.API.LLVMAtomicRMWBinOpFSub,
)

for T in (Int32, Int64, UInt32, UInt64)
    ops = [:xchg, :add, :sub, :and, :or, :xor, :max, :min]

    for op in ops
        # LLVM distinguishes signedness in the operation, not the integer type.
        rmw = if T <: Unsigned && (op == :max || op == :min)
            Symbol("u$op")
        else
            Symbol("$op")
        end

        fn = Symbol("atomic_$(op)!")
        @eval @inline $fn(ptr::Union{LLVMPtr{$T,AS.Generic},
                                     LLVMPtr{$T,AS.Global},
                                     LLVMPtr{$T,AS.Shared}}, val::$T) =
            llvm_atomic_op($(Val(binops[rmw])), ptr, val)
    end
end
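
# The loop above generates methods like `atomic_add!(::LLVMPtr{Int32}, ::Int32)`. A minimal
# usage sketch, assuming a CUDA.jl-style kernel and a hypothetical one-element device array
# `counter`:
#
#     function count_kernel(counter)
#         atomic_add!(pointer(counter), Int32(1))  # race-free concurrent increment
#         return
#     end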

for T in (:Float32, :Float64)
    ops = [:add]

    for op in ops
        # LLVM has specific operations for floating point types.
        rmw = Symbol("f$op")

        fn = Symbol("atomic_$(op)!")
        # XXX: cannot select
        @eval @inline $fn(ptr::Union{LLVMPtr{$T,AS.Generic},
                                     LLVMPtr{$T,AS.Global},
                                     LLVMPtr{$T,AS.Shared}}, val::$T) =
            llvm_atomic_op($(Val(binops[rmw])), ptr, val)
    end

    # there's no specific NVVM intrinsic for fsub, which results in a selection error,
    # so emulate atomic subtraction by atomically adding the negated value.
    @eval @inline atomic_sub!(ptr::Union{LLVMPtr{$T,AS.Generic},
                                         LLVMPtr{$T,AS.Global},
                                         LLVMPtr{$T,AS.Shared}}, val::$T) =
        atomic_add!(ptr, -val)
end
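
# In other words, a floating-point `atomic_sub!` executes as an atomic addition of the
# negation, e.g. (illustrative values):
#
#     atomic_sub!(ptr, 1f0)   # performed as atomic_add!(ptr, -1f0)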

@generated function llvm_atomic_cas(ptr::LLVMPtr{T,A}, cmp::T, val::T) where {T, A}
    @dispose ctx=Context() begin
        T_val = convert(LLVMType, T)
        T_ptr = convert(LLVMType, ptr)
        T_typed_ptr = LLVM.PointerType(T_val, A)
        llvm_f, _ = create_function(T_val, [T_ptr, T_val, T_val])

        @dispose builder=IRBuilder() begin
            entry = BasicBlock(llvm_f, "entry")
            position!(builder, entry)

            typed_ptr = bitcast!(builder, parameters(llvm_f)[1], T_typed_ptr)
            res = atomic_cmpxchg!(builder, typed_ptr, parameters(llvm_f)[2],
                                  parameters(llvm_f)[3], atomic_acquire_release,
                                  atomic_acquire, #=single threaded=# false)
            rv = extract_value!(builder, res, 0)
            ret!(builder, rv)
        end

        call_function(llvm_f, T, Tuple{LLVMPtr{T,A}, T, T}, :ptr, :cmp, :val)
    end
end

for T in (:Int32, :Int64, :UInt32, :UInt64)
    @eval @inline atomic_cas!(ptr::LLVMPtr{$T}, cmp::$T, val::$T) =
        llvm_atomic_cas(ptr, cmp, val)
end
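
# A sketch of the CAS retry loop these methods enable (hypothetical `ptr::LLVMPtr{Int32}`;
# the same pattern underlies `atomic_op!` further below):
#
#     old = Base.unsafe_load(ptr)
#     while true
#         new = old * Int32(2)               # any update, native or not
#         seen = atomic_cas!(ptr, old, new)  # returns the value actually observed
#         seen == old && break               # success: our update landed
#         old = seen                         # lost the race; retry with the fresh value
#     end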

# NVPTX doesn't support cmpxchg with i16 yet
for A in (AS.Generic, AS.Global, AS.Shared), T in (:Int16, :UInt16)
    if A == AS.Global
        scope = ".global"
    elseif A == AS.Shared
        scope = ".shared"
    else
        scope = ""
    end

    intr = "atom$scope.cas.b16 \$0, [\$1], \$2, \$3;"
    @eval @device_function @inline atomic_cas!(ptr::LLVMPtr{$T,$A}, cmp::$T, val::$T) =
        @asmcall($intr, "=h,l,h,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T,$T}, ptr, cmp, val)
end


## NVVM

# integer increment/decrement operations using NVVM intrinsics

for A in (AS.Generic, AS.Global, AS.Shared)
    # declare i32 @llvm.nvvm.atomic.load.inc.32.p0i32(i32* address, i32 val)
    # declare i32 @llvm.nvvm.atomic.load.inc.32.p1i32(i32 addrspace(1)* address, i32 val)
    # declare i32 @llvm.nvvm.atomic.load.inc.32.p3i32(i32 addrspace(3)* address, i32 val)
    #
    # declare i32 @llvm.nvvm.atomic.load.dec.32.p0i32(i32* address, i32 val)
    # declare i32 @llvm.nvvm.atomic.load.dec.32.p1i32(i32 addrspace(1)* address, i32 val)
    # declare i32 @llvm.nvvm.atomic.load.dec.32.p3i32(i32 addrspace(3)* address, i32 val)
    for T in (Int32,), op in (:inc, :dec)
        nb = sizeof(T)*8
        fn = Symbol("atomic_$(op)!")
        intr = "llvm.nvvm.atomic.load.$op.$nb.p$(convert(Int, A))i$nb"
        @eval @device_function @inline $fn(ptr::LLVMPtr{$T,$A}, val::$T) =
            @typed_ccall($intr, llvmcall, $T, (LLVMPtr{$T,$A}, $T), ptr, val)
    end
end
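
# `atomic_inc!` wraps back to zero once the value reaches `val`, which makes it convenient
# for cycling through a fixed range of slots. An illustrative sketch, with `ptr` pointing
# to a hypothetical Int32 counter:
#
#     slot = atomic_inc!(ptr, Int32(N - 1))   # old value; cycles 0, 1, ..., N-1, 0, ...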


## PTX

# half-precision atomics using a PTX instruction

for A in (AS.Generic, AS.Global, AS.Shared), T in (:Float16,)
    if A == AS.Global
        scope = ".global"
    elseif A == AS.Shared
        scope = ".shared"
    else
        scope = ""
    end

    intr = "atom$scope.add.noftz.f16 \$0, [\$1], \$2;"
    @eval @device_function @inline atomic_add!(ptr::LLVMPtr{$T,$A}, val::$T) =
        @asmcall($intr, "=h,l,h", true, $T, Tuple{Core.LLVMPtr{$T,$A},$T}, ptr, val)
end


## Julia

# floating-point CAS via bitcasting

inttype(::Type{T}) where {T<:Integer} = T
inttype(::Type{Float16})  = Int16
inttype(::Type{Float32})  = Int32
inttype(::Type{Float64})  = Int64
inttype(::Type{BFloat16}) = Int16

for T in [:Float16, :Float32, :Float64, :BFloat16]
    @eval @inline function atomic_cas!(ptr::LLVMPtr{$T,A}, cmp::$T, new::$T) where {A}
        IT = inttype($T)
        cmp_i = reinterpret(IT, cmp)
        new_i = reinterpret(IT, new)
        old_i = atomic_cas!(reinterpret(LLVMPtr{IT,A}, ptr), cmp_i, new_i)
        return reinterpret($T, old_i)
    end
end
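
# The reinterpret trick above turns a floating-point CAS into an integer CAS on the same
# bits, roughly (hypothetical `ptr::LLVMPtr{Float32,AS.Global}`):
#
#     old = atomic_cas!(ptr, 1.5f0, 2.5f0)
#     # ≈ reinterpret(Float32,
#     #       atomic_cas!(reinterpret(LLVMPtr{Int32,AS.Global}, ptr),
#     #                   reinterpret(Int32, 1.5f0), reinterpret(Int32, 2.5f0)))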

# generic atomic support using compare-and-swap
@inline function atomic_op!(ptr::LLVMPtr{T}, op::Function, val) where {T}
    old = Base.unsafe_load(ptr)
    while true
        cmp = old
        new = convert(T, op(old, val))
        old = atomic_cas!(ptr, cmp, new)
        isequal(old, cmp) && return new
    end
end
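
# This makes arbitrary operations atomic at the cost of a retry loop, e.g. (sketch with a
# hypothetical device pointer `ptr::LLVMPtr{Float64}`):
#
#     atomic_op!(ptr, +, 1.0)   # CAS-based atomic addition, works on any hardware
#     atomic_op!(ptr, /, 2.0)   # even operations without native atomic support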


## documentation

"""
    atomic_cas!(ptr::LLVMPtr{T}, cmp::T, val::T)

Reads the value `old` located at address `ptr` and compares it with `cmp`. If `old` equals
`cmp`, stores `val` at the same address; otherwise, the value at the address is left
unchanged. These operations are performed in one atomic transaction. The function returns
`old`.

This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
Additionally, on GPU hardware with compute capability 7.0+, values of type Int16 and UInt16
are supported. Floating-point values are handled through a compare-and-swap on the
equivalent integer type.
"""
atomic_cas!
"""
atomic_xchg!(ptr::LLVMPtr{T}, val::T)
Reads the value `old` located at address `ptr` and stores `val` at the same address. These
operations are performed in one atomic transaction. The function returns `old`.
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
"""
atomic_xchg!
"""
atomic_add!(ptr::LLVMPtr{T}, val::T)
Reads the value `old` located at address `ptr`, computes `old + val`, and stores the result
back to memory at the same address. These operations are performed in one atomic
transaction. The function returns `old`.
This operation is supported for values of type Int32, Int64, UInt32, UInt64, and Float32.
Additionally, on GPU hardware with compute capability 6.0+, values of type Float64 are
supported.
"""
atomic_add!
"""
atomic_sub!(ptr::LLVMPtr{T}, val::T)
Reads the value `old` located at address `ptr`, computes `old - val`, and stores the result
back to memory at the same address. These operations are performed in one atomic
transaction. The function returns `old`.
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
"""
atomic_sub!
"""
atomic_and!(ptr::LLVMPtr{T}, val::T)
Reads the value `old` located at address `ptr`, computes `old & val`, and stores the result
back to memory at the same address. These operations are performed in one atomic
transaction. The function returns `old`.
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
"""
atomic_and!
"""
atomic_or!(ptr::LLVMPtr{T}, val::T)
Reads the value `old` located at address `ptr`, computes `old | val`, and stores the result
back to memory at the same address. These operations are performed in one atomic
transaction. The function returns `old`.
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
"""
atomic_or!
"""
atomic_xor!(ptr::LLVMPtr{T}, val::T)
Reads the value `old` located at address `ptr`, computes `old ⊻ val`, and stores the result
back to memory at the same address. These operations are performed in one atomic
transaction. The function returns `old`.
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
"""
atomic_xor!
"""
atomic_min!(ptr::LLVMPtr{T}, val::T)
Reads the value `old` located at address `ptr`, computes `min(old, val)`, and stores the
result back to memory at the same address. These operations are performed in one atomic
transaction. The function returns `old`.
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
"""
atomic_min!
"""
atomic_max!(ptr::LLVMPtr{T}, val::T)
Reads the value `old` located at address `ptr`, computes `max(old, val)`, and stores the
result back to memory at the same address. These operations are performed in one atomic
transaction. The function returns `old`.
This operation is supported for values of type Int32, Int64, UInt32 and UInt64.
"""
atomic_max!
"""
atomic_inc!(ptr::LLVMPtr{T}, val::T)
Reads the value `old` located at address `ptr`, computes `((old >= val) ? 0 : (old+1))`, and
stores the result back to memory at the same address. These three operations are performed
in one atomic transaction. The function returns `old`.
This operation is only supported for values of type Int32.
"""
atomic_inc!
"""
atomic_dec!(ptr::LLVMPtr{T}, val::T)
Reads the value `old` located at address `ptr`, computes `(((old == 0) | (old > val)) ? val
: (old-1) )`, and stores the result back to memory at the same address. These three
operations are performed in one atomic transaction. The function returns `old`.
This operation is only supported for values of type Int32.
"""
atomic_dec!


#
# High-level interface
#

# prototype of a high-level interface for performing atomic operations on arrays
#
# this design could be generalized by having atomic {field,array}{set,ref} accessors, as
# well as acquire/release operations to implement the fallback functionality where any
# operation can be applied atomically.

const inplace_ops = Dict(
    :(+=)   => :(+),
    :(-=)   => :(-),
    :(*=)   => :(*),
    :(/=)   => :(/),
    :(\=)   => :(\),
    :(%=)   => :(%),
    :(^=)   => :(^),
    :(&=)   => :(&),
    :(|=)   => :(|),
    :(⊻=)   => :(⊻),
    :(>>>=) => :(>>>),
    :(>>=)  => :(>>),
    :(<<=)  => :(<<),
)

struct AtomicError <: Exception
    msg::AbstractString
end

Base.showerror(io::IO, err::AtomicError) =
    print(io, "AtomicError: ", err.msg)
"""
@atomic a[I] = op(a[I], val)
@atomic a[I] ...= val
Atomically perform a sequence of operations that loads an array element `a[I]`, performs the
operation `op` on that value and a second value `val`, and writes the result back to the
array. This sequence can be written out as a regular assignment, in which case the same
array element should be used in the left and right hand side of the assignment, or as an
in-place application of a known operator. In both cases, the array reference should be pure
and not induce any side-effects.
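
# Examples

A sketch of typical use inside a kernel, assuming `a` is a device array and `i` a valid
index (names are illustrative):

```julia
@atomic a[i] += val            # in-place operator form
@atomic a[i] = max(a[i], val)  # assignment form; note the repeated array reference
```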

!!! warning
    This interface is experimental, and might change without warning. Use the lower-level
    `atomic_...!` functions for a stable API, albeit one limited to natively-supported ops.
"""

macro atomic(ex)
    # decode assignment and call
    if ex.head == :(=)
        ref = ex.args[1]
        rhs = ex.args[2]
        Meta.isexpr(rhs, :call) ||
            throw(AtomicError("right-hand side of an @atomic assignment should be a call"))
        op = rhs.args[1]
        if rhs.args[2] != ref
            throw(AtomicError("right-hand side of a non-inplace @atomic assignment should reference the left-hand side"))
        end
        val = rhs.args[3]
    elseif haskey(inplace_ops, ex.head)
        op = inplace_ops[ex.head]
        ref = ex.args[1]
        val = ex.args[2]
    else
        throw(AtomicError("unknown @atomic expression"))
    end

    # decode array expression
    Meta.isexpr(ref, :ref) ||
        throw(AtomicError("@atomic should be applied to an array reference expression"))
    array = ref.args[1]
    indices = Expr(:tuple, ref.args[2:end]...)

    esc(quote
        $atomic_arrayset($array, $indices, $op, $val)
    end)
end
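
# For instance (illustrative expansion):
#
#     @atomic a[i] += val
#     # ≈ atomic_arrayset(a, (i,), +, val)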

# FIXME: make this respect the indexing style
@inline atomic_arrayset(A::AbstractArray{T}, Is::Tuple, op::Function, val) where {T} =
    atomic_arrayset(A, Base._to_linear_index(A, Is...), op, convert(T, val))

# native atomics
for (op,impl,typ) in [(:(+), :(atomic_add!), [:UInt32,:Int32,:UInt64,:Int64,:Float32]),
                      (:(-), :(atomic_sub!), [:UInt32,:Int32,:UInt64,:Int64,:Float32]),
                      (:(&), :(atomic_and!), [:UInt32,:Int32,:UInt64,:Int64]),
                      (:(|), :(atomic_or!),  [:UInt32,:Int32,:UInt64,:Int64]),
                      (:(⊻), :(atomic_xor!), [:UInt32,:Int32,:UInt64,:Int64]),
                      (:max, :(atomic_max!), [:UInt32,:Int32,:UInt64,:Int64]),
                      (:min, :(atomic_min!), [:UInt32,:Int32,:UInt64,:Int64])]
    @eval @inline atomic_arrayset(A::AbstractArray{T}, I::Integer, ::typeof($op),
                                  val::T) where {T<:Union{$(typ...)}} =
        $impl(pointer(A, I), val)
end

# native atomics that are not supported on all devices
@inline function atomic_arrayset(A::AbstractArray{T}, I::Integer, op::typeof(+),
                                 val::T) where {T <: Union{Float64}}
    ptr = pointer(A, I)
    if compute_capability() >= sv"6.0"
        atomic_add!(ptr, val)
    else
        atomic_op!(ptr, op, val)
    end
end

# fallback using compare-and-swap
@inline atomic_arrayset(A::AbstractArray{T}, I::Integer, op::Function, val) where {T} =
    atomic_op!(pointer(A, I), op, val)
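
# Dispatch summary (illustrative): `@atomic a[i] += 1f0` on a Float32 array hits the native
# `atomic_add!` method above, while e.g. `@atomic a[i] /= 2f0` has no native atomic and
# falls back to the CAS loop in `atomic_op!`.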