```julia
bags = Mill.length2bags(n)
builtin(x, bags, z) = Mill.segmented_sum_forw(x, vec(z), bags, nothing)

function naive(x, bags, z)
    o = similar(x, size(x, 1), length(bags))
    foreach(enumerate(bags)) do (i, b)
        if isempty(b)
            # empty bags get the default value `z`
            o[:, i] .= z
        else
            # non-empty bags sum the columns of `x` indexed by the bag `b`
            @inbounds o[:, i] = sum(@view(x[:, b]), dims = 2)
        end
    end
    o
end

builtin(x, bags, z) ≈ naive(x, bags, z)
```
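
Both implementations should give the same result; to see how they compare in speed, they can be benchmarked on the same inputs (a small sketch, assuming `x`, `bags`, and `z` from the setup above and `BenchmarkTools` available):

```julia
using BenchmarkTools

# benchmark the built-in segmented sum against the naive loop on identical inputs
@btime builtin($x, $bags, $z);
@btime naive($x, $bags, $z);
```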

The most trivial example of a kernel is addition, shown below for CUDA and for KernelAbstractions with Metal.

== CUDA

```julia
using CUDA

function vadd!(c, a, b, n)
    # global index of this thread across all blocks
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= n
        c[i] = a[i] + b[i]
    end
    return
end

a = CuArray(Float32.(1:10000))
b = CuArray(Float32.(2:2:20000))
c = similar(a)
@cuda threads=1024 blocks=cld(length(a), 1024) vadd!(c, a, b, length(a))
all(@. c == a + b)
```
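
`cld(length(a), 1024)` launches just enough blocks of 1024 threads to cover every element, which is why the kernel guards with `i <= n`. For comparison, the same operation can be expressed with broadcasting, which CUDA.jl turns into a kernel for us (a small sketch reusing the arrays above; the name `c2` is ours):

```julia
# broadcasting builds and launches an equivalent element-wise kernel
c2 = similar(a)
CUDA.@sync c2 .= a .+ b
all(c2 .== c)
```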

== KernelAbstractions with Metal

```julia
using Metal, KernelAbstractions

@kernel function vadd!(c, a, b, n)
    # global index of this work item
    i = @index(Global)
    if i ≤ n
        @inbounds c[i] = a[i] + b[i]
    end
end

a = MtlArray(Float32.(1:10000))
b = MtlArray(Float32.(2:2:20000))
c = similar(a)

backend = KernelAbstractions.get_backend(a)
vadd!(backend, 64)(c, a, b, length(a), ndrange=size(a))
all(@. c == a + b)
```
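
`vadd!(backend, 64)` instantiates the kernel for the backend inferred from the arrays with a workgroup size of 64, and `ndrange` sets the total number of work items. Because the kernel itself is backend-agnostic, the very same definition should run on CUDA by switching the array type (a sketch, not verified here):

```julia
# hypothetical: reusing the identical KernelAbstractions kernel on an NVIDIA GPU
using CUDA
a = CuArray(Float32.(1:10000))
b = CuArray(Float32.(2:2:20000))
c = similar(a)
backend = KernelAbstractions.get_backend(a)   # CUDABackend()
vadd!(backend, 64)(c, a, b, length(a), ndrange=size(a))
KernelAbstractions.synchronize(backend)
all(@. c == a + b)
```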

We can use **atomic** operations to mark that the reduction operation has to be performed by only one thread at a time, so that concurrent updates of the accumulator do not overwrite each other.

== CUDA

```julia
using CUDA, BenchmarkTools

function reduce_atomic(op, a, b)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    if i <= length(a)
        # every thread atomically folds its element into the single accumulator b[]
        CUDA.@atomic b[] = op(b[], a[i])
    end
    return
end

x = rand(Float32, 1024, 1024)
```
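
A minimal sketch of launching the kernel and comparing against `sum(x)` (the thread/block configuration here is an assumption, mirroring the `vadd!` launch):

```julia
cx = CuArray(x)
cb = CuArray([0f0])
@cuda threads=1024 blocks=cld(length(cx), 1024) reduce_atomic(+, cx, cb)
CUDA.@allowscalar cb[]   # ≈ sum(x), up to floating-point rounding
sum(x)
```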

== KernelAbstractions with Metal

```julia
using Atomix, Metal, KernelAbstractions, BenchmarkTools

@kernel function reduce_atomic(a, b)
    i = @index(Global)
    # Atomix provides the portable atomic update inside KernelAbstractions kernels
    Atomix.@atomic b[] += a[i]
end

x = rand(Float32, 1024, 1024);
cx = MtlArray(x);
backend = KernelAbstractions.get_backend(cx);
# cb = zeros(backend, Float32, 1)
cb = MtlArray([0f0]);
reduce_atomic(backend, 64)(cx, cb, ndrange=size(cx))
```
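
To read the result back, the one-element buffer can be copied to the CPU (a small sketch; atomic floating-point accumulation matches `sum(x)` only up to rounding):

```julia
# copy the single-element accumulator back to the CPU and compare
Array(cb)[]
sum(x)   # the two values agree up to floating-point rounding
```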

The parallel reduction is tricky. **Let's assume that we are allowed to overwrite the input array `a`.** Within one block, pairs of elements are summed in place in a tree-like fashion, halving the number of active threads in every step, until the result of the whole block ends up in the first element.

== CUDA

```julia
using CUDA, BenchmarkTools

function reduce_block(op, a, b)
    # each block reduces 2 * blockDim().x elements
    elements = 2 * blockDim().x
    thread = threadIdx().x

    # parallel reduction of values in a block
    d = 1
    while d < elements
        sync_threads()
        index = 2 * d * (thread - 1) + 1
        @inbounds if index <= elements && index + d <= length(a)
            @cuprintln "thread $thread: a[$index] + a[$(index + d)] = $(a[index]) + $(a[index + d]) = $(op(a[index], a[index + d]))"
            a[index] = op(a[index], a[index + d])
        end
        d *= 2
        thread == 1 && @cuprintln()
    end

    if thread == 1
        b[] = a[1]
    end

    return
end

a = CuArray(1:16);
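
# a sketch of the launch (an assumption): one block of 8 threads reduces the 2 * 8 = 16 elements
b = CuArray([0])
@cuda threads=8 reduce_block(+, a, b)
CUDA.@allowscalar b[]   # == sum(1:16) == 136
```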

== KernelAbstractions with Metal

```julia
using Atomix, Metal, KernelAbstractions, BenchmarkTools

@kernel function reduce_block(a, b)
    # each workgroup reduces 2 * prod(@groupsize()) elements
    elements = 2 * prod(@groupsize())
    thread = @index(Local)

    # parallel reduction of values in a block
    d = 1
    while d < elements
        index = 2 * d * (thread - 1) + 1
        if index <= elements && index + d <= length(a)
            KernelAbstractions.@print "thread $thread: a[$index] + a[$(index + d)] = $(a[index]) + $(a[index + d]) = $(a[index] + a[index + d])\n"
            a[index] += a[index + d]
        end
        d *= 2
        thread == 1 && KernelAbstractions.@print "\n"
        @synchronize
    end

    if thread == 1
        b[] = a[1]
    end
end

a = MtlArray(1:16);
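
# a sketch of the launch (an assumption): one workgroup of 8 threads covers the 16 elements
b = MtlArray([0])
backend = KernelAbstractions.get_backend(a)
reduce_block(backend, 8)(a, b, ndrange = 8)
KernelAbstractions.synchronize(backend)
Array(b)[]   # == sum(1:16) == 136
```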

To extend the above to multiple blocks, we need to add a reduction over blocks. One way is to let each block reduce its own segment of the input and then combine the per-block results with an **atomic** update of the output, as sketched below.

== CUDA

```julia
using CUDA, BenchmarkTools

function reduce_grid_atomic(op, a, b)
    # each block owns a contiguous segment of 2 * blockDim().x elements
    elements = 2 * blockDim().x
    offset = 2 * (blockIdx().x - 1) * blockDim().x
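
    # the rest of the kernel body is a sketch (an assumption), reusing the in-block
    # reduction from `reduce_block` on this block's segment of `a`
    thread = threadIdx().x

    d = 1
    while d < elements
        sync_threads()
        index = 2 * d * (thread - 1) + 1
        @inbounds if index <= elements && offset + index + d <= length(a)
            a[offset + index] = op(a[offset + index], a[offset + index + d])
        end
        d *= 2
    end

    # one thread per block folds the block's partial result into the global accumulator
    if thread == 1
        CUDA.@atomic b[] = op(b[], a[offset + 1])
    end

    return
end

# launch sketch (also an assumption): each block of 1024 threads handles 2048 elements
x = rand(Float32, 1024, 1024)
cx = CuArray(x)
cb = CuArray([0f0])
@cuda threads=1024 blocks=cld(length(cx), 2 * 1024) reduce_grid_atomic(+, cx, cb)
CUDA.@allowscalar cb[]   # ≈ sum(x)
sum(x)
```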

== KernelAbstractions with Metal

```julia
using Atomix, Metal, KernelAbstractions, BenchmarkTools

@kernel function reduce_grid_atomic(a, b)
    block_dim = prod(@groupsize())
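
    # the rest of the kernel body is a sketch (an assumption), mirroring the CUDA version
    # above: reduce this workgroup's segment, then combine the groups atomically
    thread = @index(Local)
    block = @index(Group)
    elements = 2 * block_dim
    offset = 2 * (block - 1) * block_dim

    d = 1
    while d < elements
        index = 2 * d * (thread - 1) + 1
        if index <= elements && offset + index + d <= length(a)
            a[offset + index] += a[offset + index + d]
        end
        d *= 2
        @synchronize
    end

    # one thread per workgroup folds the partial result into the global accumulator
    if thread == 1
        Atomix.@atomic b[] += a[offset + 1]
    end
end

# launch sketch (also an assumption): workgroups of 1024 threads, one thread per pair
x = rand(Float32, 1024, 1024)
cx = MtlArray(x)
cb = MtlArray([0f0])
backend = KernelAbstractions.get_backend(cx)
reduce_grid_atomic(backend, 1024)(cx, cb, ndrange = length(cx) ÷ 2)
KernelAbstractions.synchronize(backend)
Array(cb)[]   # ≈ sum(x)
```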