@@ -650,28 +650,28 @@ To extend the above for multiple blocks, we need to add reduction over blocks. T
650650using CUDA, BenchmarkTools
651651
"""
    reduce_grid_atomic(op, a, b)

Kernel body that reduces `a` with the binary operator `op` and accumulates the
result into `b[]`.

Each block works on its own slice of `2 * blockDim().x` consecutive elements of
`a`, starting at `offset + 1`. The slice is reduced in place (so `a` is
overwritten with partial results), after which thread 1 of the block combines
the slice's value, stored in `a[offset + 1]`, into `b[]` with an atomic update.

NOTE(review): presumably `b` must hold a suitable initial value for `op` before
launch, and the grid must be sized so every block has at least one element —
confirm against the launch code.
"""
function reduce_grid_atomic(op, a, b)
    elements = 2 * blockDim().x
    offset = 2 * (blockIdx().x - 1) * blockDim().x
    thread = threadIdx().x

    # parallel reduction of values within the single block
    d = 1
    while d < elements
        # every thread runs the same number of iterations, so the barrier is
        # reached uniformly; it orders the writes of the previous level before
        # the reads of this one
        sync_threads()
        index = 2 * d * (thread - 1) + 1
        # `index + d <= elements` keeps the partner element inside this block's
        # slice (checking only `index` lets a thread read the next block's data
        # when the block size is not a power of two); the second test stops at
        # the end of `a` for a partially filled last block
        @inbounds if index + d <= elements && index + d + offset <= length(a)
            index += offset
            a[index] = op(a[index], a[index + d])
        end
        d *= 2
    end

    # atomic reduction of this block's value
    if thread == 1
        CUDA.@atomic b[] = op(b[], a[offset + 1])
    end

    return
end
676676
677677x = rand(Float32, 1024 , 1024 )
@@ -688,27 +688,27 @@ sum(x)
688688using Atomix, Metal, KernelAbstractions, BenchmarkTools
689689
# reduce_grid_atomic(a, b)
#
# Sum-reduction kernel: each workgroup adds up its own slice of
# `2 * prod(@groupsize())` consecutive elements of `a` in place (so `a` is
# overwritten with partial sums), then the first work-item of the group adds
# the slice's total, stored in `a[offset + 1]`, to `b[]` atomically.
#
# NOTE(review): presumably `b` is zero-initialised by the caller and the ndrange
# is chosen so every group has at least one element — confirm at the call site.
@kernel function reduce_grid_atomic(a, b)
    block_dim = prod(@groupsize())
    elements = 2 * block_dim
    offset = 2 * (@index(Group) - 1) * block_dim
    thread = @index(Local)

    # parallel reduction of values within the single block
    d = 1
    while d < elements
        # orders the writes of the previous level before the reads of this one
        @synchronize()
        index = 2 * d * (thread - 1) + 1
        # `index + d <= elements` keeps the partner element inside this group's
        # slice (checking only `index` lets a work-item read the next group's
        # data when the group size is not a power of two); the second test
        # stops at the end of `a` for a partially filled last group
        if index + d <= elements && index + d + offset <= length(a)
            index += offset
            @inbounds a[index] += a[index + d]
        end
        d *= 2
    end

    # atomic reduction of this block's value
    if thread == 1
        Atomix.@atomic b[] += a[offset + 1]
    end
end
713713
714714x = rand(Float32, 1024 , 1024 )
@@ -729,30 +729,30 @@ Recall that each block is executed on a separate SM, each equipped with the loca
729729
730730``` julia
"""
    reduce_grid_localmem(op, a::AbstractArray{T}, b) where {T}

Kernel body that reduces `a` with the binary operator `op` and accumulates the
result into `b[]`, using block-shared memory for the per-block reduction.

Each block copies its slice of `2 * blockDim().x` consecutive elements of `a`
into a statically sized shared buffer of 2048 elements, reduces the buffer in
place, and thread 1 then combines the block's value (`shared[1]`) into `b[]`
with an atomic update. `a` itself is only read.

The fixed buffer size means `2 * blockDim().x` must not exceed 2048.

NOTE(review): presumably `b` must hold a suitable initial value for `op` before
launch — confirm against the launch code.
"""
function reduce_grid_localmem(op, a::AbstractArray{T}, b) where {T}
    elements = 2 * blockDim().x
    offset = 2 * (blockIdx().x - 1) * blockDim().x
    thread = threadIdx().x

    shared = @cuStaticSharedMem(T, (2048,))

    # stage this block's slice in shared memory; each thread copies two
    # elements. Positions past the end of `a` are left unwritten — the guard in
    # the reduction loop below guarantees they are never read.
    lower = offset + thread
    upper = lower + blockDim().x
    @inbounds if lower <= length(a)
        shared[thread] = a[lower]
    end
    @inbounds if upper <= length(a)
        shared[thread + blockDim().x] = a[upper]
    end

    # parallel reduction of values within the single block
    d = 1
    while d < elements
        # the barrier at the top of the first iteration also publishes the
        # staging writes above to the whole block
        sync_threads()
        index = 2 * d * (thread - 1) + 1
        # the partner must lie inside this block's slice and correspond to a
        # real element of `a`
        if index + d <= elements && index + d + offset <= length(a)
            @inbounds shared[index] = op(shared[index], shared[index + d])
        end
        d *= 2
    end

    # atomic reduction of this block's value to the global accumulator; the
    # reduced value lives in shared memory, not in `a`. Blocks that start past
    # the end of `a` have nothing to contribute.
    if thread == 1 && offset + 1 <= length(a)
        @inbounds CUDA.@atomic b[] = op(b[], shared[1])
    end
    return
end
757757
758758x = rand(Float32, 1024 , 1024 )
@@ -769,30 +769,30 @@ sum(x)
769769
770770``` julia
# reduce_grid_localmem(a, b)
#
# Sum-reduction kernel using workgroup-local memory: each workgroup copies its
# slice of `2 * prod(@groupsize())` consecutive elements of `a` into a local
# buffer of 2048 elements (padding positions past the end of `a` with zero),
# sums the buffer in place, and the first work-item then adds the group's total
# (`shmem[1]`) to `b[]` atomically. `a` itself is only read.
#
# The fixed buffer size means `2 * prod(@groupsize())` must not exceed 2048.
#
# NOTE(review): presumably `b` is zero-initialised by the caller — confirm at
# the call site.
@kernel function reduce_grid_localmem(a, b)
    block_dim = prod(@groupsize())
    elements = 2 * block_dim
    offset = 2 * (@index(Group) - 1) * block_dim
    thread = @index(Local)

    shmem = @localmem eltype(a) 2048

    # stage two elements per work-item; `zero(eltype(a))` (rather than the
    # integer literal 0) keeps both branches of the ternary the same type
    lower = offset + thread
    upper = lower + block_dim
    @inbounds shmem[thread] = lower ≤ length(a) ? a[lower] : zero(eltype(a))
    @inbounds shmem[thread + block_dim] = upper ≤ length(a) ? a[upper] : zero(eltype(a))
    @synchronize()

    # parallel reduction of values within the single block
    d = 1
    while d < elements
        index = 2 * d * (thread - 1) + 1
        if index + d <= elements
            @inbounds shmem[index] += shmem[index + d]
        end
        d *= 2
        # make this level's partial sums visible before the next level reads them
        @synchronize()
    end

    # atomic reduction of this block's value to the global accumulator
    if thread == 1
        Atomix.@atomic b[] += shmem[1]
    end
end
797797```
798798
0 commit comments