Skip to content

Commit d209e80

Browse files
author
pevnak
committed
improving formatting
1 parent 5db57b3 commit d209e80

File tree

1 file changed

+86
-86
lines changed

1 file changed

+86
-86
lines changed

docs/src/lectures/lecture_11/lecture.md

Lines changed: 86 additions & 86 deletions
Original file line numberDiff line numberDiff line change
@@ -650,28 +650,28 @@ To extend the above for multiple blocks, we need to add reduction over blocks. T
650650
using CUDA, BenchmarkTools
651651

652652
# Grid-wide reduction: each block tree-reduces its own 2*blockDim().x-element
# slice of `a` in place, then thread 1 of every block folds the block's partial
# result into the scalar accumulator `b[]` with an atomic update.
# NOTE(review): assumes `op` is associative and blockDim().x is a power of two
# (otherwise `index+d` could cross into the next block's slice) — the lecture
# launches it with power-of-two block sizes; confirm before reuse.
function reduce_grid_atomic(op, a, b)
    # number of elements handled by this block (two per thread)
    elements = 2*blockDim().x
    # start of this block's slice within `a` (0-based offset)
    offset = 2*(blockIdx().x - 1) * blockDim().x
    thread = threadIdx().x

    # parallel reduction of values within the single block
    d = 1
    while d < elements
        sync_threads()   # writes of the previous round must be visible block-wide
        index = 2 * d * (thread-1) + 1
        # guard both the intra-block index and the global bound (ragged tail of `a`)
        @inbounds if index <= elements && index+d+offset <= length(a)
            index += offset
            a[index] = op(a[index], a[index+d])
        end
        d *= 2
    end

    # atomic reduction of this block's value: after the loop the block's result
    # sits in the first element of its slice, a[offset + 1]
    if thread == 1
        CUDA.@atomic b[] = op(b[], a[offset + 1])
    end

    return
end
676676

677677
x = rand(Float32, 1024, 1024)
@@ -688,27 +688,27 @@ sum(x)
688688
using Atomix, Metal, KernelAbstractions, BenchmarkTools
689689

690690
# KernelAbstractions port of `reduce_grid_atomic`, hard-coded to `+`: each
# workgroup tree-reduces its 2*groupsize slice of `a` in place, then its first
# work-item atomically adds the partial sum into the scalar `b[]` via Atomix.
@kernel function reduce_grid_atomic(a, b)
    block_dim = prod(@groupsize())
    elements = 2*block_dim                        # elements per workgroup (two per item)
    offset = 2*(@index(Group) - 1) * block_dim    # start of this group's slice in `a`
    thread = @index(Local)

    # parallel reduction of values within the single block
    d = 1
    while d < elements
        @synchronize()   # make the previous round's writes visible group-wide
        index = 2 * d * (thread-1) + 1
        # guard both the intra-group index and the global bound (ragged tail of `a`)
        if index <= elements && index+d+offset <= length(a)
            index += offset
            @inbounds a[index] += a[index+d]
        end
        d *= 2
    end

    # atomic reduction of this block's value: the group's sum ended up in the
    # first element of its slice, a[offset + 1]
    if thread == 1
        Atomix.@atomic b[] += a[offset + 1]
    end
end
713713

714714
x = rand(Float32, 1024, 1024)
@@ -729,30 +729,30 @@ Recall that each block is executed on a separate SM, each equipped with the loca
729729

730730
```julia
731731
# Grid-wide reduction using per-block shared (local) memory: each block copies
# its 2*blockDim().x-element slice of `a` into on-chip shared memory, tree-reduces
# it there (avoiding repeated global-memory traffic), and thread 1 folds the
# block's partial result into the scalar accumulator `b[]` atomically.
#
# Requirements (unchecked): blockDim().x <= 1024, since the static shared buffer
# holds 2048 elements, and the staging loads below are unguarded — presumably the
# launch covers `a` exactly (the lecture uses 1024x1024 inputs); TODO confirm.
# NOTE(review): assumes `op` is associative and blockDim().x is a power of two.
function reduce_grid_localmem(op, a::AbstractArray{T}, b) where {T}
    elements = 2*blockDim().x
    offset = 2*(blockIdx().x - 1) * blockDim().x
    thread = threadIdx().x

    # stage this block's slice of `a` in shared memory (two elements per thread)
    shared = @cuStaticSharedMem(T, (2048,))
    @inbounds shared[thread] = a[offset+thread]
    @inbounds shared[thread+blockDim().x] = a[offset+thread+blockDim().x]

    # parallel reduction of values within the single block; the sync at the top
    # of the first iteration also covers the staging writes above
    d = 1
    while d < elements
        sync_threads()
        index = 2 * d * (thread-1) + 1
        if index <= elements && index+d+offset <= length(a)
            @inbounds shared[index] = op(shared[index], shared[index+d])
        end
        d *= 2
    end

    # atomic reduction of this block's value to the global accumulator.
    # BUG FIX: the original read `a[offset + 1]` — the *unreduced* first input
    # element, since this kernel never writes back to `a` — instead of the
    # block's result, which lives in shared[1] (written by thread 1 itself in
    # the final round, so no extra sync is needed). The KernelAbstractions
    # version of this kernel already reads shmem[1] correctly.
    if thread == 1
        CUDA.@atomic b[] = op(b[], shared[1])
    end
    return
end
757757

758758
x = rand(Float32, 1024, 1024)
@@ -769,30 +769,30 @@ sum(x)
769769

770770
```julia
771771
# KernelAbstractions port of `reduce_grid_localmem`, hard-coded to `+`: each
# workgroup stages its 2*groupsize slice of `a` in local (shared) memory with
# zero-padding for the tail, tree-reduces it there, and the first work-item
# atomically adds the group's partial sum into the scalar `b[]`.
# Requires prod(@groupsize()) <= 1024 (the local buffer holds 2048 elements).
@kernel function reduce_grid_localmem(a, b)
    block_dim = prod(@groupsize())
    elements = 2*block_dim
    offset = 2*(@index(Group) - 1) * block_dim
    thread = @index(Local)

    # Stage this group's slice in local memory; out-of-range lanes contribute
    # the additive identity so the ragged tail of `a` is summed correctly.
    # FIX: restored the comparison operators that were garbled away (the source
    # read `offset+thread length(a) ?`, which does not parse — the Unicode `≤`
    # was lost); also use zero(eltype(a)) instead of the literal 0 so the
    # padding stays type-generic.
    shmem = @localmem eltype(a) 2048
    @inbounds shmem[thread] = offset+thread <= length(a) ? a[offset+thread] : zero(eltype(a))
    @inbounds shmem[thread+block_dim] = offset+thread+block_dim <= length(a) ? a[offset+thread+block_dim] : zero(eltype(a))
    @synchronize()

    # parallel reduction of values within the single block
    d = 1
    while d < elements
        index = 2 * d * (thread-1) + 1
        if index + d <= elements
            @inbounds shmem[index] += shmem[index+d]
        end
        d *= 2
        @synchronize()   # publish this round's writes before the next round reads
    end

    # atomic reduction of this block's value to the global accumulator
    if thread == 1
        Atomix.@atomic b[] += shmem[1]
    end
end
797797
```
798798

0 commit comments

Comments
 (0)