Skip to content

Commit 5db57b3

Browse files
author
pevnak
committed
improving formatting
1 parent 38b2339 commit 5db57b3

File tree

1 file changed

+76
-72
lines changed

1 file changed

+76
-72
lines changed

docs/src/lectures/lecture_11/lecture.md

Lines changed: 76 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -307,15 +307,15 @@ bags = Mill.length2bags(n)
307307
builtin(x, bags, z) = Mill.segmented_sum_forw(x, vec(z), bags, nothing)
308308

309309
function naive(x, bags, z)
310-
o = similar(x, size(x,1), length(bags))
311-
foreach(enumerate(bags)) do (i,b)
312-
if isempty(b)
313-
o[:,i] .= z
314-
else
315-
@inbounds o[:,i] = sum(@view(x[:,b]), dims = 2)
316-
end
317-
end
318-
o
310+
o = similar(x, size(x,1), length(bags))
311+
foreach(enumerate(bags)) do (i,b)
312+
if isempty(b)
313+
o[:,i] .= z
314+
else
315+
@inbounds o[:,i] = sum(@view(x[:,b]), dims = 2)
316+
end
317+
end
318+
o
319319
end
320320

321321
builtin(x, bags, z) naive(x, bags, z)
@@ -389,39 +389,39 @@ The most trivial example of a kernel is addition as
389389
== CUDA
390390

391391
```julia
392+
using CUDA
392393
function vadd!(c, a, b, n)
393-
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
394-
if i <= n
395-
c[i] = a[i] + b[i]
396-
end
397-
return
394+
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
395+
if i <= n
396+
c[i] = a[i] + b[i]
397+
end
398+
return
398399
end
399400

400401
a = CuArray(Float32.(1:10000))
401402
b = CuArray(Float32.(2:2:20000))
402403
c = similar(a)
403404
@cuda threads=1024 blocks=cld(length(a), 1024) vadd!(c, a, b, length(a))
405+
all(@. c == a + b)
404406
```
405407

406408
== KernelAbstractions with Metal
407409

408410
```julia
409-
using Metal
410-
import KernelAbstractions as KA
411+
using Metal, KernelAbstractions
411412
@kernel function vadd!(c, a, b, n)
412-
i = @index(Global)
413-
if i n
414-
@inbounds c[i] = a[i] + b[i]
415-
end
413+
i = @index(Global)
414+
if i n
415+
@inbounds c[i] = a[i] + b[i]
416+
end
416417
end
417418

418419
a = MtlArray(Float32.(1:10000))
419420
b = MtlArray(Float32.(2:2:20000))
420421
c = similar(a)
421422

422-
backend = KA.get_backend(a)
423+
backend = KernelAbstractions.get_backend(a)
423424
vadd!(backend, 64)(c, a, b, length(a), ndrange=size(a))
424-
synchronize(backend)
425425
all(@. c == a + b)
426426
```
427427

@@ -507,12 +507,14 @@ We can use **atomic** operations to mark that the reduction operation has to be
507507
== Cuda
508508

509509
```julia
510+
using CUDA, BenchmarkTools
511+
510512
function reduce_atomic(op, a, b)
511-
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
512-
if i <= length(a)
513-
CUDA.@atomic b[] = op(b[], a[i])
514-
end
515-
return
513+
i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
514+
if i <= length(a)
515+
CUDA.@atomic b[] = op(b[], a[i])
516+
end
517+
return
516518
end
517519

518520
x = rand(Float32, 1024, 1024)
@@ -528,16 +530,16 @@ sum(x)
528530
== KernelAbstractions with Metal
529531

530532
```julia
531-
using Atomix
533+
using Atomix, Metal, KernelAbstractions, BenchmarkTools
532534

533535
@kernel function reduce_atomic(a, b)
534-
i = @index(Global)
535-
Atomix.@atomic b[] += a[i]
536+
i = @index(Global)
537+
Atomix.@atomic b[] += a[i]
536538
end
537539

538540
x = rand(Float32, 1024, 1024);
539541
cx = MtlArray(x);
540-
backend = KA.get_backend(cx);
542+
backend = KernelAbstractions.get_backend(cx);
541543
# cb = zeros(backend, Float32, 1)
542544
cb = MtlArray([0f0]);
543545
reduce_atomic(backend, 64)(+, cx, cb, ndrange=size(cx))
@@ -563,28 +565,30 @@ The parallel reduction is tricky. **Let's assume that we are allowed to overwrit
563565
== CUDA
564566

565567
```julia
568+
using CUDA, BenchmarkTools
569+
566570
function reduce_block(op, a, b)
567-
elements = 2* blockDim().x
568-
thread = threadIdx().x
571+
elements = 2* blockDim().x
572+
thread = threadIdx().x
569573

570-
# parallel reduction of values in a block
571-
d = 1
572-
while d < elements
573-
sync_threads()
574-
index = 2 * d * (thread-1) + 1
575-
@inbounds if index <= elements && index+d <= length(a)
576-
@cuprintln "thread $thread: a[$index] + a[$(index+d)] = $(a[index]) + $(a[index+d]) = $(op(a[index], a[index+d]))"
577-
a[index] = op(a[index], a[index+d])
578-
end
579-
d *= 2
580-
thread == 1 && @cuprintln()
581-
end
582-
583-
if thread == 1
584-
b[] = a[1]
574+
# parallel reduction of values in a block
575+
d = 1
576+
while d < elements
577+
sync_threads()
578+
index = 2 * d * (thread-1) + 1
579+
@inbounds if index <= elements && index+d <= length(a)
580+
@cuprintln "thread $thread: a[$index] + a[$(index+d)] = $(a[index]) + $(a[index+d]) = $(op(a[index], a[index+d]))"
581+
a[index] = op(a[index], a[index+d])
585582
end
586-
587-
return
583+
d *= 2
584+
thread == 1 && @cuprintln()
585+
end
586+
587+
if thread == 1
588+
b[] = a[1]
589+
end
590+
591+
return
588592
end
589593

590594
a = CuArray(1:16);
@@ -597,32 +601,29 @@ CUDA.@allowscalar b[]
597601
== KernelAbstractions with Metal
598602

599603
```julia
600-
using Metal, BenchmarkTools
601-
using KernelAbstractions
602-
import KernelAbstractions as KA
603-
using Atomix
604+
using Atomix, Metal, KernelAbstractions, BenchmarkTools
604605

605606

606607
@kernel function reduce_block(a, b)
607-
elements = 2 * prod(@groupsize())
608-
thread = @index(Local)
609-
610-
# parallel reduction of values in a block
611-
d = 1
612-
while d < elements
613-
index = 2 * d * (thread-1) + 1
614-
if index <= elements && index+d <= length(a)
615-
KA.@print "thread $thread: a[$index] + a[$(index+d)] = $(a[index]) + $(a[index+d]) = $(a[index] + a[index+d]))"
616-
a[index] += a[index+d]
617-
end
618-
d *= 2
619-
thread == 1 && KA.@print "\n"
620-
@synchronize
621-
end
622-
623-
if thread == 1
624-
b[] = a[1]
608+
elements = 2 * prod(@groupsize())
609+
thread = @index(Local)
610+
611+
# parallel reduction of values in a block
612+
d = 1
613+
while d < elements
614+
index = 2 * d * (thread-1) + 1
615+
if index <= elements && index+d <= length(a)
616+
KA.@print "thread $thread: a[$index] + a[$(index+d)] = $(a[index]) + $(a[index+d]) = $(a[index] + a[index+d]))"
617+
a[index] += a[index+d]
625618
end
619+
d *= 2
620+
thread == 1 && KA.@print "\n"
621+
@synchronize
622+
end
623+
624+
if thread == 1
625+
b[] = a[1]
626+
end
626627
end
627628

628629
a = MtlArray(1:16);
@@ -646,6 +647,8 @@ To extend the above for multiple blocks, we need to add reduction over blocks. T
646647
== Cuda
647648

648649
```julia
650+
using CUDA, BenchmarkTools
651+
649652
function reduce_grid_atomic(op, a, b)
650653
elements = 2*blockDim().x
651654
offset = 2*(blockIdx().x - 1) * blockDim().x
@@ -682,6 +685,7 @@ sum(x)
682685
== KernelAbstractions with Metal
683686

684687
```julia
688+
using Atomix, Metal, KernelAbstractions, BenchmarkTools
685689

686690
@kernel function reduce_grid_atomic(a, b)
687691
block_dim = prod(@groupsize())

0 commit comments

Comments
 (0)