Use GPU blit for large SharedStorage GPU→GPU copies (>32MB) #716
Conversation
Your PR requires formatting changes to meet the project's style guidelines. Suggested changes:

```diff
diff --git a/test/array.jl b/test/array.jl
index c8b45b3b..f55e7114 100644
--- a/test/array.jl
+++ b/test/array.jl
@@ -69,33 +69,33 @@ end
 end

 @testset "copyto!" begin
-    @testset "$S" for S in [Metal.PrivateStorage, Metal.SharedStorage]
-        @testset "$T" for T in [Float16, Float32, Bool, Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8]
-            dim = (1000, 17, 10)
-            A = rand(T, dim)
-            mtlA = mtl(A; storage = S)
-
-            #cpu -> gpu
-            res = Metal.zeros(T, dim; storage = S)
-            copyto!(res, A)
-            @test Array(res) == Array(A)
-
-            #gpu -> cpu
-            res = zeros(T, dim)
-            copyto!(res, mtlA)
-            @test Array(res) == Array(mtlA)
+    @testset "$S" for S in [Metal.PrivateStorage, Metal.SharedStorage]
+        @testset "$T" for T in [Float16, Float32, Bool, Int16, Int32, Int64, Int8, UInt16, UInt32, UInt64, UInt8]
+            dim = (1000, 17, 10)
+            A = rand(T, dim)
+            mtlA = mtl(A; storage = S)
+
+            #cpu -> gpu
+            res = Metal.zeros(T, dim; storage = S)
+            copyto!(res, A)
+            @test Array(res) == Array(A)
+
+            #gpu -> cpu
+            res = zeros(T, dim)
+            copyto!(res, mtlA)
+            @test Array(res) == Array(mtlA)
+
+            #gpu -> gpu
+            res = Metal.zeros(T, dim; storage = S)
+            copyto!(res, mtlA)
+            @test Array(res) == Array(mtlA)
+        end

-            #gpu -> gpu
-            res = Metal.zeros(T, dim; storage = S)
+        # Large array, only test Float32
+        A = rand(Float32, 32 * 2^20)
+        mtlA = mtl(A; storage = S)
+        res = similar(A)
         copyto!(res, mtlA)
-            @test Array(res) == Array(mtlA)
-        end
-
-        # Large array, only test Float32
-        A = rand(Float32, 32 * 2^20)
-        mtlA = mtl(A; storage = S)
-        res = similar(A)
-        copyto!(res, mtlA)
         @test Array(res) == Array(mtlA)
     end
 end
```
Force-pushed from 79ebfd3 to 2c16d2e
This is interesting! Would you mind sharing the script you used to benchmark so we can ask people with various devices to test the effect of this change? Ideally it'd be something anyone with an Apple-silicon Mac can copy-paste into a Julia REPL.
Codecov Report

❌ Patch coverage is

```
@@            Coverage Diff             @@
##             main     #716      +/-   ##
==========================================
- Coverage   81.26%   81.19%   -0.08%
==========================================
  Files          62       62
  Lines        2899     2904       +5
==========================================
+ Hits         2356     2358       +2
- Misses        543      546       +3
```
Metal Benchmarks
| Benchmark suite | Current: 16a0986 | Previous: 1c1115e | Ratio |
|---|---|---|---|
| latency/precompile | 25140585125 ns | 24830600583 ns | 1.01 |
| latency/ttfp | 2304812042 ns | 2266835375 ns | 1.02 |
| latency/import | 1465330542 ns | 1435733583 ns | 1.02 |
| integration/metaldevrt | 841312.5 ns | 844458 ns | 1.00 |
| integration/byval/slices=1 | 1585375 ns | 1559000 ns | 1.02 |
| integration/byval/slices=3 | 19245250 ns | 8227271 ns | 2.34 |
| integration/byval/reference | 1581875 ns | 1552167 ns | 1.02 |
| integration/byval/slices=2 | 2718208 ns | 2584417 ns | 1.05 |
| kernel/indexing | 495333 ns | 592729 ns | 0.84 |
| kernel/indexing_checked | 487583 ns | 615500 ns | 0.79 |
| kernel/launch | 11416.5 ns | 11750 ns | 0.97 |
| kernel/rand | 517791 ns | 554042 ns | 0.93 |
| array/construct | 6167 ns | 6167 ns | 1 |
| array/broadcast | 553854 ns | 592958 ns | 0.93 |
| array/random/randn/Float32 | 921416.5 ns | 795792 ns | 1.16 |
| array/random/randn!/Float32 | 583375 ns | 614917 ns | 0.95 |
| array/random/rand!/Int64 | 540209 ns | 550625 ns | 0.98 |
| array/random/rand!/Float32 | 549291.5 ns | 581916 ns | 0.94 |
| array/random/rand/Int64 | 879729 ns | 751542 ns | 1.17 |
| array/random/rand/Float32 | 837042 ns | 672583 ns | 1.24 |
| array/accumulate/Int64/1d | 1304917 ns | 1252916.5 ns | 1.04 |
| array/accumulate/Int64/dims=1 | 1859208.5 ns | 1802021 ns | 1.03 |
| array/accumulate/Int64/dims=2 | 2207812.5 ns | 2126375 ns | 1.04 |
| array/accumulate/Int64/dims=1L | 12272542 ns | 11668167 ns | 1.05 |
| array/accumulate/Int64/dims=2L | 9717208 ns | 9627062.5 ns | 1.01 |
| array/accumulate/Float32/1d | 1080292 ns | 1118979 ns | 0.97 |
| array/accumulate/Float32/dims=1 | 1601416 ns | 1522875 ns | 1.05 |
| array/accumulate/Float32/dims=2 | 1964917 ns | 1825895.5 ns | 1.08 |
| array/accumulate/Float32/dims=1L | 10341500 ns | 10050958 ns | 1.03 |
| array/accumulate/Float32/dims=2L | 7416666.5 ns | 7212875 ns | 1.03 |
| array/reductions/reduce/Int64/1d | 1301187 ns | 1530166 ns | 0.85 |
| array/reductions/reduce/Int64/dims=1 | 1119208.5 ns | 1075500 ns | 1.04 |
| array/reductions/reduce/Int64/dims=2 | 1160792 ns | 1170250 ns | 0.99 |
| array/reductions/reduce/Int64/dims=1L | 2034958.5 ns | 2007666.5 ns | 1.01 |
| array/reductions/reduce/Int64/dims=2L | 3929458 ns | 4172166 ns | 0.94 |
| array/reductions/reduce/Float32/1d | 737458 ns | 1034979 ns | 0.71 |
| array/reductions/reduce/Float32/dims=1 | 804667 ns | 807791 ns | 1.00 |
| array/reductions/reduce/Float32/dims=2 | 832750 ns | 841792 ns | 0.99 |
| array/reductions/reduce/Float32/dims=1L | 1336000 ns | 1306334 ns | 1.02 |
| array/reductions/reduce/Float32/dims=2L | 1812021 ns | 1780833 ns | 1.02 |
| array/reductions/mapreduce/Int64/1d | 1328937.5 ns | 1549333 ns | 0.86 |
| array/reductions/mapreduce/Int64/dims=1 | 1113000 ns | 1103709 ns | 1.01 |
| array/reductions/mapreduce/Int64/dims=2 | 1160959 ns | 1335541 ns | 0.87 |
| array/reductions/mapreduce/Int64/dims=1L | 2008667 ns | 2016417 ns | 1.00 |
| array/reductions/mapreduce/Int64/dims=2L | 3593334 ns | 3600500 ns | 1.00 |
| array/reductions/mapreduce/Float32/1d | 802709 ns | 1006834 ns | 0.80 |
| array/reductions/mapreduce/Float32/dims=1 | 812312.5 ns | 817916.5 ns | 0.99 |
| array/reductions/mapreduce/Float32/dims=2 | 831542 ns | 848917 ns | 0.98 |
| array/reductions/mapreduce/Float32/dims=1L | 1351541.5 ns | 1296750 ns | 1.04 |
| array/reductions/mapreduce/Float32/dims=2L | 1794667 ns | 1795584 ns | 1.00 |
| array/private/copyto!/gpu_to_gpu | 549208 ns | 638708 ns | 0.86 |
| array/private/copyto!/cpu_to_gpu | 753542 ns | 790458.5 ns | 0.95 |
| array/private/copyto!/gpu_to_cpu | 687749.5 ns | 804083 ns | 0.86 |
| array/private/iteration/findall/int | 1577625 ns | 1618458.5 ns | 0.97 |
| array/private/iteration/findall/bool | 1488854 ns | 1439479.5 ns | 1.03 |
| array/private/iteration/findfirst/int | 2101041 ns | 2030729 ns | 1.03 |
| array/private/iteration/findfirst/bool | 2030541 ns | 2006541 ns | 1.01 |
| array/private/iteration/scalar | 3399374.5 ns | 4026208.5 ns | 0.84 |
| array/private/iteration/logical | 2678166.5 ns | 2529542 ns | 1.06 |
| array/private/iteration/findmin/1d | 2271583 ns | 2190979.5 ns | 1.04 |
| array/private/iteration/findmin/2d | 1552417 ns | 1497562.5 ns | 1.04 |
| array/private/copy | 819750 ns | 588562.5 ns | 1.39 |
| array/shared/copyto!/gpu_to_gpu | 84417 ns | 83375 ns | 1.01 |
| array/shared/copyto!/cpu_to_gpu | 83833 ns | 81917 ns | 1.02 |
| array/shared/copyto!/gpu_to_cpu | 83916 ns | 82292 ns | 1.02 |
| array/shared/iteration/findall/int | 1571000 ns | 1585625 ns | 0.99 |
| array/shared/iteration/findall/bool | 1488208.5 ns | 1438375 ns | 1.03 |
| array/shared/iteration/findfirst/int | 1707437.5 ns | 1653917 ns | 1.03 |
| array/shared/iteration/findfirst/bool | 1640229.5 ns | 1613416 ns | 1.02 |
| array/shared/iteration/scalar | 207125 ns | 201042 ns | 1.03 |
| array/shared/iteration/logical | 2415812.5 ns | 2304458.5 ns | 1.05 |
| array/shared/iteration/findmin/1d | 1897000 ns | 1799125 ns | 1.05 |
| array/shared/iteration/findmin/2d | 1556958 ns | 1499000 ns | 1.04 |
| array/shared/copy | 216750 ns | 243292 ns | 0.89 |
| array/permutedims/4d | 2480041 ns | 2349854 ns | 1.06 |
| array/permutedims/2d | 1191125 ns | 1132417 ns | 1.05 |
| array/permutedims/3d | 1768833.5 ns | 1647166 ns | 1.07 |
| metal/synchronization/stream | 19292 ns | 18833 ns | 1.02 |
| metal/synchronization/context | 20292 ns | 19834 ns | 1.02 |
This comment was automatically generated by workflow using github-action-benchmark.
Copy-paste benchmark script and results table added to the PR description above.
I asked on Slack and the results seem to show that 32MiB is a reasonable threshold. I also added a test.

Copy-paste script (to be run from a version of Metal.jl that doesn't include this optimization):

```julia
using Printf, Metal, Random; begin
println("Device: $(Metal.device().name) ($(Metal.num_gpu_cores()) cores)")
println("Testing SharedStorage GPU→GPU copyto! performance\n")
sizes_mb = [16, 32, 64, 128, 256, 512, 1024, 2048]
if Sys.total_memory() >= 16*2^30
push!(sizes_mb, 4096)
end
if Sys.total_memory() >= 32*2^30
push!(sizes_mb, 8192)
end
println("| Size | CPU Bandwidth | GPU Bandwidth |")
println("| (MB) | (GB/s) | (GB/s) |")
println("|------|---------------|---------------|")
for size_mb in sizes_mb
n = size_mb * 1024^2 ÷ sizeof(Float32)
src = rand!(MtlArray{Float32, 1, Metal.SharedStorage}(undef, n))
dst = MtlArray{Float32, 1, Metal.SharedStorage}(undef, n)
Metal.synchronize()
# Warmup
for _ in 1:3
copyto!(dst, src)
Metal.synchronize()
end
# Benchmark (10 iterations)
cpu_times = Float64[]
for _ in 1:10
Metal.synchronize()
t = @elapsed begin
copyto!(dst, src)
Metal.synchronize()
end
GC.gc(false)
push!(cpu_times, t)
end
cpu_time_ms = minimum(cpu_times) * 1000
bytes = n * sizeof(Float32) * 2 # read + write
cpu_bandwidth = bytes / minimum(cpu_times) / 1e9
src = dst = nothing
GC.gc(true)
src = rand!(MtlArray{Float32, 1, Metal.PrivateStorage}(undef, n))
dst = MtlArray{Float32, 1, Metal.PrivateStorage}(undef, n)
Metal.synchronize()
# Warmup
for _ in 1:3
copyto!(dst, src)
Metal.synchronize()
end
# Benchmark (10 iterations)
gpu_times = Float64[]
for _ in 1:10
Metal.synchronize()
t = @elapsed begin
copyto!(dst, src)
Metal.synchronize()
end
GC.gc(false)
push!(gpu_times, t)
end
gpu_time_ms = minimum(gpu_times) * 1000
gpu_bandwidth = bytes / minimum(gpu_times) / 1e9
src = dst = nothing
GC.gc(true)
@printf "| %4d | %13.1f | %13.1f |\n" size_mb cpu_bandwidth gpu_bandwidth
end
end
```
Force-pushed from 43e4269 to 7559e52
christiangnrd left a comment:

I think this is good to go! @maleadt do you think this is a reasonable heuristic?
maleadt left a comment:

LGTM (except for the "Factored out for testing" comment that I presume still needs to be updated).
Force-pushed from 7559e52 to 2b09aee
SharedStorage GPU->GPU copies previously always used CPU memcpy, which is slower than GPU blit for large buffers. This commit adds a size-based heuristic:

- Small copies (<=32MB): use CPU memcpy (avoids GPU command overhead)
- Large copies (>32MB): use GPU blit (2-3x faster bandwidth)

Benchmark results:

- 64 MB: 2.2x faster (1.28ms -> 0.58ms)
- 256 MB: 3.1x faster (5.03ms -> 1.63ms)
- 1024 MB: 3.4x faster (20.2ms -> 6.0ms)

The 32MB threshold was determined empirically as the crossover point where GPU blit becomes faster than CPU memcpy.
Force-pushed from 2b09aee to 16a0986
SharedStorage GPU→GPU `copyto!` currently always uses CPU memcpy, which becomes a bottleneck for large buffers. This PR adds a size-based heuristic that uses GPU blit for copies larger than 32MB, achieving up to 3.6x speedup.

Problem
PR #445 introduced CPU memcpy for SharedStorage copies to avoid ObjectiveC.jl overhead. This is beneficial for small copies where the overhead dominates, but causes significant performance regression for large copies where GPU blit would be much faster due to its higher memory bandwidth.
Solution

Use a size-based threshold (32MB) to choose the optimal path:

- copies at or below 32MB keep using CPU memcpy, avoiding GPU command overhead
- copies above 32MB use a GPU blit, exploiting its higher memory bandwidth

A minimal sketch of this dispatch is shown below.
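The following is a minimal, self-contained sketch of the dispatch shape described above, not Metal.jl's actual implementation: `heuristic_copy!`, `cpu_memcpy!`, and `gpu_blit!` are hypothetical names, and both stand-ins delegate to `Base.copyto!` so the snippet runs anywhere.

```julia
# Hypothetical sketch of the size-based dispatch; not Metal.jl's real code.
const BLIT_THRESHOLD = 32 * 2^20  # 32 MiB, the heuristic's crossover point

# Stand-ins for the two real copy paths. In Metal.jl the small-copy path is a
# host-side memcpy on the shared (host-visible) buffers and the large-copy
# path encodes a blit command; here both just call Base.copyto!.
cpu_memcpy!(dst, src) = copyto!(dst, src)
gpu_blit!(dst, src) = copyto!(dst, src)

function heuristic_copy!(dst::AbstractVector{T}, src::AbstractVector{T}) where {T}
    nbytes = length(src) * sizeof(T)
    if nbytes <= BLIT_THRESHOLD
        cpu_memcpy!(dst, src)  # small copy: avoid GPU command overhead
    else
        gpu_blit!(dst, src)    # large copy: exploit GPU memory bandwidth
    end
    return dst
end

heuristic_copy!(zeros(Float32, 1024), rand(Float32, 1024))  # takes the memcpy path
```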
Why 32MB?

The threshold is based on amortizing the fixed overhead against the bandwidth differential: using the crossover formula `threshold = overhead / (1/cpu_bw - 1/gpu_bw)`, the theoretical crossover is ~17MB. We use 32MB as a conservative margin to ensure no regression at the boundary. A back-of-the-envelope check follows.
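As a sanity check of that formula, the snippet below plugs in the M2 Max bandwidths quoted later in this description (~54 GB/s CPU memcpy, ~182 GB/s GPU blit); the ~230 µs fixed overhead is an assumed value chosen to be consistent with the stated ~17MB crossover, not a number measured in this PR.

```julia
# Back-of-the-envelope crossover estimate (assumed overhead, see above).
cpu_bw   = 54e9    # bytes/s: CPU memcpy plateau on main
gpu_bw   = 182e9   # bytes/s: GPU blit bandwidth on this PR
overhead = 230e-6  # s: assumed fixed cost of encoding/committing a blit

# A copy of s bytes costs s/cpu_bw on the CPU path and
# overhead + s/gpu_bw on the GPU path; the two break even at:
threshold = overhead / (1/cpu_bw - 1/gpu_bw)
println(round(threshold / 2^20; digits = 1), " MiB")  # prints ≈ 16.8 MiB
```

Doubling that estimate to 32MB keeps the memcpy path in use everywhere it is clearly faster.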
Benchmark Results

GPU→GPU SharedStorage (improved by this change):
Measured on M2 Max.
Community Benchmark Script
Copy-paste into a Julia REPL to test on your device (the script is the one posted in the conversation above):
Results on M2 Max (main vs PR):
The `main` branch plateaus at ~54 GB/s due to CPU memcpy, while the PR achieves ~180-185 GB/s via GPU blit for large copies.

The 32MB threshold can be adjusted if community testing on other devices suggests a different value.
Key Points