@@ -18,21 +18,13 @@ function Base.summary(io::IO, ::CUDADevice)
1818 return " $name ($uuid )"
1919end
2020
# A CUDADevice is functional exactly when CUDA.functional() reports true.
function ClimaComms.device_functional(::CUDADevice)
    return CUDA.functional()
end
2422
# Adapt a communications context to a CuArray type: the context's device is
# adapted first, and a context is then built from the adapted device.
function Adapt.adapt_structure(
    to::Type{<:CUDA.CuArray},
    ctx::ClimaComms.AbstractCommsContext,
)
    adapted_device = Adapt.adapt(to, ClimaComms.device(ctx))
    return ClimaComms.context(adapted_device)
end
3125
# Adapting any device to a CuArray type yields a CUDADevice; the incoming
# device itself is not inspected.
function Adapt.adapt_structure(
    ::Type{<:CUDA.CuArray},
    device::ClimaComms.AbstractDevice,
)
    return ClimaComms.CUDADevice()
end
3628
# The array type associated with a CUDADevice is CUDA.CuArray.
function ClimaComms.array_type(::CUDADevice)
    return CUDA.CuArray
end
# Forwards to CUDA.free_memory() for a CUDADevice.
function ClimaComms.free_memory(::CUDADevice)
    return CUDA.free_memory()
end
@@ -56,57 +48,44 @@ ClimaComms.assert(::CUDADevice, cond::C, text::T) where {C, T} =
# Product of the x-dimensions of the block size and the grid size.
function threads_in_kernel()
    threads_per_block = CUDA.blockDim().x
    return threads_per_block * CUDA.gridDim().x
end
5749
# The index of the calling thread, which is between 1 and threads_in_kernel().
function thread_index()
    # Blocks are numbered from 1, so this many blocks come before the caller's.
    preceding_blocks = CUDA.blockIdx().x - 1
    return preceding_blocks * CUDA.blockDim().x + CUDA.threadIdx().x
end
6152
# The maximum number of blocks that can fit on the GPU used for this kernel.
function grid_size_limit(kernel)
    # Look up the device through the context of the kernel's module.
    kernel_device = CUDA.device(kernel.fun.mod.ctx)
    return CUDA.attribute(kernel_device, CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_X)
end
6757
# Either the first value if it is available, or the maximum number of threads
# that can fit in one block of this kernel (cuOccupancyMaxPotentialBlockSize).
# With enough blocks, the latter value will maximize the occupancy of the GPU.
function block_size_limit(max_threads_in_block::Int, _)
    # An explicit limit is used as given; the kernel is ignored.
    return max_threads_in_block
end
function block_size_limit(::Val{:auto}, kernel)
    config = CUDA.launch_configuration(kernel.fun)
    return config.threads
end
7463
# Apply `f` to the items of `itr` on the GPU. Nothing is launched when `itr`
# is empty. In the launch performed here, each thread handles at most one
# item; `block_size` is forwarded to `block_size_limit` to pick the maximum
# number of threads per block.
function ClimaComms.run_threaded(
    f::F,
    ::CUDADevice,
    ::Val,
    itr;
    block_size,
) where {F}
    n_items = length(itr)
    if n_items <= 0
        return nothing
    end

    # Kernel body: threads whose index exceeds the item count do nothing.
    function call_f_from_thread()
        item_index = thread_index()
        if item_index <= n_items
            @inbounds f(itr[firstindex(itr) + item_index - 1])
        end
        return nothing
    end
    # Build the kernel without launching it so its limits can be queried.
    kernel = CUDA.@cuda always_inline=true launch=false call_f_from_thread()
    max_blocks = grid_size_limit(kernel)
    max_threads_in_block = block_size_limit(block_size, kernel)

    params = ClimaComms._compute_launch_params_simple(
        n_items,
        max_blocks,
        max_threads_in_block,
    )
    # If there are too many items, coarsen by the smallest possible amount.
    # NOTE(review): this relies on the helper returning `nothing` in that
    # case; confirm against its definition.
    if isnothing(params)
        return ClimaComms.run_threaded(f, CUDADevice(), 1, itr; block_size)
    end

    return kernel(; blocks = params.blocks, threads = params.threads_in_block)
end
10386
10487function ClimaComms. run_threaded (
105- f:: F ,
106- :: CUDADevice ,
107- min_items_in_thread:: Int ,
108- itr;
109- block_size,
88+ f:: F , :: CUDADevice , min_items_in_thread:: Int , itr; block_size,
11089) where {F}
11190 min_items_in_thread > 0 || throw (ArgumentError (" `coarsen` is not positive" ))
11291 n_items = length (itr)
@@ -122,16 +101,10 @@ function ClimaComms.run_threaded(
122101 max_blocks = grid_size_limit (kernel)
123102 max_threads_in_block = block_size_limit (block_size, kernel)
124103
125- # If there are too many items to use the specified coarsening, increase it
126- # by the smallest possible amount.
127- max_required_threads = cld (n_items, min_items_in_thread)
128- items_in_thread =
129- max_required_threads <= max_blocks * max_threads_in_block ?
130- min_items_in_thread : cld (n_items, max_blocks * max_threads_in_block)
131-
132- threads_in_block = min (max_threads_in_block, max_required_threads)
133- blocks = cld (n_items, items_in_thread * threads_in_block)
134- kernel (; blocks, threads = threads_in_block)
104+ params = ClimaComms. _compute_launch_params_coarsened (
105+ n_items, max_blocks, max_threads_in_block, min_items_in_thread,
106+ )
107+ kernel (; params. blocks, threads = params. threads_in_block)
135108end
136109
137110end
0 commit comments