Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .buildkite/pipeline.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ steps:
key: "initialize"
command:
- echo "--- Instantiate project"
- julia --project=test -e 'using Pkg; Pkg.develop(;path="."); Pkg.add("CUDA"); Pkg.add("MPI"); Pkg.instantiate(;verbose=true); Pkg.precompile(;strict=true)'
- julia --project=test -e 'using Pkg; Pkg.develop(;path="."); Pkg.add(["CUDA", "MPI"]); Pkg.instantiate(;verbose=true); Pkg.precompile(;strict=true)'
# force the initialization of the CUDA runtime as it is lazily loaded by default
- "julia --project=test -e 'using CUDA; CUDA.precompile_runtime()'"
- "julia --project=test -e 'using Pkg; Pkg.status()'"
Expand Down
5 changes: 5 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,12 @@ ClimaComms.jl Release Notes

main
-------

v0.6.11
-------
- ci: update JuliaFormatter job [PR 127](https://github.com/CliMA/ClimaComms.jl/pull/127)
- Added Metal support [PR 126](https://github.com/CliMA/ClimaComms.jl/pull/126)
- NOTE: This is considered experimental as it is not continuously tested in CI.

v0.6.10
-------
Expand Down
7 changes: 5 additions & 2 deletions Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "ClimaComms"
uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d"
authors = ["Kiran Pamnany <clima-software@caltech.edu>", "Simon Byrne <simonbyrne@caltech.edu>", "Charles Kawczynski <charliek@caltech.edu>", "Sriharsha Kandala <Sriharsha.kvs@gmail.com>", "Jake Bolewski <clima-software@caltech.edu>", "Gabriele Bozzola <gbozzola@caltech.edu>"]
version = "0.6.10"
authors = ["Kiran Pamnany <clima-software@caltech.edu>", "Simon Byrne <simonbyrne@caltech.edu>", "Charles Kawczynski <charliek@caltech.edu>", "Sriharsha Kandala <Sriharsha.kvs@gmail.com>", "Jake Bolewski <clima-software@caltech.edu>", "Gabriele Bozzola <gbozzola@caltech.edu>", "Haakon Ludvig Langeland Ervik <45243236+haakon-e@users.noreply.github.com>"]
version = "0.6.11"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
Expand All @@ -11,15 +11,18 @@ LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36"
[weakdeps]
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"

[extensions]
ClimaCommsCUDAExt = "CUDA"
ClimaCommsMPIExt = "MPI"
ClimaCommsMetalExt = "Metal"

[compat]
CUDA = "3, 4, 5"
Adapt = "3, 4"
Logging = "1.9.4"
LoggingExtras = "1.1.0"
MPI = "0.20.18"
Metal = "1"
julia = "1.9"
58 changes: 19 additions & 39 deletions ext/ClimaCommsCUDAExt.jl
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The updates in this function are mostly formatting changes, with a few exceptions I'll outline below

Original file line number Diff line number Diff line change
Expand Up @@ -18,21 +18,15 @@ function Base.summary(io::IO, ::CUDADevice)
return "$name ($uuid)"
end

function ClimaComms.device_functional(::CUDADevice)
return CUDA.functional()
end

function Adapt.adapt_structure(
to::Type{<:CUDA.CuArray},
ctx::ClimaComms.AbstractCommsContext,
)
return ClimaComms.context(Adapt.adapt(to, ClimaComms.device(ctx)))
end
ClimaComms.device_functional(::CUDADevice) = CUDA.functional()

Adapt.adapt_structure(
::Type{<:CUDA.CuArray},
device::ClimaComms.AbstractDevice,
) = ClimaComms.CUDADevice()
to::Type{<:CUDA.CuArray}, ctx::ClimaComms.AbstractCommsContext,
) =
ClimaComms.context(Adapt.adapt(to, ClimaComms.device(ctx)))

Adapt.adapt_structure(::Type{<:CUDA.CuArray}, ::ClimaComms.AbstractDevice) =
ClimaComms.CUDADevice()

ClimaComms.array_type(::CUDADevice) = CUDA.CuArray
ClimaComms.free_memory(::CUDADevice) = CUDA.free_memory()
Expand Down Expand Up @@ -61,8 +55,7 @@ thread_index() =

# The maximum number of blocks that can fit on the GPU used for this kernel.
grid_size_limit(kernel) = CUDA.attribute(
CUDA.device(kernel.fun.mod.ctx),
CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
CUDA.device(kernel.fun.mod.ctx), CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
)

# Either the first value if it is available, or the maximum number of threads
Expand All @@ -73,11 +66,7 @@ block_size_limit(::Val{:auto}, kernel) =
CUDA.launch_configuration(kernel.fun).threads

function ClimaComms.run_threaded(
f::F,
::CUDADevice,
::Val,
itr;
block_size,
f::F, ::CUDADevice, ::Val, itr; block_size,
) where {F}
n_items = length(itr)
n_items > 0 || return nothing
Expand All @@ -92,21 +81,18 @@ function ClimaComms.run_threaded(
max_blocks = grid_size_limit(kernel)
max_threads_in_block = block_size_limit(block_size, kernel)

params = ClimaComms._compute_launch_params_simple(
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Defined _compute_launch_params_simple in src/devices.jl for re-use in the metal extension. Should be equivalent to the existing code

n_items, max_blocks, max_threads_in_block,
)
# If there are too many items, coarsen by the smallest possible amount.
n_items <= max_blocks * max_threads_in_block ||
isnothing(params) &&
return ClimaComms.run_threaded(f, CUDADevice(), 1, itr; block_size)

threads_in_block = min(max_threads_in_block, n_items)
blocks = cld(n_items, threads_in_block)
kernel(; blocks, threads = threads_in_block)
kernel(; params.blocks, threads = params.threads_in_block)
end

function ClimaComms.run_threaded(
f::F,
::CUDADevice,
min_items_in_thread::Int,
itr;
block_size,
f::F, ::CUDADevice, min_items_in_thread::Int, itr; block_size,
) where {F}
min_items_in_thread > 0 || throw(ArgumentError("`coarsen` is not positive"))
n_items = length(itr)
Expand All @@ -122,16 +108,10 @@ function ClimaComms.run_threaded(
max_blocks = grid_size_limit(kernel)
max_threads_in_block = block_size_limit(block_size, kernel)

# If there are too many items to use the specified coarsening, increase it
# by the smallest possible amount.
max_required_threads = cld(n_items, min_items_in_thread)
items_in_thread =
max_required_threads <= max_blocks * max_threads_in_block ?
min_items_in_thread : cld(n_items, max_blocks * max_threads_in_block)

threads_in_block = min(max_threads_in_block, max_required_threads)
blocks = cld(n_items, items_in_thread * threads_in_block)
kernel(; blocks, threads = threads_in_block)
params = ClimaComms._compute_launch_params_coarsened(
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Defined _compute_launch_params_coarsened in src/devices.jl for re-use in the metal extension.

n_items, max_blocks, max_threads_in_block, min_items_in_thread,
)
kernel(; params.blocks, threads = params.threads_in_block)
end

end
116 changes: 116 additions & 0 deletions ext/ClimaCommsMetalExt.jl
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If you open this file next to the ext/ClimaCommsCUDAExt.jl file, they should look pretty identical apart from renaming CUDA to Metal.

Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
module ClimaCommsMetalExt

import Metal

import Adapt
import ClimaComms
import ClimaComms: MetalDevice

# Device assignment is handled internally by Metal, so there is nothing to do.
ClimaComms._assign_device(::MetalDevice, rank_number) = nothing

function Base.summary(io::IO, ::MetalDevice)
    # Name of the default Metal GPU, tagged with the backend name.
    return "$(Metal.device().name) (Metal)"
end

# A Metal device is considered usable whenever at least one GPU is visible.
ClimaComms.device_functional(::MetalDevice) = !isempty(Metal.devices())

function Adapt.adapt_structure(
    to::Type{<:Metal.MtlArray},
    ctx::ClimaComms.AbstractCommsContext,
)
    return ClimaComms.context(Adapt.adapt(to, ClimaComms.device(ctx)))
end

function Adapt.adapt_structure(
    ::Type{<:Metal.MtlArray},
    ::ClimaComms.AbstractDevice,
)
    return ClimaComms.MetalDevice()
end

ClimaComms.array_type(::MetalDevice) = Metal.MtlArray
# NOTE(review): `currentAllocatedSize` is the memory currently allocated on
# the device, which looks like "used" rather than "free" memory — confirm
# this matches the `free_memory` contract used by the CUDA extension.
ClimaComms.free_memory(::MetalDevice) = Metal.device().currentAllocatedSize
ClimaComms.total_memory(::MetalDevice) = Metal.device().maxBufferLength
ClimaComms.allowscalar(f, ::MetalDevice, args...; kwargs...) =
    Metal.@allowscalar f(args...; kwargs...)

# The following ClimaComms entry points wrap macros, so they must forward to
# the Metal macros explicitly (macro expansion cannot use dispatch).
ClimaComms.sync(f::F, ::MetalDevice, args...; kwargs...) where {F} =
    Metal.@sync f(args...; kwargs...)
# TODO: Rename to `device_sync` to unify `Metal` and `CUDA`
ClimaComms.cuda_sync(f::F, ::MetalDevice, args...; kwargs...) where {F} =
    Metal.@sync f(args...; kwargs...)
ClimaComms.time(f::F, ::MetalDevice, args...; kwargs...) where {F} =
    Metal.@time f(args...; kwargs...)
ClimaComms.elapsed(f::F, ::MetalDevice, args...; kwargs...) where {F} =
    Metal.@elapsed f(args...; kwargs...)
function ClimaComms.assert(::MetalDevice, cond::C, text::T) where {C, T}
    if isnothing(text)
        Metal.@assert cond()
    else
        Metal.@assert cond() text()
    end
end

# Total number of threads participating in the kernel being executed.
threads_in_kernel() = Metal.threads_per_grid_1d()

# One-based linear index of the calling thread within the kernel; lies in
# `1:threads_in_kernel()`.
thread_index() = Metal.thread_position_in_grid_1d()

# Upper bound on the number of threadgroups for a kernel launch. Metal exposes
# no direct analogue of CUDA's maximum grid dimension, so a conservative fixed
# limit is used.
grid_size_limit(kernel) = 65535

# The requested block size when given explicitly, otherwise the largest block
# this kernel's pipeline supports (which, with enough blocks, maximizes GPU
# occupancy).
block_size_limit(max_threads_in_block::Int, _) = max_threads_in_block
block_size_limit(::Val{:auto}, kernel) =
    Int(kernel.pipeline.maxTotalThreadsPerThreadgroup)

function ClimaComms.run_threaded(
    f::F,
    ::MetalDevice,
    ::Val,
    itr;
    block_size,
) where {F}
    total = length(itr)
    total > 0 || return nothing

    # Each GPU thread handles exactly one item of `itr`.
    function one_item_per_thread()
        idx = thread_index()
        if idx <= total
            @inbounds f(itr[firstindex(itr) + idx - 1])
        end
        return nothing
    end
    kernel = Metal.@metal launch = false one_item_per_thread()
    block_limit = grid_size_limit(kernel)
    thread_limit = block_size_limit(block_size, kernel)

    launch = ClimaComms._compute_launch_params_simple(
        total,
        block_limit,
        thread_limit,
    )
    # Too many items for one item per thread: retry with minimal coarsening.
    isnothing(launch) &&
        return ClimaComms.run_threaded(f, MetalDevice(), 1, itr; block_size)

    Metal.@sync kernel(;
        threads = launch.threads_in_block,
        groups = launch.blocks,
    )
end

function ClimaComms.run_threaded(
    f::F,
    ::MetalDevice,
    min_items_in_thread::Int,
    itr;
    block_size,
) where {F}
    min_items_in_thread > 0 || throw(ArgumentError("`coarsen` is not positive"))
    total = length(itr)
    total > 0 || return nothing

    # Grid-stride loop: each thread walks the items with a stride equal to the
    # total thread count, which maximizes memory coalescing; for reference, see
    # https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops
    function strided_items_per_thread()
        for idx in thread_index():threads_in_kernel():total
            @inbounds f(itr[firstindex(itr) + idx - 1])
        end
    end
    kernel = Metal.@metal launch = false strided_items_per_thread()
    block_limit = grid_size_limit(kernel)
    thread_limit = block_size_limit(block_size, kernel)

    launch = ClimaComms._compute_launch_params_coarsened(
        total,
        block_limit,
        thread_limit,
        min_items_in_thread,
    )
    Metal.@sync kernel(;
        threads = launch.threads_in_block,
        groups = launch.blocks,
    )
end

end
65 changes: 64 additions & 1 deletion src/devices.jl
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,13 @@ Use NVIDIA GPU accelerator
"""
struct CUDADevice <: AbstractDevice end

"""
MetalDevice()

Use Apple GPU accelerator (Metal)
"""
struct MetalDevice <: AbstractDevice end

"""
ClimaComms.device_functional(device)

Expand All @@ -57,6 +64,8 @@ function device_type()
return :CPUMultiThreaded
elseif env_var == "CUDA"
return :CUDADevice
elseif env_var == "Metal"
return :MetalDevice
else
error("Invalid CLIMACOMMS_DEVICE: $env_var")
end
Expand All @@ -71,7 +80,8 @@ Allowed values:
- `CPU`, single-threaded or multi-threaded depending on the number of threads;
- `CPUSingleThreaded`,
- `CPUMultiThreaded`,
- `CUDA`.
- `CUDA`,
- `Metal`.

The default is `CPU`.
"""
Expand All @@ -82,6 +92,11 @@ function device()
"Loading CUDA.jl is required to use CUDADevice. You might want to call ClimaComms.@import_required_backends",
)
end
if target_device == :MetalDevice && !metal_ext_is_loaded()
error(
"Loading Metal.jl is required to use MetalDevice. You might want to call ClimaComms.@import_required_backends",
)
end
DeviceConstructor = getproperty(ClimaComms, target_device)
return DeviceConstructor()
end
Expand Down Expand Up @@ -742,3 +757,51 @@ Base.@propagate_inbounds function Base.getindex(
end

# TODO: Check whether conversion of every Int to Int32 improves GPU performance.

# Internal helpers for GPU kernel launch parameters

"""
_compute_launch_params_simple(n_items, max_blocks, max_threads_in_block)

Compute kernel launch parameters (`blocks`, `threads_in_block`) for a simple (1 item per thread)
execution strategy. Returns `nothing` if the number of items exceeds the GPU's capacity for
this strategy (requires coarsening).

Used by `ClimaCommsCUDAExt` and `ClimaCommsMetalExt` in `run_threaded`.
"""
function _compute_launch_params_simple(
n_items, max_blocks, max_threads_in_block,
)
if n_items <= max_blocks * max_threads_in_block
threads_in_block = min(max_threads_in_block, n_items)
blocks = cld(n_items, threads_in_block)
return (; blocks, threads_in_block)
else
return nothing
end
end

"""
_compute_launch_params_coarsened(n_items, max_blocks, max_threads_in_block, min_items_in_thread)

Compute kernel launch parameters (`blocks`, `threads_in_block`) for a coarsened execution strategy,
where each thread processes at least `min_items_in_thread`. This strategy maximizes GPU occupancy
when `n_items` is large.

Used by `ClimaCommsCUDAExt` and `ClimaCommsMetalExt` in `run_threaded`.
"""
function _compute_launch_params_coarsened(
n_items, max_blocks, max_threads_in_block, min_items_in_thread,
)
# If there are too many items to use the specified coarsening, increase it
# by the smallest possible amount.
max_required_threads = cld(n_items, min_items_in_thread)
items_in_thread =
max_required_threads <= max_blocks * max_threads_in_block ?
min_items_in_thread :
cld(n_items, max_blocks * max_threads_in_block)

threads_in_block = min(max_threads_in_block, max_required_threads)
blocks = cld(n_items, items_in_thread * threads_in_block)
return (; blocks, threads_in_block)
end
Loading
Loading