Skip to content

Commit e85e43a

Browse files
committed
feat: add Metal support
1 parent b4fc0bd commit e85e43a

File tree

10 files changed

+247
-72
lines changed

10 files changed

+247
-72
lines changed

.buildkite/pipeline.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ steps:
1111
key: "initialize"
1212
command:
1313
- echo "--- Instantiate project"
14-
- julia --project=test -e 'using Pkg; Pkg.develop(;path="."); Pkg.add("CUDA"); Pkg.add("MPI"); Pkg.instantiate(;verbose=true); Pkg.precompile(;strict=true)'
14+
- julia --project=test -e 'using Pkg; Pkg.develop(;path="."); Pkg.add([PackageSpec("CUDA"), PackageSpec("MPI"), PackageSpec("Metal")]); Pkg.instantiate(;verbose=true); Pkg.precompile(;strict=true)'
1515
# force the initialization of the CUDA runtime as it is lazily loaded by default
1616
- "julia --project=test -e 'using CUDA; CUDA.precompile_runtime()'"
1717
- "julia --project=test -e 'using Pkg; Pkg.status()'"

NEWS.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ ClimaComms.jl Release Notes
44
main
55
-------
66

7+
v0.6.11
8+
- Added Metal support [PR 126](https://github.com/CliMA/ClimaComms.jl/pull/126)
9+
- NOTE: This is considered experimental as it is not continuously tested in CI.
10+
711
v0.6.10
812
-------
913
- fixed logging interoperability with `GPUCompiler.jl` [PR 119](https://github.com/CliMA/ClimaComms.jl/pull/119)

Project.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,12 +1,13 @@
11
name = "ClimaComms"
22
uuid = "3a4d1b5c-c61d-41fd-a00a-5873ba7a1b0d"
3-
authors = ["Kiran Pamnany <clima-software@caltech.edu>", "Simon Byrne <simonbyrne@caltech.edu>", "Charles Kawczynski <charliek@caltech.edu>", "Sriharsha Kandala <Sriharsha.kvs@gmail.com>", "Jake Bolewski <clima-software@caltech.edu>", "Gabriele Bozzola <gbozzola@caltech.edu>"]
4-
version = "0.6.10"
3+
authors = ["Kiran Pamnany <clima-software@caltech.edu>", "Simon Byrne <simonbyrne@caltech.edu>", "Charles Kawczynski <charliek@caltech.edu>", "Sriharsha Kandala <Sriharsha.kvs@gmail.com>", "Jake Bolewski <clima-software@caltech.edu>", "Gabriele Bozzola <gbozzola@caltech.edu>", "Haakon Ludvig Langeland Ervik <45243236+haakon-e@users.noreply.github.com>"]
4+
version = "0.6.11"
55

66
[deps]
77
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
88
Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
99
LoggingExtras = "e6f89c97-d47a-5376-807f-9c37f3926c36"
10+
Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
1011

1112
[weakdeps]
1213
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
@@ -15,11 +16,13 @@ MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
1516
[extensions]
1617
ClimaCommsCUDAExt = "CUDA"
1718
ClimaCommsMPIExt = "MPI"
19+
ClimaCommsMetalExt = "Metal"
1820

1921
[compat]
2022
CUDA = "3, 4, 5"
2123
Adapt = "3, 4"
2224
Logging = "1.9.4"
2325
LoggingExtras = "1.1.0"
2426
MPI = "0.20.18"
27+
Metal = "1"
2528
julia = "1.9"

ext/ClimaCommsCUDAExt.jl

Lines changed: 20 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -18,21 +18,13 @@ function Base.summary(io::IO, ::CUDADevice)
1818
return "$name ($uuid)"
1919
end
2020

21-
function ClimaComms.device_functional(::CUDADevice)
22-
return CUDA.functional()
23-
end
21+
ClimaComms.device_functional(::CUDADevice) = CUDA.functional()
2422

25-
function Adapt.adapt_structure(
26-
to::Type{<:CUDA.CuArray},
27-
ctx::ClimaComms.AbstractCommsContext,
28-
)
29-
return ClimaComms.context(Adapt.adapt(to, ClimaComms.device(ctx)))
30-
end
23+
Adapt.adapt_structure(to::Type{<:CUDA.CuArray}, ctx::ClimaComms.AbstractCommsContext) =
24+
ClimaComms.context(Adapt.adapt(to, ClimaComms.device(ctx)))
3125

32-
Adapt.adapt_structure(
33-
::Type{<:CUDA.CuArray},
34-
device::ClimaComms.AbstractDevice,
35-
) = ClimaComms.CUDADevice()
26+
Adapt.adapt_structure(::Type{<:CUDA.CuArray}, device::ClimaComms.AbstractDevice) =
27+
ClimaComms.CUDADevice()
3628

3729
ClimaComms.array_type(::CUDADevice) = CUDA.CuArray
3830
ClimaComms.free_memory(::CUDADevice) = CUDA.free_memory()
@@ -56,57 +48,44 @@ ClimaComms.assert(::CUDADevice, cond::C, text::T) where {C, T} =
5648
threads_in_kernel() = CUDA.blockDim().x * CUDA.gridDim().x
5749

5850
# The index of the calling thread, which is between 1 and threads_in_kernel().
59-
thread_index() =
60-
(CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
51+
thread_index() = (CUDA.blockIdx().x - 1) * CUDA.blockDim().x + CUDA.threadIdx().x
6152

6253
# The maximum number of blocks that can fit on the GPU used for this kernel.
6354
grid_size_limit(kernel) = CUDA.attribute(
64-
CUDA.device(kernel.fun.mod.ctx),
65-
CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
55+
CUDA.device(kernel.fun.mod.ctx), CUDA.DEVICE_ATTRIBUTE_MAX_GRID_DIM_X,
6656
)
6757

6858
# Either the first value if it is available, or the maximum number of threads
6959
# that can fit in one block of this kernel (cuOccupancyMaxPotentialBlockSize).
7060
# With enough blocks, the latter value will maximize the occupancy of the GPU.
7161
block_size_limit(max_threads_in_block::Int, _) = max_threads_in_block
72-
block_size_limit(::Val{:auto}, kernel) =
73-
CUDA.launch_configuration(kernel.fun).threads
62+
block_size_limit(::Val{:auto}, kernel) = CUDA.launch_configuration(kernel.fun).threads
7463

75-
function ClimaComms.run_threaded(
76-
f::F,
77-
::CUDADevice,
78-
::Val,
79-
itr;
80-
block_size,
81-
) where {F}
64+
function ClimaComms.run_threaded(f::F, ::CUDADevice, ::Val, itr; block_size) where {F}
8265
n_items = length(itr)
8366
n_items > 0 || return nothing
8467

8568
function call_f_from_thread()
8669
item_index = thread_index()
87-
item_index <= n_items &&
88-
@inbounds f(itr[firstindex(itr) + item_index - 1])
70+
item_index <= n_items && @inbounds f(itr[firstindex(itr) + item_index - 1])
8971
return nothing
9072
end
9173
kernel = CUDA.@cuda always_inline=true launch=false call_f_from_thread()
9274
max_blocks = grid_size_limit(kernel)
9375
max_threads_in_block = block_size_limit(block_size, kernel)
9476

77+
params = ClimaComms._compute_launch_params_simple(
78+
n_items, max_blocks, max_threads_in_block,
79+
)
9580
# If there are too many items, coarsen by the smallest possible amount.
96-
n_items <= max_blocks * max_threads_in_block ||
81+
isnothing(params) &&
9782
return ClimaComms.run_threaded(f, CUDADevice(), 1, itr; block_size)
9883

99-
threads_in_block = min(max_threads_in_block, n_items)
100-
blocks = cld(n_items, threads_in_block)
101-
kernel(; blocks, threads = threads_in_block)
84+
kernel(; params.blocks, threads = params.threads_in_block)
10285
end
10386

10487
function ClimaComms.run_threaded(
105-
f::F,
106-
::CUDADevice,
107-
min_items_in_thread::Int,
108-
itr;
109-
block_size,
88+
f::F, ::CUDADevice, min_items_in_thread::Int, itr; block_size,
11089
) where {F}
11190
min_items_in_thread > 0 || throw(ArgumentError("`coarsen` is not positive"))
11291
n_items = length(itr)
@@ -122,16 +101,10 @@ function ClimaComms.run_threaded(
122101
max_blocks = grid_size_limit(kernel)
123102
max_threads_in_block = block_size_limit(block_size, kernel)
124103

125-
# If there are too many items to use the specified coarsening, increase it
126-
# by the smallest possible amount.
127-
max_required_threads = cld(n_items, min_items_in_thread)
128-
items_in_thread =
129-
max_required_threads <= max_blocks * max_threads_in_block ?
130-
min_items_in_thread : cld(n_items, max_blocks * max_threads_in_block)
131-
132-
threads_in_block = min(max_threads_in_block, max_required_threads)
133-
blocks = cld(n_items, items_in_thread * threads_in_block)
134-
kernel(; blocks, threads = threads_in_block)
104+
params = ClimaComms._compute_launch_params_coarsened(
105+
n_items, max_blocks, max_threads_in_block, min_items_in_thread,
106+
)
107+
kernel(; params.blocks, threads = params.threads_in_block)
135108
end
136109

137110
end

ext/ClimaCommsMetalExt.jl

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
module ClimaCommsMetalExt
2+
3+
import Metal
4+
5+
import Adapt
6+
import ClimaComms
7+
import ClimaComms: MetalDevice
8+
9+
# Metal automatically manages device assignment, so this is a no-op
10+
ClimaComms._assign_device(::MetalDevice, rank_number) = nothing
11+
12+
function Base.summary(io::IO, ::MetalDevice)
13+
dev = Metal.device()
14+
name = dev.name
15+
return "$name (Metal)"
16+
end
17+
18+
ClimaComms.device_functional(::MetalDevice) = !isempty(Metal.devices())
19+
20+
Adapt.adapt_structure(to::Type{<:Metal.MtlArray}, ctx::ClimaComms.AbstractCommsContext) =
21+
ClimaComms.context(Adapt.adapt(to, ClimaComms.device(ctx)))
22+
23+
Adapt.adapt_structure(::Type{<:Metal.MtlArray}, device::ClimaComms.AbstractDevice) =
24+
ClimaComms.MetalDevice()
25+
26+
ClimaComms.array_type(::MetalDevice) = Metal.MtlArray
27+
ClimaComms.free_memory(::MetalDevice) = Metal.device().currentAllocatedSize
28+
ClimaComms.total_memory(::MetalDevice) = Metal.device().maxBufferLength
29+
ClimaComms.allowscalar(f, ::MetalDevice, args...; kwargs...) =
30+
Metal.@allowscalar f(args...; kwargs...)
31+
32+
# Extending ClimaComms methods that operate on expressions (cannot use dispatch here)
33+
ClimaComms.sync(f::F, ::MetalDevice, args...; kwargs...) where {F} =
34+
Metal.@sync f(args...; kwargs...)
35+
ClimaComms.cuda_sync(f::F, ::MetalDevice, args...; kwargs...) where {F} = # TODO: Rename to `device_sync` to unify `Metal` and `CUDA`
36+
Metal.@sync f(args...; kwargs...)
37+
ClimaComms.time(f::F, ::MetalDevice, args...; kwargs...) where {F} =
38+
Metal.@time f(args...; kwargs...)
39+
ClimaComms.elapsed(f::F, ::MetalDevice, args...; kwargs...) where {F} =
40+
Metal.@elapsed f(args...; kwargs...)
41+
ClimaComms.assert(::MetalDevice, cond::C, text::T) where {C,T} =
42+
isnothing(text) ? (Metal.@assert cond()) : (Metal.@assert cond() text())
43+
44+
# The number of threads in the kernel being executed by the calling thread.
45+
threads_in_kernel() = Metal.threads_per_grid_1d()
46+
47+
# The index of the calling thread, which is between 1 and threads_in_kernel().
48+
thread_index() = Metal.thread_position_in_grid_1d()
49+
50+
# The maximum number of blocks that can fit on the GPU used for this kernel.
51+
# Metal doesn't have a direct equivalent to CUDA's max grid dim, so we use a reasonable default
52+
grid_size_limit(kernel) = 65535
53+
54+
# Either the first value if it is available, or the maximum number of threads
55+
# that can fit in one block of this kernel.
56+
# With enough blocks, the latter value will maximize the occupancy of the GPU.
57+
block_size_limit(max_threads_in_block::Int, _) = max_threads_in_block
58+
block_size_limit(::Val{:auto}, kernel) = Int(kernel.pipeline.maxTotalThreadsPerThreadgroup)
59+
60+
function ClimaComms.run_threaded(f::F, ::MetalDevice, ::Val, itr; block_size) where {F}
61+
n_items = length(itr)
62+
n_items > 0 || return nothing
63+
64+
function call_f_from_thread()
65+
item_index = thread_index()
66+
item_index <= n_items && @inbounds f(itr[firstindex(itr)+item_index-1])
67+
return nothing
68+
end
69+
kernel = Metal.@metal launch = false call_f_from_thread()
70+
max_blocks = grid_size_limit(kernel)
71+
max_threads_in_block = block_size_limit(block_size, kernel)
72+
73+
params = ClimaComms._compute_launch_params_simple(
74+
n_items, max_blocks, max_threads_in_block,
75+
)
76+
# If there are too many items, coarsen by the smallest possible amount.
77+
isnothing(params) &&
78+
return ClimaComms.run_threaded(f, MetalDevice(), 1, itr; block_size)
79+
80+
Metal.@sync kernel(; threads = params.threads_in_block, groups = params.blocks)
81+
end
82+
83+
function ClimaComms.run_threaded(
84+
f::F, ::MetalDevice, min_items_in_thread::Int, itr; block_size
85+
) where {F}
86+
min_items_in_thread > 0 || throw(ArgumentError("`coarsen` is not positive"))
87+
n_items = length(itr)
88+
n_items > 0 || return nothing
89+
90+
# Maximize memory coalescing with a "grid-stride loop"; for reference, see
91+
# https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops
92+
call_f_from_thread() =
93+
for item_index in thread_index():threads_in_kernel():n_items
94+
@inbounds f(itr[firstindex(itr)+item_index-1])
95+
end
96+
kernel = Metal.@metal launch = false call_f_from_thread()
97+
max_blocks = grid_size_limit(kernel)
98+
max_threads_in_block = block_size_limit(block_size, kernel)
99+
100+
params = ClimaComms._compute_launch_params_coarsened(
101+
n_items, max_blocks, max_threads_in_block, min_items_in_thread,
102+
)
103+
Metal.@sync kernel(; threads = params.threads_in_block, groups = params.blocks)
104+
end
105+
106+
end

src/devices.jl

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,13 @@ Use NVIDIA GPU accelerator
3737
"""
3838
struct CUDADevice <: AbstractDevice end
3939

40+
"""
41+
MetalDevice()
42+
43+
Use Apple GPU accelerator (Metal)
44+
"""
45+
struct MetalDevice <: AbstractDevice end
46+
4047
"""
4148
ClimaComms.device_functional(device)
4249
@@ -57,6 +64,8 @@ function device_type()
5764
return :CPUMultiThreaded
5865
elseif env_var == "CUDA"
5966
return :CUDADevice
67+
elseif env_var == "Metal"
68+
return :MetalDevice
6069
else
6170
error("Invalid CLIMACOMMS_DEVICE: $env_var")
6271
end
@@ -71,7 +80,8 @@ Allowed values:
7180
- `CPU`, single-threaded or multi-threaded depending on the number of threads;
7281
- `CPUSingleThreaded`,
7382
- `CPUMultiThreaded`,
74-
- `CUDA`.
83+
- `CUDA`,
84+
- `Metal`.
7585
7686
The default is `CPU`.
7787
"""
@@ -82,6 +92,11 @@ function device()
8292
"Loading CUDA.jl is required to use CUDADevice. You might want to call ClimaComms.@import_required_backends",
8393
)
8494
end
95+
if target_device == :MetalDevice && !metal_ext_is_loaded()
96+
error(
97+
"Loading Metal.jl is required to use MetalDevice. You might want to call ClimaComms.@import_required_backends",
98+
)
99+
end
85100
DeviceConstructor = getproperty(ClimaComms, target_device)
86101
return DeviceConstructor()
87102
end
@@ -742,3 +757,49 @@ Base.@propagate_inbounds function Base.getindex(
742757
end
743758

744759
# TODO: Check whether conversion of every Int to Int32 improves GPU performance.
760+
761+
# Internal helpers for GPU kernel launch parameters
762+
763+
"""
764+
_compute_launch_params_simple(n_items, max_blocks, max_threads_in_block)
765+
766+
Compute kernel launch parameters (`blocks`, `threads_in_block`) for a simple (1 item per thread)
767+
execution strategy. Returns `nothing` if the number of items exceeds the GPU's capacity for
768+
this strategy (requires coarsening).
769+
770+
Used by `ClimaCommsCUDAExt` and `ClimaCommsMetalExt` in `run_threaded`.
771+
"""
772+
function _compute_launch_params_simple(n_items, max_blocks, max_threads_in_block)
773+
if n_items <= max_blocks * max_threads_in_block
774+
threads_in_block = min(max_threads_in_block, n_items)
775+
blocks = cld(n_items, threads_in_block)
776+
return (; blocks, threads_in_block)
777+
else
778+
return nothing
779+
end
780+
end
781+
782+
"""
783+
_compute_launch_params_coarsened(n_items, max_blocks, max_threads_in_block, min_items_in_thread)
784+
785+
Compute kernel launch parameters (`blocks`, `threads_in_block`) for a coarsened execution strategy,
786+
where each thread processes at least `min_items_in_thread`. This strategy maximizes GPU occupancy
787+
when `n_items` is large.
788+
789+
Used by `ClimaCommsCUDAExt` and `ClimaCommsMetalExt` in `run_threaded`.
790+
"""
791+
function _compute_launch_params_coarsened(
792+
n_items, max_blocks, max_threads_in_block, min_items_in_thread,
793+
)
794+
# If there are too many items to use the specified coarsening, increase it
795+
# by the smallest possible amount.
796+
max_required_threads = cld(n_items, min_items_in_thread)
797+
items_in_thread =
798+
max_required_threads <= max_blocks * max_threads_in_block ?
799+
min_items_in_thread :
800+
cld(n_items, max_blocks * max_threads_in_block)
801+
802+
threads_in_block = min(max_threads_in_block, max_required_threads)
803+
blocks = cld(n_items, items_in_thread * threads_in_block)
804+
return (; blocks, threads_in_block)
805+
end

0 commit comments

Comments
 (0)