Isolate CUDA #4499

Open · wants to merge 15 commits into base: main
7 changes: 5 additions & 2 deletions Project.toml
@@ -5,14 +5,14 @@ version = "0.96.27"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
CubedSphere = "7445602f-e544-4518-8976-18f8e8ae6cdb"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
@@ -40,6 +40,7 @@ TimesDates = "bdfc003b-8df8-5c39-adcd-3a9087f5df4a"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
@@ -51,6 +52,7 @@ oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"

[extensions]
OceananigansAMDGPUExt = "AMDGPU"
OceananigansCUDAExt = "CUDA"
OceananigansEnzymeExt = "Enzyme"
OceananigansMakieExt = ["MakieCore", "Makie"]
OceananigansMetalExt = "Metal"
@@ -104,6 +106,7 @@ oneAPI = "2.0.1"

[extras]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
@@ -116,4 +119,4 @@ TimesDates = "bdfc003b-8df8-5c39-adcd-3a9087f5df4a"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"

[targets]
test = ["AMDGPU", "oneAPI", "DataDeps", "SafeTestsets", "Test", "Enzyme", "Reactant", "Metal", "CUDA_Runtime_jll", "MPIPreferences", "TimesDates", "NCDatasets"]
test = ["AMDGPU", "CUDA", "oneAPI", "DataDeps", "SafeTestsets", "Test", "Enzyme", "Reactant", "Metal", "CUDA_Runtime_jll", "MPIPreferences", "TimesDates", "NCDatasets"]
2 changes: 1 addition & 1 deletion examples/langmuir_turbulence.jl
@@ -33,7 +33,7 @@ using Oceananigans.Units: minute, minutes, hours
#
# We use a modest resolution and the same total extent as Wagner et al. (2021),

-grid = RectilinearGrid(GPU(), size=(128, 128, 64), extent=(128, 128, 64))
+grid = RectilinearGrid(CPU(), size=(128, 128, 64), extent=(128, 128, 64))

# ### The Stokes Drift profile
#
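The docs example now defaults to `CPU()` so it builds without a CUDA device. A hedged sketch of keeping the example switchable from a single line (`arch` is a name introduced here for illustration, not part of the diff):

```julia
using Oceananigans

# Uncomment the next line to opt into the CUDA-backed GPU (requires a CUDA device):
# using CUDA; arch = GPU()
arch = CPU()

grid = RectilinearGrid(arch, size=(128, 128, 64), extent=(128, 128, 64))
```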
140 changes: 140 additions & 0 deletions ext/OceananigansCUDAExt.jl
@@ -0,0 +1,140 @@
module OceananigansCUDAExt

using Oceananigans
using InteractiveUtils
using CUDA, CUDA.CUSPARSE, CUDA.CUFFT
using KernelAbstractions
import Oceananigans.Architectures as AC
import Oceananigans.BoundaryConditions as BC
import Oceananigans.DistributedComputations as DC
import Oceananigans.Fields as FD
import Oceananigans.Grids as GD
import Oceananigans.Solvers as SO
import Oceananigans.Utils as UT
import SparseArrays: SparseMatrixCSC
import KernelAbstractions: __iterspace, __groupindex, __dynamic_checkbounds,
                           __validindex, CompilerMetadata
import Oceananigans.DistributedComputations: Distributed

const GPUVar = Union{CuArray, CuContext, CuPtr, Ptr}

function __init__()
    if CUDA.has_cuda()
        @debug "CUDA-enabled GPU(s) detected:"
        for (gpu, dev) in enumerate(CUDA.devices())
            @debug "$dev: $(CUDA.name(dev))"
        end

        CUDA.allowscalar(false)
    end
end

const CUDAGPU = AC.GPU{<:CUDABackend}
CUDAGPU() = AC.GPU(CUDABackend(always_inline=true))

# Keep default CUDA backend
function AC.GPU()
    if CUDA.has_cuda_gpu()
        return CUDAGPU()
    else
        msg = """We cannot make a GPU with the CUDA backend:
                 a CUDA GPU was not found!"""
        throw(ArgumentError(msg))
    end
end

function UT.versioninfo_with_gpu(::CUDAGPU)
    s = sprint(versioninfo)
    gpu_name = CUDA.CuDevice(0) |> CUDA.name
    return "CUDA GPU: $gpu_name"
end


Base.summary(::CUDAGPU) = "CUDAGPU"

AC.architecture(::CuArray) = CUDAGPU()
AC.architecture(::CuSparseMatrixCSC) = CUDAGPU()
AC.array_type(::AC.GPU{CUDABackend}) = CuArray

AC.on_architecture(::AC.CPU, a::CuArray) = Array(a)

AC.on_architecture(::CUDAGPU, a::Array) = CuArray(a)
AC.on_architecture(::CUDAGPU, a::CuArray) = a
AC.on_architecture(::CUDAGPU, a::BitArray) = CuArray(a)
AC.on_architecture(::CUDAGPU, a::SubArray{<:Any, <:Any, <:CuArray}) = a
AC.on_architecture(::CUDAGPU, a::SubArray{<:Any, <:Any, <:Array}) = CuArray(a)
AC.on_architecture(::AC.CPU, a::SubArray{<:Any, <:Any, <:CuArray}) = Array(a)
AC.on_architecture(::CUDAGPU, a::StepRangeLen) = a

# cu alters the type of `a`, so we convert it back to the correct type
unified_array(::CUDAGPU, a::AbstractArray) = map(eltype(a), cu(a; unified = true))

## GPU to GPU copy of contiguous data
@inline function AC.device_copy_to!(dst::CuArray, src::CuArray; async::Bool = false)
    n = length(src)
    context!(context(src)) do
        GC.@preserve src dst begin
            unsafe_copyto!(pointer(dst, 1), pointer(src, 1), n; async)
        end
    end
    return dst
end

@inline AC.unsafe_free!(a::CuArray) = CUDA.unsafe_free!(a)

@inline AC.constructors(::AC.GPU{CUDABackend}, A::SparseMatrixCSC) = (CuArray(A.colptr), CuArray(A.rowval), CuArray(A.nzval), (A.m, A.n))
@inline AC.constructors(::AC.CPU, A::CuSparseMatrixCSC) = (A.dims[1], A.dims[2], Int64.(Array(A.colPtr)), Int64.(Array(A.rowVal)), Array(A.nzVal))
@inline AC.constructors(::AC.GPU{CUDABackend}, A::CuSparseMatrixCSC) = (A.colPtr, A.rowVal, A.nzVal, A.dims)

@inline AC.unpack_constructors(::AC.CPU, constr::Tuple) = (constr[3], constr[4], constr[5])
@inline AC.copy_unpack_constructors(::AC.CPU, constr::Tuple) = deepcopy((constr[3], constr[4], constr[5]))

@inline AC.arch_sparse_matrix(::AC.GPU{CUDABackend}, constr::Tuple) = CuSparseMatrixCSC(constr...)
@inline AC.arch_sparse_matrix(::AC.CPU, A::CuSparseMatrixCSC) = SparseMatrixCSC(AC.constructors(AC.CPU(), A)...)
@inline AC.arch_sparse_matrix(::AC.GPU{CUDABackend}, A::SparseMatrixCSC) = CuSparseMatrixCSC(AC.constructors(AC.GPU(), A)...)

@inline AC.arch_sparse_matrix(::AC.GPU{CUDABackend}, A::CuSparseMatrixCSC) = A

@inline AC.convert_to_device(::CUDAGPU, args) = CUDA.cudaconvert(args)
@inline AC.convert_to_device(::CUDAGPU, args::Tuple) = map(CUDA.cudaconvert, args)


BC.validate_boundary_condition_architecture(::CuArray, ::AC.GPU, bc, side) = nothing

BC.validate_boundary_condition_architecture(::CuArray, ::AC.CPU, bc, side) =
    throw(ArgumentError("$side $bc must use `Array` rather than `CuArray` on CPU architectures!"))

function SO.plan_forward_transform(A::CuArray, ::Union{GD.Bounded, GD.Periodic}, dims, planner_flag)
    length(dims) == 0 && return nothing
    return CUDA.CUFFT.plan_fft!(A, dims)
end

FD.set!(v::Field, a::CuArray) = FD._set!(v, a)
DC.set!(v::DC.DistributedField, a::CuArray) = DC._set!(v, a)

function SO.plan_backward_transform(A::CuArray, ::Union{GD.Bounded, GD.Periodic}, dims, planner_flag)
    length(dims) == 0 && return nothing
    return CUDA.CUFFT.plan_ifft!(A, dims)
end

# CUDA version: the indices are passed implicitly.
# KernelAbstractions must not be used here, because this code executes in a different scope.
CUDA.@device_override @inline function KernelAbstractions.__validindex(ctx::UT.MappedCompilerMetadata)
    if __dynamic_checkbounds(ctx)
        index = @inbounds UT.linear_expand(__iterspace(ctx), blockIdx().x, threadIdx().x)
        return index ≤ UT.__linear_ndrange(ctx)
    else
        return true
    end
end

@inline UT.sync_device!(::CuDevice) = CUDA.synchronize()
@inline UT.getdevice(cu::GPUVar, i) = device(cu)
@inline UT.getdevice(cu::GPUVar) = device(cu)
@inline UT.switch_device!(dev::CuDevice) = device!(dev)
@inline UT.sync_device!(::CUDAGPU) = CUDA.synchronize()
@inline UT.sync_device!(::CUDABackend) = CUDA.synchronize()
AC.on_architecture(arch::Distributed, a::CuArray) = AC.on_architecture(AC.child_architecture(arch), a)
AC.on_architecture(arch::Distributed, a::SubArray{<:Any, <:Any, <:CuArray}) = AC.on_architecture(child_architecture(arch), a)

end # module OceananigansCUDAExt
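
A quick way to confirm the extension actually loaded is `Base.get_extension`, the standard introspection hook for Julia ≥ 1.9 package extensions (a sketch, not part of this diff):

```julia
using CUDA, Oceananigans

# Extensions load automatically once both Oceananigans and CUDA are present.
ext = Base.get_extension(Oceananigans, :OceananigansCUDAExt)
ext === nothing && error("OceananigansCUDAExt did not load; is CUDA functional?")

arch = GPU()                        # dispatches to the extension's constructor
@assert summary(arch) == "CUDAGPU"  # Base.summary method defined in the module above
```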
1 change: 0 additions & 1 deletion src/AbstractOperations/AbstractOperations.jl
@@ -6,7 +6,6 @@ export Average, Integral, CumulativeIntegral, KernelFunctionOperation
export UnaryOperation, Derivative, BinaryOperation, MultiaryOperation, ConditionalOperation


using CUDA
using Base: @propagate_inbounds

using Oceananigans.Architectures
67 changes: 23 additions & 44 deletions src/Architectures.jl
@@ -2,13 +2,16 @@ module Architectures

export AbstractArchitecture, AbstractSerialArchitecture
export CPU, GPU, ReactantState
-export device, architecture, unified_array, device_copy_to!
+export device, device!, devices, ndevices, synchronize, architecture, unified_array, device_copy_to!
export array_type, on_architecture, arch_array
export constructors, unpack_constructors, copy_unpack_constructors
export arch_sparse_matrix, child_architecture
using SparseArrays

using CUDA
using KernelAbstractions
using Adapt
using OffsetArrays
const KA = KernelAbstractions

"""
AbstractArchitecture
@@ -37,25 +40,12 @@ struct CPU <: AbstractSerialArchitecture end

Return a GPU architecture using `device`.
`device` defaults to CUDA.CUDABackend(always_inline=true)
if CUDA is loaded.
"""
struct GPU{D} <: AbstractSerialArchitecture
    device :: D
end

const CUDAGPU = GPU{<:CUDA.CUDABackend}
CUDAGPU() = GPU(CUDA.CUDABackend(always_inline=true))
Base.summary(::CUDAGPU) = "CUDAGPU"

function GPU()
    if CUDA.has_cuda_gpu()
        return CUDAGPU()
    else
        msg = """We cannot make a GPU with the CUDA backend:
                 a CUDA GPU was not found!"""
        throw(ArgumentError(msg))
    end
end

"""
ReactantState <: AbstractArchitecture

@@ -69,13 +59,17 @@ struct ReactantState <: AbstractSerialArchitecture end

device(a::CPU) = KernelAbstractions.CPU()
device(a::GPU) = a.device
devices(a::AbstractArchitecture) = KA.devices(device(a))
device!(a::AbstractArchitecture, i) = KA.device!(device(a), i+1)
ndevices(a::AbstractArchitecture) = KA.ndevices(device(a))
synchronize(a::AbstractArchitecture) = KA.synchronize(device(a))

architecture() = nothing
architecture(::Number) = nothing
architecture(::Array) = CPU()
architecture(::CuArray) = CUDAGPU()
architecture(a::SubArray) = architecture(parent(a))
architecture(a::OffsetArray) = architecture(parent(a))
architecture(::SparseMatrixCSC) = CPU()

"""
child_architecture(arch)
@@ -86,7 +80,6 @@ On single-process, non-distributed systems, return `arch`.
child_architecture(arch::AbstractSerialArchitecture) = arch

array_type(::CPU) = Array
array_type(::GPU) = CuArray

# Fallback
on_architecture(arch, a) = a
@@ -98,18 +91,9 @@ on_architecture(arch::AbstractSerialArchitecture, nt::NamedTuple) = NamedTuple{k
# On architecture for array types
on_architecture(::CPU, a::Array) = a
on_architecture(::CPU, a::BitArray) = a
on_architecture(::CPU, a::CuArray) = Array(a)
on_architecture(::CPU, a::SubArray{<:Any, <:Any, <:CuArray}) = Array(a)
on_architecture(::CPU, a::SubArray{<:Any, <:Any, <:Array}) = a
on_architecture(::CPU, a::StepRangeLen) = a

on_architecture(::CUDAGPU, a::Array) = CuArray(a)
on_architecture(::CUDAGPU, a::CuArray) = a
on_architecture(::CUDAGPU, a::BitArray) = CuArray(a)
on_architecture(::CUDAGPU, a::SubArray{<:Any, <:Any, <:CuArray}) = a
on_architecture(::CUDAGPU, a::SubArray{<:Any, <:Any, <:Array}) = CuArray(a)
on_architecture(::CUDAGPU, a::StepRangeLen) = a

on_architecture(arch::AbstractSerialArchitecture, a::OffsetArray) =
    OffsetArray(on_architecture(arch, a.parent), a.offsets...)

@@ -120,30 +104,25 @@ cpu_architecture(::ReactantState) = CPU()
unified_array(::CPU, a) = a
unified_array(::GPU, a) = a

# cu alters the type of `a`, so we convert it back to the correct type
unified_array(::GPU, a::AbstractArray) = map(eltype(a), cu(a; unified = true))

## GPU to GPU copy of contiguous data
@inline function device_copy_to!(dst::CuArray, src::CuArray; async::Bool = false)
    n = length(src)
    context!(context(src)) do
        GC.@preserve src dst begin
            unsafe_copyto!(pointer(dst, 1), pointer(src, 1), n; async)
        end
    end
    return dst
end

@inline device_copy_to!(dst::Array, src::Array; kw...) = Base.copyto!(dst, src)

@inline unsafe_free!(a::CuArray) = CUDA.unsafe_free!(a)
@inline unsafe_free!(a) = nothing

# Convert arguments to GPU-compatible types
@inline convert_to_device(arch, args) = args
@inline convert_to_device(::CPU, args) = args
@inline convert_to_device(::CUDAGPU, args) = CUDA.cudaconvert(args)
@inline convert_to_device(::CUDAGPU, args::Tuple) = map(CUDA.cudaconvert, args)

# Utils for sparse matrix manipulation
@inline constructors(::CPU, A::SparseMatrixCSC) = (A.m, A.n, A.colptr, A.rowval, A.nzval)
@inline constructors(::CPU, m::Number, n::Number, constr::Tuple) = (m, n, constr...)
@inline constructors(::GPU, m::Number, n::Number, constr::Tuple) = (constr..., (m, n))

@inline unpack_constructors(::GPU, constr::Tuple) = (constr[1], constr[2], constr[3])

@inline copy_unpack_constructors(::GPU, constr::Tuple) = deepcopy((constr[1], constr[2], constr[3]))

@inline arch_sparse_matrix(::CPU, constr::Tuple) = SparseMatrixCSC(constr...)
@inline arch_sparse_matrix(::CPU, A::SparseMatrixCSC) = A

# Deprecated functions
function arch_array(arch, arr)
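The new `device!`, `ndevices`, and `synchronize` exports route device control through KernelAbstractions generics instead of calling CUDA directly, so the same code works for any loaded backend. A hedged sketch of round-robin device assignment built on these generics alone (`process` is a placeholder for user work; assumes the loaded backend implements KernelAbstractions' device interface):

```julia
using Oceananigans
using Oceananigans.Architectures: device!, ndevices, synchronize

function round_robin!(arch, work, process)
    n = ndevices(arch)
    for (k, item) in enumerate(work)
        device!(arch, (k - 1) % n)  # 0-based index; device! shifts it to 1-based internally
        process(item)
    end
    return synchronize(arch)        # wait on the most recently selected device
end
```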
2 changes: 1 addition & 1 deletion src/BoundaryConditions/BoundaryConditions.jl
@@ -10,7 +10,7 @@ export
apply_x_bcs!, apply_y_bcs!, apply_z_bcs!,
fill_halo_regions!

using CUDA, Adapt
using Adapt
using KernelAbstractions: @index, @kernel

using Oceananigans.Architectures: CPU, GPU, device
6 changes: 1 addition & 5 deletions src/BoundaryConditions/boundary_condition.jl
@@ -100,7 +100,7 @@ MultiRegionCommunicationBoundaryCondition() = BoundaryCondition(MultiRegionCommu
MultiRegionCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(MultiRegionCommunication(), val; kwargs...)
ZipperBoundaryCondition(val; kwargs...) = BoundaryCondition(Zipper(), val; kwargs...)
DistributedCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(DistributedCommunication(), val; kwargs...)

# Support for various types of boundary conditions.
#
# Notes:
@@ -144,10 +144,6 @@ validate_boundary_condition_architecture(bc::BoundaryCondition, arch, side) =

validate_boundary_condition_architecture(condition, arch, bc, side) = nothing
validate_boundary_condition_architecture(::Array, ::CPU, bc, side) = nothing
validate_boundary_condition_architecture(::CuArray, ::GPU, bc, side) = nothing

validate_boundary_condition_architecture(::CuArray, ::CPU, bc, side) =
    throw(ArgumentError("$side $bc must use `Array` rather than `CuArray` on CPU architectures!"))

validate_boundary_condition_architecture(::Array, ::GPU, bc, side) =
    throw(ArgumentError("$side $bc must use `CuArray` rather than `Array` on GPU architectures!"))
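With the `CuArray` methods relocated to ext/OceananigansCUDAExt.jl (see above), the base package now only knows how to validate `Array`-backed boundary conditions. A small sketch of the behavior these methods imply (`ValueBoundaryCondition` is part of Oceananigans' public API; the `:east` side label is illustrative):

```julia
using Oceananigans
using Oceananigans.BoundaryConditions: ValueBoundaryCondition,
                                       validate_boundary_condition_architecture

data = zeros(16, 16)                # boundary values stored in a plain Array
bc   = ValueBoundaryCondition(data)

# An Array is valid on the CPU (the method returns nothing):
validate_boundary_condition_architecture(data, CPU(), bc, :east)

# On a GPU architecture the Array method above throws instead:
# validate_boundary_condition_architecture(data, GPU(), bc, :east)
# ERROR: ArgumentError: east ... must use `CuArray` rather than `Array` ...
```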