Isolate CUDA #4499

Open · wants to merge 15 commits into base: main
7 changes: 5 additions & 2 deletions Project.toml
@@ -5,14 +5,14 @@ version = "0.96.27"

[deps]
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"
CubedSphere = "7445602f-e544-4518-8976-18f8e8ae6cdb"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
FFTW = "7a1cc6ca-52ef-59f5-83cd-3a7055c09341"
GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
Glob = "c27321d9-0574-5035-807b-f59d2c89b15c"
InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
IterativeSolvers = "42fd0dbc-a981-5370-80f2-aaf504508153"
@@ -40,6 +40,7 @@ TimesDates = "bdfc003b-8df8-5c39-adcd-3a9087f5df4a"

[weakdeps]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
ConstructionBase = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
@@ -51,6 +52,7 @@ oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"

[extensions]
OceananigansAMDGPUExt = "AMDGPU"
OceananigansCUDAExt = "CUDA"
OceananigansEnzymeExt = "Enzyme"
OceananigansMakieExt = ["MakieCore", "Makie"]
OceananigansMetalExt = "Metal"
@@ -104,6 +106,7 @@ oneAPI = "2.0.1"

[extras]
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CUDA_Runtime_jll = "76a88914-d11a-5bdc-97e0-2f5a05c973a2"
DataDeps = "124859b0-ceae-595e-8997-d05f6a7a8dfe"
Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
@@ -116,4 +119,4 @@ TimesDates = "bdfc003b-8df8-5c39-adcd-3a9087f5df4a"
oneAPI = "8f75cd03-7ff8-4ecb-9b8f-daf728133b1b"

[targets]
test = ["AMDGPU", "oneAPI", "DataDeps", "SafeTestsets", "Test", "Enzyme", "Reactant", "Metal", "CUDA_Runtime_jll", "MPIPreferences", "TimesDates", "NCDatasets"]
test = ["AMDGPU", "CUDA", "oneAPI", "DataDeps", "SafeTestsets", "Test", "Enzyme", "Reactant", "Metal", "CUDA_Runtime_jll", "MPIPreferences", "TimesDates", "NCDatasets"]
2 changes: 1 addition & 1 deletion examples/langmuir_turbulence.jl
@@ -33,7 +33,7 @@ using Oceananigans.Units: minute, minutes, hours
#
# We use a modest resolution and the same total extent as Wagner et al. (2021),

-grid = RectilinearGrid(GPU(), size=(128, 128, 64), extent=(128, 128, 64))
+grid = RectilinearGrid(CPU(), size=(128, 128, 64), extent=(128, 128, 64))

# ### The Stokes Drift profile
#
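The docs example now defaults to `CPU()` so it builds without a CUDA device. A hedged sketch of keeping the example switchable from a single line (`arch` is a name introduced here for illustration, not part of the diff):

```julia
using Oceananigans

# Uncomment the next line to opt into the CUDA-backed GPU (requires a CUDA device):
# using CUDA; arch = GPU()
arch = CPU()

grid = RectilinearGrid(arch, size=(128, 128, 64), extent=(128, 128, 64))
```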
140 changes: 140 additions & 0 deletions ext/OceananigansCUDAExt.jl
@@ -0,0 +1,140 @@
module OceananigansCUDAExt

using Oceananigans
using InteractiveUtils
using CUDA, CUDA.CUSPARSE, CUDA.CUFFT
using KernelAbstractions
import Oceananigans.Architectures as AC
import Oceananigans.BoundaryConditions as BC
import Oceananigans.DistributedComputations as DC
import Oceananigans.Fields as FD
import Oceananigans.Grids as GD
import Oceananigans.Solvers as SO
import Oceananigans.Utils as UT
import SparseArrays: SparseMatrixCSC
import KernelAbstractions: __iterspace, __groupindex, __dynamic_checkbounds,
                           __validindex, CompilerMetadata
import Oceananigans.DistributedComputations: Distributed

const GPUVar = Union{CuArray, CuContext, CuPtr, Ptr}

function __init__()
    if CUDA.has_cuda()
        @debug "CUDA-enabled GPU(s) detected:"
        for (gpu, dev) in enumerate(CUDA.devices())
            @debug "$dev: $(CUDA.name(dev))"
        end

        CUDA.allowscalar(false)
    end
end

const CUDAGPU = AC.GPU{<:CUDABackend}
CUDAGPU() = AC.GPU(CUDABackend(always_inline=true))

# Keep default CUDA backend
function AC.GPU()
    if CUDA.has_cuda_gpu()
        return CUDAGPU()
    else
        msg = """We cannot make a GPU with the CUDA backend:
                 a CUDA GPU was not found!"""
        throw(ArgumentError(msg))
    end
end

function UT.versioninfo_with_gpu(::CUDAGPU)
    s = sprint(versioninfo)
    gpu_name = CUDA.CuDevice(0) |> CUDA.name
    return "CUDA GPU: $gpu_name"
end


Base.summary(::CUDAGPU) = "CUDAGPU"

AC.architecture(::CuArray) = CUDAGPU()
AC.architecture(::CuSparseMatrixCSC) = CUDAGPU()
AC.array_type(::AC.GPU{CUDABackend}) = CuArray

AC.on_architecture(::AC.CPU, a::CuArray) = Array(a)

AC.on_architecture(::CUDAGPU, a::Array) = CuArray(a)
AC.on_architecture(::CUDAGPU, a::CuArray) = a
AC.on_architecture(::CUDAGPU, a::BitArray) = CuArray(a)
AC.on_architecture(::CUDAGPU, a::SubArray{<:Any, <:Any, <:CuArray}) = a
AC.on_architecture(::CUDAGPU, a::SubArray{<:Any, <:Any, <:Array}) = CuArray(a)
AC.on_architecture(::AC.CPU, a::SubArray{<:Any, <:Any, <:CuArray}) = Array(a)
AC.on_architecture(::CUDAGPU, a::StepRangeLen) = a

# cu alters the type of `a`, so we convert it back to the correct type
unified_array(::CUDAGPU, a::AbstractArray) = map(eltype(a), cu(a; unified = true))

## GPU to GPU copy of contiguous data
@inline function AC.device_copy_to!(dst::CuArray, src::CuArray; async::Bool = false)
    n = length(src)
    context!(context(src)) do
        GC.@preserve src dst begin
            unsafe_copyto!(pointer(dst, 1), pointer(src, 1), n; async)
        end
    end
    return dst
end

@inline AC.unsafe_free!(a::CuArray) = CUDA.unsafe_free!(a)

@inline AC.constructors(::AC.GPU{CUDABackend}, A::SparseMatrixCSC) = (CuArray(A.colptr), CuArray(A.rowval), CuArray(A.nzval), (A.m, A.n))
@inline AC.constructors(::AC.CPU, A::CuSparseMatrixCSC) = (A.dims[1], A.dims[2], Int64.(Array(A.colPtr)), Int64.(Array(A.rowVal)), Array(A.nzVal))
@inline AC.constructors(::AC.GPU{CUDABackend}, A::CuSparseMatrixCSC) = (A.colPtr, A.rowVal, A.nzVal, A.dims)

@inline AC.unpack_constructors(::AC.CPU, constr::Tuple) = (constr[3], constr[4], constr[5])
@inline AC.copy_unpack_constructors(::AC.CPU, constr::Tuple) = deepcopy((constr[3], constr[4], constr[5]))

@inline AC.arch_sparse_matrix(::AC.GPU{CUDABackend}, constr::Tuple) = CuSparseMatrixCSC(constr...)
@inline AC.arch_sparse_matrix(::AC.CPU, A::CuSparseMatrixCSC) = SparseMatrixCSC(AC.constructors(AC.CPU(), A)...)
@inline AC.arch_sparse_matrix(::AC.GPU{CUDABackend}, A::SparseMatrixCSC) = CuSparseMatrixCSC(AC.constructors(AC.GPU(), A)...)

@inline AC.arch_sparse_matrix(::AC.GPU{CUDABackend}, A::CuSparseMatrixCSC) = A

@inline AC.convert_to_device(::CUDAGPU, args) = CUDA.cudaconvert(args)
@inline AC.convert_to_device(::CUDAGPU, args::Tuple) = map(CUDA.cudaconvert, args)


BC.validate_boundary_condition_architecture(::CuArray, ::AC.GPU, bc, side) = nothing

BC.validate_boundary_condition_architecture(::CuArray, ::AC.CPU, bc, side) =
    throw(ArgumentError("$side $bc must use `Array` rather than `CuArray` on CPU architectures!"))

function SO.plan_forward_transform(A::CuArray, ::Union{GD.Bounded, GD.Periodic}, dims, planner_flag)
    length(dims) == 0 && return nothing
    return CUDA.CUFFT.plan_fft!(A, dims)
end

FD.set!(v::Field, a::CuArray) = FD._set!(v, a)
DC.set!(v::DC.DistributedField, a::CuArray) = DC._set!(v, a)

function SO.plan_backward_transform(A::CuArray, ::Union{GD.Bounded, GD.Periodic}, dims, planner_flag)
    length(dims) == 0 && return nothing
    return CUDA.CUFFT.plan_ifft!(A, dims)
end

# CUDA version: the indices are passed implicitly.
# KernelAbstractions must not be used here, because this code executes in a different scope.
CUDA.@device_override @inline function KernelAbstractions.__validindex(ctx::UT.MappedCompilerMetadata)
    if __dynamic_checkbounds(ctx)
        index = @inbounds UT.linear_expand(__iterspace(ctx), blockIdx().x, threadIdx().x)
        return index ≤ UT.__linear_ndrange(ctx)
    else
        return true
    end
end

@inline UT.sync_device!(::CuDevice) = CUDA.synchronize()
@inline UT.getdevice(cu::GPUVar, i) = device(cu)
@inline UT.getdevice(cu::GPUVar) = device(cu)
@inline UT.switch_device!(dev::CuDevice) = device!(dev)
@inline UT.sync_device!(::CUDAGPU) = CUDA.synchronize()
@inline UT.sync_device!(::CUDABackend) = CUDA.synchronize()
AC.on_architecture(arch::Distributed, a::CuArray) = AC.on_architecture(AC.child_architecture(arch), a)
AC.on_architecture(arch::Distributed, a::SubArray{<:Any, <:Any, <:CuArray}) = AC.on_architecture(child_architecture(arch), a)

end # module OceananigansCUDAExt
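
A quick way to confirm the extension actually loaded is `Base.get_extension`, the standard introspection hook for Julia ≥ 1.9 package extensions (a sketch, not part of this diff):

```julia
using CUDA, Oceananigans

# Extensions load automatically once both Oceananigans and CUDA are present.
ext = Base.get_extension(Oceananigans, :OceananigansCUDAExt)
ext === nothing && error("OceananigansCUDAExt did not load; is CUDA functional?")

arch = GPU()                        # dispatches to the extension's constructor
@assert summary(arch) == "CUDAGPU"  # Base.summary method defined in the module above
```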
1 change: 0 additions & 1 deletion src/AbstractOperations/AbstractOperations.jl
@@ -6,7 +6,6 @@ export Average, Integral, CumulativeIntegral, KernelFunctionOperation
export UnaryOperation, Derivative, BinaryOperation, MultiaryOperation, ConditionalOperation


using CUDA
using Base: @propagate_inbounds

using Oceananigans.Architectures
67 changes: 23 additions & 44 deletions src/Architectures.jl
@@ -2,13 +2,16 @@ module Architectures

export AbstractArchitecture, AbstractSerialArchitecture
export CPU, GPU, ReactantState
-export device, architecture, unified_array, device_copy_to!
+export device, device!, devices, ndevices, synchronize, architecture, unified_array, device_copy_to!
export array_type, on_architecture, arch_array
export constructors, unpack_constructors, copy_unpack_constructors
export arch_sparse_matrix, child_architecture
using SparseArrays

using CUDA
using KernelAbstractions
using Adapt
using OffsetArrays
const KA = KernelAbstractions

"""
AbstractArchitecture
@@ -37,25 +40,12 @@ struct CPU <: AbstractSerialArchitecture end

Return a GPU architecture using `device`.
`device` defaults to CUDA.CUDABackend(always_inline=true)
if CUDA is loaded.
"""
struct GPU{D} <: AbstractSerialArchitecture
    device :: D
end

const CUDAGPU = GPU{<:CUDA.CUDABackend}
CUDAGPU() = GPU(CUDA.CUDABackend(always_inline=true))
Base.summary(::CUDAGPU) = "CUDAGPU"

function GPU()
    if CUDA.has_cuda_gpu()
        return CUDAGPU()
    else
        msg = """We cannot make a GPU with the CUDA backend:
                 a CUDA GPU was not found!"""
        throw(ArgumentError(msg))
    end
end

"""
ReactantState <: AbstractArchitecture

@@ -69,13 +59,17 @@ struct ReactantState <: AbstractSerialArchitecture end

device(a::CPU) = KernelAbstractions.CPU()
device(a::GPU) = a.device
devices(a::AbstractArchitecture) = KA.devices(device(a))
device!(a::AbstractArchitecture, i) = KA.device!(device(a), i+1)
ndevices(a::AbstractArchitecture) = KA.ndevices(device(a))
synchronize(a::AbstractArchitecture) = KA.synchronize(device(a))

architecture() = nothing
architecture(::Number) = nothing
architecture(::Array) = CPU()
architecture(::CuArray) = CUDAGPU()
architecture(a::SubArray) = architecture(parent(a))
architecture(a::OffsetArray) = architecture(parent(a))
architecture(::SparseMatrixCSC) = CPU()

"""
child_architecture(arch)
@@ -86,7 +80,6 @@ On single-process, non-distributed systems, return `arch`.
child_architecture(arch::AbstractSerialArchitecture) = arch

array_type(::CPU) = Array
array_type(::GPU) = CuArray

# Fallback
on_architecture(arch, a) = a
@@ -98,18 +91,9 @@ on_architecture(arch::AbstractSerialArchitecture, nt::NamedTuple) = NamedTuple{k
# On architecture for array types
on_architecture(::CPU, a::Array) = a
on_architecture(::CPU, a::BitArray) = a
on_architecture(::CPU, a::CuArray) = Array(a)
on_architecture(::CPU, a::SubArray{<:Any, <:Any, <:CuArray}) = Array(a)
on_architecture(::CPU, a::SubArray{<:Any, <:Any, <:Array}) = a
on_architecture(::CPU, a::StepRangeLen) = a

on_architecture(::CUDAGPU, a::Array) = CuArray(a)
on_architecture(::CUDAGPU, a::CuArray) = a
on_architecture(::CUDAGPU, a::BitArray) = CuArray(a)
on_architecture(::CUDAGPU, a::SubArray{<:Any, <:Any, <:CuArray}) = a
on_architecture(::CUDAGPU, a::SubArray{<:Any, <:Any, <:Array}) = CuArray(a)
on_architecture(::CUDAGPU, a::StepRangeLen) = a

on_architecture(arch::AbstractSerialArchitecture, a::OffsetArray) =
    OffsetArray(on_architecture(arch, a.parent), a.offsets...)

@@ -120,30 +104,25 @@ cpu_architecture(::ReactantState) = CPU()
unified_array(::CPU, a) = a
unified_array(::GPU, a) = a

# cu alters the type of `a`, so we convert it back to the correct type
unified_array(::GPU, a::AbstractArray) = map(eltype(a), cu(a; unified = true))

## GPU to GPU copy of contiguous data
@inline function device_copy_to!(dst::CuArray, src::CuArray; async::Bool = false)
    n = length(src)
    context!(context(src)) do
        GC.@preserve src dst begin
            unsafe_copyto!(pointer(dst, 1), pointer(src, 1), n; async)
        end
    end
    return dst
end

@inline device_copy_to!(dst::Array, src::Array; kw...) = Base.copyto!(dst, src)

@inline unsafe_free!(a::CuArray) = CUDA.unsafe_free!(a)
@inline unsafe_free!(a) = nothing

# Convert arguments to GPU-compatible types
@inline convert_to_device(arch, args) = args
@inline convert_to_device(::CPU, args) = args
@inline convert_to_device(::CUDAGPU, args) = CUDA.cudaconvert(args)
@inline convert_to_device(::CUDAGPU, args::Tuple) = map(CUDA.cudaconvert, args)

# Utils for sparse matrix manipulation
@inline constructors(::CPU, A::SparseMatrixCSC) = (A.m, A.n, A.colptr, A.rowval, A.nzval)
@inline constructors(::CPU, m::Number, n::Number, constr::Tuple) = (m, n, constr...)
@inline constructors(::GPU, m::Number, n::Number, constr::Tuple) = (constr..., (m, n))

@inline unpack_constructors(::GPU, constr::Tuple) = (constr[1], constr[2], constr[3])

@inline copy_unpack_constructors(::GPU, constr::Tuple) = deepcopy((constr[1], constr[2], constr[3]))

@inline arch_sparse_matrix(::CPU, constr::Tuple) = SparseMatrixCSC(constr...)
@inline arch_sparse_matrix(::CPU, A::SparseMatrixCSC) = A

# Deprecated functions
function arch_array(arch, arr)
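The new `device!`, `ndevices`, and `synchronize` exports route device control through KernelAbstractions generics instead of calling CUDA directly, so the same code works for any loaded backend. A hedged sketch of round-robin device assignment built on these generics alone (`process` is a placeholder for user work; assumes the loaded backend implements KernelAbstractions' device interface):

```julia
using Oceananigans
using Oceananigans.Architectures: device!, ndevices, synchronize

function round_robin!(arch, work, process)
    n = ndevices(arch)
    for (k, item) in enumerate(work)
        device!(arch, (k - 1) % n)  # 0-based index; device! shifts it to 1-based internally
        process(item)
    end
    return synchronize(arch)        # wait on the most recently selected device
end
```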
2 changes: 1 addition & 1 deletion src/BoundaryConditions/BoundaryConditions.jl
@@ -10,7 +10,7 @@ export
apply_x_bcs!, apply_y_bcs!, apply_z_bcs!,
fill_halo_regions!

using CUDA, Adapt
using Adapt
using KernelAbstractions: @index, @kernel

using Oceananigans.Architectures: CPU, GPU, device
6 changes: 1 addition & 5 deletions src/BoundaryConditions/boundary_condition.jl
@@ -100,7 +100,7 @@ MultiRegionCommunicationBoundaryCondition() = BoundaryCondition(MultiRegionCommu
MultiRegionCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(MultiRegionCommunication(), val; kwargs...)
ZipperBoundaryCondition(val; kwargs...) = BoundaryCondition(Zipper(), val; kwargs...)
DistributedCommunicationBoundaryCondition(val; kwargs...) = BoundaryCondition(DistributedCommunication(), val; kwargs...)

# Support for various types of boundary conditions.
#
# Notes:
@@ -144,10 +144,6 @@ validate_boundary_condition_architecture(bc::BoundaryCondition, arch, side) =

validate_boundary_condition_architecture(condition, arch, bc, side) = nothing
validate_boundary_condition_architecture(::Array, ::CPU, bc, side) = nothing
validate_boundary_condition_architecture(::CuArray, ::GPU, bc, side) = nothing

validate_boundary_condition_architecture(::CuArray, ::CPU, bc, side) =
    throw(ArgumentError("$side $bc must use `Array` rather than `CuArray` on CPU architectures!"))

validate_boundary_condition_architecture(::Array, ::GPU, bc, side) =
    throw(ArgumentError("$side $bc must use `CuArray` rather than `Array` on GPU architectures!"))
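With the `CuArray` methods relocated to ext/OceananigansCUDAExt.jl (see above), the base package now only knows how to validate `Array`-backed boundary conditions. A small sketch of the behavior these methods imply (`ValueBoundaryCondition` is part of Oceananigans' public API; the `:east` side label is illustrative):

```julia
using Oceananigans
using Oceananigans.BoundaryConditions: ValueBoundaryCondition,
                                       validate_boundary_condition_architecture

data = zeros(16, 16)                # boundary values stored in a plain Array
bc   = ValueBoundaryCondition(data)

# An Array is valid on the CPU (the method returns nothing):
validate_boundary_condition_architecture(data, CPU(), bc, :east)

# On a GPU architecture the Array method above throws instead:
# validate_boundary_condition_architecture(data, GPU(), bc, :east)
# ERROR: ArgumentError: east ... must use `CuArray` rather than `Array` ...
```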