diff --git a/src/CUDAKernels.jl b/src/CUDAKernels.jl
index 87e2bb3f95..77a304e508 100644
--- a/src/CUDAKernels.jl
+++ b/src/CUDAKernels.jl
@@ -16,9 +16,10 @@ export CUDABackend
 struct CUDABackend <: KA.GPU
     prefer_blocks::Bool
     always_inline::Bool
+    fastmath::Bool
 end
 
-CUDABackend(; prefer_blocks=false, always_inline=false) = CUDABackend(prefer_blocks, always_inline)
+CUDABackend(; prefer_blocks=false, always_inline=false, fastmath=false) = CUDABackend(prefer_blocks, always_inline, fastmath)
 
 KA.allocate(::CUDABackend, ::Type{T}, dims::Tuple) where T = CuArray{T}(undef, dims)
 KA.zeros(::CUDABackend, ::Type{T}, dims::Tuple) where T = CUDA.zeros(T, dims)
@@ -100,7 +101,7 @@ function (obj::KA.Kernel{CUDABackend})(args...; ndrange=nothing, workgroupsize=n
         maxthreads = nothing
     end
 
-    kernel = @cuda launch=false always_inline=backend.always_inline maxthreads=maxthreads obj.f(ctx, args...)
+    kernel = @cuda launch=false always_inline=backend.always_inline fastmath=backend.fastmath maxthreads=maxthreads obj.f(ctx, args...)
 
     # figure out the optimal workgroupsize automatically
     if KA.workgroupsize(obj) <: KA.DynamicSize && workgroupsize === nothing