Skip to content
This repository was archived by the owner on Jan 12, 2026. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"

[compat]
CUDA = "3.3.1"
NNlib = "0.7.25"
NNlib = "0.7.31"
julia = "1.6"

[extras]
Expand Down
1 change: 1 addition & 0 deletions src/NNlibCUDA.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ using Random, Statistics
const IntOrIntTuple = Union{Integer, NTuple{N,<:Integer} where N}

include("upsample.jl")
include("sampling.jl")
include("activations.jl")
include("batchedmul.jl")
include("scatter.jl")
Expand Down
2 changes: 1 addition & 1 deletion src/cudnn/batchnorm.jl
Original file line number Diff line number Diff line change
Expand Up @@ -121,4 +121,4 @@ function cudnnBNBackward!(dg::DenseCuArray{T}, g::DenseCuArray{T}, db::DenseCuAr
db .= vec(sum(dy, dims=rdims))
end
end


61 changes: 61 additions & 0 deletions src/sampling.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# GPU method of NNlib's scatter-add hook, called from the ∇grid_sample backward
# kernel: atomically accumulates `value` into `dx[ix, iy, c, n]` so that
# concurrent device threads sampling overlapping input cells do not lose updates.
# NOTE(review): @inbounds assumes the caller has already clamped (ix, iy) into
# range — confirm against NNlib._∇grid_sample_kernel!.
@inline function NNlib._safe_add!(dx::CuDeviceArray{T, 4}, value, ix, iy, c, n) where T
    @inbounds CUDA.@atomic dx[ix, iy, c, n] += value
end

# CUDA forward kernel for grid sampling. Each thread computes one output
# location: a zero-based linear `index` over gW*gH*N elements is decomposed
# into 1-based (w, h, n) coordinates, and the per-element work is delegated
# to NNlib's shared CPU/GPU routine `NNlib._grid_sample_kernel!`.
function grid_sample_kernel!(n_elem, output, input, grid, padding_mode)
    # Zero-based global thread id (1D launch).
    index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
    if index < n_elem
        iW, iH, iC, _ = size(input)
        _, gW, gH, _ = size(grid)

        w = index % gW + 1          # fastest-varying: grid column
        h = (index ÷ gW) % gH + 1   # grid row
        n = index ÷ (gW * gH) + 1   # batch element
        NNlib._grid_sample_kernel!(output, input, grid, padding_mode, w, h, n, iW, iH, iC)
    end
    nothing
end

# CUDA backward kernel for grid sampling. Mirrors grid_sample_kernel!'s
# index decomposition; each thread handles one (w, h, n) location and
# delegates gradient computation (w.r.t. input `dx` and grid `dgrid`) to
# NNlib's shared routine. Writes into `dx` go through the atomic
# NNlib._safe_add! path since multiple threads may touch the same input cell.
function ∇grid_sample_kernel!(n_elem, dx, dgrid, Δ, input, grid, padding_mode)
    # Zero-based global thread id (1D launch).
    index = (threadIdx().x - 1) + (blockIdx().x - 1) * blockDim().x
    if index < n_elem
        iW, iH, iC, _ = size(input)
        _, gW, gH, _ = size(grid)

        w = index % gW + 1          # fastest-varying: grid column
        h = (index ÷ gW) % gH + 1   # grid row
        n = index ÷ (gW * gH) + 1   # batch element
        NNlib._∇grid_sample_kernel!(dx, dgrid, Δ, input, grid, padding_mode, w, h, n, iW, iH, iC)
    end
    nothing
end

"""
    NNlib.grid_sample(x::CuArray{T, 4}, grid::CuArray{V, 4}; padding_mode = :zeros)

GPU forward pass of grid sampling: samples `x` (W×H×C×N) at the normalized
coordinates stored in `grid` (2×gW×gH×N), producing a gW×gH×C×N output.
`padding_mode` (`:zeros` or `:border`) controls out-of-bounds behavior.
"""
function NNlib.grid_sample(x::CuArray{T, 4}, grid::CuArray{V, 4}; padding_mode = :zeros) where {T, V}
    # Lift the padding symbol into the type domain so the device code can
    # dispatch on it without a runtime branch on a Symbol.
    pmode = Val(padding_mode)
    xC, xN = size(x, 3), size(x, 4)
    gW, gH = size(grid, 2), size(grid, 3)
    out = similar(x, T, (gW, gH, xC, xN))

    total = gW * gH * xN  # one thread per (w, h, n) output location
    fwd = @cuda launch=false grid_sample_kernel!(total, out, x, grid, pmode)
    cfg = launch_configuration(fwd.fun; max_threads=256)
    nthreads = min(total, cfg.threads)
    nblocks = cld(total, nthreads)
    fwd(total, out, x, grid, pmode; threads=nthreads, blocks=nblocks)
    return out
end

"""
    NNlib.∇grid_sample(Δ::CuArray{T, 4}, x::CuArray{T, 4}, grid::CuArray{V, 4}; padding_mode = :zeros)

GPU backward pass of grid sampling: given the output cotangent `Δ`, returns
the gradients `(dx, dgrid)` w.r.t. the input array and the sampling grid.
`padding_mode` must match the value used in the forward pass.
"""
function NNlib.∇grid_sample(Δ::CuArray{T, 4}, x::CuArray{T, 4}, grid::CuArray{V, 4}; padding_mode = :zeros) where {T, V}
    pmode = Val(padding_mode)  # type-domain padding flag for device dispatch
    gW, gH = size(grid, 2), size(grid, 3)
    xN = size(x, 4)
    total = gW * gH * xN       # one thread per (w, h, n) location

    # dx is accumulated atomically across threads, so it must start at zero;
    # dgrid is written exactly once per element and may stay uninitialized.
    dx = CUDA.zeros(T, size(x))
    dgrid = similar(grid)

    bwd = @cuda launch=false ∇grid_sample_kernel!(total, dx, dgrid, Δ, x, grid, pmode)
    cfg = launch_configuration(bwd.fun; max_threads=256)
    nthreads = min(total, cfg.threads)
    nblocks = cld(total, nthreads)
    bwd(total, dx, dgrid, Δ, x, grid, pmode; threads=nthreads, blocks=nblocks)
    return dx, dgrid
end
1 change: 1 addition & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -18,4 +18,5 @@ include("softmax.jl")
include("batchnorm.jl")
include("scatter.jl")
include("gather.jl")
include("sampling.jl")
end
53 changes: 53 additions & 0 deletions test/sampling.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
@testset "Grid Sampling" begin
    for T in (Float32, Float64)
        # 2×2 all-ones input sampled exactly at its four corners.
        src = ones(T, (2, 2, 1, 1))
        sampling_grid = Array{T}(undef, 2, 2, 2, 1)
        sampling_grid[:, 1, 1, 1] .= (-1, -1)
        sampling_grid[:, 2, 1, 1] .= (1, -1)
        sampling_grid[:, 1, 2, 1] .= (-1, 1)
        sampling_grid[:, 2, 2, 1] .= (1, 1)

        # Expected grid gradient under :zeros padding; :border yields all zeros.
        dgrid_zeros = Array{T}(undef, size(sampling_grid))
        dgrid_zeros[:, :, 1, 1] = [[0.0, 0.0] [-0.5, 0.0]]
        dgrid_zeros[:, :, 2, 1] = [[0.0, -0.5] [-0.5, -0.5]]

        src_gpu = CuArray(src)
        grid_gpu = CuArray(sampling_grid)
        Δ = CUDA.ones(T, (2, 2, 1, 1))

        for (pmode, dgrid_expected) in ((:zeros, dgrid_zeros), (:border, zero(dgrid_zeros)))
            out = grid_sample(src_gpu, grid_gpu; padding_mode=pmode)
            @test collect(out) == src
            @test eltype(out) == T

            dinput, dgrid = ∇grid_sample(Δ, src_gpu, grid_gpu; padding_mode=pmode)
            @test collect(dinput) == src
            @test collect(dgrid) == dgrid_expected
            @test eltype(dinput) == T
            @test eltype(dgrid) == T
        end
    end
end

@testset "Compare grid sampling with NNlib" begin
    width, height, channels, batch = 16, 16, 2, 4
    x = rand(Float64, width, height, channels, batch)

    # Near-identity sampling grid, shifted slightly (+0.01) along the first
    # coordinate so interpolation is actually exercised.
    sgrid = zeros(Float64, 2, width, height, batch)
    @inbounds for b in 1:batch, j in 1:height, i in 1:width
        sgrid[1, i, j, b] = (i / width) * 2.0 - 1.0 + 0.01
        sgrid[2, i, j, b] = (j / height) * 2.0 - 1.0
    end

    # Check GPU forward and gradients against NNlib's CPU reference.
    for pmode in (:zeros, :border)
        gputest(grid_sample, x, sgrid; atol=1e-6, padding_mode=pmode)
    end
end
6 changes: 3 additions & 3 deletions test/test_utils.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@ function gputest(f, xs...; checkgrad=true, atol=1e-10, kws...)
cpu_out = f(cpu_in...; kws...)
gpu_out = f(gpu_in...; kws...)
@test collect(cpu_out) ≈ collect(gpu_out)

if checkgrad
cpu_grad = gradient((x...) -> sum(f(x...)), cpu_in...)
gpu_grad = gradient((x...) -> sum(f(x...)), gpu_in...)
cpu_grad = gradient((x...) -> sum(f(x...; kws...)), cpu_in...)
gpu_grad = gradient((x...) -> sum(f(x...; kws...)), gpu_in...)
for (cpu_g, gpu_g) in zip(cpu_grad, gpu_grad)
if cpu_g === nothing
@test gpu_g === nothing
Expand Down