Merge pull request #116 from EHTJulia/ptiede-copyto

ptiede · web-flow · commit b008322399ef · 2026-02-08T00:00:43.000-05:00
Switch to copyto! for better performance with Enzyme
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,6 @@
 name = "VLBISkyModels"
 uuid = "d6343c73-7174-4e0f-bb64-562643efbeca"
-version = "0.6.20"
+version = "0.6.21"
 authors = ["Paul Tiede <ptiede91@gmail.com> and contributors"]
 
 [deps]
@@ -36,18 +36,21 @@ StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
 FINUFFT = "d8beea63-0952-562e-9c6a-8e8ef7364055"
 Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 NonuniformFFTs = "cd96f58b-6017-4a02-bb9e-f4d81626177f"
+Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
+
 
 [extensions]
 VLBISkyModelsFINUFFT = ["FINUFFT"]
 VLBISkyModelsMakieExt = ["Makie", "DimensionalData"]
 VLBISkyModelsNonuniformFFTs = ["NonuniformFFTs"]
+VLBISkyModelsReactantExt = ["Reactant"]
 
 [compat]
 AbstractFFTs = "1"
 Accessors = "0.1"
 ArgCheck = "2"
 ChainRulesCore = "1"
-ComradeBase = "^0.9.6"
+ComradeBase = "^0.9.8"
 DelimitedFiles = "1"
 DimensionalData = "0.29 - 0.29.24, ^0.29.26"
 DocStringExtensions = "0.6,0.7,0.8,0.9"
@@ -66,6 +69,7 @@ NonuniformFFTs = "0.9"
 PaddedViews = "0.5"
 PolarizedTypes = "^0.1.1"
 Printf = "1.8"
+Reactant = "0.2"
 RecipesBase = "1"
 Reexport = "1"
 Serialization = "1.8"
@@ -78,7 +82,8 @@ julia = "1.9"
 FINUFFT = "d8beea63-0952-562e-9c6a-8e8ef7364055"
 Makie = "ee78f7c6-11fb-53f2-987a-cfe4a2b5a57a"
 NonuniformFFTs = "cd96f58b-6017-4a02-bb9e-f4d81626177f"
+Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test", "FINUFFT", "Makie", "NonuniformFFTs"]
+test = ["Test", "FINUFFT", "Makie", "NonuniformFFTs", "Reactant"]
diff --git a/docs/src/base_api.md b/docs/src/base_api.md
@@ -89,6 +89,7 @@ ComradeBase.phasecenter
 ComradeBase.executor
 ComradeBase.Serial
 ComradeBase.ThreadsEx
+ComradeBase.ReactantEx
 ComradeBase.header
 ComradeBase.NoHeader
 ComradeBase.MinimalHeader
diff --git a/ext/VLBISkyModelsFINUFFT.jl b/ext/VLBISkyModelsFINUFFT.jl
@@ -70,7 +70,7 @@ EnzymeRules.inactive_type(::Type{<:FINUFFT.finufft_plan}) = true
 
 function VLBISkyModels._jlnuft!(out, A::FINUFFTPlan, b::AbstractArray{<:Real})
     bc = getcache(A)
-    bc .= b
+    copyto!(bc, b)
     FINUFFT.finufft_exec!(A.forward, bc, out)
     return nothing
 end
diff --git a/ext/VLBISkyModelsReactantExt.jl b/ext/VLBISkyModelsReactantExt.jl
@@ -0,0 +1,187 @@
+module VLBISkyModelsReactantExt
+
+using VLBISkyModels
+using AbstractFFTs
+using Reactant
+using NFFT
+using NFFT: AbstractNFFTs
+using VLBISkyModels: ReactantAlg
+using LinearAlgebra
+
+# NFFT plan whose buffers and precomputed operators are converted with `Reactant.to_rarray`.
+struct ReactantNFFTPlan{T, D, K <: AbstractArray, arrTc, vecI, vecII, FP, BP, INV, SM} <:
+    AbstractNFFTPlan{T, D, 1}
+    N::NTuple{D, Int} # input size; returned by `size_in`
+    NOut::NTuple{1, Int} # output size; returned by `size_out`
+    J::Int # from `NFFT.initParams`; presumably the number of nodes — TODO confirm
+    k::K # node array exactly as passed to the constructor
+    Ñ::NTuple{D, Int} # from `NFFT.initParams`; size of `tmpVec`
+    dims::UnitRange{Int} # dimensions as returned by `NFFT.initParams`
+    forwardFFT::FP # result of `@jit plan_fft!`; applied to `tmpVec` in `mul!`
+    backwardFFT::BP # result of `@jit plan_bfft!`; not used by the methods in this file
+    tmpVec::arrTc # complex work array of size `Ñ`, zero-filled at the start of `mul!`
+    tmpVecHat::arrTc # complex array of size `N`; not used by the methods in this file
+    deconvolveIdx::vecI # Int indices written to by `deconvolve!`
+    windowLinInterp::vecII # stored as returned by `NFFT.precomputation`
+    windowHatInvLUT::INV # complex copy of the first entry of NFFT's `windowHatInvLUT`
+    B::SM # dense complex copy of NFFT's precomputed `B`; used by `convolve!`
+end
+
+
+function VLBISkyModels.plan_nuft_spatial(
+        ::ReactantAlg,
+        imgdomain::ComradeBase.AbstractRectiGrid,
+        visdomain::ComradeBase.UnstructuredDomain,
+    )
+    pts = domainpoints(visdomain)
+    pix = pixelsizes(imgdomain)
+    rot = Ref(ComradeBase.rotmat(imgdomain)')
+    # Row 1 is `_rotatex(U, V)` times the X pixel size and row 2 is `_rotatey(U, V)`
+    # times the Y pixel size. Both rows are negated because the NFFT uses the -2pi
+    # convention.
+    nodes = similar(pts.U, (2, length(visdomain)))
+    nodes[1, :] .= -VLBISkyModels._rotatex.(pts.U, pts.V, rot) .* pix.X
+    nodes[2, :] .= -VLBISkyModels._rotatey.(pts.U, pts.V, rot) .* pix.Y
+    return ReactantNFFTPlan(nodes, size(imgdomain))
+end
+
+function VLBISkyModels.make_phases(
+        ::ReactantAlg, imgdomain::ComradeBase.AbstractRectiGrid,
+        visdomain::ComradeBase.UnstructuredDomain,
+    )
+    # Same phases as the `NFFTAlg` backend, converted with `Reactant.to_rarray`.
+    return VLBISkyModels.make_phases(NFFTAlg(), imgdomain, visdomain) |> Reactant.to_rarray
+end
+
+function VLBISkyModels._jlnuft!(out, A::ReactantNFFTPlan, inp::Reactant.AnyTracedRArray)
+    LinearAlgebra.mul!(out, A, inp) # delegates to the plan's `mul!`, which fills `out`
+    return nothing
+end
+
+# NOTE(review): `adjoint` returns the plan unchanged, not an adjoint operator — confirm.
+Base.adjoint(p::ReactantNFFTPlan) = p
+
+
+function AbstractNFFTs.plan_nfft(
+        arr::Type{<:Reactant.AnyTracedRArray},
+        k::AbstractMatrix,
+        N::NTuple{D, Int},
+        rest...;
+        kargs...,
+    ) where {D}
+    # `arr` only selects this method; the plan constructor takes just (k, N; kwargs...).
+    return ReactantNFFTPlan(k, N; kargs...)
+end
+
+function ReactantNFFTPlan(
+        k::AbstractArray{T}, N::NTuple{D, Int}; fftflags = nothing, kwargs...
+    ) where {T, D}
+    # Build a plan from nodes `k` and image size `N`: run NFFT's precomputation, then
+    # convert the buffers with `Reactant.to_rarray`. `fftflags` is accepted but not used.
+    dims = 1:D
+    CT = complex(T)
+    params, N, NOut, J, Ñ, dims_ = NFFT.initParams(k, N, dims; kwargs...)
+
+    # Plans come from a dummy 2×2 ComplexF64 array. NOTE(review): confirm any size/eltype works.
+    FP = @jit plan_fft!(zeros(ComplexF64, 2, 2))
+    BP = @jit plan_bfft!(zeros(ComplexF64, 2, 2))
+
+    params.storeDeconvolutionIdx = true # GPU_NFFT only works this way
+    params.precompute = NFFT.FULL # GPU_NFFT only works this way
+    # `windowPolyInterp` is returned by the precomputation but is not stored in the plan.
+    windowLinInterp, windowPolyInterp, windowHatInvLUT, deconvolveIdx, B = NFFT.precomputation(
+        k, N[dims_], Ñ[dims_], params
+    )
+
+    U = params.storeDeconvolutionIdx ? N : ntuple(d -> 0, Val(D)) # == N: flag set above
+
+    tmpVec = Reactant.to_rarray(zeros(CT, Ñ))
+    tmpVecHat = Reactant.to_rarray(zeros(CT, U))
+    deconvIdx = Reactant.to_rarray(Int.(deconvolveIdx))
+    winHatInvLUT = Reactant.to_rarray(complex(windowHatInvLUT[1])) # only the first entry
+    B_ = (Reactant.to_rarray(complex.(Array(B)))) # densified with `Array` before upload
+
+    return ReactantNFFTPlan{
+        T,
+        D,
+        typeof(k),
+        typeof(tmpVec),
+        typeof(deconvIdx),
+        typeof(windowLinInterp),
+        typeof(FP),
+        typeof(BP),
+        typeof(winHatInvLUT),
+        typeof(B_),
+    }(
+        N,
+        NOut,
+        J,
+        k,
+        Ñ,
+        dims_,
+        FP,
+        BP,
+        tmpVec,
+        tmpVecHat,
+        deconvIdx,
+        windowLinInterp,
+        winHatInvLUT,
+        B_,
+    )
+end
+
+AbstractNFFTs.size_in(p::ReactantNFFTPlan) = p.N # input size stored in the plan
+AbstractNFFTs.size_out(p::ReactantNFFTPlan) = p.NOut # output size, an NTuple{1, Int}
+
+function AbstractNFFTs.convolve!(
+        p::ReactantNFFTPlan{T, D}, g::Reactant.AnyTracedRArray, fHat::Reactant.AnyTracedRArray
+    ) where {D, T}
+    mul!(fHat, transpose(p.B), vec(g)) # fHat = transpose(B) * vec(g); overwrites fHat
+    return nothing
+end
+
+function AbstractNFFTs.convolve_transpose!(
+        p::ReactantNFFTPlan{T, D}, fHat::Reactant.AnyTracedRArray, g::Reactant.AnyTracedRArray
+    ) where {D, T}
+    mul!(vec(g), p.B, fHat) # B * fHat into vec(g); assumes vec(g) aliases g — TODO confirm
+    return nothing
+end
+
+function Base.:*(p::ReactantNFFTPlan{T}, f::Reactant.AnyTracedRArray; kargs...) where {T}
+    fHat = similar(f, complex(T), size_out(p)) # complex output buffer of size `size_out(p)`
+    mul!(fHat, p, f; kargs...) # in-place transform; keyword arguments are forwarded
+    return fHat
+end
+
+function AbstractNFFTs.deconvolve!(
+        p::ReactantNFFTPlan{T, D}, f::AbstractArray, g::AbstractArray
+    ) where {D, T}
+    tmp = f .* reshape(p.windowHatInvLUT, size(f)) # weight `f` elementwise by the LUT
+    @allowscalar g[p.deconvolveIdx] = reshape(tmp, :) # scatter into `g` at `deconvolveIdx`
+    return nothing
+end
+
+"""In-place forward NFFT of `f` into `fHat` using the plan's buffers; returns `fHat`."""
+function LinearAlgebra.mul!(
+        fHat::Reactant.AnyTracedRArray,
+        p::ReactantNFFTPlan{T, D},
+        f::Reactant.AnyTracedRArray;
+        verbose = false,
+        timing::Union{Nothing, TimingStats} = nothing,
+    ) where {T, D}
+    # `verbose` and `timing` are accepted for NFFT API compatibility but are not used.
+    NFFT.consistencyCheck(p, f, fHat)
+    fill!(p.tmpVec, zero(Complex{T}))
+    deconvolve!(p, f, p.tmpVec) # weight `f` by the LUT and scatter it into `tmpVec`
+    p.forwardFFT * p.tmpVec # plan was created with `plan_fft!`, so this acts on `tmpVec`
+    NFFT.convolve!(p, p.tmpVec, fHat) # overwrites `fHat` with transpose(B) * vec(tmpVec)
+    return fHat
+end
+
+function NFFT.nfft(k::AbstractMatrix, f::Reactant.AnyTracedRArray, args...; kwargs...)
+    # The constructor takes (nodes, image size); the array type is not an argument.
+    return ReactantNFFTPlan(k, size(f)) * f
+end
+
+
+end
diff --git a/src/fourierdomain/nuft/nfft_alg.jl b/src/fourierdomain/nuft/nfft_alg.jl
@@ -83,7 +83,7 @@ end
 @inline function _nuft!(out::AbstractArray, A, b::AbstractArray)
     tmp = similar(out)
     _jlnuft!(tmp, A, b)
-    out .= tmp
+    copyto!(out, tmp)
     return nothing
 end
 
@@ -183,7 +183,7 @@ function EnzymeRules.reverse(
     for (db, dout) in zip(dbs, douts)
         # TODO open PR on NFFT so we can do this in place.
         _jlnuft_adjointadd!(db, A.val, dout)
-        dout .= 0
+        fill!(dout, 0)
     end
     return (nothing, nothing, nothing)
 end
diff --git a/src/fourierdomain/nuft/nfft_reactant.jl b/src/fourierdomain/nuft/nfft_reactant.jl
@@ -0,0 +1 @@
+struct ReactantAlg <: NUFT end
diff --git a/src/fourierdomain/nuft/nuft.jl b/src/fourierdomain/nuft/nuft.jl
@@ -206,3 +206,5 @@ include(joinpath(@__DIR__, "dft_alg.jl"))
 include(joinpath(@__DIR__, "finufft.jl"))
 
 include(joinpath(@__DIR__, "nonuniformffts.jl"))
+
+include(joinpath(@__DIR__, "nfft_reactant.jl"))
diff --git a/src/models/combinators.jl b/src/models/combinators.jl
@@ -144,9 +144,9 @@ end
 function intensitymap_numeric!(sim::IntensityMap, m::AddModel)
     csim = copy(sim)
     intensitymap!(csim, m.m1)
-    sim .= csim
+    copyto!(sim, csim)
     intensitymap!(csim, m.m2)
-    sim .= sim .+ csim
+    sim .+= csim
     return nothing
 end
 
@@ -296,7 +296,7 @@ end
     ) where {M1, M2}
     cvis = similar(vis)
     visibilitymap!(cvis, m.m1)
-    vis .= cvis
+    copyto!(vis, cvis)
     visibilitymap!(cvis, m.m2)
     vis .*= cvis
     return nothing
diff --git a/src/models/continuous_image.jl b/src/models/continuous_image.jl
@@ -194,9 +194,10 @@ function applypulse!(vis, pulse, gfour::AbstractFourierDualDomain)
     # through the broadcast
     pvis = parent(vis)
     dp = domainpoints(guv)
-    for i in eachindex(pvis, dp)
-        pvis[i] *= visibility_point(mp, dp[i])
-    end
+    pvis .*= visibility_point.(Ref(mp), dp)
+    # for i in eachindex(pvis, dp)
+    #     pvis[i] *= visibility_point(mp, dp[i])
+    # end
     # pvis .*= visibility_point.(Ref(mp), dp)
     return vis
 end
diff --git a/src/models/geometric_models.jl b/src/models/geometric_models.jl
@@ -61,7 +61,6 @@ end
     return exp(-2 * T(π)^2 * (u^2 + v^2)) + zero(T)im
 end
 
-
 """
     $(TYPEDEF)
 
diff --git a/src/models/numerical.jl b/src/models/numerical.jl
@@ -80,7 +80,7 @@ function visibilitymap_numeric!(vis::IntensityMap, m::AbstractModel)
     img = allocate_imgmap(m, gridxy)
     intensitymap!(img, m)
     tildeI = _fft(parent(img))
-    baseimage(vis) .= fftshift(tildeI, 1:2)
+    copyto!(baseimage(vis), fftshift(tildeI, 1:2))
     phasecenter!(vis, gridxy, grid)
     return nothing
 end
diff --git a/test/Project.toml b/test/Project.toml
@@ -20,10 +20,11 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
 Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
 Pyehtim = "3d61700d-6e5b-419a-8e22-9c066cf00468"
+Reactant = "3c362404-f566-11ee-1572-e11a4b42c853"
 Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [compat]
-Enzyme = "0.13 - 0.13.104"
+Enzyme = "0.13"
diff --git a/test/reactant.jl b/test/reactant.jl
@@ -0,0 +1,25 @@
+using Reactant
+
+@testset "Reactant" begin
+    # Compare visibilities from the `ReactantAlg` backend against the `NFFTAlg` backend.
+    gim = imagepixels(10.0, 10.0, 128, 128)
+    gimr = @jit(identity(gim))
+
+    rast = rand(128, 128)
+    rastr = Reactant.to_rarray(rast)
+    # Same raster and pulse for both models; only the array/grid types differ.
+    mr = ContinuousImage(rastr, gimr, BSplinePulse{3}())
+    m = ContinuousImage(rast, gim, BSplinePulse{3}())
+    # NOTE(review): rand/randn are unseeded here — confirm a seed is set by the test runner.
+    u = randn(64) / 5.0
+    v = randn(64) / 5.0
+    guv = UnstructuredDomain((U = u, V = v))
+
+    gfn = FourierDualDomain(gim, guv, NFFTAlg())
+    gfr = FourierDualDomain(gimr, guv, VLBISkyModels.ReactantAlg())
+
+    vnf = visibilitymap(m, gfn)
+    vrf = @jit visibilitymap(mr, gfr)
+
+    @test parent(vrf) ≈ vnf
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -36,7 +36,7 @@ function FiniteDifferences.to_vec(k::UnstructuredMap)
     return v, back
 end
 
-function testgrad(f, x; atol = 1.0e-8, rtol = 1.0e-5)
+function testgrad(f, x; atol = 1.0e-8, rtol = 1.0e-7)
     dx = Enzyme.make_zero(x)
     autodiff(set_runtime_activity(Enzyme.Reverse), Const(f), Active, Duplicated(x, dx))
     fdm = central_fdm(5, 1)
@@ -234,4 +234,5 @@ end
     include("stokesintensitymap.jl")
     include("rules.jl")
     include("rotgrid.jl")
+    include("reactant.jl")
 end