rft: make gamma(a,x) GPU compatible

haakon-e · haakon-e · commit 03fbbfd5569e · 2026-02-27T16:33:19.000-08:00
- improve type stability for En_expand_origin_general, En_safe_expfact, En_expand_origin
- this improves type stability for expint(a, x) and thus gamma(a, x)
- make gamma(n::Int) for n&gt;20 call gamma(Float64(n)) for compatibility with CUDA.jl's device override of the gamma function
- manually inline single recursion in _zeta, _trigamma
- modify while loop in _expint to guarantee termination
- add related tests
diff --git a/src/expint.jl b/src/expint.jl
@@ -284,8 +284,9 @@ end
 # series about origin, general ν
 # https://functions.wolfram.com/GammaBetaErf/ExpIntegralE/06/01/04/01/01/0003/
 function En_expand_origin_general(ν::Number, z::Number, niter::Integer)
+    FT = promote_type(typeof(ν), typeof(real(z)))  # ensure return type isn't more precise than inputs
     # gammaterm = En_safe_gamma_term(ν, z)
-    gammaterm = gamma(1-ν)*z^(ν-1)
+    gammaterm = FT(gamma(1-ν))*z^(ν-1)
     frac = one(z)
     blowup  = abs(1 - ν) < 0.5 ? frac / (1 - ν) : zero(z)
     sumterm = abs(1 - ν) < 0.5 ? zero(z) : frac / (1 - ν)
@@ -321,13 +322,14 @@ function En_expand_origin_general(ν::Number, z::Number, niter::Integer)
         series2 += (7π^4 + 15*(ψ₀^4 + 2ψ₀^2 * (π^2 - 3ψ₁) + ψ₁*(-2π^2 + 3ψ₁) + 4ψ₀*ψ₂) - 15ψ₃)*δ^3/360
         series2 += (3ψ₀^5 + ψ₀^3*(10π^2 - 30ψ₁) + 30ψ₀^2*ψ₂ + ψ₀*(45ψ₁^2 - 30π^2*ψ₁ - 15ψ₃ + 7π^4) - 30ψ₁*ψ₂ + 10π^2*ψ₂ + 3ψ₄)*δ^4/360
 
-        return (series1 + series2) * En_safe_expfact(n, z) * z^(ν-n-1) - sumterm
+        return (series1 + FT(series2)) * En_safe_expfact(n, z) * z^(ν-n-1) - sumterm
     end
     return gammaterm - (blowup + sumterm)
 end
 
 # compute (-z)^n / n!, avoiding overflow if possible, where n is an integer ≥ 0 (but not necessarily an Integer)
 function En_safe_expfact(n::Real, z::Number)
+    FT = eltype(real(z))  # get the floating point type of the input
     if n < 100
         powerterm = one(z)
         for i = 1:Int(n)
@@ -337,9 +339,9 @@ function En_safe_expfact(n::Real, z::Number)
     else
         if z isa Real
             sgn = z ≤ 0 ? one(n) : (n <= typemax(Int) ? (isodd(Int(n)) ? -one(n) : one(n)) : (-1)^n)
-            return sgn * exp(n * log(abs(z)) - loggamma(n+1))
+            return sgn * exp(n * log(abs(z)) - loggamma(FT(n+1)))
         else
-            return exp(n * log(-z) - loggamma(n+1))
+            return exp(n * log(-z) - loggamma(FT(n+1)))
         end
     end
 end
@@ -379,10 +381,11 @@ end
 # can find imaginary part of E_ν(x) for x on negative real axis analytically
 # https://functions.wolfram.com/GammaBetaErf/ExpIntegralE/04/05/01/0003/
 function En_imagbranchcut(ν::Number, z::Number)
+    FT = promote_type(typeof(real(ν)), typeof(real(z)))
     a = real(z)
     e1 = exp(oftype(a, π) * imag(ν))
-    e2 = Complex(cospi(real(ν)), -sinpi(real(ν)))
-    lgamma, lgammasign = ν isa Real ? logabsgamma(ν) : (loggamma(ν), 1)
+    e2 = Complex(cospi(FT(real(ν))), -sinpi(FT(real(ν))))
+    lgamma, lgammasign = ν isa Real ? logabsgamma(FT(ν)) : (loggamma(ν), 1)
     return -2 * lgammasign * e1 * π * e2 * exp((ν-1)*log(complex(a)) - lgamma) * im
 end
 
@@ -468,12 +471,14 @@ function _expint(ν::Number, z::Number, niter::Int=1000, ::Val{expscaled}=Val{fa
             end
             return doconj ? conj(E_start) : E_start
         end
-        while i == quick_niter
+        doublings = 0
+        while i == quick_niter && doublings < 50
             # double imaginary part until in region with fast convergence
             imstart *= 2
             z₀ = rez + imstart*im
             g_start, cf_start, i, _ = En_cf(ν, z₀, quick_niter)
             E_start = g_start + En_safeexpmult(-z₀, cf_start)
+            doublings += 1
         end
 
         # nsteps chosen so |Δ| ≤ 0.5
diff --git a/src/gamma.jl b/src/gamma.jl
@@ -64,10 +64,9 @@ trigamma(x::Number) = _trigamma(float(x))
 
 function _trigamma(z::ComplexOrReal{Float64})
     # via the derivative of the Kölbig digamma formulation
+    z′ = z  # save original value
+    z = ifelse(real(z′) <= 0, 1 - z′, z′)  # reflection formula (unrolled recursion)
     x = real(z)
-    if x <= 0 # reflection formula
-        return (π / sinpi(z))^2 - trigamma(1 - z)
-    end
     ψ = zero(z)
     N = 10
     if x < N
@@ -84,6 +83,7 @@ function _trigamma(z::ComplexOrReal{Float64})
     ψ += t + 0.5*w
     # the coefficients here are Float64(bernoulli[2:9])
     ψ += t*w * @evalpoly(w,0.16666666666666666,-0.03333333333333333,0.023809523809523808,-0.03333333333333333,0.07575757575757576,-0.2531135531135531,1.1666666666666667,-7.092156862745098)
+    ifelse(real(z′) <= 0, (π / sinpi(z′))^2 - ψ, ψ)  # reflection formula (unrolled recursion)
 end
 
 signflip(m::Number, z) = (-1+0im)^m * z
@@ -417,9 +417,6 @@ function _zeta(s::ComplexOrReal{Float64})
     # Riemann zeta function; algorithm is based on specializing the Hurwitz
     # zeta function above for z==1.
 
-    # blows up to ±Inf, but get correct sign of imaginary zero
-    s == 1 && return NaN + zero(s) * imag(s)
-
     if !isfinite(s) # annoying NaN and Inf cases
         isnan(s) && return imag(s) == 0 ? s : s*s
         if isfinite(imag(s))
@@ -436,18 +433,28 @@ function _zeta(s::ComplexOrReal{Float64})
                              -1.00078519447704240796017680222772921424,
                              -0.9998792995005711649578008136558752359121)
         end
+        ζ1ms = _zeta_core(1 - s)
         if absim > 12 # amplitude of sinpi(s/2) ≈ exp(imag(s)*π/2)
             # avoid overflow/underflow (issue #128)
             lg = loggamma(1 - s)
             rehalf = real(s)*0.5
-            return zeta(1 - s) * exp(lg + absim*halfπ + s*log2π) * inv2π * Complex(
+            return ζ1ms * exp(lg + absim*halfπ + s*log2π) * inv2π * Complex(
                 sinpi(rehalf), flipsign(cospi(rehalf), imag(s))
             )
         else
-            return zeta(1 - s) * gamma(1 - s) * sinpi(s*0.5) * twoπ^s * invπ
+            return ζ1ms * gamma(1 - s) * sinpi(s*0.5) * twoπ^s * invπ
         end
     end
 
+    return _zeta_core(s)
+end
+
+
+# Core asymptotic computation of the Riemann zeta function for real(s) >= 0.5.
+# Factored out of _zeta to avoid unprovable recursion in the reflection formula.
+function _zeta_core(s::ComplexOrReal{Float64})
+    # blows up to ±Inf, but get correct sign of imaginary zero
+    s == 1 && return NaN + zero(s) * imag(s)
     m = s - 1
 
     # shift using recurrence formula:
@@ -595,7 +602,7 @@ function gamma(n::Union{Int8,UInt8,Int16,UInt16,Int32,UInt32,Int64,UInt64})
     n < 0 && throw(DomainError(n, "`n` must not be negative."))
     n == 0 && return Inf
     n <= 2 && return 1.0
-    n > 20 && return _gamma(Float64(n))
+    n > 20 && return gamma(Float64(n))
     @inbounds return Float64(Base._fact_table64[n-1])
 end
 
diff --git a/test/gamma_inc.jl b/test/gamma_inc.jl
@@ -281,13 +281,25 @@ end
 @testset "GPU compatibility ($FT)" for FT in (Float64, Float32, Float16)
     # Note: This test is a proxy for GPU compatibility by checking that the functions
     # are type stable and do not allocate memory. It does not launch any GPU kernels.
+    @testset "gamma type stability" begin
+        @test @inferred(gamma(FT(2), FT(0.5))) isa FT
+        @test @inferred(gamma(FT(2), FT(0.5) * im)) isa Complex{FT}
+        # Test with integer `a`
+        @test @inferred(gamma(2, FT(0.5))) isa FT
+        @test @inferred(gamma(2, FT(0.5) * im)) isa Complex{FT}
+    end
     @testset "gamma_inc type stability" begin
         @test @inferred(gamma_inc(FT(30.0), FT(29.99999), 0)) isa Tuple{FT,FT}
     end
     @testset "gamma_inc_inv type stability" begin
         @test @inferred(gamma_inc_inv(FT(1.0), FT(0.01), FT(0.99))) isa FT
     end
 
+    @testset "gamma allocations" begin
+        @test iszero((FT -> @allocated(gamma(2, FT(0.5))))(FT))
+        @test iszero((FT -> @allocated(gamma(2, FT(0.5) * im)))(FT))
+    end
+
     @testset "gamma_inc allocations" begin
         # `@allocated` checks for allocations for specific code paths
         ## a >= 1