Add @fastmath to telu_fast and reuse its output Ω = telu_fast(x) to compute the fast derivative telu'(x)

zengmao · zengmao · commit a25a1d112868 · 2025-01-07T21:34:44.000Z
diff --git a/src/activations.jl b/src/activations.jl
@@ -791,20 +791,51 @@ telu(x) = x * tanh(exp(x))
 This is faster but less accruate version of `telu`. This function is associated with a hard-coded derivative,
 `deriv_telu_fast`, which is faster but less accurate that `deriv_telu`.
     """
-telu_fast(x) = x * tanh_fast(exp(x))
+telu_fast(x) = @fastmath x * tanh_fast(exp(x))
 
 # Adapted from the Discourse post: <https://discourse.julialang.org/t/how-to-compute-tanhexp-telu-function-accurately/124464/7>
 function deriv_telu(x)
     exp_x = exp(x)
     tanh(exp_x) + 4x / (exp(exp_x - x/2) + exp(-exp_x - x/2))^2
 end
 
-function deriv_telu_fast(x)
-    tanh_exp_x = tanh(exp(x))
+# 0th and 1st order Taylor coefficients of telu'(x) around x=0, i.e. (telu'(0), telu''(0)) = (tanh(1), 2*sech(1)^2), evaluated in Float64
+const deriv_telu_taylor_expansion = (tanh(1.0), 8*exp(1)^2 / (1+exp(1)^2)^2)
+
+# Cutoffs for evaluating telu'(x): `deriv_telu_fast` uses the Taylor expansion for |x| < sqrt(eps) and returns 1 for x >= -log(sqrt(eps))
+const sqrt_eps_f16, sqrt_eps_f32, sqrt_eps_f64 = sqrt(eps(Float16)), sqrt(eps(Float32)), sqrt(eps(Float64))
+const minus_log_cutoff_f16, minus_log_cutoff_f32, minus_log_cutoff_f64 = -log(sqrt_eps_f16), -log(sqrt_eps_f32), -log(sqrt_eps_f64) # positive cutoff to e.g. prevent `exp` from overflow
+@inline small_x_cutoff_deriv_telu(::Float16) = sqrt_eps_f16
+@inline small_x_cutoff_deriv_telu(::Float32) = sqrt_eps_f32
+@inline small_x_cutoff_deriv_telu(::Float64) = sqrt_eps_f64
+@inline small_x_cutoff_deriv_telu(x::Real) = sqrt(eps(typeof(float(x)))) # generic fallback: non-float reals (e.g. Int) are promoted with `float` instead of raising a MethodError
+@inline minus_log_cutoff(::Float16) = minus_log_cutoff_f16
+@inline minus_log_cutoff(::Float32) = minus_log_cutoff_f32
+@inline minus_log_cutoff(::Float64) = minus_log_cutoff_f64
+@inline minus_log_cutoff(x::Real) = -log(small_x_cutoff_deriv_telu(x)) # generic fallback; only the type of x matters, its value is ignored
+
+@inline function _deriv_telu_taylor_expansion(x::T) where {T <: Union{Float16, Float32, Float64}}
+    convert(T, deriv_telu_taylor_expansion[1]) + x * convert(T, deriv_telu_taylor_expansion[2]) # precomputed coefficients converted to T so the result stays of type T
+end
+
+@inline function _deriv_telu_taylor_expansion(x::T) where {T <: Real} # generic fallback: recompute the coefficients from one(T)
+    tanh(one(T)) + x * 8*exp(one(T))^2 / (one(T)+exp(one(T))^2)^2
+end
+
+function deriv_telu_fast(x, Ω)
+    ifelse(abs(x) < small_x_cutoff_deriv_telu(x), _deriv_telu_taylor_expansion(x), # if x is close to 0, return linear-order Taylor expansion; this also covers x == 0, where `Ω / x` in `_deriv_telu_fast` is 0/0 when Ω = telu_fast(x)
+           ifelse(x >= minus_log_cutoff(x), one(x), _deriv_telu_fast(x, Ω))) # cut off large x to prevent `exp(x)` overflow. This cutoff is good for all types (Float16, 32, 64) in terms of both preventing overflow and maintaining accuracy. Note that `ifelse` evaluates all of its arguments, so values from the unselected branches are computed and then discarded
+end
+
+@inline function _deriv_telu_fast(x, Ω)
+    tanh_exp_x = Ω / x # recover tanh(exp(x)) from the forward value, assuming Ω = telu_fast(x) = x * tanh_fast(exp(x))
     sech_exp_x_squared = 1 - tanh_exp_x^2
-    ifelse(x >= 4, one(x), tanh_exp_x + x * exp(x) * sech_exp_x_squared) # cut off large x to prevent `exp(x)` overflow. This cutoff is good for all types (Float16, 32, 64) in terms of both preventing overflow and maintaining accuracy
+    tanh_exp_x + x * exp(x) * sech_exp_x_squared # telu'(x) = tanh(exp(x)) + x * exp(x) * sech(exp(x))^2; no cutoffs here, they are applied by `deriv_telu_fast`
 end
 
+# for testing accuracy: single-argument method that computes Ω = telu_fast(x) itself and evaluates deriv_telu_fast(x, Ω), cutoffs included
+_deriv_telu_fast(x) = deriv_telu_fast(x, telu_fast(x))
+
 # Define broadcasts for activation functions on arrays
 for f in ACTIVATIONS
   @eval $(f)(x::AbstractArray, args...) = $(f).(x, args...)
@@ -948,7 +979,7 @@ UNARY_ACTS = [ # f, dfdx
     ## Fast variants are the same!
     (:tanh_fast,    :(conj(1 - Ω^2))),
     (:sigmoid_fast, :(conj(Ω * (1 - Ω)))),
-    (:telu_fast,    :(deriv_telu_fast(x)))
+    (:telu_fast,    :(deriv_telu_fast(x, Ω)))
 ]
 
 for (f, dfdx) in UNARY_ACTS
diff --git a/test/activations.jl b/test/activations.jl
@@ -214,7 +214,7 @@ end
 
 ## Faster variants
 
-using NNlib: tanh_fast, sigmoid_fast, telu_fast, deriv_telu, deriv_telu_fast
+using NNlib: tanh_fast, sigmoid_fast, telu_fast, deriv_telu, _deriv_telu_fast
 
 function countepsfrom(x::T, xtrue) where {T<:AbstractFloat}
     target = T(xtrue)
@@ -269,8 +269,8 @@ end
         mean_eps(telu, telu, x64) # 0.1146
         worst_eps(telu, telu, x64) # 2
 
-        @test mean_eps(telu_fast, telu, x64) < 0.13 # 0.12204
-        @test worst_eps(telu_fast, telu, x64) <= 3 # 2
+        @test mean_eps(telu_fast, telu, x64) < 0.14 # 0.1338
+        @test worst_eps(telu_fast, telu, x64) <= 4 # 3
 
         @test telu_fast.(xbig[1:end-1]) ≈ telu.(xbig[1:end-1])
         @test telu_fast.(-xbig[1:end-1]) ≈ telu.(-xbig[1:end-1])
@@ -279,11 +279,11 @@ end
         mean_eps(deriv_telu, deriv_telu, x64) # 0.09304
         worst_eps(deriv_telu, deriv_telu, x64) # 2
 
-        @test mean_eps(deriv_telu_fast, deriv_telu, x64) < 2.1 # 2.05944
-        @test worst_eps(deriv_telu_fast, deriv_telu, x64) <= 29 # 28
+        @test mean_eps(_deriv_telu_fast, deriv_telu, x64) < 4.1 # 4.06396
+        @test worst_eps(_deriv_telu_fast, deriv_telu, x64) <= 125 # 120
 
-        @test deriv_telu_fast.(xbig[1:end-1]) ≈ deriv_telu.(xbig[1:end-1])
-        @test deriv_telu_fast.(-xbig[1:end-1]) ≈ deriv_telu.(-xbig[1:end-1])
+        @test _deriv_telu_fast.(xbig[1:end-1]) ≈ deriv_telu.(xbig[1:end-1])
+        @test _deriv_telu_fast.(-xbig[1:end-1]) ≈ deriv_telu.(-xbig[1:end-1])
     end
 end
 
@@ -335,11 +335,11 @@ end
         mean_eps(deriv_telu, deriv_telu, x32) # 0.07228
         worst_eps(deriv_telu, deriv_telu, x32) # 1
 
-        @test mean_eps(deriv_telu_fast, deriv_telu, x32) < 0.69 # 0.68772
-        @test worst_eps(deriv_telu_fast, deriv_telu, x32) <= 11 # 10
+        @test mean_eps(_deriv_telu_fast, deriv_telu, x32) < 2.4 # 2.31772
+        @test worst_eps(_deriv_telu_fast, deriv_telu, x32) <= 70 # 66
 
-        @test deriv_telu_fast.(xbig32[1:end-1]) ≈ deriv_telu.(xbig32[1:end-1])
-        @test deriv_telu_fast.(-xbig32[1:end-1]) ≈ deriv_telu.(-xbig32[1:end-1])
+        @test _deriv_telu_fast.(xbig32[1:end-1]) ≈ deriv_telu.(xbig32[1:end-1])
+        @test _deriv_telu_fast.(-xbig32[1:end-1]) ≈ deriv_telu.(-xbig32[1:end-1])
     end
 end