Skip to content

Commit 6f554ef

Browse files
committed
Added erf(x) Float64/Float32 Julia implementation
1 parent f8fd782 commit 6f554ef

File tree

1 file changed

+259
-5
lines changed

1 file changed

+259
-5
lines changed

src/erf.jl

Lines changed: 259 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,6 @@ for f in (:erf, :erfc)
1212
@eval begin
1313
$f(x::Number) = $internalf(float(x))
1414

15-
$internalf(x::Float64) = ccall(($libopenlibmf, libopenlibm), Float64, (Float64,), x)
16-
$internalf(x::Float32) = ccall(($libopenlibmf0, libopenlibm), Float32, (Float32,), x)
17-
$internalf(x::Float16) = Float16($internalf(Float32(x)))
1815

1916
$internalf(z::Complex{Float64}) = Complex{Float64}(ccall(($openspecfunf, libopenspecfun), Complex{Float64}, (Complex{Float64}, Float64), z, zero(Float64)))
2017
$internalf(z::Complex{Float32}) = Complex{Float32}(ccall(($openspecfunf, libopenspecfun), Complex{Float64}, (Complex{Float64}, Float64), Complex{Float64}(z), Float64(eps(Float32))))
@@ -28,6 +25,10 @@ for f in (:erf, :erfc)
2825
end
2926
end
3027

28+
erfc(x::Float64) = ccall((erfc, libopenlibm), Float64, (Float64,), x)
29+
erfc(x::Float32) = ccall((erfcf, libopenlibm), Float32, (Float32,), x)
30+
erfc(x::Float16) = Float16(erfc(Float32(x)))
31+
3132
for f in (:erfcx, :erfi, :dawson, :faddeeva)
3233
internalf = Symbol(:_, f)
3334
openspecfunfsym = Symbol(:Faddeeva_, f === :dawson ? :Dawson : f === :faddeeva ? :w : f)
@@ -96,10 +97,263 @@ See also:
9697
[`erfinv(x)`](@ref erfinv), [`erfcinv(x)`](@ref erfcinv).
9798
9899
# Implementation by
99-
- `Float32`/`Float64`: C standard math library
100-
[libm](https://en.wikipedia.org/wiki/C_mathematical_functions#libm).
100+
- `Float32`/`Float64`: Julia implementation of https://github.com/ARM-software/optimized-routines/blob/master/math/erf.c
101101
- `BigFloat`: C library for multiple-precision floating-point [MPFR](https://www.mpfr.org/)
102102
"""
103+
104+
# Fast erf implementation using a mix of
105+
# rational and polynomial approximations.
106+
# Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0.
107+
function erf(x::Float64)
108+
# Minimax approximation of erf
109+
PA=(0x1.06eba8214db68p-3, -0x1.812746b037948p-2, 0x1.ce2f21a03872p-4,-0x1.b82ce30e6548p-6, 0x1.565bcc360a2f2p-8, -0x1.c02d812bc979ap-11,0x1.f99bddfc1ebe9p-14, -0x1.f42c457cee912p-17, 0x1.b0e414ec20ee9p-20,-0x1.18c47fd143c5ep-23)
110+
# Rational approximation on [0x1p-28, 0.84375]
111+
NA=(0x1.06eba8214db68p-3, -0x1.4cd7d691cb913p-2, -0x1.d2a51dbd7194fp-6,-0x1.7a291236668e4p-8, -0x1.8ead6120016acp-16)
112+
DA=(0x1.97779cddadc09p-2, 0x1.0a54c5536cebap-4, 0x1.4d022c4d36b0fp-8,0x1.15dc9221c1a1p-13, -0x1.09c4342a2612p-18)
113+
# Rational approximation on [0.84375, 1.25]
114+
NB=( -0x1.359b8bef77538p-9, 0x1.a8d00ad92b34dp-2, -0x1.7d240fbb8c3f1p-2, 0x1.45fca805120e4p-2, -0x1.c63983d3e28ecp-4, 0x1.22a36599795ebp-5, -0x1.1bf380a96073fp-9 )
115+
DB=( 0x1.b3e6618eee323p-4, 0x1.14af092eb6f33p-1, 0x1.2635cd99fe9a7p-4, 0x1.02660e763351fp-3, 0x1.bedc26b51dd1cp-7, 0x1.88b545735151dp-7 )
116+
117+
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25
118+
PC=( 0x1.3bcd133aa0ffcp-4, -0x1.e4652fadcb702p-3, 0x1.2ebf3dcca0446p-2, -0x1.571d01c62d66p-3, 0x1.93a9a8f5b3413p-8, 0x1.8281cbcc2cd52p-5, -0x1.5cffd86b4de16p-6, -0x1.db4ccf595053ep-9, 0x1.757cbf8684edap-8, -0x1.ce7dfd2a9e56ap-11, -0x1.99ee3bc5a3263p-11, 0x1.3c57cf9213f5fp-12, 0x1.60692996bf254p-14, -0x1.6e44cb7c1fa2ap-14, 0x1.9d4484ac482b2p-16, -0x1.578c9e375d37p-19)
119+
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2
120+
PD=( 0x1.328f5ec350e5p-8, -0x1.529b9e8cf8e99p-5, 0x1.529b9e8cd9e71p-3, -0x1.8b0ae3a023bf2p-2, 0x1.1a2c592599d82p-1, -0x1.ace732477e494p-2, -0x1.e1a06a27920ffp-6, 0x1.bae92a6d27af6p-2, -0x1.a15470fcf5ce7p-2, 0x1.bafe45d18e213p-6, 0x1.0d950680d199ap-2, -0x1.8c9481e8f22e3p-3, -0x1.158450ed5c899p-4, 0x1.c01f2973b44p-3, -0x1.73ed2827546a7p-3, 0x1.47733687d1ff7p-4, -0x1.2dec70d00b8e1p-6, 0x1.a947ab83cd4fp-10 )
121+
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25
122+
PE=( 0x1.20c13035539e4p-18, -0x1.e9b5e8d16df7ep-16, 0x1.8de3cd4733bf9p-14, -0x1.9aa48beb8382fp-13, 0x1.2c7d713370a9fp-12, -0x1.490b12110b9e2p-12, 0x1.1459c5d989d23p-12, -0x1.64b28e9f1269p-13, 0x1.57c76d9d05cf8p-14, -0x1.bf271d9951cf8p-16, 0x1.db7ea4d4535c9p-19, 0x1.91c2e102d5e49p-20, -0x1.e9f0826c2149ep-21, 0x1.60eebaea236e1p-23 )
123+
# Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4
124+
PF=( 0x1.08ddd130d1fa6p-26, -0x1.10b146f59ff06p-22, 0x1.10b135328b7b2p-19, -0x1.6039988e7575fp-17, 0x1.497d365e19367p-15, -0x1.da48d9afac83ep-14, 0x1.1024c9b1fbb48p-12, -0x1.fc962e7066272p-12, 0x1.87297282d4651p-11, -0x1.f057b255f8c59p-11, 0x1.0228d0eee063p-10, -0x1.b1b21b84ec41cp-11, 0x1.1ead8ae9e1253p-11, -0x1.1e708fba37fccp-12, 0x1.9559363991edap-14, -0x1.68c827b783d9cp-16, 0x1.2ec4adeccf4a2p-19 )
125+
126+
C = 0x1.b0ac16p-1
127+
128+
TwoOverSqrtPiMinusOne=0x1.06eba8214db69p-3
129+
130+
131+
# # top 32 bits
132+
ix::UInt32=reinterpret(UInt64,x)>>32
133+
# # top 32, without sign bit
134+
ia::UInt32=ix & 0x7fffffff
135+
# # sign
136+
# sign::UInt32=ix>>31
137+
138+
sign::Bool=x<0
139+
140+
141+
142+
if (ia < 0x3feb0000)
143+
# a = |x| < 0.84375.
144+
145+
if (ia < 0x3e300000)
146+
# a < 2^(-28).
147+
if (ia < 0x00800000)
148+
# a < 2^(-1015).
149+
y = fma(TwoOverSqrtPiMinusOne, x, x)
150+
151+
## case of underflow TBD
152+
#return check_uflow (y)
153+
return y
154+
end
155+
return x + TwoOverSqrtPiMinusOne * x
156+
end
157+
158+
x2 = x * x
159+
160+
if (ia < 0x3fe00000)
161+
## a < 0.5 - Use polynomial approximation.
162+
r1 = fma(x2, PA[2], PA[1])
163+
r2 = fma(x2, PA[4], PA[3])
164+
r3 = fma(x2, PA[6], PA[5])
165+
r4 = fma(x2, PA[8], PA[7])
166+
r5 = fma(x2, PA[10], PA[9])
167+
168+
x4 = x2 * x2
169+
r = r5
170+
r = fma(x4, r, r4)
171+
r = fma(x4, r, r3)
172+
r = fma(x4, r, r2)
173+
r = fma(x4, r, r1)
174+
return fma(r, x, x) ## This fma is crucial for accuracy.
175+
else
176+
## 0.5 <= a < 0.84375 - Use rational approximation.
177+
178+
r1n = fma(x2, NA[2], NA[1])
179+
x4 = x2 * x2
180+
r2n = fma(x2, NA[4], NA[3])
181+
x8 = x4 * x4
182+
r1d = fma(x2, DA[1], 1.0)
183+
r2d = fma(x2, DA[3], DA[2])
184+
r3d = fma(x2, DA[5], DA[4])
185+
P = r1n + x4 * r2n + x8 * NA[5]
186+
187+
Q = r1d + x4 * r2d + x8 * r3d
188+
return fma(P / Q, x, x)
189+
end
190+
elseif (ia < 0x3ff40000)
191+
## 0.84375 <= |x| < 1.25.
192+
193+
a = abs(x) - 1.0
194+
r1n = fma(a, NB[2], NB[1])
195+
a2 = a * a
196+
r1d = fma(a, DB[1], 1.0)
197+
a4 = a2 * a2
198+
r2n = fma(a, NB[4], NB[3])
199+
a6 = a4 * a2
200+
r2d = fma(a, DB[3], DB[2])
201+
r3n = fma(a, NB[6], NB[5])
202+
r3d = fma(a, DB[5], DB[4])
203+
r4n = NB[7]
204+
r4d = DB[6]
205+
206+
P = r1n + a2 * r2n + a4 * r3n + a6 * r4n
207+
Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d
208+
if (sign)
209+
return -C - P / Q
210+
else
211+
return C + P / Q
212+
end
213+
elseif (ia < 0x40000000)
214+
## 1.25 <= |x| < 2.0.
215+
a = abs(x)
216+
a = a - 1.25
217+
218+
r1 = fma(a, PC[2], PC[1])
219+
r2 = fma(a, PC[4], PC[3])
220+
r3 = fma(a, PC[6], PC[5])
221+
r4 = fma(a, PC[8], PC[7])
222+
r5 = fma(a, PC[10], PC[9])
223+
r6 = fma(a, PC[12], PC[11])
224+
r7 = fma(a, PC[14], PC[13])
225+
r8 = fma(a, PC[16], PC[15])
226+
227+
228+
a2 = a * a
229+
230+
r = r8
231+
r = fma(a2, r, r7)
232+
r = fma(a2, r, r6)
233+
r = fma(a2, r, r5)
234+
r = fma(a2, r, r4)
235+
r = fma(a2, r, r3)
236+
r = fma(a2, r, r2)
237+
r = fma(a2, r, r1)
238+
239+
if (sign)
240+
return -1.0 + r
241+
else
242+
return 1.0 - r
243+
end
244+
elseif (ia < 0x400a0000)
245+
## 2 <= |x| < 3.25.
246+
a = abs(x)
247+
a = fma(0.5, a, -1.0)
248+
249+
r1 = fma(a, PD[2], PD[1])
250+
r2 = fma(a, PD[4], PD[3])
251+
r3 = fma(a, PD[6], PD[5])
252+
r4 = fma(a, PD[8], PD[7])
253+
r5 = fma(a, PD[10], PD[9])
254+
r6 = fma(a, PD[12], PD[11])
255+
r7 = fma(a, PD[14], PD[13])
256+
r8 = fma(a, PD[16], PD[15])
257+
r9 = fma(a, PD[18], PD[17])
258+
259+
a2 = a * a
260+
261+
r = r9
262+
r = fma(a2, r, r8)
263+
r = fma(a2, r, r7)
264+
r = fma(a2, r, r6)
265+
r = fma(a2, r, r5)
266+
r = fma(a2, r, r4)
267+
r = fma(a2, r, r3)
268+
r = fma(a2, r, r2)
269+
r = fma(a2, r, r1)
270+
271+
if (sign)
272+
return -1.0 + r
273+
else
274+
return 1.0 - r
275+
end
276+
elseif (ia < 0x40100000)
277+
## 3.25 <= |x| < 4.0.
278+
a = abs(x)
279+
a = a - 3.25
280+
281+
r1 = fma(a, PE[2], PE[1])
282+
r2 = fma(a, PE[4], PE[3])
283+
r3 = fma(a, PE[6], PE[5])
284+
r4 = fma(a, PE[8], PE[7])
285+
r5 = fma(a, PE[10], PE[9])
286+
r6 = fma(a, PE[12], PE[11])
287+
r7 = fma(a, PE[14], PE[13])
288+
289+
290+
a2 = a * a
291+
292+
r = r7
293+
r = fma(a2, r, r6)
294+
r = fma(a2, r, r5)
295+
r = fma(a2, r, r4)
296+
r = fma(a2, r, r3)
297+
r = fma(a2, r, r2)
298+
r = fma(a2, r, r1)
299+
300+
if (sign)
301+
return -1.0 + r
302+
else
303+
return 1.0 - r
304+
end
305+
elseif (ia < 0x4017a000)
306+
## 4 <= |x| < 5.90625.
307+
a = abs(x)
308+
a = fma(0.5, a, -2.0)
309+
310+
r1 = fma(a, PF[2], PF[1])
311+
r2 = fma(a, PF[4], PF[3])
312+
r3 = fma(a, PF[6], PF[5])
313+
r4 = fma(a, PF[8], PF[7])
314+
r5 = fma(a, PF[10], PF[9])
315+
r6 = fma(a, PF[12], PF[11])
316+
r7 = fma(a, PF[14], PF[13])
317+
r8 = fma(a, PF[16], PF[15])
318+
319+
r9 = PF[17]
320+
321+
a2 = a * a
322+
323+
r = r9
324+
r = fma(a2, r, r8)
325+
r = fma(a2, r, r7)
326+
r = fma(a2, r, r6)
327+
r = fma(a2, r, r5)
328+
r = fma(a2, r, r4)
329+
r = fma(a2, r, r3)
330+
r = fma(a2, r, r2)
331+
r = fma(a2, r, r1)
332+
333+
if (sign)
334+
return -1.0 + r
335+
else
336+
return 1.0 - r
337+
end
338+
else
339+
340+
341+
if (sign)
342+
return -1.0
343+
else
344+
return 1.0
345+
end
346+
347+
end
348+
349+
350+
end
351+
352+
erf(x::Float32)=Float32(erf(Float64(x)))
353+
354+
erf(x::Float16)=Float16(erf(Float64(x)))
355+
356+
103357
function erf end
104358
"""
105359
erf(x, y)

0 commit comments

Comments
 (0)