@@ -12,9 +12,6 @@ for f in (:erf, :erfc)
1212 @eval begin
1313 $ f (x:: Number ) = $ internalf (float (x))
1414
15- $ internalf (x:: Float64 ) = ccall (($ libopenlibmf, libopenlibm), Float64, (Float64,), x)
16- $ internalf (x:: Float32 ) = ccall (($ libopenlibmf0, libopenlibm), Float32, (Float32,), x)
17- $ internalf (x:: Float16 ) = Float16 ($ internalf (Float32 (x)))
1815
1916 $ internalf (z:: Complex{Float64} ) = Complex {Float64} (ccall (($ openspecfunf, libopenspecfun), Complex{Float64}, (Complex{Float64}, Float64), z, zero (Float64)))
2017 $ internalf (z:: Complex{Float32} ) = Complex {Float32} (ccall (($ openspecfunf, libopenspecfun), Complex{Float64}, (Complex{Float64}, Float64), Complex {Float64} (z), Float64 (eps (Float32))))
@@ -28,6 +25,10 @@ for f in (:erf, :erfc)
2825 end
2926end
3027
28+ erfc (x:: Float64 ) = ccall ((erfc, libopenlibm), Float64, (Float64,), x)
29+ erfc (x:: Float32 ) = ccall ((erfcf, libopenlibm), Float32, (Float32,), x)
30+ erfc (x:: Float16 ) = Float16 (erfc (Float32 (x)))
31+
3132for f in (:erfcx , :erfi , :dawson , :faddeeva )
3233 internalf = Symbol (:_ , f)
3334 openspecfunfsym = Symbol (:Faddeeva_ , f === :dawson ? :Dawson : f === :faddeeva ? :w : f)
@@ -96,10 +97,263 @@ See also:
9697[`erfinv(x)`](@ref erfinv), [`erfcinv(x)`](@ref erfcinv).
9798
9899# Implementation by
99- - `Float32`/`Float64`: C standard math library
100- [libm](https://en.wikipedia.org/wiki/C_mathematical_functions#libm).
100+ - `Float32`/`Float64`: Julia implementation of https://github.com/ARM-software/optimized-routines/blob/master/math/erf.c
101101- `BigFloat`: C library for multiple-precision floating-point [MPFR](https://www.mpfr.org/)
102102"""
103+
104+ # Fast erf implementation using a mix of
105+ # rational and polynomial approximations.
106+ # Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0.
107+ function erf (x:: Float64 )
108+ # Minimax approximation of erf
109+ PA= (0x1 .06 eba8214db68p- 3 , - 0x1 .812746 b037948p- 2 , 0x1 . ce2f21a03872p- 4 ,- 0x1 . b82ce30e6548p- 6 , 0x1 .565 bcc360a2f2p- 8 , - 0x1 . c02d812bc979ap- 11 ,0x1 . f99bddfc1ebe9p- 14 , - 0x1 . f42c457cee912p- 17 , 0x1 . b0e414ec20ee9p- 20 ,- 0x1 .18 c47fd143c5ep- 23 )
110+ # Rational approximation on [0x1p-28, 0.84375]
111+ NA= (0x1 .06 eba8214db68p- 3 , - 0x1 .4 cd7d691cb913p- 2 , - 0x1 . d2a51dbd7194fp- 6 ,- 0x1 .7 a291236668e4p- 8 , - 0x1 .8 ead6120016acp- 16 )
112+ DA= (0x1 .97779 cddadc09p- 2 , 0x1 .0 a54c5536cebap- 4 , 0x1 .4 d022c4d36b0fp- 8 ,0x1 .15 dc9221c1a1p- 13 , - 0x1 .09 c4342a2612p- 18 )
113+ # Rational approximation on [0.84375, 1.25]
114+ NB= ( - 0x1 .359 b8bef77538p- 9 , 0x1 . a8d00ad92b34dp- 2 , - 0x1 .7 d240fbb8c3f1p- 2 , 0x1 .45 fca805120e4p- 2 , - 0x1 . c63983d3e28ecp- 4 , 0x1 .22 a36599795ebp- 5 , - 0x1 .1 bf380a96073fp- 9 )
115+ DB= ( 0x1 . b3e6618eee323p- 4 , 0x1 .14 af092eb6f33p- 1 , 0x1 .2635 cd99fe9a7p- 4 , 0x1 .02660e763351 fp- 3 , 0x1 . bedc26b51dd1cp- 7 , 0x1 .88 b545735151dp- 7 )
116+
117+ # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=15 a=1.25 b=2 c=1 d=1.25
118+ PC= ( 0x1 .3 bcd133aa0ffcp- 4 , - 0x1 . e4652fadcb702p- 3 , 0x1 .2 ebf3dcca0446p- 2 , - 0x1 .571 d01c62d66p- 3 , 0x1 .93 a9a8f5b3413p- 8 , 0x1 .8281 cbcc2cd52p- 5 , - 0x1 .5 cffd86b4de16p- 6 , - 0x1 . db4ccf595053ep- 9 , 0x1 .757 cbf8684edap- 8 , - 0x1 . ce7dfd2a9e56ap- 11 , - 0x1 .99 ee3bc5a3263p- 11 , 0x1 .3 c57cf9213f5fp- 12 , 0x1 .60692996 bf254p- 14 , - 0x1 .6e44 cb7c1fa2ap- 14 , 0x1 .9 d4484ac482b2p- 16 , - 0x1 .578 c9e375d37p- 19 )
119+ # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=17 a=2 b=3.25 c=2 d=2
120+ PD= ( 0x1 .328f5 ec350e5p- 8 , - 0x1 .529 b9e8cf8e99p- 5 , 0x1 .529 b9e8cd9e71p- 3 , - 0x1 .8 b0ae3a023bf2p- 2 , 0x1 .1 a2c592599d82p- 1 , - 0x1 . ace732477e494p- 2 , - 0x1 . e1a06a27920ffp- 6 , 0x1 . bae92a6d27af6p- 2 , - 0x1 . a15470fcf5ce7p- 2 , 0x1 . bafe45d18e213p- 6 , 0x1 .0 d950680d199ap- 2 , - 0x1 .8 c9481e8f22e3p- 3 , - 0x1 .158450 ed5c899p- 4 , 0x1 . c01f2973b44p- 3 , - 0x1 .73 ed2827546a7p- 3 , 0x1 .47733687 d1ff7p- 4 , - 0x1 .2 dec70d00b8e1p- 6 , 0x1 . a947ab83cd4fp- 10 )
121+ # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=13 a=3.25 b=4 c=1 d=3.25
122+ PE= ( 0x1 .20 c13035539e4p- 18 , - 0x1 . e9b5e8d16df7ep- 16 , 0x1 .8 de3cd4733bf9p- 14 , - 0x1 .9 aa48beb8382fp- 13 , 0x1 .2 c7d713370a9fp- 12 , - 0x1 .490 b12110b9e2p- 12 , 0x1 .1459 c5d989d23p- 12 , - 0x1 .64 b28e9f1269p- 13 , 0x1 .57 c76d9d05cf8p- 14 , - 0x1 . bf271d9951cf8p- 16 , 0x1 . db7ea4d4535c9p- 19 , 0x1 .91 c2e102d5e49p- 20 , - 0x1 . e9f0826c2149ep- 21 , 0x1 .60 eebaea236e1p- 23 )
123+ # Generated using Sollya::remez(f(c*x+d), deg, [(a-d)/c;(b-d)/c], 1, 1e-16), [|D ...|] with deg=16 a=4 b=5.90625 c=2 d=4
124+ PF= ( 0x1 .08 ddd130d1fa6p- 26 , - 0x1 .10 b146f59ff06p- 22 , 0x1 .10 b135328b7b2p- 19 , - 0x1 .6039988e7575 fp- 17 , 0x1 .497 d365e19367p- 15 , - 0x1 . da48d9afac83ep- 14 , 0x1 .1024 c9b1fbb48p- 12 , - 0x1 . fc962e7066272p- 12 , 0x1 .87297282 d4651p- 11 , - 0x1 . f057b255f8c59p- 11 , 0x1 .0228 d0eee063p- 10 , - 0x1 . b1b21b84ec41cp- 11 , 0x1 .1 ead8ae9e1253p- 11 , - 0x1 .1e708 fba37fccp- 12 , 0x1 .9559363991 edap- 14 , - 0x1 .68 c827b783d9cp- 16 , 0x1 .2 ec4adeccf4a2p- 19 )
125+
126+ C = 0x1 . b0ac16p- 1
127+
128+ TwoOverSqrtPiMinusOne= 0x1 .06 eba8214db69p- 3
129+
130+
131+ # # top 32 bits
132+ ix:: UInt32 = reinterpret (UInt64,x)>> 32
133+ # # top 32, without sign bit
134+ ia:: UInt32 = ix & 0x7fffffff
135+ # # sign
136+ # sign::UInt32=ix>>31
137+
138+ sign:: Bool = x< 0
139+
140+
141+
142+ if (ia < 0x3feb0000 )
143+ # a = |x| < 0.84375.
144+
145+ if (ia < 0x3e300000 )
146+ # a < 2^(-28).
147+ if (ia < 0x00800000 )
148+ # a < 2^(-1015).
149+ y = fma (TwoOverSqrtPiMinusOne, x, x)
150+
151+ # # case of underflow TBD
152+ # return check_uflow (y)
153+ return y
154+ end
155+ return x + TwoOverSqrtPiMinusOne * x
156+ end
157+
158+ x2 = x * x
159+
160+ if (ia < 0x3fe00000 )
161+ # # a < 0.5 - Use polynomial approximation.
162+ r1 = fma (x2, PA[2 ], PA[1 ])
163+ r2 = fma (x2, PA[4 ], PA[3 ])
164+ r3 = fma (x2, PA[6 ], PA[5 ])
165+ r4 = fma (x2, PA[8 ], PA[7 ])
166+ r5 = fma (x2, PA[10 ], PA[9 ])
167+
168+ x4 = x2 * x2
169+ r = r5
170+ r = fma (x4, r, r4)
171+ r = fma (x4, r, r3)
172+ r = fma (x4, r, r2)
173+ r = fma (x4, r, r1)
174+ return fma (r, x, x) # # This fma is crucial for accuracy.
175+ else
176+ # # 0.5 <= a < 0.84375 - Use rational approximation.
177+
178+ r1n = fma (x2, NA[2 ], NA[1 ])
179+ x4 = x2 * x2
180+ r2n = fma (x2, NA[4 ], NA[3 ])
181+ x8 = x4 * x4
182+ r1d = fma (x2, DA[1 ], 1.0 )
183+ r2d = fma (x2, DA[3 ], DA[2 ])
184+ r3d = fma (x2, DA[5 ], DA[4 ])
185+ P = r1n + x4 * r2n + x8 * NA[5 ]
186+
187+ Q = r1d + x4 * r2d + x8 * r3d
188+ return fma (P / Q, x, x)
189+ end
190+ elseif (ia < 0x3ff40000 )
191+ # # 0.84375 <= |x| < 1.25.
192+
193+ a = abs (x) - 1.0
194+ r1n = fma (a, NB[2 ], NB[1 ])
195+ a2 = a * a
196+ r1d = fma (a, DB[1 ], 1.0 )
197+ a4 = a2 * a2
198+ r2n = fma (a, NB[4 ], NB[3 ])
199+ a6 = a4 * a2
200+ r2d = fma (a, DB[3 ], DB[2 ])
201+ r3n = fma (a, NB[6 ], NB[5 ])
202+ r3d = fma (a, DB[5 ], DB[4 ])
203+ r4n = NB[7 ]
204+ r4d = DB[6 ]
205+
206+ P = r1n + a2 * r2n + a4 * r3n + a6 * r4n
207+ Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d
208+ if (sign)
209+ return - C - P / Q
210+ else
211+ return C + P / Q
212+ end
213+ elseif (ia < 0x40000000 )
214+ # # 1.25 <= |x| < 2.0.
215+ a = abs (x)
216+ a = a - 1.25
217+
218+ r1 = fma (a, PC[2 ], PC[1 ])
219+ r2 = fma (a, PC[4 ], PC[3 ])
220+ r3 = fma (a, PC[6 ], PC[5 ])
221+ r4 = fma (a, PC[8 ], PC[7 ])
222+ r5 = fma (a, PC[10 ], PC[9 ])
223+ r6 = fma (a, PC[12 ], PC[11 ])
224+ r7 = fma (a, PC[14 ], PC[13 ])
225+ r8 = fma (a, PC[16 ], PC[15 ])
226+
227+
228+ a2 = a * a
229+
230+ r = r8
231+ r = fma (a2, r, r7)
232+ r = fma (a2, r, r6)
233+ r = fma (a2, r, r5)
234+ r = fma (a2, r, r4)
235+ r = fma (a2, r, r3)
236+ r = fma (a2, r, r2)
237+ r = fma (a2, r, r1)
238+
239+ if (sign)
240+ return - 1.0 + r
241+ else
242+ return 1.0 - r
243+ end
244+ elseif (ia < 0x400a0000 )
245+ # # 2 <= |x| < 3.25.
246+ a = abs (x)
247+ a = fma (0.5 , a, - 1.0 )
248+
249+ r1 = fma (a, PD[2 ], PD[1 ])
250+ r2 = fma (a, PD[4 ], PD[3 ])
251+ r3 = fma (a, PD[6 ], PD[5 ])
252+ r4 = fma (a, PD[8 ], PD[7 ])
253+ r5 = fma (a, PD[10 ], PD[9 ])
254+ r6 = fma (a, PD[12 ], PD[11 ])
255+ r7 = fma (a, PD[14 ], PD[13 ])
256+ r8 = fma (a, PD[16 ], PD[15 ])
257+ r9 = fma (a, PD[18 ], PD[17 ])
258+
259+ a2 = a * a
260+
261+ r = r9
262+ r = fma (a2, r, r8)
263+ r = fma (a2, r, r7)
264+ r = fma (a2, r, r6)
265+ r = fma (a2, r, r5)
266+ r = fma (a2, r, r4)
267+ r = fma (a2, r, r3)
268+ r = fma (a2, r, r2)
269+ r = fma (a2, r, r1)
270+
271+ if (sign)
272+ return - 1.0 + r
273+ else
274+ return 1.0 - r
275+ end
276+ elseif (ia < 0x40100000 )
277+ # # 3.25 <= |x| < 4.0.
278+ a = abs (x)
279+ a = a - 3.25
280+
281+ r1 = fma (a, PE[2 ], PE[1 ])
282+ r2 = fma (a, PE[4 ], PE[3 ])
283+ r3 = fma (a, PE[6 ], PE[5 ])
284+ r4 = fma (a, PE[8 ], PE[7 ])
285+ r5 = fma (a, PE[10 ], PE[9 ])
286+ r6 = fma (a, PE[12 ], PE[11 ])
287+ r7 = fma (a, PE[14 ], PE[13 ])
288+
289+
290+ a2 = a * a
291+
292+ r = r7
293+ r = fma (a2, r, r6)
294+ r = fma (a2, r, r5)
295+ r = fma (a2, r, r4)
296+ r = fma (a2, r, r3)
297+ r = fma (a2, r, r2)
298+ r = fma (a2, r, r1)
299+
300+ if (sign)
301+ return - 1.0 + r
302+ else
303+ return 1.0 - r
304+ end
305+ elseif (ia < 0x4017a000 )
306+ # # 4 <= |x| < 5.90625.
307+ a = abs (x)
308+ a = fma (0.5 , a, - 2.0 )
309+
310+ r1 = fma (a, PF[2 ], PF[1 ])
311+ r2 = fma (a, PF[4 ], PF[3 ])
312+ r3 = fma (a, PF[6 ], PF[5 ])
313+ r4 = fma (a, PF[8 ], PF[7 ])
314+ r5 = fma (a, PF[10 ], PF[9 ])
315+ r6 = fma (a, PF[12 ], PF[11 ])
316+ r7 = fma (a, PF[14 ], PF[13 ])
317+ r8 = fma (a, PF[16 ], PF[15 ])
318+
319+ r9 = PF[17 ]
320+
321+ a2 = a * a
322+
323+ r = r9
324+ r = fma (a2, r, r8)
325+ r = fma (a2, r, r7)
326+ r = fma (a2, r, r6)
327+ r = fma (a2, r, r5)
328+ r = fma (a2, r, r4)
329+ r = fma (a2, r, r3)
330+ r = fma (a2, r, r2)
331+ r = fma (a2, r, r1)
332+
333+ if (sign)
334+ return - 1.0 + r
335+ else
336+ return 1.0 - r
337+ end
338+ else
339+
340+
341+ if (sign)
342+ return - 1.0
343+ else
344+ return 1.0
345+ end
346+
347+ end
348+
349+
350+ end
351+
352+ erf (x:: Float32 )= Float32 (erf (Float64 (x)))
353+
354+ erf (x:: Float16 )= Float16 (erf (Float64 (x)))
355+
356+
103357function erf end
104358"""
105359 erf(x, y)
0 commit comments