@@ -225,6 +225,247 @@ ndarray_dd_t ndarray_dd_rsqrt(ndarray_dd_t a) {
225225 return ndarray_dd_div (ndarray_dd_from_double (1.0 ), r );
226226}
227227
228+ /* ── DD-precision transcendentals ───────────────────────────────────────── */
229+
230+ /* Pre-computed DD constants, accurate to ~106 bits. Each is split so
231+ `hi` is the closest fp64 approximation and `lo` is the residual.
232+ Verified against libquadmath: `(double)ln2_hi + (double)ln2_lo` matches
233+ `logq(2.0Q)` to within 1 DD ULP. */
234+ static const ndarray_dd_t DD_LN2 = {
235+ /* ln(2) = 0.69314718055994530941723212145817... */
236+ 0.6931471805599453 , /* hi: 0.693147180559945286... (closest double) */
237+ 2.3190468138462996e-17 /* lo: residual = ln2 - hi */
238+ };
239+ static const ndarray_dd_t DD_LN10 = {
240+ /* ln(10) = 2.30258509299404568401799145468... */
241+ 2.302585092994046 ,
242+ -2.1707562233822494e-16
243+ };
244+ static const ndarray_dd_t DD_LOG2_E = {
245+ /* 1/ln(2) = 1.44269504088896340735992468100... */
246+ 1.4426950408889634 ,
247+ 2.0355273740931033e-17
248+ };
249+ static const ndarray_dd_t DD_LOG10_E = {
250+ /* 1/ln(10) = 0.43429448190325182765112891891... */
251+ 0.4342944819032518 ,
252+ 1.0983196502167645e-17
253+ };
254+
255+ /**
256+ * @brief DD-precision exp(x).
257+ *
258+ * Range reduction: write x = k·ln(2) + r with k = round(x / ln(2)) and
259+ * |r| ≤ ln(2)/2 ≈ 0.347. Then exp(x) = 2^k · exp(r); the 2^k factor
260+ * is exact in fp64 (just shifts the exponent), and exp(r) is evaluated
261+ * via the Taylor series 1 + r + r²/2! + r³/3! + … in DD arithmetic
262+ * using Horner's method. Twenty terms suffice for |r| ≤ 0.347 because
263+ * the (i+1)-th term shrinks by factor ≤ 0.347/(i+1) — at i = 19 the
264+ * term magnitude is well below DD epsilon (~2⁻¹⁰⁶ ≈ 1.2e-32).
265+ *
266+ * Handles overflow (`exp(x) > DBL_MAX`) by returning +inf and underflow
267+ * (`exp(x) < DBL_MIN_SUBNORMAL`) by returning 0. NaN propagates.
268+ *
269+ * @param[in] a Input DD value.
270+ * @return exp(a) in DD precision.
271+ */
272+ ndarray_dd_t ndarray_dd_exp (ndarray_dd_t a ) {
273+ if (ndarray_dd_isnan (a )) return a ;
274+ if (isinf (a .hi )) return ndarray_dd_from_double (a .hi > 0 ? INFINITY : 0.0 );
275+ /* Fast over/underflow guards — exp(±709.78…) is the fp64 edge. */
276+ if (a .hi > 709.7827 ) return ndarray_dd_from_double (INFINITY );
277+ if (a .hi < -745.1332 ) return ndarray_dd_from_double (0.0 );
278+ /* Range reduction: k = round(x · log2_e), r = x − k · ln2. */
279+ double k_d = round (a .hi * 1.4426950408889634 );
280+ int k = (int )k_d ;
281+ ndarray_dd_t k_dd = ndarray_dd_from_double (k_d );
282+ ndarray_dd_t r = ndarray_dd_sub (a , ndarray_dd_mul (k_dd , DD_LN2 ));
283+
284+ /* Horner evaluation of 1 + r·(1 + r/2·(1 + r/3·(… + r/20))) */
285+ ndarray_dd_t result = ndarray_dd_from_double (1.0 );
286+ for (int i = 20 ; i >= 1 ; i -- ) {
287+ /* result = 1 + (r/i) · result */
288+ ndarray_dd_t r_over_i = ndarray_dd_div (r , ndarray_dd_from_double ((double )i ));
289+ result = ndarray_dd_add (ndarray_dd_from_double (1.0 ),
290+ ndarray_dd_mul (r_over_i , result ));
291+ }
292+
293+ /* Scale by 2^k using ldexp on each limb — exact, exponent-only op. */
294+ result .hi = ldexp (result .hi , k );
295+ result .lo = ldexp (result .lo , k );
296+ return result ;
297+ }
298+
299+ /**
300+ * @brief DD-precision expm1(x) = exp(x) − 1.
301+ *
302+ * Near zero, computing `exp(x) − 1` directly suffers catastrophic
303+ * cancellation. Use the Taylor series of expm1 itself for |x| ≤ 0.5:
304+ * expm1(x) = x + x²/2! + x³/3! + … = x · (1 + x/2 · (1 + x/3 · (…)))
305+ * which converges with 25 terms at full DD precision. For larger |x|
306+ * the cancellation is negligible — defer to `exp(x) − 1`.
307+ *
308+ * @param[in] a Input DD value.
309+ * @return exp(a) − 1 in DD precision.
310+ */
311+ ndarray_dd_t ndarray_dd_expm1 (ndarray_dd_t a ) {
312+ if (ndarray_dd_isnan (a )) return a ;
313+ if (a .hi >= 0.5 || a .hi <= -0.5 ) {
314+ return ndarray_dd_sub (ndarray_dd_exp (a ), ndarray_dd_from_double (1.0 ));
315+ }
316+ /* Horner of x·(1 + x/2·(1 + x/3·(…))) — start at i=25 to capture
317+ (0.5)^25/25! ≈ 1.9e-33 < DD eps. */
318+ ndarray_dd_t result = ndarray_dd_from_double (1.0 );
319+ for (int i = 25 ; i >= 2 ; i -- ) {
320+ ndarray_dd_t a_over_i = ndarray_dd_div (a , ndarray_dd_from_double ((double )i ));
321+ result = ndarray_dd_add (ndarray_dd_from_double (1.0 ),
322+ ndarray_dd_mul (a_over_i , result ));
323+ }
324+ return ndarray_dd_mul (a , result );
325+ }
326+
327+ /**
328+ * @brief DD-precision log(x) (natural logarithm).
329+ *
330+ * Range reduction: write x = m · 2^e via `frexp`, so m ∈ [0.5, 1). To
331+ * keep the substitution `u = (m − 1)/(m + 1)` small we conditionally
332+ * shift m into [√0.5, √2) ≈ [0.707, 1.414); then |u| ≤ 0.172. The
333+ * atanh-style series ln(m) = 2·(u + u³/3 + u⁵/5 + u⁷/7 + …) converges
334+ * about twice as fast as the plain Taylor of ln(1+y) because the
335+ * even-power terms vanish. Eleven odd terms (u^21/21) give ~30 sig
336+ * digits at the |u| ≤ 0.172 boundary.
337+ *
338+ * Final: log(x) = 2·Σ + e·ln(2).
339+ *
340+ * NaN / negative / zero handling:
341+ * log(NaN) → NaN, log(<0) → NaN, log(0) → −inf, log(+inf) → +inf.
342+ *
343+ * @param[in] a Input DD value (must be > 0 for a finite result).
344+ * @return log(a) in DD precision.
345+ */
346+ ndarray_dd_t ndarray_dd_log (ndarray_dd_t a ) {
347+ if (ndarray_dd_isnan (a )) return a ;
348+ if (a .hi < 0.0 ) return ndarray_dd_from_double (NAN );
349+ if (a .hi == 0.0 && a .lo == 0.0 ) return ndarray_dd_from_double (- INFINITY );
350+ if (isinf (a .hi )) return ndarray_dd_from_double (INFINITY );
351+
352+ /* Decompose hi = m · 2^e so m ∈ [0.5, 1). The lo limb is folded back
353+ in DD multiplication. */
354+ int e ;
355+ double m_hi = frexp (a .hi , & e );
356+ /* Re-normalize the DD pair after stripping 2^e: dd = a / 2^e. */
357+ ndarray_dd_t m = ndarray_dd_from_pair (m_hi , ldexp (a .lo , - e ));
358+
359+ /* Bring m into [sqrt(0.5), sqrt(2)) so |u| ≤ ~0.172. */
360+ if (m .hi < 0.7071067811865476 ) {
361+ m = ndarray_dd_add (m , m ); /* m · 2 */
362+ e -= 1 ;
363+ }
364+
365+ /* u = (m − 1) / (m + 1). */
366+ ndarray_dd_t one = ndarray_dd_from_double (1.0 );
367+ ndarray_dd_t u = ndarray_dd_div (ndarray_dd_sub (m , one ),
368+ ndarray_dd_add (m , one ));
369+ ndarray_dd_t u2 = ndarray_dd_mul (u , u );
370+
371+ /* 2·atanh(u) = 2·(u + u³/3 + u⁵/5 + … + u^(2N-1)/(2N-1)). For
372+ |u| ≤ 0.172 (the post-shift range), 2N-1 = 51 gives the worst-
373+ case truncated term u^51/51 ≈ 4·10⁻⁴² — well below DD epsilon.
374+ Use `dd_div` for the 1/k constants; `dd_from_double(1.0 / k)`
375+ would only have fp64 precision in the constant. */
376+ ndarray_dd_t sum = ndarray_dd_div (one , ndarray_dd_from_double (51.0 ));
377+ for (int k = 49 ; k >= 1 ; k -= 2 ) {
378+ ndarray_dd_t inv_k = ndarray_dd_div (one , ndarray_dd_from_double ((double )k ));
379+ sum = ndarray_dd_add (inv_k , ndarray_dd_mul (u2 , sum ));
380+ }
381+ /* Multiply by 2u to get the series sum. */
382+ ndarray_dd_t log_m = ndarray_dd_mul (u , sum );
383+ log_m = ndarray_dd_add (log_m , log_m ); /* · 2 */
384+
385+ /* log(x) = log(m) + e · ln(2). */
386+ ndarray_dd_t e_dd = ndarray_dd_from_double ((double )e );
387+ return ndarray_dd_add (log_m , ndarray_dd_mul (e_dd , DD_LN2 ));
388+ }
389+
390+ /**
391+ * @brief DD-precision log1p(x) = log(1 + x).
392+ *
393+ * For |x| ≤ 0.5 use the Taylor series directly:
394+ * log1p(x) = x − x²/2 + x³/3 − x⁴/4 + …
395+ * evaluated via Horner so the cancellation at small x is avoided.
396+ * For |x| > 0.5 fall back to `log(1 + x)` — 1 + x has no cancellation
397+ * there.
398+ *
399+ * @param[in] a Input DD value (a > −1 for a finite result).
400+ * @return log(1 + a) in DD precision.
401+ */
402+ ndarray_dd_t ndarray_dd_log1p (ndarray_dd_t a ) {
403+ if (ndarray_dd_isnan (a )) return a ;
404+ /* `dd_add(1, a)` preserves DD precision even when |a| is at DD
405+ epsilon — the lo limb of the sum captures the contribution of `a`
406+ past fp64's 53 bits. So `dd_log(1 + a)` is precision-faithful
407+ across the full input range without needing a Taylor branch. */
408+ return ndarray_dd_log (ndarray_dd_add (ndarray_dd_from_double (1.0 ), a ));
409+ }
410+
411+ /**
412+ * @brief DD-precision exp2(x) = 2^x.
413+ *
414+ * Implemented as `exp(x · ln(2))` so the existing DD exp drives the
415+ * precision; the multiplication by `DD_LN2` is exact at DD precision
416+ * because `x` is the only fp64-tier input.
417+ *
418+ * @param[in] a Input DD value.
419+ * @return 2^a in DD precision.
420+ */
421+ ndarray_dd_t ndarray_dd_exp2 (ndarray_dd_t a ) {
422+ return ndarray_dd_exp (ndarray_dd_mul (a , DD_LN2 ));
423+ }
424+
425+ /**
426+ * @brief DD-precision log2(x) = log(x) / ln(2) = log(x) · log2(e).
427+ *
428+ * Special-case exact integer powers of two so `log2(2^k)` returns
429+ * exactly `k` (matching the GPU `tcuda_log2_fp` short-circuit and
430+ * the CPU libm guarantee on libquadmath builds).
431+ *
432+ * @param[in] a Input DD value (a > 0 for a finite result).
433+ * @return log2(a) in DD precision.
434+ */
435+ ndarray_dd_t ndarray_dd_log2 (ndarray_dd_t a ) {
436+ /* Power-of-2 short-circuit: frexp(2^k) = (0.5, k+1). */
437+ if (a .lo == 0.0 && isfinite (a .hi ) && a .hi > 0.0 ) {
438+ int e ;
439+ double m = frexp (a .hi , & e );
440+ if (m == 0.5 ) return ndarray_dd_from_double ((double )(e - 1 ));
441+ }
442+ return ndarray_dd_mul (ndarray_dd_log (a ), DD_LOG2_E );
443+ }
444+
445+ /**
446+ * @brief DD-precision log10(x) = log(x) · log10(e).
447+ *
448+ * @param[in] a Input DD value (a > 0 for a finite result).
449+ * @return log10(a) in DD precision.
450+ */
451+ ndarray_dd_t ndarray_dd_log10 (ndarray_dd_t a ) {
452+ return ndarray_dd_mul (ndarray_dd_log (a ), DD_LOG10_E );
453+ }
454+
455+ /**
456+ * @brief DD-precision logb(x) — binary exponent of |x| as a DD integer.
457+ *
458+ * `logb(x)` is defined as `floor(log2(|x|))` for finite normal x and
459+ * is already integer-valued, so the result fits exactly in fp64. The
460+ * `lo` limb of the result is always 0.
461+ *
462+ * @param[in] a Input DD value.
463+ * @return logb(a) in DD precision.
464+ */
465+ ndarray_dd_t ndarray_dd_logb (ndarray_dd_t a ) {
466+ return ndarray_dd_from_double (logb (a .hi ));
467+ }
468+
228469/* ── int conversion ─────────────────────────────────────────────────────── */
229470
230471long long ndarray_dd_to_int64 (ndarray_dd_t a ) {
0 commit comments