RubixML
diff --git a/‎numpower.c‎
Lines changed: 4 additions & 2 deletions b/‎numpower.c‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎src/dd_math.c‎
Lines changed: 38 additions & 22 deletions b/‎src/dd_math.c‎
Lines changed: 38 additions & 22 deletions
diff --git a/‎src/ndmath/arithmetics.c‎
Lines changed: 94 additions & 11 deletions b/‎src/ndmath/arithmetics.c‎
Lines changed: 94 additions & 11 deletions
@@ -3717,9 +3717,11 @@ static NDArray *ndarray_resolve_unary_input(zval *array, int *owned)
                 zend_throw_error(NULL,
                     "Numeric string expected, got a whitespace-only value.");
             } else {
+                /* Length-bounded (`%.*s`): at most `n` bytes of `p` are
+                   read; output still ends at the first embedded NUL. */
                 zend_throw_error(NULL,
-                    "Numeric string expected, got malformed literal: \"%s\".",
-                    p);
+                    "Numeric string expected, got malformed literal: \"%.*s\".",
+                    (int)n, p);
             }
             return NULL;
         }
 
@@ -236,11 +236,6 @@ static const ndarray_dd_t DD_LN2 = {
      0.6931471805599453,    /* hi: 0.693147180559945286... (closest double) */
      2.3190468138462996e-17 /* lo: residual = ln2 - hi   */
 };
-static const ndarray_dd_t DD_LN10 = {
-    /* ln(10) = 2.30258509299404568401799145468... */
-     2.302585092994046,
-    -2.1707562233822494e-16
-};
 static const ndarray_dd_t DD_LOG2_E = {
     /* 1/ln(2) = 1.44269504088896340735992468100... */
      1.4426950408889634,
@@ -259,9 +254,13 @@ static const ndarray_dd_t DD_LOG10_E = {
  * |r| ≤ ln(2)/2 ≈ 0.347. Then exp(x) = 2^k · exp(r); the 2^k factor
  * is exact in fp64 (just shifts the exponent), and exp(r) is evaluated
  * via the Taylor series 1 + r + r²/2! + r³/3! + … in DD arithmetic
- * using Horner's method. Twenty terms suffice for |r| ≤ 0.347 because
- * the (i+1)-th term shrinks by factor ≤ 0.347/(i+1) — at i = 19 the
- * term magnitude is well below DD epsilon (~2⁻¹⁰⁶ ≈ 1.2e-32).
+ * using Horner's method. The series is summed through r²⁴/24!: at the
+ * worst-case |r| ≤ ln(2)/2 ≈ 0.3466 the first omitted term r²⁵/25! ≈
+ * 2.0e-37 is far below DD epsilon (~2⁻¹⁰⁶ ≈ 1.2e-32), so the result
+ * carries full ~32-digit DD precision. (Twenty terms — the original
+ * cutoff — left r²¹/21! ≈ 4.2e-30 in the remainder, capping accuracy
+ * at ~29 digits and making the GPU DD path diverge from the CPU
+ * libquadmath path at the 31st digit.)
  *
  * Handles overflow (`exp(x) > DBL_MAX`) by returning +inf and underflow
  * (`exp(x) < DBL_MIN_SUBNORMAL`) by returning 0. NaN propagates.
@@ -281,9 +280,9 @@ ndarray_dd_t ndarray_dd_exp(ndarray_dd_t a) {
     ndarray_dd_t k_dd = ndarray_dd_from_double(k_d);
     ndarray_dd_t r    = ndarray_dd_sub(a, ndarray_dd_mul(k_dd, DD_LN2));
 
-    /* Horner evaluation of 1 + r·(1 + r/2·(1 + r/3·(… + r/20))) */
+    /* Horner evaluation of 1 + r·(1 + r/2·(1 + r/3·(… + r/24))) */
     ndarray_dd_t result = ndarray_dd_from_double(1.0);
-    for (int i = 20; i >= 1; i--) {
+    for (int i = 24; i >= 1; i--) {
         /* result = 1 + (r/i) · result */
         ndarray_dd_t r_over_i = ndarray_dd_div(r, ndarray_dd_from_double((double)i));
         result = ndarray_dd_add(ndarray_dd_from_double(1.0),
@@ -332,8 +331,8 @@ ndarray_dd_t ndarray_dd_expm1(ndarray_dd_t a) {
  * shift m into [√0.5, √2) ≈ [0.707, 1.414); then |u| ≤ 0.172. The
  * atanh-style series ln(m) = 2·(u + u³/3 + u⁵/5 + u⁷/7 + …) converges
  * about twice as fast as the plain Taylor of ln(1+y) because the
- * even-power terms vanish. Eleven odd terms (u^21/21) give ~30 sig
- * digits at the |u| ≤ 0.172 boundary.
+ * even-power terms vanish. Twenty-six odd terms (through u^51/51) give
+ * full ~32-digit DD precision at the |u| ≤ 0.172 boundary.
  *
  * Final: log(x) = 2·Σ + e·ln(2).
  *
@@ -390,22 +389,39 @@ ndarray_dd_t ndarray_dd_log(ndarray_dd_t a) {
 /**
  * @brief DD-precision log1p(x) = log(1 + x).
  *
- * For |x| ≤ 0.5 use the Taylor series directly:
- *     log1p(x) = x − x²/2 + x³/3 − x⁴/4 + …
- * evaluated via Horner so the cancellation at small x is avoided.
- * For |x| > 0.5 fall back to `log(1 + x)` — 1 + x has no cancellation
- * there.
+ * For small |x|, forming 1 + x and handing it to `dd_log` pushes x's
+ * low-order information into the lo limb of the sum, where the range
+ * reduction can round it away. Use instead the identity
+ *     log1p(x) = 2·atanh(u),  u = x / (2 + x),
+ * on −0.3 < x < 0.5. There 2 + x lies in (1.7, 2.5), so the DD
+ * add/divide keep x's lo limb, and |u| ≤ 0.2 (u ≈ −0.176 at x = −0.3,
+ * u = 0.2 at x = 0.5), so truncating 2·(u + u³/3 + … + u⁵¹/51) after
+ * 26 odd terms leaves a relative remainder ≈ u⁵²/53 < 1e-38. The lower
+ * cut is −0.3 rather than −0.5 because u → −1/3 as x → −0.5, where
+ * that remainder (≈ 3e-27) would exceed DD epsilon. Outside the window
+ * defer to `dd_log(1 + x)`, which also covers x ≤ −1 and +inf.
  *
  * @param[in] a Input DD value (a > −1 for a finite result).
  * @return log(1 + a) in DD precision.
  */
 ndarray_dd_t ndarray_dd_log1p(ndarray_dd_t a) {
     if (ndarray_dd_isnan(a)) return a;
-    /* `dd_add(1, a)` preserves DD precision even when |a| is at DD
-       epsilon — the lo limb of the sum captures the contribution of `a`
-       past fp64's 53 bits. So `dd_log(1 + a)` is precision-faithful
-       across the full input range without needing a Taylor branch. */
-    return ndarray_dd_log(ndarray_dd_add(ndarray_dd_from_double(1.0), a));
+    if (a.hi >= 0.5 || a.hi <= -0.3) {
+        return ndarray_dd_log(ndarray_dd_add(ndarray_dd_from_double(1.0), a));
+    }
+    ndarray_dd_t one = ndarray_dd_from_double(1.0);
+    ndarray_dd_t u   = ndarray_dd_div(a,
+                           ndarray_dd_add(ndarray_dd_from_double(2.0), a));
+    ndarray_dd_t u2  = ndarray_dd_mul(u, u);
+    /* Same 26-odd-term atanh ladder as ndarray_dd_log; the branch window
+       keeps |u| ≤ 0.2, so the truncated term u^53/53 is below DD epsilon. */
+    ndarray_dd_t sum = ndarray_dd_div(one, ndarray_dd_from_double(51.0));
+    for (int k = 49; k >= 1; k -= 2) {
+        ndarray_dd_t inv_k = ndarray_dd_div(one, ndarray_dd_from_double((double)k));
+        sum = ndarray_dd_add(inv_k, ndarray_dd_mul(u2, sum));
+    }
+    ndarray_dd_t r = ndarray_dd_mul(u, sum);
+    return ndarray_dd_add(r, r);  /* · 2 */
 }
 
 /**
 
@@ -3627,13 +3627,28 @@ static int unary_validate_numeric_string(const char *str, const char *which) {
         return -1;
     }
     const char *p = unary_skip_sign_ws(str);
-    /* Accept inf / nan tokens (case-insensitive, with optional trailing
-       junk consistent with strtod's contract). */
+    /* Accept inf / infinity / nan tokens (case-insensitive). The token
+       must consume the rest of the (trimmed) literal — trailing junk
+       such as "infX" / "nanZ" is rejected rather than silently read as
+       a valid prefix the way strtod would, mirroring the strict array-
+       input inferrer `ndarray_infer_dtype_from_string`. */
     char low3[4] = {0};
     for (int i = 0; i < 3 && p[i]; i++) {
         low3[i] = (char)(p[i] | 0x20);
     }
     if (!strncmp(low3, "inf", 3) || !strncmp(low3, "nan", 3)) {
+        const char *t = p + 3;
+        if (low3[0] == 'i') {                     /* maybe the "infinity" spelling */
+            char low5[6] = {0};
+            for (int i = 0; i < 5 && t[i]; i++) low5[i] = (char)(t[i] | 0x20);
+            if (!strncmp(low5, "inity", 5)) t += 5;
+        }
+        while (*t == ' ' || *t == '\t' || *t == '\n' || *t == '\r') t++;
+        if (*t != '\0') {
+            zend_throw_error(NULL,
+                "NDArray clip: '%s' is not a valid number: %s.", which, str);
+            return -1;
+        }
         return 0;
     }
     int saw_digit = 0;
@@ -3670,12 +3685,65 @@ static int unary_validate_numeric_string(const char *str, const char *which) {
 /**
  * @brief Skip leading ASCII whitespace, returning the first non-space char's
  *        pointer. Mirrors `strtoll`'s leading-whitespace handling.
+ *
+ * @param[in] s NUL-terminated string; only ' ', '\t', '\n', '\r' are skipped.
+ * @return Pointer into @p s at the first character outside that set (the
+ *         terminating NUL when @p s is empty or consists only of them).
  */
 static inline const char *unary_skip_ws(const char *s) {
     while (*s == ' ' || *s == '\t' || *s == '\n' || *s == '\r') s++;
     return s;
 }
 
+/** Classification of a clip-bound literal: ordinary number, +inf, −inf, or nan. */
+typedef enum {
+    UNARY_FINITE = 0, UNARY_POS_INF, UNARY_NEG_INF, UNARY_NAN
+} unary_special_t;
+
+/**
+ * @brief Tell whether a validated clip-bound literal spells ±inf or nan,
+ *        matching case-insensitively after an optional leading sign.
+ *
+ * @param[in] str NUL-terminated literal that already passed validation.
+ * @return `UNARY_POS_INF`, `UNARY_NEG_INF` or `UNARY_NAN`; else `UNARY_FINITE`.
+ */
+static unary_special_t unary_classify_special(const char *str) {
+    const char *cur = unary_skip_ws(str);
+    const int negative = (*cur == '-');
+    if (negative || *cur == '+') cur++;
+    char tok[4] = {0};               /* lowercased 3-char prefix, NUL-padded */
+    for (int k = 0; k < 3 && cur[k] != '\0'; k++) tok[k] = (char)(cur[k] | 0x20);
+    if (strncmp(tok, "nan", 3) == 0) return UNARY_NAN;
+    if (strncmp(tok, "inf", 3) != 0) return UNARY_FINITE;
+    return negative ? UNARY_NEG_INF : UNARY_POS_INF;
+}
+
+/**
+ * @brief Store an integer dtype's minimum or maximum value in @p out_buf.
+ *
+ * Lets an inf/nan clip bound behave as "no bound" (PyTorch semantics) on
+ * the 8 integer dtypes, where strtoll/strtoull would read no digits from
+ * the token and produce 0, collapsing the clip range.
+ *
+ * @param[in]  dt       Canonical dtype string.
+ * @param[in]  want_max Non-zero selects the maximum, zero the minimum
+ *                      (which is 0 for the unsigned dtypes).
+ * @param[out] out_buf  Destination, at least `elsize(dt)` bytes.
+ * @return 1 if @p dt is an integer dtype (value stored), otherwise 0.
+ */
+static int unary_write_int_extreme(const char *dt, int want_max, void *out_buf) {
+    if      (strcmp(dt, "int8")   == 0) *(int8_t   *)out_buf = want_max ? INT8_MAX   : INT8_MIN;
+    else if (strcmp(dt, "int16")  == 0) *(int16_t  *)out_buf = want_max ? INT16_MAX  : INT16_MIN;
+    else if (strcmp(dt, "int32")  == 0) *(int32_t  *)out_buf = want_max ? INT32_MAX  : INT32_MIN;
+    else if (strcmp(dt, "int64")  == 0) *(int64_t  *)out_buf = want_max ? INT64_MAX  : INT64_MIN;
+    else if (strcmp(dt, "uint8")  == 0) *(uint8_t  *)out_buf = want_max ? UINT8_MAX  : 0;
+    else if (strcmp(dt, "uint16") == 0) *(uint16_t *)out_buf = want_max ? UINT16_MAX : 0;
+    else if (strcmp(dt, "uint32") == 0) *(uint32_t *)out_buf = want_max ? UINT32_MAX : 0;
+    else if (strcmp(dt, "uint64") == 0) *(uint64_t *)out_buf = want_max ? UINT64_MAX : 0;
+    else return 0;                      /* float or unknown dtype: nothing written */
+    return 1;
+}
+
 /**
  * @brief Parse @p str into the typed scalar buffer @p out_buf for @p dt.
  *
@@ -3703,6 +3771,21 @@ static int unary_parse_typed_scalar(const char *dt, const char *str,
                                      const char *which, void *out_buf) {
     if (unary_validate_numeric_string(str, which) < 0) return -1;
 
+    /* inf / nan bounds on integer dtypes: strtoll/strtoull read zero
+       digits from the token and yield 0, which would collapse the clip
+       range. Map the token to the dtype's representable extreme so an
+       inf bound acts as PyTorch's "no bound" (−inf → MIN, +inf → MAX),
+       and a nan bound becomes the no-op extreme for whichever side it
+       sits on (min → MIN, max → MAX), matching how the float path
+       silently ignores a nan bound. Float dtypes fall through so strtod
+       yields a real ±inf / nan. */
+    unary_special_t sp = unary_classify_special(str);
+    if (sp != UNARY_FINITE) {
+        int want_max = (sp == UNARY_POS_INF) ||
+                       (sp == UNARY_NAN && !strcmp(which, "max"));
+        if (unary_write_int_extreme(dt, want_max, out_buf)) return 0;
+    }
+
     /* Narrow integer dtypes — saturate the bound to the dtype range so
        out-of-range literals don't wrap via the implicit `(T)strtoll(...)`
        cast inside `ndarray_set_from_string`. int64/uint64 keep the
@@ -3713,39 +3796,39 @@ static int unary_parse_typed_scalar(const char *dt, const char *str,
     if (!strcmp(dt, "uint8")) {
         if (is_neg) { *(uint8_t *)out_buf = 0; return 0; }
         unsigned long long v = strtoull(p, NULL, 10);
-        *(uint8_t *)out_buf = (uint8_t)(v > 0xFFu ? 0xFFu : v);
+        *(uint8_t *)out_buf = (uint8_t)(v > UINT8_MAX ? UINT8_MAX : v);
         return 0;
     }
     if (!strcmp(dt, "uint16")) {
         if (is_neg) { *(uint16_t *)out_buf = 0; return 0; }
         unsigned long long v = strtoull(p, NULL, 10);
-        *(uint16_t *)out_buf = (uint16_t)(v > 0xFFFFu ? 0xFFFFu : v);
+        *(uint16_t *)out_buf = (uint16_t)(v > UINT16_MAX ? UINT16_MAX : v);
         return 0;
     }
     if (!strcmp(dt, "uint32")) {
         if (is_neg) { *(uint32_t *)out_buf = 0; return 0; }
         unsigned long long v = strtoull(p, NULL, 10);
-        *(uint32_t *)out_buf = (uint32_t)(v > 0xFFFFFFFFu ? 0xFFFFFFFFu : v);
+        *(uint32_t *)out_buf = (uint32_t)(v > UINT32_MAX ? UINT32_MAX : v);
         return 0;
     }
     if (!strcmp(dt, "int8")) {
         long long v = strtoll(str, NULL, 10);
-        if (v >  0x7F)            v =  0x7F;
-        else if (v < -0x80)       v = -0x80;
+        if (v > INT8_MAX)      v = INT8_MAX;
+        else if (v < INT8_MIN) v = INT8_MIN;
         *(int8_t *)out_buf = (int8_t)v;
         return 0;
     }
     if (!strcmp(dt, "int16")) {
         long long v = strtoll(str, NULL, 10);
-        if (v >  0x7FFF)          v =  0x7FFF;
-        else if (v < -0x8000)     v = -0x8000;
+        if (v > INT16_MAX)      v = INT16_MAX;
+        else if (v < INT16_MIN) v = INT16_MIN;
         *(int16_t *)out_buf = (int16_t)v;
         return 0;
     }
     if (!strcmp(dt, "int32")) {
         long long v = strtoll(str, NULL, 10);
-        if (v >  0x7FFFFFFFLL)    v =  0x7FFFFFFFLL;
-        else if (v < -0x80000000LL) v = -0x80000000LL;
+        if (v > INT32_MAX)      v = INT32_MAX;
+        else if (v < INT32_MIN) v = INT32_MIN;
         *(int32_t *)out_buf = (int32_t)v;
         return 0;
     }
Original file line number	Diff line number	Diff line change
`@@ -3717,9 +3717,11 @@ static NDArray ndarray_resolve_unary_input(zval array, int *owned)`
`3717`	`3717`	`zend_throw_error(NULL,`
`3718`	`3718`	`"Numeric string expected, got a whitespace-only value.");`
`3719`	`3719`	`} else {`
	`3720`	+ /* Length-bounded (`%.*s`) so an embedded NUL doesn't
	`3721`	`+ truncate the offending literal in the diagnostic. */`
`3720`	`3722`	`zend_throw_error(NULL,`
`3721`		`- "Numeric string expected, got malformed literal: \"%s\".",`
`3722`		`- p);`
	`3723`	`+ "Numeric string expected, got malformed literal: \"%.*s\".",`
	`3724`	`+ (int)n, p);`
`3723`	`3725`	`}`
`3724`	`3726`	`return NULL;`
`3725`	`3727`	`}`