RubixML
diff --git a/‎numpower.c‎
Lines changed: 158 additions & 238 deletions b/‎numpower.c‎
Lines changed: 158 additions & 238 deletions
diff --git a/‎src/dd_math.c‎
Lines changed: 49 additions & 0 deletions b/‎src/dd_math.c‎
Lines changed: 49 additions & 0 deletions
diff --git a/‎src/dd_math.h‎
Lines changed: 7 additions & 0 deletions b/‎src/dd_math.h‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎src/ndarray_types.c‎
Lines changed: 54 additions & 0 deletions b/‎src/ndarray_types.c‎
Lines changed: 54 additions & 0 deletions
diff --git a/‎src/ndarray_types.h‎
Lines changed: 19 additions & 0 deletions b/‎src/ndarray_types.h‎
Lines changed: 19 additions & 0 deletions
@@ -176,6 +176,55 @@ ndarray_dd_t ndarray_dd_pow(ndarray_dd_t a, ndarray_dd_t b) {
     return ndarray_dd_from_double(pow(a.hi, b.hi));
 }
 
+/* ── sqrt / rsqrt ──────────────────────────────────────────────────────── */
+
+/**
+ * @brief Double-double square root.
+ *
+ * Seeds with the fp64 square root of the high word and applies a single
+ * Newton correction evaluated in DD arithmetic:
+ *   y' = y + (a - y*y) / (2*y)
+ * which is algebraically the same step as y' = 0.5 * (y + a / y). One
+ * step takes the 53-bit seed to roughly full DD precision (~106 bits).
+ *
+ * Special cases:
+ *   - zero (hi == 0 and lo == 0) is returned unchanged, so no division by
+ *     zero takes place;
+ *   - hi < 0 or a NaN input yields NaN;
+ *   - hi == +inf yields +inf. Without this guard the seed is inf and the
+ *     residual a - y*y would evaluate inf - inf, turning the result
+ *     into NaN.
+ *
+ * @param[in] a Non-negative DD input.
+ * @return DD square root of @p a.
+ */
+ndarray_dd_t ndarray_dd_sqrt(ndarray_dd_t a) {
+    if (a.hi == 0.0 && a.lo == 0.0) return a;
+    if (a.hi < 0.0 || ndarray_dd_isnan(a)) {
+        return ndarray_dd_from_double(NAN);
+    }
+    /* Negative infinity was rejected above, so this only matches +inf. */
+    if (isinf(a.hi)) {
+        return ndarray_dd_from_double(a.hi);
+    }
+    double y = sqrt(a.hi);
+    /* y' = y + (a - y*y) / (2*y) — refines the seed from fp64 (53-bit) to
+       full DD precision in one pass. */
+    ndarray_dd_t y_dd  = ndarray_dd_from_double(y);
+    ndarray_dd_t y_sq  = ndarray_dd_mul(y_dd, y_dd);
+    ndarray_dd_t diff  = ndarray_dd_sub(a, y_sq);
+    ndarray_dd_t denom = ndarray_dd_from_double(2.0 * y);
+    ndarray_dd_t corr  = ndarray_dd_div(diff, denom);
+    return ndarray_dd_add(y_dd, corr);
+}
+
+/**
+ * @brief Double-double reciprocal square root, `1 / sqrt(a)`.
+ *
+ * For ordinary positive inputs this is computed as a DD division of 1.0
+ * by ndarray_dd_sqrt(a), rather than a dedicated Newton iteration on
+ * `1/sqrt`, which keeps the implementation small.
+ *
+ * Exceptional inputs are resolved up front so the documented results do
+ * not depend on how the sqrt and division helpers treat zero, infinite,
+ * or NaN operands:
+ *   - zero (hi == 0 and lo == 0) yields +inf;
+ *   - hi < 0 or a NaN input yields NaN;
+ *   - hi == +inf yields 0.
+ *
+ * @param[in] a Positive DD input.
+ * @return DD reciprocal square root of @p a; +inf for zero, 0 for +inf,
+ *         NaN for negative or NaN input.
+ */
+ndarray_dd_t ndarray_dd_rsqrt(ndarray_dd_t a) {
+    if (a.hi == 0.0 && a.lo == 0.0) {
+        return ndarray_dd_from_double(INFINITY);
+    }
+    if (a.hi < 0.0 || ndarray_dd_isnan(a)) {
+        return ndarray_dd_from_double(NAN);
+    }
+    /* Negative infinity was rejected above, so this only matches +inf. */
+    if (isinf(a.hi)) {
+        return ndarray_dd_from_double(0.0);
+    }
+    ndarray_dd_t r = ndarray_dd_sqrt(a);
+    return ndarray_dd_div(ndarray_dd_from_double(1.0), r);
+}
+
 /* ── int conversion ─────────────────────────────────────────────────────── */
 
 long long ndarray_dd_to_int64(ndarray_dd_t a) {
 
@@ -36,6 +36,13 @@ ndarray_dd_t ndarray_dd_trunc(ndarray_dd_t a);
 ndarray_dd_t ndarray_dd_fmod(ndarray_dd_t a, ndarray_dd_t b);
 ndarray_dd_t ndarray_dd_pow(ndarray_dd_t a, ndarray_dd_t b);
 
+/* Square root and reciprocal square root in double-double arithmetic:
+   one double-precision sqrt seed followed by a single Newton iteration
+   carried out in DD (~106 significant bits). Both return NaN for negative
+   inputs.
+   NOTE(review): an earlier version of this note claimed agreement with
+   sqrtq(__float128)/qsqrtq on the libquadmath build "to the last DD bit".
+   `qsqrtq` does not appear in libquadmath's documented function list, and
+   __float128 carries 113 significand bits, so treat agreement as limited
+   to DD precision until it has been verified. */
+ndarray_dd_t ndarray_dd_sqrt(ndarray_dd_t a);
+ndarray_dd_t ndarray_dd_rsqrt(ndarray_dd_t a);
+
 /* ── comparisons ────────────────────────────────────────────────────────── */
 int  ndarray_dd_cmp(ndarray_dd_t a, ndarray_dd_t b);   /* -1, 0, 1 */
 int  ndarray_dd_iszero(ndarray_dd_t a);
 
@@ -227,6 +227,60 @@ uint16_t ndarray_double_to_fp16(double val) {
    chosen storage can represent.
    ══════════════════════════════════════════════════════════════════════════ */
 
+#if NDARRAY_HAVE_FLOAT128
+/**
+ * @brief Square root for the native `__float128` storage type.
+ *
+ * When `HAVE_QUADMATH` is set the call is forwarded to libquadmath's
+ * `sqrtq`. Otherwise the argument is narrowed to `long double`, handed to
+ * `sqrtl`, and the result widened back, so accuracy is limited to what
+ * `long double` provides on the platform.
+ *
+ * The function is deliberately out of line: the `HAVE_QUADMATH` test is
+ * evaluated once, in this translation unit, instead of at every place the
+ * header is included.
+ *
+ * @param[in] a Input.
+ * @return Square root of @p a as `ndarray_fp128_t`.
+ */
+ndarray_fp128_t ndarray_fp128_sqrt(ndarray_fp128_t a) {
+    ndarray_fp128_t root;
+#  if HAVE_QUADMATH
+    root = sqrtq(a);
+#  else
+    root = (ndarray_fp128_t)sqrtl((long double)a);
+#  endif
+    return root;
+}
+
+/**
+ * @brief Sine for the native `__float128` storage type.
+ *
+ * Forwards to libquadmath's `sinq` when `HAVE_QUADMATH` is set; otherwise
+ * narrows the argument to `long double`, calls `sinl`, and widens the
+ * result back.
+ *
+ * @param[in] a Input in radians.
+ * @return sin(@p a) as `ndarray_fp128_t`.
+ */
+ndarray_fp128_t ndarray_fp128_sin(ndarray_fp128_t a) {
+    ndarray_fp128_t sine;
+#  if HAVE_QUADMATH
+    sine = sinq(a);
+#  else
+    sine = (ndarray_fp128_t)sinl((long double)a);
+#  endif
+    return sine;
+}
+
+/**
+ * @brief NaN test for the native `__float128` storage type.
+ *
+ * Uses libquadmath's `isnanq` when `HAVE_QUADMATH` is set; otherwise the
+ * value is converted to `double` and checked with `isnan`.
+ *
+ * @param[in] a Input.
+ * @return Non-zero iff @p a is NaN, zero otherwise.
+ */
+int ndarray_fp128_isnan(ndarray_fp128_t a) {
+    int nan_flag;
+#  if HAVE_QUADMATH
+    nan_flag = isnanq(a);
+#  else
+    nan_flag = isnan((double)a);
+#  endif
+    return nan_flag;
+}
+#endif /* NDARRAY_HAVE_FLOAT128 */
+
 /**
  * @brief Convert a `double` to the fp128 storage type.
  *
  * Delegates to the `NDARRAY_FP128_FROM_D` macro, which selects the
  * conversion for whichever fp128 storage the build uses.
  *
  * @param[in] val Value to convert.
  * @return @p val represented as `ndarray_fp128_t`.
  */
 ndarray_fp128_t ndarray_double_to_fp128(double val) {
     ndarray_fp128_t widened = NDARRAY_FP128_FROM_D(val);
     return widened;
 }
 
@@ -42,10 +42,25 @@ uint16_t ndarray_double_to_fp16(double val);
    they expand to native operators (zero overhead); on the DD path they
    expand to ndarray_dd_* calls. */
 #if NDARRAY_HAVE_FLOAT128
+   /* sqrt / sin / isnan for __float128 are routed through out-of-line
+      helpers defined in ndarray_types.c. That file is compiled with
+      `config.h` already included, so `HAVE_QUADMATH` is meaningful there and
+      the libquadmath-vs-libm-fallback choice is made exactly once.
+      Expanding the choice in this header instead would silently select the
+      long-double fallback in any translation unit that reaches
+      ndarray_types.h through a transitive include before its own
+      `#include "config.h"`.
+      The NDARRAY_FP128_SQRT / _SIN / _ISNAN macros below simply forward to
+      those helpers. */
+   ndarray_fp128_t ndarray_fp128_sqrt(ndarray_fp128_t a);
+   ndarray_fp128_t ndarray_fp128_sin (ndarray_fp128_t a);
+   int             ndarray_fp128_isnan(ndarray_fp128_t a);
+#  define NDARRAY_FP128_SQRT(a)     ndarray_fp128_sqrt(a)
+#  define NDARRAY_FP128_SIN(a)      ndarray_fp128_sin(a)
+#  define NDARRAY_FP128_ISNAN(a)    ndarray_fp128_isnan(a)
 #  define NDARRAY_FP128_FROM_D(d)   ((ndarray_fp128_t)(d))
 #  define NDARRAY_FP128_FROM_LD(ld) ((ndarray_fp128_t)(ld))
 #  define NDARRAY_FP128_TO_D(x)     ((double)(x))
 #  define NDARRAY_FP128_ZERO()      ((ndarray_fp128_t)0)
+#  define NDARRAY_FP128_ONE()       ((ndarray_fp128_t)1)
 #  define NDARRAY_FP128_NAN()       ((ndarray_fp128_t)(0.0/0.0))
 #  define NDARRAY_FP128_ADD(a, b)   ((a) + (b))
 #  define NDARRAY_FP128_SUB(a, b)   ((a) - (b))
@@ -63,16 +78,20 @@ uint16_t ndarray_double_to_fp16(double val);
 #  define NDARRAY_FP128_FROM_LD(ld) ndarray_dd_from_double((double)(ld))
 #  define NDARRAY_FP128_TO_D(x)     ndarray_dd_to_double(x)
 #  define NDARRAY_FP128_ZERO()      ndarray_dd_from_double(0.0)
+#  define NDARRAY_FP128_ONE()       ndarray_dd_from_double(1.0)
 #  define NDARRAY_FP128_NAN()       ndarray_dd_from_double(0.0/0.0)
 #  define NDARRAY_FP128_ADD(a, b)   ndarray_dd_add((a), (b))
 #  define NDARRAY_FP128_SUB(a, b)   ndarray_dd_sub((a), (b))
 #  define NDARRAY_FP128_MUL(a, b)   ndarray_dd_mul((a), (b))
 #  define NDARRAY_FP128_DIV(a, b)   ndarray_dd_div((a), (b))
 #  define NDARRAY_FP128_NEG(a)      ndarray_dd_neg(a)
 #  define NDARRAY_FP128_ABS(a)      ndarray_dd_abs(a)
+#  define NDARRAY_FP128_SQRT(a)     ndarray_dd_sqrt(a)
+#  define NDARRAY_FP128_SIN(a)      ndarray_dd_from_double(sin(ndarray_dd_to_double(a))) /* DD path: argument goes through ndarray_dd_to_double and fp64 sin(), so the result has only double precision */
 #  define NDARRAY_FP128_EQ(a, b)    (ndarray_dd_cmp((a), (b)) == 0)
 #  define NDARRAY_FP128_LT(a, b)    (ndarray_dd_cmp((a), (b)) <  0)
 #  define NDARRAY_FP128_ISZERO(a)   ndarray_dd_iszero(a)
+#  define NDARRAY_FP128_ISNAN(a)    ndarray_dd_isnan(a)
 #  define NDARRAY_FP128_FROM_I64(i) ndarray_dd_from_int64(i)
 #  define NDARRAY_FP128_TO_I64(x)   ndarray_dd_to_int64(x)
 #endif