Skip to content

np.sum(a, dtype=xprec.ddouble) - too slow #19

@Serge3leo

Description

@Serge3leo

Configuration: xprec-1.4.2 on Intel macOS Sonoma
Benchmark:

  1. Add ndarray[ddouble] + ndarray[float64];
  2. Add in-place ndarray[ddouble] += ndarray[float64];
  3. Sum of elements ndarray[float64] as ddouble
  4. Compare with CPython implementation Neumaier-Babuška-Kahan
In [3]: import os
   ...: import numpy as np
   ...: import xprec
   ...: 
   ...: print(os.getpid())
   ...: 
   ...: a=np.random.uniform(size=10_000_000)
   ...: x=np.asarray(a, dtype=xprec.ddouble)**2
   ...: 
   ...: %timeit x+a
   ...: %timeit exec("x+=a")
   ...: %timeit np.sum(a, dtype=xprec.ddouble)
   ...: atl = a.tolist()
   ...: np.sum(a, dtype=xprec.ddouble)
   ...: %timeit sum(atl)
83743
56.2 ms ± 892 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
31.1 ms ± 179 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
129 ms ± 97 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
108 ms ± 126 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)

The profiler shows u_added() as a problem.

For an experiment to evaluate possible performance improvements, I made the following changes:

diff --git a/csrc/_dd_ufunc.c b/csrc/_dd_ufunc.c
index 7acdf4f..477bb45 100644
--- a/csrc/_dd_ufunc.c
+++ b/csrc/_dd_ufunc.c
@@ -777,7 +777,75 @@ static int register_casts()
         MARK_UNUSED(data);                                              \
     }
 
-ULOOP_BINARY(u_addwd, addwd, ddouble, ddouble, double)
+//ULOOP_BINARY(u_addwd, addwd, ddouble, ddouble, double)
+    static void u_addwd(char **args, const npy_intp *dimensions,
+                          const npy_intp* steps, void *data)
+    {
+        const npy_intp n = dimensions[0];
+        const npy_intp as = steps[0] / sizeof(ddouble),
+                       bs = steps[1] / sizeof(double),
+                       os = steps[2] / sizeof(ddouble);
+        const ddouble *a = (const ddouble *)args[0];
+        const double *b = (const double *)args[1];
+        ddouble *out = (ddouble *)args[2];
+       npy_intp i;
+       ddouble out_a[8];
+
+       if(os == 0 && as == 0 && out == a) {
+           out_a[0] = *out;
+           out_a[1].hi = 0.; out_a[1].lo = 0.;
+           out_a[2].hi = 0.; out_a[2].lo = 0.;
+           out_a[3].hi = 0.; out_a[3].lo = 0.;
+           out_a[4].hi = 0.; out_a[4].lo = 0.;
+           out_a[5].hi = 0.; out_a[5].lo = 0.;
+           out_a[6].hi = 0.; out_a[6].lo = 0.;
+           out_a[7].hi = 0.; out_a[7].lo = 0.;
+           for (i = 0; i+7 < n; i += 8) {
+               out_a[0] = addwd(out_a[0], b[(i+0) * bs]);
+               out_a[1] = addwd(out_a[1], b[(i+1) * bs]);
+               out_a[2] = addwd(out_a[2], b[(i+2) * bs]);
+               out_a[3] = addwd(out_a[3], b[(i+3) * bs]);
+               out_a[4] = addwd(out_a[4], b[(i+4) * bs]);
+               out_a[5] = addwd(out_a[5], b[(i+5) * bs]);
+               out_a[6] = addwd(out_a[6], b[(i+6) * bs]);
+               out_a[7] = addwd(out_a[7], b[(i+7) * bs]);
+           }
+           out_a[0] = addww(out_a[0], out_a[1]);
+           out_a[2] = addww(out_a[2], out_a[3]);
+           out_a[4] = addww(out_a[4], out_a[5]);
+           out_a[6] = addww(out_a[6], out_a[7]);
+           out_a[0] = addww(out_a[0], out_a[2]);
+           out_a[4] = addww(out_a[4], out_a[6]);
+           out_a[0] = addww(out_a[0], out_a[4]);
+           for (; i < n; ++i) {
+               out_a[0] = addwd(out_a[0], b[i * bs]);
+           }
+           *out = out_a[0];
+           return;
+       }
+        for (i = 0; i+7 < n; i += 8) {
+            out_a[0] = addwd(a[(i+0) * as], b[(i+0) * bs]);
+            out_a[1] = addwd(a[(i+1) * as], b[(i+1) * bs]);
+            out_a[2] = addwd(a[(i+2) * as], b[(i+2) * bs]);
+            out_a[3] = addwd(a[(i+3) * as], b[(i+3) * bs]);
+            out_a[4] = addwd(a[(i+4) * as], b[(i+4) * bs]);
+            out_a[5] = addwd(a[(i+5) * as], b[(i+5) * bs]);
+            out_a[6] = addwd(a[(i+6) * as], b[(i+6) * bs]);
+            out_a[7] = addwd(a[(i+7) * as], b[(i+7) * bs]);
+            out[(i+0) * os] = out_a[0];
+            out[(i+1) * os] = out_a[1];
+            out[(i+2) * os] = out_a[2];
+            out[(i+3) * os] = out_a[3];
+            out[(i+4) * os] = out_a[4];
+            out[(i+5) * os] = out_a[5];
+            out[(i+6) * os] = out_a[6];
+            out[(i+7) * os] = out_a[7];
+        }
+        for (; i < n; ++i) {
+            out[i * os] = addwd(a[i * as], b[i * bs]);
+        }
+        MARK_UNUSED(data);
+    }
 ULOOP_BINARY(u_subwd, subwd, ddouble, ddouble, double)
 ULOOP_BINARY(u_mulwd, mulwd, ddouble, ddouble, double)
 ULOOP_BINARY(u_divwd, divwd, ddouble, ddouble, double)

Preliminary assessment of the change

In [5]: import os
   ...: import numpy as np
   ...: import xprec
   ...: 
   ...: print(os.getpid())
   ...: 
   ...: a=np.random.uniform(size=10_000_000)
   ...: x=np.asarray(a, dtype=xprec.ddouble)**2
   ...: 
   ...: %timeit x+a
   ...: %timeit exec("x+=a")
   ...: %timeit np.sum(a, dtype=xprec.ddouble)
   ...: atl = a.tolist()
   ...: np.sum(a, dtype=xprec.ddouble)
   ...: %timeit sum(atl)
83991
56.2 ms ± 265 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
32.1 ms ± 211 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
17.1 ms ± 21.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
108 ms ± 94.8 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)

What do you think about this problem and possible changes?

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions