-
Notifications
You must be signed in to change notification settings - Fork 5
Open
Description
Configuration: xprec-1.4.2 on Intel macOS Sonoma
Benchmark:
- Add
ndarray[ddouble]+ndarray[float64]; - Add in-place
ndarray[ddouble]+=ndarray[float64]; - Sum of elements
ndarray[float64]asddouble - Compare with CPython implementation Neumaier-Babuška-Kahan
In [3]: import os
...: import numpy as np
...: import xprec
...:
...: print(os.getpid())
...:
...: a=np.random.uniform(size=10_000_000)
...: x=np.asarray(a, dtype=xprec.ddouble)**2
...:
...: %timeit x+a
...: %timeit exec("x+=a")
...: %timeit np.sum(a, dtype=xprec.ddouble)
...: atl = a.tolist()
...: np.sum(a, dtype=xprec.ddouble)
...: %timeit sum(atl)
83743
56.2 ms ± 892 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
31.1 ms ± 179 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
129 ms ± 97 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
108 ms ± 126 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
The profiler shows u_added() as a problem.
For an experiment to evaluate possible performance improvements, I made the following changes:
diff --git a/csrc/_dd_ufunc.c b/csrc/_dd_ufunc.c
index 7acdf4f..477bb45 100644
--- a/csrc/_dd_ufunc.c
+++ b/csrc/_dd_ufunc.c
@@ -777,7 +777,75 @@ static int register_casts()
MARK_UNUSED(data); \
}
-ULOOP_BINARY(u_addwd, addwd, ddouble, ddouble, double)
+//ULOOP_BINARY(u_addwd, addwd, ddouble, ddouble, double)
+ static void u_addwd(char **args, const npy_intp *dimensions,
+ const npy_intp* steps, void *data)
+ {
+ const npy_intp n = dimensions[0];
+ const npy_intp as = steps[0] / sizeof(ddouble),
+ bs = steps[1] / sizeof(double),
+ os = steps[2] / sizeof(ddouble);
+ const ddouble *a = (const ddouble *)args[0];
+ const double *b = (const double *)args[1];
+ ddouble *out = (ddouble *)args[2];
+ npy_intp i;
+ ddouble out_a[8];
+
+ if(os == 0 && as == 0 && out == a) {
+ out_a[0] = *out;
+ out_a[1].hi = 0.; out_a[1].lo = 0.;
+ out_a[2].hi = 0.; out_a[2].lo = 0.;
+ out_a[3].hi = 0.; out_a[3].lo = 0.;
+ out_a[4].hi = 0.; out_a[4].lo = 0.;
+ out_a[5].hi = 0.; out_a[5].lo = 0.;
+ out_a[6].hi = 0.; out_a[6].lo = 0.;
+ out_a[7].hi = 0.; out_a[7].lo = 0.;
+ for (i = 0; i+7 < n; i += 8) {
+ out_a[0] = addwd(out_a[0], b[(i+0) * bs]);
+ out_a[1] = addwd(out_a[1], b[(i+1) * bs]);
+ out_a[2] = addwd(out_a[2], b[(i+2) * bs]);
+ out_a[3] = addwd(out_a[3], b[(i+3) * bs]);
+ out_a[4] = addwd(out_a[4], b[(i+4) * bs]);
+ out_a[5] = addwd(out_a[5], b[(i+5) * bs]);
+ out_a[6] = addwd(out_a[6], b[(i+6) * bs]);
+ out_a[7] = addwd(out_a[7], b[(i+7) * bs]);
+ }
+ out_a[0] = addww(out_a[0], out_a[1]);
+ out_a[2] = addww(out_a[2], out_a[3]);
+ out_a[4] = addww(out_a[4], out_a[5]);
+ out_a[6] = addww(out_a[6], out_a[7]);
+ out_a[0] = addww(out_a[0], out_a[2]);
+ out_a[4] = addww(out_a[4], out_a[6]);
+ out_a[0] = addww(out_a[0], out_a[4]);
+ for (; i < n; ++i) {
+ out_a[0] = addwd(out_a[0], b[i * bs]);
+ }
+ *out = out_a[0];
+ return;
+ }
+ for (i = 0; i+7 < n; i += 8) {
+ out_a[0] = addwd(a[(i+0) * as], b[(i+0) * bs]);
+ out_a[1] = addwd(a[(i+1) * as], b[(i+1) * bs]);
+ out_a[2] = addwd(a[(i+2) * as], b[(i+2) * bs]);
+ out_a[3] = addwd(a[(i+3) * as], b[(i+3) * bs]);
+ out_a[4] = addwd(a[(i+4) * as], b[(i+4) * bs]);
+ out_a[5] = addwd(a[(i+5) * as], b[(i+5) * bs]);
+ out_a[6] = addwd(a[(i+6) * as], b[(i+6) * bs]);
+ out_a[7] = addwd(a[(i+7) * as], b[(i+7) * bs]);
+ out[(i+0) * os] = out_a[0];
+ out[(i+1) * os] = out_a[1];
+ out[(i+2) * os] = out_a[2];
+ out[(i+3) * os] = out_a[3];
+ out[(i+4) * os] = out_a[4];
+ out[(i+5) * os] = out_a[5];
+ out[(i+6) * os] = out_a[6];
+ out[(i+7) * os] = out_a[7];
+ }
+ for (; i < n; ++i) {
+ out[i * os] = addwd(a[i * as], b[i * bs]);
+ }
+ MARK_UNUSED(data);
+ }
ULOOP_BINARY(u_subwd, subwd, ddouble, ddouble, double)
ULOOP_BINARY(u_mulwd, mulwd, ddouble, ddouble, double)
ULOOP_BINARY(u_divwd, divwd, ddouble, ddouble, double)
Preliminary assessment of the change
In [5]: import os
...: import numpy as np
...: import xprec
...:
...: print(os.getpid())
...:
...: a=np.random.uniform(size=10_000_000)
...: x=np.asarray(a, dtype=xprec.ddouble)**2
...:
...: %timeit x+a
...: %timeit exec("x+=a")
...: %timeit np.sum(a, dtype=xprec.ddouble)
...: atl = a.tolist()
...: np.sum(a, dtype=xprec.ddouble)
...: %timeit sum(atl)
83991
56.2 ms ± 265 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
32.1 ms ± 211 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
17.1 ms ± 21.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)
108 ms ± 94.8 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)
What do you think about this problem and possible changes?
Metadata
Metadata
Assignees
Labels
No labels