Skip to content

Commit c513d65

Browse files
include highway wrapper
1 parent b39e65d commit c513d65

1 file changed

Lines changed: 105 additions & 99 deletions

File tree

numpy/_core/src/umath/loops_arithmetic.dispatch.cpp

Lines changed: 105 additions & 99 deletions
Original file line numberDiff line numberDiff line change
@@ -13,19 +13,18 @@
1313
#include "loops_utils.h"
1414
#include "loops.h"
1515
#include "fast_loop_macros.h"
16-
#include "simd/simd.h"
16+
#include "simd/simd.hpp"
1717
#include "lowlevel_strided_loops.h"
1818
#include "common.hpp"
1919

2020
#include <cstring> // for memcpy
2121
#include <limits>
2222
#include <cstdio>
2323

24-
#include <hwy/highway.h>
25-
namespace hn = hwy::HWY_NAMESPACE;
26-
27-
HWY_BEFORE_NAMESPACE();
28-
namespace HWY_NAMESPACE {
24+
using namespace np::simd;
25+
#if NPY_HWY
26+
namespace hn = np::simd::hn;
27+
#endif
2928

3029
// Helper function to set float status
3130
inline void set_float_status(bool overflow, bool divbyzero) {
@@ -36,13 +35,15 @@ inline void set_float_status(bool overflow, bool divbyzero) {
3635
npy_set_floatstatus_divbyzero();
3736
}
3837
}
39-
#if NPY_SIMD
40-
// Signed integer division
38+
#if NPY_HWY
39+
40+
// Signed integer DIVIDE by scalar
41+
4142
template <typename T>
4243
void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len) {
43-
using D = hn::ScalableTag<T>;
44+
using D = _Tag<T>;
4445
const D d;
45-
const size_t N = hn::Lanes(d);
46+
const size_t N = Lanes(T{});
4647

4748
bool raise_overflow = false;
4849
bool raise_divbyzero = false;
@@ -59,13 +60,13 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
5960
}
6061
}
6162
else if (scalar == static_cast<T>(-1)) {
62-
const auto vec_min_val = hn::Set(d, std::numeric_limits<T>::min());
63+
const auto vec_min_val = Set(static_cast<T>(std::numeric_limits<T>::min()));
6364
size_t i = 0;
6465
for (; i + N <= static_cast<size_t>(len); i += N) {
65-
const auto vec_src = hn::LoadU(d, src + i);
66-
const auto is_min_val = hn::Eq(vec_src, vec_min_val);
66+
const auto vec_src = LoadU(src + i);
67+
const auto is_min_val = Eq(vec_src, vec_min_val);
6768
const auto vec_res = hn::IfThenElse(is_min_val, vec_min_val, hn::Neg(vec_src));
68-
hn::StoreU(vec_res, d, dst + i);
69+
StoreU(vec_res, dst + i);
6970
if (!raise_overflow && !hn::AllFalse(d, is_min_val)) {
7071
raise_overflow = true;
7172
}
@@ -83,21 +84,21 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
8384
}
8485
else {
8586
// General case with floor division semantics
86-
const auto vec_scalar = hn::Set(d, scalar);
87-
const auto vec_zero = hn::Zero(d);
88-
const auto one = hn::Set(d, static_cast<T>(1));
87+
const auto vec_scalar = Set(scalar);
88+
const auto vec_zero = Zero<T>();
89+
const auto one = Set(static_cast<T>(1));
8990
size_t i = 0;
9091

9192
for (; i + N <= static_cast<size_t>(len); i += N) {
92-
const auto vec_src = hn::LoadU(d, src + i);
93-
auto vec_div = hn::Div(vec_src, vec_scalar);
94-
const auto vec_mul = hn::Mul(vec_div, vec_scalar);
95-
const auto eq_mask = hn::Eq(vec_src, vec_mul);
96-
const auto diff_signs = hn::Lt(hn::Xor(vec_src, vec_scalar), vec_zero);
97-
const auto adjust = hn::AndNot(eq_mask, diff_signs);
93+
const auto vec_src = LoadU(src + i);
94+
auto vec_div = Div(vec_src, vec_scalar);
95+
const auto vec_mul = Mul(vec_div, vec_scalar);
96+
const auto eq_mask = Eq(vec_src, vec_mul);
97+
const auto diff_signs = Lt(Xor(vec_src, vec_scalar), vec_zero);
98+
const auto adjust = AndNot(eq_mask, diff_signs);
9899

99100
vec_div = hn::MaskedSubOr(vec_div, adjust, vec_div, one);
100-
hn::StoreU(vec_div, d, dst + i);
101+
StoreU(vec_div, dst + i);
101102
}
102103

103104
// Handle remaining elements with scalar code
@@ -113,13 +114,12 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
113114
set_float_status(raise_overflow, raise_divbyzero);
114115
}
115116

116-
// Unsigned integer division
117+
// Unsigned integer DIVIDE by scalar
118+
117119
template <typename T>
118120
void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp len) {
119-
using D = hn::ScalableTag<T>;
120-
const D d;
121-
const size_t N = hn::Lanes(d);
122121

122+
const size_t N = Lanes(T{});
123123
bool raise_divbyzero = false;
124124

125125
if (scalar == 0) {
@@ -134,12 +134,12 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le
134134
}
135135
}
136136
else {
137-
const auto vec_scalar = hn::Set(d, scalar);
137+
const auto vec_scalar = Set(scalar);
138138
size_t i = 0;
139139
for (; i + N <= static_cast<size_t>(len); i += N) {
140-
const auto vec_src = hn::LoadU(d, src + i);
141-
const auto vec_res = hn::Div(vec_src, vec_scalar);
142-
hn::StoreU(vec_res, d, dst + i);
140+
const auto vec_src = LoadU(src + i);
141+
const auto vec_res = Div(vec_src, vec_scalar);
142+
StoreU(vec_res, dst + i);
143143
}
144144
// Handle remaining elements
145145
for (; i < static_cast<size_t>(len); i++) {
@@ -149,71 +149,53 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le
149149

150150
set_float_status(false, raise_divbyzero);
151151
}
152-
#endif // NPY_SIMD
153-
// Floor division for signed integers
154-
template <typename T>
155-
T floor_div(T n, T d) {
156-
if (HWY_UNLIKELY(d == 0 || (n == std::numeric_limits<T>::min() && d == -1))) {
157-
if (d == 0) {
158-
npy_set_floatstatus_divbyzero();
159-
return 0;
160-
}
161-
else {
162-
npy_set_floatstatus_overflow();
163-
return std::numeric_limits<T>::min();
164-
}
165-
}
166-
T r = n / d;
167-
if (((n > 0) != (d > 0)) && ((r * d) != n)) {
168-
--r;
169-
}
170-
return r;
171-
}
172152

173-
// General divide implementation for arrays
153+
// Signed integer DIVIDE array / array
154+
174155
template <typename T>
175156
void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) {
176-
using D = hn::ScalableTag<T>;
157+
using D = _Tag<T>;
177158
const D d;
178-
const size_t N = hn::Lanes(d);
159+
const size_t N = Lanes(T{});
160+
179161
bool raise_overflow = false;
180162
bool raise_divbyzero = false;
181-
const auto vec_zero = hn::Zero(d);
182-
const auto vec_one = hn::Set(d, static_cast<T>(1));
183-
const auto vec_min_val = hn::Set(d, std::numeric_limits<T>::min());
184-
const auto vec_neg_one = hn::Set(d, static_cast<T>(-1));
163+
const auto vec_zero = Zero<T>();
164+
const auto vec_one = Set(static_cast<T>(1));
165+
const auto vec_min_val = Set(static_cast<T>(std::numeric_limits<T>::min()));
166+
const auto vec_neg_one = Set(static_cast<T>(-1));
185167

186168
size_t i = 0;
187169
for (; i + N <= static_cast<size_t>(len); i += N) {
188-
const auto vec_a = hn::LoadU(d, src1 + i);
189-
const auto vec_b = hn::LoadU(d, src2 + i);
170+
const auto vec_a = LoadU(src1 + i);
171+
const auto vec_b = LoadU(src2 + i);
190172

191-
const auto b_is_zero = hn::Eq(vec_b, vec_zero);
192-
const auto a_is_min = hn::Eq(vec_a, vec_min_val);
193-
const auto b_is_neg_one = hn::Eq(vec_b, vec_neg_one);
194-
const auto overflow_cond = hn::And(a_is_min, b_is_neg_one);
173+
const auto b_is_zero = Eq(vec_b, vec_zero);
174+
const auto a_is_min = Eq(vec_a, vec_min_val);
175+
const auto b_is_neg_one = Eq(vec_b, vec_neg_one);
176+
const auto overflow_cond = And(a_is_min, b_is_neg_one);
195177

196-
const auto safe_div_mask = hn::Not(hn::Or(b_is_zero, overflow_cond));
197-
const auto safe_b = hn::IfThenElse(hn::Or(b_is_zero, overflow_cond), vec_one, vec_b);
178+
const auto safe_div_mask = hn::Not(Or(b_is_zero, overflow_cond));
179+
const auto safe_b = hn::IfThenElse(Or(b_is_zero, overflow_cond), vec_one, vec_b);
198180

199-
auto vec_div = hn::Div(vec_a, safe_b);
181+
auto vec_div = Div(vec_a, safe_b);
200182

201183
if (!hn::AllFalse(d, safe_div_mask)) {
202-
const auto vec_mul = hn::Mul(vec_div, safe_b);
184+
const auto vec_mul = Mul(vec_div, safe_b);
203185
const auto has_remainder = hn::Ne(vec_a, vec_mul);
204-
const auto a_sign = hn::Lt(vec_a, vec_zero);
205-
const auto b_sign = hn::Lt(vec_b, vec_zero);
206-
const auto different_signs = hn::Xor(a_sign, b_sign);
207-
const auto needs_adjustment = hn::And(safe_div_mask,
208-
hn::And(has_remainder, different_signs));
186+
const auto a_sign = Lt(vec_a, vec_zero);
187+
const auto b_sign = Lt(vec_b, vec_zero);
188+
const auto different_signs = Xor(a_sign, b_sign);
189+
const auto needs_adjustment = And(safe_div_mask,
190+
And(has_remainder, different_signs));
209191

210192
vec_div = hn::MaskedSubOr(vec_div, needs_adjustment, vec_div, vec_one);
211193
}
212194

213195
vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div);
214196
vec_div = hn::IfThenElse(overflow_cond, vec_min_val, vec_div);
215197

216-
hn::StoreU(vec_div, d, dst + i);
198+
StoreU(vec_div, dst + i);
217199

218200
if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) {
219201
raise_divbyzero = true;
@@ -249,30 +231,32 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) {
249231
set_float_status(raise_overflow, raise_divbyzero);
250232
}
251233

252-
// Unsigned division for arrays
234+
// Unsigned integer DIVIDE array / array
235+
253236
template <typename T>
254237
void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) {
255-
using D = hn::ScalableTag<T>;
238+
using D = _Tag<T>;
256239
const D d;
257-
const size_t N = hn::Lanes(d);
240+
const size_t N = Lanes(T{});
241+
258242
bool raise_divbyzero = false;
259-
const auto vec_zero = hn::Zero(d);
260-
const auto vec_one = hn::Set(d, static_cast<T>(1));
243+
const auto vec_zero = Zero<T>();
244+
const auto vec_one = Set(static_cast<T>(1));
261245

262246
size_t i = 0;
263247
for (; i + N <= static_cast<size_t>(len); i += N) {
264-
const auto vec_a = hn::LoadU(d, src1 + i);
265-
const auto vec_b = hn::LoadU(d, src2 + i);
248+
const auto vec_a = LoadU(src1 + i);
249+
const auto vec_b = LoadU(src2 + i);
266250

267-
const auto b_is_zero = hn::Eq(vec_b, vec_zero);
251+
const auto b_is_zero = Eq(vec_b, vec_zero);
268252

269253
const auto safe_b = hn::IfThenElse(b_is_zero, vec_one, vec_b);
270254

271-
auto vec_div = hn::Div(vec_a, safe_b);
255+
auto vec_div = Div(vec_a, safe_b);
272256

273257
vec_div = hn::IfThenElse(b_is_zero, vec_zero, vec_div);
274258

275-
hn::StoreU(vec_div, d, dst + i);
259+
StoreU(vec_div, dst + i);
276260

277261
if (!raise_divbyzero && !hn::AllFalse(d, b_is_zero)) {
278262
raise_divbyzero = true;
@@ -296,17 +280,40 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) {
296280
set_float_status(false, raise_divbyzero);
297281
}
298282

283+
#endif // NPY_HWY
284+
285+
// Floor division for signed integers
286+
template <typename T>
287+
T floor_div(T n, T d) {
288+
if (NPY_UNLIKELY(d == 0 || (n == std::numeric_limits<T>::min() && d == -1))) {
289+
if (d == 0) {
290+
npy_set_floatstatus_divbyzero();
291+
return 0;
292+
}
293+
else {
294+
npy_set_floatstatus_overflow();
295+
return std::numeric_limits<T>::min();
296+
}
297+
}
298+
T r = n / d;
299+
if (((n > 0) != (d > 0)) && ((r * d) != n)) {
300+
--r;
301+
}
302+
return r;
303+
}
304+
305+
299306
// Dispatch functions for signed integer division
300307
template <typename T>
301308
void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED(func)) {
302309
npy_clear_floatstatus();
303310
if (IS_BINARY_REDUCE) {
304311
BINARY_REDUCE_LOOP(T) {
305312
const T divisor = *reinterpret_cast<T*>(ip2);
306-
if (HWY_UNLIKELY(divisor == 0)) {
313+
if (NPY_UNLIKELY(divisor == 0)) {
307314
npy_set_floatstatus_divbyzero();
308315
io1 = 0;
309-
} else if (HWY_UNLIKELY(io1 == std::numeric_limits<T>::min() && divisor == -1)) {
316+
} else if (NPY_UNLIKELY(io1 == std::numeric_limits<T>::min() && divisor == -1)) {
310317
npy_set_floatstatus_overflow();
311318
io1 = std::numeric_limits<T>::min();
312319
} else {
@@ -316,7 +323,8 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
316323
*reinterpret_cast<T*>(iop1) = io1;
317324
return;
318325
}
319-
#if NPY_SIMD
326+
327+
#if NPY_HWY
320328
// Handle array-array case
321329
if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH))
322330
{
@@ -343,18 +351,19 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
343351
return;
344352
}
345353
}
346-
#endif // NPY_SIMD
354+
#endif // NPY_HWY
347355

356+
// Scalar fallback
348357
// Fallback for non-blockable, in-place, or zero divisor cases
349358
BINARY_LOOP {
350359
const T dividend = *reinterpret_cast<T*>(ip1);
351360
const T divisor = *reinterpret_cast<T*>(ip2);
352361
T* result = reinterpret_cast<T*>(op1);
353362

354-
if (HWY_UNLIKELY(divisor == 0)) {
363+
if (NPY_UNLIKELY(divisor == 0)) {
355364
npy_set_floatstatus_divbyzero();
356365
*result = 0;
357-
} else if (HWY_UNLIKELY(dividend == std::numeric_limits<T>::min() && divisor == -1)) {
366+
} else if (NPY_UNLIKELY(dividend == std::numeric_limits<T>::min() && divisor == -1)) {
358367
npy_set_floatstatus_overflow();
359368
*result = std::numeric_limits<T>::min();
360369
} else {
@@ -370,7 +379,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
370379
if (IS_BINARY_REDUCE) {
371380
BINARY_REDUCE_LOOP(T) {
372381
const T d = *reinterpret_cast<T*>(ip2);
373-
if (HWY_UNLIKELY(d == 0)) {
382+
if (NPY_UNLIKELY(d == 0)) {
374383
npy_set_floatstatus_divbyzero();
375384
io1 = 0;
376385
} else {
@@ -380,7 +389,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
380389
*reinterpret_cast<T*>(iop1) = io1;
381390
return;
382391
}
383-
#if NPY_SIMD
392+
#if NPY_HWY
384393
// Handle array-array case
385394
if (IS_BLOCKABLE_BINARY(sizeof(T), NPY_SIMD_WIDTH)) {
386395
bool no_overlap = nomemoverlap(args[2], steps[2], args[0], steps[0], dimensions[0]) &&
@@ -406,13 +415,13 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
406415
return;
407416
}
408417
}
409-
#endif // NPY_SIMD
418+
#endif // NPY_HWY
410419

411420
// Fallback for non-blockable, in-place, or zero divisor cases
412421
BINARY_LOOP {
413422
const T in1 = *reinterpret_cast<T*>(ip1);
414423
const T in2 = *reinterpret_cast<T*>(ip2);
415-
if (HWY_UNLIKELY(in2 == 0)) {
424+
if (NPY_UNLIKELY(in2 == 0)) {
416425
npy_set_floatstatus_divbyzero();
417426
*reinterpret_cast<T*>(op1) = 0;
418427
} else {
@@ -465,7 +474,7 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
465474
T* indexed = (T*)(ip1 + is1 * indx);
466475
T divisor = *(T*)value;
467476

468-
if (HWY_UNLIKELY(divisor == 0)) {
477+
if (NPY_UNLIKELY(divisor == 0)) {
469478
npy_set_floatstatus_divbyzero();
470479
*indexed = 0;
471480
} else {
@@ -524,6 +533,3 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
524533

525534
#undef DEFINE_DIVIDE_FUNCTION
526535
#undef DEFINE_DIVIDE_FUNCTION_UNSIGNED
527-
528-
} // namespace HWY_NAMESPACE
529-
HWY_AFTER_NAMESPACE();

0 commit comments

Comments
 (0)