1313#include " loops_utils.h"
1414#include " loops.h"
1515#include " fast_loop_macros.h"
16- #include " simd/simd.h "
16+ #include " simd/simd.hpp "
1717#include " lowlevel_strided_loops.h"
1818#include " common.hpp"
1919
2020#include < cstring> // for memcpy
2121#include < limits>
2222#include < cstdio>
2323
24- #include < hwy/highway.h>
25- namespace hn = hwy::HWY_NAMESPACE ;
26-
27- HWY_BEFORE_NAMESPACE ();
28- namespace HWY_NAMESPACE {
24+ using namespace np ::simd;
25+ #if NPY_HWY
26+ namespace hn = np::simd::hn;
27+ #endif
2928
3029// Helper function to set float status
3130inline void set_float_status (bool overflow, bool divbyzero) {
@@ -36,13 +35,15 @@ inline void set_float_status(bool overflow, bool divbyzero) {
3635 npy_set_floatstatus_divbyzero ();
3736 }
3837}
39- #if NPY_SIMD
40- // Signed integer division
38+ #if NPY_HWY
39+
40+ // Signed integer DIVIDE by scalar
41+
4142template <typename T>
4243void simd_divide_by_scalar_contig_signed (T* src, T scalar, T* dst, npy_intp len) {
43- using D = hn::ScalableTag <T>;
44+ using D = _Tag <T>;
4445 const D d;
45- const size_t N = hn:: Lanes (d );
46+ const size_t N = Lanes (T{} );
4647
4748 bool raise_overflow = false ;
4849 bool raise_divbyzero = false ;
@@ -59,13 +60,13 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
5960 }
6061 }
6162 else if (scalar == static_cast <T>(-1 )) {
62- const auto vec_min_val = hn:: Set (d, std::numeric_limits<T>::min ());
63+ const auto vec_min_val = Set (static_cast <T>( std::numeric_limits<T>::min () ));
6364 size_t i = 0 ;
6465 for (; i + N <= static_cast <size_t >(len); i += N) {
65- const auto vec_src = hn:: LoadU (d, src + i);
66- const auto is_min_val = hn:: Eq (vec_src, vec_min_val);
66+ const auto vec_src = LoadU (src + i);
67+ const auto is_min_val = Eq (vec_src, vec_min_val);
6768 const auto vec_res = hn::IfThenElse (is_min_val, vec_min_val, hn::Neg (vec_src));
68- hn:: StoreU (vec_res, d , dst + i);
69+ StoreU (vec_res, dst + i);
6970 if (!raise_overflow && !hn::AllFalse (d, is_min_val)) {
7071 raise_overflow = true ;
7172 }
@@ -83,21 +84,21 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
8384 }
8485 else {
8586 // General case with floor division semantics
86- const auto vec_scalar = hn:: Set (d, scalar);
87- const auto vec_zero = hn:: Zero(d );
88- const auto one = hn:: Set (d, static_cast <T>(1 ));
87+ const auto vec_scalar = Set (scalar);
88+ const auto vec_zero = Zero<T>( );
89+ const auto one = Set (static_cast <T>(1 ));
8990 size_t i = 0 ;
9091
9192 for (; i + N <= static_cast <size_t >(len); i += N) {
92- const auto vec_src = hn:: LoadU (d, src + i);
93- auto vec_div = hn:: Div (vec_src, vec_scalar);
94- const auto vec_mul = hn:: Mul (vec_div, vec_scalar);
95- const auto eq_mask = hn:: Eq (vec_src, vec_mul);
96- const auto diff_signs = hn:: Lt (hn:: Xor (vec_src, vec_scalar), vec_zero);
97- const auto adjust = hn:: AndNot (eq_mask, diff_signs);
93+ const auto vec_src = LoadU (src + i);
94+ auto vec_div = Div (vec_src, vec_scalar);
95+ const auto vec_mul = Mul (vec_div, vec_scalar);
96+ const auto eq_mask = Eq (vec_src, vec_mul);
97+ const auto diff_signs = Lt (Xor (vec_src, vec_scalar), vec_zero);
98+ const auto adjust = AndNot (eq_mask, diff_signs);
9899
99100 vec_div = hn::MaskedSubOr (vec_div, adjust, vec_div, one);
100- hn:: StoreU (vec_div, d , dst + i);
101+ StoreU (vec_div, dst + i);
101102 }
102103
103104 // Handle remaining elements with scalar code
@@ -113,13 +114,12 @@ void simd_divide_by_scalar_contig_signed(T* src, T scalar, T* dst, npy_intp len)
113114 set_float_status (raise_overflow, raise_divbyzero);
114115}
115116
116- // Unsigned integer division
117+ // Unsigned integer DIVIDE by scalar
118+
117119template <typename T>
118120void simd_divide_by_scalar_contig_unsigned (T* src, T scalar, T* dst, npy_intp len) {
119- using D = hn::ScalableTag<T>;
120- const D d;
121- const size_t N = hn::Lanes (d);
122121
122+ const size_t N = Lanes (T{});
123123 bool raise_divbyzero = false ;
124124
125125 if (scalar == 0 ) {
@@ -134,12 +134,12 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le
134134 }
135135 }
136136 else {
137- const auto vec_scalar = hn:: Set (d, scalar);
137+ const auto vec_scalar = Set (scalar);
138138 size_t i = 0 ;
139139 for (; i + N <= static_cast <size_t >(len); i += N) {
140- const auto vec_src = hn:: LoadU (d, src + i);
141- const auto vec_res = hn:: Div (vec_src, vec_scalar);
142- hn:: StoreU (vec_res, d , dst + i);
140+ const auto vec_src = LoadU (src + i);
141+ const auto vec_res = Div (vec_src, vec_scalar);
142+ StoreU (vec_res, dst + i);
143143 }
144144 // Handle remaining elements
145145 for (; i < static_cast <size_t >(len); i++) {
@@ -149,71 +149,53 @@ void simd_divide_by_scalar_contig_unsigned(T* src, T scalar, T* dst, npy_intp le
149149
150150 set_float_status (false , raise_divbyzero);
151151}
152- #endif // NPY_SIMD
153- // Floor division for signed integers
154- template <typename T>
155- T floor_div (T n, T d) {
156- if (HWY_UNLIKELY (d == 0 || (n == std::numeric_limits<T>::min () && d == -1 ))) {
157- if (d == 0 ) {
158- npy_set_floatstatus_divbyzero ();
159- return 0 ;
160- }
161- else {
162- npy_set_floatstatus_overflow ();
163- return std::numeric_limits<T>::min ();
164- }
165- }
166- T r = n / d;
167- if (((n > 0 ) != (d > 0 )) && ((r * d) != n)) {
168- --r;
169- }
170- return r;
171- }
172152
173- // General divide implementation for arrays
153+ // Signed integer DIVIDE array / array
154+
174155template <typename T>
175156void simd_divide_contig_signed (T* src1, T* src2, T* dst, npy_intp len) {
176- using D = hn::ScalableTag <T>;
157+ using D = _Tag <T>;
177158 const D d;
178- const size_t N = hn::Lanes (d);
159+ const size_t N = Lanes (T{});
160+
179161 bool raise_overflow = false ;
180162 bool raise_divbyzero = false ;
181- const auto vec_zero = hn:: Zero(d );
182- const auto vec_one = hn:: Set (d, static_cast <T>(1 ));
183- const auto vec_min_val = hn:: Set (d, std::numeric_limits<T>::min ());
184- const auto vec_neg_one = hn:: Set (d, static_cast <T>(-1 ));
163+ const auto vec_zero = Zero<T>( );
164+ const auto vec_one = Set (static_cast <T>(1 ));
165+ const auto vec_min_val = Set (static_cast <T>( std::numeric_limits<T>::min () ));
166+ const auto vec_neg_one = Set (static_cast <T>(-1 ));
185167
186168 size_t i = 0 ;
187169 for (; i + N <= static_cast <size_t >(len); i += N) {
188- const auto vec_a = hn:: LoadU (d, src1 + i);
189- const auto vec_b = hn:: LoadU (d, src2 + i);
170+ const auto vec_a = LoadU (src1 + i);
171+ const auto vec_b = LoadU (src2 + i);
190172
191- const auto b_is_zero = hn:: Eq (vec_b, vec_zero);
192- const auto a_is_min = hn:: Eq (vec_a, vec_min_val);
193- const auto b_is_neg_one = hn:: Eq (vec_b, vec_neg_one);
194- const auto overflow_cond = hn:: And (a_is_min, b_is_neg_one);
173+ const auto b_is_zero = Eq (vec_b, vec_zero);
174+ const auto a_is_min = Eq (vec_a, vec_min_val);
175+ const auto b_is_neg_one = Eq (vec_b, vec_neg_one);
176+ const auto overflow_cond = And (a_is_min, b_is_neg_one);
195177
196- const auto safe_div_mask = hn::Not (hn:: Or (b_is_zero, overflow_cond));
197- const auto safe_b = hn::IfThenElse (hn:: Or (b_is_zero, overflow_cond), vec_one, vec_b);
178+ const auto safe_div_mask = hn::Not (Or (b_is_zero, overflow_cond));
179+ const auto safe_b = hn::IfThenElse (Or (b_is_zero, overflow_cond), vec_one, vec_b);
198180
199- auto vec_div = hn:: Div (vec_a, safe_b);
181+ auto vec_div = Div (vec_a, safe_b);
200182
201183 if (!hn::AllFalse (d, safe_div_mask)) {
202- const auto vec_mul = hn:: Mul (vec_div, safe_b);
184+ const auto vec_mul = Mul (vec_div, safe_b);
203185 const auto has_remainder = hn::Ne (vec_a, vec_mul);
204- const auto a_sign = hn:: Lt (vec_a, vec_zero);
205- const auto b_sign = hn:: Lt (vec_b, vec_zero);
206- const auto different_signs = hn:: Xor (a_sign, b_sign);
207- const auto needs_adjustment = hn:: And (safe_div_mask,
208- hn:: And (has_remainder, different_signs));
186+ const auto a_sign = Lt (vec_a, vec_zero);
187+ const auto b_sign = Lt (vec_b, vec_zero);
188+ const auto different_signs = Xor (a_sign, b_sign);
189+ const auto needs_adjustment = And (safe_div_mask,
190+ And (has_remainder, different_signs));
209191
210192 vec_div = hn::MaskedSubOr (vec_div, needs_adjustment, vec_div, vec_one);
211193 }
212194
213195 vec_div = hn::IfThenElse (b_is_zero, vec_zero, vec_div);
214196 vec_div = hn::IfThenElse (overflow_cond, vec_min_val, vec_div);
215197
216- hn:: StoreU (vec_div, d , dst + i);
198+ StoreU (vec_div, dst + i);
217199
218200 if (!raise_divbyzero && !hn::AllFalse (d, b_is_zero)) {
219201 raise_divbyzero = true ;
@@ -249,30 +231,32 @@ void simd_divide_contig_signed(T* src1, T* src2, T* dst, npy_intp len) {
249231 set_float_status (raise_overflow, raise_divbyzero);
250232}
251233
252- // Unsigned division for arrays
234+ // Unsigned integer DIVIDE array / array
235+
253236template <typename T>
254237void simd_divide_contig_unsigned (T* src1, T* src2, T* dst, npy_intp len) {
255- using D = hn::ScalableTag <T>;
238+ using D = _Tag <T>;
256239 const D d;
257- const size_t N = hn::Lanes (d);
240+ const size_t N = Lanes (T{});
241+
258242 bool raise_divbyzero = false ;
259- const auto vec_zero = hn:: Zero(d );
260- const auto vec_one = hn:: Set (d, static_cast <T>(1 ));
243+ const auto vec_zero = Zero<T>( );
244+ const auto vec_one = Set (static_cast <T>(1 ));
261245
262246 size_t i = 0 ;
263247 for (; i + N <= static_cast <size_t >(len); i += N) {
264- const auto vec_a = hn:: LoadU (d, src1 + i);
265- const auto vec_b = hn:: LoadU (d, src2 + i);
248+ const auto vec_a = LoadU (src1 + i);
249+ const auto vec_b = LoadU (src2 + i);
266250
267- const auto b_is_zero = hn:: Eq (vec_b, vec_zero);
251+ const auto b_is_zero = Eq (vec_b, vec_zero);
268252
269253 const auto safe_b = hn::IfThenElse (b_is_zero, vec_one, vec_b);
270254
271- auto vec_div = hn:: Div (vec_a, safe_b);
255+ auto vec_div = Div (vec_a, safe_b);
272256
273257 vec_div = hn::IfThenElse (b_is_zero, vec_zero, vec_div);
274258
275- hn:: StoreU (vec_div, d , dst + i);
259+ StoreU (vec_div, dst + i);
276260
277261 if (!raise_divbyzero && !hn::AllFalse (d, b_is_zero)) {
278262 raise_divbyzero = true ;
@@ -296,17 +280,40 @@ void simd_divide_contig_unsigned(T* src1, T* src2, T* dst, npy_intp len) {
296280 set_float_status (false , raise_divbyzero);
297281}
298282
283+ #endif // NPY_HWY
284+
285+ // Floor division for signed integers
286+ template <typename T>
287+ T floor_div (T n, T d) {
288+ if (NPY_UNLIKELY (d == 0 || (n == std::numeric_limits<T>::min () && d == -1 ))) {
289+ if (d == 0 ) {
290+ npy_set_floatstatus_divbyzero ();
291+ return 0 ;
292+ }
293+ else {
294+ npy_set_floatstatus_overflow ();
295+ return std::numeric_limits<T>::min ();
296+ }
297+ }
298+ T r = n / d;
299+ if (((n > 0 ) != (d > 0 )) && ((r * d) != n)) {
300+ --r;
301+ }
302+ return r;
303+ }
304+
305+
299306// Dispatch functions for signed integer division
300307template <typename T>
301308void TYPE_divide (char **args, npy_intp const *dimensions, npy_intp const *steps, void *NPY_UNUSED (func)) {
302309 npy_clear_floatstatus ();
303310 if (IS_BINARY_REDUCE ) {
304311 BINARY_REDUCE_LOOP (T) {
305312 const T divisor = *reinterpret_cast <T*>(ip2);
306- if (HWY_UNLIKELY (divisor == 0 )) {
313+ if (NPY_UNLIKELY (divisor == 0 )) {
307314 npy_set_floatstatus_divbyzero ();
308315 io1 = 0 ;
309- } else if (HWY_UNLIKELY (io1 == std::numeric_limits<T>::min () && divisor == -1 )) {
316+ } else if (NPY_UNLIKELY (io1 == std::numeric_limits<T>::min () && divisor == -1 )) {
310317 npy_set_floatstatus_overflow ();
311318 io1 = std::numeric_limits<T>::min ();
312319 } else {
@@ -316,7 +323,8 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
316323 *reinterpret_cast <T*>(iop1) = io1;
317324 return ;
318325 }
319- #if NPY_SIMD
326+
327+ #if NPY_HWY
320328 // Handle array-array case
321329 if (IS_BLOCKABLE_BINARY (sizeof (T), NPY_SIMD_WIDTH ))
322330 {
@@ -343,18 +351,19 @@ void TYPE_divide(char **args, npy_intp const *dimensions, npy_intp const *steps,
343351 return ;
344352 }
345353 }
346- #endif // NPY_SIMD
354+ #endif // NPY_HWY
347355
356+ // Scalar fallback
348357 // Fallback for non-blockable, in-place, or zero divisor cases
349358 BINARY_LOOP {
350359 const T dividend = *reinterpret_cast <T*>(ip1);
351360 const T divisor = *reinterpret_cast <T*>(ip2);
352361 T* result = reinterpret_cast <T*>(op1);
353362
354- if (HWY_UNLIKELY (divisor == 0 )) {
363+ if (NPY_UNLIKELY (divisor == 0 )) {
355364 npy_set_floatstatus_divbyzero ();
356365 *result = 0 ;
357- } else if (HWY_UNLIKELY (dividend == std::numeric_limits<T>::min () && divisor == -1 )) {
366+ } else if (NPY_UNLIKELY (dividend == std::numeric_limits<T>::min () && divisor == -1 )) {
358367 npy_set_floatstatus_overflow ();
359368 *result = std::numeric_limits<T>::min ();
360369 } else {
@@ -370,7 +379,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
370379 if (IS_BINARY_REDUCE ) {
371380 BINARY_REDUCE_LOOP (T) {
372381 const T d = *reinterpret_cast <T*>(ip2);
373- if (HWY_UNLIKELY (d == 0 )) {
382+ if (NPY_UNLIKELY (d == 0 )) {
374383 npy_set_floatstatus_divbyzero ();
375384 io1 = 0 ;
376385 } else {
@@ -380,7 +389,7 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
380389 *reinterpret_cast <T*>(iop1) = io1;
381390 return ;
382391 }
383- #if NPY_SIMD
392+ #if NPY_HWY
384393 // Handle array-array case
385394 if (IS_BLOCKABLE_BINARY (sizeof (T), NPY_SIMD_WIDTH )) {
386395 bool no_overlap = nomemoverlap (args[2 ], steps[2 ], args[0 ], steps[0 ], dimensions[0 ]) &&
@@ -406,13 +415,13 @@ void TYPE_divide_unsigned(char **args, npy_intp const *dimensions, npy_intp cons
406415 return ;
407416 }
408417 }
409- #endif // NPY_SIMD
418+ #endif // NPY_HWY
410419
411420 // Fallback for non-blockable, in-place, or zero divisor cases
412421 BINARY_LOOP {
413422 const T in1 = *reinterpret_cast <T*>(ip1);
414423 const T in2 = *reinterpret_cast <T*>(ip2);
415- if (HWY_UNLIKELY (in2 == 0 )) {
424+ if (NPY_UNLIKELY (in2 == 0 )) {
416425 npy_set_floatstatus_divbyzero ();
417426 *reinterpret_cast <T*>(op1) = 0 ;
418427 } else {
@@ -465,7 +474,7 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
465474 T* indexed = (T*)(ip1 + is1 * indx);
466475 T divisor = *(T*)value;
467476
468- if (HWY_UNLIKELY (divisor == 0 )) {
477+ if (NPY_UNLIKELY (divisor == 0 )) {
469478 npy_set_floatstatus_divbyzero ();
470479 *indexed = 0 ;
471480 } else {
@@ -524,6 +533,3 @@ int TYPE_divide_unsigned_indexed(PyArrayMethod_Context *NPY_UNUSED(context),
524533
525534#undef DEFINE_DIVIDE_FUNCTION
526535#undef DEFINE_DIVIDE_FUNCTION_UNSIGNED
527-
528- } // namespace HWY_NAMESPACE
529- HWY_AFTER_NAMESPACE ();
0 commit comments