Commit a41d6c0

arm neon riscv64: add min.h and max.h RVV implementations. (#1283)
1 parent 3e5facc commit a41d6c0

File tree

2 files changed, +330 -70 lines changed

simde/arm/neon/max.h (+191 -38)
@@ -24,6 +24,7 @@
  * 2020 Evan Nemerson <[email protected]>
  * 2020 Sean Maher <[email protected]> (Copyright owned by Google, LLC)
  * 2023 Yi-Yen Chung <[email protected]> (Copyright owned by Andes Technology)
+ * 2023 Yung-Cheng Su <[email protected]>
  */

 #if !defined(SIMDE_ARM_NEON_MAX_H)
@@ -96,14 +97,27 @@ simde_vmax_f32(simde_float32x2_t a, simde_float32x2_t b) {
       a_ = simde_float32x2_to_private(a),
       b_ = simde_float32x2_to_private(b);

-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+    #if defined(SIMDE_RISCV_V_NATIVE)
       #if !defined(SIMDE_FAST_NANS)
-        r_.values[i] = (a_.values[i] >= b_.values[i]) ? a_.values[i] : ((a_.values[i] < b_.values[i]) ? b_.values[i] : SIMDE_MATH_NANF);
+        vbool32_t va_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv64 , 2) , 512 , 2);
+        vbool32_t vb_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(b_.sv64 , 2) , 512 , 2);
+        vbool32_t vab_mask = __riscv_vmnor_mm_b32(va_mask , vb_mask , 2);
+        vfloat32m1_t vnan = __riscv_vfmv_v_f_f32m1(SIMDE_MATH_NANF , 2);
+        r_.sv64 = __riscv_vfmax_vv_f32m1_m(vab_mask , a_.sv64 , b_.sv64 , 2);
+        r_.sv64 = __riscv_vmerge_vvm_f32m1(vnan , r_.sv64 , vab_mask , 2);
       #else
-        r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
+        r_.sv64 = __riscv_vfmax_vv_f32m1(a_.sv64, b_.sv64, 2);
       #endif
-    }
+    #else
+      SIMDE_VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+        #if !defined(SIMDE_FAST_NANS)
+          r_.values[i] = (a_.values[i] >= b_.values[i]) ? a_.values[i] : ((a_.values[i] < b_.values[i]) ? b_.values[i] : SIMDE_MATH_NANF);
+        #else
+          r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
+        #endif
+      }
+    #endif

     return simde_float32x2_from_private(r_);
   #endif
@@ -124,14 +138,28 @@ simde_vmax_f64(simde_float64x1_t a, simde_float64x1_t b) {
       a_ = simde_float64x1_to_private(a),
       b_ = simde_float64x1_to_private(b);

-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+    #if defined(SIMDE_RISCV_V_NATIVE)
       #if !defined(SIMDE_FAST_NANS)
-        r_.values[i] = (a_.values[i] >= b_.values[i]) ? a_.values[i] : ((a_.values[i] < b_.values[i]) ? b_.values[i] : SIMDE_MATH_NAN);
+        simde_float64 nan = SIMDE_MATH_NAN;
+        vbool64_t va_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv64 , 1) , 512 , 1);
+        vbool64_t vb_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(b_.sv64 , 1) , 512 , 1);
+        vbool64_t vab_mask = __riscv_vmnor_mm_b64(va_mask , vb_mask , 1);
+        vfloat64m1_t vnan = __riscv_vfmv_v_f_f64m1(nan , 1);
+        r_.sv64 = __riscv_vfmax_vv_f64m1_m(vab_mask , a_.sv64 , b_.sv64 , 1);
+        r_.sv64 = __riscv_vmerge_vvm_f64m1(vnan, r_.sv64, vab_mask , 1);
       #else
-        r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
+        r_.sv64 = __riscv_vfmax_vv_f64m1(a_.sv64, b_.sv64, 1);
       #endif
-    }
+    #else
+      SIMDE_VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+        #if !defined(SIMDE_FAST_NANS)
+          r_.values[i] = (a_.values[i] >= b_.values[i]) ? a_.values[i] : ((a_.values[i] < b_.values[i]) ? b_.values[i] : SIMDE_MATH_NAN);
+        #else
+          r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
+        #endif
+      }
+    #endif

     return simde_float64x1_from_private(r_);
   #endif
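A note on the two float hunks above (the same pattern reappears below in simde_vmaxq_f32 and simde_vmaxq_f64): when SIMDE_FAST_NANS is not defined, NEON's vmax semantics require any NaN input to yield NaN, while RVV's vfmax prefers the non-NaN operand. The added code therefore classifies NaN lanes with vfclass (bit 9, value 512, flags a quiet NaN), builds a "neither operand is NaN" mask with vmnor, runs a masked vfmax on those lanes, and merges NaN into the remaining ones. The following standalone sketch is not part of the commit; the helper name and the scalar test harness are illustrative, only the intrinsic sequence is taken from the diff.

/* Minimal sketch: NaN-propagating max for 2 x f32, mirroring the
 * SIMDE_RISCV_V_NATIVE / !SIMDE_FAST_NANS branch above.
 * Build for a V-extension target, e.g. with -march=rv64gcv. */
#include <riscv_vector.h>
#include <math.h>
#include <stdio.h>

static vfloat32m1_t nan_propagating_max_f32(vfloat32m1_t a, vfloat32m1_t b, size_t vl) {
  /* vfclass sets bit 9 (value 512) for quiet-NaN lanes. */
  vbool32_t a_is_nan = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a, vl), 512, vl);
  vbool32_t b_is_nan = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(b, vl), 512, vl);
  /* NOR of the two masks: true only where neither operand is NaN. */
  vbool32_t neither_nan = __riscv_vmnor_mm_b32(a_is_nan, b_is_nan, vl);
  vfloat32m1_t vnan = __riscv_vfmv_v_f_f32m1(NAN, vl); /* simde uses SIMDE_MATH_NANF here */
  /* Masked vfmax on the clean lanes, then merge NaN into the masked-off lanes. */
  vfloat32m1_t r = __riscv_vfmax_vv_f32m1_m(neither_nan, a, b, vl);
  return __riscv_vmerge_vvm_f32m1(vnan, r, neither_nan, vl);
}

int main(void) {
  float a[2] = { 1.0f, NAN }, b[2] = { 2.0f, 5.0f }, r[2];
  size_t vl = 2;
  vfloat32m1_t va = __riscv_vle32_v_f32m1(a, vl);
  vfloat32m1_t vb = __riscv_vle32_v_f32m1(b, vl);
  __riscv_vse32_v_f32m1(r, nan_propagating_max_f32(va, vb, vl), vl);
  printf("%f %f\n", r[0], r[1]); /* expected: 2.000000 nan */
  return 0;
}

When SIMDE_FAST_NANS is defined, the commit instead calls the plain __riscv_vfmax_vv_* intrinsic, accepting unspecified results for NaN inputs.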
@@ -154,10 +182,14 @@ simde_vmax_s8(simde_int8x8_t a, simde_int8x8_t b) {
       a_ = simde_int8x8_to_private(a),
       b_ = simde_int8x8_to_private(b);

-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
-    }
+    #if defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv64 = __riscv_vmax_vv_i8m1(a_.sv64, b_.sv64, 8);
+    #else
+      SIMDE_VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+        r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
+      }
+    #endif

     return simde_int8x8_from_private(r_);
   #endif
@@ -180,10 +212,14 @@ simde_vmax_s16(simde_int16x4_t a, simde_int16x4_t b) {
       a_ = simde_int16x4_to_private(a),
       b_ = simde_int16x4_to_private(b);

-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
-    }
+    #if defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv64 = __riscv_vmax_vv_i16m1(a_.sv64, b_.sv64, 4);
+    #else
+      SIMDE_VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+        r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
+      }
+    #endif

     return simde_int16x4_from_private(r_);
   #endif
@@ -206,10 +242,14 @@ simde_vmax_s32(simde_int32x2_t a, simde_int32x2_t b) {
       a_ = simde_int32x2_to_private(a),
       b_ = simde_int32x2_to_private(b);

-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
-    }
+    #if defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv64 = __riscv_vmax_vv_i32m1(a_.sv64, b_.sv64, 2);
+    #else
+      SIMDE_VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+        r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
+      }
+    #endif

     return simde_int32x2_from_private(r_);
   #endif
@@ -230,10 +270,14 @@ simde_x_vmax_s64(simde_int64x1_t a, simde_int64x1_t b) {
       a_ = simde_int64x1_to_private(a),
       b_ = simde_int64x1_to_private(b);

-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
-    }
+    #if defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv64 = __riscv_vmax_vv_i64m1(a_.sv64, b_.sv64, 1);
+    #else
+      SIMDE_VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+        r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
+      }
+    #endif

     return simde_int64x1_from_private(r_);
   #endif
@@ -252,10 +296,14 @@ simde_vmax_u8(simde_uint8x8_t a, simde_uint8x8_t b) {
       a_ = simde_uint8x8_to_private(a),
       b_ = simde_uint8x8_to_private(b);

-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
-    }
+    #if defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv64 = __riscv_vmaxu_vv_u8m1(a_.sv64, b_.sv64, 8);
+    #else
+      SIMDE_VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+        r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
+      }
+    #endif

     return simde_uint8x8_from_private(r_);
   #endif
@@ -281,6 +329,8 @@ simde_vmax_u16(simde_uint16x4_t a, simde_uint16x4_t b) {
     #if defined(SIMDE_X86_MMX_NATIVE)
       /* https://github.com/simd-everywhere/simde/issues/855#issuecomment-881656284 */
       r_.m64 = _mm_add_pi16(b_.m64, _mm_subs_pu16(a_.m64, b_.m64));
+    #elif defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv64 = __riscv_vmaxu_vv_u16m1(a_.sv64, b_.sv64, 4);
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -309,10 +359,14 @@ simde_vmax_u32(simde_uint32x2_t a, simde_uint32x2_t b) {
       a_ = simde_uint32x2_to_private(a),
       b_ = simde_uint32x2_to_private(b);

-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
-    }
+    #if defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv64 = __riscv_vmaxu_vv_u32m1(a_.sv64, b_.sv64, 2);
+    #else
+      SIMDE_VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+        r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
+      }
+    #endif

     return simde_uint32x2_from_private(r_);
   #endif
@@ -333,10 +387,14 @@ simde_x_vmax_u64(simde_uint64x1_t a, simde_uint64x1_t b) {
       a_ = simde_uint64x1_to_private(a),
       b_ = simde_uint64x1_to_private(b);

-    SIMDE_VECTORIZE
-    for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
-      r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
-    }
+    #if defined(SIMDE_RISCV_V_NATIVE)
+      r_.sv64 = __riscv_vmaxu_vv_u64m1(a_.sv64, b_.sv64, 1);
+    #else
+      SIMDE_VECTORIZE
+      for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
+        r_.values[i] = (a_.values[i] > b_.values[i]) ? a_.values[i] : b_.values[i];
+      }
+    #endif

     return simde_uint64x1_from_private(r_);
   #endif
@@ -414,6 +472,17 @@ simde_vmaxq_f32(simde_float32x4_t a, simde_float32x4_t b) {
       #endif
     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
       r_.v128 = wasm_f32x4_max(a_.v128, b_.v128);
+    #elif defined(SIMDE_RISCV_V_NATIVE)
+      #if !defined(SIMDE_FAST_NANS)
+        vbool32_t va_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(a_.sv128 , 4) , 512 , 4);
+        vbool32_t vb_mask = __riscv_vmseq_vx_u32m1_b32(__riscv_vfclass_v_u32m1(b_.sv128 , 4) , 512 , 4);
+        vbool32_t vab_mask = __riscv_vmnor_mm_b32(va_mask , vb_mask , 4);
+        vfloat32m1_t vnan = __riscv_vfmv_v_f_f32m1(SIMDE_MATH_NANF , 4);
+        r_.sv128 = __riscv_vfmax_vv_f32m1_m(vab_mask , a_.sv128 , b_.sv128 , 4);
+        r_.sv128 = __riscv_vmerge_vvm_f32m1(vnan , r_.sv128 , vab_mask , 4);
+      #else
+        r_.sv128 = __riscv_vfmax_vv_f32m1(a_.sv128, b_.sv128, 4);
+      #endif
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -461,6 +530,18 @@ simde_vmaxq_f64(simde_float64x2_t a, simde_float64x2_t b) {
       #endif
     #elif defined(SIMDE_WASM_SIMD128_NATIVE)
      r_.v128 = wasm_f64x2_max(a_.v128, b_.v128);
+    #elif defined(SIMDE_RISCV_V_NATIVE)
+      #if !defined(SIMDE_FAST_NANS)
+        simde_float64 nan = SIMDE_MATH_NAN;
+        vbool64_t va_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(a_.sv128 , 2) , 512 , 2);
+        vbool64_t vb_mask = __riscv_vmseq_vx_u64m1_b64(__riscv_vfclass_v_u64m1(b_.sv128 , 2) , 512 , 2);
+        vbool64_t vab_mask = __riscv_vmnor_mm_b64(va_mask , vb_mask , 2);
+        vfloat64m1_t vnan = __riscv_vfmv_v_f_f64m1(nan , 2);
+        r_.sv128 = __riscv_vfmax_vv_f64m1_m(vab_mask , a_.sv128 , b_.sv128 , 2);
+        r_.sv128 = __riscv_vmerge_vvm_f64m1(vnan, r_.sv128, vab_mask , 2);
+      #else
+        r_.sv128 = __riscv_vfmax_vv_f64m1(a_.sv128, b_.sv128, 2);
+      #endif
     #else
       SIMDE_VECTORIZE
       for (size_t i = 0 ; i < (sizeof(r_.values) / sizeof(r_.values[0])) ; i++) {
@@ -504,6 +585,15 @@ simde_vmaxq_s8(simde_int8x16_t a, simde_int8x16_t b) {
       r_.v128 = wasm_i8x16_max(a_.v128, b_.v128);
     #endif

+    return simde_int8x16_from_private(r_);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_int8x16_private
+      r_,
+      a_ = simde_int8x16_to_private(a),
+      b_ = simde_int8x16_to_private(b);
+
+    r_.sv128 = __riscv_vmax_vv_i8m1(a_.sv128, b_.sv128, 16);
+
     return simde_int8x16_from_private(r_);
   #else
     return simde_vbslq_s8(simde_vcgtq_s8(a, b), a, b);
@@ -535,6 +625,15 @@ simde_vmaxq_s16(simde_int16x8_t a, simde_int16x8_t b) {
       r_.v128 = wasm_i16x8_max(a_.v128, b_.v128);
     #endif

+    return simde_int16x8_from_private(r_);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_int16x8_private
+      r_,
+      a_ = simde_int16x8_to_private(a),
+      b_ = simde_int16x8_to_private(b);
+
+    r_.sv128 = __riscv_vmax_vv_i16m1(a_.sv128, b_.sv128, 8);
+
     return simde_int16x8_from_private(r_);
   #else
     return simde_vbslq_s16(simde_vcgtq_s16(a, b), a, b);
@@ -566,6 +665,15 @@ simde_vmaxq_s32(simde_int32x4_t a, simde_int32x4_t b) {
       r_.v128 = wasm_i32x4_max(a_.v128, b_.v128);
     #endif

+    return simde_int32x4_from_private(r_);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_int32x4_private
+      r_,
+      a_ = simde_int32x4_to_private(a),
+      b_ = simde_int32x4_to_private(b);
+
+    r_.sv128 = __riscv_vmax_vv_i32m1(a_.sv128, b_.sv128, 4);
+
     return simde_int32x4_from_private(r_);
   #else
     return simde_vbslq_s32(simde_vcgtq_s32(a, b), a, b);
@@ -581,6 +689,15 @@ simde_int64x2_t
 simde_x_vmaxq_s64(simde_int64x2_t a, simde_int64x2_t b) {
   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
     return vec_max(a, b);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_int64x2_private
+      r_,
+      a_ = simde_int64x2_to_private(a),
+      b_ = simde_int64x2_to_private(b);
+
+    r_.sv128 = __riscv_vmax_vv_i64m1(a_.sv128, b_.sv128, 2);
+
+    return simde_int64x2_from_private(r_);
   #else
     return simde_vbslq_s64(simde_vcgtq_s64(a, b), a, b);
   #endif
@@ -607,6 +724,15 @@ simde_vmaxq_u8(simde_uint8x16_t a, simde_uint8x16_t b) {
       r_.v128 = wasm_u8x16_max(a_.v128, b_.v128);
     #endif

+    return simde_uint8x16_from_private(r_);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_uint8x16_private
+      r_,
+      a_ = simde_uint8x16_to_private(a),
+      b_ = simde_uint8x16_to_private(b);
+
+    r_.sv128 = __riscv_vmaxu_vv_u8m1(a_.sv128, b_.sv128, 16);
+
     return simde_uint8x16_from_private(r_);
   #else
     return simde_vbslq_u8(simde_vcgtq_u8(a, b), a, b);
@@ -641,6 +767,15 @@ simde_vmaxq_u16(simde_uint16x8_t a, simde_uint16x8_t b) {
       r_.v128 = wasm_u16x8_max(a_.v128, b_.v128);
     #endif

+    return simde_uint16x8_from_private(r_);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_uint16x8_private
+      r_,
+      a_ = simde_uint16x8_to_private(a),
+      b_ = simde_uint16x8_to_private(b);
+
+    r_.sv128 = __riscv_vmaxu_vv_u16m1(a_.sv128, b_.sv128, 8);
+
     return simde_uint16x8_from_private(r_);
   #else
     return simde_vbslq_u16(simde_vcgtq_u16(a, b), a, b);
@@ -672,6 +807,15 @@ simde_vmaxq_u32(simde_uint32x4_t a, simde_uint32x4_t b) {
       r_.v128 = wasm_u32x4_max(a_.v128, b_.v128);
     #endif

+    return simde_uint32x4_from_private(r_);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_uint32x4_private
+      r_,
+      a_ = simde_uint32x4_to_private(a),
+      b_ = simde_uint32x4_to_private(b);
+
+    r_.sv128 = __riscv_vmaxu_vv_u32m1(a_.sv128, b_.sv128, 4);
+
     return simde_uint32x4_from_private(r_);
   #else
     return simde_vbslq_u32(simde_vcgtq_u32(a, b), a, b);
@@ -687,6 +831,15 @@ simde_uint64x2_t
 simde_x_vmaxq_u64(simde_uint64x2_t a, simde_uint64x2_t b) {
   #if defined(SIMDE_POWER_ALTIVEC_P8_NATIVE) || defined(SIMDE_ZARCH_ZVECTOR_13_NATIVE)
     return vec_max(a, b);
+  #elif defined(SIMDE_RISCV_V_NATIVE)
+    simde_uint64x2_private
+      r_,
+      a_ = simde_uint64x2_to_private(a),
+      b_ = simde_uint64x2_to_private(b);
+
+    r_.sv128 = __riscv_vmaxu_vv_u64m1(a_.sv128, b_.sv128, 2);
+
+    return simde_uint64x2_from_private(r_);
   #else
     return simde_vbslq_u64(simde_vcgtq_u64(a, b), a, b);
   #endif
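For reference, a minimal usage sketch (not part of the commit) that exercises the new q-form path through SIMDE's portable API. It assumes simde is on the include path; on a build where SIMDE_RISCV_V_NATIVE is detected (e.g. compiled with -march=rv64gcv) it dispatches to the __riscv_vfmax-based branch added above, and elsewhere it falls back to the scalar loop.

/* Usage sketch: NaN behaviour of simde_vmaxq_f32.
 * Without SIMDE_FAST_NANS, NaN inputs propagate, matching NEON vmaxq_f32. */
#include <simde/arm/neon.h>
#include <math.h>
#include <stdio.h>

int main(void) {
  float a[4] = { 1.0f, -3.0f, NAN,  7.0f };
  float b[4] = { 2.0f, -5.0f, 4.0f, NAN  };
  float out[4];
  simde_float32x4_t r = simde_vmaxq_f32(simde_vld1q_f32(a), simde_vld1q_f32(b));
  simde_vst1q_f32(out, r);
  /* expected without SIMDE_FAST_NANS: 2 -3 nan nan */
  printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}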
