@@ -268,11 +268,33 @@ IQK_ALWAYS_INLINE __m256 hsum_float_8x8(__m256 * accm) {
268
268
for (int i = 0 ; i < 2 ; ++i) accm[i] = _mm256_add_ps (_mm256_unpacklo_ps (accm[i], accm[i+2 ]), _mm256_unpackhi_ps (accm[i], accm[i+2 ]));
269
269
return _mm256_add_ps (_mm256_unpacklo_ps (accm[0 ], accm[1 ]), _mm256_unpackhi_ps (accm[0 ], accm[1 ]));
270
270
}
271
+ #ifdef HAVE_FANCY_SIMD
271
272
// store_8 variant compiled when HAVE_FANCY_SIMD is defined.
// Calls hsum_float_8x8() on accm and hands each of the 8 floats of the returned
// vector to info.store(ix, iy, value) for iy = 0..7.
//   ix   - forwarded unchanged as the first argument of info.store()
//   accm - accumulator array passed to hsum_float_8x8(); that helper assigns to
//          accm[] entries while reducing, so the contents are clobbered on return
//   info - sink that receives the results through info.store()
// Presumably element iy of the reduced vector is the horizontal sum of the
// original accm[iy] -- confirm against the full hsum_float_8x8() definition.
IQK_ALWAYS_INLINE void store_8 (int ix, __m256 * accm, const DataInfo& info) {
272
273
union { __m256 vec; float val[8 ]; } h; // lets the __m256 result be read back as 8 floats
273
274
h.vec = hsum_float_8x8 (accm);
274
275
// NOTE(review): reading h.val after writing h.vec relies on union type punning,
// which GCC/Clang support but the C++ standard does not guarantee.
for (int iy = 0 ; iy < 8 ; ++iy) info.store (ix, iy, h.val [iy]);
275
276
}
277
+ #else
278
+ // Somehow on the AVX2 system that I have available (Ryzen-5975WX), the store_8 version above
279
+ // and the commented out store_8 version below are slower than this.
280
// store_8 variant compiled when HAVE_FANCY_SIMD is not defined.
// For each iy in 0..7 it calls hsum_float_8(accm[iy]) and passes the result to
// info.store(ix, iy, value); the accumulators are reduced one at a time rather
// than transposed together.
//   ix   - forwarded unchanged as the first argument of info.store()
//   accm - array indexed 0..7; each entry is passed by value to hsum_float_8()
//   info - sink that receives the results through info.store()
// hsum_float_8() is defined outside this view; presumably it returns the sum of
// the 8 float lanes of its argument -- confirm.
+ IQK_ALWAYS_INLINE void store_8 (int ix, __m256 * accm, const DataInfo& info) {
281
+ for (int iy = 0 ; iy < 8 ; ++iy) info.store (ix, iy, hsum_float_8 (accm[iy]));
282
+ }
283
+ // IQK_ALWAYS_INLINE __m128 hsum_float_4x4(__m128 * a) {
284
+ // for (int i = 0; i < 2; ++i) a[i] = _mm_add_ps(_mm_unpacklo_ps(a[i], a[i+2]), _mm_unpackhi_ps(a[i], a[i+2]));
285
+ // return _mm_add_ps(_mm_unpacklo_ps(a[0], a[1]), _mm_unpackhi_ps(a[0], a[1]));
286
+ // }
287
+ // IQK_ALWAYS_INLINE void store_8(int ix, __m256 * accm, const DataInfo& info) {
288
+ // union { __m128 vec; float val[4]; } h;
289
+ // __m128 a[4];
290
+ // for (int i = 0; i < 4; ++i) a[i] = _mm_add_ps(_mm256_castps256_ps128(accm[i]), _mm256_extractf128_ps(accm[i], 1));
291
+ // h.vec = hsum_float_4x4(a);
292
+ // for (int iy = 0; iy < 4; ++iy) info.store(ix, iy, h.val[iy]);
293
+ // for (int i = 0; i < 4; ++i) a[i] = _mm_add_ps(_mm256_castps256_ps128(accm[i+4]), _mm256_extractf128_ps(accm[i+4], 1));
294
+ // h.vec = hsum_float_4x4(a);
295
+ // for (int iy = 0; iy < 4; ++iy) info.store(ix, iy+4, h.val[iy]);
+ // }
296
+ #endif
297
+
276
298
277
299
// Builds a 256-bit integer vector from two 128-bit halves: b becomes the low
// 128 bits (via the cast) and a is inserted as the high 128 bits (index 1).
// The parameter list must follow the macro name with no intervening space;
// otherwise the preprocessor defines an object-like macro and every call site
// fails to compile.
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
278
300
0 commit comments