Skip to content

Commit 5065dcd

Browse files
committed
Playing with hsums
1 parent 07b5d73 commit 5065dcd

File tree

1 file changed

+22
-0
lines changed

1 file changed

+22
-0
lines changed

ggml/src/iqk/iqk_mul_mat.cpp

+22
Original file line numberDiff line numberDiff line change
@@ -268,11 +268,33 @@ IQK_ALWAYS_INLINE __m256 hsum_float_8x8(__m256 * accm) {
268268
for (int i = 0; i < 2; ++i) accm[i] = _mm256_add_ps(_mm256_unpacklo_ps(accm[i], accm[i+2]), _mm256_unpackhi_ps(accm[i], accm[i+2]));
269269
return _mm256_add_ps(_mm256_unpacklo_ps(accm[0], accm[1]), _mm256_unpackhi_ps(accm[0], accm[1]));
270270
}
271+
#ifdef HAVE_FANCY_SIMD
271272
// Stores the eight lanes of hsum_float_8x8(accm) via info.store(ix, iy, value), iy = 0..7.
//   ix   - first index forwarded unchanged to every info.store() call
//   accm - accumulator array handed to hsum_float_8x8(), which overwrites some of its elements
//   info - destination object; only its store() member is used here
// The result vector is spilled with _mm256_storeu_ps instead of being read back
// through a union: reading a union member other than the one last written is
// undefined behavior in C++, even though common compilers tolerate it.
IQK_ALWAYS_INLINE void store_8(int ix, __m256 * accm, const DataInfo& info) {
    float val[8];
    _mm256_storeu_ps(val, hsum_float_8x8(accm));
    for (int iy = 0; iy < 8; ++iy) {
        info.store(ix, iy, val[iy]);
    }
}
277+
#else
278+
// Somehow on the AVX2 system that I have available (Ryzen-5975WX), the store_8 version above
279+
// and the commented-out store_8 version below are both slower than this one.
280+
// Reduces accm[0..7] one at a time with hsum_float_8() and passes each result
// to info.store(ix, row, value), in increasing row order.
//   ix   - first index forwarded unchanged to every info.store() call
//   accm - array of at least eight accumulators; read only by this function
//   info - destination object; only its store() member is used here
IQK_ALWAYS_INLINE void store_8(int ix, __m256 * accm, const DataInfo& info) {
    int row = 0;
    while (row < 8) {
        info.store(ix, row, hsum_float_8(accm[row]));
        ++row;
    }
}
283+
//IQK_ALWAYS_INLINE __m128 hsum_float_4x4(__m128 * a) {
284+
// for (int i = 0; i < 2; ++i) a[i] = _mm_add_ps(_mm_unpacklo_ps(a[i], a[i+2]), _mm_unpackhi_ps(a[i], a[i+2]));
285+
// return _mm_add_ps(_mm_unpacklo_ps(a[0], a[1]), _mm_unpackhi_ps(a[0], a[1]));
286+
//}
287+
//IQK_ALWAYS_INLINE void store_8(int ix, __m256 * accm, const DataInfo& info) {
288+
// union { __m128 vec; float val[4]; } h;
289+
// __m128 a[4];
290+
// for (int i = 0; i < 4; ++i) a[i] = _mm_add_ps(_mm256_castps256_ps128(accm[i]), _mm256_extractf128_ps(accm[i], 1));
291+
// h.vec = hsum_float_4x4(a);
292+
// for (int iy = 0; iy < 4; ++iy) info.store(ix, iy, h.val[iy]);
293+
// for (int i = 0; i < 4; ++i) a[i] = _mm_add_ps(_mm256_castps256_ps128(accm[i+4]), _mm256_extractf128_ps(accm[i+4], 1));
294+
// h.vec = hsum_float_4x4(a);
295+
// for (int iy = 0; iy < 4; ++iy) info.store(ix, iy+4, h.val[iy]);
//}
296+
#endif
297+
276298

277299
// Builds a 256-bit integer vector from two 128-bit halves: b becomes the low
// 128 bits (via the cast) and a is inserted as the high 128 bits (lane index 1).
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
278300

0 commit comments

Comments
 (0)