|
| 1 | +diff --git a/bestla/bestla/bestla_prologue_b.h b/bestla/bestla/bestla_prologue_b.h |
| 2 | +index 99f3ccc..a11de9d 100644 |
| 3 | +--- a/bestla/bestla/bestla_prologue_b.h |
| 4 | ++++ b/bestla/bestla/bestla_prologue_b.h |
| 5 | +@@ -456,9 +456,8 @@ class WeightKBlockNInteger { |
| 6 | + auto tmpscales = tmp; |
| 7 | + auto tmpzeropoints = reinterpret_cast<int8_t*>(tmpscales + N * blks); |
| 8 | + if (scales) { |
| 9 | +- for (size_t i = 0; i < N * blks; i += 2) { |
| 10 | ++ for (size_t i = 0; i < N * blks; i ++) { |
| 11 | + tmpscales[i] = scales[i] / 16; |
| 12 | +- tmpscales[i + 1] = scales[i + 1] / 16; |
| 13 | + } |
| 14 | + } |
| 15 | + if (zero_points) { |
| 16 | +diff --git a/bestla/bestla/kernel_avx512f.h b/bestla/bestla/kernel_avx512f.h |
| 17 | +index 6783ee8..59822e5 100644 |
| 18 | +--- a/bestla/bestla/kernel_avx512f.h |
| 19 | ++++ b/bestla/bestla/kernel_avx512f.h |
| 20 | +@@ -673,8 +673,8 @@ inline BTLA_CODE decompress_kblock_s3_s8fp(utils::bit2x4* bit2ptr, utils::bit1x8 |
| 21 | + zmm1 = _mm512_sllv_epi32(zmm1, zmm_shift); // int3_clip => int8 |
| 22 | + zmm2 = _mm512_sllv_epi32(zmm2, zmm_shift); // int3_clip => int8 |
| 23 | + |
| 24 | +- _mm512_storeu_epi8((__m512i*)dst, zmm1); |
| 25 | +- _mm512_storeu_epi8((__m512i*)(dst + 64), zmm2); |
| 26 | ++ _mm512_storeu_si512((__m512i*)dst, zmm1); |
| 27 | ++ _mm512_storeu_si512((__m512i*)(dst + 64), zmm2); |
| 28 | + }; |
| 29 | + |
| 30 | + assert(head_ignore_num % 8 == 0); |
0 commit comments