Skip to content

Commit 252fba7

Browse files
committed
Add: u8 kernels & FMA
1 parent 71c68fe commit 252fba7

File tree

9 files changed

+603
-57
lines changed

9 files changed

+603
-57
lines changed

README.md

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -91,15 +91,16 @@ You can learn more about the technical implementation details in the following b
9191
## Benchmarks
9292

9393
For reference, we use 1536-dimensional vectors, like the embeddings produced by the OpenAI Ada API.
94-
Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements:
94+
Comparing the serial code throughput produced by GCC 12 to hand-optimized kernels in SimSIMD, we see the following single-core improvements for the two most common vector-vector similarity metrics — the Cosine similarity and the Euclidean distance:
9595

96-
| Type | Apple M2 Pro | AMD Genoa | AWS Graviton 4 |
97-
| :----- | ---------------------------------: | ---------------------------------: | ---------------------------------: |
98-
| `f64` | 18.5 → 28.8 GB/s <br/> + 56 % | 21.9 → 41.4 GB/s <br/> + 89 % | 20.7 → 41.3 GB/s <br/> + 99 % |
99-
| `f32` | 9.2 → 29.6 GB/s <br/> + 221 % | 10.9 → 95.8 GB/s <br/> + 779 % | 4.9 → 41.9 GB/s <br/> + 755 % |
100-
| `f16` | 4.6 → 14.6 GB/s <br/> + 217 % | 3.1 → 108.4 GB/s <br/> + 3,397 % | 5.4 → 39.3 GB/s <br/> + 627 % |
101-
| `bf16` | 4.6 → 26.3 GB/s <br/> + 472 % | 0.8 → 59.5 GB/s <br/> +7,437 % | 2.5 → 29.9 GB/s <br/> + 1,096 % |
102-
| `i8` | 25.8 → 47.1 GB/s <br/> + 83 % | 33.1 → 65.3 GB/s <br/> + 97 % | 35.2 → 43.5 GB/s <br/> + 24 % |
96+
| Type | Apple M2 Pro | Intel Sapphire Rapids | AWS Graviton 4 |
97+
| :----- | ----------------------------: | -------------------------------: | ------------------------------: |
98+
| `f64` | 18.5 → 28.8 GB/s <br/> + 56 % | 21.9 → 41.4 GB/s <br/> + 89 % | 20.7 → 41.3 GB/s <br/> + 99 % |
99+
| `f32` | 9.2 → 29.6 GB/s <br/> + 221 % | 10.9 → 95.8 GB/s <br/> + 779 % | 4.9 → 41.9 GB/s <br/> + 755 % |
100+
| `f16` | 4.6 → 14.6 GB/s <br/> + 217 % | 3.1 → 108.4 GB/s <br/> + 3,397 % | 5.4 → 39.3 GB/s <br/> + 627 % |
101+
| `bf16` | 4.6 → 26.3 GB/s <br/> + 472 % | 0.8 → 59.5 GB/s <br/> + 7,437 % | 2.5 → 29.9 GB/s <br/> + 1,096 % |
102+
| `i8` | 25.8 → 47.1 GB/s <br/> + 83 % | 33.1 → 65.3 GB/s <br/> + 97 % | 35.2 → 43.5 GB/s <br/> + 24 % |
103+
| `u8` | | 32.5 → 66.5 GB/s <br/> + 105 % | |
103104

104105
Similar speedups are often observed even when compared to BLAS and LAPACK libraries underlying most numerical computing libraries, including NumPy and SciPy in Python.
105106
Broader benchmarking results:
@@ -112,7 +113,7 @@ Broader benchmarking results:
112113

113114
The package is intended to replace the usage of `numpy.inner`, `numpy.dot`, and `scipy.spatial.distance`.
114115
Aside from drastic performance improvements, SimSIMD significantly improves accuracy in mixed precision setups.
115-
NumPy and SciPy, processing `i8` or `f16` vectors, will use the same types for accumulators, while SimSIMD can combine `i8` enumeration, `i16` multiplication, and `i32` accumulation to avoid overflows entirely.
116+
NumPy and SciPy, processing `i8`, `u8` or `f16` vectors, will use the same types for accumulators, while SimSIMD can combine `i8` enumeration, `i16` multiplication, and `i32` accumulation to avoid overflows entirely.
116117
The same applies to processing `f16` and `bf16` values with `f32` precision.
117118

118119
### Installation

c/lib.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,6 +108,8 @@ extern "C" {
108108
}
109109

110110
// Dot products
111+
SIMSIMD_DECLARATION_DENSE(dot, i8, i8)
112+
SIMSIMD_DECLARATION_DENSE(dot, u8, u8)
111113
SIMSIMD_DECLARATION_DENSE(dot, f16, f16)
112114
SIMSIMD_DECLARATION_DENSE(dot, bf16, bf16)
113115
SIMSIMD_DECLARATION_DENSE(dot, f32, f32)
@@ -123,16 +125,19 @@ SIMSIMD_DECLARATION_DENSE(vdot, f64c, f64)
123125

124126
// Spatial distances
125127
SIMSIMD_DECLARATION_DENSE(cos, i8, i8)
128+
SIMSIMD_DECLARATION_DENSE(cos, u8, u8)
126129
SIMSIMD_DECLARATION_DENSE(cos, f16, f16)
127130
SIMSIMD_DECLARATION_DENSE(cos, bf16, bf16)
128131
SIMSIMD_DECLARATION_DENSE(cos, f32, f32)
129132
SIMSIMD_DECLARATION_DENSE(cos, f64, f64)
130133
SIMSIMD_DECLARATION_DENSE(l2sq, i8, i8)
134+
SIMSIMD_DECLARATION_DENSE(l2sq, u8, u8)
131135
SIMSIMD_DECLARATION_DENSE(l2sq, f16, f16)
132136
SIMSIMD_DECLARATION_DENSE(l2sq, bf16, bf16)
133137
SIMSIMD_DECLARATION_DENSE(l2sq, f32, f32)
134138
SIMSIMD_DECLARATION_DENSE(l2sq, f64, f64)
135139
SIMSIMD_DECLARATION_DENSE(l2, i8, i8)
140+
SIMSIMD_DECLARATION_DENSE(l2, u8, u8)
136141
SIMSIMD_DECLARATION_DENSE(l2, f16, f16)
137142
SIMSIMD_DECLARATION_DENSE(l2, bf16, bf16)
138143
SIMSIMD_DECLARATION_DENSE(l2, f32, f32)
@@ -199,10 +204,13 @@ SIMSIMD_DYNAMIC simsimd_capability_t simsimd_capabilities(void) {
199204
void* dummy = 0;
200205

201206
// Dense:
207+
simsimd_dot_i8((simsimd_i8_t*)dummy, (simsimd_i8_t*)dummy, 0, dummy_results);
208+
simsimd_dot_u8((simsimd_u8_t*)dummy, (simsimd_u8_t*)dummy, 0, dummy_results);
202209
simsimd_dot_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
203210
simsimd_dot_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
204211
simsimd_dot_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
205212
simsimd_dot_f64((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);
213+
206214
simsimd_dot_f16c((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
207215
simsimd_dot_bf16c((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
208216
simsimd_dot_f32c((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
@@ -211,23 +219,32 @@ SIMSIMD_DYNAMIC simsimd_capability_t simsimd_capabilities(void) {
211219
simsimd_vdot_bf16c((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
212220
simsimd_vdot_f32c((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
213221
simsimd_vdot_f64c((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);
222+
214223
simsimd_cos_i8((simsimd_i8_t*)dummy, (simsimd_i8_t*)dummy, 0, dummy_results);
224+
simsimd_cos_u8((simsimd_u8_t*)dummy, (simsimd_u8_t*)dummy, 0, dummy_results);
215225
simsimd_cos_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
216226
simsimd_cos_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
217227
simsimd_cos_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
218228
simsimd_cos_f64((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);
229+
219230
simsimd_l2sq_i8((simsimd_i8_t*)dummy, (simsimd_i8_t*)dummy, 0, dummy_results);
231+
simsimd_l2sq_u8((simsimd_u8_t*)dummy, (simsimd_u8_t*)dummy, 0, dummy_results);
220232
simsimd_l2sq_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
221233
simsimd_l2sq_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
222234
simsimd_l2sq_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
223235
simsimd_l2sq_f64((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);
236+
224237
simsimd_l2_i8((simsimd_i8_t*)dummy, (simsimd_i8_t*)dummy, 0, dummy_results);
238+

simsimd_l2_u8((simsimd_u8_t*)dummy, (simsimd_u8_t*)dummy, 0, dummy_results);
225240
simsimd_l2_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
226241
simsimd_l2_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
227242
simsimd_l2_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);
228243
simsimd_l2_f64((simsimd_f64_t*)dummy, (simsimd_f64_t*)dummy, 0, dummy_results);
244+
229245
simsimd_hamming_b8((simsimd_b8_t*)dummy, (simsimd_b8_t*)dummy, 0, dummy_results);
230246
simsimd_jaccard_b8((simsimd_b8_t*)dummy, (simsimd_b8_t*)dummy, 0, dummy_results);
247+
231248
simsimd_kl_f16((simsimd_f16_t*)dummy, (simsimd_f16_t*)dummy, 0, dummy_results);
232249
simsimd_kl_bf16((simsimd_bf16_t*)dummy, (simsimd_bf16_t*)dummy, 0, dummy_results);
233250
simsimd_kl_f32((simsimd_f32_t*)dummy, (simsimd_f32_t*)dummy, 0, dummy_results);

include/simsimd/binary.h

Lines changed: 14 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -319,7 +319,20 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c
319319
simsimd_distance_t* result) {
320320

321321
simsimd_size_t intersection = 0, union_ = 0;
322-
// It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
322+
//? On such vectors we can clearly see that the CPU struggles to perform this many parallel
323+
//? population counts, because the throughput of Jaccard and Hamming in this case starts to differ.
324+
//? One optimization, aside from Harley-Seal transforms can be using "shuffles" for nibble-popcount
325+
//? lookups, to utilize other ports on the CPU.
326+
//? https://github.com/ashvardanian/SimSIMD/pull/138
327+
//
328+
// - `_mm512_popcnt_epi64` maps to `VPOPCNTQ (ZMM, K, ZMM)`:
329+
// - On Ice Lake: 3 cycles latency, ports: 1*p5
330+
// - On Genoa: 2 cycles latency, ports: 1*FP01
331+
// - `_mm512_shuffle_epi8` maps to `VPSHUFB (ZMM, ZMM, ZMM)`:
332+
// - On Ice Lake: 1 cycle latency, ports: 1*p5
333+
// - On Genoa: 2 cycles latency, ports: 1*FP12
334+
//
335+
// It's harder to squeeze out performance from tiny representations, so we unroll the loops for binary metrics.
323336
if (n_words <= 64) { // Up to 512 bits.
324337
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words);
325338
__m512i a_vec = _mm512_maskz_loadu_epi8(mask, a);
@@ -341,20 +354,6 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c
341354
intersection = _mm512_reduce_add_epi64(_mm512_add_epi64(and2_count_vec, and1_count_vec));
342355
union_ = _mm512_reduce_add_epi64(_mm512_add_epi64(or2_count_vec, or1_count_vec));
343356
} else if (n_words <= 196) { // Up to 1568 bits.
344-
// TODO: On such vectors we can clearly see that the CPU struggles to perform this many parallel
345-
// population counts, because the throughput of Jaccard and Hamming in this case starts to differ.
346-
// One optimization, aside from Harley-Seal transforms can be using "shuffles" for nibble-popcount
347-
// lookups, to utilize other ports on the CPU.
348-
// https://github.com/ashvardanian/SimSIMD/pull/138
349-
//
350-
// On Ice Lake:
351-
// - `VPOPCNTQ (ZMM, K, ZMM)` can only execute on port 5, which is a bottleneck.
352-
// - `VPSHUFB (ZMM, ZMM, ZMM)` can only run on the same port 5 as well!
353-
// On Zen4:
354-
// - `VPOPCNTQ (ZMM, K, ZMM)` can run on ports: 0, 1.
355-
// - `VPSHUFB (ZMM, ZMM, ZMM)` can run on ports: 1, 2.
356-
// https://uops.info/table.html?search=VPOPCNTQ%20(ZMM%2C%20K%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on
357-
// https://uops.info/table.html?search=VPSHUFB%20(ZMM%2C%20ZMM%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on
358357
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 128);
359358
__m512i a1_vec = _mm512_loadu_epi8(a);
360359
__m512i b1_vec = _mm512_loadu_epi8(b);

0 commit comments

Comments
 (0)