@@ -263,6 +263,20 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c
263
263
intersection = _mm512_reduce_add_epi64 (_mm512_add_epi64 (and2_count_vec , and1_count_vec ));
264
264
union_ = _mm512_reduce_add_epi64 (_mm512_add_epi64 (or2_count_vec , or1_count_vec ));
265
265
} else if (n_words <= 196 ) { // Up to 1568 bits.
266
+ // TODO: On such vectors we can clearly see that the CPU struggles to perform this many parallel
267
+ // population counts, because the throughput of Jaccard and Hamming in this case starts to differ.
268
+ // One optimization, aside from Harley-Seal transforms can be using "shuffles" for nibble-popcount
269
+ // lookups, to utilize other ports on the CPU.
270
+ // https://github.com/ashvardanian/SimSIMD/pull/138
271
+ //
272
+ // On Ice Lake:
273
+ // - `VPOPCNTQ (ZMM, K, ZMM)` can only execute on port 5, which is a bottleneck.
274
+ // - `VPSHUFB (ZMM, ZMM, ZMM)` can only run on the same port 5 as well!
275
+ // On Zen4:
276
+ // - `VPOPCNTQ (ZMM, K, ZMM)` can run on ports: 0, 1.
277
+ // - `VPSHUFB (ZMM, ZMM, ZMM)` can run on ports: 1, 2.
278
+ // https://uops.info/table.html?search=VPOPCNTQ%20(ZMM%2C%20K%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on
279
+ // https://uops.info/table.html?search=VPSHUFB%20(ZMM%2C%20ZMM%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on
266
280
__mmask64 mask = (__mmask64 )_bzhi_u64 (0xFFFFFFFFFFFFFFFF , n_words - 128 );
267
281
__m512i a1_vec = _mm512_loadu_epi8 (a );
268
282
__m512i b1_vec = _mm512_loadu_epi8 (b );
@@ -276,10 +290,10 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c
276
290
__m512i or2_count_vec = _mm512_popcnt_epi64 (_mm512_or_si512 (a2_vec , b2_vec ));
277
291
__m512i and3_count_vec = _mm512_popcnt_epi64 (_mm512_and_si512 (a3_vec , b3_vec ));
278
292
__m512i or3_count_vec = _mm512_popcnt_epi64 (_mm512_or_si512 (a3_vec , b3_vec ));
279
- intersection =
280
- _mm512_reduce_add_epi64 ( _mm512_add_epi64 (and3_count_vec , _mm512_add_epi64 (and2_count_vec , and1_count_vec )));
281
- union_ =
282
- _mm512_reduce_add_epi64 ( _mm512_add_epi64 (or3_count_vec , _mm512_add_epi64 (or2_count_vec , or1_count_vec )));
293
+ intersection = _mm512_reduce_add_epi64 ( //
294
+ _mm512_add_epi64 (and3_count_vec , _mm512_add_epi64 (and2_count_vec , and1_count_vec )));
295
+ union_ = _mm512_reduce_add_epi64 ( //
296
+ _mm512_add_epi64 (or3_count_vec , _mm512_add_epi64 (or2_count_vec , or1_count_vec )));
283
297
} else if (n_words <= 256 ) { // Up to 2048 bits.
284
298
__mmask64 mask = (__mmask64 )_bzhi_u64 (0xFFFFFFFFFFFFFFFF , n_words - 192 );
285
299
__m512i a1_vec = _mm512_loadu_epi8 (a );
0 commit comments