Skip to content

Commit 98fbf5d

Browse files
committed
Docs: Harley Seal optimization ideas
Relates to #138
1 parent cfc0c44 commit 98fbf5d

File tree

2 files changed

+29
-4
lines changed

2 files changed

+29
-4
lines changed

include/simsimd/binary.h

Lines changed: 18 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -263,6 +263,20 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c
263263
intersection = _mm512_reduce_add_epi64(_mm512_add_epi64(and2_count_vec, and1_count_vec));
264264
union_ = _mm512_reduce_add_epi64(_mm512_add_epi64(or2_count_vec, or1_count_vec));
265265
} else if (n_words <= 196) { // Up to 1568 bits.
266+
// TODO: On such vectors we can clearly see that the CPU struggles to perform this many parallel
267+
// population counts, because the throughput of Jaccard and Hamming in this case starts to differ.
268+
// One optimization, aside from Harley-Seal transforms can be using "shuffles" for nibble-popcount
269+
// lookups, to utilize other ports on the CPU.
270+
// https://github.com/ashvardanian/SimSIMD/pull/138
271+
//
272+
// On Ice Lake:
273+
// - `VPOPCNTQ (ZMM, K, ZMM)` can only execute on port 5, which is a bottleneck.
274+
// - `VPSHUFB (ZMM, ZMM, ZMM)` can only run on the same port 5 as well!
275+
// On Zen4:
276+
// - `VPOPCNTQ (ZMM, K, ZMM)` can run on ports: 0, 1.
277+
// - `VPSHUFB (ZMM, ZMM, ZMM)` can run on ports: 1, 2.
278+
// https://uops.info/table.html?search=VPOPCNTQ%20(ZMM%2C%20K%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on
279+
// https://uops.info/table.html?search=VPSHUFB%20(ZMM%2C%20ZMM%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_TGL=on&cb_measurements=on&cb_doc=on&cb_avx512=on
266280
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 128);
267281
__m512i a1_vec = _mm512_loadu_epi8(a);
268282
__m512i b1_vec = _mm512_loadu_epi8(b);
@@ -276,10 +290,10 @@ SIMSIMD_PUBLIC void simsimd_jaccard_b8_ice(simsimd_b8_t const* a, simsimd_b8_t c
276290
__m512i or2_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a2_vec, b2_vec));
277291
__m512i and3_count_vec = _mm512_popcnt_epi64(_mm512_and_si512(a3_vec, b3_vec));
278292
__m512i or3_count_vec = _mm512_popcnt_epi64(_mm512_or_si512(a3_vec, b3_vec));
279-
intersection =
280-
_mm512_reduce_add_epi64(_mm512_add_epi64(and3_count_vec, _mm512_add_epi64(and2_count_vec, and1_count_vec)));
281-
union_ =
282-
_mm512_reduce_add_epi64(_mm512_add_epi64(or3_count_vec, _mm512_add_epi64(or2_count_vec, or1_count_vec)));
293+
intersection = _mm512_reduce_add_epi64( //
294+
_mm512_add_epi64(and3_count_vec, _mm512_add_epi64(and2_count_vec, and1_count_vec)));
295+
union_ = _mm512_reduce_add_epi64( //
296+
_mm512_add_epi64(or3_count_vec, _mm512_add_epi64(or2_count_vec, or1_count_vec)));
283297
} else if (n_words <= 256) { // Up to 2048 bits.
284298
__mmask64 mask = (__mmask64)_bzhi_u64(0xFFFFFFFFFFFFFFFF, n_words - 192);
285299
__m512i a1_vec = _mm512_loadu_epi8(a);

include/simsimd/spatial.h

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1707,6 +1707,17 @@ SIMSIMD_PUBLIC void simsimd_cos_i8_ice(simsimd_i8_t const* a, simsimd_i8_t const
17071707
// as it's asymmetric with respect to the sign of the input arguments:
17081708
// Signed(ZeroExtend16(a.byte[4*j]) * SignExtend16(b.byte[4*j]))
17091709
// Luckily to compute the squares, we just drop the sign bit of the second argument.
1710+
//
1711+
// On Ice Lake:
1712+
// - `VPDPBUSDS (ZMM, ZMM, ZMM)` can only execute on port 0, with 5 cycle latency.
1713+
// - `VPDPWSSDS (ZMM, ZMM, ZMM)` can also only execute on port 0, with 5 cycle latency.
1714+
//
1715+
// On Zen4 Genoa:
1716+
// - `VPDPBUSDS (ZMM, ZMM, ZMM)` can execute on ports 0 and 1, with 4 cycle latency.
1717+
// - `VPDPWSSDS (ZMM, ZMM, ZMM)` can also execute on ports 0 and 1, with 4 cycle latency.
1718+
//
1719+
// https://uops.info/table.html?search=VPDPBUSDS%20(ZMM%2C%20ZMM%2C%20ZMM)&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_ZEN4=on&cb_measurements=on&cb_doc=on&cb_avx512=on
1720+
// https://uops.info/table.html?search=vpdpwssds%20(ZMM&cb_lat=on&cb_tp=on&cb_uops=on&cb_ports=on&cb_SKX=on&cb_ICL=on&cb_ZEN4=on&cb_measurements=on&cb_doc=on&cb_avx512=on
17101721
a_i8_abs_vec = _mm512_abs_epi8(a_i8_vec);
17111722
b_i8_abs_vec = _mm512_abs_epi8(b_i8_vec);
17121723
a2_i32_vec = _mm512_dpbusds_epi32(a2_i32_vec, a_i8_abs_vec, a_i8_abs_vec);

0 commit comments

Comments
 (0)