
Commit 554f50d

Add: Separate Skylake-X & Ice Lake checksums
1 parent 0a3e363 commit 554f50d

File tree

3 files changed: 164 additions & 32 deletions


c/lib.c

Lines changed: 1 addition & 0 deletions
@@ -259,6 +259,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) {
         impl->rfind = sz_rfind_skylake;
         impl->find_byte = sz_find_byte_skylake;
         impl->rfind_byte = sz_rfind_byte_skylake;
+        impl->checksum = sz_checksum_skylake;
     }
 #endif
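The hunk above registers the new Skylake kernel in StringZilla's dynamic dispatch table. As a rough illustration of the pattern (not the library's actual types), a dispatch table is just a struct of function pointers filled once at startup with the best kernel the running CPU reports support for; the struct and helpers below are hypothetical stand-ins.

/* Hypothetical sketch of the dispatch pattern this hunk extends; the real `impl`
 * and `sz_dispatch_table_init` live in StringZilla's c/lib.c. */
#include <stddef.h>

typedef unsigned long long example_u64_t;

typedef struct {
    example_u64_t (*checksum)(char const *text, size_t length);
    /* ... find, rfind, find_byte, rfind_byte, and the other kernels ... */
} example_dispatch_table_t;

extern example_u64_t example_checksum_serial(char const *text, size_t length);
extern example_u64_t example_checksum_skylake(char const *text, size_t length);

static void example_dispatch_init(example_dispatch_table_t *impl, int has_avx512bw) {
    impl->checksum = example_checksum_serial;                     // portable fallback
    if (has_avx512bw) impl->checksum = example_checksum_skylake;  // upgrade when the CPU allows it
}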

include/stringzilla/hash.h

Lines changed: 156 additions & 29 deletions
@@ -5,15 +5,9 @@
  *
  * Includes core APIs:
  *
- * - `sz_checksum` - for byte-level checksums.
+ * - `sz_checksum` - for byte-level 64-bit unsigned checksums.
  * - `sz_hash` - for 64-bit single-shot hashing.
- * - `sz_hashes` - producing the rolling hashes of a string.
  * - `sz_generate` - populating buffers with random data.
- *
- * Convenience functions for character-set matching:
- *
- * - `sz_hashes_fingerprint`
- * - `sz_hashes_intersection`
  */
 #ifndef STRINGZILLA_HASH_H_
 #define STRINGZILLA_HASH_H_
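The doc-comment above now pins down the contract: `sz_checksum` is a byte-level, 64-bit unsigned sum. A minimal scalar sketch of that contract, written against the `sz_u64_t`/`sz_cptr_t`/`sz_size_t` typedefs the header already uses (the helper name is illustrative), is:

/* Scalar reference: every SIMD kernel below must return exactly this value. */
static sz_u64_t checksum_reference(sz_cptr_t text, sz_size_t length) {
    sz_u64_t sum = 0;
    for (sz_size_t i = 0; i != length; ++i) sum += (unsigned char)text[i];
    return sum;
}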
@@ -334,23 +328,124 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) {
 #pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2")
 #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function)
 
+SZ_PUBLIC sz_u64_t sz_checksum_skylake(sz_cptr_t text, sz_size_t length) {
+    // The naive implementation of this function is very simple.
+    // It assumes the CPU is great at handling unaligned "loads".
+    //
+    // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core,
+    // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache.
+    // With two strings, we may consider the overall workload huge, if each exceeds 1 MB in length.
+    int const is_huge = length >= 1ull * 1024ull * 1024ull;
+    sz_u512_vec_t text_vec, sums_vec;
+
+    // When the buffer is small, there isn't much to innovate.
+    // Separately handling even smaller payloads doesn't increase performance even on synthetic benchmarks.
+    if (length <= 16) {
+        __mmask16 mask = _sz_u16_mask_until(length);
+        text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text);
+        sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128());
+        sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]);
+        sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1);
+        return low + high;
+    }
+    else if (length <= 32) {
+        __mmask32 mask = _sz_u32_mask_until(length);
+        text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text);
+        sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256());
+        // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
+        __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]);
+        __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1);
+        __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm);
+        sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm);
+        sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1);
+        return low + high;
+    }
+    else if (length <= 64) {
+        __mmask64 mask = _sz_u64_mask_until(length);
+        text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text);
+        sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
+        return _mm512_reduce_add_epi64(sums_vec.zmm);
+    }
+    // For large buffers, fitting into L1 cache sizes, there are other tricks we can use.
+    //
+    // 1. Moving in both directions to maximize the throughput, when fetching from multiple
+    //    memory pages. Also helps with cache set-associativity issues, as we won't always
+    //    be fetching the same buckets in the lookup table.
+    //
+    // Bidirectional traversal generally adds about 10% to such algorithms.
+    else if (!is_huge) {
+        sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less.
+        sz_size_t tail_length = (sz_size_t)(text + length) % 64;    // 63 or less.
+        sz_size_t body_length = length - head_length - tail_length; // Multiple of 64.
+        _sz_assert(body_length % 64 == 0 && head_length < 64 && tail_length < 64);
+        __mmask64 head_mask = _sz_u64_mask_until(head_length);
+        __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
+
+        text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
+        sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
+        for (text += head_length; body_length >= 64; text += 64, body_length -= 64) {
+            text_vec.zmm = _mm512_load_si512((__m512i const *)text);
+            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+        }
+        text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text);
+        sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+        return _mm512_reduce_add_epi64(sums_vec.zmm);
+    }
+    // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
+    //
+    // 1. Using non-temporal loads to avoid polluting the cache.
+    // 2. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless
+    //    for predictable patterns, so disregard this advice.
+    //
+    // Bidirectional traversal generally adds about 10% to such algorithms.
+    else {
+        sz_u512_vec_t text_reversed_vec, sums_reversed_vec;
+        sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64;
+        sz_size_t tail_length = (sz_size_t)(text + length) % 64;
+        sz_size_t body_length = length - head_length - tail_length;
+        __mmask64 head_mask = _sz_u64_mask_until(head_length);
+        __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
+
+        text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
+        sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
+        text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
+        sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512());
+
+        // Now in the main loop, we can use non-temporal loads, performing the operation in both directions.
+        for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
+            text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
+            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+            text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64));
+            sums_reversed_vec.zmm =
+                _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()));
+        }
+        if (body_length >= 64) {
+            text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
+            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+        }
+
+        return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm));
+    }
+}
+
 #pragma clang attribute pop
 #pragma GCC pop_options
 #endif // SZ_USE_SKYLAKE
 #pragma endregion // Skylake Implementation
 
 /* AVX512 implementation of the string search algorithms for Ice Lake and newer CPUs.
  * Includes extensions:
- *  - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW,
- *  - 2018 CannonLake: IFMA, VBMI,
- *  - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES.
+ * - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW,
+ * - 2018 CannonLake: IFMA, VBMI,
+ * - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES.
  */
 #pragma region Ice Lake Implementation
 #if SZ_USE_ICE
 #pragma GCC push_options
-#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2")
-#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \
-                             apply_to = function)
+#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "avx512vnni", "bmi", "bmi2")
+#pragma clang attribute push( \
+    __attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vnni,bmi,bmi2"))), \
+    apply_to = function)
 
 SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
     // The naive implementation of this function is very simple.
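The small-length branches of the new `sz_checksum_skylake` above all follow the same pattern: a masked load so that out-of-bounds bytes read as zero, a `VPSADBW` (`_mm*_sad_epu8`) to collapse groups of eight bytes into 64-bit partial sums, and one horizontal reduction. A self-contained sketch of that idea for up to 64 bytes (the helper name and hand-rolled mask are illustrative; the real code uses `_sz_u64_mask_until`, and this needs AVX-512F/BW):

#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

static uint64_t sum_up_to_64_bytes(char const *text, size_t length) {
    __mmask64 mask = length >= 64 ? ~(__mmask64)0 : (((__mmask64)1 << length) - 1);
    __m512i bytes = _mm512_maskz_loadu_epi8(mask, text);            // masked-off lanes read as zero
    __m512i sums = _mm512_sad_epu8(bytes, _mm512_setzero_si512());  // eight 64-bit partial sums
    return (uint64_t)_mm512_reduce_add_epi64(sums);                 // horizontal add of the lanes
}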
@@ -363,6 +458,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
     sz_u512_vec_t text_vec, sums_vec;
 
     // When the buffer is small, there isn't much to innovate.
+    // Separately handling even smaller payloads doesn't increase performance even on synthetic benchmarks.
     if (length <= 16) {
         __mmask16 mask = _sz_u16_mask_until(length);
         text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text);

@@ -375,7 +471,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
         __mmask32 mask = _sz_u32_mask_until(length);
         text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text);
         sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256());
-        // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first.
+        // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
         __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]);
         __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1);
         __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm);

@@ -389,30 +485,60 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
         sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
         return _mm512_reduce_add_epi64(sums_vec.zmm);
     }
+    // For large buffers, fitting into L1 cache sizes, there are other tricks we can use.
+    //
+    // 1. Moving in both directions to maximize the throughput, when fetching from multiple
+    //    memory pages. Also helps with cache set-associativity issues, as we won't always
+    //    be fetching the same buckets in the lookup table.
+    // 2. Port-level parallelism, can be used to hide the latency of expensive SIMD instructions.
+    //    - `VPSADBW (ZMM, ZMM, ZMM)` combination with `VPADDQ (ZMM, ZMM, ZMM)`:
+    //      - On Ice Lake, the `VPSADBW` is 3 cycles on port 5; the `VPADDQ` is 1 cycle on ports 0/5.
+    //      - On Zen 4, the `VPSADBW` is 3 cycles on ports 0/1; the `VPADDQ` is 1 cycle on ports 0/1/2/3.
+    //    - `VPDPBUSDS (ZMM, ZMM, ZMM)`:
+    //      - On Ice Lake, the `VPDPBUSDS` is 5 cycles on port 0.
+    //      - On Zen 4, the `VPDPBUSDS` is 4 cycles on ports 0/1.
+    //
+    // Bidirectional traversal generally adds about 10% to such algorithms.
     else if (!is_huge) {
         sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less.
         sz_size_t tail_length = (sz_size_t)(text + length) % 64;    // 63 or less.
         sz_size_t body_length = length - head_length - tail_length; // Multiple of 64.
+        _sz_assert(body_length % 64 == 0 && head_length < 64 && tail_length < 64);
         __mmask64 head_mask = _sz_u64_mask_until(head_length);
         __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
+
+        sz_u512_vec_t zeros_vec, ones_vec;
+        zeros_vec.zmm = _mm512_setzero_si512();
+        ones_vec.zmm = _mm512_set1_epi8(1);
+
+        // Take care of the unaligned head and tail!
+        sz_u512_vec_t text_reversed_vec, sums_reversed_vec;
         text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
-        sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
-        for (text += head_length; body_length >= 64; text += 64, body_length -= 64) {
-            text_vec.zmm = _mm512_load_si512((__m512i const *)text);
-            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+        sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, zeros_vec.zmm);
+        text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
+        sums_reversed_vec.zmm = _mm512_dpbusds_epi32(zeros_vec.zmm, text_reversed_vec.zmm, ones_vec.zmm);
+
+        // Now in the main loop, we can use aligned loads, performing the operation in both directions.
+        for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
+            text_reversed_vec.zmm = _mm512_load_si512((__m512i *)(text + body_length - 64));
+            sums_reversed_vec.zmm = _mm512_dpbusds_epi32(sums_reversed_vec.zmm, text_reversed_vec.zmm, ones_vec.zmm);
+            text_vec.zmm = _mm512_load_si512((__m512i *)(text));
+            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, zeros_vec.zmm));
         }
-        text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text);
-        sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
-        return _mm512_reduce_add_epi64(sums_vec.zmm);
+        // There may be an aligned chunk of 64 bytes left.
+        if (body_length >= 64) {
+            _sz_assert(body_length == 64);
+            text_vec.zmm = _mm512_load_si512((__m512i *)(text));
+            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, zeros_vec.zmm));
+        }
+
+        return _mm512_reduce_add_epi64(sums_vec.zmm) + _mm512_reduce_add_epi32(sums_reversed_vec.zmm);
     }
     // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
     //
-    // 1. Moving in both directions to maximize the throughput, when fetching from multiple
-    //    memory pages. Also helps with cache set-associativity issues, as we won't always
-    //    be fetching the same entries in the lookup table.
-    // 2. Using non-temporal stores to avoid polluting the cache.
-    // 3. Prefetching the next cache line, to avoid stalling the CPU. This generally useless
-    //    for predictable patterns, so disregard this advice.
+    // 1. Using non-temporal loads to avoid polluting the cache.
+    // 2. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless
+    //    for predictable patterns, so disregard this advice.
     //
     // Bidirectional traversal generally adds about 10% to such algorithms.
     else {

@@ -428,8 +554,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
         text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
         sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512());
 
-        // Now in the main loop, we can use non-temporal loads and stores,
-        // performing the operation in both directions.
+        // Now in the main loop, we can use non-temporal loads, performing the operation in both directions.
         for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
             text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
             sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));

@@ -506,6 +631,8 @@ SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) {
 SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) {
 #if SZ_USE_ICE
     return sz_checksum_ice(text, length);
+#elif SZ_USE_SKYLAKE
+    return sz_checksum_skylake(text, length);
 #elif SZ_USE_HASWELL
     return sz_checksum_haswell(text, length);
 #elif SZ_USE_NEON
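The in-cache Ice Lake path above mixes two reductions so the forward and backward halves of the loop can retire on different execution ports: `VPSADBW` + `VPADDQ` in one direction and `VPDPBUSDS` (a dot product of unsigned bytes with a vector of ones, accumulating into 32-bit lanes) in the other. A self-contained sketch of the `VPDPBUSDS` trick on a single 64-byte block (the helper name is illustrative; needs AVX-512VNNI):

#include <immintrin.h>
#include <stdint.h>

static uint64_t sum_64_bytes_vnni(char const *text) {
    __m512i ones = _mm512_set1_epi8(1);
    __m512i bytes = _mm512_loadu_si512((__m512i const *)text);                // one full 64-byte load
    __m512i sums = _mm512_dpbusds_epi32(_mm512_setzero_si512(), bytes, ones); // sixteen 32-bit partial sums
    return (uint64_t)_mm512_reduce_add_epi32(sums);                           // 64 * 255 easily fits a 32-bit lane
}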

scripts/bench_token.cpp

Lines changed: 7 additions & 3 deletions
@@ -21,14 +21,18 @@ tracked_unary_functions_t checksum_functions() {
             return std::accumulate(s.begin(), s.end(), (std::size_t)0,
                                    [](std::size_t sum, char c) { return sum + static_cast<unsigned char>(c); });
         }},
-        {"sz_checksum_serial", wrap_sz(sz_checksum_serial), true},
+        {"sz_checksum_serial", wrap_sz(sz_checksum_serial), false},
 #if SZ_USE_HASWELL
-        {"sz_checksum_haswell", wrap_sz(sz_checksum_haswell), true},
+        {"sz_checksum_haswell", wrap_sz(sz_checksum_haswell), false},
+#endif
+#if SZ_USE_SKYLAKE
+        {"sz_checksum_skylake", wrap_sz(sz_checksum_skylake), false},
 #endif
 #if SZ_USE_ICE
+        {"sz_checksum_ice", wrap_sz(sz_checksum_ice), false},
 #endif
 #if SZ_USE_NEON
-        {"sz_checksum_neon", wrap_sz(sz_checksum_neon), true},
+        {"sz_checksum_neon", wrap_sz(sz_checksum_neon), false},
 #endif
     };
     return result;
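Whichever kernel the benchmark (or the dispatched `sz_checksum`) ends up calling, it should agree with the scalar baseline shown above. A hedged usage sketch, assuming the public `stringzilla/stringzilla.h` header exposes `sz_checksum`, `sz_u64_t`, and `sz_size_t`:

#include <assert.h>
#include <string.h>
#include <stringzilla/stringzilla.h>

static void check_checksum_agreement(char const *text) {
    sz_size_t length = (sz_size_t)strlen(text);
    sz_u64_t expected = 0;
    for (sz_size_t i = 0; i != length; ++i) expected += (unsigned char)text[i];
    assert(sz_checksum(text, length) == expected); // dispatched kernel vs. scalar reference
}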
