
Add: Separate Skylake-X & Ice Lake checksums
ashvardanian committed Feb 13, 2025
1 parent 0a3e363 commit 554f50d
Showing 3 changed files with 164 additions and 32 deletions.
1 change: 1 addition & 0 deletions c/lib.c
@@ -259,6 +259,7 @@ SZ_DYNAMIC void sz_dispatch_table_init(void) {
impl->rfind = sz_rfind_skylake;
impl->find_byte = sz_find_byte_skylake;
impl->rfind_byte = sz_rfind_byte_skylake;
impl->checksum = sz_checksum_skylake;
}
#endif

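For context, the dynamic `sz_checksum` entry point further down in `hash.h` resolves through this dispatch table at runtime, so calling code stays the same whichever backend gets picked. A minimal usage sketch, assuming the umbrella `stringzilla/stringzilla.h` header exposes the dynamic API:

#include <stdio.h>
#include <stringzilla/stringzilla.h> // assumed umbrella header

int main(void) {
    char const text[] = "StringZilla";
    // Resolves to sz_checksum_skylake, sz_checksum_ice, etc., depending on detected CPU features.
    sz_u64_t sum = sz_checksum(text, sizeof(text) - 1);
    printf("byte sum = %llu\n", (unsigned long long)sum);
    return 0;
}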
185 changes: 156 additions & 29 deletions include/stringzilla/hash.h
@@ -5,15 +5,9 @@
*
* Includes core APIs:
*
* - `sz_checksum` - for byte-level checksums.
* - `sz_checksum` - for byte-level 64-bit unsigned checksums.
* - `sz_hash` - for 64-bit single-shot hashing.
* - `sz_hashes` - producing the rolling hashes of a string.
* - `sz_generate` - populating buffers with random data.
*
* Convenience functions for character-set matching:
*
* - `sz_hashes_fingerprint`
* - `sz_hashes_intersection`
*/
#ifndef STRINGZILLA_HASH_H_
#define STRINGZILLA_HASH_H_
@@ -334,23 +328,124 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) {
#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2")
#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function)

SZ_PUBLIC sz_u64_t sz_checksum_skylake(sz_cptr_t text, sz_size_t length) {
// The naive implementation of this function is simple: it assumes the CPU handles
// unaligned loads efficiently.
//
// A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core,
// 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache.
// We consider the workload huge if the buffer exceeds 1 MB in length.
int const is_huge = length >= 1ull * 1024ull * 1024ull;
sz_u512_vec_t text_vec, sums_vec;

// When the buffer is small, there isn't much to innovate.
// Separately handling even smaller payloads doesn't increase performance even on synthetic benchmarks.
if (length <= 16) {
__mmask16 mask = _sz_u16_mask_until(length);
text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text);
sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128());
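// Note: `_mm_sad_epu8` against a zero vector sums the absolute values of each 8-byte half
// into the low bits of the corresponding 64-bit lane. For example, the sixteen bytes
// {1, 2, ..., 16} produce the two 64-bit lanes {36, 100}, which we add up below.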
sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]);
sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1);
return low + high;
}
else if (length <= 32) {
__mmask32 mask = _sz_u32_mask_until(length);
text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text);
sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256());
// Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
__m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]);
__m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1);
__m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm);
sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm);
sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1);
return low + high;
}
else if (length <= 64) {
__mmask64 mask = _sz_u64_mask_until(length);
text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text);
sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
return _mm512_reduce_add_epi64(sums_vec.zmm);
}
// For larger buffers that still fit into the CPU caches, we can do better than unaligned loads:
// process the unaligned head and tail with masked loads and stream through the aligned body.
//
// Traversing the body from both ends can also help when fetching from multiple memory pages
// and with cache set-associativity issues, and generally adds about 10% to such algorithms,
// but this variant keeps a single forward pass and reserves the bidirectional walk for the
// huge-buffer path below.
else if (!is_huge) {
sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less.
sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less.
sz_size_t body_length = length - head_length - tail_length; // Multiple of 64.
_sz_assert(body_length % 64 == 0 && head_length < 64 && tail_length < 64);
__mmask64 head_mask = _sz_u64_mask_until(head_length);
__mmask64 tail_mask = _sz_u64_mask_until(tail_length);
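// Example: for `text` at an address with (address % 64) == 16 and `length` == 200, we get
// head_length == 48, tail_length == 24, and body_length == 128, i.e. two full ZMM loads.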

text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
for (text += head_length; body_length >= 64; text += 64, body_length -= 64) {
text_vec.zmm = _mm512_load_si512((__m512i const *)text);
sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
}
text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text);
sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
return _mm512_reduce_add_epi64(sums_vec.zmm);
}
// For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
//
// 1. Using non-temporal loads to avoid polluting the cache.
// 2. Prefetching the next cache line to avoid stalling the CPU; this is generally useless
//    for predictable access patterns, so we don't rely on it here.
//
// Bidirectional traversal generally adds about 10% to such algorithms.
else {
sz_u512_vec_t text_reversed_vec, sums_reversed_vec;
sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64;
sz_size_t tail_length = (sz_size_t)(text + length) % 64;
sz_size_t body_length = length - head_length - tail_length;
__mmask64 head_mask = _sz_u64_mask_until(head_length);
__mmask64 tail_mask = _sz_u64_mask_until(tail_length);

text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512());

// Now in the main loop, we can use non-temporal loads, performing the operation in both directions.
for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64));
sums_reversed_vec.zmm =
_mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()));
}
if (body_length >= 64) {
text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
}

return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm));
}
}
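// Every branch above computes the same value as a plain serial byte-sum; a minimal reference
// for cross-checking the result could look like this:
//
//     sz_u64_t checksum_reference(sz_cptr_t text, sz_size_t length) {
//         sz_u64_t sum = 0;
//         for (sz_size_t i = 0; i != length; ++i) sum += (unsigned char)text[i];
//         return sum;
//     }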

#pragma clang attribute pop
#pragma GCC pop_options
#endif // SZ_USE_SKYLAKE
#pragma endregion // Skylake Implementation

/* AVX512 implementation of the hashing & checksum algorithms for Ice Lake and newer CPUs.
* Includes extensions:
* - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW,
* - 2018 CannonLake: IFMA, VBMI,
* - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES.
*/
#pragma region Ice Lake Implementation
#if SZ_USE_ICE
#pragma GCC push_options
#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2")
#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \
apply_to = function)
#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "avx512vnni", "bmi", "bmi2")
#pragma clang attribute push( \
__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vnni,bmi,bmi2"))), \
apply_to = function)
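// Note: `avx512vnni` is newly added to the target attributes above, as the non-huge path
// below relies on the VNNI `VPDPBUSDS` instruction via `_mm512_dpbusds_epi32`.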

SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
// The naive implementation of this function is very simple.
@@ -363,6 +458,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
sz_u512_vec_t text_vec, sums_vec;

// When the buffer is small, there isn't much to innovate.
// Separately handling even smaller payloads doesn't increase performance even on synthetic benchmarks.
if (length <= 16) {
__mmask16 mask = _sz_u16_mask_until(length);
text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text);
@@ -375,7 +471,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
__mmask32 mask = _sz_u32_mask_until(length);
text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text);
sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256());
// Accumulating 256 bits is harders, as we need to extract the 128-bit sums first.
// Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
__m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]);
__m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1);
__m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm);
@@ -389,30 +485,60 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
return _mm512_reduce_add_epi64(sums_vec.zmm);
}
// For larger buffers that still fit into the CPU caches, there are other tricks we can use.
//
// 1. Moving in both directions to maximize the throughput when fetching from multiple
//    memory pages. This also helps with cache set-associativity issues, as we won't always
//    be hitting the same cache sets.
// 2. Port-level parallelism can be used to hide the latency of expensive SIMD instructions.
//    - `VPSADBW (ZMM, ZMM, ZMM)` in combination with `VPADDQ (ZMM, ZMM, ZMM)`:
// - On Ice Lake, the `VPSADBW` is 3 cycles on port 5; the `VPADDQ` is 1 cycle on ports 0/5.
// - On Zen 4, the `VPSADBW` is 3 cycles on ports 0/1; the `VPADDQ` is 1 cycle on ports 0/1/2/3.
// - `VPDPBUSDS (ZMM, ZMM, ZMM)`:
// - On Ice Lake, the `VPDPBUSDS` is 5 cycles on port 0.
// - On Zen 4, the `VPDPBUSDS` is 4 cycles on ports 0/1.
//
// Bidirectional traversal generally adds about 10% to such algorithms.
else if (!is_huge) {
sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64; // 63 or less.
sz_size_t tail_length = (sz_size_t)(text + length) % 64; // 63 or less.
sz_size_t body_length = length - head_length - tail_length; // Multiple of 64.
_sz_assert(body_length % 64 == 0 && head_length < 64 && tail_length < 64);
__mmask64 head_mask = _sz_u64_mask_until(head_length);
__mmask64 tail_mask = _sz_u64_mask_until(tail_length);

sz_u512_vec_t zeros_vec, ones_vec;
zeros_vec.zmm = _mm512_setzero_si512();
ones_vec.zmm = _mm512_set1_epi8(1);

// Take care of the unaligned head and tail!
sz_u512_vec_t text_reversed_vec, sums_reversed_vec;
text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
for (text += head_length; body_length >= 64; text += 64, body_length -= 64) {
text_vec.zmm = _mm512_load_si512((__m512i const *)text);
sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, zeros_vec.zmm);
text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
sums_reversed_vec.zmm = _mm512_dpbusds_epi32(zeros_vec.zmm, text_reversed_vec.zmm, ones_vec.zmm);
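// `_mm512_dpbusds_epi32(acc, a, b)` multiplies unsigned bytes of `a` by signed bytes of `b`
// and accumulates each group of four products into a 32-bit lane with signed saturation.
// With `b` set to all ones it acts as a byte-sum: a lane holding bytes {1, 2, 3, 4} adds 10.
// Each lane grows by at most 4 x 255 per step, so for the < 1 MB inputs taken by this branch
// the 32-bit accumulators cannot overflow before the final `_mm512_reduce_add_epi32`.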

// Now in the main loop, we can use aligned loads, performing the operation in both directions.
for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
text_reversed_vec.zmm = _mm512_load_si512((__m512i *)(text + body_length - 64));
sums_reversed_vec.zmm = _mm512_dpbusds_epi32(sums_reversed_vec.zmm, text_reversed_vec.zmm, ones_vec.zmm);
text_vec.zmm = _mm512_load_si512((__m512i *)(text));
sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, zeros_vec.zmm));
}
text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text);
sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
return _mm512_reduce_add_epi64(sums_vec.zmm);
// There may be an aligned chunk of 64 bytes left.
if (body_length >= 64) {
_sz_assert(body_length == 64);
text_vec.zmm = _mm512_load_si512((__m512i *)(text));
sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, zeros_vec.zmm));
}

return _mm512_reduce_add_epi64(sums_vec.zmm) + _mm512_reduce_add_epi32(sums_reversed_vec.zmm);
}
// For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
//
// 1. Moving in both directions to maximize the throughput, when fetching from multiple
// memory pages. Also helps with cache set-associativity issues, as we won't always
// be fetching the same entries in the lookup table.
// 2. Using non-temporal stores to avoid polluting the cache.
// 3. Prefetching the next cache line, to avoid stalling the CPU. This generally useless
// for predictable patterns, so disregard this advice.
// 1. Using non-temporal loads to avoid polluting the cache.
// 2. Prefetching the next cache line to avoid stalling the CPU; this is generally useless
//    for predictable access patterns, so we don't rely on it here.
//
// Bidirectional traversal generally adds about 10% to such algorithms.
else {
@@ -428,8 +554,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512());

// Now in the main loop, we can use non-temporal loads and stores,
// performing the operation in both directions.
// Now in the main loop, we can use non-temporal loads, performing the operation in both directions.
for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
@@ -506,6 +631,8 @@ SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) {
SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) {
#if SZ_USE_ICE
return sz_checksum_ice(text, length);
#elif SZ_USE_SKYLAKE
return sz_checksum_skylake(text, length);
#elif SZ_USE_HASWELL
return sz_checksum_haswell(text, length);
#elif SZ_USE_NEON
10 changes: 7 additions & 3 deletions scripts/bench_token.cpp
@@ -21,14 +21,18 @@ tracked_unary_functions_t checksum_functions() {
return std::accumulate(s.begin(), s.end(), (std::size_t)0,
[](std::size_t sum, char c) { return sum + static_cast<unsigned char>(c); });
}},
{"sz_checksum_serial", wrap_sz(sz_checksum_serial), true},
{"sz_checksum_serial", wrap_sz(sz_checksum_serial), false},
#if SZ_USE_HASWELL
{"sz_checksum_haswell", wrap_sz(sz_checksum_haswell), true},
{"sz_checksum_haswell", wrap_sz(sz_checksum_haswell), false},
#endif
#if SZ_USE_SKYLAKE
{"sz_checksum_skylake", wrap_sz(sz_checksum_skylake), false},
#endif
#if SZ_USE_ICE
{"sz_checksum_ice", wrap_sz(sz_checksum_ice), false},
#endif
#if SZ_USE_NEON
{"sz_checksum_neon", wrap_sz(sz_checksum_neon), true},
{"sz_checksum_neon", wrap_sz(sz_checksum_neon), false},
#endif
};
return result;

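Since every backend must agree with `sz_checksum_serial`, a quick consistency check of the new Skylake-X and Ice Lake kernels against the serial baseline might look like the sketch below; the buffer contents are arbitrary, and lengths around the 16-, 32-, 64-byte and 1 MB thresholds are the interesting cases. The `stringzilla/stringzilla.h` include is an assumption here.

#include <assert.h>
#include <stdlib.h>
#include <stringzilla/stringzilla.h> // assumed umbrella header

static void check_checksum_backends(sz_size_t length) {
    char *buffer = (char *)malloc(length);
    if (!buffer) return;
    for (sz_size_t i = 0; i != length; ++i) buffer[i] = (char)(rand() & 0xFF);
    sz_u64_t expected = sz_checksum_serial(buffer, length);
    sz_u64_t received = sz_checksum(buffer, length); // dispatches to the widest available backend
    assert(expected == received);
    free(buffer);
}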