 *
 * Includes core APIs:
 *
- * - `sz_checksum` - for byte-level checksums.
+ * - `sz_checksum` - for byte-level 64-bit unsigned checksums.
 * - `sz_hash` - for 64-bit single-shot hashing.
- * - `sz_hashes` - producing the rolling hashes of a string.
 * - `sz_generate` - populating buffers with random data.
- *
- * Convenience functions for character-set matching:
- *
- * - `sz_hashes_fingerprint`
- * - `sz_hashes_intersection`
 */
#ifndef STRINGZILLA_HASH_H_
#define STRINGZILLA_HASH_H_
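Taken together, the public surface of this header stays small. A minimal usage sketch of the checksum entry point declared further down (`SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length)`), assuming `sz_cptr_t` is a `char const *` alias; the include path is an assumption and may differ in your build:

#include <stdio.h>
#include <string.h>

#include <stringzilla/hash.h> // Include path assumed; adjust to the local layout.

int main(void) {
    char const *text = "StringZilla";
    // Dispatches to the widest backend compiled in: Ice Lake, Skylake, Haswell, or NEON.
    sz_u64_t sum = sz_checksum(text, strlen(text));
    printf("byte checksum: %llu\n", (unsigned long long)sum);
    return 0;
}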
@@ -334,23 +328,124 @@ SZ_PUBLIC sz_u64_t sz_checksum_haswell(sz_cptr_t text, sz_size_t length) {
#pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "bmi", "bmi2")
#pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,bmi,bmi2"))), apply_to = function)

+ SZ_PUBLIC sz_u64_t sz_checksum_skylake(sz_cptr_t text, sz_size_t length) {
+     // The naive implementation of this function is very simple.
+     // It assumes the CPU is great at handling unaligned "loads".
+     //
+     // A typical AWS Sapphire Rapids instance can have 48 KB x 2 blocks of L1 data cache per core,
+     // 2 MB x 2 blocks of L2 cache per core, and one shared 60 MB buffer of L3 cache.
+     // With two strings, we may consider the overall workload huge, if each exceeds 1 MB in length.
+     int const is_huge = length >= 1ull * 1024ull * 1024ull;
+     sz_u512_vec_t text_vec, sums_vec;
+
+     // When the buffer is small, there isn't much to innovate.
+     // Separately handling even smaller payloads doesn't increase performance even on synthetic benchmarks.
+     if (length <= 16) {
+         __mmask16 mask = _sz_u16_mask_until(length);
+         text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text);
+         sums_vec.xmms[0] = _mm_sad_epu8(text_vec.xmms[0], _mm_setzero_si128());
+         sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_vec.xmms[0]);
+         sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_vec.xmms[0], 1);
+         return low + high;
+     }
+     else if (length <= 32) {
+         __mmask32 mask = _sz_u32_mask_until(length);
+         text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text);
+         sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256());
+         // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
+         __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]);
+         __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1);
+         __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm);
+         sz_u64_t low = (sz_u64_t)_mm_cvtsi128_si64(sums_xmm);
+         sz_u64_t high = (sz_u64_t)_mm_extract_epi64(sums_xmm, 1);
+         return low + high;
+     }
+     else if (length <= 64) {
+         __mmask64 mask = _sz_u64_mask_until(length);
+         text_vec.zmm = _mm512_maskz_loadu_epi8(mask, text);
+         sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
+         return _mm512_reduce_add_epi64(sums_vec.zmm);
+     }
+     // For large buffers, fitting into L1 cache sizes, there are other tricks we can use.
+     //
+     // 1. Moving in both directions to maximize the throughput, when fetching from multiple
+     //    memory pages. Also helps with cache set-associativity issues, as we won't always
+     //    be fetching the same buckets in the lookup table.
+     //
+     // Bidirectional traversal generally adds about 10% to such algorithms.
+     else if (!is_huge) {
+         sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64;  // 63 or less.
+         sz_size_t tail_length = (sz_size_t)(text + length) % 64;     // 63 or less.
+         sz_size_t body_length = length - head_length - tail_length;  // Multiple of 64.
+         _sz_assert(body_length % 64 == 0 && head_length < 64 && tail_length < 64);
+         __mmask64 head_mask = _sz_u64_mask_until(head_length);
+         __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
+
+         text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
+         sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
+         for (text += head_length; body_length >= 64; text += 64, body_length -= 64) {
+             text_vec.zmm = _mm512_load_si512((__m512i const *)text);
+             sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+         }
+         text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text);
+         sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+         return _mm512_reduce_add_epi64(sums_vec.zmm);
+     }
+     // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
+     //
+     // 1. Using non-temporal loads to avoid polluting the cache.
+     // 2. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless
+     //    for predictable patterns, so disregard this advice.
+     //
+     // Bidirectional traversal generally adds about 10% to such algorithms.
+     else {
+         sz_u512_vec_t text_reversed_vec, sums_reversed_vec;
+         sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64;
+         sz_size_t tail_length = (sz_size_t)(text + length) % 64;
+         sz_size_t body_length = length - head_length - tail_length;
+         __mmask64 head_mask = _sz_u64_mask_until(head_length);
+         __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
+
+         text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
+         sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
+         text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
+         sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512());
+
+         // Now in the main loop, we can use non-temporal loads, performing the operation in both directions.
+         for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
+             text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
+             sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+             text_reversed_vec.zmm = _mm512_stream_load_si512((__m512i *)(text + body_length - 64));
+             sums_reversed_vec.zmm =
+                 _mm512_add_epi64(sums_reversed_vec.zmm, _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512()));
+         }
+         if (body_length >= 64) {
+             text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
+             sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+         }
+
+         return _mm512_reduce_add_epi64(_mm512_add_epi64(sums_vec.zmm, sums_reversed_vec.zmm));
+     }
+ }
+
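Whatever the register width, every branch of the kernel above must agree with the serial definition of this checksum: the sum of all byte values accumulated into a 64-bit unsigned integer. A portable reference sketch (not part of this commit) that the SIMD paths can be checked against:

#include <stddef.h>
#include <stdint.h>

// Reference semantics for the `sz_checksum_*` kernels: add every byte, interpreted
// as an unsigned value, into a 64-bit accumulator. Overflow is practically
// impossible: even 2^56 bytes of 0xFF stay below 2^64.
static uint64_t checksum_serial(char const *text, size_t length) {
    uint64_t sum = 0;
    for (size_t i = 0; i != length; ++i) sum += (uint8_t)text[i];
    return sum;
}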
#pragma clang attribute pop
#pragma GCC pop_options
#endif // SZ_USE_SKYLAKE
#pragma endregion // Skylake Implementation
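Both the Skylake kernel above and the Ice Lake kernel below start from the same head/body/tail split: masked unaligned loads for the bytes before the first 64-byte boundary and after the last one, and aligned full-width loads for everything in between; the head and tail masks are then built with `_sz_u64_mask_until`, assumed here to set the lowest `n` bits. A standalone sketch of that arithmetic in plain C:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// Split a buffer into an unaligned head, a 64-byte-aligned body, and an
// unaligned tail - the same arithmetic both kernels above start with.
// Precondition: length > 64, which the small-length branches guarantee.
static void split_for_zmm(char const *text, size_t length, //
                          size_t *head, size_t *body, size_t *tail) {
    *head = (64 - ((uintptr_t)text % 64)) % 64; // Bytes before the first aligned block, 0..63.
    *tail = (uintptr_t)(text + length) % 64;    // Bytes after the last aligned block, 0..63.
    *body = length - *head - *tail;             // Multiple of 64: both of its endpoints are aligned.
    assert(*body % 64 == 0 && *head < 64 && *tail < 64);
}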

/* AVX512 implementation of the string search algorithms for Ice Lake and newer CPUs.
 * Includes extensions:
- * - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW,
- * - 2018 CannonLake: IFMA, VBMI,
- * - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES.
+ * - 2017 Skylake: F, CD, ER, PF, VL, DQ, BW,
+ * - 2018 CannonLake: IFMA, VBMI,
+ * - 2019 Ice Lake: VPOPCNTDQ, VNNI, VBMI2, BITALG, GFNI, VPCLMULQDQ, VAES.
 */
#pragma region Ice Lake Implementation
#if SZ_USE_ICE
#pragma GCC push_options
- #pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "bmi", "bmi2")
- #pragma clang attribute push(__attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,bmi,bmi2"))), \
-                              apply_to = function)
+ #pragma GCC target("avx", "avx512f", "avx512vl", "avx512bw", "avx512dq", "avx512vbmi", "avx512vnni", "bmi", "bmi2")
+ #pragma clang attribute push( \
+     __attribute__((target("avx,avx512f,avx512vl,avx512bw,avx512dq,avx512vbmi,avx512vnni,bmi,bmi2"))), \
+     apply_to = function)

SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
    // The naive implementation of this function is very simple.
@@ -363,6 +458,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
    sz_u512_vec_t text_vec, sums_vec;

    // When the buffer is small, there isn't much to innovate.
+     // Separately handling even smaller payloads doesn't increase performance even on synthetic benchmarks.
    if (length <= 16) {
        __mmask16 mask = _sz_u16_mask_until(length);
        text_vec.xmms[0] = _mm_maskz_loadu_epi8(mask, text);
@@ -375,7 +471,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
        __mmask32 mask = _sz_u32_mask_until(length);
        text_vec.ymms[0] = _mm256_maskz_loadu_epi8(mask, text);
        sums_vec.ymms[0] = _mm256_sad_epu8(text_vec.ymms[0], _mm256_setzero_si256());
-         // Accumulating 256 bits is harders, as we need to extract the 128-bit sums first.
+         // Accumulating 256 bits is harder, as we need to extract the 128-bit sums first.
        __m128i low_xmm = _mm256_castsi256_si128(sums_vec.ymms[0]);
        __m128i high_xmm = _mm256_extracti128_si256(sums_vec.ymms[0], 1);
        __m128i sums_xmm = _mm_add_epi64(low_xmm, high_xmm);
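The "extract the 128-bit sums first" step in this branch is the usual horizontal reduction for 256-bit registers, since there is no 256-bit counterpart of `_mm512_reduce_add_epi64`. Isolated from the kernel, the idiom looks roughly like this (AVX2 target assumed):

#include <immintrin.h>
#include <stdint.h>

// Horizontally add the four 64-bit lanes of a YMM register, e.g. the output of
// `_mm256_sad_epu8`, which leaves one partial byte-sum in each 64-bit lane.
static inline uint64_t reduce_add_epi64_ymm(__m256i sums) {
    __m128i low = _mm256_castsi256_si128(sums);       // Lanes 0 and 1.
    __m128i high = _mm256_extracti128_si256(sums, 1); // Lanes 2 and 3.
    __m128i pair = _mm_add_epi64(low, high);          // Two partial sums remain.
    return (uint64_t)_mm_cvtsi128_si64(pair) + (uint64_t)_mm_extract_epi64(pair, 1);
}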
@@ -389,30 +485,60 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
        sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
        return _mm512_reduce_add_epi64(sums_vec.zmm);
    }
+     // For large buffers, fitting into L1 cache sizes, there are other tricks we can use.
+     //
+     // 1. Moving in both directions to maximize the throughput, when fetching from multiple
+     //    memory pages. Also helps with cache set-associativity issues, as we won't always
+     //    be fetching the same buckets in the lookup table.
+     // 2. Port-level parallelism can be used to hide the latency of expensive SIMD instructions.
+     //    - `VPSADBW (ZMM, ZMM, ZMM)` in combination with `VPADDQ (ZMM, ZMM, ZMM)`:
+     //      - On Ice Lake, the `VPSADBW` is 3 cycles on port 5; the `VPADDQ` is 1 cycle on ports 0/5.
+     //      - On Zen 4, the `VPSADBW` is 3 cycles on ports 0/1; the `VPADDQ` is 1 cycle on ports 0/1/2/3.
+     //    - `VPDPBUSDS (ZMM, ZMM, ZMM)`:
+     //      - On Ice Lake, the `VPDPBUSDS` is 5 cycles on port 0.
+     //      - On Zen 4, the `VPDPBUSDS` is 4 cycles on ports 0/1.
+     //
+     // Bidirectional traversal generally adds about 10% to such algorithms.
    else if (!is_huge) {
        sz_size_t head_length = (64 - ((sz_size_t)text % 64)) % 64;  // 63 or less.
        sz_size_t tail_length = (sz_size_t)(text + length) % 64;     // 63 or less.
        sz_size_t body_length = length - head_length - tail_length;  // Multiple of 64.
+         _sz_assert(body_length % 64 == 0 && head_length < 64 && tail_length < 64);
        __mmask64 head_mask = _sz_u64_mask_until(head_length);
        __mmask64 tail_mask = _sz_u64_mask_until(tail_length);
+
+         sz_u512_vec_t zeros_vec, ones_vec;
+         zeros_vec.zmm = _mm512_setzero_si512();
+         ones_vec.zmm = _mm512_set1_epi8(1);
+
+         // Take care of the unaligned head and tail!
+         sz_u512_vec_t text_reversed_vec, sums_reversed_vec;
        text_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, text);
-         sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512());
-         for (text += head_length; body_length >= 64; text += 64, body_length -= 64) {
-             text_vec.zmm = _mm512_load_si512((__m512i const *)text);
-             sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
+         sums_vec.zmm = _mm512_sad_epu8(text_vec.zmm, zeros_vec.zmm);
+         text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
+         sums_reversed_vec.zmm = _mm512_dpbusds_epi32(zeros_vec.zmm, text_reversed_vec.zmm, ones_vec.zmm);
+
+         // Now in the main loop, we can use aligned loads, performing the operation in both directions.
+         for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
+             text_reversed_vec.zmm = _mm512_load_si512((__m512i *)(text + body_length - 64));
+             sums_reversed_vec.zmm = _mm512_dpbusds_epi32(sums_reversed_vec.zmm, text_reversed_vec.zmm, ones_vec.zmm);
+             text_vec.zmm = _mm512_load_si512((__m512i *)(text));
+             sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, zeros_vec.zmm));
        }
-         text_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text);
-         sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
-         return _mm512_reduce_add_epi64(sums_vec.zmm);
+         // There may be an aligned chunk of 64 bytes left.
+         if (body_length >= 64) {
+             _sz_assert(body_length == 64);
+             text_vec.zmm = _mm512_load_si512((__m512i *)(text));
+             sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, zeros_vec.zmm));
+         }
+
+         return _mm512_reduce_add_epi64(sums_vec.zmm) + _mm512_reduce_add_epi32(sums_reversed_vec.zmm);
    }
    // For gigantic buffers, exceeding typical L1 cache sizes, there are other tricks we can use.
    //
-     // 1. Moving in both directions to maximize the throughput, when fetching from multiple
-     //    memory pages. Also helps with cache set-associativity issues, as we won't always
-     //    be fetching the same entries in the lookup table.
-     // 2. Using non-temporal stores to avoid polluting the cache.
-     // 3. Prefetching the next cache line, to avoid stalling the CPU. This generally useless
-     //    for predictable patterns, so disregard this advice.
+     // 1. Using non-temporal loads to avoid polluting the cache.
+     // 2. Prefetching the next cache line, to avoid stalling the CPU. This is generally useless
+     //    for predictable patterns, so disregard this advice.
    //
    // Bidirectional traversal generally adds about 10% to such algorithms.
    else {
@@ -428,8 +554,7 @@ SZ_PUBLIC sz_u64_t sz_checksum_ice(sz_cptr_t text, sz_size_t length) {
        text_reversed_vec.zmm = _mm512_maskz_loadu_epi8(tail_mask, text + head_length + body_length);
        sums_reversed_vec.zmm = _mm512_sad_epu8(text_reversed_vec.zmm, _mm512_setzero_si512());

-         // Now in the main loop, we can use non-temporal loads and stores,
-         // performing the operation in both directions.
+         // Now in the main loop, we can use non-temporal loads, performing the operation in both directions.
        for (text += head_length; body_length >= 128; text += 64, body_length -= 128) {
            text_vec.zmm = _mm512_stream_load_si512((__m512i *)(text));
            sums_vec.zmm = _mm512_add_epi64(sums_vec.zmm, _mm512_sad_epu8(text_vec.zmm, _mm512_setzero_si512()));
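The port-level-parallelism notes earlier in `sz_checksum_ice` are the reason the non-huge path mixes two accumulation idioms: `VPSADBW` + `VPADDQ` keeps eight 64-bit partial sums, while `VPDPBUSDS` against an all-ones vector keeps sixteen 32-bit partial sums on a different execution port, so the forward and backward streams can overlap. A minimal sketch of just those two idioms over one aligned 64-byte block each (AVX-512BW and VNNI target assumed; 32-bit saturation is not a concern at the sub-megabyte sizes that path handles):

#include <immintrin.h>
#include <stdint.h>

// Two ways to fold 64 bytes into running sums, mirroring the kernel above.
// Both `block_a` and `block_b` must point to 64-byte-aligned data.
static inline void accumulate_two_ways(uint8_t const *block_a, uint8_t const *block_b, //
                                       __m512i *sums_sad, __m512i *sums_dp) {
    __m512i const zeros = _mm512_setzero_si512();
    __m512i const ones = _mm512_set1_epi8(1);

    // Idiom 1: VPSADBW + VPADDQ - eight 64-bit partial sums (ports 5 and 0/5 on Ice Lake).
    __m512i a = _mm512_load_si512((__m512i const *)block_a);
    *sums_sad = _mm512_add_epi64(*sums_sad, _mm512_sad_epu8(a, zeros));

    // Idiom 2: VPDPBUSDS against all-ones - sixteen 32-bit partial sums (port 0 on Ice Lake).
    __m512i b = _mm512_load_si512((__m512i const *)block_b);
    *sums_dp = _mm512_dpbusds_epi32(*sums_dp, b, ones);
}

// At the end, the two accumulators reduce with different lane widths, as in the kernel:
//     total = _mm512_reduce_add_epi64(sums_sad) + (uint64_t)_mm512_reduce_add_epi32(sums_dp);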
@@ -506,6 +631,8 @@ SZ_PUBLIC sz_u64_t sz_checksum_neon(sz_cptr_t text, sz_size_t length) {
SZ_DYNAMIC sz_u64_t sz_checksum(sz_cptr_t text, sz_size_t length) {
#if SZ_USE_ICE
    return sz_checksum_ice(text, length);
+ #elif SZ_USE_SKYLAKE
+     return sz_checksum_skylake(text, length);
#elif SZ_USE_HASWELL
    return sz_checksum_haswell(text, length);
#elif SZ_USE_NEON
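The dispatcher prefers the Ice Lake kernel, then the newly added Skylake one, then Haswell and NEON. Whichever backend wins, a quick consistency check against the serial sum catches the traversal mistakes bidirectional loops are prone to; a throwaway test sketch (not part of this commit), with buffer sizes chosen to hit the masked, single-block, leftover-block, and huge branches:

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#include <stringzilla/hash.h> // Include path assumed; adjust to the local layout.

int main(void) {
    // Sizes around the branch boundaries: masked-only, one ZMM block, 64-byte leftover, and >1 MB.
    size_t const sizes[] = {1, 15, 16, 17, 63, 64, 65, 127, 191, 4096 + 33, (1u << 21) + 7};
    for (size_t s = 0; s != sizeof(sizes) / sizeof(sizes[0]); ++s) {
        size_t length = sizes[s];
        char *text = (char *)malloc(length);
        for (size_t i = 0; i != length; ++i) text[i] = (char)(rand() & 0xFF);

        uint64_t expected = 0;
        for (size_t i = 0; i != length; ++i) expected += (uint8_t)text[i];

        assert(sz_checksum(text, length) == expected);
        free(text);
    }
    return 0;
}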