Skip to content

Commit a296292

Browse files
authored
Merge: Fast Search for Georgian & Hashing w/out Stack Protection 🔞
2 parents 7ca3964 + 374adbf commit a296292

File tree

11 files changed

+315
-53
lines changed

11 files changed

+315
-53
lines changed

.clang-format

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,23 @@ ReflowComments: true
88
UseTab: Never
99
IndentPPDirectives: None
1010

11+
# StringZilla-specific macros
12+
# Visibility/linkage macros that act like storage class specifiers
13+
StatementAttributeLikeMacros:
14+
- SZ_PUBLIC
15+
- SZ_INTERNAL
16+
- SZ_DYNAMIC
17+
- SZ_EXTERNAL
18+
19+
# Attribute-like macros (clang-format 12+)
20+
AttributeMacros:
21+
- SZ_NO_STACK_PROTECTOR
22+
- sz_align_
23+
24+
# Macros that behave like type qualifiers in parameters
25+
TypenameMacros:
26+
- sz_at_least_
27+
1128
AlignConsecutiveAssignments: false
1229
AlignConsecutiveDeclarations: false
1330
AlignEscapedNewlines: true

.gitignore

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -31,16 +31,11 @@ node_modules/
3131
.tmp/
3232

3333
# Recommended datasets
34-
utf8.txt
35-
leipzig1M.txt
36-
enwik9.txt
37-
xlsum.csv
38-
acgt_100.txt
39-
acgt_100k.txt
40-
acgt_10k.txt
41-
acgt_10m.txt
42-
acgt_1k.txt
43-
acgt_1m.txt
34+
/utf8.txt
35+
/enwik9.txt
36+
/xlsum.csv
37+
/acgt*.txt
38+
/leipzig*.txt
4439

4540
# StringZilla-specific log files
4641
/failed_sz_*

CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -861,5 +861,5 @@ if (STRINGZILLA_INSTALL)
861861
endif ()
862862

863863
install(DIRECTORY ${STRINGZILLA_INCLUDE_BUILD_DIR} DESTINATION ${STRINGZILLA_INCLUDE_INSTALL_DIR})
864-
install(DIRECTORY ./c/ DESTINATION /usr/src/${PROJECT_NAME}/)
864+
install(DIRECTORY c/ DESTINATION ${CMAKE_INSTALL_DATADIR}/${PROJECT_NAME}/src)
865865
endif ()

include/stringzilla/compare.h

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -347,10 +347,9 @@ SZ_PUBLIC sz_ordering_t sz_order_skylake(sz_cptr_t a, sz_size_t a_length, sz_cpt
347347
b_vec.zmm = _mm512_maskz_loadu_epi8(head_mask, b);
348348
__mmask64 mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm);
349349
if (mask_not_equal != 0) {
350+
// Reload from original memory (L1 cached) to avoid ZMM-to-stack spill.
350351
sz_u64_t first_diff = _tzcnt_u64(mask_not_equal);
351-
char a_char = a_vec.u8s[first_diff];
352-
char b_char = b_vec.u8s[first_diff];
353-
return sz_order_scalars_(a_char, b_char);
352+
return sz_order_scalars_(a[first_diff], b[first_diff]);
354353
}
355354
else if (head_length == a_length && head_length == b_length) { return sz_equal_k; }
356355
else { a += head_length, b += head_length, a_length -= head_length, b_length -= head_length; }
@@ -362,10 +361,9 @@ SZ_PUBLIC sz_ordering_t sz_order_skylake(sz_cptr_t a, sz_size_t a_length, sz_cpt
362361
b_vec.zmm = _mm512_loadu_si512(b);
363362
mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm);
364363
if (mask_not_equal != 0) {
364+
// Reload from original memory (L1 cached) to avoid ZMM-to-stack spill.
365365
sz_u64_t first_diff = _tzcnt_u64(mask_not_equal);
366-
char a_char = a_vec.u8s[first_diff];
367-
char b_char = b_vec.u8s[first_diff];
368-
return sz_order_scalars_(a_char, b_char);
366+
return sz_order_scalars_(a[first_diff], b[first_diff]);
369367
}
370368
a += 64, b += 64, a_length -= 64, b_length -= 64;
371369
}
@@ -381,10 +379,9 @@ SZ_PUBLIC sz_ordering_t sz_order_skylake(sz_cptr_t a, sz_size_t a_length, sz_cpt
381379
// been cheaper, if we didn't have to apply `_mm256_movemask_epi8` afterwards.
382380
mask_not_equal = _mm512_cmpneq_epi8_mask(a_vec.zmm, b_vec.zmm);
383381
if (mask_not_equal != 0) {
382+
// Reload from original memory (L1 cached) to avoid ZMM-to-stack spill.
384383
sz_u64_t first_diff = _tzcnt_u64(mask_not_equal);
385-
char a_char = a_vec.u8s[first_diff];
386-
char b_char = b_vec.u8s[first_diff];
387-
return sz_order_scalars_(a_char, b_char);
384+
return sz_order_scalars_(a[first_diff], b[first_diff]);
388385
}
389386
// From logic perspective, the hardest cases are "abc\0" and "abc".
390387
// The result must be `sz_greater_k`, as the latter is shorter.

include/stringzilla/find.h

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1132,17 +1132,34 @@ SZ_PUBLIC sz_cptr_t sz_find_byteset_haswell(sz_cptr_t text, sz_size_t length, sz
11321132

11331133
// Let's unzip even and odd elements and replicate them into both lanes of the YMM register.
11341134
// That way when we invoke `_mm256_shuffle_epi8` we can use the same mask for both lanes.
1135+
// Load the 32-byte filter as two 16-byte halves, separate even/odd bytes, pack, and broadcast to YMM.
1136+
sz_u128_vec_t byte_mask_vec;
1137+
sz_u128_vec_t filter_lo_vec, filter_hi_vec;
1138+
sz_u128_vec_t lo_evens_vec, hi_evens_vec;
1139+
sz_u128_vec_t lo_odds_vec, hi_odds_vec;
1140+
sz_u128_vec_t evens_xmm_vec, odds_xmm_vec;
11351141
sz_u256_vec_t filter_even_vec, filter_odd_vec;
1136-
for (sz_size_t i = 0; i != 16; ++i)
1137-
filter_even_vec.u8s[i] = filter->_u8s[i * 2], filter_odd_vec.u8s[i] = filter->_u8s[i * 2 + 1];
1138-
filter_even_vec.xmms[1] = filter_even_vec.xmms[0];
1139-
filter_odd_vec.xmms[1] = filter_odd_vec.xmms[0];
11401142

11411143
sz_u256_vec_t text_vec;
11421144
sz_u256_vec_t matches_vec;
11431145
sz_u256_vec_t lower_nibbles_vec, higher_nibbles_vec;
11441146
sz_u256_vec_t bitset_even_vec, bitset_odd_vec;
11451147
sz_u256_vec_t bitmask_vec, bitmask_lookup_vec;
1148+
1149+
byte_mask_vec.xmm = _mm_set1_epi16(0x00ff);
1150+
1151+
filter_lo_vec.xmm = _mm_lddqu_si128((__m128i const *)(filter));
1152+
filter_hi_vec.xmm = _mm_lddqu_si128((__m128i const *)(filter) + 1);
1153+
lo_evens_vec.xmm = _mm_and_si128(filter_lo_vec.xmm, byte_mask_vec.xmm);
1154+
hi_evens_vec.xmm = _mm_and_si128(filter_hi_vec.xmm, byte_mask_vec.xmm);
1155+
lo_odds_vec.xmm = _mm_srli_epi16(filter_lo_vec.xmm, 8);
1156+
hi_odds_vec.xmm = _mm_srli_epi16(filter_hi_vec.xmm, 8);
1157+
1158+
evens_xmm_vec.xmm = _mm_packus_epi16(lo_evens_vec.xmm, hi_evens_vec.xmm);
1159+
odds_xmm_vec.xmm = _mm_packus_epi16(lo_odds_vec.xmm, hi_odds_vec.xmm);
1160+
filter_even_vec.ymm = _mm256_set_m128i(evens_xmm_vec.xmm, evens_xmm_vec.xmm);
1161+
filter_odd_vec.ymm = _mm256_set_m128i(odds_xmm_vec.xmm, odds_xmm_vec.xmm);
1162+
11461163
bitmask_lookup_vec.ymm = _mm256_set_epi8( //
11471164
-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1, //
11481165
-128, 64, 32, 16, 8, 4, 2, 1, -128, 64, 32, 16, 8, 4, 2, 1);

include/stringzilla/hash.h

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,7 @@ SZ_DYNAMIC void sz_sha256_state_digest(sz_sha256_state_t const *state, sz_u8_t d
279279
SZ_PUBLIC sz_u64_t sz_bytesum_serial(sz_cptr_t text, sz_size_t length);
280280

281281
/** @copydoc sz_hash */
282-
SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length, sz_u64_t seed);
282+
SZ_PUBLIC SZ_NO_STACK_PROTECTOR sz_u64_t sz_hash_serial(sz_cptr_t text, sz_size_t length, sz_u64_t seed);
283283

284284
/** @copydoc sz_fill_random */
285285
SZ_PUBLIC void sz_fill_random_serial(sz_ptr_t text, sz_size_t length, sz_u64_t nonce);
@@ -296,7 +296,7 @@ SZ_PUBLIC sz_u64_t sz_hash_state_digest_serial(sz_hash_state_t const *state);
296296
#if SZ_USE_WESTMERE
297297

298298
/** @copydoc sz_hash */
299-
SZ_PUBLIC sz_u64_t sz_hash_westmere(sz_cptr_t text, sz_size_t length, sz_u64_t seed);
299+
SZ_PUBLIC SZ_NO_STACK_PROTECTOR sz_u64_t sz_hash_westmere(sz_cptr_t text, sz_size_t length, sz_u64_t seed);
300300

301301
/** @copydoc sz_fill_random */
302302
SZ_PUBLIC void sz_fill_random_westmere(sz_ptr_t text, sz_size_t length, sz_u64_t nonce);
@@ -338,7 +338,7 @@ SZ_PUBLIC sz_u64_t sz_bytesum_haswell(sz_cptr_t text, sz_size_t length);
338338
SZ_PUBLIC sz_u64_t sz_bytesum_skylake(sz_cptr_t text, sz_size_t length);
339339

340340
/** @copydoc sz_hash */
341-
SZ_PUBLIC sz_u64_t sz_hash_skylake(sz_cptr_t text, sz_size_t length, sz_u64_t seed);
341+
SZ_PUBLIC SZ_NO_STACK_PROTECTOR sz_u64_t sz_hash_skylake(sz_cptr_t text, sz_size_t length, sz_u64_t seed);
342342

343343
/** @copydoc sz_fill_random */
344344
SZ_PUBLIC void sz_fill_random_skylake(sz_ptr_t text, sz_size_t length, sz_u64_t nonce);
@@ -360,7 +360,7 @@ SZ_PUBLIC sz_u64_t sz_hash_state_digest_skylake(sz_hash_state_t const *state);
360360
SZ_PUBLIC sz_u64_t sz_bytesum_ice(sz_cptr_t text, sz_size_t length);
361361

362362
/** @copydoc sz_hash */
363-
SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t text, sz_size_t length, sz_u64_t seed);
363+
SZ_PUBLIC SZ_NO_STACK_PROTECTOR sz_u64_t sz_hash_ice(sz_cptr_t text, sz_size_t length, sz_u64_t seed);
364364

365365
/** @copydoc sz_fill_random */
366366
SZ_PUBLIC void sz_fill_random_ice(sz_ptr_t text, sz_size_t length, sz_u64_t nonce);
@@ -395,7 +395,7 @@ SZ_PUBLIC sz_u64_t sz_bytesum_neon(sz_cptr_t text, sz_size_t length);
395395
#if SZ_USE_NEON_AES
396396

397397
/** @copydoc sz_hash */
398-
SZ_PUBLIC sz_u64_t sz_hash_neon(sz_cptr_t text, sz_size_t length, sz_u64_t seed);
398+
SZ_PUBLIC SZ_NO_STACK_PROTECTOR sz_u64_t sz_hash_neon(sz_cptr_t text, sz_size_t length, sz_u64_t seed);
399399

400400
/** @copydoc sz_fill_random */
401401
SZ_PUBLIC void sz_fill_random_neon(sz_ptr_t text, sz_size_t length, sz_u64_t nonce);
@@ -824,7 +824,7 @@ SZ_INTERNAL sz_u64_t sz_hash_state_finalize_serial_(sz_hash_state_t const *state
824824
return mixed_in_register.u64s[0];
825825
}
826826

827-
SZ_PUBLIC sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
827+
SZ_PUBLIC SZ_NO_STACK_PROTECTOR sz_u64_t sz_hash_serial(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
828828
if (length <= 16) {
829829
// Initialize the AES block with a given seed
830830
sz_align_(16) sz_hash_minimal_t_ state;
@@ -1328,7 +1328,7 @@ SZ_INTERNAL sz_u64_t sz_hash_state_finalize_westmere_(sz_hash_state_t const *sta
13281328
return _mm_cvtsi128_si64(mixed_in_register);
13291329
}
13301330

1331-
SZ_PUBLIC sz_u64_t sz_hash_westmere(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
1331+
SZ_PUBLIC SZ_NO_STACK_PROTECTOR sz_u64_t sz_hash_westmere(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
13321332

13331333
if (length <= 16) {
13341334
// Initialize the AES block with a given seed
@@ -2223,7 +2223,7 @@ SZ_PUBLIC void sz_hash_state_init_skylake(sz_hash_state_t *state, sz_u64_t seed)
22232223
state->ins_length = 0;
22242224
}
22252225

2226-
SZ_PUBLIC sz_u64_t sz_hash_skylake(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
2226+
SZ_PUBLIC SZ_NO_STACK_PROTECTOR sz_u64_t sz_hash_skylake(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
22272227

22282228
if (length <= 16) {
22292229
// Initialize the AES block with a given seed
@@ -2562,7 +2562,7 @@ SZ_PUBLIC sz_u64_t sz_bytesum_ice(sz_cptr_t text, sz_size_t length) {
25622562
}
25632563
}
25642564

2565-
SZ_PUBLIC sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
2565+
SZ_PUBLIC SZ_NO_STACK_PROTECTOR sz_u64_t sz_hash_ice(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
25662566

25672567
// For short strings the "masked loads" are identical to Skylake-X and
25682568
// the "logic" is identical to Haswell.
@@ -3477,7 +3477,7 @@ SZ_PUBLIC sz_u64_t sz_hash_state_digest_neon(sz_hash_state_t const *state) {
34773477
}
34783478
}
34793479

3480-
SZ_PUBLIC sz_u64_t sz_hash_neon(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
3480+
SZ_PUBLIC SZ_NO_STACK_PROTECTOR sz_u64_t sz_hash_neon(sz_cptr_t start, sz_size_t length, sz_u64_t seed) {
34813481
if (length <= 16) {
34823482
// Initialize the AES block with a given seed
34833483
sz_align_(16) sz_hash_minimal_t_ state;

include/stringzilla/types.h

Lines changed: 40 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -153,25 +153,45 @@
153153
* - `SZ_DYNAMIC` is used for functions that are part of the public API, but are dispatched at runtime.
154154
* - `SZ_EXTERNAL` is used for third-party libraries that are linked dynamically.
155155
*/
156+
157+
#if defined(__cplusplus)
158+
#define SZ_C_INLINE inline
159+
#else
160+
#define SZ_C_INLINE inline static
161+
#endif
162+
156163
#if SZ_DYNAMIC_DISPATCH
157164
#if defined(_WIN32) || defined(__CYGWIN__)
158165
#define SZ_DYNAMIC __declspec(dllexport)
159166
#define SZ_EXTERNAL __declspec(dllimport)
160-
#define SZ_PUBLIC inline static
161-
#define SZ_INTERNAL inline static
167+
#define SZ_PUBLIC SZ_C_INLINE
168+
#define SZ_INTERNAL SZ_C_INLINE
162169
#else
163170
#define SZ_DYNAMIC extern __attribute__((visibility("default")))
164171
#define SZ_EXTERNAL extern
165-
#define SZ_PUBLIC __attribute__((unused)) inline static
166-
#define SZ_INTERNAL __attribute__((always_inline)) inline static
172+
#define SZ_PUBLIC __attribute__((unused)) SZ_C_INLINE
173+
#define SZ_INTERNAL __attribute__((always_inline)) SZ_C_INLINE
167174
#endif // _WIN32 || __CYGWIN__
168175
#else
169-
#define SZ_DYNAMIC inline static
176+
#define SZ_DYNAMIC SZ_C_INLINE
170177
#define SZ_EXTERNAL extern
171-
#define SZ_PUBLIC inline static
172-
#define SZ_INTERNAL inline static
178+
#define SZ_PUBLIC SZ_C_INLINE
179+
#define SZ_INTERNAL SZ_C_INLINE
173180
#endif // SZ_DYNAMIC_DISPATCH
174181

182+
/**
183+
* @brief Disables stack protection for performance-critical functions.
184+
*
185+
* GCC's `-fstack-protector-strong` inserts stack canary checks for functions with local arrays
186+
* or buffers. For hash functions that use fixed-size state structures, this is unnecessary
187+
* overhead (~10 cycles per call). This macro opts out of stack protection for such functions.
188+
*/
189+
#if defined(__GNUC__) || defined(__clang__)
190+
#define SZ_NO_STACK_PROTECTOR __attribute__((no_stack_protector))
191+
#else
192+
#define SZ_NO_STACK_PROTECTOR
193+
#endif
194+
175195
/**
176196
* @brief Alignment macro for N-byte alignment.
177197
*/
@@ -1358,11 +1378,21 @@ SZ_INTERNAL sz_size_t sz_size_log2i_nonzero(sz_size_t x) {
13581378

13591379
/**
13601380
* @brief Compute the smallest power of two greater than or equal to @p x.
1361-
* @pre Unlike the commonly used trick with `clz` intrinsics, is valid across the whole range of `x`, @b including
1362-
* 0.
1381+
* @note Uses LZCNT/CLZ for efficient computation on modern CPUs.
1382+
* Edge cases: bit_ceil(0) = 0, bit_ceil(1) = 1.
13631383
* @see https://stackoverflow.com/a/10143264
13641384
*/
13651385
SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) {
1386+
#if defined(__LZCNT__) || defined(__BMI__)
1387+
// Edge cases: 0 and 1 return themselves, avoids undefined clz(0).
1388+
if (x <= 1) return x;
1389+
#if SZ_IS_64BIT_
1390+
return (sz_size_t)1 << (64 - sz_u64_clz(x - 1));
1391+
#else
1392+
return (sz_size_t)1 << (32 - sz_u32_clz((sz_u32_t)(x - 1)));
1393+
#endif
1394+
#else
1395+
// The following trick is valid for 0 input as well.
13661396
x--;
13671397
x |= x >> 1;
13681398
x |= x >> 2;
@@ -1374,6 +1404,7 @@ SZ_INTERNAL sz_size_t sz_size_bit_ceil(sz_size_t x) {
13741404
#endif
13751405
x++;
13761406
return x;
1407+
#endif
13771408
}
13781409

13791410
/**

0 commit comments

Comments
 (0)