Skip to content

Commit bd7054e

Browse files
committed
Fix: Masks back to using BZHI
1 parent f3811d7 commit bd7054e

File tree

1 file changed

+16
-11
lines changed

1 file changed

+16
-11
lines changed

include/stringzilla/types.h

+16-11
Original file line numberDiff line numberDiff line change
@@ -800,17 +800,6 @@ SZ_INTERNAL sz_u64_t sz_u64_bytes_reverse(sz_u64_t val) { return __builtin_bswap
800800
/** @brief Reverses the byte order of a 32-bit unsigned integer using the compiler's byte-swap builtin. */
SZ_INTERNAL sz_u32_t sz_u32_bytes_reverse(sz_u32_t val) {
    sz_u32_t const reversed = __builtin_bswap32(val);
    return reversed;
}
801801
#endif
802802

803-
/*
804-
*/
805-
/** @brief Builds a mask with the lowest `n` bits set via the shift-and-decrement idiom, narrowed to 16 bits. */
SZ_INTERNAL sz_u16_t _sz_u16_mask_until(sz_size_t n) {
    // The shift is performed on an `unsigned int`; the result is narrowed on return.
    unsigned int const nth_bit = 0x0001u << n;
    return nth_bit - 1u;
}
806-
/** @brief Builds a mask with the lowest `n` bits set via the shift-and-decrement idiom. */
SZ_INTERNAL sz_u32_t _sz_u32_mask_until(sz_size_t n) {
    // The shift is performed on an `unsigned int`, so `n` must stay below its bit-width.
    unsigned int const nth_bit = 0x00000001u << n;
    return nth_bit - 1u;
}
807-
/** @brief Builds a mask with the lowest `n` bits set via the shift-and-decrement idiom. */
SZ_INTERNAL sz_u64_t _sz_u64_mask_until(sz_size_t n) {
    // The shift is performed on an `unsigned long long`, so `n` must stay below its bit-width.
    unsigned long long const nth_bit = 0x0000000000000001ull << n;
    return nth_bit - 1ull;
}
808-
/** @brief Mask with the lowest `n` bits set, saturating to all 16 bits once `n` reaches 16. */
SZ_INTERNAL sz_u16_t _sz_u16_clamp_mask_until(sz_size_t n) {
    if (n >= 16) {
        return 0xFFFFu;
    }
    return _sz_u16_mask_until(n);
}
809-
/** @brief Mask with the lowest `n` bits set, saturating to all 32 bits once `n` reaches 32. */
SZ_INTERNAL sz_u32_t _sz_u32_clamp_mask_until(sz_size_t n) {
    if (n >= 32) {
        return 0xFFFFFFFFu;
    }
    return _sz_u32_mask_until(n);
}
810-
/** @brief Mask with the lowest `n` bits set, saturating to all 64 bits once `n` reaches 64. */
SZ_INTERNAL sz_u64_t _sz_u64_clamp_mask_until(sz_size_t n) {
    if (n >= 64) {
        return 0xFFFFFFFFFFFFFFFFull;
    }
    return _sz_u64_mask_until(n);
}
813-
814803
/**
 *  @brief Rotates a 64-bit unsigned integer to the left by `r` bit positions.
 *  @param x The value to rotate.
 *  @param r The rotation distance; it is reduced modulo 64, so any value is accepted.
 *  @return `x` with its bits cyclically shifted left by `r % 64` positions.
 */
SZ_INTERNAL sz_u64_t sz_u64_rotl(sz_u64_t x, sz_u64_t r) {
    // Shifting a 64-bit value by 64 or more is undefined behavior in C, which the naive
    // `(x << r) | (x >> (64 - r))` form hits for `r == 0`. Masking both shift counts keeps them
    // within 0..63, while producing the same result for every `r` in 1..63.
    r &= 63;
    return (x << r) | (x >> ((64 - r) & 63));
}
815804

816805
/**
@@ -865,6 +854,22 @@ SZ_INTERNAL sz_i32_t sz_i32_min_of_two(sz_i32_t x, sz_i32_t y) { return y + ((x
865854
/** @brief Branchless maximum function for two signed 32-bit integers. */
866855
SZ_INTERNAL sz_i32_t sz_i32_max_of_two(sz_i32_t x, sz_i32_t y) {
    // With an arithmetic right shift, `sign_mask` is all-ones when `x < y` and zero otherwise,
    // so the subtracted term is either `x - y` (yielding `y`) or zero (yielding `x`).
    sz_i32_t const delta = x - y;
    sz_i32_t const sign_mask = delta >> 31;
    return x - (delta & sign_mask);
}
867856

857+
/* In AVX-512 we actively use masked operations and the "K mask registers".
858+
* Producing a mask for the first N elements of a sequence can be done using the `(1 << N) - 1` idiom.
859+
* It, however, induces undefined behavior once `N` reaches the operand width: `N == 64` or `N == 32` for 64-bit or 32-bit integers respectively.
860+
* Alternatively, the BZHI instruction can be used to clear all the bits starting from position N.
861+
*/
862+
#if SZ_USE_SKYLAKE || SZ_USE_ICE
863+
/** @brief Produces a 16-bit K-mask with the lowest `n` bits set, using BZHI on an all-ones 16-bit source. */
SZ_INTERNAL __mmask16 _sz_u16_mask_until(sz_size_t n) {
    unsigned int const low_bits = _bzhi_u32(0xFFFFu, n);
    return (__mmask16)low_bits;
}
864+
/** @brief Produces a 32-bit K-mask with the lowest `n` bits set, using BZHI on an all-ones 32-bit source. */
SZ_INTERNAL __mmask32 _sz_u32_mask_until(sz_size_t n) {
    unsigned long long const low_bits = _bzhi_u64(0xFFFFFFFFu, n);
    return (__mmask32)low_bits;
}
865+
/** @brief Produces a 64-bit K-mask with the lowest `n` bits set, using BZHI on an all-ones 64-bit source. */
SZ_INTERNAL __mmask64 _sz_u64_mask_until(sz_size_t n) {
    unsigned long long const low_bits = _bzhi_u64(0xFFFFFFFFFFFFFFFFull, n);
    return (__mmask64)low_bits;
}
866+
/** @brief 16-bit K-mask with the lowest `n` bits set, saturating to all-ones once `n` reaches 16. */
SZ_INTERNAL __mmask16 _sz_u16_clamp_mask_until(sz_size_t n) {
    if (n >= 16) {
        return 0xFFFFu;
    }
    return _sz_u16_mask_until(n);
}
867+
/** @brief 32-bit K-mask with the lowest `n` bits set, saturating to all-ones once `n` reaches 32. */
SZ_INTERNAL __mmask32 _sz_u32_clamp_mask_until(sz_size_t n) {
    if (n >= 32) {
        return 0xFFFFFFFFu;
    }
    return _sz_u32_mask_until(n);
}
868+
/** @brief 64-bit K-mask with the lowest `n` bits set, saturating to all-ones once `n` reaches 64. */
SZ_INTERNAL __mmask64 _sz_u64_clamp_mask_until(sz_size_t n) {
    if (n >= 64) {
        return 0xFFFFFFFFFFFFFFFFull;
    }
    return _sz_u64_mask_until(n);
}
871+
#endif
872+
868873
/**
869874
* @brief Byte-level equality comparison between two 64-bit integers.
870875
* @return 64-bit integer, where every top bit in each byte signifies a match.

0 commit comments

Comments
 (0)