From 14ee92c360ee506f42a5d6e8143b8cfecd690317 Mon Sep 17 00:00:00 2001 From: Ash Vardanian <1983160+ashvardanian@users.noreply.github.com> Date: Wed, 11 Dec 2024 14:45:42 +0000 Subject: [PATCH] Fix: Matching N3322 for `memcpy` UB in C2y https://developers.redhat.com/articles/2024/12/11/making-memcpynull-null-0-well-defined --- README.md | 2 ++ include/stringzilla/stringzilla.h | 32 ++++++++++++++++++------------- scripts/test.cpp | 13 +++++++++++-- 3 files changed, 32 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index dbfd3f9b..bf3872a7 100644 --- a/README.md +++ b/README.md @@ -1196,8 +1196,10 @@ __`SZ_AVOID_LIBC`__ and __`SZ_OVERRIDE_LIBC`__: > This may affect the type resolution system on obscure hardware platforms. > Moreover, one may let `stringzilla` override the common symbols like the `memcpy` and `memset` with its own implementations. > In that case you can use the [`LD_PRELOAD` trick][ld-preload-trick] to prioritize it's symbols over the ones from the LibC and accelerate existing string-heavy applications without recompiling them. +> It also adds a layer of security, as the `stringzilla` isn't [undefined for NULL inputs][redhat-memcpy-ub] like `memcpy(NULL, NULL, 0)`. [ld-preload-trick]: https://ashvardanian.com/posts/ld-preload-libsee +[redhat-memcpy-ub]: https://developers.redhat.com/articles/2024/12/11/making-memcpynull-null-0-well-defined __`SZ_AVOID_STL`__ and __`SZ_SAFETY_OVER_COMPATIBILITY`__: diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index bb543e89..6574a514 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -1941,7 +1941,7 @@ SZ_PUBLIC sz_cptr_t sz_find_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr if (!h_length) return SZ_NULL_CHAR; sz_cptr_t const h_end = h + h_length; -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. +#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity. #if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((sz_size_t)h & 7ull) && h < h_end; ++h) if (*h == *n) return h; @@ -1978,7 +1978,7 @@ sz_cptr_t sz_rfind_byte_serial(sz_cptr_t h, sz_size_t h_length, sz_cptr_t n) { // Reposition the `h` pointer to the end, as we will be walking backwards. h = h + h_length - 1; -#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevety. +#if !SZ_DETECT_BIG_ENDIAN // Use SWAR only on little-endian platforms for brevity. #if !SZ_USE_MISALIGNED_LOADS // Process the misaligned head, to void UB on unaligned 64-bit loads. for (; ((sz_size_t)(h + 1) & 7ull) && h >= h_start; --h) if (*h == *n) return h; @@ -2464,9 +2464,9 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; next_distances[i + 1] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); } - // Don't forget to populate the first row and the fiest column of the Levenshtein matrix. + // Don't forget to populate the first row and the first column of the Levenshtein matrix. next_distances[0] = next_distances[next_skew_diagonal_length - 1] = next_skew_diagonal_index; - // Perform a circular rotarion of those buffers, to reuse the memory. + // Perform a circular rotation of those buffers, to reuse the memory. sz_size_t *temporary = previous_distances; previous_distances = current_distances; current_distances = next_distances; @@ -2486,7 +2486,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_serial( // sz_size_t cost_if_deletion_or_insertion = sz_min_of_two(current_distances[i], current_distances[i + 1]) + 1; next_distances[i] = sz_min_of_two(cost_if_deletion_or_insertion, cost_if_substitution); } - // Perform a circular rotarion of those buffers, to reuse the memory, this time, with a shift, + // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift, // dropping the first element in the current array. sz_size_t *temporary = previous_distances; previous_distances = current_distances + 1; @@ -3486,25 +3486,24 @@ SZ_PUBLIC void sz_string_free(sz_string_t *string, sz_memory_allocator_t *alloca sz_string_init(string); } -// When overriding libc, disable optimisations for this function beacuse MSVC will optimize the loops into a memset. +// When overriding libc, disable optimizations for this function because MSVC will optimize the loops into a memset. // Which then causes a stack overflow due to infinite recursion (memset -> sz_fill_serial -> memset). #if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC #pragma optimize("", off) #endif SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) { - sz_ptr_t end = target + length; // Dealing with short strings, a single sequential pass would be faster. // If the size is larger than 2 words, then at least 1 of them will be aligned. // But just one aligned word may not be worth SWAR. if (length < SZ_SWAR_THRESHOLD) - while (target != end) *(target++) = value; + while (length--) *(target++) = value; // In case of long strings, skip unaligned bytes, and then fill the rest in 64-bit chunks. else { sz_u64_t value64 = (sz_u64_t)value * 0x0101010101010101ull; - while ((sz_size_t)target & 7ull) *(target++) = value; - while (target + 8 <= end) *(sz_u64_t *)target = value64, target += 8; - while (target != end) *(target++) = value; + while ((sz_size_t)target & 7ull) *(target++) = value, length--; + while (length >= 8) *(sz_u64_t *)target = value64, target += 8, length -= 8; + while (length--) *(target++) = value; } } #if defined(_MSC_VER) && defined(SZ_OVERRIDE_LIBC) && SZ_OVERRIDE_LIBC @@ -3512,6 +3511,13 @@ SZ_PUBLIC void sz_fill_serial(sz_ptr_t target, sz_size_t length, sz_u8_t value) #endif SZ_PUBLIC void sz_copy_serial(sz_ptr_t target, sz_cptr_t source, sz_size_t length) { + // The most typical implementation of `memcpy` suffers from Undefined Behavior: + // + // for (char const *end = source + length; source < end; source++) *target++ = *source; + // + // As NULL pointer arithmetic is undefined for calls like `memcpy(NULL, NULL, 0)`. + // That's mitigated in C2y with the N3322 proposal, but our solution uses a design, that has no such issues. + // https://developers.redhat.com/articles/2024/12/11/making-memcpynull-null-0-well-defined #if SZ_USE_MISALIGNED_LOADS while (length >= 8) *(sz_u64_t *)target = *(sz_u64_t const *)source, target += 8, source += 8, length -= 8; #endif @@ -5215,7 +5221,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // } // Don't forget to populate the first row and the fiest column of the Levenshtein matrix. next_distances[0] = next_distances[next_skew_diagonal_length - 1] = (sz_u16_t)next_skew_diagonal_index; - // Perform a circular rotarion of those buffers, to reuse the memory. + // Perform a circular rotation of those buffers, to reuse the memory. sz_u16_t *temporary = previous_distances; previous_distances = current_distances; current_distances = next_distances; @@ -5257,7 +5263,7 @@ SZ_INTERNAL sz_size_t _sz_edit_distance_skewed_diagonals_upto65k_avx512( // i += register_length; } - // Perform a circular rotarion of those buffers, to reuse the memory, this time, with a shift, + // Perform a circular rotation of those buffers, to reuse the memory, this time, with a shift, // dropping the first element in the current array. sz_u16_t *temporary = previous_distances; previous_distances = current_distances + 1; diff --git a/scripts/test.cpp b/scripts/test.cpp index 47ef46d2..ead0c88d 100644 --- a/scripts/test.cpp +++ b/scripts/test.cpp @@ -153,14 +153,23 @@ inline void expect_equality(char const *a, char const *b, std::size_t size) { * Uses a large heap-allocated buffer to ensure that operations optimized for @b larger-than-L2-cache memory * regions are tested. Uses a combination of deterministic and random tests with uniform and exponential distributions. */ -static void test_memory_utilities(std::size_t experiments = 1024ull * 1024ull, - std::size_t max_l2_size = 1024ull * 1024ull) { +static void test_memory_utilities( // + std::size_t experiments = 1024ull * 1024ull, std::size_t max_l2_size = 1024ull * 1024ull) { // We will be mirroring the operations on both standard and StringZilla strings. std::string text_stl(max_l2_size, '-'); std::string text_sz(max_l2_size, '-'); expect_equality(text_stl.data(), text_sz.data(), max_l2_size); + // The traditional `memset` and `memcpy` functions are undefined for zero-length buffers and NULL pointers + // for older C standards. However, with the N3322 proposal for C2y, that issue has been resolved. + // https://developers.redhat.com/articles/2024/12/11/making-memcpynull-null-0-well-defined + // + // Let's make sure, that our versions don't trigger any undefined behavior. + sz::memset(NULL, 0, 0); + sz::memcpy(NULL, NULL, 0); + sz::memmove(NULL, NULL, 0); + // First start with simple deterministic tests. // Let's use `memset` to fill the strings with a pattern like "122333444455555...00000000000011111111111..." std::size_t count_groups = 0;