diff --git a/CMakeLists.txt b/CMakeLists.txt index 68bbd2a4..2712f45d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -338,6 +338,11 @@ if(${STRINGZILLA_BUILD_SHARED}) target_compile_definitions(stringzillite PRIVATE "SZ_AVOID_LIBC=1") target_compile_definitions(stringzillite PRIVATE "SZ_OVERRIDE_LIBC=1") + if (MSVC) + target_link_libraries(stringzilla_shared PRIVATE msvcrt.lib) + target_link_libraries(stringzillite PRIVATE msvcrt.lib) + endif() + # Avoid built-ins on MSVC and other compilers, as that will cause compileration errors target_compile_options(stringzillite PRIVATE "$<$:-fno-builtin;-nostdlib>" diff --git a/README.md b/README.md index 821be609..dbfd3f9b 100644 --- a/README.md +++ b/README.md @@ -171,7 +171,7 @@ __Who is this for?__ arm: 9.4 MB/s - uniform_int_distribution
+ std::uniform_int_distribution
x86: 47.2 · arm: 20.4 MB/s @@ -193,7 +193,7 @@ __Who is this for?__ ⚪ - transform
+ std::transform
x86: 3.81 · arm: 2.65 GB/s diff --git a/include/stringzilla/stringzilla.h b/include/stringzilla/stringzilla.h index fbfbf28b..8176ee13 100644 --- a/include/stringzilla/stringzilla.h +++ b/include/stringzilla/stringzilla.h @@ -5323,8 +5323,16 @@ SZ_PUBLIC void sz_look_up_transform_avx512(sz_cptr_t source, sz_size_t length, s // operate on 4 registers, it might be cleaner to use 2x separate `_mm512_permutexvar_epi8` calls. // Combining the results with 2x `_mm512_test_epi8_mask` and 3x blends afterwards. // - // - `_mm512_mask_blend_epi8` - 1 cycle latency, and generally 2x can run in parallel. - // - `_mm512_test_epi8_mask` - 3 cycles latency, same as most comparison functions in AVX-512. + // - 4x `_mm512_permutexvar_epi8` maps to "VPERMB (ZMM, ZMM, ZMM)": + // - On Ice Lake: 3 cycles latency, ports: 1*p5 + // - On Genoa: 6 cycles latency, ports: 1*FP12 + // - 3x `_mm512_mask_blend_epi8` maps to "VPBLENDMB_Z (ZMM, K, ZMM, ZMM)": + // - On Ice Lake: 3 cycles latency, ports: 1*p05 + // - On Genoa: 1 cycle latency, ports: 1*FP0123 + // - 2x `_mm512_test_epi8_mask` maps to "VPTESTMB (K, ZMM, ZMM)": + // - On Ice Lake: 3 cycles latency, ports: 1*p5 + // - On Genoa: 4 cycles latency, ports: 1*FP01 + // sz_u512_vec_t lut_0_to_63_vec, lut_64_to_127_vec, lut_128_to_191_vec, lut_192_to_255_vec; lut_0_to_63_vec.zmm = _mm512_loadu_si512((lut)); lut_64_to_127_vec.zmm = _mm512_loadu_si512((lut + 64));