From c6248e033849062153baae637780a72cd122b40d Mon Sep 17 00:00:00 2001 From: "Vazquez, Javier" Date: Thu, 22 Jun 2023 13:51:12 -0600 Subject: [PATCH 01/18] Added CRC32C AVX512 support. --- CMakeLists.txt | 2 + source/intel/asm/crc32c_sse42_asm.c | 242 ++++++++++++++++++++++++++-- 2 files changed, 227 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb1cd75..2056ae3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,9 +66,11 @@ if (USE_CPU_EXTENSIONS) source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC}) elseif(AWS_ARCH_INTEL AND AWS_HAVE_GCC_INLINE_ASM) + set(AWS_CMAKE_REQUIRED_FLAGS "-mavx512f -msse4.2 -mvpclmulqdq -mpclmul") file(GLOB AWS_ARCH_SRC "source/intel/asm/*.c" ) + SET_SOURCE_FILES_PROPERTIES(source/intel/asm/crc32c_sse42_asm.c COMPILE_FLAGS ${AWS_CMAKE_REQUIRED_FLAGS}) endif() if (MSVC AND AWS_ARCH_ARM64) diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index 35e1d09..7a4d550 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -6,6 +6,12 @@ #include #include +#include +#include +#include +#include + +#define zalign(x) __attribute__((aligned((x)))) /* clang-format off */ @@ -273,8 +279,195 @@ static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t return crc; } +/* + * crc32_avx512(): compute the crc32 of the buffer, where the buffer + * length must be at least 256, and a multiple of 64. Based on: + * + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 + */ +static uint32_t crc32_avx512_simd(const uint8_t *input, int length, uint32_t crc) +{ + /* + * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 + * are similar to those given at the end of the paper + * + * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 + * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 + * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 + * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 + * k5 = ( x ^ ( 128 + 32 ) mod P(x) << 32 )' << 1 + * k6 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 + */ + + static const uint64_t zalign(64) k1k2[] = { 0xdcb17aa4, 0xb9e02b86, + 0xdcb17aa4, 0xb9e02b86, + 0xdcb17aa4, 0xb9e02b86, + 0xdcb17aa4, 0xb9e02b86 }; + static const uint64_t zalign(64) k3k4[] = { 0x740eef02, 0x9e4addf8, + 0x740eef02, 0x9e4addf8, + 0x740eef02, 0x9e4addf8, + 0x740eef02, 0x9e4addf8 }; + static const uint64_t zalign(16) k5k6[] = { 0xf20c0dfe, 0x14cd00bd6 }; + static const uint64_t zalign(16) k7k8[] = { 0xdd45aab8, 0x000000000 }; + static const uint64_t zalign(16) poly[] = { 0x105ec76f1, 0xdea713f1 }; + + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + __m128i a0, a1, a2, a3; + + /* + * There's at least one block of 256. + */ + x1 = _mm512_loadu_si512((__m512i *)(input + 0x00)); + x2 = _mm512_loadu_si512((__m512i *)(input + 0x40)); + x3 = _mm512_loadu_si512((__m512i *)(input + 0x80)); + x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); + + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); + + x0 = _mm512_load_si512((__m512i *)k1k2); + + input += 256; + length -= 256; + + /* + * Parallel fold blocks of 256, if any. 
+ */ + while (length >= 256) + { + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); + x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); + x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); + + + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); + x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); + x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); + + y5 = _mm512_loadu_si512((__m512i *)(input + 0x00)); + y6 = _mm512_loadu_si512((__m512i *)(input + 0x40)); + y7 = _mm512_loadu_si512((__m512i *)(input + 0x80)); + y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); + + x1 = _mm512_xor_si512(x1, x5); + x2 = _mm512_xor_si512(x2, x6); + x3 = _mm512_xor_si512(x3, x7); + x4 = _mm512_xor_si512(x4, x8); + + x1 = _mm512_xor_si512(x1, y5); + x2 = _mm512_xor_si512(x2, y6); + x3 = _mm512_xor_si512(x3, y7); + x4 = _mm512_xor_si512(x4, y8); + + input += 256; + length -= 256; + } + + /* + * Fold into 512-bits. + */ + x0 = _mm512_load_si512((__m512i *)k3k4); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x2); + x1 = _mm512_xor_si512(x1, x5); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x3); + x1 = _mm512_xor_si512(x1, x5); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x4); + x1 = _mm512_xor_si512(x1, x5); + + /* + * Single fold blocks of 64, if any. + */ + while (length >= 64) + { + x2 = _mm512_loadu_si512((__m512i *)input); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x2); + x1 = _mm512_xor_si512(x1, x5); + + input += 64; + length -= 64; + } + + /* + * Fold 512-bits to 384-bits. + */ + a0 = _mm_load_si128((__m128i *)k5k6); + + a1 = _mm512_extracti32x4_epi32(x1, 0); + a2 = _mm512_extracti32x4_epi32(x1, 1); + + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); + + a1 = _mm_xor_si128(a1, a3); + a1 = _mm_xor_si128(a1, a2); + + /* + * Fold 384-bits to 256-bits. + */ + a2 = _mm512_extracti32x4_epi32(x1, 2); + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); + a1 = _mm_xor_si128(a1, a3); + a1 = _mm_xor_si128(a1, a2); + + /* + * Fold 256-bits to 128-bits. + */ + a2 = _mm512_extracti32x4_epi32(x1, 3); + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); + a1 = _mm_xor_si128(a1, a3); + a1 = _mm_xor_si128(a1, a2); + + /* + * Fold 128-bits to 64-bits. + */ + a2 = _mm_clmulepi64_si128(a1, a0, 0x10); + a3 = _mm_setr_epi32(~0, 0, ~0, 0); + a1 = _mm_srli_si128(a1, 8); + a1 = _mm_xor_si128(a1, a2); + + a0 = _mm_loadl_epi64((__m128i*)k7k8); + a2 = _mm_srli_si128(a1, 4); + a1 = _mm_and_si128(a1, a3); + a1 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_xor_si128(a1, a2); + + /* + * Barret reduce to 32-bits. + */ + a0 = _mm_load_si128((__m128i*)poly); + + a2 = _mm_and_si128(a1, a3); + a2 = _mm_clmulepi64_si128(a2, a0, 0x10); + a2 = _mm_and_si128(a2, a3); + a2 = _mm_clmulepi64_si128(a2, a0, 0x00); + a1 = _mm_xor_si128(a1, a2); + + /* + * Return the crc32. 
+ */ + return _mm_extract_epi32(a1, 1); +} + static bool detection_performed = false; static bool detected_clmul = false; +static bool detected_sse42 = false; +static bool detected_avx512 = false; /* * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and @@ -287,6 +480,8 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev if (AWS_UNLIKELY(!detection_performed)) { detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); + detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2); + detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); /* Simply setting the flag true to skip HW detection next time Not using memory barriers since the worst that can happen is a fallback to the non HW accelerated code. */ @@ -321,24 +516,37 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev /* Using likely to keep this code inlined */ if (AWS_LIKELY(detected_clmul)) { - - while (AWS_LIKELY(length >= 3072)) { - /* Compute crc32c on each block, chaining each crc result */ - crc = s_crc32c_sse42_clmul_3072(input, crc); - input += 3072; - length -= 3072; - } - while (AWS_LIKELY(length >= 1024)) { - /* Compute crc32c on each block, chaining each crc result */ - crc = s_crc32c_sse42_clmul_1024(input, crc); - input += 1024; - length -= 1024; + if (AWS_LIKELY(detected_avx512)) { + if (AWS_LIKELY(length >= 256)) { + ssize_t chunk_size = length & ~63; + crc = ~crc32_avx512_simd(input, length, crc); + /* check remaining data */ + length -= chunk_size; + if (!length) + return crc; + /* Fall into the default crc32 for the remaining data. */ + input += chunk_size; + } } - while (AWS_LIKELY(length >= 256)) { - /* Compute crc32c on each block, chaining each crc result */ - crc = s_crc32c_sse42_clmul_256(input, crc); - input += 256; - length -= 256; + else if (AWS_LIKELY(detected_sse42)) { + while (AWS_LIKELY(length >= 3072)) { + /* Compute crc32c on each block, chaining each crc result */ + crc = s_crc32c_sse42_clmul_3072(input, crc); + input += 3072; + length -= 3072; + } + while (AWS_LIKELY(length >= 1024)) { + /* Compute crc32c on each block, chaining each crc result */ + crc = s_crc32c_sse42_clmul_1024(input, crc); + input += 1024; + length -= 1024; + } + while (AWS_LIKELY(length >= 256)) { + /* Compute crc32c on each block, chaining each crc result */ + crc = s_crc32c_sse42_clmul_256(input, crc); + input += 256; + length -= 256; + } } } From cf22bcae847fd7c3470e4159d9c0e0e182aadd52 Mon Sep 17 00:00:00 2001 From: "Pulavarty, Badari" Date: Tue, 27 Jun 2023 12:54:41 -0400 Subject: [PATCH 02/18] Fixed routine name to indicate crc32c --- source/intel/asm/crc32c_sse42_asm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index 7a4d550..68f8c24 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -280,13 +280,13 @@ static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t } /* - * crc32_avx512(): compute the crc32 of the buffer, where the buffer + * crc32c_avx512(): compute the crc32c of the buffer, where the buffer * length must be at least 256, and a multiple of 64. Based on: * * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" * V. Gopal, E. 
Ozturk, et al., 2009, http://intel.ly/2ySEwL0 */ -static uint32_t crc32_avx512_simd(const uint8_t *input, int length, uint32_t crc) +static uint32_t crc32c_avx512(const uint8_t *input, int length, uint32_t crc) { /* * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 @@ -519,7 +519,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev if (AWS_LIKELY(detected_avx512)) { if (AWS_LIKELY(length >= 256)) { ssize_t chunk_size = length & ~63; - crc = ~crc32_avx512_simd(input, length, crc); + crc = ~crc32c_avx512(input, length, crc); /* check remaining data */ length -= chunk_size; if (!length) From 375fa35215a87d35166ece1ebb844700a462fcdf Mon Sep 17 00:00:00 2001 From: "Vazquez, Javier" Date: Thu, 13 Jul 2023 20:17:11 -0600 Subject: [PATCH 03/18] Add sse42 avx512_intrinsics support --- CMakeLists.txt | 20 +++-- .../intel/{asm => intrin}/crc32c_sse42_asm.c | 0 source/intel/visualc/visualc_crc32c_sse42.c | 77 ------------------- 3 files changed, 15 insertions(+), 82 deletions(-) rename source/intel/{asm => intrin}/crc32c_sse42_asm.c (100%) delete mode 100644 source/intel/visualc/visualc_crc32c_sse42.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 2056ae3..5af28e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ string(REPLACE ";" "${AWS_MODULE_DIR};" AWS_MODULE_PATH "${CMAKE_PREFIX_PATH}${A # Append that generated list to the module search path list(APPEND CMAKE_MODULE_PATH ${AWS_MODULE_PATH}) +include(AwsSIMD) include(AwsCFlags) include(AwsCheckHeaders) include(AwsSharedLibSetup) @@ -60,17 +61,26 @@ file(GLOB AWS_ARCH_SRC if (USE_CPU_EXTENSIONS) if (MSVC AND AWS_ARCH_INTEL) file(GLOB AWS_ARCH_SRC - "source/intel/visualc/*.c" + "source/intel/intrin/*.c" ) - source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC}) + source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_SRC}) elseif(AWS_ARCH_INTEL AND AWS_HAVE_GCC_INLINE_ASM) - set(AWS_CMAKE_REQUIRED_FLAGS "-mavx512f -msse4.2 -mvpclmulqdq -mpclmul") + if (HAVE_SSE42_INTRINSICS) + set(SSE42_CFLAGS "-msse4.2") + endif() + + if (HAVE_AVX512_INTRINSICS) + set(AVX512_CFLAGS "-mavx512f") + endif() + + set(AWS_CMAKE_REQUIRED_FLAGS "${SSE42_CFLAGS} ${AVX512_CFLAGS} -mvpclmulqdq -mpclmul") file(GLOB AWS_ARCH_SRC - "source/intel/asm/*.c" + "source/intel/intrin/*.c" ) - SET_SOURCE_FILES_PROPERTIES(source/intel/asm/crc32c_sse42_asm.c COMPILE_FLAGS ${AWS_CMAKE_REQUIRED_FLAGS}) + message(STATUS "CFLAGS: ${AWS_CMAKE_REQUIRED_FLAGS}") + SET_SOURCE_FILES_PROPERTIES(source/intel/intrin/crc32c_sse42_asm.c COMPILE_FLAGS ${AWS_CMAKE_REQUIRED_FLAGS}) endif() if (MSVC AND AWS_ARCH_ARM64) diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/intrin/crc32c_sse42_asm.c similarity index 100% rename from source/intel/asm/crc32c_sse42_asm.c rename to source/intel/intrin/crc32c_sse42_asm.c diff --git a/source/intel/visualc/visualc_crc32c_sse42.c b/source/intel/visualc/visualc_crc32c_sse42.c deleted file mode 100644 index ca1aca4..0000000 --- a/source/intel/visualc/visualc_crc32c_sse42.c +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - * SPDX-License-Identifier: Apache-2.0. - */ - -#include -#include - -#if defined(_M_X64) || defined(_M_IX86) - -# if defined(_M_X64) -typedef uint64_t *slice_ptr_type; -typedef uint64_t slice_ptr_int_type; -# else -typedef uint32_t *slice_ptr_type; -typedef uint32_t slice_ptr_int_type; -# endif - -/** - * This implements crc32c via the intel sse 4.2 instructions. 
- * This is separate from the straight asm version, because visual c does not allow - * inline assembly for x64. - */ -uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32) { - uint32_t crc = ~previousCrc32; - int length_to_process = length; - - slice_ptr_type temp = (slice_ptr_type)data; - - /*to eek good performance out of the intel implementation, we need to only hit the hardware - once we are aligned on the byte boundaries we are using. So, peel off a byte at a time until we are - 8 byte aligned (64 bit arch) or 4 byte aligned (32 bit arch) - - first calculate how many bytes we need to burn before we are aligned. - for a 64 bit arch this is: - (8 - ) mod 8 - 32 bit: - (4 - ) mod 4 */ - uint8_t alignment_offset = (sizeof(slice_ptr_int_type) - ((slice_ptr_int_type)temp % sizeof(slice_ptr_int_type))) % - sizeof(slice_ptr_int_type); - - /*for every byte we need to burn off, just do them a byte at a time. - increment the temp pointer by one byte at a time until we get it on an alignment boundary */ - while (alignment_offset != 0 && length_to_process) { - uint8_t *byte_pos = (uint8_t *)temp; - crc = (uint32_t)_mm_crc32_u8(crc, *byte_pos++); - temp = (slice_ptr_type)byte_pos; - --alignment_offset; - --length_to_process; - } - - /*now whatever is left is properly aligned on a boundary*/ - uint32_t slices = length_to_process / sizeof(temp); - uint32_t remainder = length_to_process % sizeof(temp); - - while (slices--) { -# if defined(_M_X64) - crc = (uint32_t)_mm_crc32_u64(crc, *temp++); -# else - crc = _mm_crc32_u32(crc, *temp++); -# endif - } - - /* process the remaining parts that can't be done on the slice size. */ - uint8_t *remainderPos = (uint8_t *)temp; - - while (remainder--) { - crc = (uint32_t)_mm_crc32_u8(crc, *remainderPos++); - } - - return ~crc; -} - -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); -} -#endif /* x64 || x86 */ From 2eb55784e408f144fb4445ce9d6c1065c6aa47bb Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 17:33:33 -0700 Subject: [PATCH 04/18] Refactoring work for the AVX512 code path. Testing shows it not quite working yet. 
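For orientation, this refactor splits the public entry point aws_checksums_crc32c_hw() (feature detection and dispatch in source/intel/crc_hw.c) from the SSE4.2 and AVX512 kernels, and the new test compares the dispatched result against the portable aws_checksums_crc32c_sw() reference. A minimal caller-side sketch of that check follows; the include path and buffer setup are assumptions, only the two function signatures come from this series.

    #include <aws/checksums/crc.h>   /* assumed header exposing the crc32c entry points */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        uint8_t buf[512];                /* >= 256 bytes so the AVX512 path can be exercised */
        memset(buf, 0xA5, sizeof(buf));  /* arbitrary fill pattern */

        /* hardware-dispatched CRC32C (SSE4.2 or AVX512, depending on the CPU) */
        uint32_t hw = aws_checksums_crc32c_hw(buf, (int)sizeof(buf), 0);
        /* portable software implementation used as the reference */
        uint32_t sw = aws_checksums_crc32c_sw(buf, (int)sizeof(buf), 0);

        printf("hw=%08x sw=%08x %s\n", (unsigned)hw, (unsigned)sw, hw == sw ? "match" : "MISMATCH");
        return hw == sw ? 0 : 1;
    }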
--- CMakeLists.txt | 48 ++-- .../private/intel/crc32c_compiler_shims.h | 25 ++ .../intel/{intrin => asm}/crc32c_sse42_asm.c | 251 ++---------------- source/intel/crc_hw.c | 96 +++++++ source/intel/intrin/crc32c_sse42_avx512.c | 197 ++++++++++++++ source/intel/visualc/visualc_crc32c_sse42.c | 61 +++++ tests/crc_test.c | 14 + 7 files changed, 443 insertions(+), 249 deletions(-) create mode 100644 include/aws/checksums/private/intel/crc32c_compiler_shims.h rename source/intel/{intrin => asm}/crc32c_sse42_asm.c (66%) create mode 100644 source/intel/crc_hw.c create mode 100644 source/intel/intrin/crc32c_sse42_avx512.c create mode 100644 source/intel/visualc/visualc_crc32c_sse42.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 5af28e5..45caa2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,28 +59,37 @@ file(GLOB AWS_ARCH_SRC ) if (USE_CPU_EXTENSIONS) - if (MSVC AND AWS_ARCH_INTEL) - file(GLOB AWS_ARCH_SRC - "source/intel/intrin/*.c" + if (AWS_ARCH_INTEL) + file (GLOB AWS_ARCH_INTEL_SRC + "source/intel/*.c" ) - source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_SRC}) - - elseif(AWS_ARCH_INTEL AND AWS_HAVE_GCC_INLINE_ASM) - if (HAVE_SSE42_INTRINSICS) - set(SSE42_CFLAGS "-msse4.2") - endif() - - if (HAVE_AVX512_INTRINSICS) - set(AVX512_CFLAGS "-mavx512f") + if (MSVC) + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/intrin/*.c" + "source/intel/visualc/*.c" + ) + else() + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/intrin/*.c" + ) endif() + + source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC}) + source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC}) - set(AWS_CMAKE_REQUIRED_FLAGS "${SSE42_CFLAGS} ${AVX512_CFLAGS} -mvpclmulqdq -mpclmul") - file(GLOB AWS_ARCH_SRC - "source/intel/intrin/*.c" + if (AWS_HAVE_GCC_INLINE_ASM) + file(GLOB AWS_ARCH_SRC + ${AWS_ARCH_INTEL_SRC} + ${AWS_ARCH_INTRIN_SRC} + ${AWS_ARCH_ASM_SRC} + ) + else() + file(GLOB AWS_ARCH_SRC + ${AWS_ARCH_INTEL_SRC} + ${AWS_ARCH_INTRIN_SRC} ) - message(STATUS "CFLAGS: ${AWS_CMAKE_REQUIRED_FLAGS}") - SET_SOURCE_FILES_PROPERTIES(source/intel/intrin/crc32c_sse42_asm.c COMPILE_FLAGS ${AWS_CMAKE_REQUIRED_FLAGS}) + endif() endif() if (MSVC AND AWS_ARCH_ARM64) @@ -126,6 +135,7 @@ file(GLOB CHECKSUMS_COMBINED_SRC add_library(${PROJECT_NAME} ${CHECKSUMS_COMBINED_HEADERS} ${CHECKSUMS_COMBINED_SRC}) + aws_set_common_properties(${PROJECT_NAME}) aws_prepare_symbol_visibility_args(${PROJECT_NAME} "AWS_CHECKSUMS") aws_check_headers(${PROJECT_NAME} ${AWS_CHECKSUMS_HEADERS}) @@ -135,6 +145,10 @@ aws_add_sanitizers(${PROJECT_NAME}) # We are not ABI stable yet set_target_properties(${PROJECT_NAME} PROPERTIES VERSION 1.0.0) +if (USE_CPU_EXTENSIONS AND AWS_ARCH_INTEL) + simd_add_source_avx(${PROJECT_NAME} ${AWS_ARCH_SRC}) +endif() + target_include_directories(${PROJECT_NAME} PUBLIC $ $) diff --git a/include/aws/checksums/private/intel/crc32c_compiler_shims.h b/include/aws/checksums/private/intel/crc32c_compiler_shims.h new file mode 100644 index 0000000..21002de --- /dev/null +++ b/include/aws/checksums/private/intel/crc32c_compiler_shims.h @@ -0,0 +1,25 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. 
+ */ + +#include + +#include +#include + +#if _WIN64 || __x86_64__ || __ppc64_ +typedef uint64_t *slice_ptr_type; +typedef uint64_t slice_ptr_int_type; +# define crc_intrin_fn _mm_crc32_u64 +#else +typedef uint32_t *slice_ptr_type; +typedef uint32_t slice_ptr_int_type; +# define crc_intrin_fn _mm_crc32_u32 +#endif + +#ifdef AWS_HAVE_AVX512_INTRINSICS +uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc); +#endif + +uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc); diff --git a/source/intel/intrin/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c similarity index 66% rename from source/intel/intrin/crc32c_sse42_asm.c rename to source/intel/asm/crc32c_sse42_asm.c index 68f8c24..e7c144e 100644 --- a/source/intel/intrin/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -6,12 +6,6 @@ #include #include -#include -#include -#include -#include - -#define zalign(x) __attribute__((aligned((x)))) /* clang-format off */ @@ -279,195 +273,8 @@ static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t return crc; } -/* - * crc32c_avx512(): compute the crc32c of the buffer, where the buffer - * length must be at least 256, and a multiple of 64. Based on: - * - * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" - * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 - */ -static uint32_t crc32c_avx512(const uint8_t *input, int length, uint32_t crc) -{ - /* - * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 - * are similar to those given at the end of the paper - * - * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 - * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 - * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 - * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 - * k5 = ( x ^ ( 128 + 32 ) mod P(x) << 32 )' << 1 - * k6 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 - */ - - static const uint64_t zalign(64) k1k2[] = { 0xdcb17aa4, 0xb9e02b86, - 0xdcb17aa4, 0xb9e02b86, - 0xdcb17aa4, 0xb9e02b86, - 0xdcb17aa4, 0xb9e02b86 }; - static const uint64_t zalign(64) k3k4[] = { 0x740eef02, 0x9e4addf8, - 0x740eef02, 0x9e4addf8, - 0x740eef02, 0x9e4addf8, - 0x740eef02, 0x9e4addf8 }; - static const uint64_t zalign(16) k5k6[] = { 0xf20c0dfe, 0x14cd00bd6 }; - static const uint64_t zalign(16) k7k8[] = { 0xdd45aab8, 0x000000000 }; - static const uint64_t zalign(16) poly[] = { 0x105ec76f1, 0xdea713f1 }; - - __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; - __m128i a0, a1, a2, a3; - - /* - * There's at least one block of 256. - */ - x1 = _mm512_loadu_si512((__m512i *)(input + 0x00)); - x2 = _mm512_loadu_si512((__m512i *)(input + 0x40)); - x3 = _mm512_loadu_si512((__m512i *)(input + 0x80)); - x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); - - x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); - - x0 = _mm512_load_si512((__m512i *)k1k2); - - input += 256; - length -= 256; - - /* - * Parallel fold blocks of 256, if any. 
- */ - while (length >= 256) - { - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); - x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); - x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); - - - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); - x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); - x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); - - y5 = _mm512_loadu_si512((__m512i *)(input + 0x00)); - y6 = _mm512_loadu_si512((__m512i *)(input + 0x40)); - y7 = _mm512_loadu_si512((__m512i *)(input + 0x80)); - y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); - - x1 = _mm512_xor_si512(x1, x5); - x2 = _mm512_xor_si512(x2, x6); - x3 = _mm512_xor_si512(x3, x7); - x4 = _mm512_xor_si512(x4, x8); - - x1 = _mm512_xor_si512(x1, y5); - x2 = _mm512_xor_si512(x2, y6); - x3 = _mm512_xor_si512(x3, y7); - x4 = _mm512_xor_si512(x4, y8); - - input += 256; - length -= 256; - } - - /* - * Fold into 512-bits. - */ - x0 = _mm512_load_si512((__m512i *)k3k4); - - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x2); - x1 = _mm512_xor_si512(x1, x5); - - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x3); - x1 = _mm512_xor_si512(x1, x5); - - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x4); - x1 = _mm512_xor_si512(x1, x5); - - /* - * Single fold blocks of 64, if any. - */ - while (length >= 64) - { - x2 = _mm512_loadu_si512((__m512i *)input); - - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x2); - x1 = _mm512_xor_si512(x1, x5); - - input += 64; - length -= 64; - } - - /* - * Fold 512-bits to 384-bits. - */ - a0 = _mm_load_si128((__m128i *)k5k6); - - a1 = _mm512_extracti32x4_epi32(x1, 0); - a2 = _mm512_extracti32x4_epi32(x1, 1); - - a3 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_clmulepi64_si128(a1, a0, 0x11); - - a1 = _mm_xor_si128(a1, a3); - a1 = _mm_xor_si128(a1, a2); - - /* - * Fold 384-bits to 256-bits. - */ - a2 = _mm512_extracti32x4_epi32(x1, 2); - a3 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_clmulepi64_si128(a1, a0, 0x11); - a1 = _mm_xor_si128(a1, a3); - a1 = _mm_xor_si128(a1, a2); - - /* - * Fold 256-bits to 128-bits. - */ - a2 = _mm512_extracti32x4_epi32(x1, 3); - a3 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_clmulepi64_si128(a1, a0, 0x11); - a1 = _mm_xor_si128(a1, a3); - a1 = _mm_xor_si128(a1, a2); - - /* - * Fold 128-bits to 64-bits. - */ - a2 = _mm_clmulepi64_si128(a1, a0, 0x10); - a3 = _mm_setr_epi32(~0, 0, ~0, 0); - a1 = _mm_srli_si128(a1, 8); - a1 = _mm_xor_si128(a1, a2); - - a0 = _mm_loadl_epi64((__m128i*)k7k8); - a2 = _mm_srli_si128(a1, 4); - a1 = _mm_and_si128(a1, a3); - a1 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_xor_si128(a1, a2); - - /* - * Barret reduce to 32-bits. - */ - a0 = _mm_load_si128((__m128i*)poly); - - a2 = _mm_and_si128(a1, a3); - a2 = _mm_clmulepi64_si128(a2, a0, 0x10); - a2 = _mm_and_si128(a2, a3); - a2 = _mm_clmulepi64_si128(a2, a0, 0x00); - a1 = _mm_xor_si128(a1, a2); - - /* - * Return the crc32. 
- */ - return _mm_extract_epi32(a1, 1); -} - static bool detection_performed = false; static bool detected_clmul = false; -static bool detected_sse42 = false; -static bool detected_avx512 = false; /* * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and @@ -476,12 +283,10 @@ static bool detected_avx512 = false; * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent * call. */ -uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { +uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) { if (AWS_UNLIKELY(!detection_performed)) { detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); - detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2); - detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); /* Simply setting the flag true to skip HW detection next time Not using memory barriers since the worst that can happen is a fallback to the non HW accelerated code. */ @@ -516,37 +321,24 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev /* Using likely to keep this code inlined */ if (AWS_LIKELY(detected_clmul)) { - if (AWS_LIKELY(detected_avx512)) { - if (AWS_LIKELY(length >= 256)) { - ssize_t chunk_size = length & ~63; - crc = ~crc32c_avx512(input, length, crc); - /* check remaining data */ - length -= chunk_size; - if (!length) - return crc; - /* Fall into the default crc32 for the remaining data. */ - input += chunk_size; - } + + while (AWS_LIKELY(length >= 3072)) { + /* Compute crc32c on each block, chaining each crc result */ + crc = s_crc32c_sse42_clmul_3072(input, crc); + input += 3072; + length -= 3072; } - else if (AWS_LIKELY(detected_sse42)) { - while (AWS_LIKELY(length >= 3072)) { - /* Compute crc32c on each block, chaining each crc result */ - crc = s_crc32c_sse42_clmul_3072(input, crc); - input += 3072; - length -= 3072; - } - while (AWS_LIKELY(length >= 1024)) { - /* Compute crc32c on each block, chaining each crc result */ - crc = s_crc32c_sse42_clmul_1024(input, crc); - input += 1024; - length -= 1024; - } - while (AWS_LIKELY(length >= 256)) { - /* Compute crc32c on each block, chaining each crc result */ - crc = s_crc32c_sse42_clmul_256(input, crc); - input += 256; - length -= 256; - } + while (AWS_LIKELY(length >= 1024)) { + /* Compute crc32c on each block, chaining each crc result */ + crc = s_crc32c_sse42_clmul_1024(input, crc); + input += 1024; + length -= 1024; + } + while (AWS_LIKELY(length >= 256)) { + /* Compute crc32c on each block, chaining each crc result */ + crc = s_crc32c_sse42_clmul_256(input, crc); + input += 256; + length -= 256; } } @@ -575,13 +367,8 @@ uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previ # endif #else -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { +uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) { return aws_checksums_crc32_sw(input, length, previousCrc32); } - -uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32c_sw(input, length, previousCrc32); -} - #endif /* clang-format on */ diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c new file mode 100644 index 0000000..1e1c85b --- /dev/null +++ b/source/intel/crc_hw.c @@ -0,0 +1,96 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ +#include +#include + +static bool detection_performed = false; +static bool detected_sse42 = false; +static bool detected_avx512 = false; +static bool detected_clmul = false; + +/* + * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and + * PCLMULQDQ machine instructions (if present). + * Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction. + * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent + * call. + */ +uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { + + if (AWS_UNLIKELY(!detection_performed)) { + detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2); + detected_avx512 = true; //aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); + detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); + /* Simply setting the flag true to skip HW detection next time + Not using memory barriers since the worst that can + happen is a fallback to the non HW accelerated code. */ + detection_performed = true; + } + + uint32_t crc = ~previousCrc32; + + /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ + if (length < sizeof(slice_ptr_int_type)) { + while (length-- > 0) { + crc = (uint32_t)_mm_crc32_u8(crc, *input++); + } + return ~crc; + } + + /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */ + int input_alignment = (uintptr_t)(input) & 0x7; + + /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */ + int leading = (8 - input_alignment) & 0x7; + + /* reduce the length by the leading unaligned bytes we are about to process */ + length -= leading; + + /* spin through the leading unaligned input bytes (if any) one-by-one */ + while (leading-- > 0) { + crc = (uint32_t)_mm_crc32_u8(crc, *input++); + } + + int chunk_size = length & ~63; + +#ifdef AWS_HAVE_AVX512_INTRINSICS + if (detected_avx512 && detected_clmul) { + if (length >= 256) { + crc = aws_checksums_crc32c_avx512(input, length, crc); + /* check remaining data */ + length -= chunk_size; + if (!length) { + return crc; + } + + /* Fall into the default crc32 for the remaining data. */ + input += chunk_size; + } + } +#endif + + if (detected_sse42 && detected_clmul) { + return aws_checksums_crc32c_sse42(input, length, crc); + } + + /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ + while (length >= sizeof(slice_ptr_int_type)) { + crc = (uint32_t)crc_intrin_fn(crc, *input); + input += sizeof(slice_ptr_int_type); + length -= sizeof(slice_ptr_int_type); + } + + /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ + while (length-- > 0) { + crc = (uint32_t)_mm_crc32_u8(crc, *input); + input++; + } + + return ~crc; +} + +uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { + return aws_checksums_crc32_sw(input, length, previousCrc32); +} diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c new file mode 100644 index 0000000..745bdac --- /dev/null +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -0,0 +1,197 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. 
+ */ + +#include + +#include + +#include +#include +#include +#include + +AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_8[8], 64); +AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2[2], 16); + +/* + * crc32c_avx512(): compute the crc32c of the buffer, where the buffer + * length must be at least 256, and a multiple of 64. Based on: + * + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 + */ +uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc) { + /* + * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 + * are similar to those given at the end of the paper + * + * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 + * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 + * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 + * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 + * k5 = ( x ^ ( 128 + 32 ) mod P(x) << 32 )' << 1 + * k6 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 + */ + + static zalign_8 k1k2[] = { + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + static zalign_8 k3k4[] = { + 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; + static zalign_2 k5k6[] = {0xf20c0dfe, 0x14cd00bd6}; + static zalign_2 k7k8[] = {0xdd45aab8, 0x000000000}; + static zalign_2 poly[] = {0x105ec76f1, 0xdea713f1}; + + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + __m128i a0, a1, a2, a3; + + /* + * There's at least one block of 256. + */ + x1 = _mm512_loadu_si512((__m512i *)(input + 0x00)); + x2 = _mm512_loadu_si512((__m512i *)(input + 0x40)); + x3 = _mm512_loadu_si512((__m512i *)(input + 0x80)); + x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); + + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); + + x0 = _mm512_load_si512((__m512i *)k1k2); + + input += 256; + length -= 256; + + /* + * Parallel fold blocks of 256, if any. + */ + while (length >= 256) + { + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); + x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); + x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); + + + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); + x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); + x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); + + y5 = _mm512_loadu_si512((__m512i *)(input + 0x00)); + y6 = _mm512_loadu_si512((__m512i *)(input + 0x40)); + y7 = _mm512_loadu_si512((__m512i *)(input + 0x80)); + y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); + + x1 = _mm512_xor_si512(x1, x5); + x2 = _mm512_xor_si512(x2, x6); + x3 = _mm512_xor_si512(x3, x7); + x4 = _mm512_xor_si512(x4, x8); + + x1 = _mm512_xor_si512(x1, y5); + x2 = _mm512_xor_si512(x2, y6); + x3 = _mm512_xor_si512(x3, y7); + x4 = _mm512_xor_si512(x4, y8); + + input += 256; + length -= 256; + } + + /* + * Fold into 512-bits. + */ + x0 = _mm512_load_si512((__m512i *)k3k4); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x2); + x1 = _mm512_xor_si512(x1, x5); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x3); + x1 = _mm512_xor_si512(x1, x5); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x4); + x1 = _mm512_xor_si512(x1, x5); + + /* + * Single fold blocks of 64, if any. 
+ */ + while (length >= 64) + { + x2 = _mm512_loadu_si512((__m512i *)input); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x2); + x1 = _mm512_xor_si512(x1, x5); + + input += 64; + length -= 64; + } + + /* + * Fold 512-bits to 384-bits. + */ + a0 = _mm_load_si128((__m128i *)k5k6); + + a1 = _mm512_extracti32x4_epi32(x1, 0); + a2 = _mm512_extracti32x4_epi32(x1, 1); + + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); + + a1 = _mm_xor_si128(a1, a3); + a1 = _mm_xor_si128(a1, a2); + + /* + * Fold 384-bits to 256-bits. + */ + a2 = _mm512_extracti32x4_epi32(x1, 2); + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); + a1 = _mm_xor_si128(a1, a3); + a1 = _mm_xor_si128(a1, a2); + + /* + * Fold 256-bits to 128-bits. + */ + a2 = _mm512_extracti32x4_epi32(x1, 3); + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); + a1 = _mm_xor_si128(a1, a3); + a1 = _mm_xor_si128(a1, a2); + + /* + * Fold 128-bits to 64-bits. + */ + a2 = _mm_clmulepi64_si128(a1, a0, 0x10); + a3 = _mm_setr_epi32(~0, 0, ~0, 0); + a1 = _mm_srli_si128(a1, 8); + a1 = _mm_xor_si128(a1, a2); + + a0 = _mm_loadl_epi64((__m128i*)k7k8); + a2 = _mm_srli_si128(a1, 4); + a1 = _mm_and_si128(a1, a3); + a1 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_xor_si128(a1, a2); + + /* + * Barret reduce to 32-bits. + */ + a0 = _mm_load_si128((__m128i*)poly); + + a2 = _mm_and_si128(a1, a3); + a2 = _mm_clmulepi64_si128(a2, a0, 0x10); + a2 = _mm_and_si128(a2, a3); + a2 = _mm_clmulepi64_si128(a2, a0, 0x00); + a1 = _mm_xor_si128(a1, a2); + + + /* + * Return the crc32. + */ + return ~_mm_extract_epi32(a1, 1); +} diff --git a/source/intel/visualc/visualc_crc32c_sse42.c b/source/intel/visualc/visualc_crc32c_sse42.c new file mode 100644 index 0000000..79ed836 --- /dev/null +++ b/source/intel/visualc/visualc_crc32c_sse42.c @@ -0,0 +1,61 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include + +/** + * This implements crc32c via the intel sse 4.2 instructions. + * This is separate from the straight asm version, because visual c does not allow + * inline assembly for x64. + */ +uint32_t aws_checksums_crc32c_sse42(const uint8_t *data, int length, uint32_t previousCrc32) { + uint32_t crc = previousCrc32; + int length_to_process = length; + + slice_ptr_type temp = (slice_ptr_type)data; + + /*to eek good performance out of the intel implementation, we need to only hit the hardware + once we are aligned on the byte boundaries we are using. So, peel off a byte at a time until we are + 8 byte aligned (64 bit arch) or 4 byte aligned (32 bit arch) + + first calculate how many bytes we need to burn before we are aligned. + for a 64 bit arch this is: + (8 - ) mod 8 + 32 bit: + (4 - ) mod 4 */ + uint8_t alignment_offset = (sizeof(slice_ptr_int_type) - ((slice_ptr_int_type)temp % sizeof(slice_ptr_int_type))) % + sizeof(slice_ptr_int_type); + + /*for every byte we need to burn off, just do them a byte at a time. 
+ increment the temp pointer by one byte at a time until we get it on an alignment boundary */ + while (alignment_offset != 0 && length_to_process) { + uint8_t *byte_pos = (uint8_t *)temp; + crc = (uint32_t)_mm_crc32_u8(crc, *byte_pos++); + temp = (slice_ptr_type)byte_pos; + --alignment_offset; + --length_to_process; + } + + /*now whatever is left is properly aligned on a boundary*/ + uint32_t slices = length_to_process / sizeof(temp); + uint32_t remainder = length_to_process % sizeof(temp); + + while (slices--) { +# if defined(_M_X64) + crc = (uint32_t)_mm_crc32_u64(crc, *temp++); +# else + crc = _mm_crc32_u32(crc, *temp++); +# endif + } + + /* process the remaining parts that can't be done on the slice size. */ + uint8_t *remainderPos = (uint8_t *)temp; + + while (remainder--) { + crc = (uint32_t)_mm_crc32_u8(crc, *remainderPos++); + } + + return ~crc; +} diff --git a/tests/crc_test.c b/tests/crc_test.c index ec9d2a4..58da0ea 100644 --- a/tests/crc_test.c +++ b/tests/crc_test.c @@ -5,6 +5,9 @@ #include #include + +#include + #include static const uint8_t DATA_32_ZEROS[32] = {0}; @@ -99,6 +102,17 @@ static int s_test_crc32c(struct aws_allocator *allocator, void *ctx) { res |= s_test_known_crc32c(CRC_FUNC_NAME(aws_checksums_crc32c)); res |= s_test_known_crc32c(CRC_FUNC_NAME(aws_checksums_crc32c_sw)); + + struct aws_byte_buf avx_buf; + /* enough for two avx512 runs */ + aws_byte_buf_init(&avx_buf, allocator, 512); + aws_device_random_buffer(&avx_buf); + + uint32_t crc = aws_checksums_crc32c_sw(avx_buf.buffer, avx_buf.len, 0); + + uint32_t hw_crc = aws_checksums_crc32c_hw(avx_buf.buffer, avx_buf.len, 0); + ASSERT_UINT_EQUALS(hw_crc, crc); + return res; } AWS_TEST_CASE(test_crc32c, s_test_crc32c) From 837d5a19571ce29909e831243956d50967935f35 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 17:43:18 -0700 Subject: [PATCH 05/18] Keep the naive avx512 path on for figuring out codebuild capabilities, turn it off once we know what's supported where. --- tests/crc_test.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/crc_test.c b/tests/crc_test.c index 58da0ea..d28eb5b 100644 --- a/tests/crc_test.c +++ b/tests/crc_test.c @@ -108,9 +108,8 @@ static int s_test_crc32c(struct aws_allocator *allocator, void *ctx) { aws_byte_buf_init(&avx_buf, allocator, 512); aws_device_random_buffer(&avx_buf); - uint32_t crc = aws_checksums_crc32c_sw(avx_buf.buffer, avx_buf.len, 0); - - uint32_t hw_crc = aws_checksums_crc32c_hw(avx_buf.buffer, avx_buf.len, 0); + uint32_t crc = aws_checksums_crc32c_sw(avx_buf.buffer, (int)avx_buf.len, 0); + uint32_t hw_crc = aws_checksums_crc32c_hw(avx_buf.buffer, (int)avx_buf.len, 0); ASSERT_UINT_EQUALS(hw_crc, crc); return res; From 2289c9649c6d8222945ed06190cdb804f78c7acc Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 17:58:39 -0700 Subject: [PATCH 06/18] Fix build and do correct cpu feature detection. 
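For context, aws_cpu_has_feature() abstracts the runtime CPUID checks that this fix wires up (SSE4.2, CLMUL, AVX512). Purely as an illustration of what those checks amount to on GCC or Clang, the same queries can be sketched with the __builtin_cpu_supports() builtin; the helper below is hypothetical, is not how the library implements detection, and a later patch in this series additionally requires VPCLMULQDQ before taking the AVX512 path.

    #include <stdbool.h>

    /* Illustrative only: true when the CPU reports the features the AVX512 CRC path relies on. */
    static bool cpu_supports_avx512_crc_path(void) {
        return __builtin_cpu_supports("avx512f")    /* AVX512 foundation */
            && __builtin_cpu_supports("pclmul")     /* carry-less multiply */
            && __builtin_cpu_supports("sse4.2");    /* CRC32 instructions for the scalar tail */
    }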
--- CMakeLists.txt | 23 +++++++++++++++-------- source/intel/crc_hw.c | 8 ++++---- source/intel/intrin/crc32c_sse42_avx512.c | 14 +++++--------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45caa2b..06702e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,15 +64,22 @@ if (USE_CPU_EXTENSIONS) "source/intel/*.c" ) - if (MSVC) - file(GLOB AWS_ARCH_INTRIN_SRC - "source/intel/intrin/*.c" - "source/intel/visualc/*.c" - ) + if (AWS_HAVE_AVX2_INTRINSICS) + if (MSVC) + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/intrin/*.c" + "source/intel/visualc/*.c" + ) + else() + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/intrin/*.c" + ) + endif() else() - file(GLOB AWS_ARCH_INTRIN_SRC - "source/intel/intrin/*.c" - ) + if (MSVC) + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/visualc/*.c") + endif() endif() source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC}) diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c index 1e1c85b..0154f05 100644 --- a/source/intel/crc_hw.c +++ b/source/intel/crc_hw.c @@ -21,7 +21,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev if (AWS_UNLIKELY(!detection_performed)) { detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2); - detected_avx512 = true; //aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); + detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); /* Simply setting the flag true to skip HW detection next time Not using memory barriers since the worst that can @@ -40,7 +40,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev } /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */ - int input_alignment = (uintptr_t)(input) & 0x7; + int input_alignment = (uintptr_t)(input)&0x7; /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */ int leading = (8 - input_alignment) & 0x7; @@ -68,11 +68,11 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev /* Fall into the default crc32 for the remaining data. */ input += chunk_size; } - } + } #endif if (detected_sse42 && detected_clmul) { - return aws_checksums_crc32c_sse42(input, length, crc); + return aws_checksums_crc32c_sse42(input, length, crc); } /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index 745bdac..5b513d3 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -8,9 +8,9 @@ #include #include +#include #include #include -#include AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_8[8], 64); AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2[2], 16); @@ -64,14 +64,12 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t /* * Parallel fold blocks of 256, if any. */ - while (length >= 256) - { + while (length >= 256) { x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); @@ -119,8 +117,7 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t /* * Single fold blocks of 64, if any. 
*/ - while (length >= 64) - { + while (length >= 64) { x2 = _mm512_loadu_si512((__m512i *)input); x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); @@ -172,7 +169,7 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t a1 = _mm_srli_si128(a1, 8); a1 = _mm_xor_si128(a1, a2); - a0 = _mm_loadl_epi64((__m128i*)k7k8); + a0 = _mm_loadl_epi64((__m128i *)k7k8); a2 = _mm_srli_si128(a1, 4); a1 = _mm_and_si128(a1, a3); a1 = _mm_clmulepi64_si128(a1, a0, 0x00); @@ -181,7 +178,7 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t /* * Barret reduce to 32-bits. */ - a0 = _mm_load_si128((__m128i*)poly); + a0 = _mm_load_si128((__m128i *)poly); a2 = _mm_and_si128(a1, a3); a2 = _mm_clmulepi64_si128(a2, a0, 0x10); @@ -189,7 +186,6 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t a2 = _mm_clmulepi64_si128(a2, a0, 0x00); a1 = _mm_xor_si128(a1, a2); - /* * Return the crc32. */ From ee3e5da419d0329e5c2deb1952756ba192df12e6 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 18:05:24 -0700 Subject: [PATCH 07/18] fix 32-bit builds and builds that need to work without intrinsics available. --- CMakeLists.txt | 2 +- source/intel/crc_hw.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06702e6..05265d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,7 +64,7 @@ if (USE_CPU_EXTENSIONS) "source/intel/*.c" ) - if (AWS_HAVE_AVX2_INTRINSICS) + if (AWS_HAVE_AVX2_INTRINSICS AND CMAKE_SIZEOF_VOID_P EQUAL 8) if (MSVC) file(GLOB AWS_ARCH_INTRIN_SRC "source/intel/intrin/*.c" diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c index 0154f05..f74ba4c 100644 --- a/source/intel/crc_hw.c +++ b/source/intel/crc_hw.c @@ -32,7 +32,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev uint32_t crc = ~previousCrc32; /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ - if (length < sizeof(slice_ptr_int_type)) { + if (length < (int)sizeof(slice_ptr_int_type)) { while (length-- > 0) { crc = (uint32_t)_mm_crc32_u8(crc, *input++); } @@ -76,10 +76,10 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev } /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ - while (length >= sizeof(slice_ptr_int_type)) { + while (length >= (int)sizeof(slice_ptr_int_type)) { crc = (uint32_t)crc_intrin_fn(crc, *input); input += sizeof(slice_ptr_int_type); - length -= sizeof(slice_ptr_int_type); + length -= (int)sizeof(slice_ptr_int_type); } /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ From 005ed7ce058393b1496fd599c6e3d0314ecad53e Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 18:12:03 -0700 Subject: [PATCH 08/18] Not sure how the avx512 code got called. hopefully coedebuild is just busted. 
--- source/intel/crc_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c index f74ba4c..ef6d00f 100644 --- a/source/intel/crc_hw.c +++ b/source/intel/crc_hw.c @@ -53,9 +53,9 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev crc = (uint32_t)_mm_crc32_u8(crc, *input++); } +#ifdef AWS_HAVE_AVX512_INTRINSICS int chunk_size = length & ~63; -#ifdef AWS_HAVE_AVX512_INTRINSICS if (detected_avx512 && detected_clmul) { if (length >= 256) { crc = aws_checksums_crc32c_avx512(input, length, crc); From 39094d4eda05c58f7c5c1b2a9db09e51240bb0a3 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 18:14:48 -0700 Subject: [PATCH 09/18] Found why the wrong build files were being used at least. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 05265d9..43aa73f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,7 +64,7 @@ if (USE_CPU_EXTENSIONS) "source/intel/*.c" ) - if (AWS_HAVE_AVX2_INTRINSICS AND CMAKE_SIZEOF_VOID_P EQUAL 8) + if (AWS_HAVE_AVX512_INTRINSICS AND CMAKE_SIZEOF_VOID_P EQUAL 8) if (MSVC) file(GLOB AWS_ARCH_INTRIN_SRC "source/intel/intrin/*.c" From d4ffdc1b0a88971c3a4e19eba305d281b58852ab Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 18:19:06 -0700 Subject: [PATCH 10/18] Make test pass when it passes. --- tests/crc_test.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/crc_test.c b/tests/crc_test.c index d28eb5b..a8e8f55 100644 --- a/tests/crc_test.c +++ b/tests/crc_test.c @@ -110,6 +110,8 @@ static int s_test_crc32c(struct aws_allocator *allocator, void *ctx) { uint32_t crc = aws_checksums_crc32c_sw(avx_buf.buffer, (int)avx_buf.len, 0); uint32_t hw_crc = aws_checksums_crc32c_hw(avx_buf.buffer, (int)avx_buf.len, 0); + + aws_byte_buf_clean_up(&avx_buf); ASSERT_UINT_EQUALS(hw_crc, crc); return res; From bf799366846a079abbb7b7716aa3610fc4f28197 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 18:29:05 -0700 Subject: [PATCH 11/18] Try it again. --- CMakeLists.txt | 7 ++++++- source/intel/asm/crc32c_sse42_asm.c | 2 +- source/intel/intrin/crc32c_sse42_avx512.c | 19 ++++++++++--------- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 43aa73f..997c2ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,8 @@ if (USE_CPU_EXTENSIONS) else() if (MSVC) file(GLOB AWS_ARCH_INTRIN_SRC - "source/intel/visualc/*.c") + "source/intel/visualc/*.c" + ) endif() endif() @@ -86,6 +87,10 @@ if (USE_CPU_EXTENSIONS) source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC}) if (AWS_HAVE_GCC_INLINE_ASM) + file(GLOB AWS_ARCH_ASM_SRC + "source/intel/asm/*.c" + ) + file(GLOB AWS_ARCH_SRC ${AWS_ARCH_INTEL_SRC} ${AWS_ARCH_INTRIN_SRC} diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index e7c144e..dd7431e 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0. 
*/ -#include +#include #include diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index 5b513d3..35086c3 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -12,8 +12,8 @@ #include #include -AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_8[8], 64); -AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2[2], 16); +AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_8, 64); +AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2, 16); /* * crc32c_avx512(): compute the crc32c of the buffer, where the buffer @@ -35,13 +35,14 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t * k6 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 */ - static zalign_8 k1k2[] = { - 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; - static zalign_8 k3k4[] = { - 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; - static zalign_2 k5k6[] = {0xf20c0dfe, 0x14cd00bd6}; - static zalign_2 k7k8[] = {0xdd45aab8, 0x000000000}; - static zalign_2 poly[] = {0x105ec76f1, 0xdea713f1}; + static zalign_8 k1k2[8] = + {0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + + static zalign_8 k3k4[8] = + {0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; + static zalign_2 k5k6[2] = {0xf20c0dfe, 0x14cd00bd6}; + static zalign_2 k7k8[2] = {0xdd45aab8, 0x000000000}; + static zalign_2 poly[2] = {0x105ec76f1, 0xdea713f1}; __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; __m128i a0, a1, a2, a3; From 1e24d06f51055afed592d84cb043659fbc15ae44 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 18:33:47 -0700 Subject: [PATCH 12/18] fix leftover symbol collision. --- source/intel/asm/crc32c_sse42_asm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index dd7431e..85acdf5 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -358,9 +358,6 @@ uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t p return ~crc; } -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); -} # if defined(__clang__) # pragma clang diagnostic pop @@ -368,7 +365,7 @@ uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previ #else uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); + return aws_checksums_crc32c_sw(input, length, previousCrc32); } #endif /* clang-format on */ From 907e7215f990a87932e75e5715a81fc2f397f5c5 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Wed, 19 Jul 2023 11:46:06 -0700 Subject: [PATCH 13/18] Added more compile gates and assertions. 
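For reference, the gates added in this patch layer three kinds of checks around the AVX512 call site in source/intel/crc_hw.c. The sketch below condenses that call site to show the shape of the gating; it is a paraphrase for illustration, not a verbatim copy of the patch.

    /* 1. Build-time: the toolchain compiled the AVX512 intrinsics and the target is 64-bit. */
    #if defined(AWS_HAVE_AVX512_INTRINSICS) && (INTPTR_MAX == INT64_MAX)
        /* 2. Run-time: the CPU actually reports the required features. */
        if (detected_avx512 && detected_clmul) {
            /* 3. Input size: the folded kernel asserts length >= 256 and consumes 64-byte
             *    blocks, so shorter input falls through to the SSE4.2/scalar code instead. */
            if (length >= 256) {
                int chunk_size = length & ~63;                         /* bytes the fold will consume */
                crc = aws_checksums_crc32c_avx512(input, length, crc);
                input += chunk_size;                                   /* 0..63 trailing bytes remain */
                length -= chunk_size;
            }
        }
    #endif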
--- source/intel/crc_hw.c | 4 ++-- source/intel/intrin/crc32c_sse42_avx512.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c index ef6d00f..3414d67 100644 --- a/source/intel/crc_hw.c +++ b/source/intel/crc_hw.c @@ -53,7 +53,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev crc = (uint32_t)_mm_crc32_u8(crc, *input++); } -#ifdef AWS_HAVE_AVX512_INTRINSICS +#if defined(AWS_HAVE_AVX512_INTRINSICS) && (INTPTR_MAX == INT64_MAX) int chunk_size = length & ~63; if (detected_avx512 && detected_clmul) { @@ -62,7 +62,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev /* check remaining data */ length -= chunk_size; if (!length) { - return crc; + return ~crc; } /* Fall into the default crc32 for the remaining data. */ diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index 35086c3..c25799d 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -23,6 +23,8 @@ AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2, 16); * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 */ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc) { + AWS_ASSERT( + length >= 256 && "invariant violated. length must be greater than 256 bytes to use avx512 to compute crc."); /* * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 * are similar to those given at the end of the paper From 5ab00462a861dc9a501057924ef3715ac7c9e585 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Thu, 20 Jul 2023 12:30:25 -0700 Subject: [PATCH 14/18] Fix osx build. --- source/intel/intrin/crc32c_sse42_avx512.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index c25799d..da26149 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -5,6 +5,7 @@ #include +#include #include #include From ca43c51c122305f09aaace72a2aef825efd4b062 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Thu, 20 Jul 2023 12:44:46 -0700 Subject: [PATCH 15/18] make the bitflips uniform. --- source/intel/asm/crc32c_sse42_asm.c | 3 ++- source/intel/intrin/crc32c_sse42_avx512.c | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index 85acdf5..7da4e50 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -293,7 +293,8 @@ uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t p detection_performed = true; } - uint32_t crc = ~previousCrc32; + /* this is called by a higher-level shim and previousCRC32 is already ~ */ + uint32_t crc = previousCrc32; /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ if (AWS_UNLIKELY(length < 8)) { diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index da26149..8274a3a 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -23,9 +23,12 @@ AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2, 16); * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" * V. Gopal, E. 
Ozturk, et al., 2009, http://intel.ly/2ySEwL0 */ -uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc) { +uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t previous_crc) { AWS_ASSERT( length >= 256 && "invariant violated. length must be greater than 256 bytes to use avx512 to compute crc."); + + fprintf(stderr, "Entered AVX512 branch."); + uint32_t crc = ~previous_crc; /* * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 * are similar to those given at the end of the paper From 28dde8b1e92b50dc3d06280942c4c1bb51f8c568 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Thu, 20 Jul 2023 13:01:39 -0700 Subject: [PATCH 16/18] add additional runtime cpuid check and run formatter. --- source/intel/crc_hw.c | 7 ++++++- source/intel/intrin/crc32c_sse42_avx512.c | 9 ++++----- source/intel/visualc/visualc_crc32c_sse42.c | 6 +++--- tests/crc_test.c | 1 - 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c index 3414d67..d571cc0 100644 --- a/source/intel/crc_hw.c +++ b/source/intel/crc_hw.c @@ -9,6 +9,7 @@ static bool detection_performed = false; static bool detected_sse42 = false; static bool detected_avx512 = false; static bool detected_clmul = false; +static bool detected_vpclmulqdq = false; /* * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and @@ -23,12 +24,16 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2); detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); + detected_vpclmulqdq = aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ); + /* Simply setting the flag true to skip HW detection next time Not using memory barriers since the worst that can happen is a fallback to the non HW accelerated code. */ detection_performed = true; } + /* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and + * branches.*/ uint32_t crc = ~previousCrc32; /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ @@ -56,7 +61,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev #if defined(AWS_HAVE_AVX512_INTRINSICS) && (INTPTR_MAX == INT64_MAX) int chunk_size = length & ~63; - if (detected_avx512 && detected_clmul) { + if (detected_avx512 && detected_vpclmulqdq && detected_clmul) { if (length >= 256) { crc = aws_checksums_crc32c_avx512(input, length, crc); /* check remaining data */ diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index 8274a3a..6f402de 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -27,7 +27,6 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t AWS_ASSERT( length >= 256 && "invariant violated. 
length must be greater than 256 bytes to use avx512 to compute crc."); - fprintf(stderr, "Entered AVX512 branch."); uint32_t crc = ~previous_crc; /* * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 @@ -41,11 +40,11 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t * k6 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 */ - static zalign_8 k1k2[8] = - {0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + static zalign_8 k1k2[8] = { + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; - static zalign_8 k3k4[8] = - {0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; + static zalign_8 k3k4[8] = { + 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; static zalign_2 k5k6[2] = {0xf20c0dfe, 0x14cd00bd6}; static zalign_2 k7k8[2] = {0xdd45aab8, 0x000000000}; static zalign_2 poly[2] = {0x105ec76f1, 0xdea713f1}; diff --git a/source/intel/visualc/visualc_crc32c_sse42.c b/source/intel/visualc/visualc_crc32c_sse42.c index 79ed836..707f2ba 100644 --- a/source/intel/visualc/visualc_crc32c_sse42.c +++ b/source/intel/visualc/visualc_crc32c_sse42.c @@ -43,11 +43,11 @@ uint32_t aws_checksums_crc32c_sse42(const uint8_t *data, int length, uint32_t pr uint32_t remainder = length_to_process % sizeof(temp); while (slices--) { -# if defined(_M_X64) +#if defined(_M_X64) crc = (uint32_t)_mm_crc32_u64(crc, *temp++); -# else +#else crc = _mm_crc32_u32(crc, *temp++); -# endif +#endif } /* process the remaining parts that can't be done on the slice size. */ diff --git a/tests/crc_test.c b/tests/crc_test.c index a8e8f55..0d8cfeb 100644 --- a/tests/crc_test.c +++ b/tests/crc_test.c @@ -102,7 +102,6 @@ static int s_test_crc32c(struct aws_allocator *allocator, void *ctx) { res |= s_test_known_crc32c(CRC_FUNC_NAME(aws_checksums_crc32c)); res |= s_test_known_crc32c(CRC_FUNC_NAME(aws_checksums_crc32c_sw)); - struct aws_byte_buf avx_buf; /* enough for two avx512 runs */ aws_byte_buf_init(&avx_buf, allocator, 512); From a00a8e3c0d586bf799f3bd3d8ae0c22b0f4917e1 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Thu, 20 Jul 2023 14:25:19 -0700 Subject: [PATCH 17/18] work around nasty bitflipping logic. --- source/intel/asm/crc32c_sse42_asm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index 7da4e50..bc79597 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -366,7 +366,10 @@ uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t p #else uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32c_sw(input, length, previousCrc32); + /* these are nested in a larger computation. As a result the crc doesn't need to be bit flipped. + However, the sw function is also used as a standalone implementation that does need to do the + bit flip. So go ahead and flip it here, so the sw implementation flips it back. 
*/ + return aws_checksums_crc32c_sw(input, length, ~previousCrc32); } #endif /* clang-format on */ From 51384077aeacbc8613ca0223680a922aea4ec090 Mon Sep 17 00:00:00 2001 From: pbadari <107280494+pbadari@users.noreply.github.com> Date: Sun, 31 Dec 2023 18:34:58 -0800 Subject: [PATCH 18/18] =?UTF-8?q?Addressed=20review=20comments,=20use=20te?= =?UTF-8?q?rnary=20logic=20instructions=20and=20optimiz=E2=80=A6=20(#73)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- source/intel/intrin/crc32c_sse42_avx512.c | 121 +++++++--------------- 1 file changed, 37 insertions(+), 84 deletions(-) diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index 6f402de..837a1ba 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -21,11 +21,11 @@ AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2, 16); * length must be at least 256, and a multiple of 64. Based on: * * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" - * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 + * V. Gopal, E. Ozturk, et al., 2009, http://download.intel.com/design/intarch/papers/323102.pdf */ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t previous_crc) { AWS_ASSERT( - length >= 256 && "invariant violated. length must be greater than 256 bytes to use avx512 to compute crc."); + length >= 256 && "invariant violated. length must be greater than 255 bytes to use avx512 to compute crc."); uint32_t crc = ~previous_crc; /* @@ -45,12 +45,14 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t static zalign_8 k3k4[8] = { 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; - static zalign_2 k5k6[2] = {0xf20c0dfe, 0x14cd00bd6}; - static zalign_2 k7k8[2] = {0xdd45aab8, 0x000000000}; - static zalign_2 poly[2] = {0x105ec76f1, 0xdea713f1}; + static zalign_8 k9k10[8] = { + 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092}; + static zalign_8 k1k4[8] = { + 0x1c291d04, 0xddc0152b, 0x3da6d0cb, 0xba4fc28e, 0xf20c0dfe, 0x493c7d27, 0x00000000, 0x00000000}; + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; - __m128i a0, a1, a2, a3; + __m128i a1, a2; /* * There's at least one block of 256. @@ -86,39 +88,31 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t y7 = _mm512_loadu_si512((__m512i *)(input + 0x80)); y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); - x1 = _mm512_xor_si512(x1, x5); - x2 = _mm512_xor_si512(x2, x6); - x3 = _mm512_xor_si512(x3, x7); - x4 = _mm512_xor_si512(x4, x8); - - x1 = _mm512_xor_si512(x1, y5); - x2 = _mm512_xor_si512(x2, y6); - x3 = _mm512_xor_si512(x3, y7); - x4 = _mm512_xor_si512(x4, y8); + x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96); + x2 = _mm512_ternarylogic_epi64(x2, x6, y6, 0x96); + x3 = _mm512_ternarylogic_epi64(x3, x7, y7, 0x96); + x4 = _mm512_ternarylogic_epi64(x4, x8, y8, 0x96); input += 256; length -= 256; } /* - * Fold into 512-bits. + * Fold 256 bytes into 64 bytes. 
*/ - x0 = _mm512_load_si512((__m512i *)k3k4); - + x0 = _mm512_load_si512((__m512i *)k9k10); x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x2); - x1 = _mm512_xor_si512(x1, x5); + x6 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x3 = _mm512_ternarylogic_epi64(x3, x5, x6, 0x96); - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x3); - x1 = _mm512_xor_si512(x1, x5); + x7 = _mm512_clmulepi64_epi128(x2, x0, 0x00); + x8 = _mm512_clmulepi64_epi128(x2, x0, 0x11); + x4 = _mm512_ternarylogic_epi64(x4, x7, x8, 0x96); - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x4); - x1 = _mm512_xor_si512(x1, x5); + x0 = _mm512_load_si512((__m512i *)k3k4); + y5 = _mm512_clmulepi64_epi128(x3, x0, 0x00); + y6 = _mm512_clmulepi64_epi128(x3, x0, 0x11); + x1 = _mm512_ternarylogic_epi64(x4, y5, y6, 0x96); /* * Single fold blocks of 64, if any. @@ -128,72 +122,31 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x2); - x1 = _mm512_xor_si512(x1, x5); + x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96); input += 64; length -= 64; } /* - * Fold 512-bits to 384-bits. + * Fold 512-bits to 128-bits. */ - a0 = _mm_load_si128((__m128i *)k5k6); - - a1 = _mm512_extracti32x4_epi32(x1, 0); - a2 = _mm512_extracti32x4_epi32(x1, 1); + x0 = _mm512_loadu_si512((__m512i *)k1k4); - a3 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_clmulepi64_si128(a1, a0, 0x11); - - a1 = _mm_xor_si128(a1, a3); - a1 = _mm_xor_si128(a1, a2); - - /* - * Fold 384-bits to 256-bits. - */ - a2 = _mm512_extracti32x4_epi32(x1, 2); - a3 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_clmulepi64_si128(a1, a0, 0x11); - a1 = _mm_xor_si128(a1, a3); - a1 = _mm_xor_si128(a1, a2); - - /* - * Fold 256-bits to 128-bits. - */ a2 = _mm512_extracti32x4_epi32(x1, 3); - a3 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_clmulepi64_si128(a1, a0, 0x11); - a1 = _mm_xor_si128(a1, a3); - a1 = _mm_xor_si128(a1, a2); - - /* - * Fold 128-bits to 64-bits. - */ - a2 = _mm_clmulepi64_si128(a1, a0, 0x10); - a3 = _mm_setr_epi32(~0, 0, ~0, 0); - a1 = _mm_srli_si128(a1, 8); - a1 = _mm_xor_si128(a1, a2); - - a0 = _mm_loadl_epi64((__m128i *)k7k8); - a2 = _mm_srli_si128(a1, 4); - a1 = _mm_and_si128(a1, a3); - a1 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_xor_si128(a1, a2); - - /* - * Barret reduce to 32-bits. - */ - a0 = _mm_load_si128((__m128i *)poly); + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_ternarylogic_epi64(x1, x5, _mm512_castsi128_si512(a2), 0x96); - a2 = _mm_and_si128(a1, a3); - a2 = _mm_clmulepi64_si128(a2, a0, 0x10); - a2 = _mm_and_si128(a2, a3); - a2 = _mm_clmulepi64_si128(a2, a0, 0x00); - a1 = _mm_xor_si128(a1, a2); + x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E); + x0 = _mm512_xor_epi64(x1, x0); + a1 = _mm512_extracti32x4_epi32(x0, 1); + a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0)); /* - * Return the crc32. + * Fold 128-bits to 32-bits. */ - return ~_mm_extract_epi32(a1, 1); + uint64_t val; + val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); + return (uint32_t) _mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); }
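
A note on the 0x96 immediate used with _mm512_ternarylogic_epi64 in the final patch: the imm8 is a 3-input truth table indexed by one bit from each operand, and 0x96 encodes a ^ b ^ c, which is why each ternary-logic call replaces a pair of XORs from the earlier code. A standalone scalar check of that truth table, with no AVX-512 required; the helper name is illustrative only:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Evaluate a 3-input truth table (imm8) bit by bit, the way VPTERNLOGQ does:
 * for each bit position, the result bit is (imm8 >> ((a << 2) | (b << 1) | c)) & 1. */
static uint64_t ternarylogic_scalar(uint64_t a, uint64_t b, uint64_t c, uint8_t imm8) {
    uint64_t out = 0;
    for (int bit = 0; bit < 64; ++bit) {
        unsigned idx = (unsigned)((((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1));
        out |= ((uint64_t)((imm8 >> idx) & 1)) << bit;
    }
    return out;
}

int main(void) {
    uint64_t a = 0xdcb17aa4b9e02b86ULL;
    uint64_t b = 0x740eef029e4addf8ULL;
    uint64_t c = 0x123456789abcdef0ULL;
    /* 0x96 is the truth table of a ^ b ^ c, so one ternary-logic op folds two XORs. */
    assert(ternarylogic_scalar(a, b, c, 0x96) == (a ^ b ^ c));
    printf("imm8 0x96 == three-way XOR\n");
    return 0;
}
```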
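
The final patch also drops the Barrett reduction: once the buffer has been folded down to the 128-bit remainder in a1, its two 64-bit lanes are simply run through the hardware CRC32 instruction starting from 0. Below is a minimal scalar model of that last step under the usual description of the SSE4.2 CRC32C instruction (byte at a time over the reflected Castagnoli polynomial 0x82F63B78, with no implicit pre/post inversion); the function names are illustrative, not part of the library:

```c
#include <stdint.h>

/* Bitwise model of one SSE4.2 CRC32C byte step. _mm_crc32_u64(crc, x) is
 * equivalent to feeding the 8 bytes of x, least significant first, through
 * this update. */
static uint32_t crc32c_update_byte(uint32_t crc, uint8_t byte) {
    crc ^= byte;
    for (int k = 0; k < 8; ++k) {
        crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
    }
    return crc;
}

/* Model of the tail of aws_checksums_crc32c_avx512 in the last hunk above:
 * the 128-bit remainder is consumed as two 64-bit lanes, starting from 0,
 * exactly as val = _mm_crc32_u64(0, lane0); _mm_crc32_u64(val, lane1). */
static uint32_t crc32c_fold_128_model(uint64_t lane0, uint64_t lane1) {
    uint32_t crc = 0;
    for (int i = 0; i < 8; ++i) {
        crc = crc32c_update_byte(crc, (uint8_t)(lane0 >> (8 * i)));
    }
    for (int i = 0; i < 8; ++i) {
        crc = crc32c_update_byte(crc, (uint8_t)(lane1 >> (8 * i)));
    }
    return crc;
}
```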