Skip to content

Commit 8733d2f

Browse files
committed
AVX512 and VPCLMULQDQ based CRC-32 and CRC-32C
This implementation is based on crc32_refl_by16_vclmul_avx512 in https://github.com/intel/intel-ipsec-mb/ with some optimizations. Changes to CMakeLists.txt and source/intel/asm/crc32c_sse42_asm.c are based on awslabs#72. This also fixes a bug in aws_checksums_crc32c_hw() when 128-bit pclmul is not available: crc_intrin_fn was being invoked on bytes instead of 32-bit or 64-bit words. The aws-checksums-tests suite was extended to cover all SIMD implementations. Note: The availability of the Intel CRC-32C instructions is checked as part of testing AWS_CPU_FEATURE_SSE_4_2. Both ISA extensions were introduced in the Intel Nehalem microarchitecture. To compile this, https://github.com/awslabs/aws-c-common must be installed and CMAKE_MODULE_PATH must point to it, e.g.: cmake -DCMAKE_MODULE_PATH=/usr/local/lib/cmake. The AWS_CPU_FEATURE_AVX512 check currently only tests for AVX512F and not the other features that this implementation depends on: AVX512VL, AVX512BW, AVX512DQ. According to https://en.wikipedia.org/wiki/AVX-512#CPUs_with_AVX-512 there currently exist no CPUs that support VPCLMULQDQ without also supporting all of those AVX512 features. The architecture target evex512 was introduced as mandatory in GCC 14 and clang 18 as part of introducing the AVX10.1-512 target, which is essentially a new name for a number of AVX512 features. Older compilers do not recognize this target, but they do emit EVEX-encoded instructions.
1 parent 785e1b5 commit 8733d2f

File tree

9 files changed

+609
-51
lines changed

9 files changed

+609
-51
lines changed

CMakeLists.txt

Lines changed: 44 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -58,17 +58,48 @@ file(GLOB AWS_ARCH_SRC
5858
)
5959

6060
if (USE_CPU_EXTENSIONS)
61-
if(AWS_ARCH_INTEL)
62-
# First, check if inline assembly is available. Inline assembly can also be supported by MSVC if the compiler in use is Clang.
63-
if(AWS_HAVE_GCC_INLINE_ASM)
64-
file(GLOB AWS_ARCH_SRC
65-
"source/intel/asm/*.c"
61+
if (AWS_ARCH_INTEL)
62+
file (GLOB AWS_ARCH_INTEL_SRC
63+
"source/intel/*.c"
64+
)
65+
66+
if (AWS_HAVE_AVX512_INTRINSICS)
67+
if (MSVC)
68+
file(GLOB AWS_ARCH_INTRIN_SRC
69+
"source/intel/intrin/*.c"
70+
"source/intel/visualc/*.c"
6671
)
67-
elseif (MSVC)
68-
file(GLOB AWS_ARCH_SRC
72+
else()
73+
file(GLOB AWS_ARCH_INTRIN_SRC
74+
"source/intel/intrin/*.c"
75+
)
76+
endif()
77+
else()
78+
if (MSVC)
79+
file(GLOB AWS_ARCH_INTRIN_SRC
6980
"source/intel/visualc/*.c"
81+
)
82+
endif()
83+
endif()
84+
85+
source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC})
86+
source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC})
87+
88+
if (AWS_HAVE_GCC_INLINE_ASM)
89+
file(GLOB AWS_ARCH_ASM_SRC
90+
"source/intel/asm/*.c"
91+
)
92+
93+
file(GLOB AWS_ARCH_SRC
94+
${AWS_ARCH_INTEL_SRC}
95+
${AWS_ARCH_INTRIN_SRC}
96+
${AWS_ARCH_ASM_SRC}
97+
)
98+
else()
99+
file(GLOB AWS_ARCH_SRC
100+
${AWS_ARCH_INTEL_SRC}
101+
${AWS_ARCH_INTRIN_SRC}
70102
)
71-
source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC})
72103
endif()
73104
endif()
74105

@@ -114,6 +145,7 @@ file(GLOB CHECKSUMS_COMBINED_SRC
114145

115146

116147
add_library(${PROJECT_NAME} ${CHECKSUMS_COMBINED_HEADERS} ${CHECKSUMS_COMBINED_SRC})
148+
117149
aws_set_common_properties(${PROJECT_NAME})
118150
aws_prepare_symbol_visibility_args(${PROJECT_NAME} "AWS_CHECKSUMS")
119151
aws_check_headers(${PROJECT_NAME} ${AWS_CHECKSUMS_HEADERS})
@@ -123,6 +155,10 @@ aws_add_sanitizers(${PROJECT_NAME})
123155
# We are not ABI stable yet
124156
set_target_properties(${PROJECT_NAME} PROPERTIES VERSION 1.0.0)
125157

158+
if (USE_CPU_EXTENSIONS AND AWS_ARCH_INTEL)
159+
SET_SOURCE_FILES_PROPERTIES(source/intel/crc_hw.c PROPERTIES COMPILE_FLAGS -msse4.2)
160+
endif()
161+
126162
target_include_directories(${PROJECT_NAME} PUBLIC
127163
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
128164
$<INSTALL_INTERFACE:include>)

include/aws/checksums/private/crc_priv.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,20 @@ AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_sw(const uint8_t *input, int leng
2020
/* Computes the Castagnoli CRC32c (iSCSI) using a (slow) reference implementation. */
2121
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_sw(const uint8_t *input, int length, uint32_t previousCrc32c);
2222

23+
/* Computes CRC32 (Ethernet, gzip, et al.) using crc instructions. */
24+
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_hw(const uint8_t *data, int length, uint32_t previousCrc32);
25+
26+
/* Computes CRC32 (Ethernet, gzip, et al.) using AVX512 and VPCLMULQDQ. */
27+
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_avx512(const uint8_t *data, int length, uint32_t previousCrc32);
28+
2329
/* Computes the Castagnoli CRC32c (iSCSI). */
2430
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32);
2531

26-
/* Computes CRC32 (Ethernet, gzip, et. al.) using crc instructions. */
27-
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_hw(const uint8_t *data, int length, uint32_t previousCrc32);
32+
/* Computes the Castagnoli CRC32c (iSCSI) using 128-bit PCLMULQDQ. */
33+
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_clmul(const uint8_t *data, int length, uint32_t previousCrc32);
34+
35+
/* Computes the Castagnoli CRC32c (iSCSI) using AVX512 and VPCLMULQDQ. */
36+
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c_avx512(const uint8_t *data, int length, uint32_t previousCrc32);
2837

2938
#ifdef __cplusplus
3039
}
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/**
2+
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
* SPDX-License-Identifier: Apache-2.0.
4+
*/
5+
6+
#include <aws/checksums/private/crc_priv.h>
7+
8+
#include <aws/common/config.h>
9+
#include <nmmintrin.h>
10+
11+
#if defined _WIN64 || defined __x86_64__
12+
typedef uint64_t *slice_ptr_type;
13+
typedef uint64_t slice_ptr_int_type;
14+
# define crc_intrin_fn _mm_crc32_u64
15+
#else
16+
typedef uint32_t *slice_ptr_type;
17+
typedef uint32_t slice_ptr_int_type;
18+
# define crc_intrin_fn _mm_crc32_u32
19+
#endif
20+
21+
#ifdef AWS_HAVE_AVX512_INTRINSICS
22+
uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc);
23+
uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t crc);
24+
#endif
25+
26+
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc);

source/crc.c

Lines changed: 29 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,22 +12,45 @@ static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t pre
1212

1313
uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previousCrc32) {
1414
if (AWS_UNLIKELY(!s_crc32_fn_ptr)) {
15-
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
15+
#ifdef AWS_HAVE_ARM32_CRC
16+
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC))
1617
s_crc32_fn_ptr = aws_checksums_crc32_hw;
17-
} else {
18+
#elif defined AWS_HAVE_AVX512_INTRINSICS
19+
if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) &&
20+
aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ))
21+
s_crc32_fn_ptr = aws_checksums_crc32_avx512;
22+
#else
23+
if (0) {}
24+
#endif
25+
else
1826
s_crc32_fn_ptr = aws_checksums_crc32_sw;
19-
}
2027
}
2128
return s_crc32_fn_ptr(input, length, previousCrc32);
2229
}
2330

2431
uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previousCrc32) {
2532
if (AWS_UNLIKELY(!s_crc32c_fn_ptr)) {
26-
if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2) || aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
33+
#ifdef AWS_HAVE_ARM32_CRC
34+
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC))
2735
s_crc32c_fn_ptr = aws_checksums_crc32c_hw;
28-
} else {
29-
s_crc32c_fn_ptr = aws_checksums_crc32c_sw;
36+
#else
37+
# ifdef AWS_HAVE_AVX512_INTRINSICS
38+
if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) &&
39+
aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ))
40+
s_crc32c_fn_ptr = aws_checksums_crc32c_avx512;
41+
else
42+
# endif
43+
if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2)) {
44+
# ifdef AWS_HAVE_CLMUL
45+
if (aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL))
46+
s_crc32c_fn_ptr = aws_checksums_crc32c_clmul;
47+
else
48+
# endif
49+
s_crc32c_fn_ptr = aws_checksums_crc32c_hw;
3050
}
51+
#endif
52+
else
53+
s_crc32c_fn_ptr = aws_checksums_crc32c_sw;
3154
}
3255
return s_crc32c_fn_ptr(input, length, previousCrc32);
3356
}

source/intel/asm/crc32c_sse42_asm.c

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* SPDX-License-Identifier: Apache-2.0.
44
*/
55

6-
#include <aws/checksums/private/crc_priv.h>
6+
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>
77

88
#include <aws/common/cpuid.h>
99

@@ -283,7 +283,7 @@ static bool detected_clmul = false;
283283
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
284284
* call.
285285
*/
286-
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
286+
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) {
287287

288288
if (AWS_UNLIKELY(!detection_performed)) {
289289
detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
@@ -293,7 +293,8 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
293293
detection_performed = true;
294294
}
295295

296-
uint32_t crc = ~previousCrc32;
296+
/* this is called by a higher-level shim and previousCRC32 is already ~ */
297+
uint32_t crc = previousCrc32;
297298

298299
/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
299300
if (AWS_UNLIKELY(length < 8)) {
@@ -358,22 +359,17 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
358359

359360
return ~crc;
360361
}
361-
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
362-
return aws_checksums_crc32_sw(input, length, previousCrc32);
363-
}
364362

365363
# if defined(__clang__)
366364
# pragma clang diagnostic pop
367365
# endif
368366

369367
#else
370-
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
371-
return aws_checksums_crc32_sw(input, length, previousCrc32);
372-
}
373-
374-
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
375-
return aws_checksums_crc32c_sw(input, length, previousCrc32);
368+
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) {
369+
/* these are nested in a larger computation. As a result the crc doesn't need to be bit flipped.
370+
However, the sw function is also used as a standalone implementation that does need to do the
371+
bit flip. So go ahead and flip it here, so the sw implementation flips it back. */
372+
return aws_checksums_crc32c_sw(input, length, ~previousCrc32);
376373
}
377-
378374
#endif
379375
/* clang-format on */

source/intel/crc_hw.c

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
/**
2+
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
* SPDX-License-Identifier: Apache-2.0.
4+
*/
5+
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>
6+
#include <aws/common/macros.h>
7+
8+
static uint32_t aws_checksums_crc32c_hw_small(const uint8_t *input, int length, uint32_t crc) {
9+
while (length-- > 0) {
10+
crc = (uint32_t)_mm_crc32_u8(crc, *input++);
11+
}
12+
return ~crc;
13+
}
14+
15+
static uint32_t aws_checksums_crc32c_hw_unaligned(const uint8_t **input, int *length, uint32_t crc) {
16+
/* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
17+
int input_alignment = (uintptr_t)(*input)&0x7;
18+
19+
/* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
20+
int leading = (8 - input_alignment) & 0x7;
21+
22+
/* reduce the length by the leading unaligned bytes we are about to process */
23+
*length -= leading;
24+
25+
/* spin through the leading unaligned input bytes (if any) one-by-one */
26+
while (leading-- > 0) {
27+
crc = (uint32_t)_mm_crc32_u8(crc, *(*input)++);
28+
}
29+
30+
return crc;
31+
}
32+
33+
/*
34+
* Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) instructions.
35+
* Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction.
36+
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
37+
* call.
38+
*/
39+
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
40+
41+
/* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and
42+
* branches.*/
43+
uint32_t crc = ~previousCrc32;
44+
45+
/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
46+
if (length < (int)sizeof(slice_ptr_int_type)) {
47+
return aws_checksums_crc32c_hw_small(input, length, crc);
48+
}
49+
50+
crc = aws_checksums_crc32c_hw_unaligned(&input, &length, crc);
51+
/* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
52+
while (length >= (int)sizeof(slice_ptr_int_type)) {
53+
crc = (uint32_t)crc_intrin_fn(crc, *(const slice_ptr_int_type*) input);
54+
input += sizeof(slice_ptr_int_type);
55+
length -= (int)sizeof(slice_ptr_int_type);
56+
}
57+
58+
/* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
59+
while (length-- > 0) {
60+
crc = (uint32_t)_mm_crc32_u8(crc, *input);
61+
input++;
62+
}
63+
64+
return ~crc;
65+
}
66+
67+
/*
68+
* Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and
69+
* PCLMULQDQ machine instructions (if present).
70+
* Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction.
71+
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
72+
* call.
73+
*/
74+
uint32_t aws_checksums_crc32c_clmul(const uint8_t *input, int length, uint32_t previousCrc32) {
75+
76+
/* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and
77+
* branches.*/
78+
uint32_t crc = ~previousCrc32;
79+
80+
/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
81+
if (length < (int)sizeof(slice_ptr_int_type)) {
82+
return aws_checksums_crc32c_hw_small(input, length, crc);
83+
}
84+
85+
crc = aws_checksums_crc32c_hw_unaligned(&input, &length, crc);
86+
87+
return aws_checksums_crc32c_sse42(input, length, crc);
88+
}
89+
90+
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
91+
return aws_checksums_crc32_sw(input, length, previousCrc32);
92+
}

0 commit comments

Comments
 (0)