Skip to content

Commit 092f12d

Browse files
committed
AVX512 and VPCLMULQDQ based CRC-32 and CRC-32C
This implementation is based on crc32_refl_by16_vclmul_avx512 in https://github.com/intel/intel-ipsec-mb/ with some optimizations. Some of the code is based on awslabs#72.
1 parent 0884586 commit 092f12d

File tree

7 files changed

+581
-43
lines changed

7 files changed

+581
-43
lines changed

CMakeLists.txt

Lines changed: 45 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ string(REPLACE ";" "${AWS_MODULE_DIR};" AWS_MODULE_PATH "${CMAKE_PREFIX_PATH}${A
2828
# Append that generated list to the module search path
2929
list(APPEND CMAKE_MODULE_PATH ${AWS_MODULE_PATH})
3030

31+
include(AwsSIMD)
3132
include(AwsCFlags)
3233
include(AwsCheckHeaders)
3334
include(AwsSharedLibSetup)
@@ -58,17 +59,48 @@ file(GLOB AWS_ARCH_SRC
5859
)
5960

6061
if (USE_CPU_EXTENSIONS)
61-
if(AWS_ARCH_INTEL)
62-
# First, check if inline assembly is available. Inline assembly can also be supported by MSVC if the compiler in use is Clang.
63-
if(AWS_HAVE_GCC_INLINE_ASM)
64-
file(GLOB AWS_ARCH_SRC
65-
"source/intel/asm/*.c"
62+
if (AWS_ARCH_INTEL)
63+
file (GLOB AWS_ARCH_INTEL_SRC
64+
"source/intel/*.c"
65+
)
66+
67+
if (AWS_HAVE_AVX512_INTRINSICS)
68+
if (MSVC)
69+
file(GLOB AWS_ARCH_INTRIN_SRC
70+
"source/intel/intrin/*.c"
71+
"source/intel/visualc/*.c"
6672
)
67-
elseif (MSVC)
68-
file(GLOB AWS_ARCH_SRC
73+
else()
74+
file(GLOB AWS_ARCH_INTRIN_SRC
75+
"source/intel/intrin/*.c"
76+
)
77+
endif()
78+
else()
79+
if (MSVC)
80+
file(GLOB AWS_ARCH_INTRIN_SRC
6981
"source/intel/visualc/*.c"
82+
)
83+
endif()
84+
endif()
85+
86+
source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC})
87+
source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC})
88+
89+
if (AWS_HAVE_GCC_INLINE_ASM)
90+
file(GLOB AWS_ARCH_ASM_SRC
91+
"source/intel/asm/*.c"
92+
)
93+
94+
file(GLOB AWS_ARCH_SRC
95+
${AWS_ARCH_INTEL_SRC}
96+
${AWS_ARCH_INTRIN_SRC}
97+
${AWS_ARCH_ASM_SRC}
98+
)
99+
else()
100+
file(GLOB AWS_ARCH_SRC
101+
${AWS_ARCH_INTEL_SRC}
102+
${AWS_ARCH_INTRIN_SRC}
70103
)
71-
source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC})
72104
endif()
73105
endif()
74106

@@ -114,6 +146,7 @@ file(GLOB CHECKSUMS_COMBINED_SRC
114146

115147

116148
add_library(${PROJECT_NAME} ${CHECKSUMS_COMBINED_HEADERS} ${CHECKSUMS_COMBINED_SRC})
149+
117150
aws_set_common_properties(${PROJECT_NAME})
118151
aws_prepare_symbol_visibility_args(${PROJECT_NAME} "AWS_CHECKSUMS")
119152
aws_check_headers(${PROJECT_NAME} ${AWS_CHECKSUMS_HEADERS})
@@ -123,6 +156,10 @@ aws_add_sanitizers(${PROJECT_NAME})
123156
# We are not ABI stable yet
124157
set_target_properties(${PROJECT_NAME} PROPERTIES VERSION 1.0.0)
125158

159+
# Hand the Intel-specific sources to simd_add_source_avx(), but only when CPU
# extensions are enabled and the target architecture is Intel.
# NOTE(review): simd_add_source_avx() comes from the AwsSIMD module included
# earlier; presumably it attaches the AVX compile flags to these sources -
# confirm against that module.
if (USE_CPU_EXTENSIONS)
    if (AWS_ARCH_INTEL)
        simd_add_source_avx(${PROJECT_NAME} ${AWS_ARCH_SRC})
    endif()
endif()
162+
126163
target_include_directories(${PROJECT_NAME} PUBLIC
127164
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
128165
$<INSTALL_INTERFACE:include>)
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
/**
2+
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
* SPDX-License-Identifier: Apache-2.0.
4+
*/
5+
6+
#include <aws/checksums/private/crc_priv.h>
7+
8+
#include <aws/common/config.h>
9+
#include <nmmintrin.h>
10+
11+
/*
 * Pick the widest slice the CRC32 instruction can consume per step:
 * 64-bit targets use the quad-word intrinsic (_mm_crc32_u64), every other
 * target uses the double-word intrinsic (_mm_crc32_u32).
 *
 * The macros are tested with defined() so that an undefined macro does not
 * trigger -Wundef, and the PowerPC macro is spelled __ppc64__ (the previous
 * spelling, __ppc64_, is not defined by any compiler).
 */
#if defined(_WIN64) || defined(__x86_64__) || defined(__ppc64__)
typedef uint64_t *slice_ptr_type;
typedef uint64_t slice_ptr_int_type;
#    define crc_intrin_fn _mm_crc32_u64
#else
typedef uint32_t *slice_ptr_type;
typedef uint32_t slice_ptr_int_type;
#    define crc_intrin_fn _mm_crc32_u32
#endif
20+
21+
#ifdef AWS_HAVE_AVX512_INTRINSICS
/*
 * AVX512 / VPCLMULQDQ kernels for CRC-32C and CRC-32. They are only declared
 * when the build detected AVX512 intrinsics support.
 */
uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc);
uint32_t aws_checksums_crc32_avx512(const uint8_t *input, int length, uint32_t crc);
#endif

/*
 * SSE4.2 CRC-32C kernel over `length` bytes starting at `input`, continuing
 * from the running value `crc`.
 */
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc);

source/intel/asm/crc32c_sse42_asm.c

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
* SPDX-License-Identifier: Apache-2.0.
44
*/
55

6-
#include <aws/checksums/private/crc_priv.h>
6+
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>
77

88
#include <aws/common/cpuid.h>
99

@@ -283,7 +283,7 @@ static bool detected_clmul = false;
283283
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
284284
* call.
285285
*/
286-
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
286+
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) {
287287

288288
if (AWS_UNLIKELY(!detection_performed)) {
289289
detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL);
@@ -293,7 +293,8 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
293293
detection_performed = true;
294294
}
295295

296-
uint32_t crc = ~previousCrc32;
296+
/* This is called by a higher-level shim, so previousCrc32 has already been bit-inverted (~). */
297+
uint32_t crc = previousCrc32;
297298

298299
/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
299300
if (AWS_UNLIKELY(length < 8)) {
@@ -358,22 +359,17 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev
358359

359360
return ~crc;
360361
}
361-
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
362-
return aws_checksums_crc32_sw(input, length, previousCrc32);
363-
}
364362

365363
# if defined(__clang__)
366364
# pragma clang diagnostic pop
367365
# endif
368366

369367
#else
370-
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
371-
return aws_checksums_crc32_sw(input, length, previousCrc32);
372-
}
373-
374-
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
375-
return aws_checksums_crc32c_sw(input, length, previousCrc32);
368+
uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) {
    /*
     * This function runs nested inside a larger computation, so the incoming CRC has
     * already been bit-flipped and would not need flipping again. The software routine,
     * however, also serves as a standalone implementation and always performs the flip
     * itself. Flip the value here so that the software routine's own flip restores it.
     */
    const uint32_t flipped_back = ~previousCrc32;

    return aws_checksums_crc32c_sw(input, length, flipped_back);
}
377-
378374
#endif
379375
/* clang-format on */

source/intel/crc_hw.c

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/**
2+
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3+
* SPDX-License-Identifier: Apache-2.0.
4+
*/
5+
#include <aws/checksums/private/intel/crc32c_compiler_shims.h>
6+
#ifdef _MSC_VER
7+
# include <intrin.h>
8+
#else
9+
# include <cpuid.h>
10+
#endif
11+
12+
/*
 * CPU feature detection state. Every flag starts out false and is filled in
 * by aws_checksums_hw_detect(), which sets detection_performed last.
 */
static bool detection_performed = false;
static bool detected_sse42 = false;
static bool detected_clmul = false;
#ifdef AWS_HAVE_AVX512_INTRINSICS
static bool detected_vpclmulqdq = false;
#endif
18+
19+
static void aws_checksums_hw_detect(void)
20+
{
21+
#ifdef _MSC_VER
22+
int regs[4];
23+
__cpuid(regs, 1);
24+
uint32_t ecx = regs[2];
25+
#else
26+
uint32_t eax = 0, ebx = 0, ecx = 0, edx = 0;
27+
__cpuid(1, reax, rebx, recx, redx);
28+
#endif
29+
detected_sse42 = ecx & 1U << 20;
30+
detected_clmul = ecx & 1U << 1;
31+
32+
#ifdef AWS_HAVE_AVX512_INTRINSICS
33+
# ifdef _MSC_VER
34+
__cpuidex(regs, 7, 0);
35+
uint32_t ebx = regs[1];
36+
ecx = regs[2];
37+
# else
38+
__cpuid_count(7, 0, eax, ebx, ecx, edx);
39+
# endif
40+
detected_vpclmulqdq = ecx & 1U<<10/*VPCLMULQDQ*/ &&
41+
!(~ebx & ((1U<<16/*AVX512F*/ | 1U<<17/*AVX512DQ*/ |
42+
1U<<30/*AVX512BW*/ | 1U<<31/*AVX512VL*/)));
43+
#endif
44+
45+
/* Simply setting the flag true to skip HW detection next time
46+
Not using memory barriers since the worst that can
47+
happen is a fallback to the non HW accelerated code. */
48+
detection_performed = true;
49+
}
50+
51+
/*
52+
* Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and
53+
* PCLMULQDQ machine instructions (if present).
54+
* Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction.
55+
* Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent
56+
* call.
57+
*/
58+
uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
59+
60+
if (AWS_UNLIKELY(!detection_performed)) {
61+
aws_checksums_hw_detect();
62+
}
63+
64+
#ifdef AWS_HAVE_AVX512_INTRINSICS
65+
if (detected_vpclmulqdq) {
66+
return aws_checksums_crc32c_avx512(inputr, length, crc);
67+
}
68+
#endif
69+
70+
/* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and
71+
* branches.*/
72+
uint32_t crc = ~previousCrc32;
73+
74+
/* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */
75+
if (length < (int)sizeof(slice_ptr_int_type)) {
76+
while (length-- > 0) {
77+
crc = (uint32_t)_mm_crc32_u8(crc, *input++);
78+
}
79+
return ~crc;
80+
}
81+
82+
/* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */
83+
int input_alignment = (uintptr_t)(input)&0x7;
84+
85+
/* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */
86+
int leading = (8 - input_alignment) & 0x7;
87+
88+
/* reduce the length by the leading unaligned bytes we are about to process */
89+
length -= leading;
90+
91+
/* spin through the leading unaligned input bytes (if any) one-by-one */
92+
while (leading-- > 0) {
93+
crc = (uint32_t)_mm_crc32_u8(crc, *input++);
94+
}
95+
96+
if (detected_sse42 && detected_clmul) {
97+
return aws_checksums_crc32c_sse42(input, length, crc);
98+
}
99+
100+
/* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */
101+
while (length >= (int)sizeof(slice_ptr_int_type)) {
102+
crc = (uint32_t)crc_intrin_fn(crc, *input);
103+
input += sizeof(slice_ptr_int_type);
104+
length -= (int)sizeof(slice_ptr_int_type);
105+
}
106+
107+
/* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */
108+
while (length-- > 0) {
109+
crc = (uint32_t)_mm_crc32_u8(crc, *input);
110+
input++;
111+
}
112+
113+
return ~crc;
114+
}
115+
116+
/*
 * Computes the CRC-32 of the specified data buffer. When the build has AVX512 intrinsics and the CPU reports
 * VPCLMULQDQ plus the required AVX512 subsets, the AVX512 kernel is used; otherwise the software implementation
 * is used.
 *
 * input:         bytes to checksum
 * length:        number of bytes readable at input
 * previousCrc32: running CRC from a previous call
 * returns:       the updated CRC-32
 */
uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) {
#ifdef AWS_HAVE_AVX512_INTRINSICS
    if (AWS_UNLIKELY(!detection_performed)) {
        aws_checksums_hw_detect();
    }

    if (detected_vpclmulqdq) {
        /* NOTE(review): forwarded with the same arguments as the software fallback below; confirm the AVX512
         * kernel expects the un-inverted CRC. */
        return aws_checksums_crc32_avx512(input, length, previousCrc32);
    }
#endif
    return aws_checksums_crc32_sw(input, length, previousCrc32);
}

0 commit comments

Comments
 (0)