From c6248e033849062153baae637780a72cd122b40d Mon Sep 17 00:00:00 2001 From: "Vazquez, Javier" Date: Thu, 22 Jun 2023 13:51:12 -0600 Subject: [PATCH 01/18] Added CRC32C AVX512 support. --- CMakeLists.txt | 2 + source/intel/asm/crc32c_sse42_asm.c | 242 ++++++++++++++++++++++++++-- 2 files changed, 227 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb1cd75..2056ae3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -66,9 +66,11 @@ if (USE_CPU_EXTENSIONS) source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC}) elseif(AWS_ARCH_INTEL AND AWS_HAVE_GCC_INLINE_ASM) + set(AWS_CMAKE_REQUIRED_FLAGS "-mavx512f -msse4.2 -mvpclmulqdq -mpclmul") file(GLOB AWS_ARCH_SRC "source/intel/asm/*.c" ) + SET_SOURCE_FILES_PROPERTIES(source/intel/asm/crc32c_sse42_asm.c COMPILE_FLAGS ${AWS_CMAKE_REQUIRED_FLAGS}) endif() if (MSVC AND AWS_ARCH_ARM64) diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index 35e1d09..7a4d550 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -6,6 +6,12 @@ #include #include +#include +#include +#include +#include + +#define zalign(x) __attribute__((aligned((x)))) /* clang-format off */ @@ -273,8 +279,195 @@ static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t return crc; } +/* + * crc32_avx512(): compute the crc32 of the buffer, where the buffer + * length must be at least 256, and a multiple of 64. Based on: + * + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 + */ +static uint32_t crc32_avx512_simd(const uint8_t *input, int length, uint32_t crc) +{ + /* + * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 + * are similar to those given at the end of the paper + * + * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 + * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 + * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 + * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 + * k5 = ( x ^ ( 128 + 32 ) mod P(x) << 32 )' << 1 + * k6 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 + */ + + static const uint64_t zalign(64) k1k2[] = { 0xdcb17aa4, 0xb9e02b86, + 0xdcb17aa4, 0xb9e02b86, + 0xdcb17aa4, 0xb9e02b86, + 0xdcb17aa4, 0xb9e02b86 }; + static const uint64_t zalign(64) k3k4[] = { 0x740eef02, 0x9e4addf8, + 0x740eef02, 0x9e4addf8, + 0x740eef02, 0x9e4addf8, + 0x740eef02, 0x9e4addf8 }; + static const uint64_t zalign(16) k5k6[] = { 0xf20c0dfe, 0x14cd00bd6 }; + static const uint64_t zalign(16) k7k8[] = { 0xdd45aab8, 0x000000000 }; + static const uint64_t zalign(16) poly[] = { 0x105ec76f1, 0xdea713f1 }; + + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + __m128i a0, a1, a2, a3; + + /* + * There's at least one block of 256. + */ + x1 = _mm512_loadu_si512((__m512i *)(input + 0x00)); + x2 = _mm512_loadu_si512((__m512i *)(input + 0x40)); + x3 = _mm512_loadu_si512((__m512i *)(input + 0x80)); + x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); + + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); + + x0 = _mm512_load_si512((__m512i *)k1k2); + + input += 256; + length -= 256; + + /* + * Parallel fold blocks of 256, if any. 
+ */ + while (length >= 256) + { + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); + x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); + x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); + + + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); + x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); + x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); + + y5 = _mm512_loadu_si512((__m512i *)(input + 0x00)); + y6 = _mm512_loadu_si512((__m512i *)(input + 0x40)); + y7 = _mm512_loadu_si512((__m512i *)(input + 0x80)); + y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); + + x1 = _mm512_xor_si512(x1, x5); + x2 = _mm512_xor_si512(x2, x6); + x3 = _mm512_xor_si512(x3, x7); + x4 = _mm512_xor_si512(x4, x8); + + x1 = _mm512_xor_si512(x1, y5); + x2 = _mm512_xor_si512(x2, y6); + x3 = _mm512_xor_si512(x3, y7); + x4 = _mm512_xor_si512(x4, y8); + + input += 256; + length -= 256; + } + + /* + * Fold into 512-bits. + */ + x0 = _mm512_load_si512((__m512i *)k3k4); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x2); + x1 = _mm512_xor_si512(x1, x5); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x3); + x1 = _mm512_xor_si512(x1, x5); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x4); + x1 = _mm512_xor_si512(x1, x5); + + /* + * Single fold blocks of 64, if any. + */ + while (length >= 64) + { + x2 = _mm512_loadu_si512((__m512i *)input); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x2); + x1 = _mm512_xor_si512(x1, x5); + + input += 64; + length -= 64; + } + + /* + * Fold 512-bits to 384-bits. + */ + a0 = _mm_load_si128((__m128i *)k5k6); + + a1 = _mm512_extracti32x4_epi32(x1, 0); + a2 = _mm512_extracti32x4_epi32(x1, 1); + + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); + + a1 = _mm_xor_si128(a1, a3); + a1 = _mm_xor_si128(a1, a2); + + /* + * Fold 384-bits to 256-bits. + */ + a2 = _mm512_extracti32x4_epi32(x1, 2); + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); + a1 = _mm_xor_si128(a1, a3); + a1 = _mm_xor_si128(a1, a2); + + /* + * Fold 256-bits to 128-bits. + */ + a2 = _mm512_extracti32x4_epi32(x1, 3); + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); + a1 = _mm_xor_si128(a1, a3); + a1 = _mm_xor_si128(a1, a2); + + /* + * Fold 128-bits to 64-bits. + */ + a2 = _mm_clmulepi64_si128(a1, a0, 0x10); + a3 = _mm_setr_epi32(~0, 0, ~0, 0); + a1 = _mm_srli_si128(a1, 8); + a1 = _mm_xor_si128(a1, a2); + + a0 = _mm_loadl_epi64((__m128i*)k7k8); + a2 = _mm_srli_si128(a1, 4); + a1 = _mm_and_si128(a1, a3); + a1 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_xor_si128(a1, a2); + + /* + * Barret reduce to 32-bits. + */ + a0 = _mm_load_si128((__m128i*)poly); + + a2 = _mm_and_si128(a1, a3); + a2 = _mm_clmulepi64_si128(a2, a0, 0x10); + a2 = _mm_and_si128(a2, a3); + a2 = _mm_clmulepi64_si128(a2, a0, 0x00); + a1 = _mm_xor_si128(a1, a2); + + /* + * Return the crc32. 
+ */ + return _mm_extract_epi32(a1, 1); +} + static bool detection_performed = false; static bool detected_clmul = false; +static bool detected_sse42 = false; +static bool detected_avx512 = false; /* * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and @@ -287,6 +480,8 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev if (AWS_UNLIKELY(!detection_performed)) { detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); + detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2); + detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); /* Simply setting the flag true to skip HW detection next time Not using memory barriers since the worst that can happen is a fallback to the non HW accelerated code. */ @@ -321,24 +516,37 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev /* Using likely to keep this code inlined */ if (AWS_LIKELY(detected_clmul)) { - - while (AWS_LIKELY(length >= 3072)) { - /* Compute crc32c on each block, chaining each crc result */ - crc = s_crc32c_sse42_clmul_3072(input, crc); - input += 3072; - length -= 3072; - } - while (AWS_LIKELY(length >= 1024)) { - /* Compute crc32c on each block, chaining each crc result */ - crc = s_crc32c_sse42_clmul_1024(input, crc); - input += 1024; - length -= 1024; + if (AWS_LIKELY(detected_avx512)) { + if (AWS_LIKELY(length >= 256)) { + ssize_t chunk_size = length & ~63; + crc = ~crc32_avx512_simd(input, length, crc); + /* check remaining data */ + length -= chunk_size; + if (!length) + return crc; + /* Fall into the default crc32 for the remaining data. */ + input += chunk_size; + } } - while (AWS_LIKELY(length >= 256)) { - /* Compute crc32c on each block, chaining each crc result */ - crc = s_crc32c_sse42_clmul_256(input, crc); - input += 256; - length -= 256; + else if (AWS_LIKELY(detected_sse42)) { + while (AWS_LIKELY(length >= 3072)) { + /* Compute crc32c on each block, chaining each crc result */ + crc = s_crc32c_sse42_clmul_3072(input, crc); + input += 3072; + length -= 3072; + } + while (AWS_LIKELY(length >= 1024)) { + /* Compute crc32c on each block, chaining each crc result */ + crc = s_crc32c_sse42_clmul_1024(input, crc); + input += 1024; + length -= 1024; + } + while (AWS_LIKELY(length >= 256)) { + /* Compute crc32c on each block, chaining each crc result */ + crc = s_crc32c_sse42_clmul_256(input, crc); + input += 256; + length -= 256; + } } } From cf22bcae847fd7c3470e4159d9c0e0e182aadd52 Mon Sep 17 00:00:00 2001 From: "Pulavarty, Badari" Date: Tue, 27 Jun 2023 12:54:41 -0400 Subject: [PATCH 02/18] Fixed routine name to indicate crc32c --- source/intel/asm/crc32c_sse42_asm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index 7a4d550..68f8c24 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -280,13 +280,13 @@ static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t } /* - * crc32_avx512(): compute the crc32 of the buffer, where the buffer + * crc32c_avx512(): compute the crc32c of the buffer, where the buffer * length must be at least 256, and a multiple of 64. Based on: * * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" * V. Gopal, E. 
Ozturk, et al., 2009, http://intel.ly/2ySEwL0 */ -static uint32_t crc32_avx512_simd(const uint8_t *input, int length, uint32_t crc) +static uint32_t crc32c_avx512(const uint8_t *input, int length, uint32_t crc) { /* * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 @@ -519,7 +519,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev if (AWS_LIKELY(detected_avx512)) { if (AWS_LIKELY(length >= 256)) { ssize_t chunk_size = length & ~63; - crc = ~crc32_avx512_simd(input, length, crc); + crc = ~crc32c_avx512(input, length, crc); /* check remaining data */ length -= chunk_size; if (!length) From 375fa35215a87d35166ece1ebb844700a462fcdf Mon Sep 17 00:00:00 2001 From: "Vazquez, Javier" Date: Thu, 13 Jul 2023 20:17:11 -0600 Subject: [PATCH 03/18] Add sse42 avx512_intrinsics support --- CMakeLists.txt | 20 +++-- .../intel/{asm => intrin}/crc32c_sse42_asm.c | 0 source/intel/visualc/visualc_crc32c_sse42.c | 77 ------------------- 3 files changed, 15 insertions(+), 82 deletions(-) rename source/intel/{asm => intrin}/crc32c_sse42_asm.c (100%) delete mode 100644 source/intel/visualc/visualc_crc32c_sse42.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 2056ae3..5af28e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -28,6 +28,7 @@ string(REPLACE ";" "${AWS_MODULE_DIR};" AWS_MODULE_PATH "${CMAKE_PREFIX_PATH}${A # Append that generated list to the module search path list(APPEND CMAKE_MODULE_PATH ${AWS_MODULE_PATH}) +include(AwsSIMD) include(AwsCFlags) include(AwsCheckHeaders) include(AwsSharedLibSetup) @@ -60,17 +61,26 @@ file(GLOB AWS_ARCH_SRC if (USE_CPU_EXTENSIONS) if (MSVC AND AWS_ARCH_INTEL) file(GLOB AWS_ARCH_SRC - "source/intel/visualc/*.c" + "source/intel/intrin/*.c" ) - source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC}) + source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_SRC}) elseif(AWS_ARCH_INTEL AND AWS_HAVE_GCC_INLINE_ASM) - set(AWS_CMAKE_REQUIRED_FLAGS "-mavx512f -msse4.2 -mvpclmulqdq -mpclmul") + if (HAVE_SSE42_INTRINSICS) + set(SSE42_CFLAGS "-msse4.2") + endif() + + if (HAVE_AVX512_INTRINSICS) + set(AVX512_CFLAGS "-mavx512f") + endif() + + set(AWS_CMAKE_REQUIRED_FLAGS "${SSE42_CFLAGS} ${AVX512_CFLAGS} -mvpclmulqdq -mpclmul") file(GLOB AWS_ARCH_SRC - "source/intel/asm/*.c" + "source/intel/intrin/*.c" ) - SET_SOURCE_FILES_PROPERTIES(source/intel/asm/crc32c_sse42_asm.c COMPILE_FLAGS ${AWS_CMAKE_REQUIRED_FLAGS}) + message(STATUS "CFLAGS: ${AWS_CMAKE_REQUIRED_FLAGS}") + SET_SOURCE_FILES_PROPERTIES(source/intel/intrin/crc32c_sse42_asm.c COMPILE_FLAGS ${AWS_CMAKE_REQUIRED_FLAGS}) endif() if (MSVC AND AWS_ARCH_ARM64) diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/intrin/crc32c_sse42_asm.c similarity index 100% rename from source/intel/asm/crc32c_sse42_asm.c rename to source/intel/intrin/crc32c_sse42_asm.c diff --git a/source/intel/visualc/visualc_crc32c_sse42.c b/source/intel/visualc/visualc_crc32c_sse42.c deleted file mode 100644 index ca1aca4..0000000 --- a/source/intel/visualc/visualc_crc32c_sse42.c +++ /dev/null @@ -1,77 +0,0 @@ -/** - * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - * SPDX-License-Identifier: Apache-2.0. - */ - -#include -#include - -#if defined(_M_X64) || defined(_M_IX86) - -# if defined(_M_X64) -typedef uint64_t *slice_ptr_type; -typedef uint64_t slice_ptr_int_type; -# else -typedef uint32_t *slice_ptr_type; -typedef uint32_t slice_ptr_int_type; -# endif - -/** - * This implements crc32c via the intel sse 4.2 instructions. 
- * This is separate from the straight asm version, because visual c does not allow - * inline assembly for x64. - */ -uint32_t aws_checksums_crc32c_hw(const uint8_t *data, int length, uint32_t previousCrc32) { - uint32_t crc = ~previousCrc32; - int length_to_process = length; - - slice_ptr_type temp = (slice_ptr_type)data; - - /*to eek good performance out of the intel implementation, we need to only hit the hardware - once we are aligned on the byte boundaries we are using. So, peel off a byte at a time until we are - 8 byte aligned (64 bit arch) or 4 byte aligned (32 bit arch) - - first calculate how many bytes we need to burn before we are aligned. - for a 64 bit arch this is: - (8 - ) mod 8 - 32 bit: - (4 - ) mod 4 */ - uint8_t alignment_offset = (sizeof(slice_ptr_int_type) - ((slice_ptr_int_type)temp % sizeof(slice_ptr_int_type))) % - sizeof(slice_ptr_int_type); - - /*for every byte we need to burn off, just do them a byte at a time. - increment the temp pointer by one byte at a time until we get it on an alignment boundary */ - while (alignment_offset != 0 && length_to_process) { - uint8_t *byte_pos = (uint8_t *)temp; - crc = (uint32_t)_mm_crc32_u8(crc, *byte_pos++); - temp = (slice_ptr_type)byte_pos; - --alignment_offset; - --length_to_process; - } - - /*now whatever is left is properly aligned on a boundary*/ - uint32_t slices = length_to_process / sizeof(temp); - uint32_t remainder = length_to_process % sizeof(temp); - - while (slices--) { -# if defined(_M_X64) - crc = (uint32_t)_mm_crc32_u64(crc, *temp++); -# else - crc = _mm_crc32_u32(crc, *temp++); -# endif - } - - /* process the remaining parts that can't be done on the slice size. */ - uint8_t *remainderPos = (uint8_t *)temp; - - while (remainder--) { - crc = (uint32_t)_mm_crc32_u8(crc, *remainderPos++); - } - - return ~crc; -} - -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); -} -#endif /* x64 || x86 */ From 2eb55784e408f144fb4445ce9d6c1065c6aa47bb Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 17:33:33 -0700 Subject: [PATCH 04/18] Refactoring work for the AVX512 code path. Testing shows it not quite working yet. 
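For orientation, this refactor splits the public entry point aws_checksums_crc32c_hw() (feature detection and dispatch in source/intel/crc_hw.c) from the SSE4.2 and AVX512 kernels, and the new test compares the dispatched result against the portable aws_checksums_crc32c_sw() reference. A minimal caller-side sketch of that check follows; the include path and buffer setup are assumptions, only the two function signatures come from this series.

    #include <aws/checksums/crc.h>   /* assumed header exposing the crc32c entry points */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void) {
        uint8_t buf[512];                /* >= 256 bytes so the AVX512 path can be exercised */
        memset(buf, 0xA5, sizeof(buf));  /* arbitrary fill pattern */

        /* hardware-dispatched CRC32C (SSE4.2 or AVX512, depending on the CPU) */
        uint32_t hw = aws_checksums_crc32c_hw(buf, (int)sizeof(buf), 0);
        /* portable software implementation used as the reference */
        uint32_t sw = aws_checksums_crc32c_sw(buf, (int)sizeof(buf), 0);

        printf("hw=%08x sw=%08x %s\n", (unsigned)hw, (unsigned)sw, hw == sw ? "match" : "MISMATCH");
        return hw == sw ? 0 : 1;
    }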
--- CMakeLists.txt | 48 ++-- .../private/intel/crc32c_compiler_shims.h | 25 ++ .../intel/{intrin => asm}/crc32c_sse42_asm.c | 251 ++---------------- source/intel/crc_hw.c | 96 +++++++ source/intel/intrin/crc32c_sse42_avx512.c | 197 ++++++++++++++ source/intel/visualc/visualc_crc32c_sse42.c | 61 +++++ tests/crc_test.c | 14 + 7 files changed, 443 insertions(+), 249 deletions(-) create mode 100644 include/aws/checksums/private/intel/crc32c_compiler_shims.h rename source/intel/{intrin => asm}/crc32c_sse42_asm.c (66%) create mode 100644 source/intel/crc_hw.c create mode 100644 source/intel/intrin/crc32c_sse42_avx512.c create mode 100644 source/intel/visualc/visualc_crc32c_sse42.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 5af28e5..45caa2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,28 +59,37 @@ file(GLOB AWS_ARCH_SRC ) if (USE_CPU_EXTENSIONS) - if (MSVC AND AWS_ARCH_INTEL) - file(GLOB AWS_ARCH_SRC - "source/intel/intrin/*.c" + if (AWS_ARCH_INTEL) + file (GLOB AWS_ARCH_INTEL_SRC + "source/intel/*.c" ) - source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_SRC}) - - elseif(AWS_ARCH_INTEL AND AWS_HAVE_GCC_INLINE_ASM) - if (HAVE_SSE42_INTRINSICS) - set(SSE42_CFLAGS "-msse4.2") - endif() - - if (HAVE_AVX512_INTRINSICS) - set(AVX512_CFLAGS "-mavx512f") + if (MSVC) + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/intrin/*.c" + "source/intel/visualc/*.c" + ) + else() + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/intrin/*.c" + ) endif() + + source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC}) + source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC}) - set(AWS_CMAKE_REQUIRED_FLAGS "${SSE42_CFLAGS} ${AVX512_CFLAGS} -mvpclmulqdq -mpclmul") - file(GLOB AWS_ARCH_SRC - "source/intel/intrin/*.c" + if (AWS_HAVE_GCC_INLINE_ASM) + file(GLOB AWS_ARCH_SRC + ${AWS_ARCH_INTEL_SRC} + ${AWS_ARCH_INTRIN_SRC} + ${AWS_ARCH_ASM_SRC} + ) + else() + file(GLOB AWS_ARCH_SRC + ${AWS_ARCH_INTEL_SRC} + ${AWS_ARCH_INTRIN_SRC} ) - message(STATUS "CFLAGS: ${AWS_CMAKE_REQUIRED_FLAGS}") - SET_SOURCE_FILES_PROPERTIES(source/intel/intrin/crc32c_sse42_asm.c COMPILE_FLAGS ${AWS_CMAKE_REQUIRED_FLAGS}) + endif() endif() if (MSVC AND AWS_ARCH_ARM64) @@ -126,6 +135,7 @@ file(GLOB CHECKSUMS_COMBINED_SRC add_library(${PROJECT_NAME} ${CHECKSUMS_COMBINED_HEADERS} ${CHECKSUMS_COMBINED_SRC}) + aws_set_common_properties(${PROJECT_NAME}) aws_prepare_symbol_visibility_args(${PROJECT_NAME} "AWS_CHECKSUMS") aws_check_headers(${PROJECT_NAME} ${AWS_CHECKSUMS_HEADERS}) @@ -135,6 +145,10 @@ aws_add_sanitizers(${PROJECT_NAME}) # We are not ABI stable yet set_target_properties(${PROJECT_NAME} PROPERTIES VERSION 1.0.0) +if (USE_CPU_EXTENSIONS AND AWS_ARCH_INTEL) + simd_add_source_avx(${PROJECT_NAME} ${AWS_ARCH_SRC}) +endif() + target_include_directories(${PROJECT_NAME} PUBLIC $ $) diff --git a/include/aws/checksums/private/intel/crc32c_compiler_shims.h b/include/aws/checksums/private/intel/crc32c_compiler_shims.h new file mode 100644 index 0000000..21002de --- /dev/null +++ b/include/aws/checksums/private/intel/crc32c_compiler_shims.h @@ -0,0 +1,25 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. 
+ */ + +#include + +#include +#include + +#if _WIN64 || __x86_64__ || __ppc64_ +typedef uint64_t *slice_ptr_type; +typedef uint64_t slice_ptr_int_type; +# define crc_intrin_fn _mm_crc32_u64 +#else +typedef uint32_t *slice_ptr_type; +typedef uint32_t slice_ptr_int_type; +# define crc_intrin_fn _mm_crc32_u32 +#endif + +#ifdef AWS_HAVE_AVX512_INTRINSICS +uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc); +#endif + +uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t crc); diff --git a/source/intel/intrin/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c similarity index 66% rename from source/intel/intrin/crc32c_sse42_asm.c rename to source/intel/asm/crc32c_sse42_asm.c index 68f8c24..e7c144e 100644 --- a/source/intel/intrin/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -6,12 +6,6 @@ #include #include -#include -#include -#include -#include - -#define zalign(x) __attribute__((aligned((x)))) /* clang-format off */ @@ -279,195 +273,8 @@ static inline uint32_t s_crc32c_sse42_clmul_3072(const uint8_t *input, uint32_t return crc; } -/* - * crc32c_avx512(): compute the crc32c of the buffer, where the buffer - * length must be at least 256, and a multiple of 64. Based on: - * - * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" - * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 - */ -static uint32_t crc32c_avx512(const uint8_t *input, int length, uint32_t crc) -{ - /* - * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 - * are similar to those given at the end of the paper - * - * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 - * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 - * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 - * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 - * k5 = ( x ^ ( 128 + 32 ) mod P(x) << 32 )' << 1 - * k6 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 - */ - - static const uint64_t zalign(64) k1k2[] = { 0xdcb17aa4, 0xb9e02b86, - 0xdcb17aa4, 0xb9e02b86, - 0xdcb17aa4, 0xb9e02b86, - 0xdcb17aa4, 0xb9e02b86 }; - static const uint64_t zalign(64) k3k4[] = { 0x740eef02, 0x9e4addf8, - 0x740eef02, 0x9e4addf8, - 0x740eef02, 0x9e4addf8, - 0x740eef02, 0x9e4addf8 }; - static const uint64_t zalign(16) k5k6[] = { 0xf20c0dfe, 0x14cd00bd6 }; - static const uint64_t zalign(16) k7k8[] = { 0xdd45aab8, 0x000000000 }; - static const uint64_t zalign(16) poly[] = { 0x105ec76f1, 0xdea713f1 }; - - __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; - __m128i a0, a1, a2, a3; - - /* - * There's at least one block of 256. - */ - x1 = _mm512_loadu_si512((__m512i *)(input + 0x00)); - x2 = _mm512_loadu_si512((__m512i *)(input + 0x40)); - x3 = _mm512_loadu_si512((__m512i *)(input + 0x80)); - x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); - - x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); - - x0 = _mm512_load_si512((__m512i *)k1k2); - - input += 256; - length -= 256; - - /* - * Parallel fold blocks of 256, if any. 
- */ - while (length >= 256) - { - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); - x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); - x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); - - - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); - x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); - x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); - - y5 = _mm512_loadu_si512((__m512i *)(input + 0x00)); - y6 = _mm512_loadu_si512((__m512i *)(input + 0x40)); - y7 = _mm512_loadu_si512((__m512i *)(input + 0x80)); - y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); - - x1 = _mm512_xor_si512(x1, x5); - x2 = _mm512_xor_si512(x2, x6); - x3 = _mm512_xor_si512(x3, x7); - x4 = _mm512_xor_si512(x4, x8); - - x1 = _mm512_xor_si512(x1, y5); - x2 = _mm512_xor_si512(x2, y6); - x3 = _mm512_xor_si512(x3, y7); - x4 = _mm512_xor_si512(x4, y8); - - input += 256; - length -= 256; - } - - /* - * Fold into 512-bits. - */ - x0 = _mm512_load_si512((__m512i *)k3k4); - - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x2); - x1 = _mm512_xor_si512(x1, x5); - - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x3); - x1 = _mm512_xor_si512(x1, x5); - - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x4); - x1 = _mm512_xor_si512(x1, x5); - - /* - * Single fold blocks of 64, if any. - */ - while (length >= 64) - { - x2 = _mm512_loadu_si512((__m512i *)input); - - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x2); - x1 = _mm512_xor_si512(x1, x5); - - input += 64; - length -= 64; - } - - /* - * Fold 512-bits to 384-bits. - */ - a0 = _mm_load_si128((__m128i *)k5k6); - - a1 = _mm512_extracti32x4_epi32(x1, 0); - a2 = _mm512_extracti32x4_epi32(x1, 1); - - a3 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_clmulepi64_si128(a1, a0, 0x11); - - a1 = _mm_xor_si128(a1, a3); - a1 = _mm_xor_si128(a1, a2); - - /* - * Fold 384-bits to 256-bits. - */ - a2 = _mm512_extracti32x4_epi32(x1, 2); - a3 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_clmulepi64_si128(a1, a0, 0x11); - a1 = _mm_xor_si128(a1, a3); - a1 = _mm_xor_si128(a1, a2); - - /* - * Fold 256-bits to 128-bits. - */ - a2 = _mm512_extracti32x4_epi32(x1, 3); - a3 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_clmulepi64_si128(a1, a0, 0x11); - a1 = _mm_xor_si128(a1, a3); - a1 = _mm_xor_si128(a1, a2); - - /* - * Fold 128-bits to 64-bits. - */ - a2 = _mm_clmulepi64_si128(a1, a0, 0x10); - a3 = _mm_setr_epi32(~0, 0, ~0, 0); - a1 = _mm_srli_si128(a1, 8); - a1 = _mm_xor_si128(a1, a2); - - a0 = _mm_loadl_epi64((__m128i*)k7k8); - a2 = _mm_srli_si128(a1, 4); - a1 = _mm_and_si128(a1, a3); - a1 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_xor_si128(a1, a2); - - /* - * Barret reduce to 32-bits. - */ - a0 = _mm_load_si128((__m128i*)poly); - - a2 = _mm_and_si128(a1, a3); - a2 = _mm_clmulepi64_si128(a2, a0, 0x10); - a2 = _mm_and_si128(a2, a3); - a2 = _mm_clmulepi64_si128(a2, a0, 0x00); - a1 = _mm_xor_si128(a1, a2); - - /* - * Return the crc32. 
- */ - return _mm_extract_epi32(a1, 1); -} - static bool detection_performed = false; static bool detected_clmul = false; -static bool detected_sse42 = false; -static bool detected_avx512 = false; /* * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and @@ -476,12 +283,10 @@ static bool detected_avx512 = false; * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent * call. */ -uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { +uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) { if (AWS_UNLIKELY(!detection_performed)) { detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); - detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2); - detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); /* Simply setting the flag true to skip HW detection next time Not using memory barriers since the worst that can happen is a fallback to the non HW accelerated code. */ @@ -516,37 +321,24 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev /* Using likely to keep this code inlined */ if (AWS_LIKELY(detected_clmul)) { - if (AWS_LIKELY(detected_avx512)) { - if (AWS_LIKELY(length >= 256)) { - ssize_t chunk_size = length & ~63; - crc = ~crc32c_avx512(input, length, crc); - /* check remaining data */ - length -= chunk_size; - if (!length) - return crc; - /* Fall into the default crc32 for the remaining data. */ - input += chunk_size; - } + + while (AWS_LIKELY(length >= 3072)) { + /* Compute crc32c on each block, chaining each crc result */ + crc = s_crc32c_sse42_clmul_3072(input, crc); + input += 3072; + length -= 3072; } - else if (AWS_LIKELY(detected_sse42)) { - while (AWS_LIKELY(length >= 3072)) { - /* Compute crc32c on each block, chaining each crc result */ - crc = s_crc32c_sse42_clmul_3072(input, crc); - input += 3072; - length -= 3072; - } - while (AWS_LIKELY(length >= 1024)) { - /* Compute crc32c on each block, chaining each crc result */ - crc = s_crc32c_sse42_clmul_1024(input, crc); - input += 1024; - length -= 1024; - } - while (AWS_LIKELY(length >= 256)) { - /* Compute crc32c on each block, chaining each crc result */ - crc = s_crc32c_sse42_clmul_256(input, crc); - input += 256; - length -= 256; - } + while (AWS_LIKELY(length >= 1024)) { + /* Compute crc32c on each block, chaining each crc result */ + crc = s_crc32c_sse42_clmul_1024(input, crc); + input += 1024; + length -= 1024; + } + while (AWS_LIKELY(length >= 256)) { + /* Compute crc32c on each block, chaining each crc result */ + crc = s_crc32c_sse42_clmul_256(input, crc); + input += 256; + length -= 256; } } @@ -575,13 +367,8 @@ uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previ # endif #else -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { +uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) { return aws_checksums_crc32_sw(input, length, previousCrc32); } - -uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32c_sw(input, length, previousCrc32); -} - #endif /* clang-format on */ diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c new file mode 100644 index 0000000..1e1c85b --- /dev/null +++ b/source/intel/crc_hw.c @@ -0,0 +1,96 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ +#include +#include + +static bool detection_performed = false; +static bool detected_sse42 = false; +static bool detected_avx512 = false; +static bool detected_clmul = false; + +/* + * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and + * PCLMULQDQ machine instructions (if present). + * Handles data that isn't 8-byte aligned as well as any trailing data with the CRC32B (byte) instruction. + * Pass 0 in the previousCrc32 parameter as an initial value unless continuing to update a running CRC in a subsequent + * call. + */ +uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t previousCrc32) { + + if (AWS_UNLIKELY(!detection_performed)) { + detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2); + detected_avx512 = true; //aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); + detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); + /* Simply setting the flag true to skip HW detection next time + Not using memory barriers since the worst that can + happen is a fallback to the non HW accelerated code. */ + detection_performed = true; + } + + uint32_t crc = ~previousCrc32; + + /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ + if (length < sizeof(slice_ptr_int_type)) { + while (length-- > 0) { + crc = (uint32_t)_mm_crc32_u8(crc, *input++); + } + return ~crc; + } + + /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */ + int input_alignment = (uintptr_t)(input) & 0x7; + + /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */ + int leading = (8 - input_alignment) & 0x7; + + /* reduce the length by the leading unaligned bytes we are about to process */ + length -= leading; + + /* spin through the leading unaligned input bytes (if any) one-by-one */ + while (leading-- > 0) { + crc = (uint32_t)_mm_crc32_u8(crc, *input++); + } + + int chunk_size = length & ~63; + +#ifdef AWS_HAVE_AVX512_INTRINSICS + if (detected_avx512 && detected_clmul) { + if (length >= 256) { + crc = aws_checksums_crc32c_avx512(input, length, crc); + /* check remaining data */ + length -= chunk_size; + if (!length) { + return crc; + } + + /* Fall into the default crc32 for the remaining data. */ + input += chunk_size; + } + } +#endif + + if (detected_sse42 && detected_clmul) { + return aws_checksums_crc32c_sse42(input, length, crc); + } + + /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ + while (length >= sizeof(slice_ptr_int_type)) { + crc = (uint32_t)crc_intrin_fn(crc, *input); + input += sizeof(slice_ptr_int_type); + length -= sizeof(slice_ptr_int_type); + } + + /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ + while (length-- > 0) { + crc = (uint32_t)_mm_crc32_u8(crc, *input); + input++; + } + + return ~crc; +} + +uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { + return aws_checksums_crc32_sw(input, length, previousCrc32); +} diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c new file mode 100644 index 0000000..745bdac --- /dev/null +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -0,0 +1,197 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. 
+ */ + +#include + +#include + +#include +#include +#include +#include + +AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_8[8], 64); +AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2[2], 16); + +/* + * crc32c_avx512(): compute the crc32c of the buffer, where the buffer + * length must be at least 256, and a multiple of 64. Based on: + * + * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" + * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 + */ +uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc) { + /* + * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 + * are similar to those given at the end of the paper + * + * k1 = ( x ^ ( 512 * 4 + 32 ) mod P(x) << 32 )' << 1 + * k2 = ( x ^ ( 512 * 4 - 32 ) mod P(x) << 32 )' << 1 + * k3 = ( x ^ ( 512 + 32 ) mod P(x) << 32 )' << 1 + * k4 = ( x ^ ( 512 - 32 ) mod P(x) << 32 )' << 1 + * k5 = ( x ^ ( 128 + 32 ) mod P(x) << 32 )' << 1 + * k6 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 + */ + + static zalign_8 k1k2[] = { + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + static zalign_8 k3k4[] = { + 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; + static zalign_2 k5k6[] = {0xf20c0dfe, 0x14cd00bd6}; + static zalign_2 k7k8[] = {0xdd45aab8, 0x000000000}; + static zalign_2 poly[] = {0x105ec76f1, 0xdea713f1}; + + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; + __m128i a0, a1, a2, a3; + + /* + * There's at least one block of 256. + */ + x1 = _mm512_loadu_si512((__m512i *)(input + 0x00)); + x2 = _mm512_loadu_si512((__m512i *)(input + 0x40)); + x3 = _mm512_loadu_si512((__m512i *)(input + 0x80)); + x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); + + x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc))); + + x0 = _mm512_load_si512((__m512i *)k1k2); + + input += 256; + length -= 256; + + /* + * Parallel fold blocks of 256, if any. + */ + while (length >= 256) + { + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); + x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); + x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); + + + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); + x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); + x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11); + + y5 = _mm512_loadu_si512((__m512i *)(input + 0x00)); + y6 = _mm512_loadu_si512((__m512i *)(input + 0x40)); + y7 = _mm512_loadu_si512((__m512i *)(input + 0x80)); + y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); + + x1 = _mm512_xor_si512(x1, x5); + x2 = _mm512_xor_si512(x2, x6); + x3 = _mm512_xor_si512(x3, x7); + x4 = _mm512_xor_si512(x4, x8); + + x1 = _mm512_xor_si512(x1, y5); + x2 = _mm512_xor_si512(x2, y6); + x3 = _mm512_xor_si512(x3, y7); + x4 = _mm512_xor_si512(x4, y8); + + input += 256; + length -= 256; + } + + /* + * Fold into 512-bits. + */ + x0 = _mm512_load_si512((__m512i *)k3k4); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x2); + x1 = _mm512_xor_si512(x1, x5); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x3); + x1 = _mm512_xor_si512(x1, x5); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x4); + x1 = _mm512_xor_si512(x1, x5); + + /* + * Single fold blocks of 64, if any. 
+ */ + while (length >= 64) + { + x2 = _mm512_loadu_si512((__m512i *)input); + + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_xor_si512(x1, x2); + x1 = _mm512_xor_si512(x1, x5); + + input += 64; + length -= 64; + } + + /* + * Fold 512-bits to 384-bits. + */ + a0 = _mm_load_si128((__m128i *)k5k6); + + a1 = _mm512_extracti32x4_epi32(x1, 0); + a2 = _mm512_extracti32x4_epi32(x1, 1); + + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); + + a1 = _mm_xor_si128(a1, a3); + a1 = _mm_xor_si128(a1, a2); + + /* + * Fold 384-bits to 256-bits. + */ + a2 = _mm512_extracti32x4_epi32(x1, 2); + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); + a1 = _mm_xor_si128(a1, a3); + a1 = _mm_xor_si128(a1, a2); + + /* + * Fold 256-bits to 128-bits. + */ + a2 = _mm512_extracti32x4_epi32(x1, 3); + a3 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_clmulepi64_si128(a1, a0, 0x11); + a1 = _mm_xor_si128(a1, a3); + a1 = _mm_xor_si128(a1, a2); + + /* + * Fold 128-bits to 64-bits. + */ + a2 = _mm_clmulepi64_si128(a1, a0, 0x10); + a3 = _mm_setr_epi32(~0, 0, ~0, 0); + a1 = _mm_srli_si128(a1, 8); + a1 = _mm_xor_si128(a1, a2); + + a0 = _mm_loadl_epi64((__m128i*)k7k8); + a2 = _mm_srli_si128(a1, 4); + a1 = _mm_and_si128(a1, a3); + a1 = _mm_clmulepi64_si128(a1, a0, 0x00); + a1 = _mm_xor_si128(a1, a2); + + /* + * Barret reduce to 32-bits. + */ + a0 = _mm_load_si128((__m128i*)poly); + + a2 = _mm_and_si128(a1, a3); + a2 = _mm_clmulepi64_si128(a2, a0, 0x10); + a2 = _mm_and_si128(a2, a3); + a2 = _mm_clmulepi64_si128(a2, a0, 0x00); + a1 = _mm_xor_si128(a1, a2); + + + /* + * Return the crc32. + */ + return ~_mm_extract_epi32(a1, 1); +} diff --git a/source/intel/visualc/visualc_crc32c_sse42.c b/source/intel/visualc/visualc_crc32c_sse42.c new file mode 100644 index 0000000..79ed836 --- /dev/null +++ b/source/intel/visualc/visualc_crc32c_sse42.c @@ -0,0 +1,61 @@ +/** + * Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. + * SPDX-License-Identifier: Apache-2.0. + */ + +#include + +/** + * This implements crc32c via the intel sse 4.2 instructions. + * This is separate from the straight asm version, because visual c does not allow + * inline assembly for x64. + */ +uint32_t aws_checksums_crc32c_sse42(const uint8_t *data, int length, uint32_t previousCrc32) { + uint32_t crc = previousCrc32; + int length_to_process = length; + + slice_ptr_type temp = (slice_ptr_type)data; + + /*to eek good performance out of the intel implementation, we need to only hit the hardware + once we are aligned on the byte boundaries we are using. So, peel off a byte at a time until we are + 8 byte aligned (64 bit arch) or 4 byte aligned (32 bit arch) + + first calculate how many bytes we need to burn before we are aligned. + for a 64 bit arch this is: + (8 - ) mod 8 + 32 bit: + (4 - ) mod 4 */ + uint8_t alignment_offset = (sizeof(slice_ptr_int_type) - ((slice_ptr_int_type)temp % sizeof(slice_ptr_int_type))) % + sizeof(slice_ptr_int_type); + + /*for every byte we need to burn off, just do them a byte at a time. 
+ increment the temp pointer by one byte at a time until we get it on an alignment boundary */ + while (alignment_offset != 0 && length_to_process) { + uint8_t *byte_pos = (uint8_t *)temp; + crc = (uint32_t)_mm_crc32_u8(crc, *byte_pos++); + temp = (slice_ptr_type)byte_pos; + --alignment_offset; + --length_to_process; + } + + /*now whatever is left is properly aligned on a boundary*/ + uint32_t slices = length_to_process / sizeof(temp); + uint32_t remainder = length_to_process % sizeof(temp); + + while (slices--) { +# if defined(_M_X64) + crc = (uint32_t)_mm_crc32_u64(crc, *temp++); +# else + crc = _mm_crc32_u32(crc, *temp++); +# endif + } + + /* process the remaining parts that can't be done on the slice size. */ + uint8_t *remainderPos = (uint8_t *)temp; + + while (remainder--) { + crc = (uint32_t)_mm_crc32_u8(crc, *remainderPos++); + } + + return ~crc; +} diff --git a/tests/crc_test.c b/tests/crc_test.c index ec9d2a4..58da0ea 100644 --- a/tests/crc_test.c +++ b/tests/crc_test.c @@ -5,6 +5,9 @@ #include #include + +#include + #include static const uint8_t DATA_32_ZEROS[32] = {0}; @@ -99,6 +102,17 @@ static int s_test_crc32c(struct aws_allocator *allocator, void *ctx) { res |= s_test_known_crc32c(CRC_FUNC_NAME(aws_checksums_crc32c)); res |= s_test_known_crc32c(CRC_FUNC_NAME(aws_checksums_crc32c_sw)); + + struct aws_byte_buf avx_buf; + /* enough for two avx512 runs */ + aws_byte_buf_init(&avx_buf, allocator, 512); + aws_device_random_buffer(&avx_buf); + + uint32_t crc = aws_checksums_crc32c_sw(avx_buf.buffer, avx_buf.len, 0); + + uint32_t hw_crc = aws_checksums_crc32c_hw(avx_buf.buffer, avx_buf.len, 0); + ASSERT_UINT_EQUALS(hw_crc, crc); + return res; } AWS_TEST_CASE(test_crc32c, s_test_crc32c) From 837d5a19571ce29909e831243956d50967935f35 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 17:43:18 -0700 Subject: [PATCH 05/18] Keep the naive avx512 path on for figuring out codebuild capabilities, turn it off once we know what's supported where. --- tests/crc_test.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tests/crc_test.c b/tests/crc_test.c index 58da0ea..d28eb5b 100644 --- a/tests/crc_test.c +++ b/tests/crc_test.c @@ -108,9 +108,8 @@ static int s_test_crc32c(struct aws_allocator *allocator, void *ctx) { aws_byte_buf_init(&avx_buf, allocator, 512); aws_device_random_buffer(&avx_buf); - uint32_t crc = aws_checksums_crc32c_sw(avx_buf.buffer, avx_buf.len, 0); - - uint32_t hw_crc = aws_checksums_crc32c_hw(avx_buf.buffer, avx_buf.len, 0); + uint32_t crc = aws_checksums_crc32c_sw(avx_buf.buffer, (int)avx_buf.len, 0); + uint32_t hw_crc = aws_checksums_crc32c_hw(avx_buf.buffer, (int)avx_buf.len, 0); ASSERT_UINT_EQUALS(hw_crc, crc); return res; From 2289c9649c6d8222945ed06190cdb804f78c7acc Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 17:58:39 -0700 Subject: [PATCH 06/18] Fix build and do correct cpu feature detection. 
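For context, aws_cpu_has_feature() abstracts the runtime CPUID checks that this fix wires up (SSE4.2, CLMUL, AVX512). Purely as an illustration of what those checks amount to on GCC or Clang, the same queries can be sketched with the __builtin_cpu_supports() builtin; the helper below is hypothetical, is not how the library implements detection, and a later patch in this series additionally requires VPCLMULQDQ before taking the AVX512 path.

    #include <stdbool.h>

    /* Illustrative only: true when the CPU reports the features the AVX512 CRC path relies on. */
    static bool cpu_supports_avx512_crc_path(void) {
        return __builtin_cpu_supports("avx512f")    /* AVX512 foundation */
            && __builtin_cpu_supports("pclmul")     /* carry-less multiply */
            && __builtin_cpu_supports("sse4.2");    /* CRC32 instructions for the scalar tail */
    }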
--- CMakeLists.txt | 23 +++++++++++++++-------- source/intel/crc_hw.c | 8 ++++---- source/intel/intrin/crc32c_sse42_avx512.c | 14 +++++--------- 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 45caa2b..06702e6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,15 +64,22 @@ if (USE_CPU_EXTENSIONS) "source/intel/*.c" ) - if (MSVC) - file(GLOB AWS_ARCH_INTRIN_SRC - "source/intel/intrin/*.c" - "source/intel/visualc/*.c" - ) + if (AWS_HAVE_AVX2_INTRINSICS) + if (MSVC) + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/intrin/*.c" + "source/intel/visualc/*.c" + ) + else() + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/intrin/*.c" + ) + endif() else() - file(GLOB AWS_ARCH_INTRIN_SRC - "source/intel/intrin/*.c" - ) + if (MSVC) + file(GLOB AWS_ARCH_INTRIN_SRC + "source/intel/visualc/*.c") + endif() endif() source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC}) diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c index 1e1c85b..0154f05 100644 --- a/source/intel/crc_hw.c +++ b/source/intel/crc_hw.c @@ -21,7 +21,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev if (AWS_UNLIKELY(!detection_performed)) { detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2); - detected_avx512 = true; //aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); + detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); /* Simply setting the flag true to skip HW detection next time Not using memory barriers since the worst that can @@ -40,7 +40,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev } /* Get the 8-byte memory alignment of our input buffer by looking at the least significant 3 bits */ - int input_alignment = (uintptr_t)(input) & 0x7; + int input_alignment = (uintptr_t)(input)&0x7; /* Compute the number of unaligned bytes before the first aligned 8-byte chunk (will be in the range 0-7) */ int leading = (8 - input_alignment) & 0x7; @@ -68,11 +68,11 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev /* Fall into the default crc32 for the remaining data. */ input += chunk_size; } - } + } #endif if (detected_sse42 && detected_clmul) { - return aws_checksums_crc32c_sse42(input, length, crc); + return aws_checksums_crc32c_sse42(input, length, crc); } /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index 745bdac..5b513d3 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -8,9 +8,9 @@ #include #include +#include #include #include -#include AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_8[8], 64); AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2[2], 16); @@ -64,14 +64,12 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t /* * Parallel fold blocks of 256, if any. */ - while (length >= 256) - { + while (length >= 256) { x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); x6 = _mm512_clmulepi64_epi128(x2, x0, 0x00); x7 = _mm512_clmulepi64_epi128(x3, x0, 0x00); x8 = _mm512_clmulepi64_epi128(x4, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); x2 = _mm512_clmulepi64_epi128(x2, x0, 0x11); x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11); @@ -119,8 +117,7 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t /* * Single fold blocks of 64, if any. 
*/ - while (length >= 64) - { + while (length >= 64) { x2 = _mm512_loadu_si512((__m512i *)input); x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); @@ -172,7 +169,7 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t a1 = _mm_srli_si128(a1, 8); a1 = _mm_xor_si128(a1, a2); - a0 = _mm_loadl_epi64((__m128i*)k7k8); + a0 = _mm_loadl_epi64((__m128i *)k7k8); a2 = _mm_srli_si128(a1, 4); a1 = _mm_and_si128(a1, a3); a1 = _mm_clmulepi64_si128(a1, a0, 0x00); @@ -181,7 +178,7 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t /* * Barret reduce to 32-bits. */ - a0 = _mm_load_si128((__m128i*)poly); + a0 = _mm_load_si128((__m128i *)poly); a2 = _mm_and_si128(a1, a3); a2 = _mm_clmulepi64_si128(a2, a0, 0x10); @@ -189,7 +186,6 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t a2 = _mm_clmulepi64_si128(a2, a0, 0x00); a1 = _mm_xor_si128(a1, a2); - /* * Return the crc32. */ From ee3e5da419d0329e5c2deb1952756ba192df12e6 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 18:05:24 -0700 Subject: [PATCH 07/18] fix 32-bit builds and builds that need to work without intrinsics available. --- CMakeLists.txt | 2 +- source/intel/crc_hw.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06702e6..05265d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,7 +64,7 @@ if (USE_CPU_EXTENSIONS) "source/intel/*.c" ) - if (AWS_HAVE_AVX2_INTRINSICS) + if (AWS_HAVE_AVX2_INTRINSICS AND CMAKE_SIZEOF_VOID_P EQUAL 8) if (MSVC) file(GLOB AWS_ARCH_INTRIN_SRC "source/intel/intrin/*.c" diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c index 0154f05..f74ba4c 100644 --- a/source/intel/crc_hw.c +++ b/source/intel/crc_hw.c @@ -32,7 +32,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev uint32_t crc = ~previousCrc32; /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ - if (length < sizeof(slice_ptr_int_type)) { + if (length < (int)sizeof(slice_ptr_int_type)) { while (length-- > 0) { crc = (uint32_t)_mm_crc32_u8(crc, *input++); } @@ -76,10 +76,10 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev } /* Spin through remaining (aligned) 8-byte chunks using the CRC32Q quad word instruction */ - while (length >= sizeof(slice_ptr_int_type)) { + while (length >= (int)sizeof(slice_ptr_int_type)) { crc = (uint32_t)crc_intrin_fn(crc, *input); input += sizeof(slice_ptr_int_type); - length -= sizeof(slice_ptr_int_type); + length -= (int)sizeof(slice_ptr_int_type); } /* Finish up with any trailing bytes using the CRC32B single byte instruction one-by-one */ From 005ed7ce058393b1496fd599c6e3d0314ecad53e Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 18:12:03 -0700 Subject: [PATCH 08/18] Not sure how the avx512 code got called. hopefully coedebuild is just busted. 
--- source/intel/crc_hw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c index f74ba4c..ef6d00f 100644 --- a/source/intel/crc_hw.c +++ b/source/intel/crc_hw.c @@ -53,9 +53,9 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev crc = (uint32_t)_mm_crc32_u8(crc, *input++); } +#ifdef AWS_HAVE_AVX512_INTRINSICS int chunk_size = length & ~63; -#ifdef AWS_HAVE_AVX512_INTRINSICS if (detected_avx512 && detected_clmul) { if (length >= 256) { crc = aws_checksums_crc32c_avx512(input, length, crc); From 39094d4eda05c58f7c5c1b2a9db09e51240bb0a3 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 18:14:48 -0700 Subject: [PATCH 09/18] Found why the wrong build files were being used at least. --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 05265d9..43aa73f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -64,7 +64,7 @@ if (USE_CPU_EXTENSIONS) "source/intel/*.c" ) - if (AWS_HAVE_AVX2_INTRINSICS AND CMAKE_SIZEOF_VOID_P EQUAL 8) + if (AWS_HAVE_AVX512_INTRINSICS AND CMAKE_SIZEOF_VOID_P EQUAL 8) if (MSVC) file(GLOB AWS_ARCH_INTRIN_SRC "source/intel/intrin/*.c" From d4ffdc1b0a88971c3a4e19eba305d281b58852ab Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 18:19:06 -0700 Subject: [PATCH 10/18] Make test pass when it passes. --- tests/crc_test.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/crc_test.c b/tests/crc_test.c index d28eb5b..a8e8f55 100644 --- a/tests/crc_test.c +++ b/tests/crc_test.c @@ -110,6 +110,8 @@ static int s_test_crc32c(struct aws_allocator *allocator, void *ctx) { uint32_t crc = aws_checksums_crc32c_sw(avx_buf.buffer, (int)avx_buf.len, 0); uint32_t hw_crc = aws_checksums_crc32c_hw(avx_buf.buffer, (int)avx_buf.len, 0); + + aws_byte_buf_clean_up(&avx_buf); ASSERT_UINT_EQUALS(hw_crc, crc); return res; From bf799366846a079abbb7b7716aa3610fc4f28197 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 18:29:05 -0700 Subject: [PATCH 11/18] Try it again. --- CMakeLists.txt | 7 ++++++- source/intel/asm/crc32c_sse42_asm.c | 2 +- source/intel/intrin/crc32c_sse42_avx512.c | 19 ++++++++++--------- 3 files changed, 17 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 43aa73f..997c2ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -78,7 +78,8 @@ if (USE_CPU_EXTENSIONS) else() if (MSVC) file(GLOB AWS_ARCH_INTRIN_SRC - "source/intel/visualc/*.c") + "source/intel/visualc/*.c" + ) endif() endif() @@ -86,6 +87,10 @@ if (USE_CPU_EXTENSIONS) source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC}) if (AWS_HAVE_GCC_INLINE_ASM) + file(GLOB AWS_ARCH_ASM_SRC + "source/intel/asm/*.c" + ) + file(GLOB AWS_ARCH_SRC ${AWS_ARCH_INTEL_SRC} ${AWS_ARCH_INTRIN_SRC} diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index e7c144e..dd7431e 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -3,7 +3,7 @@ * SPDX-License-Identifier: Apache-2.0. 
*/ -#include +#include #include diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index 5b513d3..35086c3 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -12,8 +12,8 @@ #include #include -AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_8[8], 64); -AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2[2], 16); +AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_8, 64); +AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2, 16); /* * crc32c_avx512(): compute the crc32c of the buffer, where the buffer @@ -35,13 +35,14 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t * k6 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 */ - static zalign_8 k1k2[] = { - 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; - static zalign_8 k3k4[] = { - 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; - static zalign_2 k5k6[] = {0xf20c0dfe, 0x14cd00bd6}; - static zalign_2 k7k8[] = {0xdd45aab8, 0x000000000}; - static zalign_2 poly[] = {0x105ec76f1, 0xdea713f1}; + static zalign_8 k1k2[8] = + {0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + + static zalign_8 k3k4[8] = + {0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; + static zalign_2 k5k6[2] = {0xf20c0dfe, 0x14cd00bd6}; + static zalign_2 k7k8[2] = {0xdd45aab8, 0x000000000}; + static zalign_2 poly[2] = {0x105ec76f1, 0xdea713f1}; __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; __m128i a0, a1, a2, a3; From 1e24d06f51055afed592d84cb043659fbc15ae44 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Tue, 18 Jul 2023 18:33:47 -0700 Subject: [PATCH 12/18] fix leftover symbol collision. --- source/intel/asm/crc32c_sse42_asm.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index dd7431e..85acdf5 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -358,9 +358,6 @@ uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t p return ~crc; } -uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); -} # if defined(__clang__) # pragma clang diagnostic pop @@ -368,7 +365,7 @@ uint32_t aws_checksums_crc32_hw(const uint8_t *input, int length, uint32_t previ #else uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32_sw(input, length, previousCrc32); + return aws_checksums_crc32c_sw(input, length, previousCrc32); } #endif /* clang-format on */ From 907e7215f990a87932e75e5715a81fc2f397f5c5 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Wed, 19 Jul 2023 11:46:06 -0700 Subject: [PATCH 13/18] Added more compile gates and assertions. 
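For reference, the gates added in this patch layer three kinds of checks around the AVX512 call site in source/intel/crc_hw.c. The sketch below condenses that call site to show the shape of the gating; it is a paraphrase for illustration, not a verbatim copy of the patch.

    /* 1. Build-time: the toolchain compiled the AVX512 intrinsics and the target is 64-bit. */
    #if defined(AWS_HAVE_AVX512_INTRINSICS) && (INTPTR_MAX == INT64_MAX)
        /* 2. Run-time: the CPU actually reports the required features. */
        if (detected_avx512 && detected_clmul) {
            /* 3. Input size: the folded kernel asserts length >= 256 and consumes 64-byte
             *    blocks, so shorter input falls through to the SSE4.2/scalar code instead. */
            if (length >= 256) {
                int chunk_size = length & ~63;                         /* bytes the fold will consume */
                crc = aws_checksums_crc32c_avx512(input, length, crc);
                input += chunk_size;                                   /* 0..63 trailing bytes remain */
                length -= chunk_size;
            }
        }
    #endif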
--- source/intel/crc_hw.c | 4 ++-- source/intel/intrin/crc32c_sse42_avx512.c | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c index ef6d00f..3414d67 100644 --- a/source/intel/crc_hw.c +++ b/source/intel/crc_hw.c @@ -53,7 +53,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev crc = (uint32_t)_mm_crc32_u8(crc, *input++); } -#ifdef AWS_HAVE_AVX512_INTRINSICS +#if defined(AWS_HAVE_AVX512_INTRINSICS) && (INTPTR_MAX == INT64_MAX) int chunk_size = length & ~63; if (detected_avx512 && detected_clmul) { @@ -62,7 +62,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev /* check remaining data */ length -= chunk_size; if (!length) { - return crc; + return ~crc; } /* Fall into the default crc32 for the remaining data. */ diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index 35086c3..c25799d 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -23,6 +23,8 @@ AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2, 16); * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 */ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc) { + AWS_ASSERT( + length >= 256 && "invariant violated. length must be greater than 256 bytes to use avx512 to compute crc."); /* * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 * are similar to those given at the end of the paper From 5ab00462a861dc9a501057924ef3715ac7c9e585 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Thu, 20 Jul 2023 12:30:25 -0700 Subject: [PATCH 14/18] Fix osx build. --- source/intel/intrin/crc32c_sse42_avx512.c | 1 + 1 file changed, 1 insertion(+) diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index c25799d..da26149 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -5,6 +5,7 @@ #include +#include #include #include From ca43c51c122305f09aaace72a2aef825efd4b062 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Thu, 20 Jul 2023 12:44:46 -0700 Subject: [PATCH 15/18] make the bitflips uniform. --- source/intel/asm/crc32c_sse42_asm.c | 3 ++- source/intel/intrin/crc32c_sse42_avx512.c | 5 ++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index 85acdf5..7da4e50 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -293,7 +293,8 @@ uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t p detection_performed = true; } - uint32_t crc = ~previousCrc32; + /* this is called by a higher-level shim and previousCRC32 is already ~ */ + uint32_t crc = previousCrc32; /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ if (AWS_UNLIKELY(length < 8)) { diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index da26149..8274a3a 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -23,9 +23,12 @@ AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2, 16); * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" * V. Gopal, E. 
Ozturk, et al., 2009, http://intel.ly/2ySEwL0 */ -uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t crc) { +uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t previous_crc) { AWS_ASSERT( length >= 256 && "invariant violated. length must be greater than 256 bytes to use avx512 to compute crc."); + + fprintf(stderr, "Entered AVX512 branch."); + uint32_t crc = ~previous_crc; /* * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 * are similar to those given at the end of the paper From 28dde8b1e92b50dc3d06280942c4c1bb51f8c568 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Thu, 20 Jul 2023 13:01:39 -0700 Subject: [PATCH 16/18] add additional runtime cpuid check and run formatter. --- source/intel/crc_hw.c | 7 ++++++- source/intel/intrin/crc32c_sse42_avx512.c | 9 ++++----- source/intel/visualc/visualc_crc32c_sse42.c | 6 +++--- tests/crc_test.c | 1 - 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/source/intel/crc_hw.c b/source/intel/crc_hw.c index 3414d67..d571cc0 100644 --- a/source/intel/crc_hw.c +++ b/source/intel/crc_hw.c @@ -9,6 +9,7 @@ static bool detection_performed = false; static bool detected_sse42 = false; static bool detected_avx512 = false; static bool detected_clmul = false; +static bool detected_vpclmulqdq = false; /* * Computes the Castagnoli CRC32c (iSCSI) of the specified data buffer using the Intel CRC32Q (64-bit quad word) and @@ -23,12 +24,16 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev detected_sse42 = aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2); detected_avx512 = aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512); detected_clmul = aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL); + detected_vpclmulqdq = aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ); + /* Simply setting the flag true to skip HW detection next time Not using memory barriers since the worst that can happen is a fallback to the non HW accelerated code. */ detection_performed = true; } + /* this is the entry point. We should only do the bit flip once. It should not be done for the subfunctions and + * branches.*/ uint32_t crc = ~previousCrc32; /* For small input, forget about alignment checks - simply compute the CRC32c one byte at a time */ @@ -56,7 +61,7 @@ uint32_t aws_checksums_crc32c_hw(const uint8_t *input, int length, uint32_t prev #if defined(AWS_HAVE_AVX512_INTRINSICS) && (INTPTR_MAX == INT64_MAX) int chunk_size = length & ~63; - if (detected_avx512 && detected_clmul) { + if (detected_avx512 && detected_vpclmulqdq && detected_clmul) { if (length >= 256) { crc = aws_checksums_crc32c_avx512(input, length, crc); /* check remaining data */ diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index 8274a3a..6f402de 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -27,7 +27,6 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t AWS_ASSERT( length >= 256 && "invariant violated. 
length must be greater than 256 bytes to use avx512 to compute crc."); - fprintf(stderr, "Entered AVX512 branch."); uint32_t crc = ~previous_crc; /* * Definitions of the bit-reflected domain constants k1,k2,k3,k4,k5,k6 @@ -41,11 +40,11 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t * k6 = ( x ^ ( 128 - 32 ) mod P(x) << 32 )' << 1 */ - static zalign_8 k1k2[8] = - {0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; + static zalign_8 k1k2[8] = { + 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86}; - static zalign_8 k3k4[8] = - {0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; + static zalign_8 k3k4[8] = { + 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; static zalign_2 k5k6[2] = {0xf20c0dfe, 0x14cd00bd6}; static zalign_2 k7k8[2] = {0xdd45aab8, 0x000000000}; static zalign_2 poly[2] = {0x105ec76f1, 0xdea713f1}; diff --git a/source/intel/visualc/visualc_crc32c_sse42.c b/source/intel/visualc/visualc_crc32c_sse42.c index 79ed836..707f2ba 100644 --- a/source/intel/visualc/visualc_crc32c_sse42.c +++ b/source/intel/visualc/visualc_crc32c_sse42.c @@ -43,11 +43,11 @@ uint32_t aws_checksums_crc32c_sse42(const uint8_t *data, int length, uint32_t pr uint32_t remainder = length_to_process % sizeof(temp); while (slices--) { -# if defined(_M_X64) +#if defined(_M_X64) crc = (uint32_t)_mm_crc32_u64(crc, *temp++); -# else +#else crc = _mm_crc32_u32(crc, *temp++); -# endif +#endif } /* process the remaining parts that can't be done on the slice size. */ diff --git a/tests/crc_test.c b/tests/crc_test.c index a8e8f55..0d8cfeb 100644 --- a/tests/crc_test.c +++ b/tests/crc_test.c @@ -102,7 +102,6 @@ static int s_test_crc32c(struct aws_allocator *allocator, void *ctx) { res |= s_test_known_crc32c(CRC_FUNC_NAME(aws_checksums_crc32c)); res |= s_test_known_crc32c(CRC_FUNC_NAME(aws_checksums_crc32c_sw)); - struct aws_byte_buf avx_buf; /* enough for two avx512 runs */ aws_byte_buf_init(&avx_buf, allocator, 512); From a00a8e3c0d586bf799f3bd3d8ae0c22b0f4917e1 Mon Sep 17 00:00:00 2001 From: "Jonathan M. Henson" Date: Thu, 20 Jul 2023 14:25:19 -0700 Subject: [PATCH 17/18] work around nasty bitflipping logic. --- source/intel/asm/crc32c_sse42_asm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/source/intel/asm/crc32c_sse42_asm.c b/source/intel/asm/crc32c_sse42_asm.c index 7da4e50..bc79597 100644 --- a/source/intel/asm/crc32c_sse42_asm.c +++ b/source/intel/asm/crc32c_sse42_asm.c @@ -366,7 +366,10 @@ uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t p #else uint32_t aws_checksums_crc32c_sse42(const uint8_t *input, int length, uint32_t previousCrc32) { - return aws_checksums_crc32c_sw(input, length, previousCrc32); + /* these are nested in a larger computation. As a result the crc doesn't need to be bit flipped. + However, the sw function is also used as a standalone implementation that does need to do the + bit flip. So go ahead and flip it here, so the sw implementation flips it back. 
*/ + return aws_checksums_crc32c_sw(input, length, ~previousCrc32); } #endif /* clang-format on */ From 51384077aeacbc8613ca0223680a922aea4ec090 Mon Sep 17 00:00:00 2001 From: pbadari <107280494+pbadari@users.noreply.github.com> Date: Sun, 31 Dec 2023 18:34:58 -0800 Subject: [PATCH 18/18] =?UTF-8?q?Addressed=20review=20comments,=20use=20te?= =?UTF-8?q?rnary=20logic=20instructions=20and=20optimiz=E2=80=A6=20(#73)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- source/intel/intrin/crc32c_sse42_avx512.c | 121 +++++++--------------- 1 file changed, 37 insertions(+), 84 deletions(-) diff --git a/source/intel/intrin/crc32c_sse42_avx512.c b/source/intel/intrin/crc32c_sse42_avx512.c index 6f402de..837a1ba 100644 --- a/source/intel/intrin/crc32c_sse42_avx512.c +++ b/source/intel/intrin/crc32c_sse42_avx512.c @@ -21,11 +21,11 @@ AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2, 16); * length must be at least 256, and a multiple of 64. Based on: * * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction" - * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0 + * V. Gopal, E. Ozturk, et al., 2009, http://download.intel.com/design/intarch/papers/323102.pdf */ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t previous_crc) { AWS_ASSERT( - length >= 256 && "invariant violated. length must be greater than 256 bytes to use avx512 to compute crc."); + length >= 256 && "invariant violated. length must be greater than 255 bytes to use avx512 to compute crc."); uint32_t crc = ~previous_crc; /* @@ -45,12 +45,14 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t static zalign_8 k3k4[8] = { 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8}; - static zalign_2 k5k6[2] = {0xf20c0dfe, 0x14cd00bd6}; - static zalign_2 k7k8[2] = {0xdd45aab8, 0x000000000}; - static zalign_2 poly[2] = {0x105ec76f1, 0xdea713f1}; + static zalign_8 k9k10[8] = { + 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092, 0x6992cea2, 0x0d3b6092}; + static zalign_8 k1k4[8] = { + 0x1c291d04, 0xddc0152b, 0x3da6d0cb, 0xba4fc28e, 0xf20c0dfe, 0x493c7d27, 0x00000000, 0x00000000}; + __m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8; - __m128i a0, a1, a2, a3; + __m128i a1, a2; /* * There's at least one block of 256. @@ -86,39 +88,31 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t y7 = _mm512_loadu_si512((__m512i *)(input + 0x80)); y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0)); - x1 = _mm512_xor_si512(x1, x5); - x2 = _mm512_xor_si512(x2, x6); - x3 = _mm512_xor_si512(x3, x7); - x4 = _mm512_xor_si512(x4, x8); - - x1 = _mm512_xor_si512(x1, y5); - x2 = _mm512_xor_si512(x2, y6); - x3 = _mm512_xor_si512(x3, y7); - x4 = _mm512_xor_si512(x4, y8); + x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96); + x2 = _mm512_ternarylogic_epi64(x2, x6, y6, 0x96); + x3 = _mm512_ternarylogic_epi64(x3, x7, y7, 0x96); + x4 = _mm512_ternarylogic_epi64(x4, x8, y8, 0x96); input += 256; length -= 256; } /* - * Fold into 512-bits. + * Fold 256 bytes into 64 bytes. 
*/ - x0 = _mm512_load_si512((__m512i *)k3k4); - + x0 = _mm512_load_si512((__m512i *)k9k10); x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x2); - x1 = _mm512_xor_si512(x1, x5); + x6 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x3 = _mm512_ternarylogic_epi64(x3, x5, x6, 0x96); - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x3); - x1 = _mm512_xor_si512(x1, x5); + x7 = _mm512_clmulepi64_epi128(x2, x0, 0x00); + x8 = _mm512_clmulepi64_epi128(x2, x0, 0x11); + x4 = _mm512_ternarylogic_epi64(x4, x7, x8, 0x96); - x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); - x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x4); - x1 = _mm512_xor_si512(x1, x5); + x0 = _mm512_load_si512((__m512i *)k3k4); + y5 = _mm512_clmulepi64_epi128(x3, x0, 0x00); + y6 = _mm512_clmulepi64_epi128(x3, x0, 0x11); + x1 = _mm512_ternarylogic_epi64(x4, y5, y6, 0x96); /* * Single fold blocks of 64, if any. @@ -128,72 +122,31 @@ uint32_t aws_checksums_crc32c_avx512(const uint8_t *input, int length, uint32_t x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); - x1 = _mm512_xor_si512(x1, x2); - x1 = _mm512_xor_si512(x1, x5); + x1 = _mm512_ternarylogic_epi64(x1, x2, x5, 0x96); input += 64; length -= 64; } /* - * Fold 512-bits to 384-bits. + * Fold 512-bits to 128-bits. */ - a0 = _mm_load_si128((__m128i *)k5k6); - - a1 = _mm512_extracti32x4_epi32(x1, 0); - a2 = _mm512_extracti32x4_epi32(x1, 1); + x0 = _mm512_loadu_si512((__m512i *)k1k4); - a3 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_clmulepi64_si128(a1, a0, 0x11); - - a1 = _mm_xor_si128(a1, a3); - a1 = _mm_xor_si128(a1, a2); - - /* - * Fold 384-bits to 256-bits. - */ - a2 = _mm512_extracti32x4_epi32(x1, 2); - a3 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_clmulepi64_si128(a1, a0, 0x11); - a1 = _mm_xor_si128(a1, a3); - a1 = _mm_xor_si128(a1, a2); - - /* - * Fold 256-bits to 128-bits. - */ a2 = _mm512_extracti32x4_epi32(x1, 3); - a3 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_clmulepi64_si128(a1, a0, 0x11); - a1 = _mm_xor_si128(a1, a3); - a1 = _mm_xor_si128(a1, a2); - - /* - * Fold 128-bits to 64-bits. - */ - a2 = _mm_clmulepi64_si128(a1, a0, 0x10); - a3 = _mm_setr_epi32(~0, 0, ~0, 0); - a1 = _mm_srli_si128(a1, 8); - a1 = _mm_xor_si128(a1, a2); - - a0 = _mm_loadl_epi64((__m128i *)k7k8); - a2 = _mm_srli_si128(a1, 4); - a1 = _mm_and_si128(a1, a3); - a1 = _mm_clmulepi64_si128(a1, a0, 0x00); - a1 = _mm_xor_si128(a1, a2); - - /* - * Barret reduce to 32-bits. - */ - a0 = _mm_load_si128((__m128i *)poly); + x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00); + x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11); + x1 = _mm512_ternarylogic_epi64(x1, x5, _mm512_castsi128_si512(a2), 0x96); - a2 = _mm_and_si128(a1, a3); - a2 = _mm_clmulepi64_si128(a2, a0, 0x10); - a2 = _mm_and_si128(a2, a3); - a2 = _mm_clmulepi64_si128(a2, a0, 0x00); - a1 = _mm_xor_si128(a1, a2); + x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E); + x0 = _mm512_xor_epi64(x1, x0); + a1 = _mm512_extracti32x4_epi32(x0, 1); + a1 = _mm_xor_epi64(a1, _mm512_castsi512_si128(x0)); /* - * Return the crc32. + * Fold 128-bits to 32-bits. */ - return ~_mm_extract_epi32(a1, 1); + uint64_t val; + val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0)); + return (uint32_t) _mm_crc32_u64(val, _mm_extract_epi64(a1, 1)); }
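
A note on the 0x96 immediate used with _mm512_ternarylogic_epi64 in the final patch: the imm8 is a 3-input truth table indexed by one bit from each operand, and 0x96 encodes a ^ b ^ c, which is why each ternary-logic call replaces a pair of XORs from the earlier code. A standalone scalar check of that truth table, with no AVX-512 required; the helper name is illustrative only:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Evaluate a 3-input truth table (imm8) bit by bit, the way VPTERNLOGQ does:
 * for each bit position, the result bit is (imm8 >> ((a << 2) | (b << 1) | c)) & 1. */
static uint64_t ternarylogic_scalar(uint64_t a, uint64_t b, uint64_t c, uint8_t imm8) {
    uint64_t out = 0;
    for (int bit = 0; bit < 64; ++bit) {
        unsigned idx = (unsigned)((((a >> bit) & 1) << 2) | (((b >> bit) & 1) << 1) | ((c >> bit) & 1));
        out |= ((uint64_t)((imm8 >> idx) & 1)) << bit;
    }
    return out;
}

int main(void) {
    uint64_t a = 0xdcb17aa4b9e02b86ULL;
    uint64_t b = 0x740eef029e4addf8ULL;
    uint64_t c = 0x123456789abcdef0ULL;
    /* 0x96 is the truth table of a ^ b ^ c, so one ternary-logic op folds two XORs. */
    assert(ternarylogic_scalar(a, b, c, 0x96) == (a ^ b ^ c));
    printf("imm8 0x96 == three-way XOR\n");
    return 0;
}
```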
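
The final patch also drops the Barrett reduction: once the buffer has been folded down to the 128-bit remainder in a1, its two 64-bit lanes are simply run through the hardware CRC32 instruction starting from 0. Below is a minimal scalar model of that last step under the usual description of the SSE4.2 CRC32C instruction (byte at a time over the reflected Castagnoli polynomial 0x82F63B78, with no implicit pre/post inversion); the function names are illustrative, not part of the library:

```c
#include <stdint.h>

/* Bitwise model of one SSE4.2 CRC32C byte step. _mm_crc32_u64(crc, x) is
 * equivalent to feeding the 8 bytes of x, least significant first, through
 * this update. */
static uint32_t crc32c_update_byte(uint32_t crc, uint8_t byte) {
    crc ^= byte;
    for (int k = 0; k < 8; ++k) {
        crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1u)));
    }
    return crc;
}

/* Model of the tail of aws_checksums_crc32c_avx512 in the last hunk above:
 * the 128-bit remainder is consumed as two 64-bit lanes, starting from 0,
 * exactly as val = _mm_crc32_u64(0, lane0); _mm_crc32_u64(val, lane1). */
static uint32_t crc32c_fold_128_model(uint64_t lane0, uint64_t lane1) {
    uint32_t crc = 0;
    for (int i = 0; i < 8; ++i) {
        crc = crc32c_update_byte(crc, (uint8_t)(lane0 >> (8 * i)));
    }
    for (int i = 0; i < 8; ++i) {
        crc = crc32c_update_byte(crc, (uint8_t)(lane1 >> (8 * i)));
    }
    return crc;
}
```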