@@ -740,10 +740,8 @@ SIMSIMD_MAKE_UMEYAMA(accurate, bf16, f64, f32, SIMSIMD_BF16_TO_F32) // simsimd_u
740740#if _SIMSIMD_TARGET_X86
741741#if SIMSIMD_TARGET_SKYLAKE
742742#pragma GCC push_options
743- #pragma GCC target("avx512f", "avx512vl", "avx512bw", "bmi2")
744- #pragma clang attribute push(__attribute__((target("avx512f,avx512vl,avx512bw,bmi2"))), apply_to = function)
745-
746- #include <immintrin.h>
743+ #pragma GCC target("avx512f", "avx512vl", "avx512bw", "avx512dq", "bmi2")
744+ #pragma clang attribute push(__attribute__((target("avx512f,avx512vl,avx512bw,avx512dq,bmi2"))), apply_to = function)
747745
748746/* Internal helper: Deinterleave 48 floats (16 xyz triplets) into separate x, y, z vectors.
749747 * Uses permutex2var shuffles instead of gather for ~1.8x speedup.
@@ -2143,8 +2141,6 @@ SIMSIMD_PUBLIC void simsimd_umeyama_f64_skylake(simsimd_f64_t const *a, simsimd_
21432141#pragma GCC target("avx2", "fma", "f16c", "bmi2")
21442142#pragma clang attribute push(__attribute__((target("avx2,fma,f16c,bmi2"))), apply_to = function)
21452143
2146- #include <immintrin.h>
2147-
21482144/* Internal helper: Deinterleave 24 floats (8 xyz triplets) into separate x, y, z vectors.
21492145 * Uses AVX2 gather instructions for clean stride-3 access.
21502146 *
0 commit comments