Wire 512-bit QBS kernels into fast scan DD dispatch (facebookresearch#5075)

algoriddle · meta-codesync[bot] · commit 9d567497ecae · 2026-04-21T01:15:23.000-07:00
Summary: Pull Request resolved: facebookresearch#5075 In DD mode, the QBS (bbs=32) accumulate path always used 256-bit kernels, even in the AVX512 per-ISA TU. The 512-bit kernels in kernels_simd512.h were dead because bare simdlib aliases resolve to _tpl<NONE> in DD mode, and 512-bit NONE types don't exist (empty primary templates). Fix: add function-local using declarations in both 512-bit kernel functions to bind types to explicit AVX512/AVX2 levels. Create accumulate_loops_512.h with FixedStorage512 (a non-virtual intermediate handler that bridges the AVX2→NONE type gap via storeu/loadu at the handler boundary) and the 512-bit QBS accumulate loop. Wire it into dispatching.h's ScannerMixIn behind an `if constexpr (use_avx512_qbs)` guard, which is true only for the AVX512 / AVX512_SPR dispatch levels. Reviewed By: mdouze Differential Revision: D100151879 fbshipit-source-id: b801f897f2d061a8448842f42edcdeb3a447eafd
diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
@@ -277,6 +277,7 @@ set(FAISS_HEADERS
   impl/lattice_Zn.h
   impl/platform_macros.h
   impl/fast_scan/accumulate_loops.h
+  impl/fast_scan/accumulate_loops_512.h
   impl/fast_scan/dispatching.h
   impl/fast_scan/fast_scan.h
   impl/fast_scan/decompose_qbs.h
diff --git a/faiss/impl/fast_scan/accumulate_loops.h b/faiss/impl/fast_scan/accumulate_loops.h
@@ -17,10 +17,10 @@
  *   - accumulate_q_4step_256 / pq4_accumulate_loop_qbs_fixed_scaler_256
  *     (QBS path, bbs == 32, 256-bit kernel only)
  *
- * The QBS helpers use pq4_kernel_qbs_256 exclusively (not decompose_qbs.h)
- * because decompose_qbs.h includes kernels_simd512.h which uses 512-bit
- * types that are empty primary templates when SINGLE_SIMD_LEVEL=NONE
- * (DD mode). SL-parameterizing the 512-bit kernels is future work.
+ * The QBS helpers here use pq4_kernel_qbs_256 exclusively (not
+ * decompose_qbs.h) because decompose_qbs.h includes kernels_simd512.h
+ * whose 512-bit types need explicit SIMD levels.  The 512-bit QBS path
+ * lives in accumulate_loops_512.h, used by the AVX512 per-ISA TU.
  *
  * All functions live in `namespace faiss` (not anonymous) so they can be
  * shared by both the per-SIMD TU dispatcher (dispatching.h) and the old
diff --git a/faiss/impl/fast_scan/accumulate_loops_512.h b/faiss/impl/fast_scan/accumulate_loops_512.h
@@ -0,0 +1,195 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+/**
+ * @file accumulate_loops_512.h
+ * @brief 512-bit QBS accumulation loop for AVX512 per-ISA TUs.
+ *
+ * Mirrors accumulate_loops.h's QBS path but uses pq4_kernel_qbs_512
+ * (from kernels_simd512.h) instead of pq4_kernel_qbs_256.
+ *
+ * The 512-bit kernels produce simd16uint16_tpl<AVX2> results (via
+ * combine4x2). The virtual SIMDResultHandler::handle() expects
+ * simd16uint16_tpl<NONE> in DD mode. FixedStorage512 bridges this gap:
+ * it stores AVX2-level results internally, then converts to the handler's
+ * level via storeu/loadu in to_other_handler().
+ *
+ * Only included from the AVX512 per-ISA TU (impl-avx512.cpp) via
+ * dispatching.h's conditional include.
+ */
+
+#if defined(COMPILE_SIMD_AVX512) && defined(__AVX512F__)
+
+#include <cassert>
+
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/fast_scan/accumulate_loops.h>
+#include <faiss/impl/fast_scan/kernels_simd512.h>
+#include <faiss/impl/fast_scan/simd_result_handlers.h>
+
+namespace faiss {
+
+using namespace simd_result_handlers;
+
+/***************************************************************
+ * FixedStorage512: non-virtual intermediate result storage
+ * for 512-bit kernels.
+ *
+ * Does NOT inherit from SIMDResultHandler — the virtual handle()
+ * signature is pinned to simd16uint16_tpl<NONE> in DD mode, but
+ * 512-bit kernels produce simd16uint16_tpl<AVX2>. By avoiding
+ * inheritance, handle() can accept AVX2-level types directly.
+ *
+ * The conversion to the outer handler's type happens in
+ * to_other_handler() via a store-to-memory roundtrip.
+ ***************************************************************/
+
+template <int NQ, int BB>
+struct FixedStorage512 {
+    using simd16uint16_avx2 = simd16uint16_tpl<SIMDLevel::AVX2>;
+    // Buffered kernel results: NQ rows of BB AVX2-level vectors each.
+    simd16uint16_avx2 dis[NQ][BB];
+    int i0 = 0; // row offset applied by handle(); see set_block_origin()
+    // Stores d0/d1 in row q + i0 at slots 2*b and 2*b + 1 (no bounds check).
+    void handle(
+            size_t q,
+            size_t b,
+            simd16uint16_avx2 d0,
+            simd16uint16_avx2 d1) {
+        dis[q + i0][2 * b] = d0;
+        dis[q + i0][2 * b + 1] = d1;
+    }
+    // Sets the row offset used by handle(); the second argument is ignored.
+    void set_block_origin(size_t i0_in, size_t) {
+        this->i0 = i0_in;
+    }
+    // Replays every buffered pair into other.handle(q, b / 2, ...).
+    template <class OtherResultHandler>
+    void to_other_handler(OtherResultHandler& other) const {
+        using handler_simd16 = simd16uint16_tpl<SINGLE_SIMD_LEVEL_256>;
+        for (int q = 0; q < NQ; q++) {
+            for (int b = 0; b < BB; b += 2) {
+                // Memory round-trip: AVX2 → handler level (NONE in DD mode)
+                ALIGNED(32) uint16_t buf0[16], buf1[16];
+                dis[q][b].storeu(buf0);
+                dis[q][b + 1].storeu(buf1);
+                handler_simd16 h0, h1;
+                h0.loadu(buf0);
+                h1.loadu(buf1);
+                other.handle(q, b / 2, h0, h1);
+            }
+        }
+    }
+};
+
+/***************************************************************
+ * QBS path: 512-bit kernel variants
+ ***************************************************************/
+
+template <int QBS, class ResultHandler, class Scaler>
+void accumulate_q_4step_512(
+        size_t ntotal2,
+        int nsq,
+        const uint8_t* codes,
+        const uint8_t* LUT0,
+        ResultHandler& res,
+        const Scaler& scaler,
+        size_t block_stride) {
+    constexpr int Q1 = QBS & 15; // QBS packs up to four 4-bit step sizes
+    constexpr int Q2 = (QBS >> 4) & 15;
+    constexpr int Q3 = (QBS >> 8) & 15;
+    constexpr int Q4 = (QBS >> 12) & 15;
+    constexpr int SQ = Q1 + Q2 + Q3 + Q4; // rows buffered in res2 per block
+    // Per-block body: captures `codes` by reference, ignores its argument.
+    for_each_block<32>(ntotal2, codes, block_stride, res, [&](size_t) {
+        FixedStorage512<SQ, 2> res2;
+        const uint8_t* LUT = LUT0; // every block restarts from LUT0
+        pq4_kernel_qbs_512<Q1>(nsq, codes, LUT, res2, scaler);
+        LUT += Q1 * nsq * 16; // advance to the table used by the next step
+        if (Q2 > 0) {
+            res2.set_block_origin(Q1, 0); // step 2 fills rows from Q1 on
+            pq4_kernel_qbs_512<Q2>(nsq, codes, LUT, res2, scaler);
+            LUT += Q2 * nsq * 16;
+        }
+        if (Q3 > 0) {
+            res2.set_block_origin(Q1 + Q2, 0); // rows from Q1 + Q2 on
+            pq4_kernel_qbs_512<Q3>(nsq, codes, LUT, res2, scaler);
+            LUT += Q3 * nsq * 16;
+        }
+        if (Q4 > 0) {
+            res2.set_block_origin(Q1 + Q2 + Q3, 0); // last step, last rows
+            pq4_kernel_qbs_512<Q4>(nsq, codes, LUT, res2, scaler);
+        }
+        res2.to_other_handler(res); // forward all SQ buffered rows to res
+    });
+}
+
+template <class ResultHandler, class Scaler>
+void pq4_accumulate_loop_qbs_fixed_scaler_512(
+        int qbs,
+        size_t ntotal2,
+        int nsq,
+        const uint8_t* codes,
+        const uint8_t* LUT0,
+        ResultHandler& res,
+        const Scaler& scaler,
+        size_t block_stride) {
+    assert(nsq % 2 == 0); // precondition: nsq is even
+    assert(is_aligned_pointer(codes));
+    assert(is_aligned_pointer(LUT0));
+    // Map the runtime qbs onto a compile-time QBS; every case returns.
+    switch (qbs) {
+#define FAISS_QBS512_DISPATCH(QBS)                                     \
+    case QBS:                                                          \
+        accumulate_q_4step_512<QBS>(                                   \
+                ntotal2, nsq, codes, LUT0, res, scaler, block_stride); \
+        return;
+        FAISS_QBS512_DISPATCH(0x3333); // 12
+        FAISS_QBS512_DISPATCH(0x2333); // 11
+        FAISS_QBS512_DISPATCH(0x2233); // 10
+        FAISS_QBS512_DISPATCH(0x333);  // 9
+        FAISS_QBS512_DISPATCH(0x2223); // 9
+        FAISS_QBS512_DISPATCH(0x233);  // 8
+        FAISS_QBS512_DISPATCH(0x1223); // 8
+        FAISS_QBS512_DISPATCH(0x223);  // 7
+        FAISS_QBS512_DISPATCH(0x34);   // 7
+        FAISS_QBS512_DISPATCH(0x133);  // 7
+        FAISS_QBS512_DISPATCH(0x6);    // 6
+        FAISS_QBS512_DISPATCH(0x33);   // 6
+        FAISS_QBS512_DISPATCH(0x123);  // 6
+        FAISS_QBS512_DISPATCH(0x222);  // 6
+        FAISS_QBS512_DISPATCH(0x23);   // 5
+        FAISS_QBS512_DISPATCH(0x5);    // 5
+        FAISS_QBS512_DISPATCH(0x13);   // 4
+        FAISS_QBS512_DISPATCH(0x22);   // 4
+        FAISS_QBS512_DISPATCH(0x4);    // 4
+        FAISS_QBS512_DISPATCH(0x3);    // 3
+        FAISS_QBS512_DISPATCH(0x21);   // 3
+        FAISS_QBS512_DISPATCH(0x2);    // 2
+        FAISS_QBS512_DISPATCH(0x1);    // 1
+#undef FAISS_QBS512_DISPATCH
+    }
+    // Only reached when qbs matched none of the cases above.
+    // Fallback for QBS values with no case above: use the 256-bit path with
+    // NONE-level scalers for type compatibility. Expected to be rare, since
+    // pq4_preferred_qbs() is meant to produce only the values listed above.
+    if constexpr (Scaler::nscale == 0) { // this branch reads no scale_int
+        DummyScaler<> scaler_none;
+        pq4_accumulate_loop_qbs_fixed_scaler_256(
+                qbs, ntotal2, nsq, codes, LUT0, res, scaler_none, block_stride);
+    } else { // rebuild a default-level scaler from scaler.scale_int
+        NormTableScaler<> scaler_none(scaler.scale_int);
+        pq4_accumulate_loop_qbs_fixed_scaler_256(
+                qbs, ntotal2, nsq, codes, LUT0, res, scaler_none, block_stride);
+    }
+}
+
+} // namespace faiss
+
+#endif // COMPILE_SIMD_AVX512 && __AVX512F__
diff --git a/faiss/impl/fast_scan/dispatching.h b/faiss/impl/fast_scan/dispatching.h
@@ -21,9 +21,8 @@
  *   #include <faiss/impl/fast_scan/dispatching.h>
  *
  * Kernel helpers come from accumulate_loops.h (search_1 multi-BB path
- * and QBS 256-bit path). The QBS helpers use pq4_kernel_qbs_256 only
- * because decompose_qbs.h pulls in 512-bit types that fail with
- * SINGLE_SIMD_LEVEL=NONE in DD mode.
+ * and QBS 256-bit path) and accumulate_loops_512.h (QBS 512-bit path,
+ * AVX512 TU only).
  */
 
 #ifndef THE_LEVEL_TO_DISPATCH
@@ -35,6 +34,10 @@
 #include <faiss/impl/fast_scan/accumulate_loops.h>
 #include <faiss/impl/fast_scan/fast_scan.h>
 
+#if defined(COMPILE_SIMD_AVX512) && defined(__AVX512F__)
+#include <faiss/impl/fast_scan/accumulate_loops_512.h>
+#endif
+
 namespace faiss {
 
 using namespace simd_result_handlers;
@@ -101,14 +104,62 @@ struct ScannerMixIn : FastScanCodeScanner {
             const uint8_t* LUT,
             int pq2x4_scale,
             size_t block_stride) override {
-        if (pq2x4_scale) {
-            NormTableScaler<> scaler(pq2x4_scale);
-            pq4_accumulate_loop_qbs_fixed_scaler_256(
-                    qbs, nb, nsq, codes, LUT, handler_, scaler, block_stride);
+#if defined(COMPILE_SIMD_AVX512) && defined(__AVX512F__)
+        constexpr bool use_avx512_qbs =
+                (THE_LEVEL_TO_DISPATCH == SIMDLevel::AVX512 ||
+                 THE_LEVEL_TO_DISPATCH == SIMDLevel::AVX512_SPR);
+#else
+        constexpr bool use_avx512_qbs = false;
+#endif
+        if constexpr (use_avx512_qbs) {
+            // Use 512-bit QBS kernels with properly-leveled scalers.
+            if (pq2x4_scale) {
+                NormTableScaler<THE_LEVEL_TO_DISPATCH> scaler(pq2x4_scale);
+                pq4_accumulate_loop_qbs_fixed_scaler_512(
+                        qbs,
+                        nb,
+                        nsq,
+                        codes,
+                        LUT,
+                        handler_,
+                        scaler,
+                        block_stride);
+            } else {
+                DummyScaler<THE_LEVEL_TO_DISPATCH> dummy;
+                pq4_accumulate_loop_qbs_fixed_scaler_512(
+                        qbs,
+                        nb,
+                        nsq,
+                        codes,
+                        LUT,
+                        handler_,
+                        dummy,
+                        block_stride);
+            }
         } else {
-            DummyScaler<> dummy;
-            pq4_accumulate_loop_qbs_fixed_scaler_256(
-                    qbs, nb, nsq, codes, LUT, handler_, dummy, block_stride);
+            if (pq2x4_scale) {
+                NormTableScaler<> scaler(pq2x4_scale);
+                pq4_accumulate_loop_qbs_fixed_scaler_256(
+                        qbs,
+                        nb,
+                        nsq,
+                        codes,
+                        LUT,
+                        handler_,
+                        scaler,
+                        block_stride);
+            } else {
+                DummyScaler<> dummy;
+                pq4_accumulate_loop_qbs_fixed_scaler_256(
+                        qbs,
+                        nb,
+                        nsq,
+                        codes,
+                        LUT,
+                        handler_,
+                        dummy,
+                        block_stride);
+            }
         }
     }
 };
diff --git a/faiss/impl/fast_scan/kernels_simd512.h b/faiss/impl/fast_scan/kernels_simd512.h
@@ -30,6 +30,13 @@ void kernel_accumulate_block_avx512_nq1(
         const uint8_t* LUT,
         ResultHandler& res,
         const Scaler& scaler) {
+    // Explicit SIMD levels for DD mode where bare aliases resolve to NONE
+    // (512-bit NONE types don't exist — empty primary templates).
+    using simd32uint16 = simd32uint16_tpl<SIMDLevel::AVX512>;
+    using simd64uint8 = simd64uint8_tpl<SIMDLevel::AVX512>;
+    using simd16uint16 = simd16uint16_tpl<SIMDLevel::AVX2>;
+    using simd32uint8 = simd32uint8_tpl<SIMDLevel::AVX2>;
+
     // NQ is kept in order to match the similarity to baseline function
     constexpr int NQ = 1;
     // distance accumulators. We can accept more for NQ=1
@@ -291,6 +298,12 @@ void kernel_accumulate_block_avx512_nqx(
         const uint8_t* LUT,
         ResultHandler& res,
         const Scaler& scaler) {
+    // Explicit SIMD levels for DD mode (see nq1 variant for explanation).
+    using simd32uint16 = simd32uint16_tpl<SIMDLevel::AVX512>;
+    using simd64uint8 = simd64uint8_tpl<SIMDLevel::AVX512>;
+    using simd16uint16 = simd16uint16_tpl<SIMDLevel::AVX2>;
+    using simd32uint8 = simd32uint8_tpl<SIMDLevel::AVX2>;
+
     // dummy alloc to keep the windows compiler happy
     constexpr int NQA = NQ > 0 ? NQ : 1;
     // distance accumulators