Add FastScanCodeScanner dispatch boundary with per-SIMD TUs

algoriddle · meta-codesync[bot] · commit b19e9bab89a7 · 2026-03-11T05:58:47.000-07:00
Summary:
Add `FastScanCodeScanner`, a polymorphic base class (virtual interface)
that bundles a result handler and its accumulation kernel behind the SIMD
dispatch boundary. In dynamic-dispatch (DD) mode, `SINGLE_SIMD_LEVEL = NONE`,
so the existing fast-scan code path uses emulated SIMD types. The new
scanner provides per-SIMD translation units (TUs) for AVX2, AVX512 and
ARM_NEON, each compiled with the correct ISA flags, and a factory function
(`make_fast_scan_knn_scanner`) that uses `DISPATCH_SIMDLevel` to select
the right TU at runtime.

This follows the proven `THE_LEVEL_TO_DISPATCH` pattern from the scalar
quantizer per-SIMD TUs (`sq-dispatch.h`). Each per-SIMD TU includes
`dispatching.h` which provides:
- `ScannerMixIn<Handler>`: wraps a concrete handler and calls accumulation
  kernels (both search_1 multi-BB and QBS paths)
- Factory specialization `make_fast_scan_scanner_impl<SL>()` with
  combinatorial dispatch over `is_max × with_id_map × handler_type`
  (SingleResultHandler for k=1; otherwise HeapHandler for even `impl`,
  nominally k≤20, and ReservoirHandler for odd `impl`, nominally k>20)

New files:
- `impl/fast_scan/accumulate_loops.h` — shared search_1 accumulation loops
- `impl/fast_scan/dispatching.h` — dispatch template header
- `impl/fast_scan/impl-avx2.cpp` — AVX2 per-SIMD TU
- `impl/fast_scan/impl-avx512.cpp` — AVX512 per-SIMD TU
- `impl/fast_scan/impl-neon.cpp` — ARM NEON per-SIMD TU (with ARM_SVE forwarding)

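Modified files:
- `impl/fast_scan/pq4_fast_scan.h` — FastScanCodeScanner base + factory decl
- `impl/fast_scan/pq4_fast_scan.cpp` — NONE specialization + dispatch wrapper
- `impl/fast_scan/decompose_qbs.h` — use the 512-bit QBS kernel only when
  `SINGLE_SIMD_LEVEL` is AVX512 or AVX512_SPR
- `impl/fast_scan/pq4_fast_scan_search_1.cpp` — search_1 path; shares
  helpers with `accumulate_loops.h`
- `xplat.bzl` / `CMakeLists.txt` — register SIMD files and headers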

Note: RaBitQ handler is not wired through FastScanCodeScanner in this
diff. That comes in later diffs when callers are switched.

Differential Revision: D95950483
diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
@@ -9,16 +9,19 @@
 # Architecture-specific: only include files for the current build architecture
 # =============================================================================
 set(FAISS_SIMD_AVX2_SRC
+  impl/fast_scan/impl-avx2.cpp
   impl/pq_code_distance/pq_code_distance-avx2.cpp
   impl/scalar_quantizer/sq-avx2.cpp
   utils/simd_impl/distances_avx2.cpp
 )
 set(FAISS_SIMD_AVX512_SRC
+  impl/fast_scan/impl-avx512.cpp
   impl/pq_code_distance/pq_code_distance-avx512.cpp
   impl/scalar_quantizer/sq-avx512.cpp
   utils/simd_impl/distances_avx512.cpp
 )
 set(FAISS_SIMD_NEON_SRC
+  impl/fast_scan/impl-neon.cpp
   impl/scalar_quantizer/sq-neon.cpp
   utils/simd_impl/distances_aarch64.cpp
 )
@@ -262,6 +265,8 @@ set(FAISS_HEADERS
   impl/kmeans1d.h
   impl/lattice_Zn.h
   impl/platform_macros.h
+  impl/fast_scan/accumulate_loops.h
+  impl/fast_scan/dispatching.h
   impl/fast_scan/pq4_fast_scan.h
   impl/fast_scan/decompose_qbs.h
   impl/fast_scan/kernels_simd256.h
diff --git a/faiss/impl/fast_scan/accumulate_loops.h b/faiss/impl/fast_scan/accumulate_loops.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+/**
+ * @file accumulate_loops.h
+ * @brief Shared accumulation loop helpers for fast-scan search paths.
+ *
+ * Contains the search_1 multi-BB accumulation loop (bbs > 32):
+ *   - accumulate_fixed_blocks / pq4_accumulate_loop_fixed_scaler
+ *
+ * The QBS path (bbs == 32) is in decompose_qbs.h.
+ *
+ * All functions live in `namespace faiss` (not anonymous) so they can be
+ * shared by both the per-SIMD TU dispatcher (dispatching.h) and the old
+ * free-function search paths (pq4_fast_scan_search_1.cpp).
+ *
+ * No QBS helpers are defined here. The per-SIMD TUs (dispatching.h) take
+ * the QBS path from decompose_qbs.h, whose kernel_accumulate_block falls
+ * back to pq4_kernel_qbs_256 unless SINGLE_SIMD_LEVEL is AVX512 or
+ * AVX512_SPR, so the 512-bit kernel is not used when it is NONE.
+ */
+
+#include <cassert>
+
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/fast_scan/LookupTableScaler.h>
+#include <faiss/impl/fast_scan/kernels_simd256.h>
+#include <faiss/impl/fast_scan/simd_result_handlers.h>
+
+namespace faiss {
+
+using namespace simd_result_handlers;
+
+/***************************************************************
+ * Search_1 path helpers (multi-BB kernel, bbs > 32)
+ ***************************************************************/
+
+template <int NQ, int BB, class ResultHandler, class Scaler>
+void accumulate_fixed_blocks( // block loop for one compile-time (NQ, BB)
+        size_t nb, // loop bound for j0, which advances by 32 * BB
+        int nsq, // passed through to kernel_accumulate_block
+        const uint8_t* codes, // current block; advanced by block_stride
+        const uint8_t* LUT, // passed through to kernel_accumulate_block
+        ResultHandler& res, // receives every block's results
+        const Scaler& scaler, // passed through to kernel_accumulate_block
+        size_t block_stride) { // byte step applied to codes per iteration
+    constexpr int bbs = 32 * BB; // loop step derived from BB
+    for (size_t j0 = 0; j0 < nb; j0 += bbs) {
+        FixedStorageHandler<NQ, 2 * BB> res2; // fresh scratch per block
+        kernel_accumulate_block<NQ, BB>(nsq, codes, LUT, res2, scaler);
+        res.set_block_origin(0, j0); // origin is set before the flush below
+        res2.to_other_handler(res); // hand the block's results to res
+        codes += block_stride;
+    }
+}
+
+template <class ResultHandler, class Scaler>
+void pq4_accumulate_loop_fixed_scaler( // runtime (nq, bbs) -> <NQ, BB>
+        int nq, // must match one of the NQ values dispatched below
+        size_t nb, // must be a multiple of bbs
+        int bbs, // must be a multiple of 32
+        int nsq, // forwarded to accumulate_fixed_blocks
+        const uint8_t* codes, // must pass is_aligned_pointer
+        const uint8_t* LUT, // must pass is_aligned_pointer
+        ResultHandler& res, // forwarded to accumulate_fixed_blocks
+        const Scaler& scaler, // forwarded to accumulate_fixed_blocks
+        size_t block_stride) { // forwarded to accumulate_fixed_blocks
+    FAISS_THROW_IF_NOT(is_aligned_pointer(codes));
+    FAISS_THROW_IF_NOT(is_aligned_pointer(LUT));
+    FAISS_THROW_IF_NOT(bbs % 32 == 0); // so that BB = bbs / 32 is exact
+    FAISS_THROW_IF_NOT(nb % bbs == 0);
+
+#define FAISS_ACCLOOP_DISPATCH(NQ, BB)                           \
+    case NQ * 1000 + BB:                                         \
+        accumulate_fixed_blocks<NQ, BB>(                         \
+                nb, nsq, codes, LUT, res, scaler, block_stride); \
+        break
+
+    switch (nq * 1000 + bbs / 32) { // key packs nq with BB = bbs / 32
+        FAISS_ACCLOOP_DISPATCH(1, 1);
+        FAISS_ACCLOOP_DISPATCH(1, 2);
+        FAISS_ACCLOOP_DISPATCH(1, 3);
+        FAISS_ACCLOOP_DISPATCH(1, 4);
+        FAISS_ACCLOOP_DISPATCH(1, 5);
+        FAISS_ACCLOOP_DISPATCH(2, 1);
+        FAISS_ACCLOOP_DISPATCH(2, 2);
+        FAISS_ACCLOOP_DISPATCH(3, 1);
+        FAISS_ACCLOOP_DISPATCH(4, 1);
+        default: // any other (nq, bbs) pair has no instantiation
+            FAISS_THROW_FMT("nq=%d bbs=%d not instantiated", nq, bbs);
+    }
+#undef FAISS_ACCLOOP_DISPATCH
+}
+
+} // namespace faiss
diff --git a/faiss/impl/fast_scan/decompose_qbs.h b/faiss/impl/fast_scan/decompose_qbs.h
@@ -19,8 +19,12 @@ namespace faiss {
 using namespace simd_result_handlers;
 
 /*
- * Unified kernel: selects 256-bit vs 512-bit path based on
- * compile-time __AVX512F__ guard.
+ * Unified kernel: selects 256-bit vs 512-bit path.
+ *
+ * Static AVX512 build: SINGLE_SIMD_LEVEL is AVX512 or AVX512_SPR, so the
+ * 512-bit kernel is used. In a DD-mode AVX512 TU, __AVX512F__ is defined
+ * (compiler flags) but SINGLE_SIMD_LEVEL == NONE (handlers use emulated
+ * types), so we fall through to the 256-bit kernel. This is intentional.
  */
 template <int NQ, class ResultHandler, class Scaler>
 void kernel_accumulate_block(
@@ -30,7 +34,13 @@ void kernel_accumulate_block(
         ResultHandler& res,
         const Scaler& scaler) {
 #ifdef __AVX512F__
-    pq4_kernel_qbs_512<NQ>(nsq, codes, LUT, res, scaler);
+    if constexpr (
+            SINGLE_SIMD_LEVEL == SIMDLevel::AVX512 ||
+            SINGLE_SIMD_LEVEL == SIMDLevel::AVX512_SPR) {
+        pq4_kernel_qbs_512<NQ>(nsq, codes, LUT, res, scaler);
+    } else {
+        pq4_kernel_qbs_256<NQ>(nsq, codes, LUT, res, scaler);
+    }
 #else
     pq4_kernel_qbs_256<NQ>(nsq, codes, LUT, res, scaler);
 #endif
diff --git a/faiss/impl/fast_scan/dispatching.h b/faiss/impl/fast_scan/dispatching.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+/**
+ * @file dispatching.h
+ * @brief Per-SIMD TU dispatch template for fast scan.
+ *
+ * This header is included once per SIMD TU with THE_LEVEL_TO_DISPATCH
+ * set to the desired SIMDLevel. It provides:
+ *   - ScannerMixIn: wraps a handler + calls kernel at the TU's SIMD level
+ *   - make_fast_scan_scanner_impl<SL>: factory specialization
+ *
+ * Usage (in a per-SIMD .cpp file):
+ *   #define THE_LEVEL_TO_DISPATCH SIMDLevel::AVX2
+ *   #include <faiss/impl/fast_scan/dispatching.h>
+ *
+ * Kernel helpers come from accumulate_loops.h (search_1 multi-BB path)
+ * and decompose_qbs.h (QBS path, with if-constexpr guard for 512-bit).
+ */
+
+#ifndef THE_LEVEL_TO_DISPATCH
+#error "Define THE_LEVEL_TO_DISPATCH before including this header"
+#endif
+
+#include <memory>
+
+#include <faiss/impl/fast_scan/accumulate_loops.h>
+#include <faiss/impl/fast_scan/decompose_qbs.h>
+#include <faiss/impl/fast_scan/pq4_fast_scan.h>
+
+namespace faiss {
+
+using namespace simd_result_handlers;
+
+/***************************************************************
+ * ScannerMixIn: holds a concrete Handler by value and runs the
+ * accumulation loops on it. Lives behind the virtual FastScanCodeScanner
+ * interface so callers don't need to know the handler type.
+ ***************************************************************/
+
+template <class Handler>
+struct ScannerMixIn : FastScanCodeScanner {
+    Handler handler_; // both loops below accumulate into this member
+
+    template <typename... Args>
+    explicit ScannerMixIn(Args&&... args) // args go to Handler's constructor
+            : handler_(std::forward<Args>(args)...) {}
+
+    SIMDResultHandlerToFloat* handler() override { // non-owning pointer
+        return &handler_;
+    }
+
+    void accumulate_loop( // forwards to pq4_accumulate_loop_fixed_scaler
+            int nq,
+            size_t nb,
+            int bbs,
+            int nsq,
+            const uint8_t* codes,
+            const uint8_t* LUT,
+            int pq2x4_scale, // non-zero: NormTableScaler, 0: DummyScaler
+            size_t block_stride) override {
+        if (pq2x4_scale) {
+            NormTableScaler<> scaler(pq2x4_scale);
+            pq4_accumulate_loop_fixed_scaler(
+                    nq,
+                    nb,
+                    bbs,
+                    nsq,
+                    codes,
+                    LUT,
+                    handler_,
+                    scaler,
+                    block_stride);
+        } else {
+            DummyScaler<> dummy; // same call as above, scaler type differs
+            pq4_accumulate_loop_fixed_scaler(
+                    nq,
+                    nb,
+                    bbs,
+                    nsq,
+                    codes,
+                    LUT,
+                    handler_,
+                    dummy,
+                    block_stride);
+        }
+    }
+
+    void accumulate_loop_qbs( // forwards to the QBS fixed-scaler loop
+            int qbs,
+            size_t nb,
+            int nsq,
+            const uint8_t* codes,
+            const uint8_t* LUT,
+            int pq2x4_scale, // non-zero: NormTableScaler, 0: DummyScaler
+            size_t block_stride) override {
+        if (pq2x4_scale) {
+            NormTableScaler<> scaler(pq2x4_scale);
+            pq4_accumulate_loop_qbs_fixed_scaler(
+                    qbs, nb, nsq, codes, LUT, handler_, scaler, block_stride);
+        } else {
+            DummyScaler<> dummy; // same call as above, scaler type differs
+            pq4_accumulate_loop_qbs_fixed_scaler(
+                    qbs, nb, nsq, codes, LUT, handler_, dummy, block_stride);
+        }
+    }
+};
+
+/***************************************************************
+ * Factory specialization for THE_LEVEL_TO_DISPATCH.
+ *
+ * Dispatch: is_max (CMax/CMin) × with_id_map (int64_t/int) × handler
+ *   k == 1:    SingleResultHandler (checked before impl parity)
+ *   impl even: HeapHandler
+ *   impl odd:  ReservoirHandler (capacity = 2*k)
+ ***************************************************************/
+
+template <>
+std::unique_ptr<FastScanCodeScanner> make_fast_scan_scanner_impl<
+        THE_LEVEL_TO_DISPATCH>( // level is fixed by the including TU
+        bool is_max, // true: CMax comparator, false: CMin
+        int impl, // only its parity is read here
+        size_t nq, // forwarded to the handler constructor
+        size_t ntotal, // forwarded to the handler constructor
+        int64_t k, // 1 selects SingleResultHandler
+        float* distances, // forwarded to the handler constructor
+        int64_t* ids, // forwarded to the handler constructor
+        const IDSelector* sel, // forwarded to the handler constructor
+        bool with_id_map) { // picks the comparator id type and the W flag
+    // C++20 templated lambda: picks the handler for comparator C, id-map W
+    auto make = [&]<class C, bool W>() -> std::unique_ptr<FastScanCodeScanner> {
+        if (k == 1) { // tested first, so k == 1 ignores impl parity
+            using H = SingleResultHandler<C, W>;
+            return std::make_unique<ScannerMixIn<H>>(
+                    nq, ntotal, distances, ids, sel); // no k argument
+        } else if (impl % 2 == 0) {
+            using H = HeapHandler<C, W>;
+            return std::make_unique<ScannerMixIn<H>>(
+                    nq, ntotal, k, distances, ids, sel);
+        } else {
+            using H = ReservoirHandler<C, W>; // gets k and 2 * k as size_t
+            return std::make_unique<ScannerMixIn<H>>(
+                    nq, ntotal, size_t(k), size_t(2 * k), distances, ids, sel);
+        }
+    };
+
+    if (is_max) { // CMax branch
+        if (with_id_map) { // int64_t id type, W = true
+            return make.template operator()<CMax<uint16_t, int64_t>, true>();
+        } else { // int id type, W = false
+            return make.template operator()<CMax<uint16_t, int>, false>();
+        }
+    } else { // CMin branch
+        if (with_id_map) { // int64_t id type, W = true
+            return make.template operator()<CMin<uint16_t, int64_t>, true>();
+        } else { // int id type, W = false
+            return make.template operator()<CMin<uint16_t, int>, false>();
+        }
+    }
+}
+
+} // namespace faiss
diff --git a/faiss/impl/fast_scan/impl-avx2.cpp b/faiss/impl/fast_scan/impl-avx2.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifdef COMPILE_SIMD_AVX2
+
+#define THE_LEVEL_TO_DISPATCH SIMDLevel::AVX2
+#include <faiss/impl/fast_scan/dispatching.h> // IWYU pragma: keep
+
+#endif // COMPILE_SIMD_AVX2
diff --git a/faiss/impl/fast_scan/impl-avx512.cpp b/faiss/impl/fast_scan/impl-avx512.cpp
@@ -0,0 +1,13 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifdef COMPILE_SIMD_AVX512
+
+#define THE_LEVEL_TO_DISPATCH SIMDLevel::AVX512
+#include <faiss/impl/fast_scan/dispatching.h> // IWYU pragma: keep
+
+#endif // COMPILE_SIMD_AVX512
diff --git a/faiss/impl/fast_scan/impl-neon.cpp b/faiss/impl/fast_scan/impl-neon.cpp
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifdef COMPILE_SIMD_ARM_NEON
+
+#define THE_LEVEL_TO_DISPATCH SIMDLevel::ARM_NEON
+#include <faiss/impl/fast_scan/dispatching.h> // IWYU pragma: keep
+
+// ARM_SVE: forward to ARM_NEON implementation until a dedicated SVE
+// specialization is written (same pattern as scalar_quantizer/sq-neon.cpp).
+#ifdef COMPILE_SIMD_ARM_SVE
+
+namespace faiss {
+
+template <>
+std::unique_ptr<FastScanCodeScanner> make_fast_scan_scanner_impl<
+        SIMDLevel::ARM_SVE>( // ARM_SVE reuses the ARM_NEON specialization
+        bool is_max,
+        int impl,
+        size_t nq,
+        size_t ntotal,
+        int64_t k,
+        float* distances,
+        int64_t* ids,
+        const IDSelector* sel,
+        bool with_id_map) {
+    return make_fast_scan_scanner_impl<SIMDLevel::ARM_NEON>( // pure forward
+            is_max, impl, nq, ntotal, k, distances, ids, sel, with_id_map);
+}
+
+} // namespace faiss
+
+#endif // COMPILE_SIMD_ARM_SVE
+
+#endif // COMPILE_SIMD_ARM_NEON
diff --git a/faiss/impl/fast_scan/pq4_fast_scan.cpp b/faiss/impl/fast_scan/pq4_fast_scan.cpp
diff --git a/faiss/impl/fast_scan/pq4_fast_scan.h b/faiss/impl/fast_scan/pq4_fast_scan.h
diff --git a/faiss/impl/fast_scan/pq4_fast_scan_search_1.cpp b/faiss/impl/fast_scan/pq4_fast_scan_search_1.cpp