Add VPOPCNTDQ-based RaBitQ kernel for Sapphire Rapids+

mulugetam · mulugetam · commit db91a6c15357 · 2026-05-20T18:11:10.000Z
Add SIMDLevel::AVX512_SPR specializations of bitwise_and_dot_product,
bitwise_xor_dot_product, and popcount that use vpopcntq instead of the
shuffle-based lookup-table popcount. Used on SPR+ when built with
FAISS_OPT_LEVEL=avx512_spr or =dd; AVX-512-only builds are unchanged.

Recall is bit-identical. End-to-end ~1.07x on IndexIVFRaBitQ, ~1.4-1.6x
on flat RaBitQ scans (kernel-isolated). Measured on Sapphire Rapids
with benchs/bench_rabitq.py at d=256/512/768/1024.

Signed-off-by: Mulugeta Mammo &lt;mulugeta.mammo@intel.com&gt;
diff --git a/benchs/bench_rabitq.py b/benchs/bench_rabitq.py
@@ -10,21 +10,42 @@
 # NOTEBOOK_NUMBER: N7030784 (685760243832285)
 
 """:py"""
+import statistics
 import timeit
 from collections import defaultdict
 
 import faiss
 from faiss.contrib.datasets import SyntheticDataset
 
 """:py"""
-ds: SyntheticDataset = SyntheticDataset(256, 1_000_000, 1_000_000, 10_000)
+# Dimensions to sweep. The rabitq SIMD kernel
+# (bitwise_and_dot_product / bitwise_xor_dot_product / popcount) selects
+# its widest tier based on size = d / 8 bytes:
+#   d=256  -> size=32  -> 256-bit ymm only (no 512-bit work)
+#   d=512  -> size=64  -> 512-bit zmm, 1 iteration per bit-plane
+#   d=768  -> size=96  -> 512-bit zmm + 256-bit tail
+#   d=1024 -> size=128 -> 512-bit zmm only, 2 iterations per bit-plane
+# Sweeping these is useful for verifying the AVX512_SPR (vpopcntdq)
+# specialization in faiss/utils/simd_impl/rabitq_avx512_spr.cpp and for
+# profiling perf-record annotations across SIMD-width tiers.
+DIMENSIONS = [256, 512, 768, 1024]
 nlist: int = 1000
 qb: int = 8
+# Number of independent timing samples to take per (index, k, nprobe)
+# combination. Each sample is itself an average over `trials=10` calls
+# inside timeit, so total searches per row = ITERATIONS * 10. Using 3
+# samples is enough to flag whether differences across dimensions are
+# noise or real, while keeping the bench cheap.
+ITERATIONS: int = 3
 # This will contain <"index name", ([recalls],[speeds],[labels (the k)])>
 recall_speed_data = defaultdict(lambda: [[], [], []])
 # This will contain <"index name", ([recalls],[memory for this index])>
 recall_memory_data = defaultdict(lambda: [[], []])
 
+# Set when entering each per-d block below; used by helpers that close
+# over the active dataset.
+ds: SyntheticDataset = None  # type: ignore
+
 """:py"""
 # Helpers
 
@@ -62,6 +83,32 @@ def compute_recall(ground_truth_I, predicted_I):
     return recall
 
 
+def repeated_trials(trials_fn, *args, n=ITERATIONS, **kwargs):
+    """Run a trials function n times and return the list of per-iteration
+    average speeds (each in ms). Each call to trials_fn is itself an
+    average over multiple back-to-back searches, so the returned list
+    contains n independent samples of that average.
+    """
+    return [trials_fn(*args, **kwargs) for _ in range(n)]
+
+
+def summarize(samples):
+    """Return (mean, median, stdev) over a list of timing samples in ms.
+    stdev is the sample standard deviation (n-1); returns 0.0 for n==1
+    since stdev is undefined.
+    """
+    mean = statistics.mean(samples)
+    median = statistics.median(samples)
+    stdev = statistics.stdev(samples) if len(samples) > 1 else 0.0
+    return mean, median, stdev
+
+
+def fmt_speed(samples):
+    """Format a list of timing samples as 'mean=X median=Y stdev=Z'."""
+    mean, median, stdev = summarize(samples)
+    return f"mean={mean:.1f}ms median={median:.1f}ms stdev={stdev:.2f}ms"
+
+
 def create_index(ds, factory_string):
     index = faiss.index_factory(ds.d, factory_string)
     index.train(ds.get_train())
@@ -73,13 +120,14 @@ def create_index(ds, factory_string):
 def handle_index(prefix, index, ds, mem, k):
     gt_I = ds.get_groundtruth(k)
     _, I_res = index.search(ds.get_queries(), k)
-    avg_speed = trials(index, ds.get_queries(), k)
+    speed_samples = repeated_trials(trials, index, ds.get_queries(), k)
+    mean_speed, _, _ = summarize(speed_samples)
     recall = compute_recall(gt_I, I_res)
     print(
-        f"{prefix} recall@{k}: {recall}.  Average speed: {avg_speed:.1f}ms.  Memory: {mem/1e6:.3f}MB"
+        f"{prefix} recall@{k}: {recall}.  Speed: {fmt_speed(speed_samples)}.  Memory: {mem/1e6:.3f}MB"
     )
     recall_speed_data[prefix][0].append(recall)
-    recall_speed_data[prefix][1].append(avg_speed)
+    recall_speed_data[prefix][1].append(mean_speed)
     recall_speed_data[prefix][2].append(f"k={k}")
     recall_memory_data[prefix][0].append(recall)
     recall_memory_data[prefix][1].append(mem)
@@ -91,13 +139,16 @@ def handle_ivf_index(prefix, index, ds, mem, k, params):
     for nprobe in 4, 16, 32:
         params.nprobe = nprobe
         _, I_res = faiss.search_with_parameters(index, ds.get_queries(), k, params)
-        avg_speed = trials_ivf(index, ds.get_queries(), k, params)
+        speed_samples = repeated_trials(
+            trials_ivf, index, ds.get_queries(), k, params
+        )
+        mean_speed, _, _ = summarize(speed_samples)
         recall = compute_recall(gt_I, I_res)
         print(
-            f"{prefix} nprobe={nprobe}: recall@{k}: {recall}.  Average speed: {avg_speed:.1f}ms.  Memory: {mem/1e6:.3f}MB"
+            f"{prefix} nprobe={nprobe}: recall@{k}: {recall}.  Speed: {fmt_speed(speed_samples)}.  Memory: {mem/1e6:.3f}MB"
         )
         recall_speed_data[prefix][0].append(recall)
-        recall_speed_data[prefix][1].append(avg_speed)
+        recall_speed_data[prefix][1].append(mean_speed)
         recall_speed_data[prefix][2].append(f"k={k}, nprobe={nprobe}")
         recall_memory_data[prefix][0].append(recall)
         recall_memory_data[prefix][1].append(mem)
@@ -106,7 +157,7 @@ def handle_ivf_index(prefix, index, ds, mem, k, params):
 # pyre-ignore
 def vary_k_nprobe_measuring_recall_and_memory(prefix, index, ds, mem):
     classname = type(index).__name__
-    for k in 1, 10, 100:
+    for k in (100,):
         if classname in [
             "IndexRaBitQ",
             "IndexPQFastScan",
@@ -131,51 +182,69 @@ def vary_k_nprobe_measuring_recall_and_memory(prefix, index, ds, mem):
             handle_ivf_index(prefix, index, ds, mem, k, params)
 
 """:py '605360559215064'"""
-# IndexRaBitQ
-
-fac_s = "RaBitQ"
-non_ivf_rbq = faiss.index_factory(ds.d, fac_s)
-non_ivf_rbq.qb = qb
-non_ivf_rbq.train(ds.get_train())
-non_ivf_rbq.add(ds.get_database())
-mem = non_ivf_rbq.code_size * non_ivf_rbq.ntotal
-
-vary_k_nprobe_measuring_recall_and_memory(fac_s, non_ivf_rbq, ds, mem)
-
-del non_ivf_rbq
-
-""":py '3928150077498381'"""
-# IndexIVFRaBitQ with no random rotation
-
-fac_s = f"IVF{nlist},RaBitQ"
-rbq1 = faiss.index_factory(ds.d, fac_s)
-rbq1.qb = qb
-rbq1.train(ds.get_train())
-rbq1.add(ds.get_database())
-mem = rbq1.code_size * rbq1.ntotal
-
-vary_k_nprobe_measuring_recall_and_memory(fac_s, rbq1, ds, mem)
-
-del rbq1
-
-""":py '1484145352968190'"""
-# IndexIVFRaBitQ with random rotation
-
-fac_s = f"IVF{nlist},RaBitQ"
-rbq2 = faiss.index_factory(ds.d, fac_s)
-rbq2.qb = qb
-rrot = faiss.RandomRotationMatrix(ds.d, ds.d)
-rrot.init(123)
-index_pt = faiss.IndexPreTransform(rrot, rbq2)
-index_pt.train(ds.get_train())
-index_pt.add(ds.get_database())
-mem = rbq2.code_size * index_pt.ntotal
-
-vary_k_nprobe_measuring_recall_and_memory(fac_s + "_RROT", index_pt, ds, mem)
+# RaBitQ kernels swept across dimensions. Each iteration rebuilds the
+# dataset and the three rabitq index variants. Suffix _d{d} on the
+# result key keeps the per-dimension series distinct in the plots.
+
+for d in DIMENSIONS:
+    print(f"\n========== d={d} ==========")
+    # Dataset sized to keep the full 4-dimension sweep under ~10 minutes.
+    # nq=1k is enough for stable timeit averages across 10 trials; nb=200k
+    # keeps groundtruth (brute-force knn over xb) tractable at d=1024;
+    # nt=100k still satisfies the IVF k-means training-points floor for
+    # nlist=1000 (39 × 1000 = 39k minimum).
+    ds = SyntheticDataset(d, 100_000, 200_000, 1_000)
+
+    # IndexRaBitQ
+    fac_s = "RaBitQ"
+    non_ivf_rbq = faiss.index_factory(ds.d, fac_s)
+    non_ivf_rbq.qb = qb
+    non_ivf_rbq.train(ds.get_train())
+    non_ivf_rbq.add(ds.get_database())
+    mem = non_ivf_rbq.code_size * non_ivf_rbq.ntotal
+
+    vary_k_nprobe_measuring_recall_and_memory(f"{fac_s}_d{d}", non_ivf_rbq, ds, mem)
+
+    del non_ivf_rbq
+
+    # IndexIVFRaBitQ with no random rotation
+    fac_s = f"IVF{nlist},RaBitQ"
+    rbq1 = faiss.index_factory(ds.d, fac_s)
+    rbq1.qb = qb
+    rbq1.train(ds.get_train())
+    rbq1.add(ds.get_database())
+    mem = rbq1.code_size * rbq1.ntotal
+
+    vary_k_nprobe_measuring_recall_and_memory(f"{fac_s}_d{d}", rbq1, ds, mem)
+
+    del rbq1
+
+    # IndexIVFRaBitQ with random rotation
+    fac_s = f"IVF{nlist},RaBitQ"
+    rbq2 = faiss.index_factory(ds.d, fac_s)
+    rbq2.qb = qb
+    rrot = faiss.RandomRotationMatrix(ds.d, ds.d)
+    rrot.init(123)
+    index_pt = faiss.IndexPreTransform(rrot, rbq2)
+    index_pt.train(ds.get_train())
+    index_pt.add(ds.get_database())
+    mem = rbq2.code_size * index_pt.ntotal
+
+    vary_k_nprobe_measuring_recall_and_memory(
+        f"{fac_s}_RROT_d{d}", index_pt, ds, mem
+    )
 
-del index_pt
+    del index_pt
 
 """:py '644702398382829'"""
+# Non-rabitq baselines (SQ, PQfs, HNSW) below. These don't exercise the
+# rabitq SIMD kernels, so we don't sweep dimensions for them; instead
+# we pick one dimension and build them once. Change BASELINE_D if you
+# want a different working point, or comment out the cells below if
+# you only care about the rabitq sweep.
+BASELINE_D = 256
+ds = SyntheticDataset(BASELINE_D, 100_000, 200_000, 1_000)
+
 # IndexScalarQuantizer
 
 for M in [4, 6, 8]:
@@ -270,7 +339,7 @@ def vary_k_nprobe_measuring_recall_and_memory(prefix, index, ds, mem):
         speeds,
         linestyle=" ",
         marker="o",
-        color=colors[i],
+        color=colors[i % len(colors)],
         label=key,
         markersize=15,
     )
@@ -311,15 +380,15 @@ def vary_k_nprobe_measuring_recall_and_memory(prefix, index, ds, mem):
         mems,
         linestyle=" ",
         marker="o",
-        color=colors[i],
+        color=colors[i % len(colors)],
         label=key,
         markersize=10,
     )
 
     texts = []
     if i == 0:
-        texts.append(plt.text(recalls[0], mems[0], "RaBitQ"))
-        texts.append(plt.text(recalls[1], mems[1], "RaBitQ"))
+        for j in range(min(2, len(recalls))):
+            texts.append(plt.text(recalls[j], mems[j], "RaBitQ"))
     adjust_text(
         texts,
         arrowprops=dict(arrowstyle="-", color="black", lw=0.5),
diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
@@ -34,6 +34,14 @@ set(FAISS_SIMD_AVX512_SRC
   utils/simd_impl/rabitq_avx512.cpp
   utils/simd_impl/super_kmeans_kernels_avx512.cpp
 )
+# AVX-512 sources that additionally require AVX512_VPOPCNTDQ
+# (Sapphire Rapids and later). Compiled into faiss_avx512_spr only,
+# and into the DD faiss target with an extra per-file -mavx512vpopcntdq
+# flag. Not compiled into faiss_avx512 — that target stops at the
+# baseline AVX-512F/CD/VL/DQ/BW feature set.
+set(FAISS_SIMD_AVX512_SPR_SRC
+  utils/simd_impl/rabitq_avx512_spr.cpp
+)
 set(FAISS_SIMD_NEON_SRC
   impl/fast_scan/impl-neon.cpp
   impl/scalar_quantizer/sq-neon.cpp
@@ -61,7 +69,7 @@ set(FAISS_SIMD_RVV_SRC
 )
 # Select SIMD sources based on target architecture
 if(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64|amd64|AMD64)")
-  set(FAISS_SIMD_SRC ${FAISS_SIMD_AVX2_SRC} ${FAISS_SIMD_AVX512_SRC})
+  set(FAISS_SIMD_SRC ${FAISS_SIMD_AVX2_SRC} ${FAISS_SIMD_AVX512_SRC} ${FAISS_SIMD_AVX512_SPR_SRC})
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64|arm64|ARM64)")
   set(FAISS_SIMD_SRC ${FAISS_SIMD_NEON_SRC} ${FAISS_SIMD_SVE_SRC})
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(riscv64|riscv)")
@@ -467,7 +475,7 @@ else()
   # we need bigobj for the swig wrapper
   add_compile_options(/bigobj)
 endif()
-target_sources(faiss_avx512_spr PRIVATE ${FAISS_SIMD_AVX2_SRC} ${FAISS_SIMD_AVX512_SRC})
+target_sources(faiss_avx512_spr PRIVATE ${FAISS_SIMD_AVX2_SRC} ${FAISS_SIMD_AVX512_SRC} ${FAISS_SIMD_AVX512_SPR_SRC})
 target_compile_definitions(faiss_avx512_spr PRIVATE COMPILE_SIMD_AVX2 COMPILE_SIMD_AVX512 COMPILE_SIMD_AVX512_SPR )
 
 add_library(faiss_sve ${FAISS_SRC})
@@ -525,6 +533,14 @@ if(FAISS_OPT_LEVEL STREQUAL "dd")
         PROPERTIES COMPILE_OPTIONS
           "-mavx512f;-mavx512cd;-mavx512vl;-mavx512dq;-mavx512bw;-mfma;-mf16c;-mpopcnt"
       )
+      # SPR-only sources additionally require AVX512_VPOPCNTDQ.
+      # vpopcntq is the whole point of these files, so the extra flag
+      # is mandatory; without it the intrinsics fail to compile.
+      set_source_files_properties(${FAISS_SIMD_AVX512_SPR_SRC}
+        TARGET_DIRECTORY faiss
+        PROPERTIES COMPILE_OPTIONS
+          "-mavx512f;-mavx512cd;-mavx512vl;-mavx512dq;-mavx512bw;-mavx512vpopcntdq;-mfma;-mf16c;-mpopcnt"
+      )
     else()
       # Per-file SIMD flags (MSVC)
       add_compile_options(/bigobj)
@@ -536,6 +552,15 @@ if(FAISS_OPT_LEVEL STREQUAL "dd")
         TARGET_DIRECTORY faiss
         PROPERTIES COMPILE_OPTIONS "/arch:AVX512"
       )
+      # MSVC has no per-feature flag for VPOPCNTDQ; /arch:AVX512
+      # enables it as part of the AVX-512 feature set used by recent
+      # toolchains. (Newer MSVC supports __isa_available checks at
+      # runtime; the SPR specialization is gated by COMPILE_SIMD_AVX512_SPR
+      # at the source level, so this is safe.)
+      set_source_files_properties(${FAISS_SIMD_AVX512_SPR_SRC}
+        TARGET_DIRECTORY faiss
+        PROPERTIES COMPILE_OPTIONS "/arch:AVX512"
+      )
     endif()
   elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "(aarch64|arm64|ARM64)")
     # ARM NEON is always available on aarch64, no special compiler flags needed
diff --git a/faiss/impl/RaBitQUtils.cpp b/faiss/impl/RaBitQUtils.cpp
@@ -321,7 +321,7 @@ float compute_full_multibit_distance(
         size_t d,
         size_t ex_bits,
         MetricType metric_type) {
-    return with_selected_simd_levels<AVAILABLE_SIMD_LEVELS_A0>(
+    return with_selected_simd_levels<AVAILABLE_SIMD_LEVELS_A0_SPR>(
             [&]<SIMDLevel SL>() {
                 return compute_full_multibit_distance<SL>(
                         sign_bits,
diff --git a/faiss/impl/RaBitQuantizer.cpp b/faiss/impl/RaBitQuantizer.cpp
@@ -551,7 +551,13 @@ FlatCodesDistanceComputer* RaBitQuantizer::get_distance_computer(
     // Dispatch on SIMDLevel once here so the distance computer methods
     // call the SIMD-specialized rabitq functions directly (no per-call
     // with_simd_level overhead).
-    return with_selected_simd_levels<AVAILABLE_SIMD_LEVELS_A0>(
+    //
+    // Use A0_SPR (which includes AVX512_SPR) so that on Sapphire Rapids
+    // and later x86 microarchitectures the VPOPCNTDQ-based RaBitQ
+    // specialization in rabitq_avx512_spr.cpp is selected. On AVX-512
+    // CPUs without VPOPCNTDQ, dispatch falls through to the AVX512
+    // specialization in rabitq_avx512.cpp.
+    return with_selected_simd_levels<AVAILABLE_SIMD_LEVELS_A0_SPR>(
             [&]<SIMDLevel SL>() -> FlatCodesDistanceComputer* {
                 if (qb == 0) {
                     auto dc =
diff --git a/faiss/impl/simd_dispatch.h b/faiss/impl/simd_dispatch.h
@@ -36,6 +36,12 @@ constexpr int AVAILABLE_SIMD_LEVELS_AVX2_NEON = AVAILABLE_SIMD_LEVELS_NONE |
 constexpr int AVAILABLE_SIMD_LEVELS_A0 = AVAILABLE_SIMD_LEVELS_AVX2_NEON |
         (1 << int(SIMDLevel::AVX512)) | (1 << int(SIMDLevel::RISCV_RVV));
 
+// A0_SPR: same as A0 + AVX512_SPR (for functions with a dedicated SPR
+// specialization on top of an AVX512 fallback). Currently used by the
+// RaBitQ popcount kernels, which use VPOPCNTDQ on SPR+.
+constexpr int AVAILABLE_SIMD_LEVELS_A0_SPR =
+        AVAILABLE_SIMD_LEVELS_A0 | (1 << int(SIMDLevel::AVX512_SPR));
+
 // A1: same + ARM_SVE (for functions with dedicated SVE implementations)
 constexpr int AVAILABLE_SIMD_LEVELS_A1 =
         AVAILABLE_SIMD_LEVELS_A0 | (1 << int(SIMDLevel::ARM_SVE));
diff --git a/faiss/utils/simd_impl/rabitq_avx512_spr.cpp b/faiss/utils/simd_impl/rabitq_avx512_spr.cpp