facebookresearch
diff --git a/‎.github/workflows/build-pull-request.yml‎
Lines changed: 27 additions & 0 deletions b/‎.github/workflows/build-pull-request.yml‎
Lines changed: 27 additions & 0 deletions
diff --git a/‎.github/workflows/index-io-backward-compatibility.yml‎
Lines changed: 13 additions & 36 deletions b/‎.github/workflows/index-io-backward-compatibility.yml‎
Lines changed: 13 additions & 36 deletions
diff --git a/‎INSTALL.md‎
Lines changed: 7 additions & 5 deletions b/‎INSTALL.md‎
Lines changed: 7 additions & 5 deletions
diff --git a/‎benchs/CMakeLists.txt‎
Lines changed: 9 additions & 2 deletions b/‎benchs/CMakeLists.txt‎
Lines changed: 9 additions & 2 deletions
diff --git a/‎benchs/avx512_result_handlers/bench_avx512_result_handler.cpp‎
Lines changed: 127 additions & 0 deletions b/‎benchs/avx512_result_handlers/bench_avx512_result_handler.cpp‎
Lines changed: 127 additions & 0 deletions
@@ -38,6 +38,9 @@ jobs:
         uses: actions/checkout@v4
       - name: Build and Test (cmake)
         uses: ./.github/actions/build_cmake
+      - name: Build C++ demos
+        run: |
+          make -C build demo_diversity_result_handler
   linux-x86_64-AVX2-cmake:
     name: Linux x86_64 AVX2 (cmake)
     needs: linux-x86_64-cmake
@@ -71,6 +74,17 @@ jobs:
         uses: ./.github/actions/build_cmake
         with:
           opt_level: avx512_spr
+  linux-x86_64-DD-cmake:
+    name: Linux x86_64 Dynamic Dispatch (cmake)
+    needs: linux-x86_64-cmake  # starts only after the baseline x86_64 cmake job succeeds
+    runs-on: faiss-aws-m7i.large
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Build and Test (cmake)
+        uses: ./.github/actions/build_cmake
+        with:
+          opt_level: dd  # presumably the dynamic-dispatch build mode; confirm in build_cmake
   linux-x86_64-GPU-cmake:
     name: Linux x86_64 GPU (cmake)
     needs: linux-x86_64-cmake
@@ -108,6 +122,19 @@ jobs:
         env:
           # Context: https://github.com/facebookresearch/faiss/wiki/Troubleshooting#surprising-faiss-openmp-and-openblas-interaction
           OPENBLAS_NUM_THREADS: '1'
+  linux-arm64-DD-cmake:
+    name: Linux arm64 Dynamic Dispatch (cmake)
+    needs: linux-x86_64-cmake  # starts only after the baseline x86_64 cmake job succeeds
+    runs-on: faiss-aws-r8g.large
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Build and Test (cmake)
+        uses: ./.github/actions/build_cmake
+        with:
+          opt_level: dd  # presumably the dynamic-dispatch build mode; confirm in build_cmake
+        env:
+          OPENBLAS_NUM_THREADS: '1'  # OpenMP/OpenBLAS interaction; see Faiss wiki, Troubleshooting
   linux-x86_64-conda:
     name: Linux x86_64 (conda)
     needs: linux-x86_64-cmake
 
@@ -45,29 +45,21 @@ jobs:
           echo "Files created by CMake build:"
           ls -lh ${{ env.SHARED_DATA_DIR }}
 
-      # Step 2: Install conda faiss-cpu and read files
-      - name: Clean cmake-built packages
+      # Step 2: Install conda faiss-cpu in a clean environment and read files
+      - name: Create conda read environment with faiss-cpu
         shell: bash
         run: |
           eval "$(conda shell.bash hook)"
-          conda activate
-          # Remove packages that conflict with faiss-cpu
-          conda remove -y numpy scipy pytest gflags swig cmake make mkl mkl-devel || true
-
-      - name: Install faiss-cpu from pytorch channel
-        shell: bash
-        run: |
-          eval "$(conda shell.bash hook)"
-          conda activate
-          conda list
-          conda install -y -c pytorch faiss-cpu=1.13.2
+          conda create -n faiss_conda_read -y python=3.11
+          conda activate faiss_conda_read
+          conda install -y -c pytorch -c conda-forge faiss-cpu=1.13.2
           conda list
 
       - name: Run Conda reader (read Faiss index and verify)
         shell: bash
         run: |
           eval "$(conda shell.bash hook)"
-          conda activate
+          conda activate faiss_conda_read
           python tests/index_io_backward_compatibility/conda_reader.py ${{ env.SHARED_DATA_DIR }}
 
       - name: Upload artifacts from cmake->conda test
@@ -90,20 +82,14 @@ jobs:
           fetch-depth: 0
           fetch-tags: true
 
-      # Step 1: Install conda faiss-cpu package and write files
-      - name: Setup miniconda
-        uses: conda-incubator/setup-miniconda@v3
-        with:
-          python-version: '3.11'
-          miniforge-version: latest
-
-      - name: Install faiss-cpu from pytorch channel
+      # Step 1: Install conda faiss-cpu in a clean environment and write files
+      - name: Create conda write environment with faiss-cpu
         shell: bash
         run: |
           eval "$(conda shell.bash hook)"
-          conda activate
-          # Install pre-built faiss-cpu
-          conda install -y -c pytorch faiss-cpu=1.13.2
+          conda create -n faiss_conda_write -y python=3.11
+          conda activate faiss_conda_write
+          conda install -y -c pytorch -c conda-forge faiss-cpu=1.13.2
           conda list
 
       - name: Create shared data directory
@@ -116,7 +102,7 @@ jobs:
         shell: bash
         run: |
           eval "$(conda shell.bash hook)"
-          conda activate
+          conda activate faiss_conda_write
           python tests/index_io_backward_compatibility/conda_writer.py ${{ env.SHARED_DATA_DIR }}
 
       - name: Verify files were written
@@ -125,19 +111,10 @@ jobs:
           echo "Files created by Conda build:"
           ls -lh ${{ env.SHARED_DATA_DIR }}
 
-      # Step 2: Rebuild with CMake and read files
-      - name: Clean conda artifacts
-        shell: bash
-        run: |
-          # Uninstall conda-built faiss to avoid conflicts
-          eval "$(conda shell.bash hook)"
-          conda activate
-          conda uninstall -y faiss-cpu || true
-
+      # Step 2: Build with CMake and read files
       - name: Build with CMake
         uses: ./.github/actions/build_cmake
         with:
-          setup_conda: 'false'
           upload_artifacts: 'false'
 
       - name: Run CMake reader (read Faiss index and verify)
 
@@ -12,29 +12,31 @@ To install the latest stable release:
 
 ``` shell
 # CPU-only version
-$ conda install -c pytorch faiss-cpu=1.13.2
+$ conda install -c pytorch -c conda-forge faiss-cpu=1.13.2
 
 # GPU(+CPU) version
-$ conda install -c pytorch -c nvidia faiss-gpu=1.13.2
+$ conda install -c pytorch -c nvidia -c conda-forge faiss-gpu=1.13.2
 
 # GPU(+CPU) version with NVIDIA cuVS
 $ conda install -c pytorch -c nvidia -c rapidsai -c conda-forge libnvjitlink faiss-gpu-cuvs=1.13.2
 
 # GPU(+CPU) version using AMD ROCm not yet available
 ```
 
-For faiss-gpu, the nvidia channel is required for CUDA, which is not published in the main anaconda channel.
+The conda-forge channel is required for up-to-date dependencies (MKL on x86-64, OpenBLAS on ARM), which are not regularly updated in the default Anaconda channel.
+
+For faiss-gpu, the nvidia channel is additionally required for CUDA, which is not published in the main Anaconda channel.
 
 For faiss-gpu-cuvs, the rapidsai, conda-forge and nvidia channels are required.
 
 Nightly pre-release packages can be installed as follows:
 
 ``` shell
 # CPU-only version
-$ conda install -c pytorch/label/nightly faiss-cpu
+$ conda install -c pytorch/label/nightly -c conda-forge faiss-cpu
 
 # GPU(+CPU) version
-$ conda install -c pytorch/label/nightly -c nvidia faiss-gpu=1.13.2
+$ conda install -c pytorch/label/nightly -c nvidia -c conda-forge faiss-gpu=1.13.2
 
 # GPU(+CPU) version with NVIDIA cuVS (package built with CUDA 12.6)
 conda install -c pytorch -c rapidsai -c rapidsai-nightly -c conda-forge -c nvidia pytorch/label/nightly::faiss-gpu-cuvs 'cuda-version=12.6'
 
@@ -3,8 +3,15 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-
+find_package(BLAS REQUIRED)
+find_package(LAPACK REQUIRED)
+find_package(OpenMP REQUIRED)
 
 add_executable(bench_ivf_selector EXCLUDE_FROM_ALL bench_ivf_selector.cpp)
-target_link_libraries(bench_ivf_selector PRIVATE faiss)
+target_link_libraries(bench_ivf_selector PRIVATE faiss_avx512 ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} OpenMP::OpenMP_CXX)
+target_compile_options(bench_ivf_selector PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt>)
 
+add_executable(bench_result_handler_overhead EXCLUDE_FROM_ALL
+  bench_result_handler_overhead.cpp)
+target_link_libraries(bench_result_handler_overhead PRIVATE faiss_avx512 ${BLAS_LIBRARIES} ${LAPACK_LIBRARIES} OpenMP::OpenMP_CXX)
+target_compile_options(bench_result_handler_overhead PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt>)
@@ -0,0 +1,127 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include "faiss_avx512_result_handler.h"
+
+#include <faiss/IndexIVF.h>
+#include <faiss/index_factory.h>
+#include <faiss/utils/random.h>
+#include <faiss/utils/utils.h>
+#include <omp.h>
+
+#include <cstdio>
+#include <memory>
+#include <vector>
+
+using namespace faiss;
+
+// Benchmark parameters
+constexpr int d = 64;          // vector dimension
+constexpr size_t nb = 1000000; // database size (vectors passed to add())
+constexpr size_t nt = 10000;   // training size (vectors passed to train())
+constexpr size_t nq = 100;     // number of query vectors
+constexpr int nrun = 5;        // timed runs; reported times are averaged
+
+int main() {
+    // Compare per-query search() against search1() + AVX512 reservoir handler
+    printf("Generating nt=%zu nb=%zu nq=%zu vectors of dimension %d\n",
+           nt,
+           nb,
+           nq,
+           d);
+    std::vector<float> xt(nt * d), xb(nb * d), xq(nq * d);
+    rand_smooth_vectors(nt, d, xt.data(), 1234);
+    rand_smooth_vectors(nb, d, xb.data(), 4567);
+    rand_smooth_vectors(nq, d, xq.data(), 7890);
+
+    // Build IVF1024,Flat index
+    printf("Building IVF1024,Flat index...\n");
+    std::unique_ptr<Index> index(index_factory(d, "IVF1024,Flat", METRIC_L2));
+
+    printf("Training index...\n");
+    index->train(nt, xt.data());
+
+    printf("Adding %zu vectors to index...\n", nb);
+    index->add(nb, xb.data());
+
+    // nprobe is an IndexIVF member; fail loudly rather than dereference null
+    IndexIVF* index_ivf = dynamic_cast<IndexIVF*>(index.get());
+    if (!index_ivf) {
+        fprintf(stderr, "index_factory did not return an IndexIVF\n");
+        return 1;
+    }
+    omp_set_num_threads(1); // timed searches use a single OpenMP thread
+
+    // Sweep over result sizes (k) and number of probed lists (nprobe)
+    std::vector<size_t> k_values = {1, 10, 20, 50, 100, 200, 500, 1000};
+    std::vector<size_t> nprobe_values = {1, 2, 4, 8, 16, 64};
+
+    printf("\nBenchmarking with %d OpenMP thread(s), %d runs per config\n",
+           omp_get_max_threads(),
+           nrun);
+    printf("%-8s %15s %15s %10s\n",
+           "k",
+           "baseline(ms)",
+           "avx512(ms)",
+           "speedup");
+    printf("------------------------------------------------------------\n");
+
+    for (size_t nprobe : nprobe_values) {
+        index_ivf->nprobe = nprobe;
+        printf("============ nprobe=%zu ===========\n", nprobe);
+        for (size_t k : k_values) {
+            std::vector<float> D_ref(nq * k);
+            std::vector<idx_t> I_ref(nq * k);
+            std::vector<float> D_avx(nq * k);
+            std::vector<idx_t> I_avx(nq * k);
+
+            // Warmup (batched; outputs are overwritten by the timed runs)
+            index->search(nq, xq.data(), k, D_ref.data(), I_ref.data());
+
+            // Baseline: one search() call per query, averaged over nrun runs
+            double t0 = getmillisecs();
+            for (int run = 0; run < nrun; run++) {
+                for (size_t q = 0; q < nq; q++) {
+                    index->search(
+                            1,
+                            xq.data() + q * d,
+                            k,
+                            D_ref.data() + q * k,
+                            I_ref.data() + q * k);
+                }
+            }
+            double baseline_time = (getmillisecs() - t0) / nrun;
+
+            // Warmup AVX512 handler (one untimed pass over all queries)
+            ReservoirResultHandlerAVX512 handler(k);
+            for (size_t q = 0; q < nq; q++) {
+                handler.begin();
+                index->search1(xq.data() + q * d, handler);
+                handler.end(D_avx.data() + q * k, I_avx.data() + q * k);
+            }
+
+            // AVX512 handler: same per-query loop, averaged over nrun runs
+            t0 = getmillisecs();
+            for (int run = 0; run < nrun; run++) {
+                for (size_t q = 0; q < nq; q++) {
+                    handler.begin();
+                    index->search1(xq.data() + q * d, handler);
+                    handler.end(D_avx.data() + q * k, I_avx.data() + q * k);
+                }
+            }
+            double avx512_time = (getmillisecs() - t0) / nrun;
+
+            double speedup = baseline_time / avx512_time;
+            printf("%-8zu %15.3f %15.3f %10.2fx\n",
+                   k,
+                   baseline_time,
+                   avx512_time,
+                   speedup);
+        }
+    }
+    return 0;
+}