Skip to content

Commit 6d54e48

Browse files
committed
refactor: migrate NEON PQ kernels to dynamic dispatch framework
- Remove standalone sra_krl/ library, inline kernels as C++ template specializations under impl/pq_code_distance/ - Replace #ifdef __aarch64__ with COMPILE_SIMD_ARM_NEON and SIMDLevel::ARM_NEON templates - Convert .c files to .cpp, drop LANGUAGE CXX workarounds - No algorithmic changes, benchmarks consistent with prior results
1 parent 7d83232 commit 6d54e48

263 files changed

Lines changed: 11668 additions & 12144 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

CMakeLists.txt

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -53,11 +53,7 @@ project(faiss
5353
LANGUAGES ${FAISS_LANGUAGES})
5454
include(GNUInstallDirs)
5555

56-
if(FAISS_ENABLE_CUVS)
57-
set(CMAKE_CXX_STANDARD 17)
58-
else()
59-
set(CMAKE_CXX_STANDARD 20)
60-
endif()
56+
set(CMAKE_CXX_STANDARD 20)
6157

6258
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")
6359

INSTALL.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ pre-release nightly builds.
66

77
- The CPU-only faiss-cpu conda package is currently available on Linux (x86-64 and aarch64), OSX (arm64 only), and Windows (x86-64)
88
- faiss-gpu, containing both CPU and GPU indices, is available on Linux (x86-64 only) for CUDA 11.4 and 12.1
9-
- faiss-gpu-cuvs package containing GPU indices provided by [NVIDIA cuVS](https://github.com/rapidsai/cuvs/) version 25.10, is available on Linux (x86-64 only) for CUDA 12.4.
9+
- faiss-gpu-cuvs package containing GPU indices provided by [NVIDIA cuVS](https://github.com/rapidsai/cuvs/) version 26.02, is available on Linux (x86-64 only) for CUDA 12.4.
1010

1111
To install the latest stable release:
1212

@@ -61,7 +61,7 @@ found to run on other platforms as well, see
6161
[other platforms](https://github.com/facebookresearch/faiss/wiki/Related-projects#bindings-to-other-languages-and-porting-to-other-platforms).
6262

6363
The basic requirements are:
64-
- a C++17 compiler (with OpenMP support version 2 or higher),
64+
- a C++20 compiler (with OpenMP support version 2 or higher),
6565
- a BLAS implementation (on Intel machines we strongly recommend using Intel MKL for best
6666
performance).
6767

@@ -72,7 +72,7 @@ The optional requirements are:
7272
- for AMD GPUs:
7373
- AMD ROCm,
7474
- for using NVIDIA cuVS implementations:
75-
- libcuvs=25.10
75+
- libcuvs=26.02
7676
- for the python bindings:
7777
- python 3,
7878
- numpy,
@@ -87,9 +87,9 @@ section of the wiki](https://github.com/facebookresearch/faiss/wiki/Troubleshoot
8787

8888
The libcuvs dependency should be installed via conda:
8989
```
90-
conda install -c rapidsai -c conda-forge -c nvidia libcuvs=25.10 'cuda-version=12.6'
90+
conda install -c rapidsai -c conda-forge -c nvidia libcuvs=26.02 'cuda-version=12.6'
9191
```
92-
For more ways to install cuVS 25.10, refer to the [RAPIDS Installation Guide](https://docs.rapids.ai/install).
92+
For more ways to install cuVS 26.02, refer to the [RAPIDS Installation Guide](https://docs.rapids.ai/install).
9393

9494
### Building with Intel(R) SVS
9595

benchs/README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ Certain tests / benchmarks might be outdated.
348348
* bench_hamming_computer.cpp - specialized implementations for Hamming distance computations
349349
* bench_heap_replace.cpp - benchmarks different implementations of certain calls for a Heap data structure
350350
* bench_hnsw.py - benchmarks HNSW in combination with other ones for SIFT1M dataset
351+
* bench_hnsw_prune_headroom.py - benchmarks HNSW prune_headroom recall and build time impact
351352
* bench_index_flat.py - benchmarks IndexFlatL2 on a synthetic dataset
352353
* bench_index_pq.py - benchmarks PQ on SIFT1M dataset
353354
* bench_ivf_fastscan_single_query.py - benchmarks a single query for different nprobe levels for IVF{nlist},PQ{M}x4fs on BIGANN dataset
Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
#
3+
# This source code is licensed under the MIT license found in the
4+
# LICENSE file in the root directory of this source tree.
5+
6+
"""
7+
Benchmark script for testing HNSW prune_headroom recall impact.
8+
9+
Compares recall and build time between different prune_headroom values.
10+
Default comparison is between:
11+
- Baseline: prune_headroom = 0.0 (original behavior, no headroom)
12+
- With headroom: prune_headroom = 0.2 (proposed default)
13+
14+
Usage:
15+
python bench_hnsw_prune_headroom.py
16+
python bench_hnsw_prune_headroom.py --nb 100000 --d 256
17+
python bench_hnsw_prune_headroom.py --headroom_values 0.0 0.1 0.2 0.3
18+
"""
19+
20+
import argparse
21+
import time
22+
23+
import faiss
24+
25+
try:
26+
from faiss.contrib.datasets_fb import DatasetSIFT1M
27+
except ImportError:
28+
from faiss.contrib.datasets import DatasetSIFT1M
29+
30+
from faiss.contrib.datasets import SyntheticDataset
31+
32+
33+
def compute_recall(I, gt, k):
    """Compute recall@k from search results I and ground truth gt.

    Only the first k columns of both arrays are compared. The intersection
    count returned by faiss.eval_intersection is normalised by the number
    of ground-truth rows times k.
    """
    num_queries = gt.shape[0]
    matches = faiss.eval_intersection(I[:, :k], gt[:, :k])
    return matches / (num_queries * k)
37+
38+
39+
def build_hnsw_index(d, m, xb, ef_construction, prune_headroom):
    """Create, train and fill an IndexHNSWSQ (QT_4bit scalar quantizer).

    Returns:
        (index, build_time) where build_time is the wall-clock duration in
        seconds of the index.add(xb) call only; training is not timed.
    """
    index = faiss.IndexHNSWSQ(d, faiss.ScalarQuantizer.QT_4bit, m)
    index.hnsw.efConstruction = ef_construction
    index.hnsw.prune_headroom = prune_headroom

    # Training happens before the clock starts.
    index.train(xb)

    t0 = time.time()
    index.add(xb)
    elapsed = time.time() - t0

    return index, elapsed
51+
52+
53+
def run_benchmark(
    d=384,
    m=32,
    nb=50000,
    nq=1000,
    reps=3,
    ef_construction=40,
    ef_search_values=None,
    k_values=None,
    headroom_values=None,
    use_sift1m=False,
):
    """
    Run the prune_headroom recall benchmark.

    Args:
        d: Dimension of vectors (replaced by the dataset's value with SIFT1M)
        m: Node degree (M) passed to the HNSW index constructor
        nb: Number of base vectors (replaced by the dataset's value with SIFT1M)
        nq: Number of query vectors (replaced by the dataset's value with SIFT1M)
        reps: Number of times each prune_headroom value is built and searched
        ef_construction: efConstruction parameter for HNSW
        ef_search_values: List of efSearch values to test
        k_values: List of k values for recall@k
        headroom_values: List of prune_headroom values to compare
        use_sift1m: Use SIFT1M dataset instead of synthetic

    Returns:
        Dictionary containing benchmark results, with three keys:
        "build_times" (headroom -> build time of the last repetition),
        "ndis_search" (headroom -> ndis / nq of the last repetition) and
        "recalls" ((headroom, rep) -> {ef_search: {k: recall}}).
    """
    # Fill in defaults here rather than in the signature to avoid shared
    # mutable default arguments.
    if ef_search_values is None:
        ef_search_values = [16, 32, 64, 128, 256]
    if k_values is None:
        k_values = [1, 10]
    if headroom_values is None:
        headroom_values = [0.0, 0.2]

    if use_sift1m:
        print("Loading SIFT1M dataset")
        ds = DatasetSIFT1M()
        xb = ds.get_database()
        xq = ds.get_queries()
        # The dataset dictates the sizes; the d / nb / nq arguments are ignored.
        d = xb.shape[1]
        nb = xb.shape[0]
        nq = xq.shape[0]
    else:
        print(f"Generating synthetic dataset: d={d}, nb={nb}, nq={nq}")
        ds = SyntheticDataset(d=d, nt=0, nb=nb, nq=nq)
        xb = ds.get_database()
        xq = ds.get_queries()

    # One ground-truth computation at the largest k serves every smaller k.
    max_k = max(k_values)
    print(f"Computing ground truth for k={max_k}")
    gt = ds.get_groundtruth(k=max_k)

    results = {"build_times": {}, "ndis_search": {}, "recalls": {}}

    for headroom in headroom_values:
        for rep in range(reps):
            index, build_time = build_hnsw_index(
                d, m, xb, ef_construction, headroom)
            # Keyed by headroom only, so each repetition overwrites the last.
            results["build_times"][headroom] = build_time

            # Stats are reset once per built index, so the ndis value read
            # below accumulates over every efSearch setting searched on it.
            faiss.cvar.hnsw_stats.reset()
            row = {}
            results["recalls"][(headroom, rep)] = row
            for ef_search in ef_search_values:
                index.hnsw.efSearch = ef_search
                # Search once at max_k; compute_recall slices for smaller k.
                _, I = index.search(xq, max_k)

                col = {}
                row[ef_search] = col
                for k in k_values:
                    recall = compute_recall(I, gt, k)
                    col[k] = recall
            # NOTE(review): nesting of the next statements was reconstructed
            # from flattened text; per-repetition placement is assumed.
            ndis_search = faiss.cvar.hnsw_stats.ndis / nq
            results["ndis_search"][headroom] = ndis_search
            print(
                f"HNSW{m}(prune_headroom={headroom:4.2f}): "
                f"{build_time=:4.2f}s, {ndis_search=:5.1f}"
            )

    print_results_table(results, ef_search_values, k_values, headroom_values)
    return results
135+
136+
137+
def print_results_table(results, ef_search_values, k_values, headroom_values):
    """Print one recall table per k value to stdout.

    Each table has one column per efSearch value and one row per
    (headroom, rep) entry of results["recalls"], in insertion order.
    The headroom_values argument is accepted but not used here.
    """
    separator = " | "
    for k in k_values:
        # The loop variable must stay named k: the "=" f-string specifier
        # prints the variable name as part of the header cell.
        title = separator.join(
            [f"{k=:2} "] + [f"ef={ef:3}" for ef in ef_search_values]
        )

        print(f"\n{title}")
        print("-" * len(title))
        for (headroom, _rep), per_ef in results["recalls"].items():
            cells = [f"h={headroom:4.2f}"]
            cells.extend(f"{per_ef[ef][k]:6.4f}" for ef in ef_search_values)
            print(separator.join(cells))
153+
154+
155+
if __name__ == "__main__":
    # Command-line entry point: parse the benchmark parameters and forward
    # them unchanged to run_benchmark().
    parser = argparse.ArgumentParser(
        description="HNSW prune_headroom recall and build time benchmark"
    )
    parser.add_argument(
        "--d",
        type=int,
        default=128,
        help="Dimension of vectors (default: 128)",
    )
    parser.add_argument(
        "--m",
        type=int,
        default=32,
        help="Node degree (M, default: 32)",
    )
    parser.add_argument(
        "--nb",
        type=int,
        default=50000,
        help="Number of base vectors (default: 50000)",
    )
    parser.add_argument(
        "--nq",
        type=int,
        default=10000,
        help="Number of query vectors (default: 10000)",
    )
    parser.add_argument(
        "--ef_construction",
        type=int,
        default=40,
        help="efConstruction parameter (default: 40)",
    )
    parser.add_argument(
        "--ef_search",
        type=int,
        nargs="+",
        default=[16, 32, 64, 128, 256],
        help="efSearch values to test (default: 16 32 64 128 256)",
    )
    parser.add_argument(
        "--k",
        type=int,
        nargs="+",
        default=[1, 10],
        help="k values for recall@k (default: 1 10)",
    )
    parser.add_argument(
        "--headroom_values",
        type=float,
        nargs="+",
        default=[0.0, 0.04, 0.08, 0.12, 0.16, 0.20],
        # The help text previously advertised "0.0 0.2", which did not match
        # the sweep that actually runs; keep it in sync with the list above.
        help=(
            "prune_headroom values to compare "
            "(default: 0.0 0.04 0.08 0.12 0.16 0.20)"
        ),
    )
    parser.add_argument(
        "--reps",
        type=int,
        default=3,
        help="Number of repetitions (default: 3)",
    )
    parser.add_argument(
        "--sift1m",
        action="store_true",
        help="Use SIFT1M dataset instead of synthetic",
    )
    args = parser.parse_args()

    run_benchmark(
        d=args.d,
        m=args.m,
        nb=args.nb,
        nq=args.nq,
        reps=args.reps,
        ef_construction=args.ef_construction,
        ef_search_values=args.ef_search,
        k_values=args.k,
        headroom_values=args.headroom_values,
        use_sift1m=args.sift1m,
    )

c_api/impl/AuxIndexStructures_c.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ int faiss_RangeSearchPartialResult_new_result(
280280
idx_t qno,
281281
FaissRangeQueryResult** qr) {
282282
try {
283-
auto q = &reinterpret_cast<RangeSearchPartialResult*>(res)->new_result(
283+
auto& q = reinterpret_cast<RangeSearchPartialResult*>(res)->new_result(
284284
qno);
285285
if (qr) {
286286
*qr = reinterpret_cast<FaissRangeQueryResult*>(&q);

cmake/thirdparty/fetch_rapids.cmake

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,11 @@
1515
# or implied. See the License for the specific language governing permissions and limitations under
1616
# the License.
1717
# =============================================================================
18-
set(RAPIDS_VERSION "25.10")
18+
set(RAPIDS_VERSION "26.02")
1919
set(rapids-cmake-version ${RAPIDS_VERSION})
2020

2121
if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
22-
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
22+
file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/release/${RAPIDS_VERSION}/RAPIDS.cmake
2323
${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)
2424
endif()
2525
include(${CMAKE_CURRENT_BINARY_DIR}/FAISS_RAPIDS.cmake)

conda/faiss-gpu-cuvs/meta.yaml

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -6,9 +6,10 @@
66
{% set version = environ.get('GIT_DESCRIBE_TAG').lstrip('v') %}
77
{% set suffix = "_nightly" if environ.get('PACKAGE_TYPE') == 'nightly' else "" %}
88
{% set number = GIT_DESCRIBE_NUMBER %}
9-
{% set cuda_constraints=">=12.6,<12.7" %}
10-
{% set libcublas_constraints=">=12.6,<12.7" %}
11-
{% set cudart_constraints=">=12.6,<12.7" %}
9+
{% set cuda_major = (cudatoolkit | default("12.0")).split('.')[0] | int %}
10+
{% set cuda_constraints=">=13.2,<13.3" %}
11+
{% set libcublas_constraints=">=13.3,<13.4" %} # libcublas 13.3 is actually for cuda 13.2
12+
{% set cudart_constraints=">=13.2,<13.3" %}
1213

1314
package:
1415
name: faiss-pkg
@@ -60,7 +61,7 @@ outputs:
6061
- _openmp_mutex =4.5=2_kmp_llvm # [x86_64]
6162
- mkl >=2024.2.2 # [x86_64]
6263
- openblas =0.3.30 # [not x86_64]
63-
- libcuvs =25.10
64+
- libcuvs =26.02
6465
- cuda-version {{ cuda_constraints }}
6566
- libsvs-runtime =0.2.0 # [x86_64 and linux]
6667
run:
@@ -69,7 +70,7 @@ outputs:
6970
- openblas =0.3.30 # [not x86_64]
7071
- cuda-cudart {{ cuda_constraints }}
7172
- libcublas {{ libcublas_constraints }}
72-
- libcuvs =25.10
73+
- libcuvs =26.02
7374
- cuda-version {{ cuda_constraints }}
7475
- libnvjitlink
7576
- libsvs-runtime =0.2.0 # [x86_64 and linux]
@@ -91,10 +92,10 @@ outputs:
9192
string: "py{{ PY_VER }}_h{{ PKG_HASH }}_{{ number }}_cuda{{ cudatoolkit }}{{ suffix }}"
9293
requirements:
9394
build:
94-
- {{ compiler('cxx') }} =12.4
95-
- sysroot_linux-64 =2.17 # [linux64]
95+
- {{ compiler('cxx') }} =14.2
96+
- sysroot_linux-64 =2.34 # [linux64]
9697
- swig =4.0
97-
- cmake >=3.26.4
98+
- cmake >=3.30.4
9899
- make =4.2 # [not win]
99100
- _openmp_mutex =4.5=2_kmp_llvm # [x86_64]
100101
- mkl >=2024.2.2 # [x86_64]
@@ -117,13 +118,20 @@ outputs:
117118
requires:
118119
- numpy >=2.0,<2.3
119120
- scipy
121+
# TODO: remove cuda_major guard when we move to PyPI (pytorch via PyPI works on CUDA 13)
122+
{% if cuda_major < 13 %}
120123
- pytorch-gpu >=2.7
124+
{% endif %}
121125
commands:
122126
- python -X faulthandler -m unittest discover -v -s tests/ -p "test_*"
127+
{% if cuda_major < 13 %}
123128
- python -X faulthandler -m unittest discover -v -s tests/ -p "torch_*"
129+
{% endif %}
124130
- cp tests/common_faiss_tests.py faiss/gpu/test
125131
- python -X faulthandler -m unittest discover -v -s faiss/gpu/test/ -p "test_*"
132+
{% if cuda_major < 13 %}
126133
- python -X faulthandler -m unittest discover -v -s faiss/gpu/test/ -p "torch_*"
134+
{% endif %}
127135
- sh test_cpu_dispatch.sh # [linux64]
128136
files:
129137
- test_cpu_dispatch.sh # [linux64]

0 commit comments

Comments
 (0)