diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py new file mode 100644 index 0000000000..fcd52210b9 --- /dev/null +++ b/benchs/bench_ivfpq_panorama.py @@ -0,0 +1,293 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import multiprocessing as mp +import time + +import faiss +import matplotlib.pyplot as plt +import numpy as np +from scipy.linalg import block_diag +from sklearn.decomposition import PCA + +try: + from faiss.contrib.datasets_fb import DatasetGIST1M +except ImportError: + from faiss.contrib.datasets import DatasetGIST1M + +ds = DatasetGIST1M() + +SUBSET = 0.1 # Set to 1.0 for full dataset + +xq = ds.get_queries() +xb_full = ds.get_database() +nb_full = xb_full.shape[0] +nb = int(nb_full * SUBSET) +xb = xb_full[:nb].copy() +del xb_full + +gt = ds.get_groundtruth() if SUBSET == 1.0 else None +xt = ds.get_train()[:max(nb // 2, 50000)] + +nb, d = xb.shape +nq = xq.shape[0] +nt = xt.shape[0] + +k = 10 + +if gt is None: + print(f"Computing ground truth for {SUBSET*100:.0f}% subset ({nb} vectors)...") + flat = faiss.IndexFlatL2(d) + flat.add(xb) + _, gt = flat.search(xq, k) +else: + gt = gt[:, :k] + +print(f"Database: {nb} x {d}, Queries: {nq}, Train: {nt}") + +ALPHA = 8 +M_values = [960, 480, 240] +nbits = 8 +nlist = 128 +n_levels = 16 +nprobes = [1, 2, 4, 8, 16, 32, 64] + + +def get_ivf_index(index): + if isinstance(index, faiss.IndexPreTransform): + return faiss.downcast_index(index.index) + return index + + +def eval_recall(index, nprobe_val): + faiss.cvar.indexPanorama_stats.reset() + t0 = time.time() + _, I = index.search(xq, k=k) + t = time.time() - t0 + speed = t * 1000 / nq + qps = 1000 / speed + + recall = np.mean( + [len(set(gt[i]) & set(I[i])) / k for i in range(nq)], + ) + ratio_dims_scanned = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + print( + f"\tnprobe {nprobe_val:3d}, 
Recall@{k}: " + f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}, " + f"dims scanned: {ratio_dims_scanned * 100:.1f}%" + ) + + return recall, qps + + +def eval_index(index, label): + ivf_index = get_ivf_index(index) + + faiss.omp_set_num_threads(1) + + data = [] + print(f"====== {label}") + for nprobe in nprobes: + ivf_index.nprobe = nprobe + recall, qps = eval_recall(index, nprobe) + data.append((recall, qps)) + + data = np.array(data) + plt.plot(data[:, 0], data[:, 1], "o-", label=label) + + +def build_ivfpq(M): + """Build vanilla IVFPQ (no transform) via index_factory.""" + index = faiss.index_factory(d, f"IVF{nlist},PQ{M}x{nbits}") + faiss.omp_set_num_threads(mp.cpu_count()) + index.train(xt) + index.add(xb) + return index + + +def compute_level_energies(variances, n_levels, block_size): + """Sum per-dimension variances into per-level total energies.""" + return np.array([ + np.sum(variances[l * block_size : (l + 1) * block_size]) + for l in range(n_levels) + ]) + + +def find_n_spill(variances, level_start, block_size, max_energy_per_level, d): + """Find the smallest number of extra dimensions to spill into. + + After a random rotation over (block_size + n_spill) dims, each dim gets + uniform expected energy. The level's expected energy becomes: + block_size * total_subspace_energy / (block_size + n_spill) + + Returns the smallest n_spill >= 1 where this is <= max_energy_per_level, + or all remaining dims if the cap can't be reached. 
+ """ + level_end = level_start + block_size + max_extra = d - level_end + if max_extra == 0: + return 0 + + total = np.sum(variances[level_start:level_end]) + for n in range(1, max_extra + 1): + total += variances[level_end + n - 1] + if block_size * total / (block_size + n) <= max_energy_per_level: + return n + + return max_extra + + +def random_orthogonal(size, rng): + """Haar-distributed random orthogonal matrix via QR of Gaussian.""" + H = rng.randn(size, size).astype(np.float32) + Q, R = np.linalg.qr(H) + Q *= np.sign(np.diag(R))[:, None] + return Q + + +def build_energy_spill_rotation(eigenvalues, n_levels, block_size, + alpha, seed=42): + """Orthogonal matrix that caps per-level energy via localized rotations. + + Iterates over levels sequentially. When a level's effective energy + exceeds alpha * avg_energy_per_level, applies a random rotation spanning + that level plus enough subsequent dimensions to bring the expected level + energy down to the cap. + + Variances are tracked analytically: after each rotation the dims in the + rotated subspace are set to uniform expected variance. + + Returns (spill_rotation, effective_variances). 
+ """ + d = len(eigenvalues) + total_energy = float(np.sum(eigenvalues)) + max_energy_per_level = alpha * total_energy / n_levels + + variances = eigenvalues.astype(np.float32).copy() + spill_matrix = np.eye(d, dtype=np.float32) + rng = np.random.RandomState(seed) + + for level in range(n_levels): + start = level * block_size + end = start + block_size + level_energy = float(np.sum(variances[start:end])) + + if level_energy <= max_energy_per_level: + continue + + n_spill = find_n_spill( + variances, start, block_size, max_energy_per_level, d, + ) + if n_spill == 0: + continue + + sub_end = end + n_spill + Q = random_orthogonal(block_size + n_spill, rng) + + full_Q = np.eye(d, dtype=np.float32) + full_Q[start:sub_end, start:sub_end] = Q + spill_matrix = full_Q @ spill_matrix + + avg_var = float(np.sum(variances[start:sub_end])) / (block_size + n_spill) + variances[start:sub_end] = avg_var + + return spill_matrix, variances + + +def build_level_equalization_rotation(d, n_levels, block_size, seed=77): + """Block-diagonal random rotation for within-level energy equalization.""" + rng = np.random.RandomState(seed) + blocks = [random_orthogonal(block_size, rng) for _ in range(n_levels)] + return block_diag(*blocks).astype(np.float32) + + +def print_energy_diagnostics(eigenvalues, effective_variances, n_levels, + block_size, alpha): + """Print per-level energy before/after the spill transform.""" + before = compute_level_energies(eigenvalues, n_levels, block_size) + after = compute_level_energies(effective_variances, n_levels, block_size) + total = float(np.sum(eigenvalues)) + cap = alpha * total / n_levels + + +def make_pca_level_rotation_transform(xt, n_levels, alpha=ALPHA, seed=77): + """Build PCA + energy-spill + per-level rotation as one LinearTransform. + + Pipeline: y = R_eq @ R_spill @ P @ (x - mean) + 1. Center + PCA project (P, mean) + 2. Energy spill across levels (R_spill) + 3. 
Within-level equalization (R_eq, block-diagonal) + + Stored as: A = R_eq @ R_spill @ P, b = -A @ mean + """ + dim = xt.shape[1] + block_size = dim // n_levels + + pca = PCA(n_components=dim) + pca.fit(xt) + P = pca.components_.astype(np.float32) + mean = pca.mean_.astype(np.float32) + eigenvalues = pca.explained_variance_.astype(np.float32) + + R_spill, effective_variances = build_energy_spill_rotation( + eigenvalues, n_levels, block_size, alpha, seed=seed, + ) + print_energy_diagnostics( + eigenvalues, effective_variances, n_levels, block_size, alpha, + ) + + R_eq = build_level_equalization_rotation( + dim, n_levels, block_size, seed=seed + 1, + ) + + combined = (R_eq @ R_spill @ P).astype(np.float32) + + lt = faiss.LinearTransform(dim, dim, True) + faiss.copy_array_to_vector(combined.ravel(), lt.A) + faiss.copy_array_to_vector(-(combined @ mean).ravel(), lt.b) + lt.is_trained = True + lt.have_bias = True + + return lt + + +def build_ivfpq_panorama(M, n_levels, alpha=ALPHA): + """Build PCA + EnergySpill + LevelRotation + IVFPQPanorama.""" + lt = make_pca_level_rotation_transform(xt, n_levels, alpha=alpha) + + quantizer = faiss.IndexFlatL2(d) + ivfpq_pano = faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, + ) + + index = faiss.IndexPreTransform(lt, ivfpq_pano) + + faiss.omp_set_num_threads(mp.cpu_count()) + index.train(xt) + index.add(xb) + + return index + + +plt.figure(figsize=(10, 7), dpi=80) + +for M in M_values: + ivfpq = build_ivfpq(M) + eval_index(ivfpq, label=f"IVFPQ (M={M})") + del ivfpq + + pano = build_ivfpq_panorama(M, n_levels) + eval_index(pano, label=f"PCA+Spill+Rot + IVFPQPanorama (M={M})") + del pano + +plt.title( + f"IVFPQ Panorama on GIST ({SUBSET*100:.0f}% subset, nlist={nlist})", +) +plt.xlabel(f"Recall@{k}") +plt.ylabel("QPS") +plt.yscale("log") +plt.legend(bbox_to_anchor=(1.02, 0.1), loc="upper left", borderaxespad=0) +plt.savefig("bench_ivfpq_panorama.png", bbox_inches="tight") +print("\nBenchmark complete! 
Plot saved to bench_ivfpq_panorama.png") diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index 42e6dd14eb..fdd25d4500 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -10,6 +10,7 @@ # ============================================================================= set(FAISS_SIMD_AVX2_SRC impl/fast_scan/impl-avx2.cpp + impl/panorama_kernels/panorama_kernels-avx2.cpp impl/pq_code_distance/pq_code_distance-avx2.cpp impl/scalar_quantizer/sq-avx2.cpp impl/approx_topk/avx2.cpp @@ -17,17 +18,20 @@ set(FAISS_SIMD_AVX2_SRC ) set(FAISS_SIMD_AVX512_SRC impl/fast_scan/impl-avx512.cpp + impl/panorama_kernels/panorama_kernels-avx512.cpp impl/pq_code_distance/pq_code_distance-avx512.cpp impl/scalar_quantizer/sq-avx512.cpp utils/simd_impl/distances_avx512.cpp ) set(FAISS_SIMD_NEON_SRC impl/fast_scan/impl-neon.cpp + impl/panorama_kernels/panorama_kernels-neon.cpp impl/scalar_quantizer/sq-neon.cpp impl/approx_topk/neon.cpp utils/simd_impl/distances_aarch64.cpp ) set(FAISS_SIMD_SVE_SRC + impl/panorama_kernels/panorama_kernels-sve.cpp impl/pq_code_distance/pq_code_distance-sve.cpp utils/simd_impl/distances_arm_sve.cpp ) @@ -63,6 +67,7 @@ set(FAISS_SRC IndexIVFFlat.cpp IndexIVFFlatPanorama.cpp IndexIVFPQ.cpp + IndexIVFPQPanorama.cpp IndexIVFFastScan.cpp IndexIVFAdditiveQuantizerFastScan.cpp IndexIVFPQFastScan.cpp @@ -105,6 +110,7 @@ set(FAISS_SRC impl/NSG.cpp impl/PolysemousTraining.cpp impl/ProductQuantizer.cpp + impl/panorama_kernels/panorama_kernels-generic.cpp impl/pq_code_distance/pq_code_distance-generic.cpp impl/AdditiveQuantizer.cpp impl/RaBitQuantizer.cpp @@ -127,6 +133,7 @@ set(FAISS_SRC impl/zerocopy_io.cpp impl/NNDescent.cpp impl/Panorama.cpp + impl/PanoramaPQ.cpp impl/PanoramaStats.cpp invlists/BlockInvertedLists.cpp invlists/DirectMap.cpp @@ -184,6 +191,7 @@ set(FAISS_HEADERS IndexIVFFlat.h IndexIVFFlatPanorama.h IndexIVFPQ.h + IndexIVFPQPanorama.h IndexIVFFastScan.h IndexIVFAdditiveQuantizerFastScan.h IndexIVFPQFastScan.h @@ -236,6 +244,7 @@ 
set(FAISS_HEADERS impl/NNDescent.h impl/NSG.h impl/Panorama.h + impl/PanoramaPQ.h impl/PanoramaStats.h impl/PolysemousTraining.h impl/ProductQuantizer-inl.h @@ -278,6 +287,8 @@ set(FAISS_HEADERS impl/fast_scan/simd_result_handlers.h impl/zerocopy_io.h utils/pq_code_distance.h + impl/panorama_kernels/panorama_kernels.h + impl/panorama_kernels/panorama_kernels-inl.h impl/pq_code_distance/pq_code_distance-inl.h invlists/BlockInvertedLists.h invlists/DirectMap.h @@ -354,6 +365,15 @@ endif() # Export FAISS_HEADERS variable to parent scope. set(FAISS_HEADERS ${FAISS_HEADERS} PARENT_SCOPE) +# Detect BMI2 compiler support (PEXT/PDEP used in Panorama code compression). +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-mbmi2" COMPILER_SUPPORTS_BMI2) +if(COMPILER_SUPPORTS_BMI2) + set(FAISS_BMI2_FLAGS "-mbmi2") +else() + set(FAISS_BMI2_FLAGS "") +endif() + add_library(faiss ${FAISS_SRC}) add_library(faiss_avx2 ${FAISS_SRC}) @@ -361,7 +381,7 @@ if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" set_target_properties(faiss_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE) endif() if(NOT WIN32) - target_compile_options(faiss_avx2 PRIVATE $<$:-mavx2 -mfma -mf16c -mpopcnt>) + target_compile_options(faiss_avx2 PRIVATE $<$:-mavx2 -mfma -mf16c -mpopcnt ${FAISS_BMI2_FLAGS}>) else() # MSVC enables FMA with /arch:AVX2; no separate flags for F16C, POPCNT # Ref. FMA (under /arch:AVX2): https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64 @@ -381,7 +401,7 @@ endif() if(NOT WIN32) # All modern CPUs support F, CD, VL, DQ, BW extensions. 
# Ref: https://en.wikipedia.org/wiki/AVX512 - target_compile_options(faiss_avx512 PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt>) + target_compile_options(faiss_avx512 PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt ${FAISS_BMI2_FLAGS}>) else() target_compile_options(faiss_avx512 PRIVATE $<$:/arch:AVX512>) # we need bigobj for the swig wrapper diff --git a/faiss/IndexFlat.cpp b/faiss/IndexFlat.cpp index 7eb76958ad..a17215737d 100644 --- a/faiss/IndexFlat.cpp +++ b/faiss/IndexFlat.cpp @@ -696,7 +696,7 @@ void IndexFlatPanorama::add(idx_t n, const float* x) { const uint8_t* code = reinterpret_cast(x); pano.copy_codes_to_level_layout(codes.data(), offset, n, code); - pano.compute_cumulative_sums(cum_sums.data(), offset, n, x); + pano.compute_cumulative_sums(cum_sums.data(), offset, n, code); } void IndexFlatPanorama::search( @@ -893,12 +893,12 @@ void IndexFlatPanorama::search_subset( bool pruned = false; for (size_t level = 0; level < n_levels; level++) { local_stats.total_dims_scanned += - pano.level_width_floats; + pano.level_width_dims; // Refine distance size_t actual_level_width = std::min( - pano.level_width_floats, - d - level * pano.level_width_floats); + pano.level_width_dims, + d - level * pano.level_width_dims); float dot_product = fvec_inner_product( x_ptr, p_ptr, actual_level_width); if constexpr (is_sim) { @@ -931,8 +931,8 @@ void IndexFlatPanorama::search_subset( } cum_sum_offset++; - x_ptr += pano.level_width_floats; - p_ptr += pano.level_width_floats; + x_ptr += pano.level_width_dims; + p_ptr += pano.level_width_dims; } if (!pruned) { diff --git a/faiss/IndexFlat.h b/faiss/IndexFlat.h index 4cdc7d4386..7e10f05b25 100644 --- a/faiss/IndexFlat.h +++ b/faiss/IndexFlat.h @@ -104,7 +104,7 @@ struct IndexFlatPanorama : IndexFlat { const size_t batch_size; const size_t n_levels; std::vector cum_sums; - Panorama pano; + PanoramaFlat pano; /** * @param d 
dimensionality of the input vectors @@ -120,7 +120,7 @@ struct IndexFlatPanorama : IndexFlat { : IndexFlat(d_in, metric), batch_size(batch_size_in), n_levels(n_levels_in), - pano(code_size, n_levels_in, batch_size_in) { + pano(d_in, n_levels_in, batch_size_in) { FAISS_THROW_IF_NOT( metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); } diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index f87003a76c..8f2f5f3e8a 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -680,7 +680,7 @@ IndexHNSWFlatPanorama::IndexHNSWFlatPanorama( MetricType metric) : IndexHNSWFlat(d_in, M, metric), cum_sums(), - pano(d_in * sizeof(float), num_panorama_levels_in, 1), + pano(d_in, num_panorama_levels_in, 1), num_panorama_levels(num_panorama_levels_in) { // For now, we only support L2 distance. // Supporting dot product and cosine distance is a trivial addition @@ -696,7 +696,8 @@ IndexHNSWFlatPanorama::IndexHNSWFlatPanorama( void IndexHNSWFlatPanorama::add(idx_t n, const float* x) { idx_t n0 = ntotal; cum_sums.resize((ntotal + n) * (pano.n_levels + 1)); - pano.compute_cumulative_sums(cum_sums.data(), n0, n, x); + pano.compute_cumulative_sums( + cum_sums.data(), n0, n, reinterpret_cast(x)); IndexHNSWFlat::add(n, x); } diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h index a43828d428..17d54ecaa2 100644 --- a/faiss/IndexHNSW.h +++ b/faiss/IndexHNSW.h @@ -179,7 +179,7 @@ struct IndexHNSWFlatPanorama : IndexHNSWFlat { } std::vector cum_sums; - Panorama pano; + PanoramaFlat pano; const size_t num_panorama_levels; }; diff --git a/faiss/IndexIVFFlatPanorama.cpp b/faiss/IndexIVFFlatPanorama.cpp index 7bda245275..d05c73a049 100644 --- a/faiss/IndexIVFFlatPanorama.cpp +++ b/faiss/IndexIVFFlatPanorama.cpp @@ -39,7 +39,8 @@ IndexIVFFlatPanorama::IndexIVFFlatPanorama( // We construct the inverted lists here so that we can use the // level-oriented storage. This does not cause a leak as we constructed // IndexIVF first, with own_invlists set to false. 
- this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, n_levels); + auto* pano = new PanoramaFlat(d, n_levels, Panorama::kDefaultBatchSize); + this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano); this->own_invlists = own_invlists_in; } @@ -51,6 +52,7 @@ template struct IVFFlatScannerPanorama : InvertedListScanner { VectorDistance vd; const ArrayInvertedListsPanorama* storage; + const PanoramaFlat* pano_flat; using C = typename VectorDistance::C; static constexpr MetricType metric = VectorDistance::metric; @@ -61,10 +63,14 @@ struct IVFFlatScannerPanorama : InvertedListScanner { const IDSelector* sel_in) : InvertedListScanner(store_pairs_in, sel_in), vd(vd_in), - storage(storage_in) { + storage(storage_in), + pano_flat( + dynamic_cast( + storage_in->pano.get())) { + FAISS_THROW_IF_NOT(pano_flat); keep_max = vd.is_similarity; code_size = vd.d * sizeof(float); - cum_sums.resize(storage->n_levels + 1); + cum_sums.resize(pano_flat->n_levels + 1); } const float* xi = nullptr; @@ -72,7 +78,7 @@ struct IVFFlatScannerPanorama : InvertedListScanner { float q_norm = 0.0f; void set_query(const float* query) override { this->xi = query; - this->storage->pano.compute_query_cum_sums(query, cum_sums.data()); + pano_flat->compute_query_cum_sums(query, cum_sums.data()); q_norm = cum_sums[0] * cum_sums[0]; } @@ -96,22 +102,22 @@ struct IVFFlatScannerPanorama : InvertedListScanner { ResultHandler& handler) const override { size_t nup = 0; - const size_t n_batches = - (list_size + storage->kBatchSize - 1) / storage->kBatchSize; + const size_t bs = pano_flat->batch_size; + const size_t n_batches = (list_size + bs - 1) / bs; const float* cum_sums_data = storage->get_cum_sums(list_no); - std::vector exact_distances(storage->kBatchSize); - std::vector active_indices(storage->kBatchSize); + std::vector exact_distances(bs); + std::vector active_indices(bs); PanoramaStats local_stats; local_stats.reset(); for (size_t batch_no = 0; batch_no < n_batches; 
batch_no++) { - size_t batch_start = batch_no * storage->kBatchSize; + size_t batch_start = batch_no * bs; size_t num_active = with_metric_type(metric, [&]() { - return storage->pano.progressive_filter_batch( + return pano_flat->progressive_filter_batch( codes, cum_sums_data, xi, diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp new file mode 100644 index 0000000000..92862713f7 --- /dev/null +++ b/faiss/IndexIVFPQPanorama.cpp @@ -0,0 +1,231 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace faiss { + +/***************************************** + * Constructor + ******************************************/ + +IndexIVFPQPanorama::IndexIVFPQPanorama( + Index* quantizer, + size_t d, + size_t nlist, + size_t M, + size_t nbits_per_idx, + int n_levels, + size_t batch_size, + MetricType metric, + bool own_invlists) + : IndexIVFPQ(quantizer, d, nlist, M, nbits_per_idx, metric, false), + n_levels(n_levels), + batch_size(batch_size), + levels_size(d / n_levels) { + FAISS_THROW_IF_NOT_MSG( + M % n_levels == 0, "M must be divisible by n_levels"); + FAISS_THROW_IF_NOT_MSG( + batch_size % 64 == 0, "batch_size must be multiple of 64"); + FAISS_THROW_IF_NOT_MSG(nbits_per_idx == 8, "only 8-bit PQ codes supported"); + FAISS_THROW_IF_NOT_MSG( + M == code_size, "M must equal code_size for 8-bit PQ"); + FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only L2 metric supported"); + + auto* pano = + new PanoramaPQ(d, code_size, n_levels, batch_size, &pq, quantizer); + this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano); + this->own_invlists = own_invlists; +} + +/***************************************** + * Panorama scanner — overrides scan_codes with batch processing + 
******************************************/ + +namespace { + +using idx_t = faiss::idx_t; + +template +struct IVFPQScannerPanorama : InvertedListScanner { + const IndexIVFPQPanorama& index; + const ProductQuantizer& pq; + const ArrayInvertedListsPanorama* storage; + const PanoramaPQ* pano_pq; + + // Query state + const float* qi = nullptr; + std::vector query_cum_norms; + std::vector sim_table_2; + + // Per-list state + float coarse_dis = 0; + + IVFPQScannerPanorama( + const IndexIVFPQPanorama& index, + const ArrayInvertedListsPanorama* storage, + bool store_pairs, + const IDSelector* sel) + : InvertedListScanner(store_pairs, sel), + index(index), + pq(index.pq), + storage(storage), + pano_pq(dynamic_cast(storage->pano.get())) { + FAISS_THROW_IF_NOT(pano_pq); + this->keep_max = is_similarity_metric(index.metric_type); + this->code_size = pq.code_size; + query_cum_norms.resize(index.n_levels + 1); + sim_table_2.resize(pq.M * pq.ksub); + } + + void set_query(const float* query) override { + this->qi = query; + + FAISS_ASSERT(index.by_residual); + FAISS_ASSERT(index.use_precomputed_table == 1); + + pq.compute_inner_prod_table(qi, sim_table_2.data()); + + // The PQ distance LUT is -2 * inner_prod_table; apply in-place + // so scan_codes() can use sim_table_2 directly. 
+ const size_t n = pq.M * pq.ksub; + for (size_t i = 0; i < n; i++) { + sim_table_2[i] *= -2.0f; + } + + pano_pq->compute_query_cum_sums(qi, query_cum_norms.data()); + } + + void set_list(idx_t list_no, float coarse_dis) override { + this->list_no = list_no; + this->coarse_dis = coarse_dis; + } + + float distance_to_code(const uint8_t* code) const override { + FAISS_THROW_MSG("IndexIVFPQPanorama does not support distance_to_code"); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* /* codes (column-major in storage) */, + const idx_t* ids, + float* distances, + idx_t* labels, + size_t k) const override { + size_t nup = 0; + + const size_t bs = index.batch_size; + const size_t ls = pano_pq->level_width_bytes; + + const size_t n_batches = (list_size + bs - 1) / bs; + const uint8_t* col_codes = storage->get_codes(list_no); + const float* list_cum_sums = storage->get_cum_sums(list_no); + const float* list_init_dists = storage->get_init_dists(list_no); + + // Scratch buffers. + std::vector exact_distances(bs); + std::vector bitset(bs); + std::vector active_indices(bs); + std::vector compressed_codes(bs * ls); + float dis0 = coarse_dis; + + PanoramaStats local_stats; + local_stats.reset(); + + for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { + size_t num_active = pano_pq->progressive_filter_batch( + col_codes, + list_cum_sums, + list_init_dists, + sim_table_2.data(), + query_cum_norms.data(), + dis0, + list_size, + batch_no, + ids, + sel, + exact_distances, + active_indices, + bitset, + compressed_codes, + distances[0], + local_stats); + + // Insert surviving candidates into heap. + for (size_t i = 0; i < num_active; i++) { + float dis = dis0 + exact_distances[i]; + if (C::cmp(distances[0], dis)) { + idx_t id = store_pairs + ? 
lo_build(list_no, active_indices[i]) + : ids[active_indices[i]]; + heap_replace_top(k, distances, labels, dis, id); + nup++; + } + } + } + + indexPanorama_stats.add(local_stats); + return nup; + } + + size_t scan_codes( + size_t n, + const uint8_t* codes, + const idx_t* ids, + ResultHandler& handler) const override { + FAISS_THROW_MSG( + "IndexIVFPQPanorama: ResultHandler scan_codes not supported"); + } +}; + +} // anonymous namespace + +/***************************************** + * get_InvertedListScanner + ******************************************/ + +InvertedListScanner* IndexIVFPQPanorama::get_InvertedListScanner( + bool store_pairs, + const IDSelector* sel, + const IVFSearchParameters*) const { + FAISS_THROW_IF_NOT_MSG( + metric_type == METRIC_L2, "only L2 metric supported"); + FAISS_THROW_IF_NOT_MSG( + use_precomputed_table == 1, + "Panorama PQ requires use_precomputed_table == 1"); + FAISS_THROW_IF_NOT_MSG(pq.nbits == 8, "only 8-bit PQ codes supported"); + FAISS_THROW_IF_NOT_MSG(by_residual, "Panorama PQ requires by_residual"); + FAISS_THROW_IF_NOT_MSG( + polysemous_ht == 0, "Panorama PQ does not support polysemous"); + + const auto* storage = + dynamic_cast(invlists); + FAISS_THROW_IF_NOT_MSG( + storage, "IndexIVFPQPanorama requires ArrayInvertedListsPanorama"); + + if (sel) { + return new IVFPQScannerPanorama, true>( + *this, storage, store_pairs, sel); + } else { + return new IVFPQScannerPanorama, false>( + *this, storage, store_pairs, sel); + } +} + +} // namespace faiss diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h new file mode 100644 index 0000000000..8bcc97b624 --- /dev/null +++ b/faiss/IndexIVFPQPanorama.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef FAISS_INDEX_IVFPQ_PANORAMA_H +#define FAISS_INDEX_IVFPQ_PANORAMA_H + +#include + +#include +#include + +namespace faiss { + +/// Panorama adaptation of IndexIVFPQ following +/// https://www.arxiv.org/pdf/2510.00566. +/// +/// IDEA: +/// Panorama adapts the storage layout within each cluster and uses +/// Cauchy-Schwarz pruning to skip unnecessary distance computations. +/// Combined with orthogonal transforms upstream that concentrate signal +/// energy in the early PQ subquantizers (like PCA), Panorama can prune +/// the majority of candidates after computing only a fraction of the +/// full PQ distance. +/// +/// STORAGE LAYOUT: +/// Standard IVFPQ stores codes row-major: [point0_code, point1_code, ...]. +/// Panorama transposes codes into column-major within each batch: +/// for each batch of `batch_size` points, codes are stored as +/// M columns of `batch_size` bytes each. The M columns are grouped +/// into `n_levels` levels of `level_width_bytes` columns, enabling incremental +/// distance computation level-by-level. +/// +/// Storage is managed by ArrayInvertedListsPanorama with a PanoramaPQ +/// instance that handles code transposition and cumulative sum computation +/// (via PQ decoding) on insertion. +/// +/// OVERHEAD: +/// Panorama precomputes per-point cumulative residual norms at insertion +/// time. Storage overhead is (n_levels + 1) floats per point for +/// cum_sums. Initial exact distances are computed on-the-fly during +/// search using the precomputed_table (no extra per-point storage). +/// +/// CONSTRAINTS: +/// - Only L2 metric is supported (for now). +/// - Only 8-bit PQ codes (nbits_per_idx == 8). +/// - M must be divisible by n_levels. +/// - batch_size must be a multiple of 64. +/// - use_precomputed_table must be 1. +/// +/// NOTE: +/// We inherit from IndexIVFPQ and override only get_InvertedListScanner(). +/// The base IndexIVF::search_preassigned() handles all search +/// orchestration — no search code is duplicated. 
+/// Storage (transposition + cum_sums) is handled by +/// ArrayInvertedListsPanorama, so no add() override is needed. +struct IndexIVFPQPanorama : public IndexIVFPQ { + int n_levels; + size_t batch_size; + + size_t levels_size; + + IndexIVFPQPanorama( + Index* quantizer, + size_t d, + size_t nlist, + size_t M, + size_t nbits_per_idx, + int n_levels, + size_t batch_size = Panorama::kDefaultBatchSize, + MetricType metric = METRIC_L2, + bool own_invlists = true); + + IndexIVFPQPanorama() = default; + + InvertedListScanner* get_InvertedListScanner( + bool store_pairs, + const IDSelector* sel, + const IVFSearchParameters* params) const override; +}; + +} // namespace faiss + +#endif diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp index 381adff002..0dd07a374a 100644 --- a/faiss/impl/HNSW.cpp +++ b/faiss/impl/HNSW.cpp @@ -864,10 +864,10 @@ int search_from_candidates_panorama( while (curr_panorama_level < num_panorama_levels && batch_size > 0) { float query_cum_norm = query_cum_sums[curr_panorama_level + 1]; - size_t start_dim = curr_panorama_level * - panorama_index->pano.level_width_floats; + size_t start_dim = + curr_panorama_level * panorama_index->pano.level_width_dims; size_t end_dim = (curr_panorama_level + 1) * - panorama_index->pano.level_width_floats; + panorama_index->pano.level_width_dims; end_dim = std::min(end_dim, static_cast(panorama_index->d)); size_t i = 0; diff --git a/faiss/impl/Panorama.cpp b/faiss/impl/Panorama.cpp index 34211eceba..dd105f5ec4 100644 --- a/faiss/impl/Panorama.cpp +++ b/faiss/impl/Panorama.cpp @@ -26,7 +26,7 @@ inline void compute_cum_sums_impl( float* output, size_t d, size_t n_levels, - size_t level_width_floats, + size_t level_width_dims, OffsetFunc&& get_offset) { // Iterate backwards through levels, accumulating sum as we go. 
// This avoids computing the suffix sum for each vector, which takes @@ -34,9 +34,9 @@ inline void compute_cum_sums_impl( float sum = 0.0f; for (int level = n_levels - 1; level >= 0; level--) { - size_t start_idx = level * level_width_floats; + size_t start_idx = level * level_width_dims; size_t end_idx = std::min( - (level + 1) * level_width_floats, static_cast(d)); + (level + 1) * level_width_dims, static_cast(d)); for (size_t j = start_idx; j < end_idx; j++) { sum += vector[j] * vector[j]; @@ -51,14 +51,16 @@ inline void compute_cum_sums_impl( } // namespace /************************************************************** - * Panorama structure implementation + * Panorama base class implementation **************************************************************/ Panorama::Panorama( + size_t d_in, size_t code_size_in, size_t n_levels_in, size_t batch_size_in) - : code_size(code_size_in), + : d(d_in), + code_size(code_size_in), n_levels(n_levels_in), batch_size(batch_size_in) { set_derived_values(); @@ -66,9 +68,7 @@ Panorama::Panorama( void Panorama::set_derived_values() { FAISS_THROW_IF_NOT_MSG(n_levels > 0, "Panorama: n_levels must be > 0"); - this->d = code_size / sizeof(float); - this->level_width_floats = ((d + n_levels - 1) / n_levels); - this->level_width = this->level_width_floats * sizeof(float); + level_width_bytes = (code_size + n_levels - 1) / n_levels; } /** @@ -93,10 +93,10 @@ void Panorama::copy_codes_to_level_layout( // Copy entry into level-oriented layout for this batch. 
size_t batch_offset = batch_no * batch_size * code_size; for (size_t level = 0; level < n_levels; level++) { - size_t level_offset = level * level_width * batch_size; - size_t start_byte = level * level_width; - size_t actual_level_width = - std::min(level_width, code_size - level * level_width); + size_t level_offset = level * level_width_bytes * batch_size; + size_t start_byte = level * level_width_bytes; + size_t actual_level_width = std::min( + level_width_bytes, code_size - level * level_width_bytes); const uint8_t* src = code + entry_idx * code_size + start_byte; uint8_t* dest = codes + batch_offset + level_offset + @@ -107,38 +107,12 @@ void Panorama::copy_codes_to_level_layout( } } -void Panorama::compute_cumulative_sums( - float* cumsum_base, - size_t offset, - size_t n_entry, - const float* vectors) const { - for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { - size_t current_pos = offset + entry_idx; - size_t batch_no = current_pos / batch_size; - size_t pos_in_batch = current_pos % batch_size; - - const float* vector = vectors + entry_idx * d; - size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1); - - auto get_offset = [&](size_t level) { - return cumsum_batch_offset + level * batch_size + pos_in_batch; - }; - - compute_cum_sums_impl( - vector, - cumsum_base, - d, - n_levels, - level_width_floats, - get_offset); - } -} - void Panorama::compute_query_cum_sums(const float* query, float* query_cum_sums) const { + size_t level_dims = (d + n_levels - 1) / n_levels; auto get_offset = [](size_t level) { return level; }; compute_cum_sums_impl( - query, query_cum_sums, d, n_levels, level_width_floats, get_offset); + query, query_cum_sums, d, n_levels, level_dims, get_offset); } void Panorama::reconstruct(idx_t key, float* recons, const uint8_t* codes_base) @@ -150,12 +124,12 @@ void Panorama::reconstruct(idx_t key, float* recons, const uint8_t* codes_base) size_t batch_offset = batch_no * batch_size * code_size; for (size_t level = 0; 
level < n_levels; level++) { - size_t level_offset = level * level_width * batch_size; + size_t level_offset = level * level_width_bytes * batch_size; const uint8_t* src = codes_base + batch_offset + level_offset + - pos_in_batch * level_width; - uint8_t* dest = recons_buffer + level * level_width; - size_t copy_size = - std::min(level_width, code_size - level * level_width); + pos_in_batch * level_width_bytes; + uint8_t* dest = recons_buffer + level * level_width_bytes; + size_t copy_size = std::min( + level_width_bytes, code_size - level * level_width_bytes); memcpy(dest, src, copy_size); } } @@ -182,9 +156,9 @@ void Panorama::copy_entry( for (size_t level = 0; level < n_levels; level++) { // Copy code - size_t level_offset = level * level_width * batch_size; - size_t actual_level_width = - std::min(level_width, code_size - level * level_width); + size_t level_offset = level * level_width_bytes * batch_size; + size_t actual_level_width = std::min( + level_width_bytes, code_size - level * level_width_bytes); const uint8_t* src = src_codes + src_batch_offset + level_offset + src_pos_in_batch * actual_level_width; @@ -202,4 +176,38 @@ void Panorama::copy_entry( dest_cum_sums[dest_offset] = src_cum_sums[src_offset]; } } + +/************************************************************** + * PanoramaFlat implementation + **************************************************************/ + +PanoramaFlat::PanoramaFlat(size_t d, size_t n_levels, size_t batch_size) + : Panorama(d, d * sizeof(float), n_levels, batch_size) { + level_width_dims = (d + n_levels - 1) / n_levels; + level_width_bytes = level_width_dims * sizeof(float); +} + +void PanoramaFlat::compute_cumulative_sums( + float* cumsum_base, + size_t offset, + size_t n_entry, + const uint8_t* code) const { + const float* vectors = reinterpret_cast(code); + for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { + size_t current_pos = offset + entry_idx; + size_t batch_no = current_pos / batch_size; + size_t 
pos_in_batch = current_pos % batch_size; + + const float* vector = vectors + entry_idx * d; + size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1); + + auto get_offset = [&](size_t level) { + return cumsum_batch_offset + level * batch_size + pos_in_batch; + }; + + compute_cum_sums_impl( + vector, cumsum_base, d, n_levels, level_width_dims, get_offset); + } +} + } // namespace faiss diff --git a/faiss/impl/Panorama.h b/faiss/impl/Panorama.h index 79a23a64a7..58e889a1f0 100644 --- a/faiss/impl/Panorama.h +++ b/faiss/impl/Panorama.h @@ -40,35 +40,48 @@ namespace faiss { * Coupled with the appropriate orthogonal PreTransform (e.g. PCA, Cayley, * etc.), Panorama can prune the vast majority of dimensions, greatly * accelerating the refinement stage. + * + * This is the abstract base class. Concrete subclasses (PanoramaFlat, + * PanoramaPQ) implement compute_cumulative_sums and progressive_filter_batch + * for their respective code formats. */ struct Panorama { + static constexpr size_t kDefaultBatchSize = 128; + size_t d = 0; size_t code_size = 0; size_t n_levels = 0; - size_t level_width = 0; - size_t level_width_floats = 0; + size_t level_width_bytes = 0; size_t batch_size = 0; - explicit Panorama(size_t code_size, size_t n_levels, size_t batch_size); + Panorama() = default; + Panorama(size_t d, size_t code_size, size_t n_levels, size_t batch_size); + + virtual ~Panorama() = default; void set_derived_values(); /// Helper method to copy codes into level-oriented batch layout at a given /// offset in the list. - void copy_codes_to_level_layout( + /// PanoramaFlat uses row-major within each level (point bytes contiguous). + /// PanoramaPQ overrides to use column-major (subquantizer columns + /// contiguous). + virtual void copy_codes_to_level_layout( uint8_t* codes, size_t offset, size_t n_entry, const uint8_t* code); - /// Helper method to compute the cumulative sums of the codes. 
- /// The cumsums also follow the level-oriented batch layout to minimize the + /// Compute the cumulative sums (suffix norms) for database vectors. + /// The cumsums follow the level-oriented batch layout to minimize the /// number of random memory accesses. - void compute_cumulative_sums( + /// Subclasses interpret the raw code bytes according to their format: + /// PanoramaFlat reinterprets as float*, PanoramaPQ decodes via PQ. + virtual void compute_cumulative_sums( float* cumsum_base, size_t offset, size_t n_entry, - const float* vectors) const; + const uint8_t* code) const = 0; /// Compute the cumulative sums of the query vector. void compute_query_cum_sums(const float* query, float* query_cum_sums) @@ -83,7 +96,32 @@ struct Panorama { size_t dest_idx, size_t src_idx) const; - /// Panorama's core progressive filtering algorithm: + virtual void reconstruct( + idx_t key, + float* recons, + const uint8_t* codes_base) const; +}; + +/** + * Panorama for flat (uncompressed) float vectors. + * + * Codes are raw float vectors (code_size = d * sizeof(float)). + * compute_cumulative_sums interprets codes as floats. + * progressive_filter_batch computes dot products on raw float storage. + */ +struct PanoramaFlat : Panorama { + size_t level_width_dims = 0; + + PanoramaFlat() = default; + PanoramaFlat(size_t d, size_t n_levels, size_t batch_size); + + void compute_cumulative_sums( + float* cumsum_base, + size_t offset, + size_t n_entry, + const uint8_t* code) const override; + + /// Panorama's core progressive filtering algorithm for flat codes: /// Process vectors in batches for cache efficiency. For each batch: /// 1. Apply ID selection filter and initialize distances /// (||y||^2 + ||x||^2). 
@@ -113,12 +151,10 @@ struct Panorama { std::vector& exact_distances, float threshold, PanoramaStats& local_stats) const; - - void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) const; }; template -size_t Panorama::progressive_filter_batch( +size_t PanoramaFlat::progressive_filter_batch( const uint8_t* codes_base, const float* cum_sums, const float* query, @@ -173,18 +209,18 @@ size_t Panorama::progressive_filter_batch( float query_cum_norm = query_cum_sums[level + 1]; - size_t level_offset = level * level_width * batch_size; + size_t level_offset = level * level_width_bytes * batch_size; const float* level_storage = (const float*)(storage_base + level_offset); size_t next_active = 0; for (size_t i = 0; i < num_active; i++) { uint32_t idx = active_indices[i]; - size_t actual_level_width = std::min( - level_width_floats, d - level * level_width_floats); + size_t actual_level_width = + std::min(level_width_dims, d - level * level_width_dims); const float* yj = level_storage + idx * actual_level_width; - const float* query_level = query + level * level_width_floats; + const float* query_level = query + level * level_width_dims; float dot_product = fvec_inner_product(query_level, yj, actual_level_width); diff --git a/faiss/impl/PanoramaPQ.cpp b/faiss/impl/PanoramaPQ.cpp new file mode 100644 index 0000000000..f535ae9b84 --- /dev/null +++ b/faiss/impl/PanoramaPQ.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +#include + +namespace faiss { + +void PanoramaPQ::copy_codes_to_level_layout( + uint8_t* codes, + size_t offset, + size_t n_entry, + const uint8_t* code) { + const size_t ls = level_width_bytes; + const size_t bs = batch_size; + + for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { + size_t current_pos = offset + entry_idx; + size_t batch_no = current_pos / bs; + size_t pos_in_batch = current_pos % bs; + size_t batch_offset = batch_no * bs * code_size; + + for (size_t level = 0; level < n_levels; level++) { + size_t level_offset = level * ls * bs; + size_t start_byte = level * ls; + + for (size_t li = 0; li < ls && (start_byte + li) < code_size; + li++) { + codes[batch_offset + level_offset + li * bs + pos_in_batch] = + code[entry_idx * code_size + start_byte + li]; + } + } + } +} + +void PanoramaPQ::reconstruct( + idx_t key, + float* recons, + const uint8_t* codes_base) const { + uint8_t* recons_buffer = reinterpret_cast(recons); + const size_t ls = level_width_bytes; + const size_t bs = batch_size; + + size_t batch_no = key / bs; + size_t pos_in_batch = key % bs; + size_t batch_offset = batch_no * bs * code_size; + + for (size_t level = 0; level < n_levels; level++) { + size_t level_offset = level * ls * bs; + size_t start_byte = level * ls; + + for (size_t li = 0; li < ls && (start_byte + li) < code_size; li++) { + recons_buffer[start_byte + li] = codes_base + [batch_offset + level_offset + li * bs + pos_in_batch]; + } + } +} + +PanoramaPQ::PanoramaPQ( + size_t d, + size_t code_size, + size_t n_levels, + size_t batch_size, + const ProductQuantizer* pq, + const Index* quantizer) + : Panorama(d, code_size, n_levels, batch_size), + pq(pq), + quantizer(quantizer), + levels_size(d / n_levels) { + FAISS_THROW_IF_NOT_MSG( + code_size % n_levels == 0, + "PanoramaPQ: code_size must be divisible by n_levels"); + FAISS_THROW_IF_NOT_MSG(pq != nullptr, "PanoramaPQ: pq must not be null"); +} + +void 
PanoramaPQ::compute_cumulative_sums( + float* cumsum_base, + size_t offset, + size_t n_entry, + const uint8_t* code) const { + for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { + size_t current_pos = offset + entry_idx; + size_t batch_no = current_pos / batch_size; + size_t pos_in_batch = current_pos % batch_size; + + // Decode PQ code to float vector. + std::vector vec(d); + pq->decode(code + entry_idx * code_size, vec.data()); + + // Compute suffix sums of squared norms. + std::vector suffix(d + 1, 0.0f); + for (int j = d - 1; j >= 0; j--) { + suffix[j] = suffix[j + 1] + vec[j] * vec[j]; + } + + // Write into batch-oriented layout. + size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1); + for (size_t level = 0; level < n_levels; level++) { + size_t start_idx = level * levels_size; + size_t out_offset = + cumsum_batch_offset + level * batch_size + pos_in_batch; + cumsum_base[out_offset] = + start_idx < d ? std::sqrt(suffix[start_idx]) : 0.0f; + } + + size_t last_offset = + cumsum_batch_offset + n_levels * batch_size + pos_in_batch; + cumsum_base[last_offset] = 0.0f; + } +} + +void PanoramaPQ::compute_init_distances( + float* init_dists_base, + size_t list_no, + size_t offset, + size_t n_entry, + const uint8_t* code) const { + FAISS_THROW_IF_NOT_MSG( + quantizer != nullptr, + "PanoramaPQ: quantizer required for compute_init_distances"); + + std::vector centroid(d); + quantizer->reconstruct(list_no, centroid.data()); + + for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { + std::vector vec(d); + pq->decode(code + entry_idx * code_size, vec.data()); + + float init_dist = 0.0f; + for (size_t j = 0; j < d; j++) { + init_dist += vec[j] * vec[j] + 2 * vec[j] * centroid[j]; + } + + size_t point_idx = offset + entry_idx; + init_dists_base[point_idx] = init_dist; + } +} + +} // namespace faiss diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h new file mode 100644 index 0000000000..b94623cb5e --- /dev/null +++ 
b/faiss/impl/PanoramaPQ.h @@ -0,0 +1,209 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace faiss { + +/** + * Panorama for PQ-compressed vectors. + * + * Codes are PQ codes (code_size = M bytes for 8-bit PQ). + * compute_cumulative_sums decodes via PQ then computes suffix norms. + * progressive_filter_batch uses LUT accumulation with panorama_kernels. + */ +struct PanoramaPQ : Panorama { + const ProductQuantizer* pq = nullptr; + const Index* quantizer = nullptr; + size_t levels_size = 0; + + PanoramaPQ() = default; + PanoramaPQ( + size_t d, + size_t code_size, + size_t n_levels, + size_t batch_size, + const ProductQuantizer* pq, + const Index* quantizer = nullptr); + + void copy_codes_to_level_layout( + uint8_t* codes, + size_t offset, + size_t n_entry, + const uint8_t* code) override; + + void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) + const override; + + void compute_cumulative_sums( + float* cumsum_base, + size_t offset, + size_t n_entry, + const uint8_t* code) const override; + + /// Precompute per-point init distances: ||r||^2 + 2. + /// Requires quantizer to be set. Layout is flat per-list, + /// padded to batch_size boundaries. + void compute_init_distances( + float* init_dists_base, + size_t list_no, + size_t offset, + size_t n_entry, + const uint8_t* code) const; + + /// Progressive filtering for PQ codes: processes one batch. + /// + /// Initializes exact_distances from precomputed init_dists + /// (||r||^2 + 2), then refines with the query-specific + /// sim_table_2 level-by-level with Cauchy-Schwarz pruning. + /// + /// @param col_codes Column-major codes for this inverted list. + /// @param list_cum_sums Cumulative sums for this inverted list. 
+ /// @param init_dists Precomputed init distances for this list. + /// @param sim_table_2 -2 * inner_prod_table (query-specific LUT). + /// @param query_cum_norms Query suffix norms per level. + /// @param coarse_dis Coarse distance (dis0) for this list. + /// @param list_size Total number of vectors in this list. + /// @param batch_no Which batch to process. + /// @param ids ID array for the inverted list. + /// @param sel ID selector for filtering (may be nullptr). + /// @param exact_distances [out] Scratch buffer for partial distances. + /// @param active_indices [out] Scratch buffer for survivor indices. + /// @param bitset Scratch buffer for code compression. + /// @param compressed_codes Scratch buffer for compressed codes. + /// @param threshold Current heap threshold for pruning. + /// @param local_stats [out] Accumulated pruning statistics. + /// @return Number of surviving candidates in active_indices. + template + size_t progressive_filter_batch( + const uint8_t* col_codes, + const float* list_cum_sums, + const float* init_dists, + const float* sim_table_2, + const float* query_cum_norms, + float coarse_dis, + size_t list_size, + size_t batch_no, + const idx_t* ids, + const IDSelector* sel, + std::vector& exact_distances, + std::vector& active_indices, + std::vector& bitset, + std::vector& compressed_codes, + float threshold, + PanoramaStats& local_stats) const { + const size_t bs = batch_size; + const size_t ls = level_width_bytes; + const size_t ksub = pq->ksub; + + size_t curr_batch_size = std::min(list_size - batch_no * bs, bs); + size_t b_offset = batch_no * bs; + + // Initialize active set with ID-filtered vectors. 
+ std::fill(bitset.begin(), bitset.end(), 0); + size_t num_active = 0; + const float* batch_init = init_dists + b_offset; + for (size_t i = 0; i < curr_batch_size; i++) { + size_t global_idx = b_offset + i; + if (use_sel) { + idx_t id = ids[global_idx]; + if (!sel->is_member(id)) { + continue; + } + } + active_indices[num_active] = global_idx; + exact_distances[num_active] = batch_init[i]; + bitset[i] = 1; + num_active++; + } + + if (num_active == 0) { + return 0; + } + + const uint8_t* batch_codes = col_codes + b_offset * code_size; + + const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1); + + size_t next_num_active = num_active; + size_t batch_offset = batch_no * bs; + const size_t total_active = next_num_active; + + local_stats.total_dims += total_active * n_levels; + + for (size_t level = 0; level < n_levels && next_num_active > 0; + level++) { + local_stats.total_dims_scanned += next_num_active; + + size_t level_sim_offset = level * ksub * ls; + + float query_cum_norm = 2 * query_cum_norms[level + 1]; + + const float* cum_sums_level = batch_cums + bs * (level + 1); + const uint8_t* codes_level = batch_codes + bs * ls * level; + + const float* sim_table_level = sim_table_2 + level_sim_offset; + + bool is_sparse = next_num_active < bs / 16; + + size_t num_active_for_filtering = 0; + if (is_sparse) { + for (size_t li = 0; li < ls; li++) { + size_t byte_off = li * bs; + const float* chunk_sim = sim_table_level + li * ksub; + for (size_t i = 0; i < next_num_active; i++) { + size_t real_idx = active_indices[i] - batch_offset; + exact_distances[i] += + chunk_sim[codes_level[byte_off + real_idx]]; + } + } + num_active_for_filtering = next_num_active; + } else { + auto [cc, na] = panorama_kernels::process_code_compression( + next_num_active, + bs, + ls, + compressed_codes.data(), + bitset.data(), + codes_level); + + panorama_kernels::process_level( + ls, + bs, + na, + const_cast(sim_table_level), + cc, + exact_distances.data()); + 
num_active_for_filtering = na; + } + + next_num_active = panorama_kernels::process_filtering( + num_active_for_filtering, + exact_distances.data(), + active_indices.data(), + const_cast(cum_sums_level), + bitset.data(), + batch_offset, + coarse_dis, + query_cum_norm, + threshold); + } + + return next_num_active; + } +}; + +} // namespace faiss diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index f0c433f544..19b9f4b3f8 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,7 @@ #include #include #include +#include #ifdef FAISS_ENABLE_SVS #include #include @@ -480,18 +482,18 @@ std::unique_ptr read_InvertedLists_up( FAISS_CHECK_DESERIALIZATION_LOOP_LIMIT(nlist, "ilpn nlist"); READ1(code_size); READ1(n_levels); + constexpr size_t bs = Panorama::kDefaultBatchSize; FAISS_THROW_IF_NOT_FMT( - n_levels > 0, "invalid ilpn n_levels %zd", n_levels); + n_levels > 0, "invalid ilpn n_levels %zd", n_levels); + auto* pano = new PanoramaFlat(code_size / sizeof(float), n_levels, bs); auto ailp = std::make_unique( - nlist, code_size, n_levels); + nlist, code_size, pano); std::vector sizes(nlist); read_ArrayInvertedLists_sizes(f, sizes); + for (size_t i = 0; i < nlist; i++) { ailp->ids[i].resize(sizes[i]); - size_t num_elems = - ((sizes[i] + ArrayInvertedListsPanorama::kBatchSize - 1) / - ArrayInvertedListsPanorama::kBatchSize) * - ArrayInvertedListsPanorama::kBatchSize; + size_t num_elems = ((sizes[i] + bs - 1) / bs) * bs; ailp->codes[i].resize(num_elems * code_size); ailp->cum_sums[i].resize(num_elems * (n_levels + 1)); } @@ -1535,6 +1537,60 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { READVECTOR(ivsp->trained); read_InvertedLists(*ivsp, f, io_flags); idx = std::move(ivsp); + } else if (h == fourcc("IwPP")) { + auto ivpp = std::make_unique(); + read_ivf_header(ivpp.get(), f); + READ1(ivpp->by_residual); + READ1(ivpp->code_size); + 
read_ProductQuantizer(&ivpp->pq, f); + READ1(ivpp->n_levels); + READ1(ivpp->batch_size); + ivpp->levels_size = ivpp->d / ivpp->n_levels; + read_InvertedLists(*ivpp, f, io_flags); + // The "ilpn" reader creates a PanoramaFlat placeholder; replace + // it with PanoramaPQ now that we have the ProductQuantizer. + auto* storage = + dynamic_cast(ivpp->invlists); + if (storage) { + auto* pano_pq = new PanoramaPQ( + ivpp->d, + ivpp->code_size, + ivpp->n_levels, + ivpp->batch_size, + &ivpp->pq, + ivpp->quantizer); + storage->pano.reset(pano_pq); + + // Recompute init_dists from stored codes + quantizer. + for (size_t list_no = 0; list_no < ivpp->nlist; list_no++) { + size_t list_size = storage->ids[list_no].size(); + if (list_size == 0) + continue; + size_t bs = pano_pq->batch_size; + size_t padded = ((list_size + bs - 1) / bs) * bs; + storage->init_dists[list_no].resize(padded); + + // Reconstruct row-major codes, then compute init distances. + std::vector row_code(ivpp->code_size); + for (size_t i = 0; i < list_size; i++) { + pano_pq->reconstruct( + i, + reinterpret_cast(row_code.data()), + storage->codes[list_no].data()); + pano_pq->compute_init_distances( + storage->init_dists[list_no].data(), + list_no, + i, + 1, + row_code.data()); + } + } + } + if (ivpp->is_trained) { + ivpp->use_precomputed_table = 1; + ivpp->precompute_table(); + } + idx = std::move(ivpp); } else if ( h == fourcc("IvPQ") || h == fourcc("IvQR") || h == fourcc("IwPQ") || h == fourcc("IwQR")) { @@ -1666,8 +1722,8 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { size_t nlevels; READ1(nlevels); const_cast(idx_panorama->num_panorama_levels) = nlevels; - const_cast(idx_panorama->pano) = - Panorama(idx_panorama->d * sizeof(float), nlevels, 1); + const_cast(idx_panorama->pano) = + PanoramaFlat(idx_panorama->d, nlevels, 1); READVECTOR(idx_panorama->cum_sums); } if (h == fourcc("IHNc") || h == fourcc("IHc2")) { diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 
03cc9bdd69..66f0ed325f 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -273,7 +274,7 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) { WRITE1(h); WRITE1(ailp->nlist); WRITE1(ailp->code_size); - WRITE1(ailp->n_levels); + WRITE1(ailp->pano->n_levels); uint32_t list_type = fourcc("full"); WRITE1(list_type); std::vector sizes; @@ -774,6 +775,18 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) { WRITE1(ivsp->threshold_type); WRITEVECTOR(ivsp->trained); write_InvertedLists(ivsp->invlists, f); + } else if ( + const IndexIVFPQPanorama* ivpp = + dynamic_cast(idx)) { + uint32_t h = fourcc("IwPP"); + WRITE1(h); + write_ivf_header(ivpp, f); + WRITE1(ivpp->by_residual); + WRITE1(ivpp->code_size); + write_ProductQuantizer(&ivpp->pq, f); + WRITE1(ivpp->n_levels); + WRITE1(ivpp->batch_size); + write_InvertedLists(ivpp->invlists, f); } else if (const IndexIVFPQ* ivpq = dynamic_cast(idx)) { const IndexIVFPQR* ivfpqr = dynamic_cast(idx); diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp new file mode 100644 index 0000000000..070475b067 --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// AVX2 implementations of Panorama kernels. +// Uses 256-bit gather for process_level, scalar filtering (no +// compress instruction in AVX2), and BMI2 PEXT/PDEP for code +// compression where available. 
+ +#ifdef COMPILE_SIMD_AVX2 + +#include + +#include + +#include + +namespace faiss { +namespace panorama_kernels { + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +void process_level_impl( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + size_t byte_idx = 0; + + // Process 4 chunks at a time to amortize loop overhead and keep + // the accumulator in registers across chunks. + for (; byte_idx + 3 < level_width_bytes; byte_idx += 4) { + size_t byte_offset0 = (byte_idx + 0) * max_batch_size; + size_t byte_offset1 = (byte_idx + 1) * max_batch_size; + size_t byte_offset2 = (byte_idx + 2) * max_batch_size; + size_t byte_offset3 = (byte_idx + 3) * max_batch_size; + + float* sim_table0 = sim_table + (byte_idx + 0) * 256; + float* sim_table1 = sim_table + (byte_idx + 1) * 256; + float* sim_table2 = sim_table + (byte_idx + 2) * 256; + float* sim_table3 = sim_table + (byte_idx + 3) * 256; + + size_t batch_idx = 0; + for (; batch_idx + 7 < num_active; batch_idx += 8) { + __m256 acc = _mm256_loadu_ps(exact_distances + batch_idx); + + // Load 8 byte codes, zero-extend to 32-bit indices. 
+ __m128i raw0 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset0 + batch_idx)); + __m128i raw1 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset1 + batch_idx)); + __m128i raw2 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset2 + batch_idx)); + __m128i raw3 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset3 + batch_idx)); + + __m256i codes0 = _mm256_cvtepu8_epi32(raw0); + __m256i codes1 = _mm256_cvtepu8_epi32(raw1); + __m256i codes2 = _mm256_cvtepu8_epi32(raw2); + __m256i codes3 = _mm256_cvtepu8_epi32(raw3); + + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table0, codes0, sizeof(float))); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table1, codes1, sizeof(float))); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table2, codes2, sizeof(float))); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table3, codes3, sizeof(float))); + + _mm256_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; batch_idx < num_active; batch_idx++) { + float acc = exact_distances[batch_idx]; + acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]]; + acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]]; + acc += sim_table2[compressed_codes[byte_offset2 + batch_idx]]; + acc += sim_table3[compressed_codes[byte_offset3 + batch_idx]]; + exact_distances[batch_idx] = acc; + } + } + + for (; byte_idx < level_width_bytes; byte_idx++) { + size_t byte_offset = byte_idx * max_batch_size; + float* sim_table_ptr = sim_table + byte_idx * 256; + + size_t batch_idx = 0; + for (; batch_idx + 7 < num_active; batch_idx += 8) { + __m256 acc = _mm256_loadu_ps(exact_distances + batch_idx); + __m128i raw = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset + batch_idx)); + __m256i codes = _mm256_cvtepu8_epi32(raw); + __m256 m_dist = + _mm256_i32gather_ps(sim_table_ptr, codes, sizeof(float)); + acc = _mm256_add_ps(acc, m_dist); + _mm256_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; 
batch_idx < num_active; batch_idx++) { + exact_distances[batch_idx] += + sim_table_ptr[compressed_codes[byte_offset + batch_idx]]; + } + } +} + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +std::pair process_code_compression_impl( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + uint8_t* compressed_codes = compressed_codes_begin; + size_t num_active = 0; + + // An important optimization is to skip the compression if all points + // are active, as we can just use the compressed_codes_begin pointer. + if (next_num_active < max_batch_size) { + // Compress the codes: here we don't need to process remainders + // as long as `max_batch_size` is a multiple of 64 (which we + // assert in the constructor). Conveniently, compressed_codes is + // allocated to `max_batch_size` * `level_width_bytes` elements. + // `num_active` is guaranteed to always be less than or equal to + // `max_batch_size`. Only the last batch may be smaller than + // `max_batch_size`, the caller ensures that the batch and + // bitset are padded with zeros. + compressed_codes = compressed_codes_begin; + for (size_t point_idx = 0; point_idx < max_batch_size; + point_idx += 64) { + // Build a 64-bit mask from the byteset: each byte is + // 0 or 1, collect into a single bitmask. + uint64_t mask = 0; +#ifdef __BMI2__ + for (int g = 0; g < 8; g++) { + uint64_t bytes; + memcpy(&bytes, bitset + point_idx + g * 8, 8); + uint8_t bits = (uint8_t)_pext_u64(bytes, 0x0101010101010101ULL); + mask |= ((uint64_t)bits << (g * 8)); + } +#else + for (int b = 0; b < 64; b++) { + if (bitset[point_idx + b]) + mask |= (1ULL << b); + } +#endif + + // Byte-level stream compaction. +#ifdef __BMI2__ + // PEXT/PDEP path: process 8 bytes at a time. PDEP + // expands the per-byte mask bits into a per-byte lane + // mask, then PEXT extracts only the selected bytes. 
+ for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; + int write_pos = 0; + for (int g = 0; g < 8; g++) { + uint64_t src_val; + memcpy(&src_val, src + g * 8, 8); + uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF); + uint64_t byte_mask = + _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF; + uint64_t compressed_val = _pext_u64(src_val, byte_mask); + int count = __builtin_popcount(submask); + memcpy(dst + write_pos, &compressed_val, 8); + write_pos += count; + } + } +#else + // Scalar fallback: scan set bits one by one and copy + // the corresponding code byte. + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; + int write_pos = 0; + uint64_t m = mask; + while (m) { + int bit = __builtin_ctzll(m); + dst[write_pos++] = src[bit]; + m &= m - 1; + } + } +#endif + + num_active += __builtin_popcountll(mask); + } + } else { + num_active = next_num_active; + compressed_codes = const_cast(codes); + } + + return std::make_pair(compressed_codes, num_active); +} + +} // namespace panorama_kernels +} // namespace faiss + +#endif // COMPILE_SIMD_AVX2 diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp new file mode 100644 index 0000000000..811fb34579 --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -0,0 +1,250 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifdef COMPILE_SIMD_AVX512 + +#include + +#include + +#include + +namespace faiss { +namespace panorama_kernels { + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +void process_level_impl( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + size_t byte_idx = 0; + + // Process 4 chunks at a time to amortize loop overhead and keep + // the accumulator in registers across chunks. + for (; byte_idx + 3 < level_width_bytes; byte_idx += 4) { + size_t byte_offset0 = (byte_idx + 0) * max_batch_size; + size_t byte_offset1 = (byte_idx + 1) * max_batch_size; + size_t byte_offset2 = (byte_idx + 2) * max_batch_size; + size_t byte_offset3 = (byte_idx + 3) * max_batch_size; + + float* sim_table0 = sim_table + (byte_idx + 0) * 256; + float* sim_table1 = sim_table + (byte_idx + 1) * 256; + float* sim_table2 = sim_table + (byte_idx + 2) * 256; + float* sim_table3 = sim_table + (byte_idx + 3) * 256; + + size_t batch_idx = 0; + for (; batch_idx + 15 < num_active; batch_idx += 16) { + __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); + + __m128i comp0 = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset0 + batch_idx)); + __m128i comp1 = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset1 + batch_idx)); + __m128i comp2 = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset2 + batch_idx)); + __m128i comp3 = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset3 + batch_idx)); + + __m512i codes0 = _mm512_cvtepu8_epi32(comp0); + __m512i codes1 = _mm512_cvtepu8_epi32(comp1); + __m512i codes2 = _mm512_cvtepu8_epi32(comp2); + __m512i codes3 = _mm512_cvtepu8_epi32(comp3); + + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes0, sim_table0, sizeof(float))); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes1, sim_table1, sizeof(float))); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes2, 
sim_table2, sizeof(float))); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes3, sim_table3, sizeof(float))); + + _mm512_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; batch_idx < num_active; batch_idx++) { + float acc = exact_distances[batch_idx]; + acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]]; + acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]]; + acc += sim_table2[compressed_codes[byte_offset2 + batch_idx]]; + acc += sim_table3[compressed_codes[byte_offset3 + batch_idx]]; + exact_distances[batch_idx] = acc; + } + } + + for (; byte_idx < level_width_bytes; byte_idx++) { + size_t byte_offset = byte_idx * max_batch_size; + float* sim_table_ptr = sim_table + byte_idx * 256; + + size_t batch_idx = 0; + for (; batch_idx + 15 < num_active; batch_idx += 16) { + __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); + __m128i comp = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset + batch_idx)); + __m512i codes = _mm512_cvtepu8_epi32(comp); + __m512 m_dist = + _mm512_i32gather_ps(codes, sim_table_ptr, sizeof(float)); + acc = _mm512_add_ps(acc, m_dist); + _mm512_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; batch_idx < num_active; batch_idx++) { + exact_distances[batch_idx] += + sim_table_ptr[compressed_codes[byte_offset + batch_idx]]; + } + } +} + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +std::pair process_code_compression_impl( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + uint8_t* compressed_codes = compressed_codes_begin; + size_t num_active = 0; + + // An important optimization is to skip the compression if all points + // are active, as we can just use the compressed_codes_begin pointer. 
+ if (next_num_active < max_batch_size) { + // Compress the codes: here we don't need to process remainders + // as long as `max_batch_size` is a multiple of 64 (which we + // assert in the constructor). Conveniently, compressed_codes is + // allocated to `max_batch_size` * `level_width_bytes` elements. + // `num_active` is guaranteed to always be less than or equal to + // `max_batch_size`. Only the last batch may be smaller than + // `max_batch_size`, the caller ensures that the batch and + // bitset are padded with zeros. + compressed_codes = compressed_codes_begin; + for (size_t point_idx = 0; point_idx < max_batch_size; + point_idx += 64) { + // Build a 64-bit mask from the byteset: each byte is + // 0 or 1, collect into a single bitmask. + uint64_t mask = 0; +#ifdef __BMI2__ + // PEXT path: extract the LSB of each byte into a + // single bit, producing a 64-bit bitmask. + for (int g = 0; g < 8; g++) { + uint64_t bytes; + memcpy(&bytes, bitset + point_idx + g * 8, 8); + uint8_t bits = (uint8_t)_pext_u64(bytes, 0x0101010101010101ULL); + mask |= ((uint64_t)bits << (g * 8)); + } +#else + for (int b = 0; b < 64; b++) { + if (bitset[point_idx + b]) + mask |= (1ULL << b); + } +#endif + + // Byte-level stream compaction (replaces + // _mm512_maskz_compress_epi8 which requires VBMI2). +#ifdef __BMI2__ + // PEXT/PDEP path: process 8 bytes at a time. PDEP + // expands the per-byte mask bits into a per-byte lane + // mask, then PEXT extracts only the selected bytes. 
+ for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; + int write_pos = 0; + for (int g = 0; g < 8; g++) { + uint64_t src_val; + memcpy(&src_val, src + g * 8, 8); + uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF); + uint64_t byte_mask = + _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF; + uint64_t compressed_val = _pext_u64(src_val, byte_mask); + int count = __builtin_popcount(submask); + memcpy(dst + write_pos, &compressed_val, 8); + write_pos += count; + } + } +#else + // Scalar fallback: scan set bits one by one and copy + // the corresponding code byte. + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; + int write_pos = 0; + uint64_t m = mask; + while (m) { + int bit = __builtin_ctzll(m); + dst[write_pos++] = src[bit]; + m &= m - 1; + } + } +#endif + + num_active += __builtin_popcountll(mask); + } + } else { + num_active = next_num_active; + compressed_codes = const_cast(codes); + } + + return std::make_pair(compressed_codes, num_active); +} + +#ifdef COMPILE_SIMD_AVX512_SPR +// AVX512_SPR: Sapphire Rapids is a superset of AVX512. Reuse the +// AVX512 implementation until a dedicated SPR specialization is written. 
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+void process_level_impl<SIMDLevel::AVX512_SPR>(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances) {
+    process_level_impl<SIMDLevel::AVX512>(
+            level_width_bytes,
+            max_batch_size,
+            num_active,
+            sim_table,
+            compressed_codes,
+            exact_distances);
+}
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+std::pair<uint8_t*, size_t> process_code_compression_impl<
+        SIMDLevel::AVX512_SPR>(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    return process_code_compression_impl<SIMDLevel::AVX512>(
+            next_num_active,
+            max_batch_size,
+            level_width_bytes,
+            compressed_codes_begin,
+            bitset,
+            codes);
+}
+#endif // COMPILE_SIMD_AVX512_SPR
+
+} // namespace panorama_kernels
+} // namespace faiss
+
+#endif // COMPILE_SIMD_AVX512
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
new file mode 100644
index 0000000000..9601cec46d
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// This TU provides:
+// 1. _impl specializations for NONE, using scalar code.
+// 2. Non-templated Panorama kernel dispatch wrappers
+//    (process_level, process_filtering, process_code_compression) declared
+//    in panorama_kernels.h. These use DISPATCH_SIMDLevel to route to the
+//    best available SIMD implementation via the _impl function template
+//    specializations defined in the per-SIMD .cpp files.
+ +#include + +#include + +#ifdef __BMI2__ +#include +#endif + +namespace faiss { +namespace panorama_kernels { + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +void process_level_impl( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + for (size_t byte_idx = 0; byte_idx < level_width_bytes; byte_idx++) { + size_t byte_offset = byte_idx * max_batch_size; + float* chunk_sim = sim_table + byte_idx * 256; + for (size_t i = 0; i < num_active; i++) { + exact_distances[i] += chunk_sim[compressed_codes[byte_offset + i]]; + } + } +} + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +std::pair process_code_compression_impl( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + uint8_t* compressed_codes = compressed_codes_begin; + size_t num_active = 0; + + // An important optimization is to skip the compression if all points + // are active, as we can just use the compressed_codes_begin pointer. + if (next_num_active < max_batch_size) { + compressed_codes = compressed_codes_begin; + for (size_t point_idx = 0; point_idx < max_batch_size; + point_idx += 64) { + // Build a 64-bit mask from the byteset: each byte is + // 0 or 1, collect into a single bitmask. + uint64_t mask = 0; +#ifdef __BMI2__ + for (int g = 0; g < 8; g++) { + uint64_t bytes; + memcpy(&bytes, bitset + point_idx + g * 8, 8); + uint8_t bits = (uint8_t)_pext_u64(bytes, 0x0101010101010101ULL); + mask |= ((uint64_t)bits << (g * 8)); + } +#else + for (int b = 0; b < 64; b++) { + if (bitset[point_idx + b]) + mask |= (1ULL << b); + } +#endif + + // Byte-level stream compaction. +#ifdef __BMI2__ + // PEXT/PDEP path: process 8 bytes at a time. 
PDEP + // expands the per-byte mask bits into a per-byte lane + // mask, then PEXT extracts only the selected bytes. + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; + int write_pos = 0; + for (int g = 0; g < 8; g++) { + uint64_t src_val; + memcpy(&src_val, src + g * 8, 8); + uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF); + uint64_t byte_mask = + _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF; + uint64_t compressed_val = _pext_u64(src_val, byte_mask); + int count = __builtin_popcount(submask); + memcpy(dst + write_pos, &compressed_val, 8); + write_pos += count; + } + } +#else + // Scalar fallback: scan set bits one by one and copy + // the corresponding code byte. + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; + int write_pos = 0; + uint64_t m = mask; + while (m) { + int bit = __builtin_ctzll(m); + dst[write_pos++] = src[bit]; + m &= m - 1; + } + } +#endif + + num_active += __builtin_popcountll(mask); + } + } else { + num_active = next_num_active; + compressed_codes = const_cast(codes); + } + + return std::make_pair(compressed_codes, num_active); +} + +void process_level( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + DISPATCH_SIMDLevel( + process_level_impl, + level_width_bytes, + max_batch_size, + num_active, + sim_table, + compressed_codes, + exact_distances); +} + +size_t process_filtering( + size_t num_active, + float* exact_distances, + uint32_t* active_indices, + float* cum_sums, + uint8_t* bitset, + size_t batch_offset, + float dis0, + float query_cum_norm, + float heap_max) { + size_t next_num_active = 0; + for 
(size_t i = 0; i < num_active; i++) { + float exact_distance = exact_distances[i]; + float cum_sum = cum_sums[active_indices[i] - batch_offset]; + float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm; + + bool keep = heap_max > lower_bound; + active_indices[next_num_active] = active_indices[i]; + exact_distances[next_num_active] = exact_distance; + bitset[active_indices[i] - batch_offset] = keep; + next_num_active += keep; + } + return next_num_active; +} + +std::pair process_code_compression( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + DISPATCH_SIMDLevel( + process_code_compression_impl, + next_num_active, + max_batch_size, + level_width_bytes, + compressed_codes_begin, + bitset, + codes); +} + +} // namespace panorama_kernels +} // namespace faiss diff --git a/faiss/impl/panorama_kernels/panorama_kernels-inl.h b/faiss/impl/panorama_kernels/panorama_kernels-inl.h new file mode 100644 index 0000000000..b1ae4316db --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels-inl.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +/** + * @file panorama_kernels-inl.h + * @brief Private header for Panorama kernel SIMD implementations. + * + * This is a PRIVATE header — do not include in public APIs or user code. + * Only faiss internal .cpp files (the per-SIMD implementation files and + * panorama_kernels-generic.cpp) should include this header. + * + * This header re-exports the public API (panorama_kernels.h) plus the + * simd_dispatch.h machinery needed by the implementation files. 
+ */
+
+#include <faiss/impl/panorama_kernels/panorama_kernels.h>
+#include <faiss/impl/simd_dispatch.h>
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-neon.cpp b/faiss/impl/panorama_kernels/panorama_kernels-neon.cpp
new file mode 100644
index 0000000000..88eba0b574
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels-neon.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// ARM NEON implementations of Panorama kernels.
+// TODO(@AlSchlo, @aknayar): implement NEON-optimized panorama kernels.
+// Currently delegates to the scalar (NONE) implementation.
+
+#ifdef COMPILE_SIMD_ARM_NEON
+
+#include <faiss/impl/panorama_kernels/panorama_kernels-inl.h>
+
+namespace faiss {
+namespace panorama_kernels {
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+void process_level_impl<SIMDLevel::ARM_NEON>(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances) {
+    process_level_impl<SIMDLevel::NONE>(
+            level_width_bytes,
+            max_batch_size,
+            num_active,
+            sim_table,
+            compressed_codes,
+            exact_distances);
+}
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+std::pair<uint8_t*, size_t> process_code_compression_impl<SIMDLevel::ARM_NEON>(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    return process_code_compression_impl<SIMDLevel::NONE>(
+            next_num_active,
+            max_batch_size,
+            level_width_bytes,
+            compressed_codes_begin,
+            bitset,
+            codes);
+}
+
+} // namespace panorama_kernels
+} // namespace faiss
+
+#endif // COMPILE_SIMD_ARM_NEON
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-sve.cpp b/faiss/impl/panorama_kernels/panorama_kernels-sve.cpp
new file mode 100644
index 0000000000..af89f6aac9
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels-sve.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// ARM SVE implementations of Panorama kernels.
+// TODO(@AlSchlo, @aknayar): implement SVE-optimized panorama kernels.
+// Currently delegates to the scalar (NONE) implementation.
+
+#ifdef COMPILE_SIMD_ARM_SVE
+
+#include <faiss/impl/panorama_kernels/panorama_kernels-inl.h>
+
+namespace faiss {
+namespace panorama_kernels {
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+void process_level_impl<SIMDLevel::ARM_SVE>(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances) {
+    process_level_impl<SIMDLevel::NONE>(
+            level_width_bytes,
+            max_batch_size,
+            num_active,
+            sim_table,
+            compressed_codes,
+            exact_distances);
+}
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+std::pair<uint8_t*, size_t> process_code_compression_impl<SIMDLevel::ARM_SVE>(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    return process_code_compression_impl<SIMDLevel::NONE>(
+            next_num_active,
+            max_batch_size,
+            level_width_bytes,
+            compressed_codes_begin,
+            bitset,
+            codes);
+}
+
+} // namespace panorama_kernels
+} // namespace faiss
+
+#endif // COMPILE_SIMD_ARM_SVE
diff --git a/faiss/impl/panorama_kernels/panorama_kernels.h b/faiss/impl/panorama_kernels/panorama_kernels.h
new file mode 100644
index 0000000000..3a67415523
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+/**
+ * @file panorama_kernels.h
+ * @brief Panorama search kernels with SIMD-dispatched implementations.
+ *
+ * The three core kernels of the Panorama progressive filtering search:
+ * - process_level: accumulate PQ distance table lookups over chunks
+ * - process_filtering: Cauchy-Schwarz lower bound pruning with stream
+ *   compaction
+ * - process_code_compression: byte-level stream compaction of PQ codes
+ */
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+
+#include <faiss/impl/platform_macros.h>
+#include <faiss/impl/simd_dispatch.h>
+
+namespace faiss {
+namespace panorama_kernels {
+
+template <SIMDLevel level>
+void process_level_impl(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances);
+
+template <SIMDLevel level>
+std::pair<uint8_t*, size_t> process_code_compression_impl(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes);
+
+/// Accumulate PQ distance table lookups over chunks.
+///
+/// For each chunk, looks up `sim_table[compressed_codes[i]]` and
+/// accumulates into `exact_distances[i]` for all active elements.
+/// Iterates chunks first to keep the LUT slice in L1 cache.
+/// The AVX2/AVX-512 versions unroll 4 chunks at a time.
+FAISS_API void process_level(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances);
+
+/// Filter active elements using Cauchy-Schwarz lower bound pruning.
+///
+/// Computes a lower bound on the true distance for each active element
+/// and removes elements that cannot improve the current heap top.
+/// Uses stream compaction to pack surviving elements contiguously.
+/// Updates the bitset to reflect which elements were removed.
+FAISS_API size_t process_filtering(
+        size_t num_active,
+        float* exact_distances,
+        uint32_t* active_indices,
+        float* cum_sums,
+        uint8_t* bitset,
+        size_t batch_offset,
+        float dis0,
+        float query_cum_norm,
+        float heap_max);
+
+/// Byte-level stream compaction of PQ codes using the active bitset.
+/// +/// An important optimization is to skip the compression if all points +/// are active, as we can just use the original codes pointer. +/// +/// Compress the codes: here we don't need to process remainders +/// as long as `max_batch_size` is a multiple of 64 (which we +/// assert in the constructor). Conveniently, compressed_codes is +/// allocated to `max_batch_size` * `level_width_bytes` elements. +/// `num_active` is guaranteed to always be less than or equal to +/// `max_batch_size`. Only the last batch may be smaller than +/// `max_batch_size`, the caller ensures that the batch and +/// bitset are padded with zeros. +FAISS_API std::pair process_code_compression( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes); + +} // namespace panorama_kernels +} // namespace faiss diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp index 1e66227aea..8acbd02003 100644 --- a/faiss/index_factory.cpp +++ b/faiss/index_factory.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -355,6 +356,12 @@ IndexIVF* parse_IndexIVF( /*by_residual=*/true, own_il); } + if (match("PQ([0-9]+)(x[0-9]+)?Panorama([0-9]+)?")) { + int M = mres_to_int(sm[1]), nbit = mres_to_int(sm[2], 8, 1); + int nlevels = mres_to_int(sm[3], 8); + return new IndexIVFPQPanorama( + get_q(), d, nlist, M, nbit, nlevels, 128, mt, own_il); + } if (match("PQ([0-9]+)(x[0-9]+)?(np)?")) { int M = mres_to_int(sm[1]), nbit = mres_to_int(sm[2], 8, 1); IndexIVFPQ* index_ivf = diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp index 6f8e88ef35..63a4d3383c 100644 --- a/faiss/invlists/InvertedLists.cpp +++ b/faiss/invlists/InvertedLists.cpp @@ -11,6 +11,7 @@ #include #include +#include #include namespace faiss { @@ -354,22 +355,15 @@ ArrayInvertedLists::~ArrayInvertedLists() {} ArrayInvertedListsPanorama::ArrayInvertedListsPanorama( size_t 
nlist_in, size_t code_size_in, - size_t n_levels_in) - : ArrayInvertedLists(nlist_in, code_size_in), - n_levels(n_levels_in), - level_width( - (((code_size_in / sizeof(float)) + n_levels_in - 1) / - n_levels_in) * - sizeof(float)), - pano(code_size_in, n_levels_in, kBatchSize) { - FAISS_THROW_IF_NOT(n_levels_in > 0); - FAISS_THROW_IF_NOT(code_size_in % sizeof(float) == 0); + Panorama* pano_in) + : ArrayInvertedLists(nlist_in, code_size_in), pano(pano_in) { + FAISS_THROW_IF_NOT(pano != nullptr); + FAISS_THROW_IF_NOT(pano_in->n_levels > 0); FAISS_THROW_IF_NOT_MSG( - !use_iterator, - "IndexIVFFlatPanorama does not support iterators, use vanilla IndexIVFFlat instead"); - FAISS_ASSERT(level_width % sizeof(float) == 0); + !use_iterator, "Panorama does not support iterators"); cum_sums.resize(nlist_in); + init_dists.resize(nlist_in); } const float* ArrayInvertedListsPanorama::get_cum_sums(size_t list_no) const { @@ -377,6 +371,11 @@ const float* ArrayInvertedListsPanorama::get_cum_sums(size_t list_no) const { return cum_sums[list_no].data(); } +const float* ArrayInvertedListsPanorama::get_init_dists(size_t list_no) const { + assert(list_no < nlist); + return init_dists[list_no].data(); +} + size_t ArrayInvertedListsPanorama::add_entries( size_t list_no, size_t n_entry, @@ -389,15 +388,21 @@ size_t ArrayInvertedListsPanorama::add_entries( memcpy(&ids[list_no][o], ids_in, sizeof(ids_in[0]) * n_entry); size_t new_size = o + n_entry; - size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize; - codes[list_no].resize(num_batches * kBatchSize * code_size); - cum_sums[list_no].resize(num_batches * kBatchSize * (n_levels + 1)); + size_t bs = pano->batch_size; + size_t num_batches = (new_size + bs - 1) / bs; + size_t padded = num_batches * bs; + codes[list_no].resize(padded * code_size); + cum_sums[list_no].resize(padded * (pano->n_levels + 1)); + + pano->copy_codes_to_level_layout(codes[list_no].data(), o, n_entry, code); + 
pano->compute_cumulative_sums(cum_sums[list_no].data(), o, n_entry, code); - // Cast to float* is safe here as we guarantee codes are always float - // vectors for `IndexIVFFlatPanorama` (verified by the constructor). - const float* vectors = reinterpret_cast(code); - pano.copy_codes_to_level_layout(codes[list_no].data(), o, n_entry, code); - pano.compute_cumulative_sums(cum_sums[list_no].data(), o, n_entry, vectors); + auto* pano_pq = dynamic_cast(pano.get()); + if (pano_pq) { + init_dists[list_no].resize(padded); + pano_pq->compute_init_distances( + init_dists[list_no].data(), list_no, o, n_entry, code); + } return o; } @@ -413,21 +418,30 @@ void ArrayInvertedListsPanorama::update_entries( memcpy(&ids[list_no][offset], ids_in, sizeof(ids_in[0]) * n_entry); - // Cast to float* is safe here as we guarantee codes are always float - // vectors for `IndexIVFFlatPanorama` (verified by the constructor). - const float* vectors = reinterpret_cast(code); - pano.copy_codes_to_level_layout( + pano->copy_codes_to_level_layout( codes[list_no].data(), offset, n_entry, code); - pano.compute_cumulative_sums( - cum_sums[list_no].data(), offset, n_entry, vectors); + pano->compute_cumulative_sums( + cum_sums[list_no].data(), offset, n_entry, code); + + auto* pano_pq = dynamic_cast(pano.get()); + if (pano_pq) { + pano_pq->compute_init_distances( + init_dists[list_no].data(), list_no, offset, n_entry, code); + } } void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) { ids[list_no].resize(new_size); - size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize; - codes[list_no].resize(num_batches * kBatchSize * code_size); - cum_sums[list_no].resize(num_batches * kBatchSize * (n_levels + 1)); + size_t bs = pano->batch_size; + size_t num_batches = (new_size + bs - 1) / bs; + size_t padded = num_batches * bs; + codes[list_no].resize(padded * code_size); + cum_sums[list_no].resize(padded * (pano->n_levels + 1)); + + if (dynamic_cast(pano.get())) { + 
init_dists[list_no].resize(padded); + } } const uint8_t* ArrayInvertedListsPanorama::get_single_code( @@ -439,7 +453,7 @@ const uint8_t* ArrayInvertedListsPanorama::get_single_code( uint8_t* recons_buffer = new uint8_t[code_size]; float* recons = reinterpret_cast(recons_buffer); - pano.reconstruct(offset, recons, codes[list_no].data()); + pano->reconstruct(offset, recons, codes[list_no].data()); return recons_buffer; } diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h index 743bcad62d..f6ea588efd 100644 --- a/faiss/invlists/InvertedLists.h +++ b/faiss/invlists/InvertedLists.h @@ -15,6 +15,7 @@ * the interface. */ +#include #include #include @@ -277,21 +278,23 @@ struct ArrayInvertedLists : InvertedLists { ~ArrayInvertedLists() override; }; -/// Level-oriented storage as defined in the IVFFlat section of Panorama +/// Level-oriented storage as defined in the Panorama paper /// (https://www.arxiv.org/pdf/2510.00566). +/// Works with both flat codes (PanoramaFlat) and PQ codes (PanoramaPQ) +/// via the virtual Panorama interface. struct ArrayInvertedListsPanorama : ArrayInvertedLists { - static constexpr size_t kBatchSize = 128; std::vector> cum_sums; - const size_t n_levels; - const size_t level_width; // in code units - Panorama pano; + std::vector> init_dists; + std::unique_ptr pano; + /// Takes ownership of the provided Panorama*. 
ArrayInvertedListsPanorama( size_t nlist_in, size_t code_size_in, - size_t n_levels_in); + Panorama* pano_in); const float* get_cum_sums(size_t list_no) const; + const float* get_init_dists(size_t list_no) const; size_t add_entries( size_t list_no, diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py index 05b376efce..1e82912ca4 100644 --- a/faiss/python/__init__.py +++ b/faiss/python/__init__.py @@ -177,6 +177,7 @@ def replacement_function(*args): add_ref_in_constructor(IndexPreTransform, {2: [0, 1], 1: [0]}) add_ref_in_method(IndexPreTransform, 'prepend_transform', 0) add_ref_in_constructor(IndexIVFPQ, 0) +add_ref_in_constructor(IndexIVFPQPanorama, 0) add_ref_in_constructor(IndexIVFPQR, 0) add_ref_in_constructor(IndexIVFPQFastScan, 0) add_ref_in_constructor(IndexIVFResidualQuantizer, 0) diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index af9b87e981..a7ce6a3ae4 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -96,6 +96,7 @@ typedef uint64_t size_t; #include #include #include +#include #include #include #include @@ -541,6 +542,7 @@ void gpu_sync_all_devices() %include %include +%ignore faiss::ArrayInvertedListsPanorama::pano; %include %include %ignore BlockInvertedListsIOHook; @@ -594,6 +596,7 @@ void gpu_sync_all_devices() %ignore faiss::IndexIVFPQ::alloc_type; %include +%include %include %include @@ -779,6 +782,7 @@ void gpu_sync_all_devices() DOWNCAST ( IndexIVFRaBitQ ) DOWNCAST ( IndexIVFRaBitQFastScan ) DOWNCAST ( IndexIVFIndependentQuantizer) + DOWNCAST ( IndexIVFPQPanorama ) DOWNCAST ( IndexIVFPQR ) DOWNCAST ( IndexIVFPQ ) DOWNCAST ( IndexIVFPQFastScan ) diff --git a/tests/test_factory.py b/tests/test_factory.py index 2246eb8c10..922ab14cf0 100644 --- a/tests/test_factory.py +++ b/tests/test_factory.py @@ -70,6 +70,21 @@ def test_factory_6(self): assert index.d == 128 assert index.metric_type == faiss.METRIC_L2 + def test_factory_panorama(self): + index = faiss.index_factory(64, 
"IVF16,PQ16x8Panorama4") + assert isinstance(index, faiss.IndexIVFPQPanorama) + assert index.n_levels == 4 + assert index.pq.M == 16 + + index = faiss.index_factory(64, "IVF16,PQ16Panorama") + assert isinstance(index, faiss.IndexIVFPQPanorama) + assert index.n_levels == 8 # default + + index = faiss.index_factory(64, "PCA64,IVF16,PQ16x8Panorama4") + ivf = faiss.downcast_index(index.index) + assert isinstance(ivf, faiss.IndexIVFPQPanorama) + assert ivf.n_levels == 4 + def test_factory_HNSW(self): index = faiss.index_factory(12, "HNSW32") assert index.storage.sa_code_size() == 12 * 4 diff --git a/tests/test_ivfpq_panorama.py b/tests/test_ivfpq_panorama.py new file mode 100644 index 0000000000..d2e28d78d6 --- /dev/null +++ b/tests/test_ivfpq_panorama.py @@ -0,0 +1,617 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +Comprehensive test suite for IndexIVFPQPanorama. + +Panorama is an adaptation of IndexIVFPQ that uses level-oriented storage +and progressive filtering with Cauchy-Schwarz bounds to achieve significant +speedups when combined with PCA or Cayley transforms, with zero loss in +accuracy. + +Paper: https://www.arxiv.org/pdf/2510.00566 + +Constraints specific to IndexIVFPQPanorama: + - Only L2 metric is supported. + - Only 8-bit PQ codes (nbits == 8). + - M must be divisible by n_levels. + - batch_size must be a multiple of 64. + - use_precomputed_table must be 1. 
+""" + +import unittest + +import faiss +import numpy as np +from faiss.contrib.datasets import SyntheticDataset + + +class TestIndexIVFPQPanorama(unittest.TestCase): + """Test Suite for IndexIVFPQPanorama.""" + + # Helper methods for index creation and data generation + + def generate_data(self, d, nt, nb, nq, seed=42): + ds = SyntheticDataset(d, nt, nb, nq, seed=seed) + return ds.get_train(), ds.get_database(), ds.get_queries() + + def create_ivfpq(self, d, nlist, M, nbits, xt, xb=None, nprobe=None): + """Create and train a standard IndexIVFPQ (L2 only).""" + quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) + index.train(xt) + if xb is not None: + index.add(xb) + if nprobe is not None: + index.nprobe = nprobe + return index + + def create_panorama( + self, d, nlist, M, nbits, n_levels, xt, xb=None, + nprobe=None, batch_size=128, + ): + """Create IndexIVFPQPanorama from a freshly trained IVFPQ. + + Trains a temporary IndexIVFPQ, copies PQ centroids and quantizer + into the Panorama index, then sets up precomputed tables. + """ + quantizer = faiss.IndexFlatL2(d) + trained = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) + trained.train(xt) + + trained.own_fields = False + pano = faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, batch_size, + ) + centroids = faiss.vector_to_array(trained.pq.centroids) + faiss.copy_array_to_vector(centroids, pano.pq.centroids) + pano.is_trained = True + pano.use_precomputed_table = 1 + pano.precompute_table() + + if xb is not None: + pano.add(xb) + if nprobe is not None: + pano.nprobe = nprobe + return pano + + def create_pair( + self, d, nlist, M, nbits, n_levels, xt, xb=None, + nprobe=None, batch_size=128, + ): + """Create an IVFPQ and an IVFPQPanorama sharing the same training. + + Both indexes use the same quantizer centroids and PQ codebook, + so search results should be identical. 
+ """ + quantizer = faiss.IndexFlatL2(d) + trained = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) + trained.train(xt) + + # Build the IVFPQ baseline from the trained state. + ivfpq = faiss.clone_index(trained) + + # Build the Panorama from the same trained state. + trained.own_fields = False + pano = faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, batch_size, + ) + centroids = faiss.vector_to_array(trained.pq.centroids) + faiss.copy_array_to_vector(centroids, pano.pq.centroids) + pano.is_trained = True + pano.use_precomputed_table = 1 + pano.precompute_table() + + if xb is not None: + ivfpq.add(xb) + pano.add(xb) + if nprobe is not None: + ivfpq.nprobe = nprobe + pano.nprobe = nprobe + return ivfpq, pano + + def assert_search_results_equal( + self, + D_regular, + I_regular, + D_panorama, + I_panorama, + rtol=1e-4, + atol=1e-6, + otol=1e-3, + ): + overlap_rate = np.mean(I_regular == I_panorama) + + self.assertGreater( + overlap_rate, + 1 - otol, + f"Overlap rate {overlap_rate:.6f} is not > {1 - otol:.3f}. 
", + ) + np.testing.assert_allclose( + D_regular, + D_panorama, + rtol=rtol, + atol=atol, + err_msg="Distances mismatch", + ) + + # Core functionality tests + + def test_exact_match_with_ivfpq(self): + """Core test: Panorama must return identical results to IndexIVFPQ""" + d, nb, nt, nq = 64, 50000, 60000, 500 + nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 20 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=42) + + for nprobe in [1, 4, 16, 64]: + with self.subTest(nprobe=nprobe): + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_exact_match_with_ivfpq_medium(self): + """Core test: Medium scale version""" + d, nb, nt, nq = 32, 10000, 15000, 200 + nlist, M, nbits, n_levels, k = 32, 8, 8, 4, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=42) + + for nprobe in [1, 4, 8, nlist]: + with self.subTest(nprobe=nprobe): + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + # Parameter variation tests + + def test_different_n_levels(self): + """Test correctness with various n_levels parameter values""" + d, nb, nt, nq = 64, 25000, 40000, 200 + nlist, M, nbits, k = 64, 16, 8, 15 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=456) + + # Train IVFPQ once for the baseline. + ivfpq = self.create_ivfpq(d, nlist, M, nbits, xt, xb, nprobe=16) + D_base, I_base = ivfpq.search(xq, k) + + nt = faiss.omp_get_max_threads() + faiss.omp_set_num_threads(1) + + prev_ratio = float("inf") + # n_levels must divide M=16. 
+ for n_levels in [1, 2, 4, 8, 16]: + with self.subTest(n_levels=n_levels): + faiss.cvar.indexPanorama_stats.reset() + + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=16, + ) + D, I = pano.search(xq, k) + self.assert_search_results_equal(D_base, I_base, D, I) + + ratio = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + self.assertLess(ratio, prev_ratio) + prev_ratio = ratio + + faiss.omp_set_num_threads(nt) + + def test_different_M_and_n_levels(self): + """Test various M / n_levels combinations""" + test_cases = [ + (32, 8, 2), # M=8, n_levels=2, chunk=4 + (64, 16, 4), # M=16, n_levels=4, chunk=4 + (64, 32, 8), # M=32, n_levels=8, chunk=4 + ] + for d, M, n_levels in test_cases: + with self.subTest(d=d, M=M, n_levels=n_levels): + nb, nt, nq, nlist, nbits, k = 10000, 15000, 100, 32, 8, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=789) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=8, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_single_level(self): + """Test edge case with n_levels=1 (no pruning, equivalent to IVFPQ)""" + d, nb, nt, nq = 32, 5000, 7000, 50 + nlist, M, nbits, n_levels, k = 16, 8, 8, 1, 5 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=333) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=4, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_max_levels(self): + """Test edge case with n_levels=M (each level is one subquantizer)""" + d, nb, nt, nq = 64, 5000, 7000, 50 + nlist, M, nbits, n_levels, k = 16, 16, 8, 16, 5 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=444) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, 
xt, xb, nprobe=4, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + # ID selector tests + + def test_id_selector_range(self): + """Test ID filtering with range selector""" + d, nb, nt, nq = 64, 50000, 60000, 300 + nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 20 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=321) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=16, + ) + + params = faiss.SearchParametersIVF() + params.sel = faiss.IDSelectorRange(10000, 30000) + + D_regular, I_regular = ivfpq.search(xq, k, params=params) + D_panorama, I_panorama = pano.search(xq, k, params=params) + + valid = I_panorama[I_panorama >= 0] + self.assertTrue(np.all(valid >= 10000)) + self.assertTrue(np.all(valid < 30000)) + + np.testing.assert_array_equal(I_regular, I_panorama) + np.testing.assert_allclose(D_regular, D_panorama, rtol=1e-4) + + def test_id_selector_batch(self): + """Test ID filtering with batch selector""" + d, nb, nt, nq = 64, 30000, 45000, 200 + nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 20 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=654) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=16, + ) + + allowed_ids = np.array([i * 50 for i in range(500)], dtype=np.int64) + params = faiss.SearchParametersIVF() + params.sel = faiss.IDSelectorBatch(allowed_ids) + + D_regular, I_regular = ivfpq.search(xq, k, params=params) + D_panorama, I_panorama = pano.search(xq, k, params=params) + + allowed_set = set(allowed_ids) | {-1} + for id_val in I_panorama.flatten(): + self.assertIn(int(id_val), allowed_set) + + np.testing.assert_array_equal(I_regular, I_panorama) + np.testing.assert_allclose(D_regular, D_panorama, rtol=1e-4) + + def test_selector_excludes_all(self): + """Test selector that excludes all results""" + d, nb, nt, nq = 32, 3000, 5000, 5 + nlist, M, 
nbits, n_levels, k = 8, 8, 8, 4, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=999) + + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=nlist, + ) + + params = faiss.SearchParametersIVF() + params.sel = faiss.IDSelectorRange(nb + 100, nb + 200) + + D, I = pano.search(xq, k, params=params) + self.assertTrue(np.all(I == -1)) + + # Batch size and edge case tests + + def test_batch_boundaries(self): + """Test correctness at various database sizes relative to batch_size""" + d, nq = 64, 50 + nlist, M, nbits, n_levels, k = 16, 16, 8, 4, 10 + xq = np.random.rand(nq, d).astype("float32") + + batch_size = 128 + test_sizes = [ + batch_size - 1, + batch_size, + batch_size + 1, + batch_size * 2, + batch_size * 3 - 1, + ] + for nb in test_sizes: + with self.subTest(nb=nb): + nt = max(nb, 500) + np.random.seed(987) + xt = np.random.rand(nt, d).astype("float32") + xb = np.random.rand(nb, d).astype("float32") + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, + nprobe=nlist, batch_size=batch_size, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_different_batch_sizes(self): + """Test correctness across different internal batch sizes""" + d, nb, nt, nq = 64, 10000, 15000, 50 + nlist, M, nbits, n_levels, k = 32, 16, 8, 4, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=4242) + + ivfpq = self.create_ivfpq(d, nlist, M, nbits, xt, xb, nprobe=8) + D_base, I_base = ivfpq.search(xq, k) + + for bs in [64, 128, 256, 512, 1024]: + with self.subTest(batch_size=bs): + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, + nprobe=8, batch_size=bs, + ) + D, I = pano.search(xq, k) + self.assert_search_results_equal(D_base, I_base, D, I) + + def test_very_small_dataset(self): + """Test with dataset smaller than batch size""" + test_cases = [10, 50, 100] + + for nb 
in test_cases: + with self.subTest(nb=nb): + d, nlist, M, nbits, n_levels = 32, 4, 4, 8, 2 + nt, nq = max(nb, 1500), 5 + k = min(3, nb) + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=666 + nb) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=nlist, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_single_vector_per_cluster(self): + """Test extreme case where clusters have very few vectors""" + d, nb, nt, nq = 32, 20, 3000, 5 + nlist, M, nbits, n_levels, k = 16, 4, 8, 2, 3 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=1313) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=nlist, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_empty_result_handling(self): + """Test handling of empty search results (shapes only)""" + d, nb, nt, nq = 32, 100, 3000, 10 + nlist, M, nbits, n_levels, k = 8, 4, 8, 2, 10 + xt, xb, _ = self.generate_data(d, nt, nb, nq, seed=111) + xq = np.random.rand(nq, d).astype("float32") + 10.0 + + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=1, + ) + D, I = pano.search(xq, k) + + self.assertEqual(D.shape, (nq, k)) + self.assertEqual(I.shape, (nq, k)) + + # Dynamic operations tests + + def test_incremental_add(self): + """Test adding vectors incrementally in multiple batches""" + d, nt = 64, 20000 + nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 15 + xt = np.random.rand(nt, d).astype("float32") + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, nprobe=16, + ) + + for batch_nb in [5000, 10000, 15000]: + xb_batch = np.random.rand(batch_nb, d).astype("float32") + ivfpq.add(xb_batch) + pano.add(xb_batch) + + nq = 100 + xq = 
np.random.rand(nq, d).astype("float32") + + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_add_search_add_search(self): + """Test interleaved add and search operations""" + d, nt = 32, 500 + nlist, M, nbits, n_levels, k = 8, 8, 8, 4, 5 + np.random.seed(555) + xt = np.random.rand(nt, d).astype("float32") + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, nprobe=4, + ) + + xb1 = np.random.rand(200, d).astype("float32") + ivfpq.add(xb1) + pano.add(xb1) + + xq1 = np.random.rand(10, d).astype("float32") + D_reg_1, I_reg_1 = ivfpq.search(xq1, k) + D_pan_1, I_pan_1 = pano.search(xq1, k) + self.assert_search_results_equal(D_reg_1, I_reg_1, D_pan_1, I_pan_1) + + xb2 = np.random.rand(300, d).astype("float32") + ivfpq.add(xb2) + pano.add(xb2) + + xq2 = np.random.rand(10, d).astype("float32") + D_reg_2, I_reg_2 = ivfpq.search(xq2, k) + D_pan_2, I_pan_2 = pano.search(xq2, k) + self.assert_search_results_equal(D_reg_2, I_reg_2, D_pan_2, I_pan_2) + + # Serialization tests + + def test_serialization(self): + """Test write/read preserves search results""" + d, nb, nt, nq = 64, 10000, 15000, 100 + nlist, M, nbits, n_levels, k = 32, 16, 8, 4, 20 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=2024) + + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=8, + ) + + D_before, I_before = pano.search(xq, k) + pano_after = faiss.deserialize_index(faiss.serialize_index(pano)) + D_after, I_after = pano_after.search(xq, k) + + np.testing.assert_array_equal(I_before, I_after) + np.testing.assert_allclose(D_before, D_after, rtol=1e-5) + + def test_serialization_preserves_params(self): + """Test serialization preserves n_levels and batch_size correctly""" + d, nb, nt, nq = 64, 10000, 15000, 50 + nlist, M, nbits, n_levels, k = 32, 16, 8, 4, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=2025) + 
+ pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=4, + ) + D_before, I_before = pano.search(xq, k) + + pano_after = faiss.deserialize_index( + faiss.serialize_index(pano) + ) + self.assertEqual(pano_after.batch_size, 128) + self.assertEqual(pano_after.n_levels, n_levels) + + D_after, I_after = pano_after.search(xq, k) + np.testing.assert_array_equal(I_before, I_after) + np.testing.assert_allclose(D_before, D_after, rtol=1e-5) + + # Statistics tests + + def test_ratio_dims_scanned(self): + """Test that ratio_dims_scanned is 1.0 at n_levels=1 and strictly + less for higher n_levels. + + Unlike IndexFlatPanorama, PQ quantization error prevents achieving + the ideal 1/n_levels ratio even on synthetic data. We verify that + n_levels=1 gives ratio=1.0 (exhaustive) and that multi-level + pruning is effective (ratio well below 1.0). + """ + d, nb, nt, nq = 64, 25000, 40000, 10 + nlist, M, nbits, k = 32, 16, 8, 1 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=5678) + + nt_threads = faiss.omp_get_max_threads() + faiss.omp_set_num_threads(1) + + faiss.cvar.indexPanorama_stats.reset() + pano_1 = self.create_panorama( + d, nlist, M, nbits, 1, xt, xb, nprobe=8, + ) + pano_1.search(xq, k) + ratio_1 = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + self.assertAlmostEqual(ratio_1, 1.0, places=3) + + faiss.cvar.indexPanorama_stats.reset() + pano_16 = self.create_panorama( + d, nlist, M, nbits, 16, xt, xb, nprobe=8, + ) + pano_16.search(xq, k) + ratio_16 = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + self.assertLess(ratio_16, 0.6) + + faiss.omp_set_num_threads(nt_threads) + + def test_pruning_improves_with_n_levels(self): + """Test that increasing n_levels reduces the fraction scanned""" + d, nb, nt, nq = 64, 25000, 40000, 50 + nlist, M, nbits, k = 32, 16, 8, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=1234) + + nt_threads = faiss.omp_get_max_threads() + faiss.omp_set_num_threads(1) + + prev_ratio = float("inf") + for 
n_levels in [1, 2, 4, 8, 16]: + with self.subTest(n_levels=n_levels): + faiss.cvar.indexPanorama_stats.reset() + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=8, + ) + pano.search(xq, k) + ratio = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + self.assertLessEqual(ratio, prev_ratio) + prev_ratio = ratio + + faiss.omp_set_num_threads(nt_threads) + + # Constraint validation tests + + def test_rejects_non_l2_metric(self): + """Verify that non-L2 metrics are rejected""" + d, nlist, M, nbits, n_levels = 32, 8, 8, 8, 4 + quantizer = faiss.IndexFlatIP(d) + with self.assertRaises(RuntimeError): + faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, 128, + faiss.METRIC_INNER_PRODUCT, + ) + + def test_rejects_invalid_batch_size(self): + """Verify that non-multiple-of-64 batch_size is rejected""" + d, nlist, M, nbits, n_levels = 32, 8, 8, 8, 4 + quantizer = faiss.IndexFlatL2(d) + with self.assertRaises(RuntimeError): + faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, 100, + ) + + def test_rejects_m_not_divisible_by_n_levels(self): + """Verify that M not divisible by n_levels is rejected""" + d, nlist, M, nbits, n_levels = 32, 8, 8, 8, 3 + quantizer = faiss.IndexFlatL2(d) + with self.assertRaises(RuntimeError): + faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, 128, + )