From c132cc13812b6da4d8fa12728b4613453aaf95c4 Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Sun, 15 Mar 2026 06:01:51 +0000 Subject: [PATCH 01/41] Initial commit --- faiss/CMakeLists.txt | 2 + faiss/IndexIVF.h | 38 ++++ faiss/IndexIVFPQ.cpp | 519 ++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 557 insertions(+), 2 deletions(-) diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index aeef0ed6cb..5a6c37ffbd 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -63,6 +63,7 @@ set(FAISS_SRC IndexIVFFlat.cpp IndexIVFFlatPanorama.cpp IndexIVFPQ.cpp + IndexIVFPQPanorama.cpp IndexIVFFastScan.cpp IndexIVFAdditiveQuantizerFastScan.cpp IndexIVFPQFastScan.cpp @@ -184,6 +185,7 @@ set(FAISS_HEADERS IndexIVFFlat.h IndexIVFFlatPanorama.h IndexIVFPQ.h + IndexIVFPQPanorama.h IndexIVFFastScan.h IndexIVFAdditiveQuantizerFastScan.h IndexIVFPQFastScan.h diff --git a/faiss/IndexIVF.h b/faiss/IndexIVF.h index ef744688d6..d66523d245 100644 --- a/faiss/IndexIVF.h +++ b/faiss/IndexIVF.h @@ -19,9 +19,12 @@ #include #include #include +#include namespace faiss { +struct IndexIVFPQPanorama; + /** Encapsulates a quantizer object for the IndexIVF * * The class isolates the fields that are independent of the storage @@ -497,6 +500,15 @@ struct InvertedListScanner { /// following codes come from this inverted list virtual void set_list(idx_t list_no, float coarse_dis); + virtual void set_list_panorama( + idx_t list_no, + float coarse_dis, + float* sim_table, + float* dis0_ptr, + bool update) {} + + virtual void set_sim_table(float* sim_table, float dis0_ptr) {} + /// compute a single query-to-code distance virtual float distance_to_code(const uint8_t* code) const = 0; @@ -553,6 +565,32 @@ struct InvertedListScanner { const idx_t* ids, ResultHandler& handler) const; + virtual size_t process_batch( + const ProductQuantizer& pq, + uint8_t* compressed_codes, + size_t cluster_id, + size_t batch_no, + float coarse_dis_i, + size_t curr_batch_size, + size_t 
max_batch_size, + size_t chunk_size, + float epsilon, + size_t n_levels, + const uint8_t* codes_batch, + float* cums, + float* query_cum_norms, + uint32_t* active_indices, + uint8_t* bitset, + float* exact_distances, + const idx_t* ids, + float* heap_sim, + idx_t* heap_ids, + size_t k, + float* dis0_cache, + float* sim_table_cache) { + return 0; + } + virtual ~InvertedListScanner() {} }; diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp index a909d81db9..9e074d1c29 100644 --- a/faiss/IndexIVFPQ.cpp +++ b/faiss/IndexIVFPQ.cpp @@ -9,13 +9,15 @@ #include +#include +#include #include #include #include +#include #include #include - -#include +#include #include #include @@ -760,6 +762,65 @@ struct QueryTables { return dis0; } + + float precompute_list_tables_L2_panorama(float* sim_table_ptr) { + float dis0 = 0; + + if (use_precomputed_table == 1) { + dis0 = coarse_dis; + + const size_t n = pq.M * pq.ksub; + const float bf = -2.0f; + const float* b = sim_table_2; + float* c = sim_table_ptr; + +#ifdef __AVX512F__ + const size_t n16 = n / 16; + const size_t n_for_masking = n % 16; + + const __m512 bfmm = _mm512_set1_ps(bf); + + size_t idx = 0; + for (idx = 0; idx < n16 * 16; idx += 16) { + const __m512 bx = _mm512_loadu_ps(b + idx); + const __m512 abmul = _mm512_mul_ps(bfmm, bx); + _mm512_storeu_ps(c + idx, abmul); + } + + if (n_for_masking > 0) { + const __mmask16 mask = (1 << n_for_masking) - 1; + const __m512 bx = _mm512_maskz_loadu_ps(mask, b + idx); + const __m512 abmul = _mm512_mul_ps(bfmm, bx); + _mm512_mask_storeu_ps(c + idx, mask, abmul); + } +#else + for (size_t idx = 0; idx < n; idx++) { + c[idx] = bf * b[idx]; + } +#endif + + sim_table = sim_table_ptr; + } else { + FAISS_THROW_MSG( + "Panorama PQ only supports use_precomputed_table == 1"); + } + + return dis0; + } + + float precompute_list_tables_panorama(float* sim_table_ptr) { + float dis0 = 0; + uint64_t t0; + TIC; + if (by_residual) { + if (metric_type == METRIC_INNER_PRODUCT) + dis0 = 
precompute_list_tables_IP(); + else + dis0 = precompute_list_tables_L2_panorama(sim_table_ptr); + } + init_list_cycles += TOC; + return dis0; + } }; template @@ -791,6 +852,39 @@ struct WrappedSearchResult { } }; +template +struct KnnSearchResultsPanorama { + idx_t key; + const idx_t* ids; + const IDSelector* sel; + + size_t k; + float* heap_sim; + idx_t* heap_ids; + + size_t nup; + + inline bool skip_entry(idx_t j) { + return use_sel && !sel->is_member(ids[j]); + } + + inline bool should_keep(float dis) { + return C::cmp(heap_sim[0], dis); + } + + inline float top() { + return heap_sim[0]; + } + + inline void add(idx_t j, float dis) { + if (C::cmp(heap_sim[0], dis)) { + idx_t id = ids ? ids[j] : lo_build(key, j); + heap_replace_top(k, heap_sim, heap_ids, dis, id); + nup++; + } + } +}; + /***************************************************** * Scaning the codes. * The scanning functions call their favorite precompute_* @@ -821,6 +915,26 @@ struct IVFPQScannerT : QueryTables { } } + void init_list_panorama( + idx_t list_no, + float coarse_dis, + int mode, + float* sim_table, + float* dis0_ptr, + bool update) { + this->key = list_no; + this->coarse_dis = coarse_dis; + + if (mode == 2) { + if (update) { + *dis0_ptr = precompute_list_tables_panorama(sim_table); + } + dis0 = *dis0_ptr; + } else if (mode == 1) { + dis0 = precompute_list_table_pointers(); + } + } + /***************************************************** * Scaning the codes: simple PQ scan. 
*****************************************************/ @@ -1207,6 +1321,407 @@ struct IVFPQScanner : IVFPQScannerT, this->init_list(list_no, coarse_dis, precompute_mode); } + void set_list_panorama( + idx_t list_no, + float coarse_dis, + float* sim_table, + float* dis0_ptr, + bool update) override { + this->list_no = list_no; + this->init_list_panorama( + list_no, + coarse_dis, + precompute_mode, + sim_table, + dis0_ptr, + update); + } + + void set_sim_table(float* sim_table, float dis0) override { + this->sim_table = sim_table; + this->dis0 = dis0; + } + +#ifdef __AVX512F__ + inline void process_chunks( + size_t chunk_size, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + size_t chunk_idx = 0; + for (; chunk_idx + 3 < chunk_size; chunk_idx += 4) { + size_t chunk_offset0 = (chunk_idx + 0) * max_batch_size; + size_t chunk_offset1 = (chunk_idx + 1) * max_batch_size; + size_t chunk_offset2 = (chunk_idx + 2) * max_batch_size; + size_t chunk_offset3 = (chunk_idx + 3) * max_batch_size; + + float* sim_table0 = sim_table + (chunk_idx + 0) * 256; + float* sim_table1 = sim_table + (chunk_idx + 1) * 256; + float* sim_table2 = sim_table + (chunk_idx + 2) * 256; + float* sim_table3 = sim_table + (chunk_idx + 3) * 256; + + size_t batch_idx = 0; + for (; batch_idx + 15 < num_active; batch_idx += 16) { + __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); + + __m128i comp0 = _mm_loadu_si128( + (__m128i*)(compressed_codes + chunk_offset0 + batch_idx)); + __m512i codes0 = _mm512_cvtepu8_epi32(comp0); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes0, sim_table0, sizeof(float))); + + __m128i comp1 = _mm_loadu_si128( + (__m128i*)(compressed_codes + chunk_offset1 + batch_idx)); + __m512i codes1 = _mm512_cvtepu8_epi32(comp1); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes1, sim_table1, sizeof(float))); + + __m128i comp2 = _mm_loadu_si128( + (__m128i*)(compressed_codes + chunk_offset2 + 
batch_idx)); + __m512i codes2 = _mm512_cvtepu8_epi32(comp2); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes2, sim_table2, sizeof(float))); + + __m128i comp3 = _mm_loadu_si128( + (__m128i*)(compressed_codes + chunk_offset3 + batch_idx)); + __m512i codes3 = _mm512_cvtepu8_epi32(comp3); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes3, sim_table3, sizeof(float))); + + _mm512_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; batch_idx < num_active; batch_idx += 1) { + float acc = exact_distances[batch_idx]; + acc += sim_table0[compressed_codes[chunk_offset0 + batch_idx]]; + acc += sim_table1[compressed_codes[chunk_offset1 + batch_idx]]; + acc += sim_table2[compressed_codes[chunk_offset2 + batch_idx]]; + acc += sim_table3[compressed_codes[chunk_offset3 + batch_idx]]; + exact_distances[batch_idx] = acc; + } + } + + for (; chunk_idx < chunk_size; chunk_idx++) { + size_t chunk_offset = chunk_idx * max_batch_size; + float* sim_table_ptr = sim_table + chunk_idx * 256; + + size_t batch_idx = 0; + for (; batch_idx + 15 < num_active; batch_idx += 16) { + __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); + __m128i comp = _mm_loadu_si128( + (__m128i*)(compressed_codes + chunk_offset + batch_idx)); + __m512i codes = _mm512_cvtepu8_epi32(comp); + __m512 m_dist = _mm512_i32gather_ps( + codes, sim_table_ptr, sizeof(float)); + acc = _mm512_add_ps(acc, m_dist); + _mm512_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; batch_idx < num_active; batch_idx += 1) { + exact_distances[batch_idx] += sim_table_ptr + [compressed_codes[chunk_offset + batch_idx]]; + } + } + } + + inline size_t process_filtering( + size_t num_active, + float* exact_distances, + uint32_t* active_indices, + __m512i batch_offset_broadcast, + float* cum_sums, + __m512 dis0_broadcast, + __m512 query_cum_norm_broadcast, + __m512 epsilon_broadcast, + __m512 heap_max_broadcast, + uint8_t* bitset, + size_t batch_offset, + float dis0, + float query_cum_norm, + float epsilon, 
+ float heap_max) { + size_t next_num_active = 0; + size_t batch_idx = 0; + + for (; batch_idx + 15 < num_active; batch_idx += 16) { + __m512 exact_distances_batch = + _mm512_loadu_ps(exact_distances + batch_idx); + + __m512i active_indices_batch = + _mm512_loadu_si512(active_indices + batch_idx); + __m512i offsetted_active_indices_batch = _mm512_sub_epi32( + active_indices_batch, batch_offset_broadcast); + __m512 cum_sums_batch = _mm512_i32gather_ps( + offsetted_active_indices_batch, cum_sums, sizeof(float)); + + __m512 exact_distances_batch_dis0 = + _mm512_add_ps(exact_distances_batch, dis0_broadcast); + __m512 cauchy_schwarz_bound = + _mm512_mul_ps(query_cum_norm_broadcast, cum_sums_batch); + cauchy_schwarz_bound = + _mm512_mul_ps(cauchy_schwarz_bound, epsilon_broadcast); + + __m512 lower_bound = _mm512_sub_ps( + exact_distances_batch_dis0, cauchy_schwarz_bound); + __mmask16 mask_should_keep = _mm512_cmp_ps_mask( + lower_bound, heap_max_broadcast, _CMP_LT_OQ); + + __m512i compressed_active_indices_vec = _mm512_mask_compress_epi32( + _mm512_setzero_si512(), + mask_should_keep, + active_indices_batch); + _mm512_storeu_si512( + active_indices + next_num_active, + compressed_active_indices_vec); + + __m512 compressed_exact_distances_vec = _mm512_mask_compress_ps( + _mm512_setzero_ps(), + mask_should_keep, + exact_distances_batch); + _mm512_storeu_ps( + exact_distances + next_num_active, + compressed_exact_distances_vec); + + alignas(64) uint32_t indices_to_remove[16]; + __mmask16 mask_should_remove = ~mask_should_keep; + size_t num_to_remove = _mm_popcnt_u32(mask_should_remove); + + __m512i compressed_indices_to_remove_vec = + _mm512_mask_compress_epi32( + _mm512_setzero_si512(), + mask_should_remove, + active_indices_batch); + _mm512_storeu_si512( + indices_to_remove, compressed_indices_to_remove_vec); + + for (size_t idx = 0; idx < num_to_remove; idx++) { + bitset[indices_to_remove[idx] - batch_offset] = 0; + } + + next_num_active += 
_mm_popcnt_u32(mask_should_keep); + } + + for (; batch_idx < num_active; batch_idx++) { + float exact_distance = exact_distances[batch_idx]; + + float cum_sum = cum_sums[active_indices[batch_idx] - batch_offset]; + float cauchy_schwarz_bound = cum_sum * query_cum_norm; + float lower_bound = + exact_distance - cauchy_schwarz_bound * epsilon + dis0; + + uint32_t should_keep = heap_max > lower_bound; + active_indices[next_num_active] = active_indices[batch_idx]; + exact_distances[next_num_active] = exact_distance; + + bitset[active_indices[batch_idx] - batch_offset] = should_keep; + + next_num_active += should_keep; + } + + return next_num_active; + } + + inline std::pair process_code_compression( + size_t level, + size_t next_num_active, + size_t max_batch_size, + size_t chunk_size, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + uint8_t* compressed_codes = compressed_codes_begin; + size_t num_active = 0; + + if (next_num_active < max_batch_size) { + compressed_codes = compressed_codes_begin; + for (size_t point_idx = 0; point_idx < max_batch_size; + point_idx += 64) { + __m512i active_byteset = _mm512_loadu_si512(bitset + point_idx); + __mmask64 mask = _mm512_cmpneq_epi8_mask( + active_byteset, _mm512_setzero_si512()); + + for (size_t ci = 0; ci < chunk_size; ci++) { + size_t chunk_offset = ci * max_batch_size; + size_t write_pos = 0; + uint64_t m = (uint64_t)mask; + while (m) { + int bit = __builtin_ctzll(m); + compressed_codes[chunk_offset + num_active + write_pos] = + codes[chunk_offset + point_idx + bit]; + write_pos++; + m &= m - 1; + } + } + + num_active += _mm_popcnt_u64(mask); + } + } else { + num_active = next_num_active; + compressed_codes = const_cast(codes); + } + + return std::make_pair(compressed_codes, num_active); + } +#endif // __AVX512F__ + + inline void process_chunks_sparse( + size_t chunk_size, + size_t max_batch_size, + size_t num_active, + float* sim_table, + const uint8_t* codes, + float* exact_distances, + 
uint32_t* active_indices, + size_t batch_offset, + size_t ksub) { + for (size_t ci = 0; ci < chunk_size; ci++) { + size_t chunk_offset = ci * max_batch_size; + float* chunk_sim_table = sim_table + ci * ksub; + + for (size_t batch_idx = 0; batch_idx < num_active; batch_idx++) { + size_t real_idx = active_indices[batch_idx] - batch_offset; + uint8_t code = codes[chunk_offset + real_idx]; + exact_distances[batch_idx] += chunk_sim_table[code]; + } + } + } + +#ifdef __AVX512F__ + size_t process_batch( + const ProductQuantizer& pq, + uint8_t* compressed_codes, + size_t cluster_id, + size_t batch_no, + float coarse_dis_i, + size_t curr_batch_size, + size_t max_batch_size, + size_t chunk_size, + float epsilon, + size_t n_levels, + const uint8_t* codes_batch, + float* cums, + float* query_cum_norms, + uint32_t* active_indices, + uint8_t* bitset, + float* exact_distances, + const idx_t* ids, + float* heap_sim, + idx_t* heap_ids, + size_t k, + float* dis0_cache, + float* sim_table_cache) override { + KnnSearchResultsPanorama res = { + this->key, + this->store_pairs ? 
nullptr : ids, + this->sel, + k, + heap_sim, + heap_ids, + 0}; + uint8_t* compressed_codes_begin = compressed_codes; + size_t total_active = 0; + __m512 epsilon_broadcast = _mm512_set1_ps(epsilon); + + size_t next_num_active = curr_batch_size; + float dis0 = 0; + size_t batch_offset = batch_no * max_batch_size; + __m512i batch_offset_broadcast = _mm512_set1_epi32(batch_offset); + for (size_t level = 0; (level < n_levels) && (next_num_active > 0); + level++) { + total_active += next_num_active; + + size_t level_offset_sim_table = level * pq.ksub * chunk_size; + this->set_list_panorama( + cluster_id, + coarse_dis_i, + sim_table_cache + level_offset_sim_table, + dis0_cache, + level == 0 && batch_no == 0); + this->set_sim_table( + sim_table_cache + level_offset_sim_table, *dis0_cache); + + dis0 = this->dis0; + __m512 dis0_bcast = _mm512_set1_ps(dis0); + + float query_cum_norm = 2 * query_cum_norms[level + 1]; + __m512 query_cum_norm_broadcast = _mm512_set1_ps(query_cum_norm); + + float heap_max = res.top(); + __m512 heap_max_broadcast = _mm512_set1_ps(heap_max); + + float* cum_sums = cums + curr_batch_size * level; + const uint8_t* codes = + codes_batch + max_batch_size * chunk_size * level; + + bool is_sparse = next_num_active < max_batch_size / 16; + float* sim_table = this->sim_table; + + size_t num_active_for_filtering = 0; + if (is_sparse) { + process_chunks_sparse( + chunk_size, + max_batch_size, + next_num_active, + sim_table, + codes, + exact_distances, + active_indices, + batch_offset, + pq.ksub); + num_active_for_filtering = next_num_active; + } else { + auto [cc, na] = process_code_compression( + level, + next_num_active, + max_batch_size, + chunk_size, + compressed_codes_begin, + bitset, + codes); + + process_chunks( + chunk_size, + max_batch_size, + na, + sim_table, + cc, + exact_distances); + num_active_for_filtering = na; + } + + next_num_active = process_filtering( + num_active_for_filtering, + exact_distances, + active_indices, + 
batch_offset_broadcast, + cum_sums, + dis0_bcast, + query_cum_norm_broadcast, + epsilon_broadcast, + heap_max_broadcast, + bitset, + batch_offset, + dis0, + query_cum_norm, + epsilon, + heap_max); + } + + for (size_t batch_idx = 0; batch_idx < next_num_active; batch_idx++) { + res.add(active_indices[batch_idx], + dis0 + exact_distances[batch_idx]); + } + + return total_active; + } +#endif // __AVX512F__ + float distance_to_code(const uint8_t* code) const override { assert(precompute_mode == 2); float dis = this->dis0 + From bb842d0b2f7f090eaf64152da6739003c3358276 Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Sun, 15 Mar 2026 06:02:15 +0000 Subject: [PATCH 02/41] Initial commit --- faiss/IndexIVFPQPanorama.cpp | 509 +++++++++++++++++++++++++++++++++++ faiss/IndexIVFPQPanorama.h | 70 +++++ 2 files changed, 579 insertions(+) create mode 100644 faiss/IndexIVFPQPanorama.cpp create mode 100644 faiss/IndexIVFPQPanorama.h diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp new file mode 100644 index 0000000000..ba54da4cb4 --- /dev/null +++ b/faiss/IndexIVFPQPanorama.cpp @@ -0,0 +1,509 @@ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +namespace faiss { + +static uint64_t total_active = 0; +static uint64_t total_points = 0; + +IndexIVFPQPanorama::IndexIVFPQPanorama( + Index* quantizer, + size_t d, + size_t nlist, + size_t M, + size_t nbits_per_idx, + int n_levels, + float epsilon, + size_t batch_size, + MetricType metric, + bool own_invlists) + : IndexIVFPQ( + quantizer, + d, + nlist, + M, + nbits_per_idx, + metric, + own_invlists), + n_levels(n_levels), + added(false), + chunk_size(code_size / n_levels), + levels_size(d / n_levels), + nbits_per_idx(nbits_per_idx), + m_level_width(M / n_levels), + epsilon(epsilon), + batch_size(batch_size) { + FAISS_ASSERT(M % n_levels == 0); + FAISS_ASSERT(batch_size % 64 
== 0); + + printf("N levels = %d\n", n_levels); + printf("M = code_size = %zu\n", M); + printf("Nbits per idx = %u (fixed)\n", 8); + printf("Nlist = %zu\n", nlist); + printf("Batch size = %zuB\n", batch_size); + + FAISS_ASSERT(m_level_width > 0); + FAISS_ASSERT(nbits_per_idx == 8); + FAISS_ASSERT(M == code_size); + FAISS_ASSERT(metric == METRIC_L2); +} + +void IndexIVFPQPanorama::add(idx_t n, const float* x) { + FAISS_ASSERT(!added); + added = true; + + num_points = n; + IndexIVFPQ::add(n, x); + + size_t new_n = 0; + column_offsets = new size_t[nlist]; + for (size_t i = 0; i < nlist; i++) { + column_offsets[i] = new_n; + size_t batch_n = (invlists->list_size(i) + batch_size - 1) / batch_size; + size_t rounded_n = batch_n * batch_size; + new_n += rounded_n * code_size; + } + + column_storage = new uint8_t[code_size * new_n]; + + for (size_t list_no = 0; list_no < nlist; list_no++) { + size_t col_offset = column_offsets[list_no]; + size_t list_size = invlists->list_size(list_no); + size_t n_batches = (list_size + batch_size - 1) / batch_size; + for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { + size_t batch_offset = batch_no * batch_size * code_size; + size_t curr_batch_size = + std::min(list_size - batch_no * batch_size, batch_size); + for (size_t m = 0; m < pq.M; m++) { + size_t m_offset = m * batch_size; + for (size_t point_idx = 0; point_idx < batch_size; + point_idx++) { + uint8_t* dest = column_storage + col_offset + batch_offset + + m_offset + point_idx; + const uint8_t* codes = invlists->get_codes(list_no); + + if (point_idx < curr_batch_size) { + const uint8_t* src = codes + batch_offset + + point_idx * code_size + m; + memcpy(dest, src, 1); + } else { + *dest = 0; + } + } + } + } + } + + cum_sums = new float[(n_levels + 1) * n]; + cum_sum_offsets = new size_t[nlist]; + + init_exact_distances = new float[n]; + init_exact_distances_offsets = new size_t[nlist]; + + size_t cum_size = 0; + size_t init_size = 0; + for (size_t list_no = 0; list_no < 
nlist; list_no++) { + cum_sum_offsets[list_no] = cum_size; + cum_size += invlists->list_size(list_no) * (n_levels + 1); + + init_exact_distances_offsets[list_no] = init_size; + init_size += invlists->list_size(list_no); + } + + for (size_t list_no = 0; list_no < nlist; list_no++) { + const idx_t* idx = invlists->get_ids(list_no); + size_t list_size = invlists->list_size(list_no); + + std::vector centroid(d); + quantizer->reconstruct(list_no, centroid.data()); + + size_t n_batches = (list_size + batch_size - 1) / batch_size; + + for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { + size_t b_offset = batch_no * batch_size; + size_t curr_batch_size = + std::min(list_size - batch_no * batch_size, batch_size); + + for (size_t point_idx = 0; point_idx < curr_batch_size; + point_idx++) { + float init_exact_distance = 0.0f; + + std::vector vector(d); + const uint8_t* code = + invlists->get_single_code(list_no, b_offset + point_idx); + pq.decode(code, vector.data()); + + std::vector suffix_sums(d + 1); + suffix_sums[d] = 0.0f; + + for (int j = d - 1; j >= 0; j--) { + init_exact_distance += + vector[j] * vector[j] + 2 * vector[j] * centroid[j]; + float squaredVal = vector[j] * vector[j]; + suffix_sums[j] = suffix_sums[j + 1] + squaredVal; + } + + for (int level = 0; level < n_levels; level++) { + int start_idx = level * levels_size; + size_t offset = cum_sum_offsets[list_no] + + b_offset * (n_levels + 1) + + level * curr_batch_size + point_idx; + if (start_idx < (int)d) { + cum_sums[offset] = sqrt(suffix_sums[start_idx]); + } else { + cum_sums[offset] = 0.0f; + } + } + + size_t offset = cum_sum_offsets[list_no] + + b_offset * (n_levels + 1) + + n_levels * curr_batch_size + point_idx; + cum_sums[offset] = 0.0f; + + size_t init_offset = init_exact_distances_offsets[list_no]; + init_exact_distances[init_offset + b_offset + point_idx] = + init_exact_distance; + } + } + } +} + +void IndexIVFPQPanorama::search( + idx_t n, + const float* x, + idx_t k, + float* distances, 
+ idx_t* labels, + const SearchParameters* params_in) const { + FAISS_THROW_IF_NOT(k > 0); + const IVFSearchParameters* params = nullptr; + if (params_in) { + params = dynamic_cast(params_in); + FAISS_THROW_IF_NOT_MSG(params, "IndexIVF params have incorrect type"); + } + const size_t nprobe = + std::min(nlist, params ? params->nprobe : this->nprobe); + FAISS_THROW_IF_NOT(nprobe > 0); + + auto sub_search_func = [this, k, nprobe, params]( + idx_t n, + const float* x, + float* distances, + idx_t* labels, + IndexIVFStats* ivf_stats) { + std::unique_ptr idx(new idx_t[n * nprobe]); + std::unique_ptr coarse_dis(new float[n * nprobe]); + + quantizer->search( + n, + x, + nprobe, + coarse_dis.get(), + idx.get(), + params ? params->quantizer_params : nullptr); + + invlists->prefetch_lists(idx.get(), n * nprobe); + + search_preassigned( + n, + x, + k, + idx.get(), + coarse_dis.get(), + distances, + labels, + false, + params, + ivf_stats); + }; + + if ((parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT) == 0) { + int nt = std::min(omp_get_max_threads(), int(n)); + std::vector stats(nt); + std::mutex exception_mutex; + std::string exception_string; + +#pragma omp parallel for if (nt > 1) + for (idx_t slice = 0; slice < nt; slice++) { + IndexIVFStats local_stats; + idx_t i0 = n * slice / nt; + idx_t i1 = n * (slice + 1) / nt; + if (i1 > i0) { + try { + sub_search_func( + i1 - i0, + x + i0 * d, + distances + i0 * k, + labels + i0 * k, + &stats[slice]); + } catch (const std::exception& e) { + std::lock_guard lock(exception_mutex); + exception_string = e.what(); + } + } + } + + if (!exception_string.empty()) { + FAISS_THROW_FMT( + "search error: %s", exception_string.c_str()); + } + } else { + sub_search_func(n, x, distances, labels, &indexIVF_stats); + } +} + +void IndexIVFPQPanorama::search_preassigned( + idx_t n, + const float* x, + idx_t k, + const idx_t* keys, + const float* coarse_dis, + float* distances, + idx_t* labels, + bool store_pairs, + const IVFSearchParameters* params, + 
IndexIVFStats* ivf_stats) const { + FAISS_THROW_IF_NOT(k > 0); + + idx_t nprobe = params ? params->nprobe : this->nprobe; + nprobe = std::min((idx_t)nlist, nprobe); + FAISS_THROW_IF_NOT(nprobe > 0); + + const idx_t unlimited_list_size = std::numeric_limits::max(); + idx_t max_codes = params ? params->max_codes : this->max_codes; + IDSelector* sel = params ? params->sel : nullptr; + const IDSelectorRange* selr = dynamic_cast(sel); + if (selr) { + if (selr->assume_sorted) { + sel = nullptr; + } else { + selr = nullptr; + } + } + + FAISS_THROW_IF_NOT_MSG( + !(sel && store_pairs), + "selector and store_pairs cannot be combined"); + + FAISS_THROW_IF_NOT_MSG( + !invlists->use_iterator || (max_codes == 0 && store_pairs == false), + "iterable inverted lists don't support max_codes and store_pairs"); + + size_t nlistv = 0, ndis = 0, nheap = 0; + + using HeapForIP = CMin; + using HeapForL2 = CMax; + + bool interrupt = false; + std::mutex exception_mutex; + std::string exception_string; + + int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT; + bool do_heap_init = !(this->parallel_mode & PARALLEL_MODE_NO_HEAP_INIT); + + FAISS_THROW_IF_NOT_MSG( + max_codes == 0 || pmode == 0 || pmode == 3, + "max_codes supported only for parallel_mode = 0 or 3"); + + if (max_codes == 0) { + max_codes = unlimited_list_size; + } + + [[maybe_unused]] bool do_parallel = omp_get_max_threads() >= 2 && + (pmode == 0 ? false + : pmode == 3 ? n > 1 + : pmode == 1 ? nprobe > 1 + : nprobe * n > 1); + + void* inverted_list_context = + params ? 
params->inverted_list_context : nullptr; + + const size_t sim_table_size = pq.ksub * pq.M; + std::vector sim_table_cache(nprobe * sim_table_size); + std::vector dis0s_cache(nprobe); + + std::vector suffixSums(d + 1); + std::vector query_cum_norms(n_levels + 1); + std::vector query(d); + std::vector exact_distances(batch_size); + std::vector bitset(batch_size); + std::vector active_indices(batch_size); + std::vector compressed_codes(batch_size * chunk_size); + +#pragma omp parallel if (do_parallel) reduction(+ : nlistv, ndis, nheap) + { + std::unique_ptr scanner( + get_InvertedListScanner(store_pairs, sel, params)); + + auto init_result = [&](float* simi, idx_t* idxi) { + if (!do_heap_init) + return; + if (metric_type == METRIC_INNER_PRODUCT) { + heap_heapify(k, simi, idxi); + } else { + heap_heapify(k, simi, idxi); + } + }; + + auto reorder_result = [&](float* simi, idx_t* idxi) { + if (!do_heap_init) + return; + if (metric_type == METRIC_INNER_PRODUCT) { + heap_reorder(k, simi, idxi); + } else { + heap_reorder(k, simi, idxi); + } + }; + + FAISS_ASSERT(pmode == 0); + if (pmode == 0) { +#pragma omp for + for (idx_t i = 0; i < n; i++) { + if (interrupt) { + continue; + } + + scanner->set_query(x + i * d); + suffixSums[d] = 0.0f; + + const float* q = x + i * d; + + for (int j = d - 1; j >= 0; --j) { + float squaredVal = q[j] * q[j]; + suffixSums[j] = suffixSums[j + 1] + squaredVal; + } + + for (int level_idx = 0; level_idx < n_levels; level_idx++) { + int startIdx = level_idx * levels_size; + if (startIdx < (int)d) { + query_cum_norms[level_idx] = sqrt(suffixSums[startIdx]); + } else { + query_cum_norms[level_idx] = 0.0f; + } + } + query_cum_norms[n_levels] = 0.0f; + + float* simi = distances + i * k; + idx_t* idxi = labels + i * k; + + init_result(simi, idxi); + + idx_t nscan = 0; + + for (size_t list_no = 0; list_no < (size_t)nprobe; list_no++) { + idx_t cluster_id = keys[i * nprobe + list_no]; + size_t list_size = invlists->list_size(cluster_id); + size_t n_batches 
= + (list_size + batch_size - 1) / batch_size; + + std::unique_ptr sids; + const idx_t* ids = + std::make_unique( + invlists, cluster_id) + ->get(); + + for (size_t batch_no = 0; batch_no < n_batches; + batch_no++) { + size_t curr_batch_size = std::min( + list_size - batch_no * batch_size, batch_size); + size_t b_offset = batch_no * batch_size; + + std::iota( + active_indices.begin(), + active_indices.begin() + curr_batch_size, + b_offset); + std::fill( + bitset.begin(), + bitset.begin() + curr_batch_size, + 1); + std::fill( + bitset.begin() + curr_batch_size, + bitset.end(), + 0); + std::fill( + compressed_codes.begin(), + compressed_codes.end(), + 0); + + for (size_t idx = 0; idx < curr_batch_size; idx++) { + exact_distances[idx] = init_exact_distances + [init_exact_distances_offsets[cluster_id] + + b_offset + idx]; + } + + const uint8_t* codes = column_storage + + column_offsets[cluster_id] + + b_offset * code_size; + float* cums = cum_sums + cum_sum_offsets[cluster_id] + + b_offset * (n_levels + 1); + + total_points += curr_batch_size * n_levels; + + total_active += scanner->process_batch( + pq, + compressed_codes.data(), + cluster_id, + batch_no, + coarse_dis[i * nprobe + list_no], + curr_batch_size, + batch_size, + chunk_size, + epsilon, + n_levels, + codes, + cums, + query_cum_norms.data(), + active_indices.data(), + bitset.data(), + exact_distances.data(), + ids, + simi, + idxi, + k, + &dis0s_cache[list_no], + sim_table_cache.data() + + list_no * sim_table_size); + } + } + + reorder_result(simi, idxi); + + if (InterruptCallback::is_interrupted()) { + interrupt = true; + } + } + } + } + + if (interrupt) { + if (!exception_string.empty()) { + FAISS_THROW_FMT( + "search interrupted with: %s", exception_string.c_str()); + } else { + FAISS_THROW_MSG("computation interrupted"); + } + } + + printf("total_active: %f\n", (float)total_active / total_points); +} + +} // namespace faiss diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h new file mode 
100644 index 0000000000..46a19e6b09 --- /dev/null +++ b/faiss/IndexIVFPQPanorama.h @@ -0,0 +1,70 @@ +#ifndef FAISS_INDEX_IVFPQ_PANORAMA_H +#define FAISS_INDEX_IVFPQ_PANORAMA_H + +#include + +#include +#include +#include + +namespace faiss { + +struct IndexIVFPQPanorama : public IndexIVFPQ { + const int n_levels; + uint8_t* column_storage; + + size_t* column_offsets; + float* cum_sums; + size_t* cum_sum_offsets; + + float* init_exact_distances; + size_t* init_exact_distances_offsets; + + const size_t chunk_size; + const size_t levels_size; + bool added; + size_t num_points; + size_t batch_size; + size_t nbits_per_idx; + size_t m_level_width; + + float epsilon; + + IndexIVFPQPanorama( + Index* quantizer, + size_t d, + size_t nlist, + size_t M, + size_t nbits_per_idx, + int n_levels, + float epsilon, + size_t batch_size = 128, + MetricType metric = METRIC_L2, + bool own_invlists = true); + + void add(idx_t n, const float* x) override; + + void search( + idx_t n, + const float* x, + idx_t k, + float* distances, + idx_t* labels, + const SearchParameters* params_in) const; + + void search_preassigned( + idx_t n, + const float* x, + idx_t k, + const idx_t* keys, + const float* coarse_dis, + float* distances, + idx_t* labels, + bool store_pairs, + const IVFSearchParameters* params, + IndexIVFStats* ivf_stats) const override; +}; + +} // namespace faiss + +#endif From e37d2d428343e4d0e3b5b723adf4bc3863ffb29c Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Tue, 17 Mar 2026 04:15:37 +0000 Subject: [PATCH 03/41] Checkpoint with comments --- faiss/IndexIVFPQ.cpp | 94 +++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 32 deletions(-) diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp index 9e074d1c29..0db6c0470e 100644 --- a/faiss/IndexIVFPQ.cpp +++ b/faiss/IndexIVFPQ.cpp @@ -774,7 +774,6 @@ struct QueryTables { const float* b = sim_table_2; float* c = sim_table_ptr; -#ifdef __AVX512F__ const size_t n16 = n / 16; const size_t n_for_masking 
= n % 16; @@ -793,11 +792,6 @@ struct QueryTables { const __m512 abmul = _mm512_mul_ps(bfmm, bx); _mm512_mask_storeu_ps(c + idx, mask, abmul); } -#else - for (size_t idx = 0; idx < n; idx++) { - c[idx] = bf * b[idx]; - } -#endif sim_table = sim_table_ptr; } else { @@ -1342,7 +1336,6 @@ struct IVFPQScanner : IVFPQScannerT, this->dis0 = dis0; } -#ifdef __AVX512F__ inline void process_chunks( size_t chunk_size, size_t max_batch_size, @@ -1366,29 +1359,33 @@ struct IVFPQScanner : IVFPQScannerT, for (; batch_idx + 15 < num_active; batch_idx += 16) { __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); - __m128i comp0 = _mm_loadu_si128( - (__m128i*)(compressed_codes + chunk_offset0 + batch_idx)); + __m128i comp0 = + _mm_loadu_si128((__m128i*)(compressed_codes + + chunk_offset0 + batch_idx)); __m512i codes0 = _mm512_cvtepu8_epi32(comp0); acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes0, sim_table0, sizeof(float))); - __m128i comp1 = _mm_loadu_si128( - (__m128i*)(compressed_codes + chunk_offset1 + batch_idx)); + __m128i comp1 = + _mm_loadu_si128((__m128i*)(compressed_codes + + chunk_offset1 + batch_idx)); __m512i codes1 = _mm512_cvtepu8_epi32(comp1); acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes1, sim_table1, sizeof(float))); - __m128i comp2 = _mm_loadu_si128( - (__m128i*)(compressed_codes + chunk_offset2 + batch_idx)); + __m128i comp2 = + _mm_loadu_si128((__m128i*)(compressed_codes + + chunk_offset2 + batch_idx)); __m512i codes2 = _mm512_cvtepu8_epi32(comp2); acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes2, sim_table2, sizeof(float))); - __m128i comp3 = _mm_loadu_si128( - (__m128i*)(compressed_codes + chunk_offset3 + batch_idx)); + __m128i comp3 = + _mm_loadu_si128((__m128i*)(compressed_codes + + chunk_offset3 + batch_idx)); __m512i codes3 = _mm512_cvtepu8_epi32(comp3); acc = _mm512_add_ps( acc, @@ -1414,8 +1411,8 @@ struct IVFPQScanner : IVFPQScannerT, size_t batch_idx = 0; for (; batch_idx + 15 < num_active; batch_idx += 16) { __m512 acc = 
_mm512_loadu_ps(exact_distances + batch_idx); - __m128i comp = _mm_loadu_si128( - (__m128i*)(compressed_codes + chunk_offset + batch_idx)); + __m128i comp = _mm_loadu_si128(( + __m128i*)(compressed_codes + chunk_offset + batch_idx)); __m512i codes = _mm512_cvtepu8_epi32(comp); __m512 m_dist = _mm512_i32gather_ps( codes, sim_table_ptr, sizeof(float)); @@ -1488,6 +1485,11 @@ struct IVFPQScanner : IVFPQScannerT, exact_distances + next_num_active, compressed_exact_distances_vec); + // Update bitset for removed items. + // Unfortunatelly, this is not vectorized as AVX-512 does not + // support a way to scatter at a 1-byte granularity. + // However, we can use a mask to compress the indices and then + // sequentially set the bitset. alignas(64) uint32_t indices_to_remove[16]; __mmask16 mask_should_remove = ~mask_should_keep; size_t num_to_remove = _mm_popcnt_u32(mask_should_remove); @@ -1538,7 +1540,18 @@ struct IVFPQScanner : IVFPQScannerT, uint8_t* compressed_codes = compressed_codes_begin; size_t num_active = 0; + // An important optimization is to skip the compression if we all points + // are active, as we can just use the compressed_codes_begin + // pointer. if (next_num_active < max_batch_size) { + // Compress the codes: here we don't need to process remainders + // as long as `max_batch_size` is a multiple of 64 (which we + // assert in the constructor). Conveniently, compressed_codes is + // allocated to `max_batch_size` * `chunk_size` elements. + // `num_active` is guaranteed to always be less than or equal to + // `max_batch_size`. Only the last batch may be smaller than + // `max_batch_size`, the caller ensures that the batch and + // bitset are padded with zeros. 
compressed_codes = compressed_codes_begin; for (size_t point_idx = 0; point_idx < max_batch_size; point_idx += 64) { @@ -1546,17 +1559,16 @@ struct IVFPQScanner : IVFPQScannerT, __mmask64 mask = _mm512_cmpneq_epi8_mask( active_byteset, _mm512_setzero_si512()); - for (size_t ci = 0; ci < chunk_size; ci++) { - size_t chunk_offset = ci * max_batch_size; - size_t write_pos = 0; - uint64_t m = (uint64_t)mask; - while (m) { - int bit = __builtin_ctzll(m); - compressed_codes[chunk_offset + num_active + write_pos] = - codes[chunk_offset + point_idx + bit]; - write_pos++; - m &= m - 1; - } + for (size_t chunk_idx = 0; chunk_idx < chunk_size; + chunk_idx++) { + size_t chunk_offset = chunk_idx * max_batch_size; + __m512i codes_batch_vec = _mm512_loadu_si512( + codes + chunk_offset + point_idx); + __m512i compressed_batch = + _mm512_maskz_compress_epi8(mask, codes_batch_vec); + _mm512_storeu_si512( + compressed_codes + chunk_offset + num_active, + compressed_batch); } num_active += _mm_popcnt_u64(mask); @@ -1568,7 +1580,6 @@ struct IVFPQScanner : IVFPQScannerT, return std::make_pair(compressed_codes, num_active); } -#endif // __AVX512F__ inline void process_chunks_sparse( size_t chunk_size, @@ -1592,7 +1603,6 @@ struct IVFPQScanner : IVFPQScannerT, } } -#ifdef __AVX512F__ size_t process_batch( const ProductQuantizer& pq, uint8_t* compressed_codes, @@ -1628,20 +1638,32 @@ struct IVFPQScanner : IVFPQScannerT, size_t total_active = 0; __m512 epsilon_broadcast = _mm512_set1_ps(epsilon); + // The remaining active elements computed at the end of each level. + // We initialize to `curr_batch_size` for continuity. size_t next_num_active = curr_batch_size; + // For historical reasons, we initialize dis0 only at + // the beginning of the first level, but we need to access it after + // all levels have been processed, so we declare dis0 here. 
float dis0 = 0; + // Given that `active_indices` indexes the cluster directly, we need + // to offset it by the batch offset when updating the bitset and + // accessing the cum_sums. This way we avoid yet another layer of + // indirection. size_t batch_offset = batch_no * max_batch_size; __m512i batch_offset_broadcast = _mm512_set1_epi32(batch_offset); for (size_t level = 0; (level < n_levels) && (next_num_active > 0); level++) { total_active += next_num_active; + // This ensures the LUT is poitning to the right offset, and is + // properly initialized. We only compute dis0 distances once for + // each cluster, and cache the result. size_t level_offset_sim_table = level * pq.ksub * chunk_size; this->set_list_panorama( cluster_id, coarse_dis_i, sim_table_cache + level_offset_sim_table, - dis0_cache, + dis0_cache, // Only init once for each cluster. level == 0 && batch_no == 0); this->set_sim_table( sim_table_cache + level_offset_sim_table, *dis0_cache); @@ -1649,12 +1671,15 @@ struct IVFPQScanner : IVFPQScannerT, dis0 = this->dis0; __m512 dis0_bcast = _mm512_set1_ps(dis0); + // We multiply by two here so we don't have to do it in the + // kernel. float query_cum_norm = 2 * query_cum_norms[level + 1]; __m512 query_cum_norm_broadcast = _mm512_set1_ps(query_cum_norm); float heap_max = res.top(); __m512 heap_max_broadcast = _mm512_set1_ps(heap_max); + // Codes has padding potentially, cumsum does not. float* cum_sums = cums + curr_batch_size * level; const uint8_t* codes = codes_batch + max_batch_size * chunk_size * level; @@ -1662,6 +1687,10 @@ struct IVFPQScanner : IVFPQScannerT, bool is_sparse = next_num_active < max_batch_size / 16; float* sim_table = this->sim_table; + // Phase 1: Process all chunks and accumulate distances. + // We iterate over chunks first as this keeps the same LUT slice + // within the L1 cache. To avoid register thrashing, we unroll + // 4 chunks at a time. 
size_t num_active_for_filtering = 0; if (is_sparse) { process_chunks_sparse( @@ -1695,6 +1724,7 @@ struct IVFPQScanner : IVFPQScannerT, num_active_for_filtering = na; } + // Phase 2: Filtering logic using accumulated distances. next_num_active = process_filtering( num_active_for_filtering, exact_distances, @@ -1713,6 +1743,7 @@ struct IVFPQScanner : IVFPQScannerT, heap_max); } + // Phase 3: Insert remaining candidates to heap. for (size_t batch_idx = 0; batch_idx < next_num_active; batch_idx++) { res.add(active_indices[batch_idx], dis0 + exact_distances[batch_idx]); @@ -1720,7 +1751,6 @@ struct IVFPQScanner : IVFPQScannerT, return total_active; } -#endif // __AVX512F__ float distance_to_code(const uint8_t* code) const override { assert(precompute_mode == 2); From 03e4520be1302e0c4a877d9038bb8f8a66653a88 Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Tue, 17 Mar 2026 04:33:12 +0000 Subject: [PATCH 04/41] fix compile issues --- faiss/IndexIVFPQPanorama.cpp | 2 +- faiss/python/swigfaiss.swig | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp index ba54da4cb4..84226c3e5f 100644 --- a/faiss/IndexIVFPQPanorama.cpp +++ b/faiss/IndexIVFPQPanorama.cpp @@ -503,7 +503,7 @@ void IndexIVFPQPanorama::search_preassigned( } } - printf("total_active: %f\n", (float)total_active / total_points); + printf("v0: total_active: %f\n", (float)total_active / total_points); } } // namespace faiss diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index 033dc8d072..75292ecb7f 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -96,6 +96,7 @@ typedef uint64_t size_t; #include #include #include +#include #include #include #include @@ -594,6 +595,7 @@ void gpu_sync_all_devices() %ignore faiss::IndexIVFPQ::alloc_type; %include +%include %include %include @@ -779,6 +781,7 @@ void gpu_sync_all_devices() DOWNCAST ( IndexIVFRaBitQ ) DOWNCAST ( IndexIVFRaBitQFastScan ) 
DOWNCAST ( IndexIVFIndependentQuantizer) + DOWNCAST ( IndexIVFPQPanorama ) DOWNCAST ( IndexIVFPQR ) DOWNCAST ( IndexIVFPQ ) DOWNCAST ( IndexIVFPQFastScan ) From e10bccc7f42a14b59399b7c16998235d4587570d Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Tue, 17 Mar 2026 06:13:51 +0000 Subject: [PATCH 05/41] Amazing --- benchs/bench_ivfpq_panorama.py | 154 +++++++++ faiss/CMakeLists.txt | 17 +- faiss/IndexIVFPQ.cpp | 303 ++---------------- faiss/IndexIVFPQPanorama.cpp | 6 +- .../panorama_kernels-avx2.cpp | 239 ++++++++++++++ .../panorama_kernels-avx512.cpp | 238 ++++++++++++++ .../panorama_kernels-generic.cpp | 155 +++++++++ .../impl/panorama_kernels/panorama_kernels.h | 89 +++++ 8 files changed, 910 insertions(+), 291 deletions(-) create mode 100644 benchs/bench_ivfpq_panorama.py create mode 100644 faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp create mode 100644 faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp create mode 100644 faiss/impl/panorama_kernels/panorama_kernels-generic.cpp create mode 100644 faiss/impl/panorama_kernels/panorama_kernels.h diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py new file mode 100644 index 0000000000..ebd1336092 --- /dev/null +++ b/benchs/bench_ivfpq_panorama.py @@ -0,0 +1,154 @@ +# Quick 10% verification of IVFPQPanorama (with index caching) + +import multiprocessing as mp +import os +import time + +import faiss +import numpy as np + +print("Compile options:", faiss.get_compile_options(), flush=True) + + +def fvecs_read(fname): + a = np.fromfile(fname, dtype="float32") + d = a[0].view("int32") + return a.reshape(-1, d + 1)[:, 1:].copy() + + +GIST_DIR = "/home/lutex/PCA_init" +CACHE_DIR = "/home/lutex/faiss-panorama/index_cache" +os.makedirs(CACHE_DIR, exist_ok=True) + +IVFPQ_CACHE = os.path.join(CACHE_DIR, "ivfpq_10pct.index") +IVFPQ_TRAINED_CACHE = os.path.join(CACHE_DIR, "ivfpq_trained_10pct.index") + +print("Loading GIST1M data (10% subset)...", flush=True) +xb_full = 
fvecs_read(os.path.join(GIST_DIR, "gist1m_base.fvecs")) +xq = fvecs_read(os.path.join(GIST_DIR, "gist1m_query.fvecs")) + +nb_full, d = xb_full.shape +nb = nb_full // 10 # 10% = 100000 +xb = xb_full[:nb].copy() +del xb_full + +nq = xq.shape[0] +print(f"Database: {nb} x {d}, Queries: {nq} x {d}", flush=True) + +xt = xb[:50000].copy() + +k = 10 +M = 960 +nbits = 8 +nlist = 64 +n_levels = 8 +epsilon = 1.0 +batch_size = 128 + +GT_PATH = os.path.join(CACHE_DIR, "gt_10pct.npy") +if os.path.exists(GT_PATH): + gt_I = np.load(GT_PATH) + print(f"Loaded cached ground truth: {gt_I.shape}", flush=True) +else: + print("Computing ground truth on 10% subset...", flush=True) + flat = faiss.IndexFlatL2(d) + flat.add(xb) + _, gt_I = flat.search(xq, k) + np.save(GT_PATH, gt_I) + print("Ground truth computed and cached.", flush=True) + + +def eval_recall(index, nprobe_val): + t0 = time.time() + _, I = index.search(xq, k=k) + t = time.time() - t0 + speed = t * 1000 / nq + qps = 1000 / speed + corrects = sum(len(set(gt_I[i]) & set(I[i])) for i in range(nq)) + recall = corrects / (nq * k) + print( + f"\tnprobe {nprobe_val:3d}, Recall@{k}: " + f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}", + flush=True, + ) + return recall, qps + + +# faiss.omp_set_num_threads(mp.cpu_count()) + +# --- IVFPQ baseline (cached) --- +if os.path.exists(IVFPQ_CACHE): + print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True) + t0 = time.time() + ivfpq = faiss.read_index(IVFPQ_CACHE) + print(f" Loaded in {time.time() - t0:.1f}s", flush=True) +else: + print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True) + quantizer = faiss.IndexFlatL2(d) + ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) + t0 = time.time() + ivfpq.train(xt) + print(f" Training took {time.time() - t0:.1f}s", flush=True) + + print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) + faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE) + + t0 = time.time() + ivfpq.add(xb) + print(f" Adding 
took {time.time() - t0:.1f}s", flush=True) + + print(f" Saving full index to {IVFPQ_CACHE}...", flush=True) + faiss.write_index(ivfpq, IVFPQ_CACHE) + +faiss.omp_set_num_threads(1) +print("\n====== IVFPQ baseline", flush=True) +for nprobe in [1, 2, 4, 8, 16]: + ivfpq.nprobe = nprobe + eval_recall(ivfpq, nprobe) + +# --- IVFPQPanorama (reuse trained PQ from cache) --- +faiss.omp_set_num_threads(mp.cpu_count()) + +if os.path.exists(IVFPQ_TRAINED_CACHE): + print(f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...", flush=True) + trained = faiss.read_index(IVFPQ_TRAINED_CACHE) + quantizer2 = trained.quantizer + trained.own_fields = False + + ivfpq_pano = faiss.IndexIVFPQPanorama( + quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size + ) + centroids = faiss.vector_to_array(trained.pq.centroids) + faiss.copy_array_to_vector(centroids, ivfpq_pano.pq.centroids) + ivfpq_pano.is_trained = True + ivfpq_pano.use_precomputed_table = 1 + ivfpq_pano.precompute_table() + + print(" Reused trained PQ (skipped training).", flush=True) + t0 = time.time() + ivfpq_pano.add(xb) + print(f" Adding took {time.time() - t0:.1f}s", flush=True) +else: + print( + f"\nBuilding IVFPQPanorama from scratch: nlist={nlist}, M={M}, nbits={nbits}, " + f"n_levels={n_levels}, epsilon={epsilon}, batch_size={batch_size}", + flush=True, + ) + quantizer2 = faiss.IndexFlatL2(d) + ivfpq_pano = faiss.IndexIVFPQPanorama( + quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size + ) + t0 = time.time() + ivfpq_pano.train(xt) + print(f" Training took {time.time() - t0:.1f}s", flush=True) + t0 = time.time() + ivfpq_pano.add(xb) + print(f" Adding took {time.time() - t0:.1f}s", flush=True) + +faiss.omp_set_num_threads(1) +print("\n====== IVFPQPanorama", flush=True) +for nprobe in [1, 2, 4, 8, 16]: + ivfpq_pano.nprobe = nprobe + eval_recall(ivfpq_pano, nprobe) + +print("\nVerification complete!", flush=True) diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index 
5a6c37ffbd..84a6eb1aac 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -10,6 +10,7 @@ # ============================================================================= set(FAISS_SIMD_AVX2_SRC impl/fast_scan/impl-avx2.cpp + impl/panorama_kernels/panorama_kernels-avx2.cpp impl/pq_code_distance/pq_code_distance-avx2.cpp impl/scalar_quantizer/sq-avx2.cpp impl/approx_topk/avx2.cpp @@ -17,6 +18,7 @@ set(FAISS_SIMD_AVX2_SRC ) set(FAISS_SIMD_AVX512_SRC impl/fast_scan/impl-avx512.cpp + impl/panorama_kernels/panorama_kernels-avx512.cpp impl/pq_code_distance/pq_code_distance-avx512.cpp impl/scalar_quantizer/sq-avx512.cpp utils/simd_impl/distances_avx512.cpp @@ -106,6 +108,7 @@ set(FAISS_SRC impl/NSG.cpp impl/PolysemousTraining.cpp impl/ProductQuantizer.cpp + impl/panorama_kernels/panorama_kernels-generic.cpp impl/pq_code_distance/pq_code_distance-generic.cpp impl/AdditiveQuantizer.cpp impl/RaBitQuantizer.cpp @@ -280,6 +283,7 @@ set(FAISS_HEADERS impl/fast_scan/simd_result_handlers.h impl/zerocopy_io.h utils/pq_code_distance.h + impl/panorama_kernels/panorama_kernels.h impl/pq_code_distance/pq_code_distance-inl.h invlists/BlockInvertedLists.h invlists/DirectMap.h @@ -356,6 +360,15 @@ endif() # Export FAISS_HEADERS variable to parent scope. set(FAISS_HEADERS ${FAISS_HEADERS} PARENT_SCOPE) +# Detect BMI2 compiler support (PEXT/PDEP used in Panorama code compression). 
+include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-mbmi2" COMPILER_SUPPORTS_BMI2) +if(COMPILER_SUPPORTS_BMI2) + set(FAISS_BMI2_FLAGS "-mbmi2") +else() + set(FAISS_BMI2_FLAGS "") +endif() + add_library(faiss ${FAISS_SRC}) add_library(faiss_avx2 ${FAISS_SRC}) @@ -363,7 +376,7 @@ if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" set_target_properties(faiss_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE) endif() if(NOT WIN32) - target_compile_options(faiss_avx2 PRIVATE $<$:-mavx2 -mfma -mf16c -mpopcnt>) + target_compile_options(faiss_avx2 PRIVATE $<$:-mavx2 -mfma -mf16c -mpopcnt ${FAISS_BMI2_FLAGS}>) else() # MSVC enables FMA with /arch:AVX2; no separate flags for F16C, POPCNT # Ref. FMA (under /arch:AVX2): https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64 @@ -383,7 +396,7 @@ endif() if(NOT WIN32) # All modern CPUs support F, CD, VL, DQ, BW extensions. # Ref: https://en.wikipedia.org/wiki/AVX512 - target_compile_options(faiss_avx512 PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt>) + target_compile_options(faiss_avx512 PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt ${FAISS_BMI2_FLAGS}>) else() target_compile_options(faiss_avx512 PRIVATE $<$:/arch:AVX512>) # we need bigobj for the swig wrapper diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp index 0db6c0470e..aca27f903d 100644 --- a/faiss/IndexIVFPQ.cpp +++ b/faiss/IndexIVFPQ.cpp @@ -9,7 +9,6 @@ #include -#include #include #include #include @@ -33,6 +32,7 @@ #include #include #include +#include #include namespace faiss { @@ -774,23 +774,8 @@ struct QueryTables { const float* b = sim_table_2; float* c = sim_table_ptr; - const size_t n16 = n / 16; - const size_t n_for_masking = n % 16; - - const __m512 bfmm = _mm512_set1_ps(bf); - - size_t idx = 0; - for (idx = 0; idx < n16 * 16; idx += 16) { - const __m512 bx = _mm512_loadu_ps(b + idx); - const __m512 abmul = _mm512_mul_ps(bfmm, 
bx); - _mm512_storeu_ps(c + idx, abmul); - } - - if (n_for_masking > 0) { - const __mmask16 mask = (1 << n_for_masking) - 1; - const __m512 bx = _mm512_maskz_loadu_ps(mask, b + idx); - const __m512 abmul = _mm512_mul_ps(bfmm, bx); - _mm512_mask_storeu_ps(c + idx, mask, abmul); + for (size_t idx = 0; idx < n; idx++) { + c[idx] = bf * b[idx]; } sim_table = sim_table_ptr; @@ -1336,250 +1321,10 @@ struct IVFPQScanner : IVFPQScannerT, this->dis0 = dis0; } - inline void process_chunks( - size_t chunk_size, - size_t max_batch_size, - size_t num_active, - float* sim_table, - uint8_t* compressed_codes, - float* exact_distances) { - size_t chunk_idx = 0; - for (; chunk_idx + 3 < chunk_size; chunk_idx += 4) { - size_t chunk_offset0 = (chunk_idx + 0) * max_batch_size; - size_t chunk_offset1 = (chunk_idx + 1) * max_batch_size; - size_t chunk_offset2 = (chunk_idx + 2) * max_batch_size; - size_t chunk_offset3 = (chunk_idx + 3) * max_batch_size; - - float* sim_table0 = sim_table + (chunk_idx + 0) * 256; - float* sim_table1 = sim_table + (chunk_idx + 1) * 256; - float* sim_table2 = sim_table + (chunk_idx + 2) * 256; - float* sim_table3 = sim_table + (chunk_idx + 3) * 256; - - size_t batch_idx = 0; - for (; batch_idx + 15 < num_active; batch_idx += 16) { - __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); - - __m128i comp0 = - _mm_loadu_si128((__m128i*)(compressed_codes + - chunk_offset0 + batch_idx)); - __m512i codes0 = _mm512_cvtepu8_epi32(comp0); - acc = _mm512_add_ps( - acc, - _mm512_i32gather_ps(codes0, sim_table0, sizeof(float))); - - __m128i comp1 = - _mm_loadu_si128((__m128i*)(compressed_codes + - chunk_offset1 + batch_idx)); - __m512i codes1 = _mm512_cvtepu8_epi32(comp1); - acc = _mm512_add_ps( - acc, - _mm512_i32gather_ps(codes1, sim_table1, sizeof(float))); - - __m128i comp2 = - _mm_loadu_si128((__m128i*)(compressed_codes + - chunk_offset2 + batch_idx)); - __m512i codes2 = _mm512_cvtepu8_epi32(comp2); - acc = _mm512_add_ps( - acc, - _mm512_i32gather_ps(codes2, 
sim_table2, sizeof(float))); - - __m128i comp3 = - _mm_loadu_si128((__m128i*)(compressed_codes + - chunk_offset3 + batch_idx)); - __m512i codes3 = _mm512_cvtepu8_epi32(comp3); - acc = _mm512_add_ps( - acc, - _mm512_i32gather_ps(codes3, sim_table3, sizeof(float))); - - _mm512_storeu_ps(exact_distances + batch_idx, acc); - } - - for (; batch_idx < num_active; batch_idx += 1) { - float acc = exact_distances[batch_idx]; - acc += sim_table0[compressed_codes[chunk_offset0 + batch_idx]]; - acc += sim_table1[compressed_codes[chunk_offset1 + batch_idx]]; - acc += sim_table2[compressed_codes[chunk_offset2 + batch_idx]]; - acc += sim_table3[compressed_codes[chunk_offset3 + batch_idx]]; - exact_distances[batch_idx] = acc; - } - } - - for (; chunk_idx < chunk_size; chunk_idx++) { - size_t chunk_offset = chunk_idx * max_batch_size; - float* sim_table_ptr = sim_table + chunk_idx * 256; - - size_t batch_idx = 0; - for (; batch_idx + 15 < num_active; batch_idx += 16) { - __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); - __m128i comp = _mm_loadu_si128(( - __m128i*)(compressed_codes + chunk_offset + batch_idx)); - __m512i codes = _mm512_cvtepu8_epi32(comp); - __m512 m_dist = _mm512_i32gather_ps( - codes, sim_table_ptr, sizeof(float)); - acc = _mm512_add_ps(acc, m_dist); - _mm512_storeu_ps(exact_distances + batch_idx, acc); - } - - for (; batch_idx < num_active; batch_idx += 1) { - exact_distances[batch_idx] += sim_table_ptr - [compressed_codes[chunk_offset + batch_idx]]; - } - } - } - - inline size_t process_filtering( - size_t num_active, - float* exact_distances, - uint32_t* active_indices, - __m512i batch_offset_broadcast, - float* cum_sums, - __m512 dis0_broadcast, - __m512 query_cum_norm_broadcast, - __m512 epsilon_broadcast, - __m512 heap_max_broadcast, - uint8_t* bitset, - size_t batch_offset, - float dis0, - float query_cum_norm, - float epsilon, - float heap_max) { - size_t next_num_active = 0; - size_t batch_idx = 0; - - for (; batch_idx + 15 < num_active; 
batch_idx += 16) { - __m512 exact_distances_batch = - _mm512_loadu_ps(exact_distances + batch_idx); - - __m512i active_indices_batch = - _mm512_loadu_si512(active_indices + batch_idx); - __m512i offsetted_active_indices_batch = _mm512_sub_epi32( - active_indices_batch, batch_offset_broadcast); - __m512 cum_sums_batch = _mm512_i32gather_ps( - offsetted_active_indices_batch, cum_sums, sizeof(float)); - - __m512 exact_distances_batch_dis0 = - _mm512_add_ps(exact_distances_batch, dis0_broadcast); - __m512 cauchy_schwarz_bound = - _mm512_mul_ps(query_cum_norm_broadcast, cum_sums_batch); - cauchy_schwarz_bound = - _mm512_mul_ps(cauchy_schwarz_bound, epsilon_broadcast); - - __m512 lower_bound = _mm512_sub_ps( - exact_distances_batch_dis0, cauchy_schwarz_bound); - __mmask16 mask_should_keep = _mm512_cmp_ps_mask( - lower_bound, heap_max_broadcast, _CMP_LT_OQ); - - __m512i compressed_active_indices_vec = _mm512_mask_compress_epi32( - _mm512_setzero_si512(), - mask_should_keep, - active_indices_batch); - _mm512_storeu_si512( - active_indices + next_num_active, - compressed_active_indices_vec); - - __m512 compressed_exact_distances_vec = _mm512_mask_compress_ps( - _mm512_setzero_ps(), - mask_should_keep, - exact_distances_batch); - _mm512_storeu_ps( - exact_distances + next_num_active, - compressed_exact_distances_vec); - - // Update bitset for removed items. - // Unfortunatelly, this is not vectorized as AVX-512 does not - // support a way to scatter at a 1-byte granularity. - // However, we can use a mask to compress the indices and then - // sequentially set the bitset. 
- alignas(64) uint32_t indices_to_remove[16]; - __mmask16 mask_should_remove = ~mask_should_keep; - size_t num_to_remove = _mm_popcnt_u32(mask_should_remove); - - __m512i compressed_indices_to_remove_vec = - _mm512_mask_compress_epi32( - _mm512_setzero_si512(), - mask_should_remove, - active_indices_batch); - _mm512_storeu_si512( - indices_to_remove, compressed_indices_to_remove_vec); - - for (size_t idx = 0; idx < num_to_remove; idx++) { - bitset[indices_to_remove[idx] - batch_offset] = 0; - } - - next_num_active += _mm_popcnt_u32(mask_should_keep); - } - - for (; batch_idx < num_active; batch_idx++) { - float exact_distance = exact_distances[batch_idx]; - - float cum_sum = cum_sums[active_indices[batch_idx] - batch_offset]; - float cauchy_schwarz_bound = cum_sum * query_cum_norm; - float lower_bound = - exact_distance - cauchy_schwarz_bound * epsilon + dis0; - - uint32_t should_keep = heap_max > lower_bound; - active_indices[next_num_active] = active_indices[batch_idx]; - exact_distances[next_num_active] = exact_distance; - - bitset[active_indices[batch_idx] - batch_offset] = should_keep; - - next_num_active += should_keep; - } - - return next_num_active; - } - - inline std::pair process_code_compression( - size_t level, - size_t next_num_active, - size_t max_batch_size, - size_t chunk_size, - uint8_t* compressed_codes_begin, - uint8_t* bitset, - const uint8_t* codes) { - uint8_t* compressed_codes = compressed_codes_begin; - size_t num_active = 0; - - // An important optimization is to skip the compression if we all points - // are active, as we can just use the compressed_codes_begin - // pointer. - if (next_num_active < max_batch_size) { - // Compress the codes: here we don't need to process remainders - // as long as `max_batch_size` is a multiple of 64 (which we - // assert in the constructor). Conveniently, compressed_codes is - // allocated to `max_batch_size` * `chunk_size` elements. 
- // `num_active` is guaranteed to always be less than or equal to - // `max_batch_size`. Only the last batch may be smaller than - // `max_batch_size`, the caller ensures that the batch and - // bitset are padded with zeros. - compressed_codes = compressed_codes_begin; - for (size_t point_idx = 0; point_idx < max_batch_size; - point_idx += 64) { - __m512i active_byteset = _mm512_loadu_si512(bitset + point_idx); - __mmask64 mask = _mm512_cmpneq_epi8_mask( - active_byteset, _mm512_setzero_si512()); - - for (size_t chunk_idx = 0; chunk_idx < chunk_size; - chunk_idx++) { - size_t chunk_offset = chunk_idx * max_batch_size; - __m512i codes_batch_vec = _mm512_loadu_si512( - codes + chunk_offset + point_idx); - __m512i compressed_batch = - _mm512_maskz_compress_epi8(mask, codes_batch_vec); - _mm512_storeu_si512( - compressed_codes + chunk_offset + num_active, - compressed_batch); - } - - num_active += _mm_popcnt_u64(mask); - } - } else { - num_active = next_num_active; - compressed_codes = const_cast(codes); - } - - return std::make_pair(compressed_codes, num_active); - } + // Panorama kernels (process_chunks, process_filtering, + // process_code_compression) are implemented in + // faiss/impl/panorama_kernels/ with scalar and AVX-512 variants. + // The linker selects the right one based on the SIMD compile target. inline void process_chunks_sparse( size_t chunk_size, @@ -1636,7 +1381,6 @@ struct IVFPQScanner : IVFPQScannerT, 0}; uint8_t* compressed_codes_begin = compressed_codes; size_t total_active = 0; - __m512 epsilon_broadcast = _mm512_set1_ps(epsilon); // The remaining active elements computed at the end of each level. // We initialize to `curr_batch_size` for continuity. @@ -1650,12 +1394,11 @@ struct IVFPQScanner : IVFPQScannerT, // accessing the cum_sums. This way we avoid yet another layer of // indirection. 
size_t batch_offset = batch_no * max_batch_size; - __m512i batch_offset_broadcast = _mm512_set1_epi32(batch_offset); for (size_t level = 0; (level < n_levels) && (next_num_active > 0); level++) { total_active += next_num_active; - // This ensures the LUT is poitning to the right offset, and is + // This ensures the LUT is pointing to the right offset, and is // properly initialized. We only compute dis0 distances once for // each cluster, and cache the result. size_t level_offset_sim_table = level * pq.ksub * chunk_size; @@ -1669,15 +1412,12 @@ struct IVFPQScanner : IVFPQScannerT, sim_table_cache + level_offset_sim_table, *dis0_cache); dis0 = this->dis0; - __m512 dis0_bcast = _mm512_set1_ps(dis0); // We multiply by two here so we don't have to do it in the // kernel. float query_cum_norm = 2 * query_cum_norms[level + 1]; - __m512 query_cum_norm_broadcast = _mm512_set1_ps(query_cum_norm); float heap_max = res.top(); - __m512 heap_max_broadcast = _mm512_set1_ps(heap_max); // Codes has padding potentially, cumsum does not. float* cum_sums = cums + curr_batch_size * level; @@ -1705,16 +1445,16 @@ struct IVFPQScanner : IVFPQScannerT, pq.ksub); num_active_for_filtering = next_num_active; } else { - auto [cc, na] = process_code_compression( - level, - next_num_active, - max_batch_size, - chunk_size, - compressed_codes_begin, - bitset, - codes); - - process_chunks( + auto [cc, na] = + panorama_kernels::process_code_compression( + next_num_active, + max_batch_size, + chunk_size, + compressed_codes_begin, + bitset, + codes); + + panorama_kernels::process_chunks( chunk_size, max_batch_size, na, @@ -1725,16 +1465,11 @@ struct IVFPQScanner : IVFPQScannerT, } // Phase 2: Filtering logic using accumulated distances. 
- next_num_active = process_filtering( + next_num_active = panorama_kernels::process_filtering( num_active_for_filtering, exact_distances, active_indices, - batch_offset_broadcast, cum_sums, - dis0_bcast, - query_cum_norm_broadcast, - epsilon_broadcast, - heap_max_broadcast, bitset, batch_offset, dis0, diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp index 84226c3e5f..aae0811176 100644 --- a/faiss/IndexIVFPQPanorama.cpp +++ b/faiss/IndexIVFPQPanorama.cpp @@ -439,10 +439,6 @@ void IndexIVFPQPanorama::search_preassigned( bitset.begin() + curr_batch_size, bitset.end(), 0); - std::fill( - compressed_codes.begin(), - compressed_codes.end(), - 0); for (size_t idx = 0; idx < curr_batch_size; idx++) { exact_distances[idx] = init_exact_distances @@ -503,7 +499,7 @@ void IndexIVFPQPanorama::search_preassigned( } } - printf("v0: total_active: %f\n", (float)total_active / total_points); + printf("vv: total_active: %f\n", (float)total_active / total_points); } } // namespace faiss diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp new file mode 100644 index 0000000000..235c5d4d78 --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp @@ -0,0 +1,239 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// AVX2 implementations of Panorama kernels. +// Uses 256-bit gather for process_chunks, scalar filtering (no +// compress instruction in AVX2), and BMI2 PEXT/PDEP for code +// compression where available. 
+ +#ifdef COMPILE_SIMD_AVX2 +#ifndef COMPILE_SIMD_AVX512 + +#include + +#include + +#include + +namespace faiss { +namespace panorama_kernels { + +void process_chunks( + size_t chunk_size, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + size_t chunk_idx = 0; + + // Process 4 chunks at a time to amortize loop overhead and keep + // the accumulator in registers across chunks. + for (; chunk_idx + 3 < chunk_size; chunk_idx += 4) { + size_t chunk_offset0 = (chunk_idx + 0) * max_batch_size; + size_t chunk_offset1 = (chunk_idx + 1) * max_batch_size; + size_t chunk_offset2 = (chunk_idx + 2) * max_batch_size; + size_t chunk_offset3 = (chunk_idx + 3) * max_batch_size; + + float* sim_table0 = sim_table + (chunk_idx + 0) * 256; + float* sim_table1 = sim_table + (chunk_idx + 1) * 256; + float* sim_table2 = sim_table + (chunk_idx + 2) * 256; + float* sim_table3 = sim_table + (chunk_idx + 3) * 256; + + size_t batch_idx = 0; + for (; batch_idx + 7 < num_active; batch_idx += 8) { + __m256 acc = _mm256_loadu_ps(exact_distances + batch_idx); + + // Load 8 byte codes, zero-extend to 32-bit indices. 
+ __m128i raw0 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + chunk_offset0 + batch_idx)); + __m256i codes0 = _mm256_cvtepu8_epi32(raw0); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table0, codes0, sizeof(float))); + + __m128i raw1 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + chunk_offset1 + batch_idx)); + __m256i codes1 = _mm256_cvtepu8_epi32(raw1); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table1, codes1, sizeof(float))); + + __m128i raw2 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + chunk_offset2 + batch_idx)); + __m256i codes2 = _mm256_cvtepu8_epi32(raw2); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table2, codes2, sizeof(float))); + + __m128i raw3 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + chunk_offset3 + batch_idx)); + __m256i codes3 = _mm256_cvtepu8_epi32(raw3); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table3, codes3, sizeof(float))); + + _mm256_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; batch_idx < num_active; batch_idx += 1) { + float acc = exact_distances[batch_idx]; + acc += sim_table0[compressed_codes[chunk_offset0 + batch_idx]]; + acc += sim_table1[compressed_codes[chunk_offset1 + batch_idx]]; + acc += sim_table2[compressed_codes[chunk_offset2 + batch_idx]]; + acc += sim_table3[compressed_codes[chunk_offset3 + batch_idx]]; + exact_distances[batch_idx] = acc; + } + } + + for (; chunk_idx < chunk_size; chunk_idx++) { + size_t chunk_offset = chunk_idx * max_batch_size; + float* sim_table_ptr = sim_table + chunk_idx * 256; + + size_t batch_idx = 0; + for (; batch_idx + 7 < num_active; batch_idx += 8) { + __m256 acc = _mm256_loadu_ps(exact_distances + batch_idx); + __m128i raw = _mm_loadl_epi64( + (__m128i*)(compressed_codes + chunk_offset + batch_idx)); + __m256i codes = _mm256_cvtepu8_epi32(raw); + __m256 m_dist = _mm256_i32gather_ps( + sim_table_ptr, codes, sizeof(float)); + acc = _mm256_add_ps(acc, m_dist); + _mm256_storeu_ps(exact_distances + batch_idx, acc); + 
} + + for (; batch_idx < num_active; batch_idx += 1) { + exact_distances[batch_idx] += sim_table_ptr + [compressed_codes[chunk_offset + batch_idx]]; + } + } +} + +size_t process_filtering( + size_t num_active, + float* exact_distances, + uint32_t* active_indices, + float* cum_sums, + uint8_t* bitset, + size_t batch_offset, + float dis0, + float query_cum_norm, + float epsilon, + float heap_max) { + size_t next_num_active = 0; + for (size_t i = 0; i < num_active; i++) { + float exact_distance = exact_distances[i]; + float cum_sum = cum_sums[active_indices[i] - batch_offset]; + float lower_bound = + exact_distance + dis0 - cum_sum * query_cum_norm * epsilon; + + bool keep = heap_max > lower_bound; + active_indices[next_num_active] = active_indices[i]; + exact_distances[next_num_active] = exact_distance; + bitset[active_indices[i] - batch_offset] = keep; + next_num_active += keep; + } + return next_num_active; +} + +std::pair process_code_compression( + size_t next_num_active, + size_t max_batch_size, + size_t chunk_size, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + uint8_t* compressed_codes = compressed_codes_begin; + size_t num_active = 0; + + // An important optimization is to skip the compression if all points + // are active, as we can just return the original `codes` pointer. + if (next_num_active < max_batch_size) { + // Compress the codes: here we don't need to process remainders + // as long as `max_batch_size` is a multiple of 64 (which we + // assert in the constructor). Conveniently, compressed_codes is + // allocated to `max_batch_size` * `chunk_size` elements. + // `num_active` is guaranteed to always be less than or equal to + // `max_batch_size`. Only the last batch may be smaller than + // `max_batch_size`, the caller ensures that the batch and + // bitset are padded with zeros.
+ compressed_codes = compressed_codes_begin; + for (size_t point_idx = 0; point_idx < max_batch_size; + point_idx += 64) { + // Build a 64-bit mask from the byteset: each byte is + // 0 or 1, collect into a single bitmask. + uint64_t mask = 0; +#ifdef __BMI2__ + for (int g = 0; g < 8; g++) { + uint64_t bytes; + memcpy(&bytes, bitset + point_idx + g * 8, 8); + uint8_t bits = (uint8_t)_pext_u64( + bytes, 0x0101010101010101ULL); + mask |= ((uint64_t)bits << (g * 8)); + } +#else + for (int b = 0; b < 64; b++) { + if (bitset[point_idx + b]) + mask |= (1ULL << b); + } +#endif + + // Byte-level stream compaction. +#ifdef __BMI2__ + // PEXT/PDEP path: process 8 bytes at a time. PDEP + // expands the per-byte mask bits into a per-byte lane + // mask, then PEXT extracts only the selected bytes. + for (size_t ci = 0; ci < chunk_size; ci++) { + size_t chunk_offset = ci * max_batch_size; + const uint8_t* src = codes + chunk_offset + point_idx; + uint8_t* dst = compressed_codes + chunk_offset + num_active; + int write_pos = 0; + for (int g = 0; g < 8; g++) { + uint64_t src_val; + memcpy(&src_val, src + g * 8, 8); + uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF); + uint64_t byte_mask = + _pdep_u64(submask, 0x0101010101010101ULL) * + 0xFF; + uint64_t compressed_val = _pext_u64(src_val, byte_mask); + int count = __builtin_popcount(submask); + memcpy(dst + write_pos, &compressed_val, 8); + write_pos += count; + } + } +#else + // Scalar fallback: scan set bits one by one and copy + // the corresponding code byte. 
+ for (size_t ci = 0; ci < chunk_size; ci++) { + size_t chunk_offset = ci * max_batch_size; + const uint8_t* src = codes + chunk_offset + point_idx; + uint8_t* dst = compressed_codes + chunk_offset + num_active; + int write_pos = 0; + uint64_t m = mask; + while (m) { + int bit = __builtin_ctzll(m); + dst[write_pos++] = src[bit]; + m &= m - 1; + } + } +#endif + + num_active += __builtin_popcountll(mask); + } + } else { + num_active = next_num_active; + compressed_codes = const_cast(codes); + } + + return std::make_pair(compressed_codes, num_active); +} + +} // namespace panorama_kernels +} // namespace faiss + +#endif // COMPILE_SIMD_AVX512 +#endif // COMPILE_SIMD_AVX2 diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp new file mode 100644 index 0000000000..6c6f0f24db --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -0,0 +1,238 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifdef COMPILE_SIMD_AVX512 + +#include + +#include + +#include + +namespace faiss { +namespace panorama_kernels { + +void process_chunks( + size_t chunk_size, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + size_t chunk_idx = 0; + + // Process 4 chunks at a time to amortize loop overhead and keep + // the accumulator in registers across chunks. 
+ for (; chunk_idx + 3 < chunk_size; chunk_idx += 4) { + size_t chunk_offset0 = (chunk_idx + 0) * max_batch_size; + size_t chunk_offset1 = (chunk_idx + 1) * max_batch_size; + size_t chunk_offset2 = (chunk_idx + 2) * max_batch_size; + size_t chunk_offset3 = (chunk_idx + 3) * max_batch_size; + + float* sim_table0 = sim_table + (chunk_idx + 0) * 256; + float* sim_table1 = sim_table + (chunk_idx + 1) * 256; + float* sim_table2 = sim_table + (chunk_idx + 2) * 256; + float* sim_table3 = sim_table + (chunk_idx + 3) * 256; + + size_t batch_idx = 0; + for (; batch_idx + 15 < num_active; batch_idx += 16) { + __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); + + __m128i comp0 = + _mm_loadu_si128((__m128i*)(compressed_codes + + chunk_offset0 + batch_idx)); + __m512i codes0 = _mm512_cvtepu8_epi32(comp0); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes0, sim_table0, sizeof(float))); + + __m128i comp1 = + _mm_loadu_si128((__m128i*)(compressed_codes + + chunk_offset1 + batch_idx)); + __m512i codes1 = _mm512_cvtepu8_epi32(comp1); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes1, sim_table1, sizeof(float))); + + __m128i comp2 = + _mm_loadu_si128((__m128i*)(compressed_codes + + chunk_offset2 + batch_idx)); + __m512i codes2 = _mm512_cvtepu8_epi32(comp2); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes2, sim_table2, sizeof(float))); + + __m128i comp3 = + _mm_loadu_si128((__m128i*)(compressed_codes + + chunk_offset3 + batch_idx)); + __m512i codes3 = _mm512_cvtepu8_epi32(comp3); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes3, sim_table3, sizeof(float))); + + _mm512_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; batch_idx < num_active; batch_idx += 1) { + float acc = exact_distances[batch_idx]; + acc += sim_table0[compressed_codes[chunk_offset0 + batch_idx]]; + acc += sim_table1[compressed_codes[chunk_offset1 + batch_idx]]; + acc += sim_table2[compressed_codes[chunk_offset2 + batch_idx]]; + acc += 
sim_table3[compressed_codes[chunk_offset3 + batch_idx]]; + exact_distances[batch_idx] = acc; + } + } + + for (; chunk_idx < chunk_size; chunk_idx++) { + size_t chunk_offset = chunk_idx * max_batch_size; + float* sim_table_ptr = sim_table + chunk_idx * 256; + + size_t batch_idx = 0; + for (; batch_idx + 15 < num_active; batch_idx += 16) { + __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); + __m128i comp = _mm_loadu_si128(( + __m128i*)(compressed_codes + chunk_offset + batch_idx)); + __m512i codes = _mm512_cvtepu8_epi32(comp); + __m512 m_dist = _mm512_i32gather_ps( + codes, sim_table_ptr, sizeof(float)); + acc = _mm512_add_ps(acc, m_dist); + _mm512_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; batch_idx < num_active; batch_idx += 1) { + exact_distances[batch_idx] += sim_table_ptr + [compressed_codes[chunk_offset + batch_idx]]; + } + } +} + +size_t process_filtering( + size_t num_active, + float* exact_distances, + uint32_t* active_indices, + float* cum_sums, + uint8_t* bitset, + size_t batch_offset, + float dis0, + float query_cum_norm, + float epsilon, + float heap_max) { + size_t next_num_active = 0; + for (size_t i = 0; i < num_active; i++) { + float exact_distance = exact_distances[i]; + float cum_sum = cum_sums[active_indices[i] - batch_offset]; + float lower_bound = + exact_distance + dis0 - cum_sum * query_cum_norm * epsilon; + + bool keep = heap_max > lower_bound; + active_indices[next_num_active] = active_indices[i]; + exact_distances[next_num_active] = exact_distance; + bitset[active_indices[i] - batch_offset] = keep; + next_num_active += keep; + } + return next_num_active; +} + +std::pair process_code_compression( + size_t next_num_active, + size_t max_batch_size, + size_t chunk_size, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + uint8_t* compressed_codes = compressed_codes_begin; + size_t num_active = 0; + + // An important optimization is to skip the compression if all points + // are active, as 
we can just use the compressed_codes_begin pointer. + if (next_num_active < max_batch_size) { + // Compress the codes: here we don't need to process remainders + // as long as `max_batch_size` is a multiple of 64 (which we + // assert in the constructor). Conveniently, compressed_codes is + // allocated to `max_batch_size` * `chunk_size` elements. + // `num_active` is guaranteed to always be less than or equal to + // `max_batch_size`. Only the last batch may be smaller than + // `max_batch_size`, the caller ensures that the batch and + // bitset are padded with zeros. + compressed_codes = compressed_codes_begin; + for (size_t point_idx = 0; point_idx < max_batch_size; + point_idx += 64) { + // Build a 64-bit mask from the byteset: each byte is + // 0 or 1, collect into a single bitmask. + uint64_t mask = 0; +#ifdef __BMI2__ + // PEXT path: extract the LSB of each byte into a + // single bit, producing a 64-bit bitmask. + for (int g = 0; g < 8; g++) { + uint64_t bytes; + memcpy(&bytes, bitset + point_idx + g * 8, 8); + uint8_t bits = (uint8_t)_pext_u64( + bytes, 0x0101010101010101ULL); + mask |= ((uint64_t)bits << (g * 8)); + } +#else + for (int b = 0; b < 64; b++) { + if (bitset[point_idx + b]) + mask |= (1ULL << b); + } +#endif + + // Byte-level stream compaction (replaces + // _mm512_maskz_compress_epi8 which requires VBMI2). +#ifdef __BMI2__ + // PEXT/PDEP path: process 8 bytes at a time. PDEP + // expands the per-byte mask bits into a per-byte lane + // mask, then PEXT extracts only the selected bytes. 
+ for (size_t ci = 0; ci < chunk_size; ci++) { + size_t chunk_offset = ci * max_batch_size; + const uint8_t* src = codes + chunk_offset + point_idx; + uint8_t* dst = compressed_codes + chunk_offset + num_active; + int write_pos = 0; + for (int g = 0; g < 8; g++) { + uint64_t src_val; + memcpy(&src_val, src + g * 8, 8); + uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF); + uint64_t byte_mask = + _pdep_u64(submask, 0x0101010101010101ULL) * + 0xFF; + uint64_t compressed_val = _pext_u64(src_val, byte_mask); + int count = __builtin_popcount(submask); + memcpy(dst + write_pos, &compressed_val, 8); + write_pos += count; + } + } +#else + // Scalar fallback: scan set bits one by one and copy + // the corresponding code byte. + for (size_t ci = 0; ci < chunk_size; ci++) { + size_t chunk_offset = ci * max_batch_size; + const uint8_t* src = codes + chunk_offset + point_idx; + uint8_t* dst = compressed_codes + chunk_offset + num_active; + int write_pos = 0; + uint64_t m = mask; + while (m) { + int bit = __builtin_ctzll(m); + dst[write_pos++] = src[bit]; + m &= m - 1; + } + } +#endif + + num_active += __builtin_popcountll(mask); + } + } else { + num_active = next_num_active; + compressed_codes = const_cast(codes); + } + + return std::make_pair(compressed_codes, num_active); +} + +} // namespace panorama_kernels +} // namespace faiss + +#endif // COMPILE_SIMD_AVX512 diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp new file mode 100644 index 0000000000..ab9f7acb57 --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Scalar implementations of Panorama kernels. +// Compiled only when no SIMD variant (AVX2/AVX-512) is available. 
+ +#if !defined(COMPILE_SIMD_AVX2) && !defined(COMPILE_SIMD_AVX512) + +#include + +#include + +#ifdef __BMI2__ +#include +#endif + +namespace faiss { +namespace panorama_kernels { + +void process_chunks( + size_t chunk_size, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + for (size_t chunk_idx = 0; chunk_idx < chunk_size; chunk_idx++) { + size_t chunk_offset = chunk_idx * max_batch_size; + float* chunk_sim = sim_table + chunk_idx * 256; + for (size_t i = 0; i < num_active; i++) { + exact_distances[i] += + chunk_sim[compressed_codes[chunk_offset + i]]; + } + } +} + +size_t process_filtering( + size_t num_active, + float* exact_distances, + uint32_t* active_indices, + float* cum_sums, + uint8_t* bitset, + size_t batch_offset, + float dis0, + float query_cum_norm, + float epsilon, + float heap_max) { + size_t next_num_active = 0; + for (size_t i = 0; i < num_active; i++) { + float exact_distance = exact_distances[i]; + float cum_sum = cum_sums[active_indices[i] - batch_offset]; + float lower_bound = + exact_distance + dis0 - cum_sum * query_cum_norm * epsilon; + + bool keep = heap_max > lower_bound; + active_indices[next_num_active] = active_indices[i]; + exact_distances[next_num_active] = exact_distance; + bitset[active_indices[i] - batch_offset] = keep; + next_num_active += keep; + } + return next_num_active; +} + +std::pair process_code_compression( + size_t next_num_active, + size_t max_batch_size, + size_t chunk_size, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + uint8_t* compressed_codes = compressed_codes_begin; + size_t num_active = 0; + + // An important optimization is to skip the compression if all points + // are active, as we can just return the original `codes` pointer.
+ if (next_num_active < max_batch_size) { + compressed_codes = compressed_codes_begin; + for (size_t point_idx = 0; point_idx < max_batch_size; + point_idx += 64) { + // Build a 64-bit mask from the byteset: each byte is + // 0 or 1, collect into a single bitmask. + uint64_t mask = 0; +#ifdef __BMI2__ + for (int g = 0; g < 8; g++) { + uint64_t bytes; + memcpy(&bytes, bitset + point_idx + g * 8, 8); + uint8_t bits = (uint8_t)_pext_u64( + bytes, 0x0101010101010101ULL); + mask |= ((uint64_t)bits << (g * 8)); + } +#else + for (int b = 0; b < 64; b++) { + if (bitset[point_idx + b]) + mask |= (1ULL << b); + } +#endif + + // Byte-level stream compaction. +#ifdef __BMI2__ + // PEXT/PDEP path: process 8 bytes at a time. PDEP + // expands the per-byte mask bits into a per-byte lane + // mask, then PEXT extracts only the selected bytes. + for (size_t ci = 0; ci < chunk_size; ci++) { + size_t chunk_offset = ci * max_batch_size; + const uint8_t* src = codes + chunk_offset + point_idx; + uint8_t* dst = compressed_codes + chunk_offset + num_active; + int write_pos = 0; + for (int g = 0; g < 8; g++) { + uint64_t src_val; + memcpy(&src_val, src + g * 8, 8); + uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF); + uint64_t byte_mask = + _pdep_u64(submask, 0x0101010101010101ULL) * + 0xFF; + uint64_t compressed_val = _pext_u64(src_val, byte_mask); + int count = __builtin_popcount(submask); + memcpy(dst + write_pos, &compressed_val, 8); + write_pos += count; + } + } +#else + // Scalar fallback: scan set bits one by one and copy + // the corresponding code byte. 
+ for (size_t ci = 0; ci < chunk_size; ci++) { + size_t chunk_offset = ci * max_batch_size; + const uint8_t* src = codes + chunk_offset + point_idx; + uint8_t* dst = compressed_codes + chunk_offset + num_active; + int write_pos = 0; + uint64_t m = mask; + while (m) { + int bit = __builtin_ctzll(m); + dst[write_pos++] = src[bit]; + m &= m - 1; + } + } +#endif + + num_active += __builtin_popcountll(mask); + } + } else { + num_active = next_num_active; + compressed_codes = const_cast(codes); + } + + return std::make_pair(compressed_codes, num_active); +} + +} // namespace panorama_kernels +} // namespace faiss + +#endif // !COMPILE_SIMD_AVX2 && !COMPILE_SIMD_AVX512 diff --git a/faiss/impl/panorama_kernels/panorama_kernels.h b/faiss/impl/panorama_kernels/panorama_kernels.h new file mode 100644 index 0000000000..6c8d007ddd --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +/** + * @file panorama_kernels.h + * @brief Panorama search kernels with scalar, AVX2 and AVX-512 variants. + * + * The three core kernels of the Panorama progressive filtering search: + * - process_chunks: accumulate PQ distance table lookups over chunks + * - process_filtering: Cauchy-Schwarz lower bound pruning with stream + * compaction + * - process_code_compression: byte-level stream compaction of PQ codes + * + * Implementations live in panorama_kernels-{generic,avx2,avx512}.cpp; the SIMD + * files use gathers, and all use BMI2 PEXT/PDEP when `__BMI2__` is defined. + */ + +#include +#include +#include + +namespace faiss { +namespace panorama_kernels { + +/// Accumulate PQ distance table lookups over chunks. +/// +/// For each chunk, looks up `sim_table[compressed_codes[i]]` and +/// accumulates into `exact_distances[i]` for all active elements.
+/// Iterates chunks first to keep the LUT slice in L1 cache. +/// The AVX2 and AVX-512 versions unroll 4 chunks at a time. +void process_chunks( + size_t chunk_size, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances); + +/// Filter active elements using Cauchy-Schwarz lower bound pruning. +/// +/// Computes a lower bound on the true distance for each active element +/// and removes elements that cannot improve the current heap top. +/// Uses stream compaction to pack surviving elements contiguously. +/// Updates the bitset to reflect which elements were removed. +/// +/// AVX-512 has no scatter at 1-byte granularity, so the bitset update +/// is not vectorized: the generic, AVX2 and AVX-512 files all run the +/// same scalar loop and return the number of surviving elements. +size_t process_filtering( + size_t num_active, + float* exact_distances, + uint32_t* active_indices, + float* cum_sums, + uint8_t* bitset, + size_t batch_offset, + float dis0, + float query_cum_norm, + float epsilon, + float heap_max); + +/// Byte-level stream compaction of PQ codes using the active bitset. +/// +/// An important optimization is to skip the compression if all points +/// are active, as we can just use the original codes pointer. +/// +/// Compress the codes: here we don't need to process remainders +/// as long as `max_batch_size` is a multiple of 64 (which we +/// assert in the constructor). Conveniently, compressed_codes is +/// allocated to `max_batch_size` * `chunk_size` elements. +/// `num_active` is guaranteed to always be less than or equal to +/// `max_batch_size`. Only the last batch may be smaller than +/// `max_batch_size`, the caller ensures that the batch and +/// bitset are padded with zeros.
+std::pair process_code_compression( + size_t next_num_active, + size_t max_batch_size, + size_t chunk_size, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes); + +} // namespace panorama_kernels +} // namespace faiss From a1f7274725de36dcfa187d1b8440a1b0f2b7ebba Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Wed, 18 Mar 2026 04:21:49 +0000 Subject: [PATCH 06/41] Unread but fast --- benchs/bench_ivfpq_panorama.py | 55 +-- faiss/IndexIVF.h | 37 -- faiss/IndexIVFPQ.cpp | 279 -------------- faiss/IndexIVFPQPanorama.cpp | 674 +++++++++++++++------------------ faiss/IndexIVFPQPanorama.h | 98 +++-- 5 files changed, 390 insertions(+), 753 deletions(-) diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py index ebd1336092..7c965d54fe 100644 --- a/benchs/bench_ivfpq_panorama.py +++ b/benchs/bench_ivfpq_panorama.py @@ -16,7 +16,7 @@ def fvecs_read(fname): return a.reshape(-1, d + 1)[:, 1:].copy() -GIST_DIR = "/home/lutex/PCA_init" +GIST_DIR = "/datasets/PCA_init" CACHE_DIR = "/home/lutex/faiss-panorama/index_cache" os.makedirs(CACHE_DIR, exist_ok=True) @@ -74,7 +74,7 @@ def eval_recall(index, nprobe_val): return recall, qps -# faiss.omp_set_num_threads(mp.cpu_count()) +faiss.omp_set_num_threads(mp.cpu_count()) # --- IVFPQ baseline (cached) --- if os.path.exists(IVFPQ_CACHE): @@ -109,41 +109,46 @@ def eval_recall(index, nprobe_val): # --- IVFPQPanorama (reuse trained PQ from cache) --- faiss.omp_set_num_threads(mp.cpu_count()) -if os.path.exists(IVFPQ_TRAINED_CACHE): - print(f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...", flush=True) - trained = faiss.read_index(IVFPQ_TRAINED_CACHE) - quantizer2 = trained.quantizer - trained.own_fields = False - ivfpq_pano = faiss.IndexIVFPQPanorama( +def build_panorama_from_trained(trained_index): + quantizer2 = trained_index.quantizer + trained_index.own_fields = False + + pano = faiss.IndexIVFPQPanorama( quantizer2, d, nlist, M, nbits, n_levels, epsilon, 
batch_size ) - centroids = faiss.vector_to_array(trained.pq.centroids) - faiss.copy_array_to_vector(centroids, ivfpq_pano.pq.centroids) - ivfpq_pano.is_trained = True - ivfpq_pano.use_precomputed_table = 1 - ivfpq_pano.precompute_table() + centroids = faiss.vector_to_array(trained_index.pq.centroids) + faiss.copy_array_to_vector(centroids, pano.pq.centroids) + pano.is_trained = True + pano.use_precomputed_table = 1 + pano.precompute_table() + return pano + +if os.path.exists(IVFPQ_TRAINED_CACHE): + print(f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...", flush=True) + trained = faiss.read_index(IVFPQ_TRAINED_CACHE) + ivfpq_pano = build_panorama_from_trained(trained) print(" Reused trained PQ (skipped training).", flush=True) - t0 = time.time() - ivfpq_pano.add(xb) - print(f" Adding took {time.time() - t0:.1f}s", flush=True) else: print( - f"\nBuilding IVFPQPanorama from scratch: nlist={nlist}, M={M}, nbits={nbits}, " - f"n_levels={n_levels}, epsilon={epsilon}, batch_size={batch_size}", + f"\nTraining IVFPQ for Panorama from scratch: nlist={nlist}, M={M}, nbits={nbits}", flush=True, ) quantizer2 = faiss.IndexFlatL2(d) - ivfpq_pano = faiss.IndexIVFPQPanorama( - quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size - ) + trained = faiss.IndexIVFPQ(quantizer2, d, nlist, M, nbits) t0 = time.time() - ivfpq_pano.train(xt) + trained.train(xt) print(f" Training took {time.time() - t0:.1f}s", flush=True) - t0 = time.time() - ivfpq_pano.add(xb) - print(f" Adding took {time.time() - t0:.1f}s", flush=True) + + print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) + faiss.write_index(trained, IVFPQ_TRAINED_CACHE) + + ivfpq_pano = build_panorama_from_trained(trained) + +t0 = time.time() +ivfpq_pano.add(xb) +print(f" Adding took {time.time() - t0:.1f}s", flush=True) faiss.omp_set_num_threads(1) print("\n====== IVFPQPanorama", flush=True) diff --git a/faiss/IndexIVF.h b/faiss/IndexIVF.h index d66523d245..a02665452f 100644 --- 
a/faiss/IndexIVF.h +++ b/faiss/IndexIVF.h @@ -19,11 +19,9 @@ #include #include #include -#include namespace faiss { -struct IndexIVFPQPanorama; /** Encapsulates a quantizer object for the IndexIVF * @@ -500,15 +498,6 @@ struct InvertedListScanner { /// following codes come from this inverted list virtual void set_list(idx_t list_no, float coarse_dis); - virtual void set_list_panorama( - idx_t list_no, - float coarse_dis, - float* sim_table, - float* dis0_ptr, - bool update) {} - - virtual void set_sim_table(float* sim_table, float dis0_ptr) {} - /// compute a single query-to-code distance virtual float distance_to_code(const uint8_t* code) const = 0; @@ -565,32 +554,6 @@ struct InvertedListScanner { const idx_t* ids, ResultHandler& handler) const; - virtual size_t process_batch( - const ProductQuantizer& pq, - uint8_t* compressed_codes, - size_t cluster_id, - size_t batch_no, - float coarse_dis_i, - size_t curr_batch_size, - size_t max_batch_size, - size_t chunk_size, - float epsilon, - size_t n_levels, - const uint8_t* codes_batch, - float* cums, - float* query_cum_norms, - uint32_t* active_indices, - uint8_t* bitset, - float* exact_distances, - const idx_t* ids, - float* heap_sim, - idx_t* heap_ids, - size_t k, - float* dis0_cache, - float* sim_table_cache) { - return 0; - } - virtual ~InvertedListScanner() {} }; diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp index aca27f903d..270c092740 100644 --- a/faiss/IndexIVFPQ.cpp +++ b/faiss/IndexIVFPQ.cpp @@ -32,7 +32,6 @@ #include #include #include -#include #include namespace faiss { @@ -762,44 +761,6 @@ struct QueryTables { return dis0; } - - float precompute_list_tables_L2_panorama(float* sim_table_ptr) { - float dis0 = 0; - - if (use_precomputed_table == 1) { - dis0 = coarse_dis; - - const size_t n = pq.M * pq.ksub; - const float bf = -2.0f; - const float* b = sim_table_2; - float* c = sim_table_ptr; - - for (size_t idx = 0; idx < n; idx++) { - c[idx] = bf * b[idx]; - } - - sim_table = sim_table_ptr; - } 
else { - FAISS_THROW_MSG( - "Panorama PQ only supports use_precomputed_table == 1"); - } - - return dis0; - } - - float precompute_list_tables_panorama(float* sim_table_ptr) { - float dis0 = 0; - uint64_t t0; - TIC; - if (by_residual) { - if (metric_type == METRIC_INNER_PRODUCT) - dis0 = precompute_list_tables_IP(); - else - dis0 = precompute_list_tables_L2_panorama(sim_table_ptr); - } - init_list_cycles += TOC; - return dis0; - } }; template @@ -831,39 +792,6 @@ struct WrappedSearchResult { } }; -template -struct KnnSearchResultsPanorama { - idx_t key; - const idx_t* ids; - const IDSelector* sel; - - size_t k; - float* heap_sim; - idx_t* heap_ids; - - size_t nup; - - inline bool skip_entry(idx_t j) { - return use_sel && !sel->is_member(ids[j]); - } - - inline bool should_keep(float dis) { - return C::cmp(heap_sim[0], dis); - } - - inline float top() { - return heap_sim[0]; - } - - inline void add(idx_t j, float dis) { - if (C::cmp(heap_sim[0], dis)) { - idx_t id = ids ? ids[j] : lo_build(key, j); - heap_replace_top(k, heap_sim, heap_ids, dis, id); - nup++; - } - } -}; - /***************************************************** * Scaning the codes. * The scanning functions call their favorite precompute_* @@ -894,26 +822,6 @@ struct IVFPQScannerT : QueryTables { } } - void init_list_panorama( - idx_t list_no, - float coarse_dis, - int mode, - float* sim_table, - float* dis0_ptr, - bool update) { - this->key = list_no; - this->coarse_dis = coarse_dis; - - if (mode == 2) { - if (update) { - *dis0_ptr = precompute_list_tables_panorama(sim_table); - } - dis0 = *dis0_ptr; - } else if (mode == 1) { - dis0 = precompute_list_table_pointers(); - } - } - /***************************************************** * Scaning the codes: simple PQ scan. 
*****************************************************/ @@ -1300,193 +1208,6 @@ struct IVFPQScanner : IVFPQScannerT, this->init_list(list_no, coarse_dis, precompute_mode); } - void set_list_panorama( - idx_t list_no, - float coarse_dis, - float* sim_table, - float* dis0_ptr, - bool update) override { - this->list_no = list_no; - this->init_list_panorama( - list_no, - coarse_dis, - precompute_mode, - sim_table, - dis0_ptr, - update); - } - - void set_sim_table(float* sim_table, float dis0) override { - this->sim_table = sim_table; - this->dis0 = dis0; - } - - // Panorama kernels (process_chunks, process_filtering, - // process_code_compression) are implemented in - // faiss/impl/panorama_kernels/ with scalar and AVX-512 variants. - // The linker selects the right one based on the SIMD compile target. - - inline void process_chunks_sparse( - size_t chunk_size, - size_t max_batch_size, - size_t num_active, - float* sim_table, - const uint8_t* codes, - float* exact_distances, - uint32_t* active_indices, - size_t batch_offset, - size_t ksub) { - for (size_t ci = 0; ci < chunk_size; ci++) { - size_t chunk_offset = ci * max_batch_size; - float* chunk_sim_table = sim_table + ci * ksub; - - for (size_t batch_idx = 0; batch_idx < num_active; batch_idx++) { - size_t real_idx = active_indices[batch_idx] - batch_offset; - uint8_t code = codes[chunk_offset + real_idx]; - exact_distances[batch_idx] += chunk_sim_table[code]; - } - } - } - - size_t process_batch( - const ProductQuantizer& pq, - uint8_t* compressed_codes, - size_t cluster_id, - size_t batch_no, - float coarse_dis_i, - size_t curr_batch_size, - size_t max_batch_size, - size_t chunk_size, - float epsilon, - size_t n_levels, - const uint8_t* codes_batch, - float* cums, - float* query_cum_norms, - uint32_t* active_indices, - uint8_t* bitset, - float* exact_distances, - const idx_t* ids, - float* heap_sim, - idx_t* heap_ids, - size_t k, - float* dis0_cache, - float* sim_table_cache) override { - KnnSearchResultsPanorama 
res = { - this->key, - this->store_pairs ? nullptr : ids, - this->sel, - k, - heap_sim, - heap_ids, - 0}; - uint8_t* compressed_codes_begin = compressed_codes; - size_t total_active = 0; - - // The remaining active elements computed at the end of each level. - // We initialize to `curr_batch_size` for continuity. - size_t next_num_active = curr_batch_size; - // For historical reasons, we initialize dis0 only at - // the beginning of the first level, but we need to access it after - // all levels have been processed, so we declare dis0 here. - float dis0 = 0; - // Given that `active_indices` indexes the cluster directly, we need - // to offset it by the batch offset when updating the bitset and - // accessing the cum_sums. This way we avoid yet another layer of - // indirection. - size_t batch_offset = batch_no * max_batch_size; - for (size_t level = 0; (level < n_levels) && (next_num_active > 0); - level++) { - total_active += next_num_active; - - // This ensures the LUT is pointing to the right offset, and is - // properly initialized. We only compute dis0 distances once for - // each cluster, and cache the result. - size_t level_offset_sim_table = level * pq.ksub * chunk_size; - this->set_list_panorama( - cluster_id, - coarse_dis_i, - sim_table_cache + level_offset_sim_table, - dis0_cache, // Only init once for each cluster. - level == 0 && batch_no == 0); - this->set_sim_table( - sim_table_cache + level_offset_sim_table, *dis0_cache); - - dis0 = this->dis0; - - // We multiply by two here so we don't have to do it in the - // kernel. - float query_cum_norm = 2 * query_cum_norms[level + 1]; - - float heap_max = res.top(); - - // Codes has padding potentially, cumsum does not. - float* cum_sums = cums + curr_batch_size * level; - const uint8_t* codes = - codes_batch + max_batch_size * chunk_size * level; - - bool is_sparse = next_num_active < max_batch_size / 16; - float* sim_table = this->sim_table; - - // Phase 1: Process all chunks and accumulate distances. 
- // We iterate over chunks first as this keeps the same LUT slice - // within the L1 cache. To avoid register thrashing, we unroll - // 4 chunks at a time. - size_t num_active_for_filtering = 0; - if (is_sparse) { - process_chunks_sparse( - chunk_size, - max_batch_size, - next_num_active, - sim_table, - codes, - exact_distances, - active_indices, - batch_offset, - pq.ksub); - num_active_for_filtering = next_num_active; - } else { - auto [cc, na] = - panorama_kernels::process_code_compression( - next_num_active, - max_batch_size, - chunk_size, - compressed_codes_begin, - bitset, - codes); - - panorama_kernels::process_chunks( - chunk_size, - max_batch_size, - na, - sim_table, - cc, - exact_distances); - num_active_for_filtering = na; - } - - // Phase 2: Filtering logic using accumulated distances. - next_num_active = panorama_kernels::process_filtering( - num_active_for_filtering, - exact_distances, - active_indices, - cum_sums, - bitset, - batch_offset, - dis0, - query_cum_norm, - epsilon, - heap_max); - } - - // Phase 3: Insert remaining candidates to heap. - for (size_t batch_idx = 0; batch_idx < next_num_active; batch_idx++) { - res.add(active_indices[batch_idx], - dis0 + exact_distances[batch_idx]); - } - - return total_active; - } - float distance_to_code(const uint8_t* code) const override { assert(precompute_mode == 2); float dis = this->dis0 + diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp index aae0811176..8ac3df210a 100644 --- a/faiss/IndexIVFPQPanorama.cpp +++ b/faiss/IndexIVFPQPanorama.cpp @@ -1,29 +1,27 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + #include -#include -#include -#include -#include #include -#include +#include #include -#include -#include +#include #include -#include -#include - -#include -#include -#include #include -#include +#include +#include namespace faiss { -static uint64_t total_active = 0; -static uint64_t total_points = 0; +/***************************************** + * Constructor + ******************************************/ IndexIVFPQPanorama::IndexIVFPQPanorama( Index* quantizer, @@ -45,78 +43,65 @@ IndexIVFPQPanorama::IndexIVFPQPanorama( metric, own_invlists), n_levels(n_levels), - added(false), + epsilon(epsilon), + batch_size(batch_size), chunk_size(code_size / n_levels), levels_size(d / n_levels), - nbits_per_idx(nbits_per_idx), - m_level_width(M / n_levels), - epsilon(epsilon), - batch_size(batch_size) { - FAISS_ASSERT(M % n_levels == 0); - FAISS_ASSERT(batch_size % 64 == 0); - - printf("N levels = %d\n", n_levels); - printf("M = code_size = %zu\n", M); - printf("Nbits per idx = %u (fixed)\n", 8); - printf("Nlist = %zu\n", nlist); - printf("Batch size = %zuB\n", batch_size); - - FAISS_ASSERT(m_level_width > 0); - FAISS_ASSERT(nbits_per_idx == 8); - FAISS_ASSERT(M == code_size); - FAISS_ASSERT(metric == METRIC_L2); + m_level_width(M / n_levels) { + FAISS_THROW_IF_NOT_MSG(M % n_levels == 0, "M must be divisible by n_levels"); + FAISS_THROW_IF_NOT_MSG(batch_size % 64 == 0, "batch_size must be multiple of 64"); + FAISS_THROW_IF_NOT_MSG(nbits_per_idx == 8, "only 8-bit PQ codes supported"); + FAISS_THROW_IF_NOT_MSG(M == code_size, "M must equal code_size for 8-bit PQ"); + FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only L2 metric supported"); } +/***************************************** + * add — transpose codes into column-major layout and precompute norms + ******************************************/ + void IndexIVFPQPanorama::add(idx_t n, const float* x) { - FAISS_ASSERT(!added); + FAISS_THROW_IF_NOT_MSG(!added, "IndexIVFPQPanorama only supports a single add() call"); 
added = true; - num_points = n; + IndexIVFPQ::add(n, x); - size_t new_n = 0; + // Compute column offsets (each list rounded up to batch_size). + size_t total_column_bytes = 0; column_offsets = new size_t[nlist]; for (size_t i = 0; i < nlist; i++) { - column_offsets[i] = new_n; - size_t batch_n = (invlists->list_size(i) + batch_size - 1) / batch_size; - size_t rounded_n = batch_n * batch_size; - new_n += rounded_n * code_size; + column_offsets[i] = total_column_bytes; + size_t n_batches = + (invlists->list_size(i) + batch_size - 1) / batch_size; + total_column_bytes += n_batches * batch_size * code_size; } - column_storage = new uint8_t[code_size * new_n]; - + // Transpose codes from row-major [point0_code, point1_code, ...] into + // column-major within each batch: M columns of batch_size bytes each. + column_storage = new uint8_t[total_column_bytes](); for (size_t list_no = 0; list_no < nlist; list_no++) { size_t col_offset = column_offsets[list_no]; size_t list_size = invlists->list_size(list_no); size_t n_batches = (list_size + batch_size - 1) / batch_size; + const uint8_t* row_codes = invlists->get_codes(list_no); + for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { size_t batch_offset = batch_no * batch_size * code_size; size_t curr_batch_size = std::min(list_size - batch_no * batch_size, batch_size); for (size_t m = 0; m < pq.M; m++) { - size_t m_offset = m * batch_size; - for (size_t point_idx = 0; point_idx < batch_size; - point_idx++) { - uint8_t* dest = column_storage + col_offset + batch_offset + - m_offset + point_idx; - const uint8_t* codes = invlists->get_codes(list_no); - - if (point_idx < curr_batch_size) { - const uint8_t* src = codes + batch_offset + - point_idx * code_size + m; - memcpy(dest, src, 1); - } else { - *dest = 0; - } + for (size_t p = 0; p < curr_batch_size; p++) { + column_storage[col_offset + batch_offset + + m * batch_size + p] = + row_codes[batch_no * batch_size * code_size + + p * code_size + m]; } } } } - cum_sums = 
new float[(n_levels + 1) * n]; + // Precompute cumulative residual norms and initial exact distances. cum_sum_offsets = new size_t[nlist]; - - init_exact_distances = new float[n]; init_exact_distances_offsets = new size_t[nlist]; size_t cum_size = 0; @@ -124,13 +109,14 @@ void IndexIVFPQPanorama::add(idx_t n, const float* x) { for (size_t list_no = 0; list_no < nlist; list_no++) { cum_sum_offsets[list_no] = cum_size; cum_size += invlists->list_size(list_no) * (n_levels + 1); - init_exact_distances_offsets[list_no] = init_size; init_size += invlists->list_size(list_no); } + cum_sums = new float[cum_size]; + init_exact_distances = new float[init_size]; + for (size_t list_no = 0; list_no < nlist; list_no++) { - const idx_t* idx = invlists->get_ids(list_no); size_t list_size = invlists->list_size(list_no); std::vector centroid(d); @@ -141,365 +127,299 @@ void IndexIVFPQPanorama::add(idx_t n, const float* x) { for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { size_t b_offset = batch_no * batch_size; size_t curr_batch_size = - std::min(list_size - batch_no * batch_size, batch_size); + std::min(list_size - b_offset, batch_size); - for (size_t point_idx = 0; point_idx < curr_batch_size; - point_idx++) { - float init_exact_distance = 0.0f; - - std::vector vector(d); + for (size_t p = 0; p < curr_batch_size; p++) { + std::vector vec(d); const uint8_t* code = - invlists->get_single_code(list_no, b_offset + point_idx); - pq.decode(code, vector.data()); - - std::vector suffix_sums(d + 1); - suffix_sums[d] = 0.0f; + invlists->get_single_code(list_no, b_offset + p); + pq.decode(code, vec.data()); + float init_dist = 0.0f; + std::vector suffix(d + 1, 0.0f); for (int j = d - 1; j >= 0; j--) { - init_exact_distance += - vector[j] * vector[j] + 2 * vector[j] * centroid[j]; - float squaredVal = vector[j] * vector[j]; - suffix_sums[j] = suffix_sums[j + 1] + squaredVal; + init_dist += vec[j] * vec[j] + 2 * vec[j] * centroid[j]; + suffix[j] = suffix[j + 1] + vec[j] * vec[j]; 
} for (int level = 0; level < n_levels; level++) { int start_idx = level * levels_size; size_t offset = cum_sum_offsets[list_no] + b_offset * (n_levels + 1) + - level * curr_batch_size + point_idx; - if (start_idx < (int)d) { - cum_sums[offset] = sqrt(suffix_sums[start_idx]); - } else { - cum_sums[offset] = 0.0f; - } + level * curr_batch_size + p; + cum_sums[offset] = start_idx < (int)d + ? std::sqrt(suffix[start_idx]) + : 0.0f; } - size_t offset = cum_sum_offsets[list_no] + + size_t last_offset = cum_sum_offsets[list_no] + b_offset * (n_levels + 1) + - n_levels * curr_batch_size + point_idx; - cum_sums[offset] = 0.0f; + n_levels * curr_batch_size + p; + cum_sums[last_offset] = 0.0f; - size_t init_offset = init_exact_distances_offsets[list_no]; - init_exact_distances[init_offset + b_offset + point_idx] = - init_exact_distance; + init_exact_distances + [init_exact_distances_offsets[list_no] + b_offset + p] = + init_dist; } } } } -void IndexIVFPQPanorama::search( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const SearchParameters* params_in) const { - FAISS_THROW_IF_NOT(k > 0); - const IVFSearchParameters* params = nullptr; - if (params_in) { - params = dynamic_cast(params_in); - FAISS_THROW_IF_NOT_MSG(params, "IndexIVF params have incorrect type"); - } - const size_t nprobe = - std::min(nlist, params ? params->nprobe : this->nprobe); - FAISS_THROW_IF_NOT(nprobe > 0); - - auto sub_search_func = [this, k, nprobe, params]( - idx_t n, - const float* x, - float* distances, - idx_t* labels, - IndexIVFStats* ivf_stats) { - std::unique_ptr idx(new idx_t[n * nprobe]); - std::unique_ptr coarse_dis(new float[n * nprobe]); - - quantizer->search( - n, - x, - nprobe, - coarse_dis.get(), - idx.get(), - params ? 
params->quantizer_params : nullptr); - - invlists->prefetch_lists(idx.get(), n * nprobe); - - search_preassigned( - n, - x, - k, - idx.get(), - coarse_dis.get(), - distances, - labels, - false, - params, - ivf_stats); - }; - - if ((parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT) == 0) { - int nt = std::min(omp_get_max_threads(), int(n)); - std::vector stats(nt); - std::mutex exception_mutex; - std::string exception_string; - -#pragma omp parallel for if (nt > 1) - for (idx_t slice = 0; slice < nt; slice++) { - IndexIVFStats local_stats; - idx_t i0 = n * slice / nt; - idx_t i1 = n * (slice + 1) / nt; - if (i1 > i0) { - try { - sub_search_func( - i1 - i0, - x + i0 * d, - distances + i0 * k, - labels + i0 * k, - &stats[slice]); - } catch (const std::exception& e) { - std::lock_guard lock(exception_mutex); - exception_string = e.what(); - } - } - } - - if (!exception_string.empty()) { - FAISS_THROW_FMT( - "search error: %s", exception_string.c_str()); - } - } else { - sub_search_func(n, x, distances, labels, &indexIVF_stats); +/***************************************** + * Panorama scanner — overrides scan_codes with batch processing + ******************************************/ + +namespace { + +using idx_t = faiss::idx_t; + +template +struct IVFPQScannerPanorama : InvertedListScanner { + const IndexIVFPQPanorama& index; + const ProductQuantizer& pq; + + // Query state + const float* qi = nullptr; + std::vector query_cum_norms; + std::vector sim_table_2; + + // Per-list state + float coarse_dis = 0; + + IVFPQScannerPanorama( + const IndexIVFPQPanorama& index, + bool store_pairs, + const IDSelector* sel) + : InvertedListScanner(store_pairs, sel), + index(index), + pq(index.pq) { + this->keep_max = is_similarity_metric(index.metric_type); + this->code_size = pq.code_size; + query_cum_norms.resize(index.n_levels + 1); + sim_table_2.resize(pq.M * pq.ksub); } -} - -void IndexIVFPQPanorama::search_preassigned( - idx_t n, - const float* x, - idx_t k, - const idx_t* keys, - 
const float* coarse_dis, - float* distances, - idx_t* labels, - bool store_pairs, - const IVFSearchParameters* params, - IndexIVFStats* ivf_stats) const { - FAISS_THROW_IF_NOT(k > 0); - - idx_t nprobe = params ? params->nprobe : this->nprobe; - nprobe = std::min((idx_t)nlist, nprobe); - FAISS_THROW_IF_NOT(nprobe > 0); - - const idx_t unlimited_list_size = std::numeric_limits::max(); - idx_t max_codes = params ? params->max_codes : this->max_codes; - IDSelector* sel = params ? params->sel : nullptr; - const IDSelectorRange* selr = dynamic_cast(sel); - if (selr) { - if (selr->assume_sorted) { - sel = nullptr; - } else { - selr = nullptr; - } - } - - FAISS_THROW_IF_NOT_MSG( - !(sel && store_pairs), - "selector and store_pairs cannot be combined"); - FAISS_THROW_IF_NOT_MSG( - !invlists->use_iterator || (max_codes == 0 && store_pairs == false), - "iterable inverted lists don't support max_codes and store_pairs"); - - size_t nlistv = 0, ndis = 0, nheap = 0; - - using HeapForIP = CMin; - using HeapForL2 = CMax; - - bool interrupt = false; - std::mutex exception_mutex; - std::string exception_string; + void set_query(const float* query) override { + this->qi = query; - int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT; - bool do_heap_init = !(this->parallel_mode & PARALLEL_MODE_NO_HEAP_INIT); + FAISS_ASSERT(index.by_residual); + FAISS_ASSERT(index.use_precomputed_table == 1); - FAISS_THROW_IF_NOT_MSG( - max_codes == 0 || pmode == 0 || pmode == 3, - "max_codes supported only for parallel_mode = 0 or 3"); + pq.compute_inner_prod_table(qi, sim_table_2.data()); - if (max_codes == 0) { - max_codes = unlimited_list_size; + // Compute query suffix sums → cum norms per level. + std::vector suffix(index.d + 1, 0.0f); + for (int j = index.d - 1; j >= 0; j--) { + suffix[j] = suffix[j + 1] + qi[j] * qi[j]; + } + for (int level = 0; level < index.n_levels; level++) { + int start = level * index.levels_size; + query_cum_norms[level] = + start < (int)index.d ? 
std::sqrt(suffix[start]) : 0.0f; + } + query_cum_norms[index.n_levels] = 0.0f; } - [[maybe_unused]] bool do_parallel = omp_get_max_threads() >= 2 && - (pmode == 0 ? false - : pmode == 3 ? n > 1 - : pmode == 1 ? nprobe > 1 - : nprobe * n > 1); - - void* inverted_list_context = - params ? params->inverted_list_context : nullptr; - - const size_t sim_table_size = pq.ksub * pq.M; - std::vector sim_table_cache(nprobe * sim_table_size); - std::vector dis0s_cache(nprobe); - - std::vector suffixSums(d + 1); - std::vector query_cum_norms(n_levels + 1); - std::vector query(d); - std::vector exact_distances(batch_size); - std::vector bitset(batch_size); - std::vector active_indices(batch_size); - std::vector compressed_codes(batch_size * chunk_size); - -#pragma omp parallel if (do_parallel) reduction(+ : nlistv, ndis, nheap) - { - std::unique_ptr scanner( - get_InvertedListScanner(store_pairs, sel, params)); - - auto init_result = [&](float* simi, idx_t* idxi) { - if (!do_heap_init) - return; - if (metric_type == METRIC_INNER_PRODUCT) { - heap_heapify(k, simi, idxi); - } else { - heap_heapify(k, simi, idxi); - } - }; - - auto reorder_result = [&](float* simi, idx_t* idxi) { - if (!do_heap_init) - return; - if (metric_type == METRIC_INNER_PRODUCT) { - heap_reorder(k, simi, idxi); - } else { - heap_reorder(k, simi, idxi); - } - }; - - FAISS_ASSERT(pmode == 0); - if (pmode == 0) { -#pragma omp for - for (idx_t i = 0; i < n; i++) { - if (interrupt) { - continue; - } + void set_list(idx_t list_no, float coarse_dis) override { + this->list_no = list_no; + this->coarse_dis = coarse_dis; + } - scanner->set_query(x + i * d); - suffixSums[d] = 0.0f; + float distance_to_code(const uint8_t* code) const override { + FAISS_THROW_MSG( + "IndexIVFPQPanorama does not support distance_to_code"); + } - const float* q = x + i * d; + size_t scan_codes( + size_t list_size, + const uint8_t* /* codes (row-major, unused) */, + const idx_t* ids, + float* distances, + idx_t* labels, + size_t k) const 
override { + size_t nup = 0; + + const size_t bs = index.batch_size; + const size_t cs = index.chunk_size; + const int n_levels = index.n_levels; + const float epsilon = index.epsilon; + + const size_t n_batches = (list_size + bs - 1) / bs; + const size_t sim_table_size = pq.ksub * pq.M; + + // Panorama column-major codes for this list. + const uint8_t* col_codes = + index.column_storage + index.column_offsets[list_no]; + const float* list_cum_sums = + index.cum_sums + index.cum_sum_offsets[list_no]; + const float* list_init_dists = + index.init_exact_distances + + index.init_exact_distances_offsets[list_no]; + + // Scratch buffers. + std::vector exact_distances(bs); + std::vector bitset(bs); + std::vector active_indices(bs); + std::vector compressed_codes(bs * cs); + std::vector sim_table_cache(sim_table_size); + float dis0_cache = 0; - for (int j = d - 1; j >= 0; --j) { - float squaredVal = q[j] * q[j]; - suffixSums[j] = suffixSums[j + 1] + squaredVal; - } + for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { + size_t curr_batch_size = + std::min(list_size - batch_no * bs, bs); + size_t b_offset = batch_no * bs; + + // Initialize active set. 
+ std::iota( + active_indices.begin(), + active_indices.begin() + curr_batch_size, + b_offset); + std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1); + std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0); + + for (size_t idx = 0; idx < curr_batch_size; idx++) { + exact_distances[idx] = list_init_dists[b_offset + idx]; + } - for (int level_idx = 0; level_idx < n_levels; level_idx++) { - int startIdx = level_idx * levels_size; - if (startIdx < (int)d) { - query_cum_norms[level_idx] = sqrt(suffixSums[startIdx]); - } else { - query_cum_norms[level_idx] = 0.0f; + const uint8_t* batch_codes = col_codes + b_offset * code_size; + const float* batch_cums = + list_cum_sums + b_offset * (n_levels + 1); + + size_t next_num_active = curr_batch_size; + float dis0 = 0; + size_t batch_offset = batch_no * bs; + + for (int level = 0; + level < n_levels && next_num_active > 0; + level++) { + // Compute sim table for this level (cached across batches + // within same list, only for first batch). + size_t level_sim_offset = level * pq.ksub * cs; + + if (level == 0 && batch_no == 0) { + // Precompute LUT: sim_table = -2 * sim_table_2 + // (the precomputed_table term is added via dis0). 
+ dis0_cache = coarse_dis; + const size_t n = pq.M * pq.ksub; + for (size_t i = 0; i < n; i++) { + sim_table_cache[i] = -2.0f * sim_table_2[i]; } } - query_cum_norms[n_levels] = 0.0f; - - float* simi = distances + i * k; - idx_t* idxi = labels + i * k; - - init_result(simi, idxi); - - idx_t nscan = 0; - - for (size_t list_no = 0; list_no < (size_t)nprobe; list_no++) { - idx_t cluster_id = keys[i * nprobe + list_no]; - size_t list_size = invlists->list_size(cluster_id); - size_t n_batches = - (list_size + batch_size - 1) / batch_size; - - std::unique_ptr sids; - const idx_t* ids = - std::make_unique( - invlists, cluster_id) - ->get(); - - for (size_t batch_no = 0; batch_no < n_batches; - batch_no++) { - size_t curr_batch_size = std::min( - list_size - batch_no * batch_size, batch_size); - size_t b_offset = batch_no * batch_size; - - std::iota( - active_indices.begin(), - active_indices.begin() + curr_batch_size, - b_offset); - std::fill( - bitset.begin(), - bitset.begin() + curr_batch_size, - 1); - std::fill( - bitset.begin() + curr_batch_size, - bitset.end(), - 0); - - for (size_t idx = 0; idx < curr_batch_size; idx++) { - exact_distances[idx] = init_exact_distances - [init_exact_distances_offsets[cluster_id] + - b_offset + idx]; + dis0 = dis0_cache; + + float query_cum_norm = + 2 * query_cum_norms[level + 1]; + float heap_max = distances[0]; + + const float* cum_sums_level = + batch_cums + curr_batch_size * level; + const uint8_t* codes_level = + batch_codes + bs * cs * level; + + float* sim_table_level = + sim_table_cache.data() + level_sim_offset; + + bool is_sparse = next_num_active < bs / 16; + + size_t num_active_for_filtering = 0; + if (is_sparse) { + // Sparse path: use active_indices for indirection. 
+ for (size_t ci = 0; ci < cs; ci++) { + size_t chunk_off = ci * bs; + float* chunk_sim = sim_table_level + ci * pq.ksub; + for (size_t i = 0; i < next_num_active; i++) { + size_t real_idx = + active_indices[i] - batch_offset; + exact_distances[i] += + chunk_sim[codes_level[chunk_off + real_idx]]; } - - const uint8_t* codes = column_storage + - column_offsets[cluster_id] + - b_offset * code_size; - float* cums = cum_sums + cum_sum_offsets[cluster_id] + - b_offset * (n_levels + 1); - - total_points += curr_batch_size * n_levels; - - total_active += scanner->process_batch( - pq, - compressed_codes.data(), - cluster_id, - batch_no, - coarse_dis[i * nprobe + list_no], - curr_batch_size, - batch_size, - chunk_size, - epsilon, - n_levels, - codes, - cums, - query_cum_norms.data(), - active_indices.data(), - bitset.data(), - exact_distances.data(), - ids, - simi, - idxi, - k, - &dis0s_cache[list_no], - sim_table_cache.data() + - list_no * sim_table_size); } + num_active_for_filtering = next_num_active; + } else { + auto [cc, na] = + panorama_kernels::process_code_compression( + next_num_active, + bs, + cs, + compressed_codes.data(), + bitset.data(), + codes_level); + + panorama_kernels::process_chunks( + cs, bs, na, sim_table_level, cc, + exact_distances.data()); + num_active_for_filtering = na; } - reorder_result(simi, idxi); + next_num_active = panorama_kernels::process_filtering( + num_active_for_filtering, + exact_distances.data(), + active_indices.data(), + const_cast(cum_sums_level), + bitset.data(), + batch_offset, + dis0, + query_cum_norm, + epsilon, + heap_max); + } - if (InterruptCallback::is_interrupted()) { - interrupt = true; + // Insert surviving candidates into heap. + for (size_t i = 0; i < next_num_active; i++) { + float dis = dis0 + exact_distances[i]; + if (C::cmp(distances[0], dis)) { + idx_t id = store_pairs + ? 
lo_build(list_no, active_indices[i]) + : ids[active_indices[i]]; + heap_replace_top(k, distances, labels, dis, id); + nup++; } } } + + return nup; } - if (interrupt) { - if (!exception_string.empty()) { - FAISS_THROW_FMT( - "search interrupted with: %s", exception_string.c_str()); - } else { - FAISS_THROW_MSG("computation interrupted"); - } + size_t scan_codes( + size_t n, + const uint8_t* codes, + const idx_t* ids, + ResultHandler& handler) const override { + FAISS_THROW_MSG( + "IndexIVFPQPanorama: ResultHandler scan_codes not supported"); } +}; - printf("vv: total_active: %f\n", (float)total_active / total_points); +} // anonymous namespace + +/***************************************** + * get_InvertedListScanner + ******************************************/ + +InvertedListScanner* IndexIVFPQPanorama::get_InvertedListScanner( + bool store_pairs, + const IDSelector* sel, + const IVFSearchParameters*) const { + FAISS_THROW_IF_NOT_MSG( + metric_type == METRIC_L2, "only L2 metric supported"); + FAISS_THROW_IF_NOT_MSG( + use_precomputed_table == 1, + "Panorama PQ requires use_precomputed_table == 1"); + FAISS_THROW_IF_NOT_MSG( + pq.nbits == 8, "only 8-bit PQ codes supported"); + FAISS_THROW_IF_NOT_MSG( + by_residual, "Panorama PQ requires by_residual"); + FAISS_THROW_IF_NOT_MSG( + polysemous_ht == 0, + "Panorama PQ does not support polysemous"); + + if (sel) { + return new IVFPQScannerPanorama, true>( + *this, store_pairs, sel); + } else { + return new IVFPQScannerPanorama, false>( + *this, store_pairs, sel); + } } } // namespace faiss diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h index 46a19e6b09..9fa3d34e7a 100644 --- a/faiss/IndexIVFPQPanorama.h +++ b/faiss/IndexIVFPQPanorama.h @@ -1,34 +1,75 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + #ifndef FAISS_INDEX_IVFPQ_PANORAMA_H #define FAISS_INDEX_IVFPQ_PANORAMA_H #include #include -#include -#include namespace faiss { +/// Panorama adaptation of IndexIVFPQ following +/// https://www.arxiv.org/pdf/2510.00566. +/// +/// IDEA: +/// Panorama adapts the storage layout within each cluster and uses +/// Cauchy-Schwarz pruning to skip unnecessary distance computations. +/// Combined with orthogonal transforms upstream that concentrate signal +/// energy in the early PQ subquantizers (like PCA), Panorama can prune +/// the majority of candidates after computing only a fraction of the +/// full PQ distance. +/// +/// STORAGE LAYOUT: +/// Standard IVFPQ stores codes row-major: [point0_code, point1_code, ...]. +/// Panorama transposes codes into column-major within each batch: +/// for each batch of `batch_size` points, codes are stored as +/// M columns of `batch_size` bytes each. The M columns are grouped +/// into `n_levels` levels of `chunk_size` columns, enabling incremental +/// distance computation level-by-level. +/// +/// OVERHEAD: +/// Panorama precomputes per-point cumulative residual norms and initial +/// exact distances at insertion time. Storage overhead is +/// (n_levels + 1) floats per point for cum_sums, plus 1 float per +/// point for init_exact_distances. +/// +/// CONSTRAINTS: +/// - Only L2 metric is supported. +/// - Only 8-bit PQ codes (nbits_per_idx == 8). +/// - M must be divisible by n_levels. +/// - batch_size must be a multiple of 64. +/// - use_precomputed_table must be 1. +/// +/// NOTE: +/// We inherit from IndexIVFPQ and override only get_InvertedListScanner() +/// and add(). The base IndexIVF::search_preassigned() handles all search +/// orchestration — no search code is duplicated. 
struct IndexIVFPQPanorama : public IndexIVFPQ { - const int n_levels; - uint8_t* column_storage; + int n_levels; + float epsilon; + size_t batch_size; - size_t* column_offsets; - float* cum_sums; - size_t* cum_sum_offsets; + size_t chunk_size; + size_t levels_size; + size_t m_level_width; - float* init_exact_distances; - size_t* init_exact_distances_offsets; + bool added = false; + size_t num_points = 0; - const size_t chunk_size; - const size_t levels_size; - bool added; - size_t num_points; - size_t batch_size; - size_t nbits_per_idx; - size_t m_level_width; + uint8_t* column_storage = nullptr; + size_t* column_offsets = nullptr; - float epsilon; + float* cum_sums = nullptr; + size_t* cum_sum_offsets = nullptr; + + float* init_exact_distances = nullptr; + size_t* init_exact_distances_offsets = nullptr; IndexIVFPQPanorama( Index* quantizer, @@ -42,27 +83,14 @@ struct IndexIVFPQPanorama : public IndexIVFPQ { MetricType metric = METRIC_L2, bool own_invlists = true); + IndexIVFPQPanorama() = default; + void add(idx_t n, const float* x) override; - void search( - idx_t n, - const float* x, - idx_t k, - float* distances, - idx_t* labels, - const SearchParameters* params_in) const; - - void search_preassigned( - idx_t n, - const float* x, - idx_t k, - const idx_t* keys, - const float* coarse_dis, - float* distances, - idx_t* labels, + InvertedListScanner* get_InvertedListScanner( bool store_pairs, - const IVFSearchParameters* params, - IndexIVFStats* ivf_stats) const override; + const IDSelector* sel, + const IVFSearchParameters* params) const override; }; } // namespace faiss From 3f51144f5a071c3c647f3dee49c00f8e5e1ba58c Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Wed, 18 Mar 2026 05:43:54 +0000 Subject: [PATCH 07/41] Bench fixed --- benchs/bench_ivfpq_panorama.py | 87 +++++++++++++++++++--------------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py index 7c965d54fe..fcb40b466e 
100644 --- a/benchs/bench_ivfpq_panorama.py +++ b/benchs/bench_ivfpq_panorama.py @@ -22,6 +22,7 @@ def fvecs_read(fname): IVFPQ_CACHE = os.path.join(CACHE_DIR, "ivfpq_10pct.index") IVFPQ_TRAINED_CACHE = os.path.join(CACHE_DIR, "ivfpq_trained_10pct.index") +IVFPQ_PANO_CACHE = os.path.join(CACHE_DIR, "ivfpq_pano_10pct.index") print("Loading GIST1M data (10% subset)...", flush=True) xb_full = fvecs_read(os.path.join(GIST_DIR, "gist1m_base.fvecs")) @@ -106,49 +107,59 @@ def eval_recall(index, nprobe_val): ivfpq.nprobe = nprobe eval_recall(ivfpq, nprobe) -# --- IVFPQPanorama (reuse trained PQ from cache) --- +# --- IVFPQPanorama (cached separately) --- faiss.omp_set_num_threads(mp.cpu_count()) - -def build_panorama_from_trained(trained_index): - quantizer2 = trained_index.quantizer - trained_index.own_fields = False - - pano = faiss.IndexIVFPQPanorama( - quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size - ) - centroids = faiss.vector_to_array(trained_index.pq.centroids) - faiss.copy_array_to_vector(centroids, pano.pq.centroids) - pano.is_trained = True - pano.use_precomputed_table = 1 - pano.precompute_table() - return pano - - -if os.path.exists(IVFPQ_TRAINED_CACHE): - print(f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...", flush=True) - trained = faiss.read_index(IVFPQ_TRAINED_CACHE) - ivfpq_pano = build_panorama_from_trained(trained) - print(" Reused trained PQ (skipped training).", flush=True) -else: - print( - f"\nTraining IVFPQ for Panorama from scratch: nlist={nlist}, M={M}, nbits={nbits}", - flush=True, - ) - quantizer2 = faiss.IndexFlatL2(d) - trained = faiss.IndexIVFPQ(quantizer2, d, nlist, M, nbits) +if os.path.exists(IVFPQ_PANO_CACHE): + print(f"\nLoading cached IVFPQPanorama from {IVFPQ_PANO_CACHE}...", flush=True) t0 = time.time() - trained.train(xt) - print(f" Training took {time.time() - t0:.1f}s", flush=True) - - print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) - faiss.write_index(trained, 
IVFPQ_TRAINED_CACHE) + ivfpq_pano = faiss.read_index(IVFPQ_PANO_CACHE) + print(f" Loaded in {time.time() - t0:.1f}s", flush=True) +else: + def build_panorama_from_trained(trained_index): + quantizer2 = trained_index.quantizer + trained_index.own_fields = False + + pano = faiss.IndexIVFPQPanorama( + quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size + ) + centroids = faiss.vector_to_array(trained_index.pq.centroids) + faiss.copy_array_to_vector(centroids, pano.pq.centroids) + pano.is_trained = True + pano.use_precomputed_table = 1 + pano.precompute_table() + return pano + + if os.path.exists(IVFPQ_TRAINED_CACHE): + print( + f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...", + flush=True, + ) + trained = faiss.read_index(IVFPQ_TRAINED_CACHE) + ivfpq_pano = build_panorama_from_trained(trained) + print(" Reused trained PQ (skipped training).", flush=True) + else: + print( + f"\nTraining IVFPQ for Panorama from scratch: nlist={nlist}, M={M}, nbits={nbits}", + flush=True, + ) + quantizer2 = faiss.IndexFlatL2(d) + trained = faiss.IndexIVFPQ(quantizer2, d, nlist, M, nbits) + t0 = time.time() + trained.train(xt) + print(f" Training took {time.time() - t0:.1f}s", flush=True) + + print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) + faiss.write_index(trained, IVFPQ_TRAINED_CACHE) + + ivfpq_pano = build_panorama_from_trained(trained) - ivfpq_pano = build_panorama_from_trained(trained) + t0 = time.time() + ivfpq_pano.add(xb) + print(f" Adding took {time.time() - t0:.1f}s", flush=True) -t0 = time.time() -ivfpq_pano.add(xb) -print(f" Adding took {time.time() - t0:.1f}s", flush=True) + print(f" Saving IVFPQPanorama to {IVFPQ_PANO_CACHE}...", flush=True) + faiss.write_index(ivfpq_pano, IVFPQ_PANO_CACHE) faiss.omp_set_num_threads(1) print("\n====== IVFPQPanorama", flush=True) From bb46ee3edbbdb86a65810827211c58eff4a52b02 Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Wed, 18 Mar 2026 05:49:53 +0000 Subject: [PATCH 08/41] 
Remove epsilon --- benchs/bench_ivfpq_panorama.py | 55 +++++++++---------- faiss/IndexIVFPQPanorama.cpp | 4 -- faiss/IndexIVFPQPanorama.h | 2 - .../panorama_kernels-avx2.cpp | 3 +- .../panorama_kernels-avx512.cpp | 3 +- .../panorama_kernels-generic.cpp | 3 +- .../impl/panorama_kernels/panorama_kernels.h | 1 - 7 files changed, 30 insertions(+), 41 deletions(-) diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py index fcb40b466e..615ffe01b5 100644 --- a/benchs/bench_ivfpq_panorama.py +++ b/benchs/bench_ivfpq_panorama.py @@ -43,7 +43,6 @@ def fvecs_read(fname): nbits = 8 nlist = 64 n_levels = 8 -epsilon = 1.0 batch_size = 128 GT_PATH = os.path.join(CACHE_DIR, "gt_10pct.npy") @@ -75,37 +74,37 @@ def eval_recall(index, nprobe_val): return recall, qps -faiss.omp_set_num_threads(mp.cpu_count()) +# faiss.omp_set_num_threads(mp.cpu_count()) -# --- IVFPQ baseline (cached) --- -if os.path.exists(IVFPQ_CACHE): - print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True) - t0 = time.time() - ivfpq = faiss.read_index(IVFPQ_CACHE) - print(f" Loaded in {time.time() - t0:.1f}s", flush=True) -else: - print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True) - quantizer = faiss.IndexFlatL2(d) - ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) - t0 = time.time() - ivfpq.train(xt) - print(f" Training took {time.time() - t0:.1f}s", flush=True) +# # --- IVFPQ baseline (cached) --- +# if os.path.exists(IVFPQ_CACHE): +# print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True) +# t0 = time.time() +# ivfpq = faiss.read_index(IVFPQ_CACHE) +# print(f" Loaded in {time.time() - t0:.1f}s", flush=True) +# else: +# print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True) +# quantizer = faiss.IndexFlatL2(d) +# ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) +# t0 = time.time() +# ivfpq.train(xt) +# print(f" Training took {time.time() - t0:.1f}s", flush=True) - print(f" Saving trained state to 
{IVFPQ_TRAINED_CACHE}...", flush=True) - faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE) +# print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) +# faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE) - t0 = time.time() - ivfpq.add(xb) - print(f" Adding took {time.time() - t0:.1f}s", flush=True) +# t0 = time.time() +# ivfpq.add(xb) +# print(f" Adding took {time.time() - t0:.1f}s", flush=True) - print(f" Saving full index to {IVFPQ_CACHE}...", flush=True) - faiss.write_index(ivfpq, IVFPQ_CACHE) +# print(f" Saving full index to {IVFPQ_CACHE}...", flush=True) +# faiss.write_index(ivfpq, IVFPQ_CACHE) -faiss.omp_set_num_threads(1) -print("\n====== IVFPQ baseline", flush=True) -for nprobe in [1, 2, 4, 8, 16]: - ivfpq.nprobe = nprobe - eval_recall(ivfpq, nprobe) +# faiss.omp_set_num_threads(1) +# print("\n====== IVFPQ baseline", flush=True) +# for nprobe in [1, 2, 4, 8, 16]: +# ivfpq.nprobe = nprobe +# eval_recall(ivfpq, nprobe) # --- IVFPQPanorama (cached separately) --- faiss.omp_set_num_threads(mp.cpu_count()) @@ -121,7 +120,7 @@ def build_panorama_from_trained(trained_index): trained_index.own_fields = False pano = faiss.IndexIVFPQPanorama( - quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size + quantizer2, d, nlist, M, nbits, n_levels, batch_size ) centroids = faiss.vector_to_array(trained_index.pq.centroids) faiss.copy_array_to_vector(centroids, pano.pq.centroids) diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp index 8ac3df210a..3104820e19 100644 --- a/faiss/IndexIVFPQPanorama.cpp +++ b/faiss/IndexIVFPQPanorama.cpp @@ -30,7 +30,6 @@ IndexIVFPQPanorama::IndexIVFPQPanorama( size_t M, size_t nbits_per_idx, int n_levels, - float epsilon, size_t batch_size, MetricType metric, bool own_invlists) @@ -43,7 +42,6 @@ IndexIVFPQPanorama::IndexIVFPQPanorama( metric, own_invlists), n_levels(n_levels), - epsilon(epsilon), batch_size(batch_size), chunk_size(code_size / n_levels), levels_size(d / n_levels), @@ -242,7 +240,6 @@ struct 
IVFPQScannerPanorama : InvertedListScanner { const size_t bs = index.batch_size; const size_t cs = index.chunk_size; const int n_levels = index.n_levels; - const float epsilon = index.epsilon; const size_t n_batches = (list_size + bs - 1) / bs; const size_t sim_table_size = pq.ksub * pq.M; @@ -360,7 +357,6 @@ struct IVFPQScannerPanorama : InvertedListScanner { batch_offset, dis0, query_cum_norm, - epsilon, heap_max); } diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h index 9fa3d34e7a..aaee470e25 100644 --- a/faiss/IndexIVFPQPanorama.h +++ b/faiss/IndexIVFPQPanorama.h @@ -52,7 +52,6 @@ namespace faiss { /// orchestration — no search code is duplicated. struct IndexIVFPQPanorama : public IndexIVFPQ { int n_levels; - float epsilon; size_t batch_size; size_t chunk_size; @@ -78,7 +77,6 @@ struct IndexIVFPQPanorama : public IndexIVFPQ { size_t M, size_t nbits_per_idx, int n_levels, - float epsilon, size_t batch_size = 128, MetricType metric = METRIC_L2, bool own_invlists = true); diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp index 235c5d4d78..46728b1cdd 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp @@ -122,14 +122,13 @@ size_t process_filtering( size_t batch_offset, float dis0, float query_cum_norm, - float epsilon, float heap_max) { size_t next_num_active = 0; for (size_t i = 0; i < num_active; i++) { float exact_distance = exact_distances[i]; float cum_sum = cum_sums[active_indices[i] - batch_offset]; float lower_bound = - exact_distance + dis0 - cum_sum * query_cum_norm * epsilon; + exact_distance + dis0 - cum_sum * query_cum_norm; bool keep = heap_max > lower_bound; active_indices[next_num_active] = active_indices[i]; diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp index 6c6f0f24db..7733d5a6da 100644 --- 
a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -119,14 +119,13 @@ size_t process_filtering( size_t batch_offset, float dis0, float query_cum_norm, - float epsilon, float heap_max) { size_t next_num_active = 0; for (size_t i = 0; i < num_active; i++) { float exact_distance = exact_distances[i]; float cum_sum = cum_sums[active_indices[i] - batch_offset]; float lower_bound = - exact_distance + dis0 - cum_sum * query_cum_norm * epsilon; + exact_distance + dis0 - cum_sum * query_cum_norm; bool keep = heap_max > lower_bound; active_indices[next_num_active] = active_indices[i]; diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp index ab9f7acb57..cfd1283c80 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp @@ -47,14 +47,13 @@ size_t process_filtering( size_t batch_offset, float dis0, float query_cum_norm, - float epsilon, float heap_max) { size_t next_num_active = 0; for (size_t i = 0; i < num_active; i++) { float exact_distance = exact_distances[i]; float cum_sum = cum_sums[active_indices[i] - batch_offset]; float lower_bound = - exact_distance + dis0 - cum_sum * query_cum_norm * epsilon; + exact_distance + dis0 - cum_sum * query_cum_norm; bool keep = heap_max > lower_bound; active_indices[next_num_active] = active_indices[i]; diff --git a/faiss/impl/panorama_kernels/panorama_kernels.h b/faiss/impl/panorama_kernels/panorama_kernels.h index 6c8d007ddd..aed8a87660 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels.h +++ b/faiss/impl/panorama_kernels/panorama_kernels.h @@ -61,7 +61,6 @@ size_t process_filtering( size_t batch_offset, float dis0, float query_cum_norm, - float epsilon, float heap_max); /// Byte-level stream compaction of PQ codes using the active bitset. 
From f21aac14df599b1eb983c66e604461dc92636c49 Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Wed, 18 Mar 2026 06:35:40 +0000 Subject: [PATCH 09/41] Fix the LUT --- faiss/IndexIVFPQPanorama.cpp | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp index 3104820e19..01f5b0a7c5 100644 --- a/faiss/IndexIVFPQPanorama.cpp +++ b/faiss/IndexIVFPQPanorama.cpp @@ -205,6 +205,13 @@ struct IVFPQScannerPanorama : InvertedListScanner { pq.compute_inner_prod_table(qi, sim_table_2.data()); + // The PQ distance LUT is -2 * inner_prod_table; apply in-place + // so scan_codes() can use sim_table_2 directly. + const size_t n = pq.M * pq.ksub; + for (size_t i = 0; i < n; i++) { + sim_table_2[i] *= -2.0f; + } + // Compute query suffix sums → cum norms per level. std::vector suffix(index.d + 1, 0.0f); for (int j = index.d - 1; j >= 0; j--) { @@ -242,8 +249,6 @@ struct IVFPQScannerPanorama : InvertedListScanner { const int n_levels = index.n_levels; const size_t n_batches = (list_size + bs - 1) / bs; - const size_t sim_table_size = pq.ksub * pq.M; - // Panorama column-major codes for this list. 
const uint8_t* col_codes = index.column_storage + index.column_offsets[list_no]; @@ -258,8 +263,7 @@ struct IVFPQScannerPanorama : InvertedListScanner { std::vector bitset(bs); std::vector active_indices(bs); std::vector compressed_codes(bs * cs); - std::vector sim_table_cache(sim_table_size); - float dis0_cache = 0; + float dis0 = coarse_dis; for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { size_t curr_batch_size = @@ -283,27 +287,13 @@ struct IVFPQScannerPanorama : InvertedListScanner { list_cum_sums + b_offset * (n_levels + 1); size_t next_num_active = curr_batch_size; - float dis0 = 0; size_t batch_offset = batch_no * bs; for (int level = 0; level < n_levels && next_num_active > 0; level++) { - // Compute sim table for this level (cached across batches - // within same list, only for first batch). size_t level_sim_offset = level * pq.ksub * cs; - if (level == 0 && batch_no == 0) { - // Precompute LUT: sim_table = -2 * sim_table_2 - // (the precomputed_table term is added via dis0). 
- dis0_cache = coarse_dis; - const size_t n = pq.M * pq.ksub; - for (size_t i = 0; i < n; i++) { - sim_table_cache[i] = -2.0f * sim_table_2[i]; - } - } - dis0 = dis0_cache; - float query_cum_norm = 2 * query_cum_norms[level + 1]; float heap_max = distances[0]; @@ -314,7 +304,8 @@ struct IVFPQScannerPanorama : InvertedListScanner { batch_codes + bs * cs * level; float* sim_table_level = - sim_table_cache.data() + level_sim_offset; + const_cast(sim_table_2.data()) + + level_sim_offset; bool is_sparse = next_num_active < bs / 16; From 0a6966d2b2aba3de074911e298e2742ed9c841bf Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Wed, 18 Mar 2026 06:49:19 +0000 Subject: [PATCH 10/41] Use precomputed table and remove init exact dist --- faiss/IndexIVFPQPanorama.cpp | 123 +++++++++++++++++------------------ faiss/IndexIVFPQPanorama.h | 11 ++-- 2 files changed, 63 insertions(+), 71 deletions(-) diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp index 01f5b0a7c5..f9c6b6e7dd 100644 --- a/faiss/IndexIVFPQPanorama.cpp +++ b/faiss/IndexIVFPQPanorama.cpp @@ -46,10 +46,13 @@ IndexIVFPQPanorama::IndexIVFPQPanorama( chunk_size(code_size / n_levels), levels_size(d / n_levels), m_level_width(M / n_levels) { - FAISS_THROW_IF_NOT_MSG(M % n_levels == 0, "M must be divisible by n_levels"); - FAISS_THROW_IF_NOT_MSG(batch_size % 64 == 0, "batch_size must be multiple of 64"); + FAISS_THROW_IF_NOT_MSG( + M % n_levels == 0, "M must be divisible by n_levels"); + FAISS_THROW_IF_NOT_MSG( + batch_size % 64 == 0, "batch_size must be multiple of 64"); FAISS_THROW_IF_NOT_MSG(nbits_per_idx == 8, "only 8-bit PQ codes supported"); - FAISS_THROW_IF_NOT_MSG(M == code_size, "M must equal code_size for 8-bit PQ"); + FAISS_THROW_IF_NOT_MSG( + M == code_size, "M must equal code_size for 8-bit PQ"); FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only L2 metric supported"); } @@ -58,7 +61,8 @@ IndexIVFPQPanorama::IndexIVFPQPanorama( ******************************************/ void 
IndexIVFPQPanorama::add(idx_t n, const float* x) { - FAISS_THROW_IF_NOT_MSG(!added, "IndexIVFPQPanorama only supports a single add() call"); + FAISS_THROW_IF_NOT_MSG( + !added, "IndexIVFPQPanorama only supports a single add() call"); added = true; num_points = n; @@ -89,43 +93,39 @@ void IndexIVFPQPanorama::add(idx_t n, const float* x) { std::min(list_size - batch_no * batch_size, batch_size); for (size_t m = 0; m < pq.M; m++) { for (size_t p = 0; p < curr_batch_size; p++) { - column_storage[col_offset + batch_offset + - m * batch_size + p] = - row_codes[batch_no * batch_size * code_size + - p * code_size + m]; + column_storage + [col_offset + batch_offset + m * batch_size + p] = + row_codes + [batch_no * batch_size * code_size + + p * code_size + m]; } } } } - // Precompute cumulative residual norms and initial exact distances. + // Precompute cumulative residual norms (suffix sums of ||y_R||^2). + // init_exact_distances are computed on-the-fly during search using + // the precomputed_table, so we only need cum_sums here. 
cum_sum_offsets = new size_t[nlist]; - init_exact_distances_offsets = new size_t[nlist]; size_t cum_size = 0; - size_t init_size = 0; for (size_t list_no = 0; list_no < nlist; list_no++) { cum_sum_offsets[list_no] = cum_size; cum_size += invlists->list_size(list_no) * (n_levels + 1); - init_exact_distances_offsets[list_no] = init_size; - init_size += invlists->list_size(list_no); } cum_sums = new float[cum_size]; - init_exact_distances = new float[init_size]; for (size_t list_no = 0; list_no < nlist; list_no++) { size_t list_size = invlists->list_size(list_no); - - std::vector centroid(d); - quantizer->reconstruct(list_no, centroid.data()); + if (list_size == 0) + continue; size_t n_batches = (list_size + batch_size - 1) / batch_size; for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { size_t b_offset = batch_no * batch_size; - size_t curr_batch_size = - std::min(list_size - b_offset, batch_size); + size_t curr_batch_size = std::min(list_size - b_offset, batch_size); for (size_t p = 0; p < curr_batch_size; p++) { std::vector vec(d); @@ -133,10 +133,8 @@ void IndexIVFPQPanorama::add(idx_t n, const float* x) { invlists->get_single_code(list_no, b_offset + p); pq.decode(code, vec.data()); - float init_dist = 0.0f; std::vector suffix(d + 1, 0.0f); for (int j = d - 1; j >= 0; j--) { - init_dist += vec[j] * vec[j] + 2 * vec[j] * centroid[j]; suffix[j] = suffix[j + 1] + vec[j] * vec[j]; } @@ -151,13 +149,9 @@ void IndexIVFPQPanorama::add(idx_t n, const float* x) { } size_t last_offset = cum_sum_offsets[list_no] + - b_offset * (n_levels + 1) + - n_levels * curr_batch_size + p; + b_offset * (n_levels + 1) + n_levels * curr_batch_size + + p; cum_sums[last_offset] = 0.0f; - - init_exact_distances - [init_exact_distances_offsets[list_no] + b_offset + p] = - init_dist; } } } @@ -231,8 +225,7 @@ struct IVFPQScannerPanorama : InvertedListScanner { } float distance_to_code(const uint8_t* code) const override { - FAISS_THROW_MSG( - "IndexIVFPQPanorama does not support 
distance_to_code"); + FAISS_THROW_MSG("IndexIVFPQPanorama does not support distance_to_code"); } size_t scan_codes( @@ -254,9 +247,8 @@ struct IVFPQScannerPanorama : InvertedListScanner { index.column_storage + index.column_offsets[list_no]; const float* list_cum_sums = index.cum_sums + index.cum_sum_offsets[list_no]; - const float* list_init_dists = - index.init_exact_distances + - index.init_exact_distances_offsets[list_no]; + const float* precomp = + index.precomputed_table.data() + list_no * pq.M * pq.ksub; // Scratch buffers. std::vector exact_distances(bs); @@ -266,8 +258,7 @@ struct IVFPQScannerPanorama : InvertedListScanner { float dis0 = coarse_dis; for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { - size_t curr_batch_size = - std::min(list_size - batch_no * bs, bs); + size_t curr_batch_size = std::min(list_size - batch_no * bs, bs); size_t b_offset = batch_no * bs; // Initialize active set. @@ -278,30 +269,35 @@ struct IVFPQScannerPanorama : InvertedListScanner { std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1); std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0); + const uint8_t* batch_codes = col_codes + b_offset * code_size; + + // Compute init_exact_distance on-the-fly from the + // precomputed table: sum_m(precomp[m * ksub + code[m]]). + // Codes are column-major: point p's code for subquantizer + // m is at batch_codes[m * bs + p]. 
for (size_t idx = 0; idx < curr_batch_size; idx++) { - exact_distances[idx] = list_init_dists[b_offset + idx]; + float init_dist = 0.0f; + for (size_t m = 0; m < pq.M; m++) { + uint8_t code_val = batch_codes[m * bs + idx]; + init_dist += precomp[m * pq.ksub + code_val]; + } + exact_distances[idx] = init_dist; } - - const uint8_t* batch_codes = col_codes + b_offset * code_size; - const float* batch_cums = - list_cum_sums + b_offset * (n_levels + 1); + const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1); size_t next_num_active = curr_batch_size; size_t batch_offset = batch_no * bs; - for (int level = 0; - level < n_levels && next_num_active > 0; + for (int level = 0; level < n_levels && next_num_active > 0; level++) { size_t level_sim_offset = level * pq.ksub * cs; - float query_cum_norm = - 2 * query_cum_norms[level + 1]; + float query_cum_norm = 2 * query_cum_norms[level + 1]; float heap_max = distances[0]; const float* cum_sums_level = batch_cums + curr_batch_size * level; - const uint8_t* codes_level = - batch_codes + bs * cs * level; + const uint8_t* codes_level = batch_codes + bs * cs * level; float* sim_table_level = const_cast(sim_table_2.data()) + @@ -316,25 +312,27 @@ struct IVFPQScannerPanorama : InvertedListScanner { size_t chunk_off = ci * bs; float* chunk_sim = sim_table_level + ci * pq.ksub; for (size_t i = 0; i < next_num_active; i++) { - size_t real_idx = - active_indices[i] - batch_offset; - exact_distances[i] += - chunk_sim[codes_level[chunk_off + real_idx]]; + size_t real_idx = active_indices[i] - batch_offset; + exact_distances[i] += chunk_sim + [codes_level[chunk_off + real_idx]]; } } num_active_for_filtering = next_num_active; } else { - auto [cc, na] = - panorama_kernels::process_code_compression( - next_num_active, - bs, - cs, - compressed_codes.data(), - bitset.data(), - codes_level); + auto [cc, na] = panorama_kernels::process_code_compression( + next_num_active, + bs, + cs, + compressed_codes.data(), + bitset.data(), + 
codes_level); panorama_kernels::process_chunks( - cs, bs, na, sim_table_level, cc, + cs, + bs, + na, + sim_table_level, + cc, exact_distances.data()); num_active_for_filtering = na; } @@ -392,13 +390,10 @@ InvertedListScanner* IndexIVFPQPanorama::get_InvertedListScanner( FAISS_THROW_IF_NOT_MSG( use_precomputed_table == 1, "Panorama PQ requires use_precomputed_table == 1"); + FAISS_THROW_IF_NOT_MSG(pq.nbits == 8, "only 8-bit PQ codes supported"); + FAISS_THROW_IF_NOT_MSG(by_residual, "Panorama PQ requires by_residual"); FAISS_THROW_IF_NOT_MSG( - pq.nbits == 8, "only 8-bit PQ codes supported"); - FAISS_THROW_IF_NOT_MSG( - by_residual, "Panorama PQ requires by_residual"); - FAISS_THROW_IF_NOT_MSG( - polysemous_ht == 0, - "Panorama PQ does not support polysemous"); + polysemous_ht == 0, "Panorama PQ does not support polysemous"); if (sel) { return new IVFPQScannerPanorama, true>( diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h index aaee470e25..a97107f54d 100644 --- a/faiss/IndexIVFPQPanorama.h +++ b/faiss/IndexIVFPQPanorama.h @@ -34,10 +34,10 @@ namespace faiss { /// distance computation level-by-level. /// /// OVERHEAD: -/// Panorama precomputes per-point cumulative residual norms and initial -/// exact distances at insertion time. Storage overhead is -/// (n_levels + 1) floats per point for cum_sums, plus 1 float per -/// point for init_exact_distances. +/// Panorama precomputes per-point cumulative residual norms at insertion +/// time. Storage overhead is (n_levels + 1) floats per point for +/// cum_sums. Initial exact distances are computed on-the-fly during +/// search using the precomputed_table (no extra per-point storage). /// /// CONSTRAINTS: /// - Only L2 metric is supported. 
@@ -67,9 +67,6 @@ struct IndexIVFPQPanorama : public IndexIVFPQ { float* cum_sums = nullptr; size_t* cum_sum_offsets = nullptr; - float* init_exact_distances = nullptr; - size_t* init_exact_distances_offsets = nullptr; - IndexIVFPQPanorama( Index* quantizer, size_t d, From 4bf6785654abac00b3548633634dfbb16c06419c Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Thu, 19 Mar 2026 07:12:31 +0000 Subject: [PATCH 11/41] clean but slow --- benchs/bench_ivfpq_panorama.py | 60 +++---- faiss/CMakeLists.txt | 2 + faiss/IndexFlat.cpp | 12 +- faiss/IndexFlat.h | 4 +- faiss/IndexHNSW.cpp | 5 +- faiss/IndexHNSW.h | 2 +- faiss/IndexIVFFlatPanorama.cpp | 18 ++- faiss/IndexIVFPQPanorama.cpp | 269 ++++++------------------------- faiss/IndexIVFPQPanorama.h | 22 +-- faiss/impl/HNSW.cpp | 6 +- faiss/impl/Panorama.cpp | 111 +++++++------ faiss/impl/Panorama.h | 64 ++++++-- faiss/impl/index_read.cpp | 40 ++++- faiss/impl/index_write.cpp | 15 +- faiss/invlists/InvertedLists.cpp | 39 ++--- faiss/invlists/InvertedLists.h | 12 +- faiss/python/swigfaiss.swig | 1 + 17 files changed, 298 insertions(+), 384 deletions(-) diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py index 615ffe01b5..5ae4fc2152 100644 --- a/benchs/bench_ivfpq_panorama.py +++ b/benchs/bench_ivfpq_panorama.py @@ -59,6 +59,7 @@ def fvecs_read(fname): def eval_recall(index, nprobe_val): + faiss.cvar.indexPanorama_stats.reset() t0 = time.time() _, I = index.search(xq, k=k) t = time.time() - t0 @@ -66,47 +67,50 @@ def eval_recall(index, nprobe_val): qps = 1000 / speed corrects = sum(len(set(gt_I[i]) & set(I[i])) for i in range(nq)) recall = corrects / (nq * k) + stats = faiss.cvar.indexPanorama_stats + pct_active = stats.ratio_dims_scanned * 100 print( f"\tnprobe {nprobe_val:3d}, Recall@{k}: " - f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}", + f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}, " + f"active: {pct_active:.1f}%", flush=True, ) return recall, qps -# 
faiss.omp_set_num_threads(mp.cpu_count()) +faiss.omp_set_num_threads(mp.cpu_count()) -# # --- IVFPQ baseline (cached) --- -# if os.path.exists(IVFPQ_CACHE): -# print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True) -# t0 = time.time() -# ivfpq = faiss.read_index(IVFPQ_CACHE) -# print(f" Loaded in {time.time() - t0:.1f}s", flush=True) -# else: -# print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True) -# quantizer = faiss.IndexFlatL2(d) -# ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) -# t0 = time.time() -# ivfpq.train(xt) -# print(f" Training took {time.time() - t0:.1f}s", flush=True) +# --- IVFPQ baseline (cached) --- +if os.path.exists(IVFPQ_CACHE): + print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True) + t0 = time.time() + ivfpq = faiss.read_index(IVFPQ_CACHE) + print(f" Loaded in {time.time() - t0:.1f}s", flush=True) +else: + print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True) + quantizer = faiss.IndexFlatL2(d) + ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) + t0 = time.time() + ivfpq.train(xt) + print(f" Training took {time.time() - t0:.1f}s", flush=True) -# print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) -# faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE) + print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) + faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE) -# t0 = time.time() -# ivfpq.add(xb) -# print(f" Adding took {time.time() - t0:.1f}s", flush=True) + t0 = time.time() + ivfpq.add(xb) + print(f" Adding took {time.time() - t0:.1f}s", flush=True) -# print(f" Saving full index to {IVFPQ_CACHE}...", flush=True) -# faiss.write_index(ivfpq, IVFPQ_CACHE) + print(f" Saving full index to {IVFPQ_CACHE}...", flush=True) + faiss.write_index(ivfpq, IVFPQ_CACHE) -# faiss.omp_set_num_threads(1) -# print("\n====== IVFPQ baseline", flush=True) -# for nprobe in [1, 2, 4, 8, 16]: -# ivfpq.nprobe = nprobe -# eval_recall(ivfpq, nprobe) 
+faiss.omp_set_num_threads(1) +print("\n====== IVFPQ baseline", flush=True) +for nprobe in [1, 2, 4, 8, 16]: + ivfpq.nprobe = nprobe + eval_recall(ivfpq, nprobe) -# --- IVFPQPanorama (cached separately) --- +# --- IVFPQPanorama (cached) --- faiss.omp_set_num_threads(mp.cpu_count()) if os.path.exists(IVFPQ_PANO_CACHE): diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index 84a6eb1aac..5e752d9d3b 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -131,6 +131,7 @@ set(FAISS_SRC impl/zerocopy_io.cpp impl/NNDescent.cpp impl/Panorama.cpp + impl/PanoramaPQ.cpp impl/PanoramaStats.cpp invlists/BlockInvertedLists.cpp invlists/DirectMap.cpp @@ -241,6 +242,7 @@ set(FAISS_HEADERS impl/NNDescent.h impl/NSG.h impl/Panorama.h + impl/PanoramaPQ.h impl/PanoramaStats.h impl/PolysemousTraining.h impl/ProductQuantizer-inl.h diff --git a/faiss/IndexFlat.cpp b/faiss/IndexFlat.cpp index 599d5b1e6c..27ed1090da 100644 --- a/faiss/IndexFlat.cpp +++ b/faiss/IndexFlat.cpp @@ -695,7 +695,7 @@ void IndexFlatPanorama::add(idx_t n, const float* x) { const uint8_t* code = reinterpret_cast(x); pano.copy_codes_to_level_layout(codes.data(), offset, n, code); - pano.compute_cumulative_sums(cum_sums.data(), offset, n, x); + pano.compute_cumulative_sums(cum_sums.data(), offset, n, code); } void IndexFlatPanorama::search( @@ -892,12 +892,12 @@ void IndexFlatPanorama::search_subset( bool pruned = false; for (size_t level = 0; level < n_levels; level++) { local_stats.total_dims_scanned += - pano.level_width_floats; + pano.level_width_dims; // Refine distance size_t actual_level_width = std::min( - pano.level_width_floats, - d - level * pano.level_width_floats); + pano.level_width_dims, + d - level * pano.level_width_dims); float dot_product = fvec_inner_product( x_ptr, p_ptr, actual_level_width); if constexpr (is_sim) { @@ -930,8 +930,8 @@ void IndexFlatPanorama::search_subset( } cum_sum_offset++; - x_ptr += pano.level_width_floats; - p_ptr += pano.level_width_floats; + x_ptr += 
pano.level_width_dims; + p_ptr += pano.level_width_dims; } if (!pruned) { diff --git a/faiss/IndexFlat.h b/faiss/IndexFlat.h index f5870166ee..ccc0126d28 100644 --- a/faiss/IndexFlat.h +++ b/faiss/IndexFlat.h @@ -104,7 +104,7 @@ struct IndexFlatPanorama : IndexFlat { const size_t batch_size; const size_t n_levels; std::vector cum_sums; - Panorama pano; + PanoramaFlat pano; /** * @param d dimensionality of the input vectors @@ -120,7 +120,7 @@ struct IndexFlatPanorama : IndexFlat { : IndexFlat(d, metric), batch_size(batch_size), n_levels(n_levels), - pano(code_size, n_levels, batch_size) { + pano(d, n_levels, batch_size) { FAISS_THROW_IF_NOT( metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); } diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index fb02433778..fadea613c9 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -672,7 +672,7 @@ IndexHNSWFlatPanorama::IndexHNSWFlatPanorama( MetricType metric) : IndexHNSWFlat(d, M, metric), cum_sums(), - pano(d * sizeof(float), num_panorama_levels, 1), + pano(d, num_panorama_levels, 1), num_panorama_levels(num_panorama_levels) { // For now, we only support L2 distance. 
// Supporting dot product and cosine distance is a trivial addition @@ -688,7 +688,8 @@ IndexHNSWFlatPanorama::IndexHNSWFlatPanorama( void IndexHNSWFlatPanorama::add(idx_t n, const float* x) { idx_t n0 = ntotal; cum_sums.resize((ntotal + n) * (pano.n_levels + 1)); - pano.compute_cumulative_sums(cum_sums.data(), n0, n, x); + pano.compute_cumulative_sums( + cum_sums.data(), n0, n, reinterpret_cast(x)); IndexHNSWFlat::add(n, x); } diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h index a43828d428..17d54ecaa2 100644 --- a/faiss/IndexHNSW.h +++ b/faiss/IndexHNSW.h @@ -179,7 +179,7 @@ struct IndexHNSWFlatPanorama : IndexHNSWFlat { } std::vector cum_sums; - Panorama pano; + PanoramaFlat pano; const size_t num_panorama_levels; }; diff --git a/faiss/IndexIVFFlatPanorama.cpp b/faiss/IndexIVFFlatPanorama.cpp index 01a548b412..5e678be28c 100644 --- a/faiss/IndexIVFFlatPanorama.cpp +++ b/faiss/IndexIVFFlatPanorama.cpp @@ -38,7 +38,9 @@ IndexIVFFlatPanorama::IndexIVFFlatPanorama( // We construct the inverted lists here so that we can use the // level-oriented storage. This does not cause a leak as we constructed // IndexIVF first, with own_invlists set to false. 
- this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, n_levels); + auto* pano = new PanoramaFlat( + d, n_levels, ArrayInvertedListsPanorama::kBatchSize); + this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano); this->own_invlists = own_invlists; } @@ -50,6 +52,7 @@ template struct IVFFlatScannerPanorama : InvertedListScanner { VectorDistance vd; const ArrayInvertedListsPanorama* storage; + const PanoramaFlat* pano_flat; using C = typename VectorDistance::C; static constexpr MetricType metric = VectorDistance::metric; @@ -58,10 +61,15 @@ struct IVFFlatScannerPanorama : InvertedListScanner { const ArrayInvertedListsPanorama* storage, bool store_pairs, const IDSelector* sel) - : InvertedListScanner(store_pairs, sel), vd(vd), storage(storage) { + : InvertedListScanner(store_pairs, sel), + vd(vd), + storage(storage), + pano_flat( + dynamic_cast(storage->pano.get())) { + FAISS_THROW_IF_NOT(pano_flat); keep_max = vd.is_similarity; code_size = vd.d * sizeof(float); - cum_sums.resize(storage->n_levels + 1); + cum_sums.resize(pano_flat->n_levels + 1); } const float* xi = nullptr; @@ -69,7 +77,7 @@ struct IVFFlatScannerPanorama : InvertedListScanner { float q_norm = 0.0f; void set_query(const float* query) override { this->xi = query; - this->storage->pano.compute_query_cum_sums(query, cum_sums.data()); + pano_flat->compute_query_cum_sums(query, cum_sums.data()); q_norm = cum_sums[0] * cum_sums[0]; } @@ -107,7 +115,7 @@ struct IVFFlatScannerPanorama : InvertedListScanner { size_t batch_start = batch_no * storage->kBatchSize; size_t num_active = with_metric_type(metric, [&]() { - return storage->pano.progressive_filter_batch( + return pano_flat->progressive_filter_batch( codes, cum_sums_data, xi, diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp index f9c6b6e7dd..504958b544 100644 --- a/faiss/IndexIVFPQPanorama.cpp +++ b/faiss/IndexIVFPQPanorama.cpp @@ -14,7 +14,9 @@ #include #include -#include +#include +#include 
+#include #include namespace faiss { @@ -33,19 +35,11 @@ IndexIVFPQPanorama::IndexIVFPQPanorama( size_t batch_size, MetricType metric, bool own_invlists) - : IndexIVFPQ( - quantizer, - d, - nlist, - M, - nbits_per_idx, - metric, - own_invlists), + : IndexIVFPQ(quantizer, d, nlist, M, nbits_per_idx, metric, false), n_levels(n_levels), batch_size(batch_size), chunk_size(code_size / n_levels), - levels_size(d / n_levels), - m_level_width(M / n_levels) { + levels_size(d / n_levels) { FAISS_THROW_IF_NOT_MSG( M % n_levels == 0, "M must be divisible by n_levels"); FAISS_THROW_IF_NOT_MSG( @@ -54,107 +48,10 @@ IndexIVFPQPanorama::IndexIVFPQPanorama( FAISS_THROW_IF_NOT_MSG( M == code_size, "M must equal code_size for 8-bit PQ"); FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only L2 metric supported"); -} - -/***************************************** - * add — transpose codes into column-major layout and precompute norms - ******************************************/ - -void IndexIVFPQPanorama::add(idx_t n, const float* x) { - FAISS_THROW_IF_NOT_MSG( - !added, "IndexIVFPQPanorama only supports a single add() call"); - added = true; - num_points = n; - - IndexIVFPQ::add(n, x); - - // Compute column offsets (each list rounded up to batch_size). - size_t total_column_bytes = 0; - column_offsets = new size_t[nlist]; - for (size_t i = 0; i < nlist; i++) { - column_offsets[i] = total_column_bytes; - size_t n_batches = - (invlists->list_size(i) + batch_size - 1) / batch_size; - total_column_bytes += n_batches * batch_size * code_size; - } - - // Transpose codes from row-major [point0_code, point1_code, ...] into - // column-major within each batch: M columns of batch_size bytes each. 
- column_storage = new uint8_t[total_column_bytes](); - for (size_t list_no = 0; list_no < nlist; list_no++) { - size_t col_offset = column_offsets[list_no]; - size_t list_size = invlists->list_size(list_no); - size_t n_batches = (list_size + batch_size - 1) / batch_size; - const uint8_t* row_codes = invlists->get_codes(list_no); - - for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { - size_t batch_offset = batch_no * batch_size * code_size; - size_t curr_batch_size = - std::min(list_size - batch_no * batch_size, batch_size); - for (size_t m = 0; m < pq.M; m++) { - for (size_t p = 0; p < curr_batch_size; p++) { - column_storage - [col_offset + batch_offset + m * batch_size + p] = - row_codes - [batch_no * batch_size * code_size + - p * code_size + m]; - } - } - } - } - - // Precompute cumulative residual norms (suffix sums of ||y_R||^2). - // init_exact_distances are computed on-the-fly during search using - // the precomputed_table, so we only need cum_sums here. - cum_sum_offsets = new size_t[nlist]; - - size_t cum_size = 0; - for (size_t list_no = 0; list_no < nlist; list_no++) { - cum_sum_offsets[list_no] = cum_size; - cum_size += invlists->list_size(list_no) * (n_levels + 1); - } - - cum_sums = new float[cum_size]; - - for (size_t list_no = 0; list_no < nlist; list_no++) { - size_t list_size = invlists->list_size(list_no); - if (list_size == 0) - continue; - size_t n_batches = (list_size + batch_size - 1) / batch_size; - - for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { - size_t b_offset = batch_no * batch_size; - size_t curr_batch_size = std::min(list_size - b_offset, batch_size); - - for (size_t p = 0; p < curr_batch_size; p++) { - std::vector vec(d); - const uint8_t* code = - invlists->get_single_code(list_no, b_offset + p); - pq.decode(code, vec.data()); - - std::vector suffix(d + 1, 0.0f); - for (int j = d - 1; j >= 0; j--) { - suffix[j] = suffix[j + 1] + vec[j] * vec[j]; - } - - for (int level = 0; level < n_levels; level++) { - 
int start_idx = level * levels_size; - size_t offset = cum_sum_offsets[list_no] + - b_offset * (n_levels + 1) + - level * curr_batch_size + p; - cum_sums[offset] = start_idx < (int)d - ? std::sqrt(suffix[start_idx]) - : 0.0f; - } - - size_t last_offset = cum_sum_offsets[list_no] + - b_offset * (n_levels + 1) + n_levels * curr_batch_size + - p; - cum_sums[last_offset] = 0.0f; - } - } - } + auto* pano = new PanoramaPQ(d, code_size, n_levels, batch_size, &pq); + this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano); + this->own_invlists = own_invlists; } /***************************************** @@ -169,6 +66,8 @@ template struct IVFPQScannerPanorama : InvertedListScanner { const IndexIVFPQPanorama& index; const ProductQuantizer& pq; + const ArrayInvertedListsPanorama* storage; + const PanoramaPQ* pano_pq; // Query state const float* qi = nullptr; @@ -180,11 +79,15 @@ struct IVFPQScannerPanorama : InvertedListScanner { IVFPQScannerPanorama( const IndexIVFPQPanorama& index, + const ArrayInvertedListsPanorama* storage, bool store_pairs, const IDSelector* sel) : InvertedListScanner(store_pairs, sel), index(index), - pq(index.pq) { + pq(index.pq), + storage(storage), + pano_pq(dynamic_cast(storage->pano.get())) { + FAISS_THROW_IF_NOT(pano_pq); this->keep_max = is_similarity_metric(index.metric_type); this->code_size = pq.code_size; query_cum_norms.resize(index.n_levels + 1); @@ -206,17 +109,7 @@ struct IVFPQScannerPanorama : InvertedListScanner { sim_table_2[i] *= -2.0f; } - // Compute query suffix sums → cum norms per level. - std::vector suffix(index.d + 1, 0.0f); - for (int j = index.d - 1; j >= 0; j--) { - suffix[j] = suffix[j + 1] + qi[j] * qi[j]; - } - for (int level = 0; level < index.n_levels; level++) { - int start = level * index.levels_size; - query_cum_norms[level] = - start < (int)index.d ? 
std::sqrt(suffix[start]) : 0.0f; - } - query_cum_norms[index.n_levels] = 0.0f; + pano_pq->compute_query_cum_sums(qi, query_cum_norms.data()); } void set_list(idx_t list_no, float coarse_dis) override { @@ -230,7 +123,7 @@ struct IVFPQScannerPanorama : InvertedListScanner { size_t scan_codes( size_t list_size, - const uint8_t* /* codes (row-major, unused) */, + const uint8_t* /* codes (column-major in storage) */, const idx_t* ids, float* distances, idx_t* labels, @@ -239,14 +132,10 @@ struct IVFPQScannerPanorama : InvertedListScanner { const size_t bs = index.batch_size; const size_t cs = index.chunk_size; - const int n_levels = index.n_levels; const size_t n_batches = (list_size + bs - 1) / bs; - // Panorama column-major codes for this list. - const uint8_t* col_codes = - index.column_storage + index.column_offsets[list_no]; - const float* list_cum_sums = - index.cum_sums + index.cum_sum_offsets[list_no]; + const uint8_t* col_codes = storage->get_codes(list_no); + const float* list_cum_sums = storage->get_cum_sums(list_no); const float* precomp = index.precomputed_table.data() + list_no * pq.M * pq.ksub; @@ -257,100 +146,28 @@ struct IVFPQScannerPanorama : InvertedListScanner { std::vector compressed_codes(bs * cs); float dis0 = coarse_dis; - for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { - size_t curr_batch_size = std::min(list_size - batch_no * bs, bs); - size_t b_offset = batch_no * bs; - - // Initialize active set. - std::iota( - active_indices.begin(), - active_indices.begin() + curr_batch_size, - b_offset); - std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1); - std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0); - - const uint8_t* batch_codes = col_codes + b_offset * code_size; - - // Compute init_exact_distance on-the-fly from the - // precomputed table: sum_m(precomp[m * ksub + code[m]]). - // Codes are column-major: point p's code for subquantizer - // m is at batch_codes[m * bs + p]. 
- for (size_t idx = 0; idx < curr_batch_size; idx++) { - float init_dist = 0.0f; - for (size_t m = 0; m < pq.M; m++) { - uint8_t code_val = batch_codes[m * bs + idx]; - init_dist += precomp[m * pq.ksub + code_val]; - } - exact_distances[idx] = init_dist; - } - const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1); - - size_t next_num_active = curr_batch_size; - size_t batch_offset = batch_no * bs; - - for (int level = 0; level < n_levels && next_num_active > 0; - level++) { - size_t level_sim_offset = level * pq.ksub * cs; - - float query_cum_norm = 2 * query_cum_norms[level + 1]; - float heap_max = distances[0]; - - const float* cum_sums_level = - batch_cums + curr_batch_size * level; - const uint8_t* codes_level = batch_codes + bs * cs * level; - - float* sim_table_level = - const_cast(sim_table_2.data()) + - level_sim_offset; - - bool is_sparse = next_num_active < bs / 16; - - size_t num_active_for_filtering = 0; - if (is_sparse) { - // Sparse path: use active_indices for indirection. 
- for (size_t ci = 0; ci < cs; ci++) { - size_t chunk_off = ci * bs; - float* chunk_sim = sim_table_level + ci * pq.ksub; - for (size_t i = 0; i < next_num_active; i++) { - size_t real_idx = active_indices[i] - batch_offset; - exact_distances[i] += chunk_sim - [codes_level[chunk_off + real_idx]]; - } - } - num_active_for_filtering = next_num_active; - } else { - auto [cc, na] = panorama_kernels::process_code_compression( - next_num_active, - bs, - cs, - compressed_codes.data(), - bitset.data(), - codes_level); - - panorama_kernels::process_chunks( - cs, - bs, - na, - sim_table_level, - cc, - exact_distances.data()); - num_active_for_filtering = na; - } + PanoramaStats local_stats; + local_stats.reset(); - next_num_active = panorama_kernels::process_filtering( - num_active_for_filtering, - exact_distances.data(), - active_indices.data(), - const_cast(cum_sums_level), - bitset.data(), - batch_offset, - dis0, - query_cum_norm, - heap_max); - } + for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { + size_t num_active = pano_pq->progressive_filter_batch( + col_codes, + list_cum_sums, + precomp, + sim_table_2.data(), + query_cum_norms.data(), + dis0, + list_size, + batch_no, + exact_distances, + active_indices, + bitset, + compressed_codes, + distances[0], + local_stats); // Insert surviving candidates into heap. 
- for (size_t i = 0; i < next_num_active; i++) { + for (size_t i = 0; i < num_active; i++) { float dis = dis0 + exact_distances[i]; if (C::cmp(distances[0], dis)) { idx_t id = store_pairs @@ -362,6 +179,7 @@ struct IVFPQScannerPanorama : InvertedListScanner { } } + indexPanorama_stats.add(local_stats); return nup; } @@ -395,12 +213,17 @@ InvertedListScanner* IndexIVFPQPanorama::get_InvertedListScanner( FAISS_THROW_IF_NOT_MSG( polysemous_ht == 0, "Panorama PQ does not support polysemous"); + const auto* storage = + dynamic_cast(invlists); + FAISS_THROW_IF_NOT_MSG( + storage, "IndexIVFPQPanorama requires ArrayInvertedListsPanorama"); + if (sel) { return new IVFPQScannerPanorama, true>( - *this, store_pairs, sel); + *this, storage, store_pairs, sel); } else { return new IVFPQScannerPanorama, false>( - *this, store_pairs, sel); + *this, storage, store_pairs, sel); } } diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h index a97107f54d..717308bb07 100644 --- a/faiss/IndexIVFPQPanorama.h +++ b/faiss/IndexIVFPQPanorama.h @@ -33,6 +33,10 @@ namespace faiss { /// into `n_levels` levels of `chunk_size` columns, enabling incremental /// distance computation level-by-level. /// +/// Storage is managed by ArrayInvertedListsPanorama with a PanoramaPQ +/// instance that handles code transposition and cumulative sum computation +/// (via PQ decoding) on insertion. +/// /// OVERHEAD: /// Panorama precomputes per-point cumulative residual norms at insertion /// time. Storage overhead is (n_levels + 1) floats per point for @@ -47,25 +51,17 @@ namespace faiss { /// - use_precomputed_table must be 1. /// /// NOTE: -/// We inherit from IndexIVFPQ and override only get_InvertedListScanner() -/// and add(). The base IndexIVF::search_preassigned() handles all search +/// We inherit from IndexIVFPQ and override only get_InvertedListScanner(). +/// The base IndexIVF::search_preassigned() handles all search /// orchestration — no search code is duplicated. 
+/// Storage (transposition + cum_sums) is handled by +/// ArrayInvertedListsPanorama, so no add() override is needed. struct IndexIVFPQPanorama : public IndexIVFPQ { int n_levels; size_t batch_size; size_t chunk_size; size_t levels_size; - size_t m_level_width; - - bool added = false; - size_t num_points = 0; - - uint8_t* column_storage = nullptr; - size_t* column_offsets = nullptr; - - float* cum_sums = nullptr; - size_t* cum_sum_offsets = nullptr; IndexIVFPQPanorama( Index* quantizer, @@ -80,8 +76,6 @@ struct IndexIVFPQPanorama : public IndexIVFPQ { IndexIVFPQPanorama() = default; - void add(idx_t n, const float* x) override; - InvertedListScanner* get_InvertedListScanner( bool store_pairs, const IDSelector* sel, diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp index 0191ef152f..ea204d2450 100644 --- a/faiss/impl/HNSW.cpp +++ b/faiss/impl/HNSW.cpp @@ -851,10 +851,10 @@ int search_from_candidates_panorama( while (curr_panorama_level < num_panorama_levels && batch_size > 0) { float query_cum_norm = query_cum_sums[curr_panorama_level + 1]; - size_t start_dim = curr_panorama_level * - panorama_index->pano.level_width_floats; + size_t start_dim = + curr_panorama_level * panorama_index->pano.level_width_dims; size_t end_dim = (curr_panorama_level + 1) * - panorama_index->pano.level_width_floats; + panorama_index->pano.level_width_dims; end_dim = std::min(end_dim, static_cast(panorama_index->d)); size_t i = 0; diff --git a/faiss/impl/Panorama.cpp b/faiss/impl/Panorama.cpp index 970a0cefa6..a3928e4fce 100644 --- a/faiss/impl/Panorama.cpp +++ b/faiss/impl/Panorama.cpp @@ -26,7 +26,7 @@ inline void compute_cum_sums_impl( float* output, size_t d, size_t n_levels, - size_t level_width_floats, + size_t level_width_dims, OffsetFunc&& get_offset) { // Iterate backwards through levels, accumulating sum as we go. 
// This avoids computing the suffix sum for each vector, which takes @@ -34,9 +34,9 @@ inline void compute_cum_sums_impl( float sum = 0.0f; for (int level = n_levels - 1; level >= 0; level--) { - size_t start_idx = level * level_width_floats; + size_t start_idx = level * level_width_dims; size_t end_idx = std::min( - (level + 1) * level_width_floats, static_cast(d)); + (level + 1) * level_width_dims, static_cast(d)); for (size_t j = start_idx; j < end_idx; j++) { sum += vector[j] * vector[j]; @@ -51,19 +51,24 @@ inline void compute_cum_sums_impl( } // namespace /************************************************************** - * Panorama structure implementation + * Panorama base class implementation **************************************************************/ -Panorama::Panorama(size_t code_size, size_t n_levels, size_t batch_size) - : code_size(code_size), n_levels(n_levels), batch_size(batch_size) { +Panorama::Panorama( + size_t d, + size_t code_size, + size_t n_levels, + size_t batch_size) + : d(d), + code_size(code_size), + n_levels(n_levels), + batch_size(batch_size) { set_derived_values(); } void Panorama::set_derived_values() { FAISS_THROW_IF_NOT_MSG(n_levels > 0, "Panorama: n_levels must be > 0"); - this->d = code_size / sizeof(float); - this->level_width_floats = ((d + n_levels - 1) / n_levels); - this->level_width = this->level_width_floats * sizeof(float); + level_width_bytes = (code_size + n_levels - 1) / n_levels; } /** @@ -88,10 +93,10 @@ void Panorama::copy_codes_to_level_layout( // Copy entry into level-oriented layout for this batch. 
size_t batch_offset = batch_no * batch_size * code_size; for (size_t level = 0; level < n_levels; level++) { - size_t level_offset = level * level_width * batch_size; - size_t start_byte = level * level_width; - size_t actual_level_width = - std::min(level_width, code_size - level * level_width); + size_t level_offset = level * level_width_bytes * batch_size; + size_t start_byte = level * level_width_bytes; + size_t actual_level_width = std::min( + level_width_bytes, code_size - level * level_width_bytes); const uint8_t* src = code + entry_idx * code_size + start_byte; uint8_t* dest = codes + batch_offset + level_offset + @@ -102,38 +107,12 @@ void Panorama::copy_codes_to_level_layout( } } -void Panorama::compute_cumulative_sums( - float* cumsum_base, - size_t offset, - size_t n_entry, - const float* vectors) const { - for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { - size_t current_pos = offset + entry_idx; - size_t batch_no = current_pos / batch_size; - size_t pos_in_batch = current_pos % batch_size; - - const float* vector = vectors + entry_idx * d; - size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1); - - auto get_offset = [&](size_t level) { - return cumsum_batch_offset + level * batch_size + pos_in_batch; - }; - - compute_cum_sums_impl( - vector, - cumsum_base, - d, - n_levels, - level_width_floats, - get_offset); - } -} - void Panorama::compute_query_cum_sums(const float* query, float* query_cum_sums) const { + size_t level_dims = (d + n_levels - 1) / n_levels; auto get_offset = [](size_t level) { return level; }; compute_cum_sums_impl( - query, query_cum_sums, d, n_levels, level_width_floats, get_offset); + query, query_cum_sums, d, n_levels, level_dims, get_offset); } void Panorama::reconstruct(idx_t key, float* recons, const uint8_t* codes_base) @@ -145,12 +124,12 @@ void Panorama::reconstruct(idx_t key, float* recons, const uint8_t* codes_base) size_t batch_offset = batch_no * batch_size * code_size; for (size_t level = 0; 
level < n_levels; level++) { - size_t level_offset = level * level_width * batch_size; + size_t level_offset = level * level_width_bytes * batch_size; const uint8_t* src = codes_base + batch_offset + level_offset + - pos_in_batch * level_width; - uint8_t* dest = recons_buffer + level * level_width; - size_t copy_size = - std::min(level_width, code_size - level * level_width); + pos_in_batch * level_width_bytes; + uint8_t* dest = recons_buffer + level * level_width_bytes; + size_t copy_size = std::min( + level_width_bytes, code_size - level * level_width_bytes); memcpy(dest, src, copy_size); } } @@ -177,9 +156,9 @@ void Panorama::copy_entry( for (size_t level = 0; level < n_levels; level++) { // Copy code - size_t level_offset = level * level_width * batch_size; - size_t actual_level_width = - std::min(level_width, code_size - level * level_width); + size_t level_offset = level * level_width_bytes * batch_size; + size_t actual_level_width = std::min( + level_width_bytes, code_size - level * level_width_bytes); const uint8_t* src = src_codes + src_batch_offset + level_offset + src_pos_in_batch * actual_level_width; @@ -197,4 +176,38 @@ void Panorama::copy_entry( dest_cum_sums[dest_offset] = src_cum_sums[src_offset]; } } + +/************************************************************** + * PanoramaFlat implementation + **************************************************************/ + +PanoramaFlat::PanoramaFlat(size_t d, size_t n_levels, size_t batch_size) + : Panorama(d, d * sizeof(float), n_levels, batch_size) { + level_width_dims = (d + n_levels - 1) / n_levels; + level_width_bytes = level_width_dims * sizeof(float); +} + +void PanoramaFlat::compute_cumulative_sums( + float* cumsum_base, + size_t offset, + size_t n_entry, + const uint8_t* code) const { + const float* vectors = reinterpret_cast(code); + for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { + size_t current_pos = offset + entry_idx; + size_t batch_no = current_pos / batch_size; + size_t 
pos_in_batch = current_pos % batch_size; + + const float* vector = vectors + entry_idx * d; + size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1); + + auto get_offset = [&](size_t level) { + return cumsum_batch_offset + level * batch_size + pos_in_batch; + }; + + compute_cum_sums_impl( + vector, cumsum_base, d, n_levels, level_width_dims, get_offset); + } +} + } // namespace faiss diff --git a/faiss/impl/Panorama.h b/faiss/impl/Panorama.h index 79a23a64a7..8f33fc8d1a 100644 --- a/faiss/impl/Panorama.h +++ b/faiss/impl/Panorama.h @@ -40,35 +40,46 @@ namespace faiss { * Coupled with the appropriate orthogonal PreTransform (e.g. PCA, Cayley, * etc.), Panorama can prune the vast majority of dimensions, greatly * accelerating the refinement stage. + * + * This is the abstract base class. Concrete subclasses (PanoramaFlat, + * PanoramaPQ) implement compute_cumulative_sums and progressive_filter_batch + * for their respective code formats. */ struct Panorama { size_t d = 0; size_t code_size = 0; size_t n_levels = 0; - size_t level_width = 0; - size_t level_width_floats = 0; + size_t level_width_bytes = 0; size_t batch_size = 0; - explicit Panorama(size_t code_size, size_t n_levels, size_t batch_size); + Panorama() = default; + Panorama(size_t d, size_t code_size, size_t n_levels, size_t batch_size); + + virtual ~Panorama() = default; void set_derived_values(); /// Helper method to copy codes into level-oriented batch layout at a given /// offset in the list. - void copy_codes_to_level_layout( + /// PanoramaFlat uses row-major within each level (point bytes contiguous). + /// PanoramaPQ overrides to use column-major (subquantizer columns + /// contiguous). + virtual void copy_codes_to_level_layout( uint8_t* codes, size_t offset, size_t n_entry, const uint8_t* code); - /// Helper method to compute the cumulative sums of the codes. 
- /// The cumsums also follow the level-oriented batch layout to minimize the + /// Compute the cumulative sums (suffix norms) for database vectors. + /// The cumsums follow the level-oriented batch layout to minimize the /// number of random memory accesses. - void compute_cumulative_sums( + /// Subclasses interpret the raw code bytes according to their format: + /// PanoramaFlat reinterprets as float*, PanoramaPQ decodes via PQ. + virtual void compute_cumulative_sums( float* cumsum_base, size_t offset, size_t n_entry, - const float* vectors) const; + const uint8_t* code) const = 0; /// Compute the cumulative sums of the query vector. void compute_query_cum_sums(const float* query, float* query_cum_sums) @@ -83,7 +94,30 @@ struct Panorama { size_t dest_idx, size_t src_idx) const; - /// Panorama's core progressive filtering algorithm: + virtual void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) + const; +}; + +/** + * Panorama for flat (uncompressed) float vectors. + * + * Codes are raw float vectors (code_size = d * sizeof(float)). + * compute_cumulative_sums interprets codes as floats. + * progressive_filter_batch computes dot products on raw float storage. + */ +struct PanoramaFlat : Panorama { + size_t level_width_dims = 0; + + PanoramaFlat() = default; + PanoramaFlat(size_t d, size_t n_levels, size_t batch_size); + + void compute_cumulative_sums( + float* cumsum_base, + size_t offset, + size_t n_entry, + const uint8_t* code) const override; + + /// Panorama's core progressive filtering algorithm for flat codes: /// Process vectors in batches for cache efficiency. For each batch: /// 1. Apply ID selection filter and initialize distances /// (||y||^2 + ||x||^2). 
@@ -113,12 +147,10 @@ struct Panorama { std::vector& exact_distances, float threshold, PanoramaStats& local_stats) const; - - void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) const; }; template -size_t Panorama::progressive_filter_batch( +size_t PanoramaFlat::progressive_filter_batch( const uint8_t* codes_base, const float* cum_sums, const float* query, @@ -173,18 +205,18 @@ size_t Panorama::progressive_filter_batch( float query_cum_norm = query_cum_sums[level + 1]; - size_t level_offset = level * level_width * batch_size; + size_t level_offset = level * level_width_bytes * batch_size; const float* level_storage = (const float*)(storage_base + level_offset); size_t next_active = 0; for (size_t i = 0; i < num_active; i++) { uint32_t idx = active_indices[i]; - size_t actual_level_width = std::min( - level_width_floats, d - level * level_width_floats); + size_t actual_level_width = + std::min(level_width_dims, d - level * level_width_dims); const float* yj = level_storage + idx * actual_level_width; - const float* query_level = query + level * level_width_floats; + const float* query_level = query + level * level_width_dims; float dot_product = fvec_inner_product(query_level, yj, actual_level_width); diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 49e0458631..7cc9ee9d08 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include @@ -53,6 +54,7 @@ #include #include #include +#include #ifdef FAISS_ENABLE_SVS #include #include @@ -400,8 +402,12 @@ std::unique_ptr read_InvertedLists_up( FAISS_CHECK_DESERIALIZATION_LOOP_LIMIT(nlist, "ilpn nlist"); READ1(code_size); READ1(n_levels); + auto* pano = new PanoramaFlat( + code_size / sizeof(float), + n_levels, + ArrayInvertedListsPanorama::kBatchSize); auto ailp = std::make_unique( - nlist, code_size, n_levels); + nlist, code_size, pano); std::vector sizes(nlist); 
read_ArrayInvertedLists_sizes(f, sizes); for (size_t i = 0; i < nlist; i++) { @@ -1365,6 +1371,34 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { READVECTOR(ivsp->trained); read_InvertedLists(*ivsp, f, io_flags); idx = std::move(ivsp); + } else if (h == fourcc("IwPP")) { + auto ivpp = std::make_unique(); + read_ivf_header(ivpp.get(), f); + READ1(ivpp->by_residual); + READ1(ivpp->code_size); + read_ProductQuantizer(&ivpp->pq, f); + READ1(ivpp->n_levels); + READ1(ivpp->batch_size); + ivpp->chunk_size = ivpp->code_size / ivpp->n_levels; + ivpp->levels_size = ivpp->d / ivpp->n_levels; + read_InvertedLists(*ivpp, f, io_flags); + // The "ilpn" reader creates a PanoramaFlat placeholder; replace + // it with PanoramaPQ now that we have the ProductQuantizer. + auto* storage = + dynamic_cast(ivpp->invlists); + if (storage) { + storage->pano.reset(new PanoramaPQ( + ivpp->d, + ivpp->code_size, + ivpp->n_levels, + ivpp->batch_size, + &ivpp->pq)); + } + if (ivpp->is_trained) { + ivpp->use_precomputed_table = 1; + ivpp->precompute_table(); + } + idx = std::move(ivpp); } else if ( h == fourcc("IvPQ") || h == fourcc("IvQR") || h == fourcc("IwPQ") || h == fourcc("IwQR")) { @@ -1496,8 +1530,8 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { size_t nlevels; READ1(nlevels); const_cast(idx_panorama->num_panorama_levels) = nlevels; - const_cast(idx_panorama->pano) = - Panorama(idx_panorama->d * sizeof(float), nlevels, 1); + const_cast(idx_panorama->pano) = + PanoramaFlat(idx_panorama->d, nlevels, 1); READVECTOR(idx_panorama->cum_sums); } if (h == fourcc("IHNc") || h == fourcc("IHc2")) { diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 04257c76a6..02d0870bbc 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -273,7 +274,7 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) { WRITE1(h); WRITE1(ailp->nlist); 
WRITE1(ailp->code_size); - WRITE1(ailp->n_levels); + WRITE1(ailp->pano->n_levels); uint32_t list_type = fourcc("full"); WRITE1(list_type); std::vector sizes; @@ -774,6 +775,18 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) { WRITE1(ivsp->threshold_type); WRITEVECTOR(ivsp->trained); write_InvertedLists(ivsp->invlists, f); + } else if ( + const IndexIVFPQPanorama* ivpp = + dynamic_cast(idx)) { + uint32_t h = fourcc("IwPP"); + WRITE1(h); + write_ivf_header(ivpp, f); + WRITE1(ivpp->by_residual); + WRITE1(ivpp->code_size); + write_ProductQuantizer(&ivpp->pq, f); + WRITE1(ivpp->n_levels); + WRITE1(ivpp->batch_size); + write_InvertedLists(ivpp->invlists, f); } else if (const IndexIVFPQ* ivpq = dynamic_cast(idx)) { const IndexIVFPQR* ivfpqr = dynamic_cast(idx); diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp index 448b969736..313228301c 100644 --- a/faiss/invlists/InvertedLists.cpp +++ b/faiss/invlists/InvertedLists.cpp @@ -353,19 +353,12 @@ ArrayInvertedLists::~ArrayInvertedLists() {} ArrayInvertedListsPanorama::ArrayInvertedListsPanorama( size_t nlist, size_t code_size, - size_t n_levels) - : ArrayInvertedLists(nlist, code_size), - n_levels(n_levels), - level_width( - (((code_size / sizeof(float)) + n_levels - 1) / n_levels) * - sizeof(float)), - pano(code_size, n_levels, kBatchSize) { - FAISS_THROW_IF_NOT(n_levels > 0); - FAISS_THROW_IF_NOT(code_size % sizeof(float) == 0); + Panorama* pano) + : ArrayInvertedLists(nlist, code_size), pano(pano) { + FAISS_THROW_IF_NOT(pano != nullptr); + FAISS_THROW_IF_NOT(pano->n_levels > 0); FAISS_THROW_IF_NOT_MSG( - !use_iterator, - "IndexIVFFlatPanorama does not support iterators, use vanilla IndexIVFFlat instead"); - FAISS_ASSERT(level_width % sizeof(float) == 0); + !use_iterator, "Panorama does not support iterators"); cum_sums.resize(nlist); } @@ -389,13 +382,10 @@ size_t ArrayInvertedListsPanorama::add_entries( size_t new_size = o + n_entry; size_t num_batches = (new_size + 
kBatchSize - 1) / kBatchSize; codes[list_no].resize(num_batches * kBatchSize * code_size); - cum_sums[list_no].resize(num_batches * kBatchSize * (n_levels + 1)); + cum_sums[list_no].resize(num_batches * kBatchSize * (pano->n_levels + 1)); - // Cast to float* is safe here as we guarantee codes are always float - // vectors for `IndexIVFFlatPanorama` (verified by the constructor). - const float* vectors = reinterpret_cast(code); - pano.copy_codes_to_level_layout(codes[list_no].data(), o, n_entry, code); - pano.compute_cumulative_sums(cum_sums[list_no].data(), o, n_entry, vectors); + pano->copy_codes_to_level_layout(codes[list_no].data(), o, n_entry, code); + pano->compute_cumulative_sums(cum_sums[list_no].data(), o, n_entry, code); return o; } @@ -411,13 +401,10 @@ void ArrayInvertedListsPanorama::update_entries( memcpy(&ids[list_no][offset], ids_in, sizeof(ids_in[0]) * n_entry); - // Cast to float* is safe here as we guarantee codes are always float - // vectors for `IndexIVFFlatPanorama` (verified by the constructor). 
- const float* vectors = reinterpret_cast(code); - pano.copy_codes_to_level_layout( + pano->copy_codes_to_level_layout( codes[list_no].data(), offset, n_entry, code); - pano.compute_cumulative_sums( - cum_sums[list_no].data(), offset, n_entry, vectors); + pano->compute_cumulative_sums( + cum_sums[list_no].data(), offset, n_entry, code); } void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) { @@ -425,7 +412,7 @@ void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) { size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize; codes[list_no].resize(num_batches * kBatchSize * code_size); - cum_sums[list_no].resize(num_batches * kBatchSize * (n_levels + 1)); + cum_sums[list_no].resize(num_batches * kBatchSize * (pano->n_levels + 1)); } const uint8_t* ArrayInvertedListsPanorama::get_single_code( @@ -437,7 +424,7 @@ const uint8_t* ArrayInvertedListsPanorama::get_single_code( uint8_t* recons_buffer = new uint8_t[code_size]; float* recons = reinterpret_cast(recons_buffer); - pano.reconstruct(offset, recons, codes[list_no].data()); + pano->reconstruct(offset, recons, codes[list_no].data()); return recons_buffer; } diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h index 43c1ecc0c5..edc29995b9 100644 --- a/faiss/invlists/InvertedLists.h +++ b/faiss/invlists/InvertedLists.h @@ -15,6 +15,7 @@ * the interface. */ +#include #include #include @@ -277,16 +278,17 @@ struct ArrayInvertedLists : InvertedLists { ~ArrayInvertedLists() override; }; -/// Level-oriented storage as defined in the IVFFlat section of Panorama +/// Level-oriented storage as defined in the Panorama paper /// (https://www.arxiv.org/pdf/2510.00566). +/// Works with both flat codes (PanoramaFlat) and PQ codes (PanoramaPQ) +/// via the virtual Panorama interface. 
struct ArrayInvertedListsPanorama : ArrayInvertedLists { static constexpr size_t kBatchSize = 128; std::vector> cum_sums; - const size_t n_levels; - const size_t level_width; // in code units - Panorama pano; + std::unique_ptr pano; - ArrayInvertedListsPanorama(size_t nlist, size_t code_size, size_t n_levels); + /// Takes ownership of the provided Panorama*. + ArrayInvertedListsPanorama(size_t nlist, size_t code_size, Panorama* pano); const float* get_cum_sums(size_t list_no) const; diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index 75292ecb7f..371dabdf7e 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -542,6 +542,7 @@ void gpu_sync_all_devices() %include %include +%ignore faiss::ArrayInvertedListsPanorama::pano; %include %include %ignore BlockInvertedListsIOHook; From a8878f07e39041de4494db8d3e0a1018bfc7e568 Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Fri, 20 Mar 2026 04:18:01 +0000 Subject: [PATCH 12/41] Add missing files --- faiss/impl/PanoramaPQ.cpp | 122 ++++++++++++++++++++++ faiss/impl/PanoramaPQ.h | 212 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 334 insertions(+) create mode 100644 faiss/impl/PanoramaPQ.cpp create mode 100644 faiss/impl/PanoramaPQ.h diff --git a/faiss/impl/PanoramaPQ.cpp b/faiss/impl/PanoramaPQ.cpp new file mode 100644 index 0000000000..832bfaf91d --- /dev/null +++ b/faiss/impl/PanoramaPQ.cpp @@ -0,0 +1,122 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include + +#include + +namespace faiss { + +void PanoramaPQ::copy_codes_to_level_layout( + uint8_t* codes, + size_t offset, + size_t n_entry, + const uint8_t* code) { + const size_t cs = chunk_size; + const size_t bs = batch_size; + + for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { + size_t current_pos = offset + entry_idx; + size_t batch_no = current_pos / bs; + size_t pos_in_batch = current_pos % bs; + size_t batch_offset = batch_no * bs * code_size; + + for (size_t level = 0; level < n_levels; level++) { + size_t level_offset = level * cs * bs; + size_t start_byte = level * cs; + + for (size_t ci = 0; ci < cs && (start_byte + ci) < code_size; + ci++) { + codes[batch_offset + level_offset + ci * bs + pos_in_batch] = + code[entry_idx * code_size + start_byte + ci]; + } + } + } +} + +void PanoramaPQ::reconstruct( + idx_t key, + float* recons, + const uint8_t* codes_base) const { + uint8_t* recons_buffer = reinterpret_cast(recons); + const size_t cs = chunk_size; + const size_t bs = batch_size; + + size_t batch_no = key / bs; + size_t pos_in_batch = key % bs; + size_t batch_offset = batch_no * bs * code_size; + + for (size_t level = 0; level < n_levels; level++) { + size_t level_offset = level * cs * bs; + size_t start_byte = level * cs; + + for (size_t ci = 0; ci < cs && (start_byte + ci) < code_size; ci++) { + recons_buffer[start_byte + ci] = + codes_base[batch_offset + level_offset + ci * bs + + pos_in_batch]; + } + } +} + +PanoramaPQ::PanoramaPQ( + size_t d, + size_t code_size, + size_t n_levels, + size_t batch_size, + const ProductQuantizer* pq) + : Panorama(d, code_size, n_levels, batch_size), + pq(pq), + chunk_size(code_size / n_levels), + levels_size(d / n_levels) { + FAISS_THROW_IF_NOT_MSG( + code_size % n_levels == 0, + "PanoramaPQ: code_size must be divisible by n_levels"); + FAISS_THROW_IF_NOT_MSG(pq != nullptr, "PanoramaPQ: pq must not be null"); +} + +void PanoramaPQ::compute_cumulative_sums( + 
float* cumsum_base, + size_t offset, + size_t n_entry, + const uint8_t* code) const { + for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { + size_t current_pos = offset + entry_idx; + size_t batch_no = current_pos / batch_size; + size_t pos_in_batch = current_pos % batch_size; + + // Decode PQ code to float vector. + std::vector vec(d); + pq->decode(code + entry_idx * code_size, vec.data()); + + // Compute suffix sums of squared norms. + std::vector suffix(d + 1, 0.0f); + for (int j = d - 1; j >= 0; j--) { + suffix[j] = suffix[j + 1] + vec[j] * vec[j]; + } + + // Write into batch-oriented layout. + size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1); + for (size_t level = 0; level < n_levels; level++) { + size_t start_idx = level * levels_size; + size_t out_offset = cumsum_batch_offset + level * batch_size + + pos_in_batch; + cumsum_base[out_offset] = start_idx < d + ? std::sqrt(suffix[start_idx]) + : 0.0f; + } + + size_t last_offset = cumsum_batch_offset + n_levels * batch_size + + pos_in_batch; + cumsum_base[last_offset] = 0.0f; + } +} + +} // namespace faiss diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h new file mode 100644 index 0000000000..1ddc0c3897 --- /dev/null +++ b/faiss/impl/PanoramaPQ.h @@ -0,0 +1,212 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace faiss { + +/** + * Panorama for PQ-compressed vectors. + * + * Codes are PQ codes (code_size = M bytes for 8-bit PQ). + * compute_cumulative_sums decodes via PQ then computes suffix norms. + * progressive_filter_batch uses LUT accumulation with panorama_kernels. 
+ */ +struct PanoramaPQ : Panorama { + const ProductQuantizer* pq = nullptr; + size_t chunk_size = 0; + size_t levels_size = 0; + + PanoramaPQ() = default; + PanoramaPQ( + size_t d, + size_t code_size, + size_t n_levels, + size_t batch_size, + const ProductQuantizer* pq); + + void copy_codes_to_level_layout( + uint8_t* codes, + size_t offset, + size_t n_entry, + const uint8_t* code) override; + + void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) + const override; + + void compute_cumulative_sums( + float* cumsum_base, + size_t offset, + size_t n_entry, + const uint8_t* code) const override; + + /// Progressive filtering for PQ codes: processes one batch. + /// + /// Follows the same pattern as PanoramaFlat: initializes exact_distances + /// with squared norms (||r||^2 from stored cum_sums + dis0), then + /// processes the inner-product contribution level-by-level with pruning. + /// The SIMD-optimized process_chunks kernel handles the init phase + /// (precomp table over all M subquantizers) in a single vectorized pass. + /// + /// @param col_codes Column-major codes for this inverted list. + /// @param list_cum_sums Cumulative sums for this inverted list. + /// @param precomp Precomputed table slice for this list. + /// @param sim_table_2 -2 * inner_prod_table (query-specific LUT). + /// @param query_cum_norms Query suffix norms per level. + /// @param coarse_dis Coarse distance (dis0) for this list. + /// @param list_size Total number of vectors in this list. + /// @param batch_no Which batch to process. + /// @param exact_distances [out] Scratch buffer for partial distances. + /// @param active_indices [out] Scratch buffer for survivor indices. + /// @param bitset Scratch buffer for code compression. + /// @param compressed_codes Scratch buffer for compressed codes. + /// @param threshold Current heap threshold for pruning. + /// @param local_stats [out] Accumulated pruning statistics. 
+ /// @return Number of surviving candidates in active_indices. + template + size_t progressive_filter_batch( + const uint8_t* col_codes, + const float* list_cum_sums, + const float* precomp, + const float* sim_table_2, + const float* query_cum_norms, + float coarse_dis, + size_t list_size, + size_t batch_no, + std::vector& exact_distances, + std::vector& active_indices, + std::vector& bitset, + std::vector& compressed_codes, + float threshold, + PanoramaStats& local_stats) const; +}; + +template +size_t PanoramaPQ::progressive_filter_batch( + const uint8_t* col_codes, + const float* list_cum_sums, + const float* precomp, + const float* sim_table_2, + const float* query_cum_norms, + float coarse_dis, + size_t list_size, + size_t batch_no, + std::vector& exact_distances, + std::vector& active_indices, + std::vector& bitset, + std::vector& compressed_codes, + float threshold, + PanoramaStats& local_stats) const { + const size_t bs = batch_size; + const size_t cs = chunk_size; + const size_t M = pq->M; + const size_t ksub = pq->ksub; + + size_t curr_batch_size = std::min(list_size - batch_no * bs, bs); + size_t b_offset = batch_no * bs; + + // Initialize active set. + std::iota( + active_indices.begin(), + active_indices.begin() + curr_batch_size, + b_offset); + std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1); + std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0); + + const uint8_t* batch_codes = col_codes + b_offset * code_size; + + // SIMD init: compute precomp distances for all M subquantizers. + // process_chunks naturally handles column-major codes and does + // cache-friendly 1KB-at-a-time table lookups with AVX-512 gathers. 
+ std::fill( + exact_distances.begin(), + exact_distances.begin() + curr_batch_size, + 0.0f); + panorama_kernels::process_chunks( + M, + bs, + curr_batch_size, + const_cast(precomp), + const_cast(batch_codes), + exact_distances.data()); + + const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1); + + size_t next_num_active = curr_batch_size; + size_t batch_offset = batch_no * bs; + size_t total_active = next_num_active; + + for (size_t level = 0; level < n_levels && next_num_active > 0; level++) { + local_stats.total_dims_scanned += next_num_active; + local_stats.total_dims += total_active; + + size_t level_sim_offset = level * ksub * cs; + + float query_cum_norm = 2 * query_cum_norms[level + 1]; + + const float* cum_sums_level = batch_cums + bs * (level + 1); + const uint8_t* codes_level = batch_codes + bs * cs * level; + + const float* sim_table_level = sim_table_2 + level_sim_offset; + + bool is_sparse = next_num_active < bs / 16; + + size_t num_active_for_filtering = 0; + if (is_sparse) { + for (size_t ci = 0; ci < cs; ci++) { + size_t chunk_off = ci * bs; + const float* chunk_sim = sim_table_level + ci * ksub; + for (size_t i = 0; i < next_num_active; i++) { + size_t real_idx = active_indices[i] - batch_offset; + exact_distances[i] += + chunk_sim[codes_level[chunk_off + real_idx]]; + } + } + num_active_for_filtering = next_num_active; + } else { + auto [cc, na] = panorama_kernels::process_code_compression( + next_num_active, + bs, + cs, + compressed_codes.data(), + bitset.data(), + codes_level); + + panorama_kernels::process_chunks( + cs, + bs, + na, + const_cast(sim_table_level), + cc, + exact_distances.data()); + num_active_for_filtering = na; + } + + next_num_active = panorama_kernels::process_filtering( + num_active_for_filtering, + exact_distances.data(), + active_indices.data(), + const_cast(cum_sums_level), + bitset.data(), + batch_offset, + coarse_dis, + query_cum_norm, + threshold); + } + + return next_num_active; +} + +} // namespace 
faiss From fcb0cdf60ee6fe23da2ae46e8685f85ea37a82a4 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Fri, 20 Mar 2026 05:01:10 +0000 Subject: [PATCH 13/41] BEST COMMIT YET --- benchs/bench_ivfpq_panorama.py | 60 ++++++++++++++++---------------- faiss/IndexIVFPQPanorama.cpp | 7 ++-- faiss/impl/PanoramaPQ.cpp | 31 ++++++++++++++++- faiss/impl/PanoramaPQ.h | 49 +++++++++++++------------- faiss/impl/index_read.cpp | 15 +++++++- faiss/impl/index_write.cpp | 11 +++++- faiss/invlists/InvertedLists.cpp | 34 +++++++++++++++--- faiss/invlists/InvertedLists.h | 2 ++ 8 files changed, 143 insertions(+), 66 deletions(-) diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py index 5ae4fc2152..eafeebb7e8 100644 --- a/benchs/bench_ivfpq_panorama.py +++ b/benchs/bench_ivfpq_panorama.py @@ -17,7 +17,7 @@ def fvecs_read(fname): GIST_DIR = "/datasets/PCA_init" -CACHE_DIR = "/home/lutex/faiss-panorama/index_cache" +CACHE_DIR = "/home/akash/faiss-panorama/index_cache" os.makedirs(CACHE_DIR, exist_ok=True) IVFPQ_CACHE = os.path.join(CACHE_DIR, "ivfpq_10pct.index") @@ -80,35 +80,35 @@ def eval_recall(index, nprobe_val): faiss.omp_set_num_threads(mp.cpu_count()) -# --- IVFPQ baseline (cached) --- -if os.path.exists(IVFPQ_CACHE): - print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True) - t0 = time.time() - ivfpq = faiss.read_index(IVFPQ_CACHE) - print(f" Loaded in {time.time() - t0:.1f}s", flush=True) -else: - print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True) - quantizer = faiss.IndexFlatL2(d) - ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) - t0 = time.time() - ivfpq.train(xt) - print(f" Training took {time.time() - t0:.1f}s", flush=True) - - print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) - faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE) - - t0 = time.time() - ivfpq.add(xb) - print(f" Adding took {time.time() - t0:.1f}s", flush=True) - - print(f" Saving full index to {IVFPQ_CACHE}...", flush=True) - 
faiss.write_index(ivfpq, IVFPQ_CACHE) - -faiss.omp_set_num_threads(1) -print("\n====== IVFPQ baseline", flush=True) -for nprobe in [1, 2, 4, 8, 16]: - ivfpq.nprobe = nprobe - eval_recall(ivfpq, nprobe) +# # --- IVFPQ baseline (cached) --- +# if os.path.exists(IVFPQ_CACHE): +# print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True) +# t0 = time.time() +# ivfpq = faiss.read_index(IVFPQ_CACHE) +# print(f" Loaded in {time.time() - t0:.1f}s", flush=True) +# else: +# print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True) +# quantizer = faiss.IndexFlatL2(d) +# ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) +# t0 = time.time() +# ivfpq.train(xt) +# print(f" Training took {time.time() - t0:.1f}s", flush=True) + +# print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) +# faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE) + +# t0 = time.time() +# ivfpq.add(xb) +# print(f" Adding took {time.time() - t0:.1f}s", flush=True) + +# print(f" Saving full index to {IVFPQ_CACHE}...", flush=True) +# faiss.write_index(ivfpq, IVFPQ_CACHE) + +# faiss.omp_set_num_threads(1) +# print("\n====== IVFPQ baseline", flush=True) +# for nprobe in [1, 2, 4, 8, 16]: +# ivfpq.nprobe = nprobe +# eval_recall(ivfpq, nprobe) # --- IVFPQPanorama (cached) --- faiss.omp_set_num_threads(mp.cpu_count()) diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp index 504958b544..6dcbdad261 100644 --- a/faiss/IndexIVFPQPanorama.cpp +++ b/faiss/IndexIVFPQPanorama.cpp @@ -49,7 +49,7 @@ IndexIVFPQPanorama::IndexIVFPQPanorama( M == code_size, "M must equal code_size for 8-bit PQ"); FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only L2 metric supported"); - auto* pano = new PanoramaPQ(d, code_size, n_levels, batch_size, &pq); + auto* pano = new PanoramaPQ(d, code_size, n_levels, batch_size, &pq, quantizer); this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano); this->own_invlists = own_invlists; } @@ -136,8 +136,7 @@ struct 
IVFPQScannerPanorama : InvertedListScanner { const size_t n_batches = (list_size + bs - 1) / bs; const uint8_t* col_codes = storage->get_codes(list_no); const float* list_cum_sums = storage->get_cum_sums(list_no); - const float* precomp = - index.precomputed_table.data() + list_no * pq.M * pq.ksub; + const float* list_init_dists = storage->get_init_dists(list_no); // Scratch buffers. std::vector exact_distances(bs); @@ -153,7 +152,7 @@ struct IVFPQScannerPanorama : InvertedListScanner { size_t num_active = pano_pq->progressive_filter_batch( col_codes, list_cum_sums, - precomp, + list_init_dists, sim_table_2.data(), query_cum_norms.data(), dis0, diff --git a/faiss/impl/PanoramaPQ.cpp b/faiss/impl/PanoramaPQ.cpp index 832bfaf91d..02e70967b0 100644 --- a/faiss/impl/PanoramaPQ.cpp +++ b/faiss/impl/PanoramaPQ.cpp @@ -71,9 +71,11 @@ PanoramaPQ::PanoramaPQ( size_t code_size, size_t n_levels, size_t batch_size, - const ProductQuantizer* pq) + const ProductQuantizer* pq, + const Index* quantizer) : Panorama(d, code_size, n_levels, batch_size), pq(pq), + quantizer(quantizer), chunk_size(code_size / n_levels), levels_size(d / n_levels) { FAISS_THROW_IF_NOT_MSG( @@ -119,4 +121,31 @@ void PanoramaPQ::compute_cumulative_sums( } } +void PanoramaPQ::compute_init_distances( + float* init_dists_base, + size_t list_no, + size_t offset, + size_t n_entry, + const uint8_t* code) const { + FAISS_THROW_IF_NOT_MSG( + quantizer != nullptr, + "PanoramaPQ: quantizer required for compute_init_distances"); + + std::vector centroid(d); + quantizer->reconstruct(list_no, centroid.data()); + + for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { + std::vector vec(d); + pq->decode(code + entry_idx * code_size, vec.data()); + + float init_dist = 0.0f; + for (size_t j = 0; j < d; j++) { + init_dist += vec[j] * vec[j] + 2 * vec[j] * centroid[j]; + } + + size_t point_idx = offset + entry_idx; + init_dists_base[point_idx] = init_dist; + } +} + } // namespace faiss diff --git 
a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h index 1ddc0c3897..91d443b840 100644 --- a/faiss/impl/PanoramaPQ.h +++ b/faiss/impl/PanoramaPQ.h @@ -7,6 +7,7 @@ #pragma once +#include #include #include #include @@ -26,6 +27,7 @@ namespace faiss { */ struct PanoramaPQ : Panorama { const ProductQuantizer* pq = nullptr; + const Index* quantizer = nullptr; size_t chunk_size = 0; size_t levels_size = 0; @@ -35,7 +37,8 @@ struct PanoramaPQ : Panorama { size_t code_size, size_t n_levels, size_t batch_size, - const ProductQuantizer* pq); + const ProductQuantizer* pq, + const Index* quantizer = nullptr); void copy_codes_to_level_layout( uint8_t* codes, @@ -52,17 +55,25 @@ struct PanoramaPQ : Panorama { size_t n_entry, const uint8_t* code) const override; + /// Precompute per-point init distances: ||r||^2 + 2. + /// Requires quantizer to be set. Layout is flat per-list, + /// padded to batch_size boundaries. + void compute_init_distances( + float* init_dists_base, + size_t list_no, + size_t offset, + size_t n_entry, + const uint8_t* code) const; + /// Progressive filtering for PQ codes: processes one batch. /// - /// Follows the same pattern as PanoramaFlat: initializes exact_distances - /// with squared norms (||r||^2 from stored cum_sums + dis0), then - /// processes the inner-product contribution level-by-level with pruning. - /// The SIMD-optimized process_chunks kernel handles the init phase - /// (precomp table over all M subquantizers) in a single vectorized pass. + /// Initializes exact_distances from precomputed init_dists + /// (||r||^2 + 2), then refines with the query-specific + /// sim_table_2 level-by-level with Cauchy-Schwarz pruning. /// /// @param col_codes Column-major codes for this inverted list. /// @param list_cum_sums Cumulative sums for this inverted list. - /// @param precomp Precomputed table slice for this list. + /// @param init_dists Precomputed init distances for this list. /// @param sim_table_2 -2 * inner_prod_table (query-specific LUT). 
/// @param query_cum_norms Query suffix norms per level. /// @param coarse_dis Coarse distance (dis0) for this list. @@ -79,7 +90,7 @@ struct PanoramaPQ : Panorama { size_t progressive_filter_batch( const uint8_t* col_codes, const float* list_cum_sums, - const float* precomp, + const float* init_dists, const float* sim_table_2, const float* query_cum_norms, float coarse_dis, @@ -97,7 +108,7 @@ template size_t PanoramaPQ::progressive_filter_batch( const uint8_t* col_codes, const float* list_cum_sums, - const float* precomp, + const float* init_dists, const float* sim_table_2, const float* query_cum_norms, float coarse_dis, @@ -111,7 +122,6 @@ size_t PanoramaPQ::progressive_filter_batch( PanoramaStats& local_stats) const { const size_t bs = batch_size; const size_t cs = chunk_size; - const size_t M = pq->M; const size_t ksub = pq->ksub; size_t curr_batch_size = std::min(list_size - batch_no * bs, bs); @@ -127,26 +137,15 @@ size_t PanoramaPQ::progressive_filter_batch( const uint8_t* batch_codes = col_codes + b_offset * code_size; - // SIMD init: compute precomp distances for all M subquantizers. - // process_chunks naturally handles column-major codes and does - // cache-friendly 1KB-at-a-time table lookups with AVX-512 gathers. - std::fill( - exact_distances.begin(), - exact_distances.begin() + curr_batch_size, - 0.0f); - panorama_kernels::process_chunks( - M, - bs, - curr_batch_size, - const_cast(precomp), - const_cast(batch_codes), - exact_distances.data()); + // Load precomputed init distances (||r||^2 + 2). 
+ const float* batch_init = init_dists + b_offset; + std::copy(batch_init, batch_init + curr_batch_size, exact_distances.begin()); const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1); size_t next_num_active = curr_batch_size; size_t batch_offset = batch_no * bs; - size_t total_active = next_num_active; + const size_t total_active = next_num_active; for (size_t level = 0; level < n_levels && next_num_active > 0; level++) { local_stats.total_dims_scanned += next_num_active; diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 7cc9ee9d08..88d07845e8 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -410,6 +410,9 @@ std::unique_ptr read_InvertedLists_up( nlist, code_size, pano); std::vector sizes(nlist); read_ArrayInvertedLists_sizes(f, sizes); + + bool has_init_dists; + READ1(has_init_dists); for (size_t i = 0; i < nlist; i++) { ailp->ids[i].resize(sizes[i]); size_t num_elems = @@ -418,6 +421,9 @@ std::unique_ptr read_InvertedLists_up( ArrayInvertedListsPanorama::kBatchSize; ailp->codes[i].resize(num_elems * code_size); ailp->cum_sums[i].resize(num_elems * (n_levels + 1)); + if (has_init_dists) { + ailp->init_dists[i].resize(num_elems); + } } for (size_t i = 0; i < nlist; i++) { size_t n = sizes[i]; @@ -427,6 +433,12 @@ std::unique_ptr read_InvertedLists_up( read_vector_with_known_size(ailp->ids[i], f, n); read_vector_with_known_size( ailp->cum_sums[i], f, ailp->cum_sums[i].size()); + if (has_init_dists) { + read_vector_with_known_size( + ailp->init_dists[i], + f, + ailp->init_dists[i].size()); + } } } return ailp; @@ -1392,7 +1404,8 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { ivpp->code_size, ivpp->n_levels, ivpp->batch_size, - &ivpp->pq)); + &ivpp->pq, + ivpp->quantizer)); } if (ivpp->is_trained) { ivpp->use_precomputed_table = 1; diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 02d0870bbc..2f6e1d52f7 100644 --- a/faiss/impl/index_write.cpp +++ 
b/faiss/impl/index_write.cpp @@ -284,7 +284,11 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) { } WRITEVECTOR(sizes); - // Write codes, ids, and cum_sums + bool has_init_dists = !ailp->init_dists.empty() && + ailp->init_dists[0].size() > 0; + WRITE1(has_init_dists); + + // Write codes, ids, cum_sums, and optionally init_dists for (size_t i = 0; i < ailp->nlist; i++) { size_t n = ailp->ids[i].size(); if (n > 0) { @@ -292,6 +296,11 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) { WRITEANDCHECK(ailp->ids[i].data(), n); WRITEANDCHECK( ailp->cum_sums[i].data(), ailp->cum_sums[i].size()); + if (has_init_dists) { + WRITEANDCHECK( + ailp->init_dists[i].data(), + ailp->init_dists[i].size()); + } } } } else if ( diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp index 313228301c..b0256d7073 100644 --- a/faiss/invlists/InvertedLists.cpp +++ b/faiss/invlists/InvertedLists.cpp @@ -11,6 +11,7 @@ #include #include +#include #include namespace faiss { @@ -361,6 +362,7 @@ ArrayInvertedListsPanorama::ArrayInvertedListsPanorama( !use_iterator, "Panorama does not support iterators"); cum_sums.resize(nlist); + init_dists.resize(nlist); } const float* ArrayInvertedListsPanorama::get_cum_sums(size_t list_no) const { @@ -368,6 +370,11 @@ const float* ArrayInvertedListsPanorama::get_cum_sums(size_t list_no) const { return cum_sums[list_no].data(); } +const float* ArrayInvertedListsPanorama::get_init_dists(size_t list_no) const { + assert(list_no < nlist); + return init_dists[list_no].data(); +} + size_t ArrayInvertedListsPanorama::add_entries( size_t list_no, size_t n_entry, @@ -381,12 +388,20 @@ size_t ArrayInvertedListsPanorama::add_entries( size_t new_size = o + n_entry; size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize; - codes[list_no].resize(num_batches * kBatchSize * code_size); - cum_sums[list_no].resize(num_batches * kBatchSize * (pano->n_levels + 1)); + size_t padded = num_batches * kBatchSize; + 
codes[list_no].resize(padded * code_size); + cum_sums[list_no].resize(padded * (pano->n_levels + 1)); pano->copy_codes_to_level_layout(codes[list_no].data(), o, n_entry, code); pano->compute_cumulative_sums(cum_sums[list_no].data(), o, n_entry, code); + auto* pano_pq = dynamic_cast(pano.get()); + if (pano_pq) { + init_dists[list_no].resize(padded); + pano_pq->compute_init_distances( + init_dists[list_no].data(), list_no, o, n_entry, code); + } + return o; } @@ -405,14 +420,25 @@ void ArrayInvertedListsPanorama::update_entries( codes[list_no].data(), offset, n_entry, code); pano->compute_cumulative_sums( cum_sums[list_no].data(), offset, n_entry, code); + + auto* pano_pq = dynamic_cast(pano.get()); + if (pano_pq) { + pano_pq->compute_init_distances( + init_dists[list_no].data(), list_no, offset, n_entry, code); + } } void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) { ids[list_no].resize(new_size); size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize; - codes[list_no].resize(num_batches * kBatchSize * code_size); - cum_sums[list_no].resize(num_batches * kBatchSize * (pano->n_levels + 1)); + size_t padded = num_batches * kBatchSize; + codes[list_no].resize(padded * code_size); + cum_sums[list_no].resize(padded * (pano->n_levels + 1)); + + if (init_dists[list_no].size() > 0) { + init_dists[list_no].resize(padded); + } } const uint8_t* ArrayInvertedListsPanorama::get_single_code( diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h index edc29995b9..842344e50c 100644 --- a/faiss/invlists/InvertedLists.h +++ b/faiss/invlists/InvertedLists.h @@ -285,12 +285,14 @@ struct ArrayInvertedLists : InvertedLists { struct ArrayInvertedListsPanorama : ArrayInvertedLists { static constexpr size_t kBatchSize = 128; std::vector> cum_sums; + std::vector> init_dists; std::unique_ptr pano; /// Takes ownership of the provided Panorama*. 
ArrayInvertedListsPanorama(size_t nlist, size_t code_size, Panorama* pano); const float* get_cum_sums(size_t list_no) const; + const float* get_init_dists(size_t list_no) const; size_t add_entries( size_t list_no, From f1ea5963a9153f5a3a8e55827c04b15cdcfe8b7c Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Fri, 20 Mar 2026 05:11:38 +0000 Subject: [PATCH 14/41] Fix levels as well --- faiss/impl/PanoramaPQ.h | 186 +++++++++++++++++++--------------------- 1 file changed, 87 insertions(+), 99 deletions(-) diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h index 91d443b840..fcf2d59c61 100644 --- a/faiss/impl/PanoramaPQ.h +++ b/faiss/impl/PanoramaPQ.h @@ -101,111 +101,99 @@ struct PanoramaPQ : Panorama { std::vector& bitset, std::vector& compressed_codes, float threshold, - PanoramaStats& local_stats) const; -}; + PanoramaStats& local_stats) const { + const size_t bs = batch_size; + const size_t cs = chunk_size; + const size_t ksub = pq->ksub; + + size_t curr_batch_size = std::min(list_size - batch_no * bs, bs); + size_t b_offset = batch_no * bs; + + // Initialize active set. + std::iota( + active_indices.begin(), + active_indices.begin() + curr_batch_size, + b_offset); + std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1); + std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0); + + const uint8_t* batch_codes = col_codes + b_offset * code_size; + + // Load precomputed init distances (||r||^2 + 2). 
+ const float* batch_init = init_dists + b_offset; + std::copy( + batch_init, + batch_init + curr_batch_size, + exact_distances.begin()); + + const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1); + + size_t next_num_active = curr_batch_size; + size_t batch_offset = batch_no * bs; + const size_t total_active = next_num_active; + + local_stats.total_dims += total_active * n_levels; + + for (size_t level = 0; level < n_levels && next_num_active > 0; + level++) { + local_stats.total_dims_scanned += next_num_active; -template -size_t PanoramaPQ::progressive_filter_batch( - const uint8_t* col_codes, - const float* list_cum_sums, - const float* init_dists, - const float* sim_table_2, - const float* query_cum_norms, - float coarse_dis, - size_t list_size, - size_t batch_no, - std::vector& exact_distances, - std::vector& active_indices, - std::vector& bitset, - std::vector& compressed_codes, - float threshold, - PanoramaStats& local_stats) const { - const size_t bs = batch_size; - const size_t cs = chunk_size; - const size_t ksub = pq->ksub; - - size_t curr_batch_size = std::min(list_size - batch_no * bs, bs); - size_t b_offset = batch_no * bs; - - // Initialize active set. - std::iota( - active_indices.begin(), - active_indices.begin() + curr_batch_size, - b_offset); - std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1); - std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0); - - const uint8_t* batch_codes = col_codes + b_offset * code_size; - - // Load precomputed init distances (||r||^2 + 2). 
- const float* batch_init = init_dists + b_offset; - std::copy(batch_init, batch_init + curr_batch_size, exact_distances.begin()); - - const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1); - - size_t next_num_active = curr_batch_size; - size_t batch_offset = batch_no * bs; - const size_t total_active = next_num_active; - - for (size_t level = 0; level < n_levels && next_num_active > 0; level++) { - local_stats.total_dims_scanned += next_num_active; - local_stats.total_dims += total_active; - - size_t level_sim_offset = level * ksub * cs; - - float query_cum_norm = 2 * query_cum_norms[level + 1]; - - const float* cum_sums_level = batch_cums + bs * (level + 1); - const uint8_t* codes_level = batch_codes + bs * cs * level; - - const float* sim_table_level = sim_table_2 + level_sim_offset; - - bool is_sparse = next_num_active < bs / 16; - - size_t num_active_for_filtering = 0; - if (is_sparse) { - for (size_t ci = 0; ci < cs; ci++) { - size_t chunk_off = ci * bs; - const float* chunk_sim = sim_table_level + ci * ksub; - for (size_t i = 0; i < next_num_active; i++) { - size_t real_idx = active_indices[i] - batch_offset; - exact_distances[i] += - chunk_sim[codes_level[chunk_off + real_idx]]; + size_t level_sim_offset = level * ksub * cs; + + float query_cum_norm = 2 * query_cum_norms[level + 1]; + + const float* cum_sums_level = batch_cums + bs * (level + 1); + const uint8_t* codes_level = batch_codes + bs * cs * level; + + const float* sim_table_level = sim_table_2 + level_sim_offset; + + bool is_sparse = next_num_active < bs / 16; + + size_t num_active_for_filtering = 0; + if (is_sparse) { + for (size_t ci = 0; ci < cs; ci++) { + size_t chunk_off = ci * bs; + const float* chunk_sim = sim_table_level + ci * ksub; + for (size_t i = 0; i < next_num_active; i++) { + size_t real_idx = active_indices[i] - batch_offset; + exact_distances[i] += + chunk_sim[codes_level[chunk_off + real_idx]]; + } } + num_active_for_filtering = next_num_active; + } else { + auto 
[cc, na] = panorama_kernels::process_code_compression( + next_num_active, + bs, + cs, + compressed_codes.data(), + bitset.data(), + codes_level); + + panorama_kernels::process_chunks( + cs, + bs, + na, + const_cast(sim_table_level), + cc, + exact_distances.data()); + num_active_for_filtering = na; } - num_active_for_filtering = next_num_active; - } else { - auto [cc, na] = panorama_kernels::process_code_compression( - next_num_active, - bs, - cs, - compressed_codes.data(), + + next_num_active = panorama_kernels::process_filtering( + num_active_for_filtering, + exact_distances.data(), + active_indices.data(), + const_cast(cum_sums_level), bitset.data(), - codes_level); - - panorama_kernels::process_chunks( - cs, - bs, - na, - const_cast(sim_table_level), - cc, - exact_distances.data()); - num_active_for_filtering = na; + batch_offset, + coarse_dis, + query_cum_norm, + threshold); } - next_num_active = panorama_kernels::process_filtering( - num_active_for_filtering, - exact_distances.data(), - active_indices.data(), - const_cast(cum_sums_level), - bitset.data(), - batch_offset, - coarse_dis, - query_cum_norm, - threshold); + return next_num_active; } - - return next_num_active; -} +}; } // namespace faiss From a34ceed5a456762743906cf19e843c35993be701 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Fri, 20 Mar 2026 05:28:00 +0000 Subject: [PATCH 15/41] More cleanup --- faiss/impl/index_read.cpp | 44 ++++++++++++++++++++++---------- faiss/impl/index_write.cpp | 11 +------- faiss/invlists/InvertedLists.cpp | 2 +- 3 files changed, 33 insertions(+), 24 deletions(-) diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 88d07845e8..21d76e35c4 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -411,8 +411,6 @@ std::unique_ptr read_InvertedLists_up( std::vector sizes(nlist); read_ArrayInvertedLists_sizes(f, sizes); - bool has_init_dists; - READ1(has_init_dists); for (size_t i = 0; i < nlist; i++) { ailp->ids[i].resize(sizes[i]); 
size_t num_elems = @@ -421,9 +419,6 @@ std::unique_ptr read_InvertedLists_up( ArrayInvertedListsPanorama::kBatchSize; ailp->codes[i].resize(num_elems * code_size); ailp->cum_sums[i].resize(num_elems * (n_levels + 1)); - if (has_init_dists) { - ailp->init_dists[i].resize(num_elems); - } } for (size_t i = 0; i < nlist; i++) { size_t n = sizes[i]; @@ -433,12 +428,6 @@ std::unique_ptr read_InvertedLists_up( read_vector_with_known_size(ailp->ids[i], f, n); read_vector_with_known_size( ailp->cum_sums[i], f, ailp->cum_sums[i].size()); - if (has_init_dists) { - read_vector_with_known_size( - ailp->init_dists[i], - f, - ailp->init_dists[i].size()); - } } } return ailp; @@ -1399,13 +1388,42 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { auto* storage = dynamic_cast(ivpp->invlists); if (storage) { - storage->pano.reset(new PanoramaPQ( + auto* pano_pq = new PanoramaPQ( ivpp->d, ivpp->code_size, ivpp->n_levels, ivpp->batch_size, &ivpp->pq, - ivpp->quantizer)); + ivpp->quantizer); + storage->pano.reset(pano_pq); + + // Recompute init_dists from stored codes + quantizer. + for (size_t list_no = 0; list_no < ivpp->nlist; list_no++) { + size_t list_size = storage->ids[list_no].size(); + if (list_size == 0) + continue; + size_t padded = + ((list_size + + ArrayInvertedListsPanorama::kBatchSize - 1) / + ArrayInvertedListsPanorama::kBatchSize) * + ArrayInvertedListsPanorama::kBatchSize; + storage->init_dists[list_no].resize(padded); + + // Reconstruct row-major codes, then compute init distances. 
+ std::vector row_code(ivpp->code_size); + for (size_t i = 0; i < list_size; i++) { + pano_pq->reconstruct( + i, + reinterpret_cast(row_code.data()), + storage->codes[list_no].data()); + pano_pq->compute_init_distances( + storage->init_dists[list_no].data(), + list_no, + i, + 1, + row_code.data()); + } + } } if (ivpp->is_trained) { ivpp->use_precomputed_table = 1; diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 2f6e1d52f7..02d0870bbc 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -284,11 +284,7 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) { } WRITEVECTOR(sizes); - bool has_init_dists = !ailp->init_dists.empty() && - ailp->init_dists[0].size() > 0; - WRITE1(has_init_dists); - - // Write codes, ids, cum_sums, and optionally init_dists + // Write codes, ids, and cum_sums for (size_t i = 0; i < ailp->nlist; i++) { size_t n = ailp->ids[i].size(); if (n > 0) { @@ -296,11 +292,6 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) { WRITEANDCHECK(ailp->ids[i].data(), n); WRITEANDCHECK( ailp->cum_sums[i].data(), ailp->cum_sums[i].size()); - if (has_init_dists) { - WRITEANDCHECK( - ailp->init_dists[i].data(), - ailp->init_dists[i].size()); - } } } } else if ( diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp index b0256d7073..e9f137f6b7 100644 --- a/faiss/invlists/InvertedLists.cpp +++ b/faiss/invlists/InvertedLists.cpp @@ -436,7 +436,7 @@ void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) { codes[list_no].resize(padded * code_size); cum_sums[list_no].resize(padded * (pano->n_levels + 1)); - if (init_dists[list_no].size() > 0) { + if (dynamic_cast(pano.get())) { init_dists[list_no].resize(padded); } } From 2a6b0f78c4c7f0a14c4dfc4d1c2c50dd784878f7 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sat, 21 Mar 2026 04:34:14 +0000 Subject: [PATCH 16/41] Batch size --- faiss/IndexIVFFlatPanorama.cpp | 13 ++++++------- 
faiss/impl/index_read.cpp | 16 ++++++---------- faiss/invlists/InvertedLists.cpp | 10 ++++++---- faiss/invlists/InvertedLists.h | 1 - 4 files changed, 18 insertions(+), 22 deletions(-) diff --git a/faiss/IndexIVFFlatPanorama.cpp b/faiss/IndexIVFFlatPanorama.cpp index 5e678be28c..2dc7cd5594 100644 --- a/faiss/IndexIVFFlatPanorama.cpp +++ b/faiss/IndexIVFFlatPanorama.cpp @@ -38,8 +38,7 @@ IndexIVFFlatPanorama::IndexIVFFlatPanorama( // We construct the inverted lists here so that we can use the // level-oriented storage. This does not cause a leak as we constructed // IndexIVF first, with own_invlists set to false. - auto* pano = new PanoramaFlat( - d, n_levels, ArrayInvertedListsPanorama::kBatchSize); + auto* pano = new PanoramaFlat(d, n_levels, 128); this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano); this->own_invlists = own_invlists; } @@ -100,19 +99,19 @@ struct IVFFlatScannerPanorama : InvertedListScanner { ResultHandler& handler) const override { size_t nup = 0; - const size_t n_batches = - (list_size + storage->kBatchSize - 1) / storage->kBatchSize; + const size_t bs = pano_flat->batch_size; + const size_t n_batches = (list_size + bs - 1) / bs; const float* cum_sums_data = storage->get_cum_sums(list_no); - std::vector exact_distances(storage->kBatchSize); - std::vector active_indices(storage->kBatchSize); + std::vector exact_distances(bs); + std::vector active_indices(bs); PanoramaStats local_stats; local_stats.reset(); for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { - size_t batch_start = batch_no * storage->kBatchSize; + size_t batch_start = batch_no * bs; size_t num_active = with_metric_type(metric, [&]() { return pano_flat->progressive_filter_batch( diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 21d76e35c4..e5632c9d84 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -402,10 +402,9 @@ std::unique_ptr read_InvertedLists_up( FAISS_CHECK_DESERIALIZATION_LOOP_LIMIT(nlist, 
"ilpn nlist"); READ1(code_size); READ1(n_levels); + constexpr size_t kFlatBatchSize = 128; auto* pano = new PanoramaFlat( - code_size / sizeof(float), - n_levels, - ArrayInvertedListsPanorama::kBatchSize); + code_size / sizeof(float), n_levels, kFlatBatchSize); auto ailp = std::make_unique( nlist, code_size, pano); std::vector sizes(nlist); @@ -414,9 +413,8 @@ std::unique_ptr read_InvertedLists_up( for (size_t i = 0; i < nlist; i++) { ailp->ids[i].resize(sizes[i]); size_t num_elems = - ((sizes[i] + ArrayInvertedListsPanorama::kBatchSize - 1) / - ArrayInvertedListsPanorama::kBatchSize) * - ArrayInvertedListsPanorama::kBatchSize; + ((sizes[i] + kFlatBatchSize - 1) / kFlatBatchSize) * + kFlatBatchSize; ailp->codes[i].resize(num_elems * code_size); ailp->cum_sums[i].resize(num_elems * (n_levels + 1)); } @@ -1402,11 +1400,9 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { size_t list_size = storage->ids[list_no].size(); if (list_size == 0) continue; + size_t bs = pano_pq->batch_size; size_t padded = - ((list_size + - ArrayInvertedListsPanorama::kBatchSize - 1) / - ArrayInvertedListsPanorama::kBatchSize) * - ArrayInvertedListsPanorama::kBatchSize; + ((list_size + bs - 1) / bs) * bs; storage->init_dists[list_no].resize(padded); // Reconstruct row-major codes, then compute init distances. 
diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp index e9f137f6b7..0cdb7a2b07 100644 --- a/faiss/invlists/InvertedLists.cpp +++ b/faiss/invlists/InvertedLists.cpp @@ -387,8 +387,9 @@ size_t ArrayInvertedListsPanorama::add_entries( memcpy(&ids[list_no][o], ids_in, sizeof(ids_in[0]) * n_entry); size_t new_size = o + n_entry; - size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize; - size_t padded = num_batches * kBatchSize; + size_t bs = pano->batch_size; + size_t num_batches = (new_size + bs - 1) / bs; + size_t padded = num_batches * bs; codes[list_no].resize(padded * code_size); cum_sums[list_no].resize(padded * (pano->n_levels + 1)); @@ -431,8 +432,9 @@ void ArrayInvertedListsPanorama::update_entries( void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) { ids[list_no].resize(new_size); - size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize; - size_t padded = num_batches * kBatchSize; + size_t bs = pano->batch_size; + size_t num_batches = (new_size + bs - 1) / bs; + size_t padded = num_batches * bs; codes[list_no].resize(padded * code_size); cum_sums[list_no].resize(padded * (pano->n_levels + 1)); diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h index 842344e50c..620c0cbbff 100644 --- a/faiss/invlists/InvertedLists.h +++ b/faiss/invlists/InvertedLists.h @@ -283,7 +283,6 @@ struct ArrayInvertedLists : InvertedLists { /// Works with both flat codes (PanoramaFlat) and PQ codes (PanoramaPQ) /// via the virtual Panorama interface. 
struct ArrayInvertedListsPanorama : ArrayInvertedLists { - static constexpr size_t kBatchSize = 128; std::vector> cum_sums; std::vector> init_dists; std::unique_ptr pano; From 53033507f509cfede4d4c810d87ecabdc7169b0d Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sat, 21 Mar 2026 05:01:36 +0000 Subject: [PATCH 17/41] selector --- faiss/IndexIVFPQPanorama.cpp | 33 ++++++++++++++------------- faiss/impl/PanoramaPQ.h | 43 ++++++++++++++++++++++-------------- faiss/python/__init__.py | 1 + 3 files changed, 46 insertions(+), 31 deletions(-) diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp index 6dcbdad261..26c9eccbd3 100644 --- a/faiss/IndexIVFPQPanorama.cpp +++ b/faiss/IndexIVFPQPanorama.cpp @@ -149,21 +149,24 @@ struct IVFPQScannerPanorama : InvertedListScanner { local_stats.reset(); for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { - size_t num_active = pano_pq->progressive_filter_batch( - col_codes, - list_cum_sums, - list_init_dists, - sim_table_2.data(), - query_cum_norms.data(), - dis0, - list_size, - batch_no, - exact_distances, - active_indices, - bitset, - compressed_codes, - distances[0], - local_stats); + size_t num_active = + pano_pq->progressive_filter_batch( + col_codes, + list_cum_sums, + list_init_dists, + sim_table_2.data(), + query_cum_norms.data(), + dis0, + list_size, + batch_no, + ids, + sel, + exact_distances, + active_indices, + bitset, + compressed_codes, + distances[0], + local_stats); // Insert surviving candidates into heap. for (size_t i = 0; i < num_active; i++) { diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h index fcf2d59c61..a8e35a5699 100644 --- a/faiss/impl/PanoramaPQ.h +++ b/faiss/impl/PanoramaPQ.h @@ -79,6 +79,8 @@ struct PanoramaPQ : Panorama { /// @param coarse_dis Coarse distance (dis0) for this list. /// @param list_size Total number of vectors in this list. /// @param batch_no Which batch to process. + /// @param ids ID array for the inverted list. 
+ /// @param sel ID selector for filtering (may be nullptr). /// @param exact_distances [out] Scratch buffer for partial distances. /// @param active_indices [out] Scratch buffer for survivor indices. /// @param bitset Scratch buffer for code compression. @@ -86,7 +88,7 @@ struct PanoramaPQ : Panorama { /// @param threshold Current heap threshold for pruning. /// @param local_stats [out] Accumulated pruning statistics. /// @return Number of surviving candidates in active_indices. - template + template size_t progressive_filter_batch( const uint8_t* col_codes, const float* list_cum_sums, @@ -96,6 +98,8 @@ struct PanoramaPQ : Panorama { float coarse_dis, size_t list_size, size_t batch_no, + const idx_t* ids, + const IDSelector* sel, std::vector& exact_distances, std::vector& active_indices, std::vector& bitset, @@ -109,26 +113,33 @@ struct PanoramaPQ : Panorama { size_t curr_batch_size = std::min(list_size - batch_no * bs, bs); size_t b_offset = batch_no * bs; - // Initialize active set. - std::iota( - active_indices.begin(), - active_indices.begin() + curr_batch_size, - b_offset); - std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1); - std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0); + // Initialize active set with ID-filtered vectors. + std::fill(bitset.begin(), bitset.end(), 0); + size_t num_active = 0; + const float* batch_init = init_dists + b_offset; + for (size_t i = 0; i < curr_batch_size; i++) { + size_t global_idx = b_offset + i; + if (use_sel) { + idx_t id = ids[global_idx]; + if (!sel->is_member(id)) { + continue; + } + } + active_indices[num_active] = global_idx; + exact_distances[num_active] = batch_init[i]; + bitset[i] = 1; + num_active++; + } - const uint8_t* batch_codes = col_codes + b_offset * code_size; + if (num_active == 0) { + return 0; + } - // Load precomputed init distances (||r||^2 + 2). 
- const float* batch_init = init_dists + b_offset; - std::copy( - batch_init, - batch_init + curr_batch_size, - exact_distances.begin()); + const uint8_t* batch_codes = col_codes + b_offset * code_size; const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1); - size_t next_num_active = curr_batch_size; + size_t next_num_active = num_active; size_t batch_offset = batch_no * bs; const size_t total_active = next_num_active; diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py index 05b376efce..1e82912ca4 100644 --- a/faiss/python/__init__.py +++ b/faiss/python/__init__.py @@ -177,6 +177,7 @@ def replacement_function(*args): add_ref_in_constructor(IndexPreTransform, {2: [0, 1], 1: [0]}) add_ref_in_method(IndexPreTransform, 'prepend_transform', 0) add_ref_in_constructor(IndexIVFPQ, 0) +add_ref_in_constructor(IndexIVFPQPanorama, 0) add_ref_in_constructor(IndexIVFPQR, 0) add_ref_in_constructor(IndexIVFPQFastScan, 0) add_ref_in_constructor(IndexIVFResidualQuantizer, 0) From a6e55288c0f7248ce79bb517dcb1d659a5397d3e Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sat, 21 Mar 2026 05:05:08 +0000 Subject: [PATCH 18/41] First pass at tests --- tests/test_ivfpq_panorama.py | 617 +++++++++++++++++++++++++++++++++++ 1 file changed, 617 insertions(+) create mode 100644 tests/test_ivfpq_panorama.py diff --git a/tests/test_ivfpq_panorama.py b/tests/test_ivfpq_panorama.py new file mode 100644 index 0000000000..d8b16a128e --- /dev/null +++ b/tests/test_ivfpq_panorama.py @@ -0,0 +1,617 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +Comprehensive test suite for IndexIVFPQPanorama. + +Panorama is an adaptation of IndexIVFPQ that uses level-oriented storage +and progressive filtering with Cauchy-Schwarz bounds to achieve significant +speedups when combined with PCA or Cayley transforms, with zero loss in +accuracy. 
+ +Paper: https://www.arxiv.org/pdf/2510.00566 + +Constraints specific to IndexIVFPQPanorama: + - Only L2 metric is supported. + - Only 8-bit PQ codes (nbits == 8). + - M must be divisible by n_levels. + - batch_size must be a multiple of 64. + - use_precomputed_table must be 1. +""" + +import unittest + +import faiss +import numpy as np +from faiss.contrib.datasets import SyntheticDataset + + +class TestIndexIVFPQPanorama(unittest.TestCase): + """Test Suite for IndexIVFPQPanorama.""" + + # Helper methods for index creation and data generation + + def generate_data(self, d, nt, nb, nq, seed=42): + ds = SyntheticDataset(d, nt, nb, nq, seed=seed) + return ds.get_train(), ds.get_database(), ds.get_queries() + + def create_ivfpq(self, d, nlist, M, nbits, xt, xb=None, nprobe=None): + """Create and train a standard IndexIVFPQ (L2 only).""" + quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) + index.train(xt) + if xb is not None: + index.add(xb) + if nprobe is not None: + index.nprobe = nprobe + return index + + def create_panorama( + self, d, nlist, M, nbits, n_levels, xt, xb=None, + nprobe=None, batch_size=128, + ): + """Create IndexIVFPQPanorama from a freshly trained IVFPQ. + + Trains a temporary IndexIVFPQ, copies PQ centroids and quantizer + into the Panorama index, then sets up precomputed tables. 
+ """ + quantizer = faiss.IndexFlatL2(d) + trained = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) + trained.train(xt) + + trained.own_fields = False + pano = faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, batch_size, + ) + centroids = faiss.vector_to_array(trained.pq.centroids) + faiss.copy_array_to_vector(centroids, pano.pq.centroids) + pano.is_trained = True + pano.use_precomputed_table = 1 + pano.precompute_table() + + if xb is not None: + pano.add(xb) + if nprobe is not None: + pano.nprobe = nprobe + return pano + + def create_pair( + self, d, nlist, M, nbits, n_levels, xt, xb=None, + nprobe=None, batch_size=128, + ): + """Create an IVFPQ and an IVFPQPanorama sharing the same training. + + Both indexes use the same quantizer centroids and PQ codebook, + so search results should be identical. + """ + quantizer = faiss.IndexFlatL2(d) + trained = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) + trained.train(xt) + + # Build the IVFPQ baseline from the trained state. + ivfpq = faiss.clone_index(trained) + + # Build the Panorama from the same trained state. + trained.own_fields = False + pano = faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, batch_size, + ) + centroids = faiss.vector_to_array(trained.pq.centroids) + faiss.copy_array_to_vector(centroids, pano.pq.centroids) + pano.is_trained = True + pano.use_precomputed_table = 1 + pano.precompute_table() + + if xb is not None: + ivfpq.add(xb) + pano.add(xb) + if nprobe is not None: + ivfpq.nprobe = nprobe + pano.nprobe = nprobe + return ivfpq, pano + + def assert_search_results_equal( + self, + D_regular, + I_regular, + D_panorama, + I_panorama, + rtol=1e-4, + atol=1e-6, + otol=1e-3, + ): + overlap_rate = np.mean(I_regular == I_panorama) + + self.assertGreater( + overlap_rate, + 1 - otol, + f"Overlap rate {overlap_rate:.6f} is not > {1 - otol:.3f}. 
", + ) + np.testing.assert_allclose( + D_regular, + D_panorama, + rtol=rtol, + atol=atol, + err_msg="Distances mismatch", + ) + + # Core functionality tests + + def test_exact_match_with_ivfpq(self): + """Core test: Panorama must return identical results to IndexIVFPQ""" + d, nb, nt, nq = 64, 50000, 60000, 500 + nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 20 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=42) + + for nprobe in [1, 4, 16, 64]: + with self.subTest(nprobe=nprobe): + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_exact_match_with_ivfpq_medium(self): + """Core test: Medium scale version""" + d, nb, nt, nq = 32, 10000, 15000, 200 + nlist, M, nbits, n_levels, k = 32, 8, 8, 4, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=42) + + for nprobe in [1, 4, 8, nlist]: + with self.subTest(nprobe=nprobe): + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + # Parameter variation tests + + def test_different_n_levels(self): + """Test correctness with various n_levels parameter values""" + d, nb, nt, nq = 64, 25000, 40000, 200 + nlist, M, nbits, k = 64, 16, 8, 15 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=456) + + # Train IVFPQ once for the baseline. + ivfpq = self.create_ivfpq(d, nlist, M, nbits, xt, xb, nprobe=16) + D_base, I_base = ivfpq.search(xq, k) + + nt = faiss.omp_get_max_threads() + faiss.omp_set_num_threads(1) + + prev_ratio = float("inf") + # n_levels must divide M=16. 
+ for n_levels in [1, 2, 4, 8, 16]: + with self.subTest(n_levels=n_levels): + faiss.cvar.indexPanorama_stats.reset() + + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=16, + ) + D, I = pano.search(xq, k) + self.assert_search_results_equal(D_base, I_base, D, I) + + ratio = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + self.assertLess(ratio, prev_ratio) + prev_ratio = ratio + + faiss.omp_set_num_threads(nt) + + def test_different_M_and_n_levels(self): + """Test various M / n_levels combinations""" + test_cases = [ + (32, 8, 2), # M=8, n_levels=2, chunk=4 + (64, 16, 4), # M=16, n_levels=4, chunk=4 + (64, 32, 8), # M=32, n_levels=8, chunk=4 + ] + for d, M, n_levels in test_cases: + with self.subTest(d=d, M=M, n_levels=n_levels): + nb, nt, nq, nlist, nbits, k = 10000, 15000, 100, 32, 8, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=789) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=8, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_single_level(self): + """Test edge case with n_levels=1 (no pruning, equivalent to IVFPQ)""" + d, nb, nt, nq = 32, 5000, 7000, 50 + nlist, M, nbits, n_levels, k = 16, 8, 8, 1, 5 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=333) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=4, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_max_levels(self): + """Test edge case with n_levels=M (each level is one subquantizer)""" + d, nb, nt, nq = 64, 5000, 7000, 50 + nlist, M, nbits, n_levels, k = 16, 16, 8, 16, 5 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=444) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, 
xt, xb, nprobe=4, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + # ID selector tests + + def test_id_selector_range(self): + """Test ID filtering with range selector""" + d, nb, nt, nq = 64, 50000, 60000, 300 + nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 20 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=321) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=16, + ) + + params = faiss.SearchParametersIVF() + params.sel = faiss.IDSelectorRange(10000, 30000) + + D_regular, I_regular = ivfpq.search(xq, k, params=params) + D_panorama, I_panorama = pano.search(xq, k, params=params) + + valid = I_panorama[I_panorama >= 0] + self.assertTrue(np.all(valid >= 10000)) + self.assertTrue(np.all(valid < 30000)) + + np.testing.assert_array_equal(I_regular, I_panorama) + np.testing.assert_allclose(D_regular, D_panorama, rtol=1e-4) + + def test_id_selector_batch(self): + """Test ID filtering with batch selector""" + d, nb, nt, nq = 64, 30000, 45000, 200 + nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 20 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=654) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=16, + ) + + allowed_ids = np.array([i * 50 for i in range(500)], dtype=np.int64) + params = faiss.SearchParametersIVF() + params.sel = faiss.IDSelectorBatch(allowed_ids) + + D_regular, I_regular = ivfpq.search(xq, k, params=params) + D_panorama, I_panorama = pano.search(xq, k, params=params) + + allowed_set = set(allowed_ids) | {-1} + for id_val in I_panorama.flatten(): + self.assertIn(int(id_val), allowed_set) + + np.testing.assert_array_equal(I_regular, I_panorama) + np.testing.assert_allclose(D_regular, D_panorama, rtol=1e-4) + + def test_selector_excludes_all(self): + """Test selector that excludes all results""" + d, nb, nt, nq = 32, 3000, 5000, 5 + nlist, M, 
nbits, n_levels, k = 8, 8, 8, 4, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=999) + + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=nlist, + ) + + params = faiss.SearchParametersIVF() + params.sel = faiss.IDSelectorRange(nb + 100, nb + 200) + + D, I = pano.search(xq, k, params=params) + self.assertTrue(np.all(I == -1)) + + # Batch size and edge case tests + + def test_batch_boundaries(self): + """Test correctness at various database sizes relative to batch_size""" + d, nq = 64, 50 + nlist, M, nbits, n_levels, k = 16, 16, 8, 4, 10 + xq = np.random.rand(nq, d).astype("float32") + + batch_size = 128 + test_sizes = [ + batch_size - 1, + batch_size, + batch_size + 1, + batch_size * 2, + batch_size * 3 - 1, + ] + for nb in test_sizes: + with self.subTest(nb=nb): + nt = max(nb, 500) + np.random.seed(987) + xt = np.random.rand(nt, d).astype("float32") + xb = np.random.rand(nb, d).astype("float32") + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, + nprobe=nlist, batch_size=batch_size, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_different_batch_sizes(self): + """Test correctness across different internal batch sizes""" + d, nb, nt, nq = 64, 10000, 15000, 50 + nlist, M, nbits, n_levels, k = 32, 16, 8, 4, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=4242) + + ivfpq = self.create_ivfpq(d, nlist, M, nbits, xt, xb, nprobe=8) + D_base, I_base = ivfpq.search(xq, k) + + for bs in [64, 128, 256, 512, 1024]: + with self.subTest(batch_size=bs): + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, + nprobe=8, batch_size=bs, + ) + D, I = pano.search(xq, k) + self.assert_search_results_equal(D_base, I_base, D, I) + + def test_very_small_dataset(self): + """Test with dataset smaller than batch size""" + test_cases = [10, 50, 100] + + for nb 
in test_cases: + with self.subTest(nb=nb): + d, nlist, M, nbits, n_levels = 32, 4, 4, 8, 2 + nt, nq = max(nb, 1500), 5 + k = min(3, nb) + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=666 + nb) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=nlist, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_single_vector_per_cluster(self): + """Test extreme case where clusters have very few vectors""" + d, nb, nt, nq = 32, 20, 3000, 5 + nlist, M, nbits, n_levels, k = 16, 4, 8, 2, 3 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=1313) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=nlist, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_empty_result_handling(self): + """Test handling of empty search results (shapes only)""" + d, nb, nt, nq = 32, 100, 3000, 10 + nlist, M, nbits, n_levels, k = 8, 4, 8, 2, 10 + xt, xb, _ = self.generate_data(d, nt, nb, nq, seed=111) + xq = np.random.rand(nq, d).astype("float32") + 10.0 + + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=1, + ) + D, I = pano.search(xq, k) + + self.assertEqual(D.shape, (nq, k)) + self.assertEqual(I.shape, (nq, k)) + + # Dynamic operations tests + + def test_incremental_add(self): + """Test adding vectors incrementally in multiple batches""" + d, nt = 64, 20000 + nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 15 + xt = np.random.rand(nt, d).astype("float32") + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, nprobe=16, + ) + + for batch_nb in [5000, 10000, 15000]: + xb_batch = np.random.rand(batch_nb, d).astype("float32") + ivfpq.add(xb_batch) + pano.add(xb_batch) + + nq = 100 + xq = 
np.random.rand(nq, d).astype("float32") + + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_add_search_add_search(self): + """Test interleaved add and search operations""" + d, nt = 32, 500 + nlist, M, nbits, n_levels, k = 8, 8, 8, 4, 5 + np.random.seed(555) + xt = np.random.rand(nt, d).astype("float32") + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, nprobe=4, + ) + + xb1 = np.random.rand(200, d).astype("float32") + ivfpq.add(xb1) + pano.add(xb1) + + xq1 = np.random.rand(10, d).astype("float32") + D_reg_1, I_reg_1 = ivfpq.search(xq1, k) + D_pan_1, I_pan_1 = pano.search(xq1, k) + self.assert_search_results_equal(D_reg_1, I_reg_1, D_pan_1, I_pan_1) + + xb2 = np.random.rand(300, d).astype("float32") + ivfpq.add(xb2) + pano.add(xb2) + + xq2 = np.random.rand(10, d).astype("float32") + D_reg_2, I_reg_2 = ivfpq.search(xq2, k) + D_pan_2, I_pan_2 = pano.search(xq2, k) + self.assert_search_results_equal(D_reg_2, I_reg_2, D_pan_2, I_pan_2) + + # Serialization tests + + def test_serialization(self): + """Test write/read preserves search results""" + d, nb, nt, nq = 64, 10000, 15000, 100 + nlist, M, nbits, n_levels, k = 32, 16, 8, 4, 20 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=2024) + + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=8, + ) + + D_before, I_before = pano.search(xq, k) + pano_after = faiss.deserialize_index(faiss.serialize_index(pano)) + D_after, I_after = pano_after.search(xq, k) + + np.testing.assert_array_equal(I_before, I_after) + np.testing.assert_allclose(D_before, D_after, rtol=1e-5) + + def test_serialization_preserves_params(self): + """Test serialization preserves n_levels and batch_size correctly""" + d, nb, nt, nq = 64, 10000, 15000, 50 + nlist, M, nbits, n_levels, k = 32, 16, 8, 4, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=2025) + 
+ pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=4, + ) + D_before, I_before = pano.search(xq, k) + + pano_after = faiss.deserialize_index( + faiss.serialize_index(pano) + ) + self.assertEqual(pano_after.batch_size, 128) + self.assertEqual(pano_after.n_levels, n_levels) + + D_after, I_after = pano_after.search(xq, k) + np.testing.assert_array_equal(I_before, I_after) + np.testing.assert_allclose(D_before, D_after, rtol=1e-5) + + # Statistics tests + + def test_ratio_dims_scanned(self): + """Test that ratio_dims_scanned is 1.0 at n_levels=1 and strictly + less for higher n_levels. + + Unlike IndexFlatPanorama, PQ quantization error prevents achieving + the ideal 1/n_levels ratio even on synthetic data. We verify that + n_levels=1 gives ratio=1.0 (exhaustive) and that multi-level + pruning is effective (ratio well below 1.0). + """ + d, nb, nt, nq = 64, 25000, 40000, 10 + nlist, M, nbits, k = 32, 16, 8, 1 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=5678) + + nt_threads = faiss.omp_get_max_threads() + faiss.omp_set_num_threads(1) + + faiss.cvar.indexPanorama_stats.reset() + pano_1 = self.create_panorama( + d, nlist, M, nbits, 1, xt, xb, nprobe=8, + ) + pano_1.search(xq, k) + ratio_1 = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + self.assertAlmostEqual(ratio_1, 1.0, places=3) + + faiss.cvar.indexPanorama_stats.reset() + pano_16 = self.create_panorama( + d, nlist, M, nbits, 16, xt, xb, nprobe=8, + ) + pano_16.search(xq, k) + ratio_16 = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + self.assertLess(ratio_16, 0.55) + + faiss.omp_set_num_threads(nt_threads) + + def test_pruning_improves_with_n_levels(self): + """Test that increasing n_levels reduces the fraction scanned""" + d, nb, nt, nq = 64, 25000, 40000, 50 + nlist, M, nbits, k = 32, 16, 8, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=1234) + + nt_threads = faiss.omp_get_max_threads() + faiss.omp_set_num_threads(1) + + prev_ratio = float("inf") + for 
n_levels in [1, 2, 4, 8, 16]: + with self.subTest(n_levels=n_levels): + faiss.cvar.indexPanorama_stats.reset() + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=8, + ) + pano.search(xq, k) + ratio = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + self.assertLessEqual(ratio, prev_ratio) + prev_ratio = ratio + + faiss.omp_set_num_threads(nt_threads) + + # Constraint validation tests + + def test_rejects_non_l2_metric(self): + """Verify that non-L2 metrics are rejected""" + d, nlist, M, nbits, n_levels = 32, 8, 8, 8, 4 + quantizer = faiss.IndexFlatIP(d) + with self.assertRaises(RuntimeError): + faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, 128, + faiss.METRIC_INNER_PRODUCT, + ) + + def test_rejects_invalid_batch_size(self): + """Verify that non-multiple-of-64 batch_size is rejected""" + d, nlist, M, nbits, n_levels = 32, 8, 8, 8, 4 + quantizer = faiss.IndexFlatL2(d) + with self.assertRaises(RuntimeError): + faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, 100, + ) + + def test_rejects_m_not_divisible_by_n_levels(self): + """Verify that M not divisible by n_levels is rejected""" + d, nlist, M, nbits, n_levels = 32, 8, 8, 8, 3 + quantizer = faiss.IndexFlatL2(d) + with self.assertRaises(RuntimeError): + faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, 128, + ) From 0be0e1f09a0351b2105ce5dc514479418ed61751 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sat, 21 Mar 2026 06:01:51 +0000 Subject: [PATCH 19/41] Bench first pass --- benchs/bench_ivfpq_panorama.py | 208 +++++++++++----------------- benchs/bench_ivfpq_panorama_test.py | 173 +++++++++++++++++++++++ faiss/index_factory.cpp | 7 + tests/test_factory.py | 15 ++ tests/test_ivfpq_panorama.py | 2 +- 5 files changed, 277 insertions(+), 128 deletions(-) create mode 100644 benchs/bench_ivfpq_panorama_test.py diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py index eafeebb7e8..7c2c689b83 100644 --- 
a/benchs/bench_ivfpq_panorama.py +++ b/benchs/bench_ivfpq_panorama.py @@ -1,61 +1,60 @@ -# Quick 10% verification of IVFPQPanorama (with index caching) +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. import multiprocessing as mp -import os import time import faiss +import matplotlib.pyplot as plt import numpy as np -print("Compile options:", faiss.get_compile_options(), flush=True) +try: + from faiss.contrib.datasets_fb import DatasetGIST1M +except ImportError: + from faiss.contrib.datasets import DatasetGIST1M +ds = DatasetGIST1M() -def fvecs_read(fname): - a = np.fromfile(fname, dtype="float32") - d = a[0].view("int32") - return a.reshape(-1, d + 1)[:, 1:].copy() +SUBSET = 0.1 # Set to 1.0 for full dataset - -GIST_DIR = "/datasets/PCA_init" -CACHE_DIR = "/home/akash/faiss-panorama/index_cache" -os.makedirs(CACHE_DIR, exist_ok=True) - -IVFPQ_CACHE = os.path.join(CACHE_DIR, "ivfpq_10pct.index") -IVFPQ_TRAINED_CACHE = os.path.join(CACHE_DIR, "ivfpq_trained_10pct.index") -IVFPQ_PANO_CACHE = os.path.join(CACHE_DIR, "ivfpq_pano_10pct.index") - -print("Loading GIST1M data (10% subset)...", flush=True) -xb_full = fvecs_read(os.path.join(GIST_DIR, "gist1m_base.fvecs")) -xq = fvecs_read(os.path.join(GIST_DIR, "gist1m_query.fvecs")) - -nb_full, d = xb_full.shape -nb = nb_full // 10 # 10% = 100000 +xq = ds.get_queries() +xb_full = ds.get_database() +nb_full = xb_full.shape[0] +nb = int(nb_full * SUBSET) xb = xb_full[:nb].copy() del xb_full -nq = xq.shape[0] -print(f"Database: {nb} x {d}, Queries: {nq} x {d}", flush=True) +gt = ds.get_groundtruth() if SUBSET == 1.0 else None +xt = ds.get_train()[:max(nb // 2, 50000)] -xt = xb[:50000].copy() +nb, d = xb.shape +nq = xq.shape[0] +nt = xt.shape[0] k = 10 -M = 960 -nbits = 8 -nlist = 64 -n_levels = 8 -batch_size = 128 - -GT_PATH = os.path.join(CACHE_DIR, "gt_10pct.npy") -if os.path.exists(GT_PATH): 
- gt_I = np.load(GT_PATH) - print(f"Loaded cached ground truth: {gt_I.shape}", flush=True) -else: - print("Computing ground truth on 10% subset...", flush=True) + +if gt is None: + print(f"Computing ground truth for {SUBSET*100:.0f}% subset ({nb} vectors)...") flat = faiss.IndexFlatL2(d) flat.add(xb) - _, gt_I = flat.search(xq, k) - np.save(GT_PATH, gt_I) - print("Ground truth computed and cached.", flush=True) + _, gt = flat.search(xq, k) +else: + gt = gt[:, :k] + +print(f"Database: {nb} x {d}, Queries: {nq}, Train: {nt}") + +M_values = [960, 480, 240] +nbits = 8 +nlist = 128 +n_levels = 16 + + +def get_ivf_index(index): + if isinstance(index, faiss.IndexPreTransform): + return faiss.downcast_index(index.index) + return index def eval_recall(index, nprobe_val): @@ -65,109 +64,64 @@ def eval_recall(index, nprobe_val): t = time.time() - t0 speed = t * 1000 / nq qps = 1000 / speed - corrects = sum(len(set(gt_I[i]) & set(I[i])) for i in range(nq)) + + corrects = (gt == I).sum() recall = corrects / (nq * k) - stats = faiss.cvar.indexPanorama_stats - pct_active = stats.ratio_dims_scanned * 100 + ratio_dims_scanned = faiss.cvar.indexPanorama_stats.ratio_dims_scanned print( f"\tnprobe {nprobe_val:3d}, Recall@{k}: " f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}, " - f"active: {pct_active:.1f}%", - flush=True, + f"dims scanned: {ratio_dims_scanned * 100:.1f}%" ) + return recall, qps -faiss.omp_set_num_threads(mp.cpu_count()) +def build_index(name): + index = faiss.index_factory(d, name) -# # --- IVFPQ baseline (cached) --- -# if os.path.exists(IVFPQ_CACHE): -# print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True) -# t0 = time.time() -# ivfpq = faiss.read_index(IVFPQ_CACHE) -# print(f" Loaded in {time.time() - t0:.1f}s", flush=True) -# else: -# print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True) -# quantizer = faiss.IndexFlatL2(d) -# ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) -# t0 = time.time() -# 
ivfpq.train(xt) -# print(f" Training took {time.time() - t0:.1f}s", flush=True) + faiss.omp_set_num_threads(mp.cpu_count()) + index.train(xt) + index.add(xb) -# print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) -# faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE) + return index -# t0 = time.time() -# ivfpq.add(xb) -# print(f" Adding took {time.time() - t0:.1f}s", flush=True) -# print(f" Saving full index to {IVFPQ_CACHE}...", flush=True) -# faiss.write_index(ivfpq, IVFPQ_CACHE) +def eval_and_plot(name, label=None): + index = build_index(name) + ivf_index = get_ivf_index(index) -# faiss.omp_set_num_threads(1) -# print("\n====== IVFPQ baseline", flush=True) -# for nprobe in [1, 2, 4, 8, 16]: -# ivfpq.nprobe = nprobe -# eval_recall(ivfpq, nprobe) + faiss.omp_set_num_threads(1) -# --- IVFPQPanorama (cached) --- -faiss.omp_set_num_threads(mp.cpu_count()) + data = [] + print(f"====== {label or name}") + for nprobe in nprobes: + ivf_index.nprobe = nprobe + recall, qps = eval_recall(index, nprobe) + data.append((recall, qps)) -if os.path.exists(IVFPQ_PANO_CACHE): - print(f"\nLoading cached IVFPQPanorama from {IVFPQ_PANO_CACHE}...", flush=True) - t0 = time.time() - ivfpq_pano = faiss.read_index(IVFPQ_PANO_CACHE) - print(f" Loaded in {time.time() - t0:.1f}s", flush=True) -else: - def build_panorama_from_trained(trained_index): - quantizer2 = trained_index.quantizer - trained_index.own_fields = False - - pano = faiss.IndexIVFPQPanorama( - quantizer2, d, nlist, M, nbits, n_levels, batch_size - ) - centroids = faiss.vector_to_array(trained_index.pq.centroids) - faiss.copy_array_to_vector(centroids, pano.pq.centroids) - pano.is_trained = True - pano.use_precomputed_table = 1 - pano.precompute_table() - return pano - - if os.path.exists(IVFPQ_TRAINED_CACHE): - print( - f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...", - flush=True, - ) - trained = faiss.read_index(IVFPQ_TRAINED_CACHE) - ivfpq_pano = build_panorama_from_trained(trained) - 
print(" Reused trained PQ (skipped training).", flush=True) - else: - print( - f"\nTraining IVFPQ for Panorama from scratch: nlist={nlist}, M={M}, nbits={nbits}", - flush=True, - ) - quantizer2 = faiss.IndexFlatL2(d) - trained = faiss.IndexIVFPQ(quantizer2, d, nlist, M, nbits) - t0 = time.time() - trained.train(xt) - print(f" Training took {time.time() - t0:.1f}s", flush=True) - - print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) - faiss.write_index(trained, IVFPQ_TRAINED_CACHE) - - ivfpq_pano = build_panorama_from_trained(trained) + data = np.array(data) + plt.plot(data[:, 0], data[:, 1], "o-", label=label or name) - t0 = time.time() - ivfpq_pano.add(xb) - print(f" Adding took {time.time() - t0:.1f}s", flush=True) - print(f" Saving IVFPQPanorama to {IVFPQ_PANO_CACHE}...", flush=True) - faiss.write_index(ivfpq_pano, IVFPQ_PANO_CACHE) +nprobes = [1, 2, 4, 8, 16, 32, 64] + +plt.figure(figsize=(10, 7), dpi=80) -faiss.omp_set_num_threads(1) -print("\n====== IVFPQPanorama", flush=True) -for nprobe in [1, 2, 4, 8, 16]: - ivfpq_pano.nprobe = nprobe - eval_recall(ivfpq_pano, nprobe) +for M in M_values: + eval_and_plot( + f"IVF{nlist},PQ{M}x{nbits}", + label=f"IVFPQ (M={M})", + ) + eval_and_plot( + f"PCA{d},IVF{nlist},PQ{M}x{nbits}Panorama{n_levels}", + label=f"PCA + IVFPQPanorama (M={M})", + ) -print("\nVerification complete!", flush=True) +plt.title(f"IVFPQ Panorama on GIST1M (nlist={nlist})") +plt.xlabel(f"Recall@{k}") +plt.ylabel("QPS") +plt.yscale("log") +plt.legend(bbox_to_anchor=(1.02, 0.1), loc="upper left", borderaxespad=0) +plt.savefig("bench_ivfpq_panorama.png", bbox_inches="tight") +print("\nBenchmark complete! 
Plot saved to bench_ivfpq_panorama.png") diff --git a/benchs/bench_ivfpq_panorama_test.py b/benchs/bench_ivfpq_panorama_test.py new file mode 100644 index 0000000000..38fe14614c --- /dev/null +++ b/benchs/bench_ivfpq_panorama_test.py @@ -0,0 +1,173 @@ +# Quick 10% verification of IVFPQPanorama (with index caching) + +import multiprocessing as mp +import os +import time + +import faiss +import numpy as np + +print("Compile options:", faiss.get_compile_options(), flush=True) + + +def fvecs_read(fname): + a = np.fromfile(fname, dtype="float32") + d = a[0].view("int32") + return a.reshape(-1, d + 1)[:, 1:].copy() + + +GIST_DIR = "/datasets/PCA_init" +CACHE_DIR = "/home/akash/faiss-panorama/index_cache" +os.makedirs(CACHE_DIR, exist_ok=True) + +IVFPQ_CACHE = os.path.join(CACHE_DIR, "ivfpq_10pct.index") +IVFPQ_TRAINED_CACHE = os.path.join(CACHE_DIR, "ivfpq_trained_10pct.index") +IVFPQ_PANO_CACHE = os.path.join(CACHE_DIR, "ivfpq_pano_10pct.index") + +print("Loading GIST1M data (10% subset)...", flush=True) +xb_full = fvecs_read(os.path.join(GIST_DIR, "gist1m_base.fvecs")) +xq = fvecs_read(os.path.join(GIST_DIR, "gist1m_query.fvecs")) + +nb_full, d = xb_full.shape +nb = nb_full // 10 # 10% = 100000 +xb = xb_full[:nb].copy() +del xb_full + +nq = xq.shape[0] +print(f"Database: {nb} x {d}, Queries: {nq} x {d}", flush=True) + +xt = xb[:50000].copy() + +k = 10 +M = 960 +nbits = 8 +nlist = 64 +n_levels = 16 +batch_size = 128 + +GT_PATH = os.path.join(CACHE_DIR, "gt_10pct.npy") +if os.path.exists(GT_PATH): + gt_I = np.load(GT_PATH) + print(f"Loaded cached ground truth: {gt_I.shape}", flush=True) +else: + print("Computing ground truth on 10% subset...", flush=True) + flat = faiss.IndexFlatL2(d) + flat.add(xb) + _, gt_I = flat.search(xq, k) + np.save(GT_PATH, gt_I) + print("Ground truth computed and cached.", flush=True) + + +def eval_recall(index, nprobe_val): + faiss.cvar.indexPanorama_stats.reset() + t0 = time.time() + _, I = index.search(xq, k=k) + t = time.time() - t0 + speed 
= t * 1000 / nq + qps = 1000 / speed + corrects = sum(len(set(gt_I[i]) & set(I[i])) for i in range(nq)) + recall = corrects / (nq * k) + stats = faiss.cvar.indexPanorama_stats + pct_active = stats.ratio_dims_scanned * 100 + print( + f"\tnprobe {nprobe_val:3d}, Recall@{k}: " + f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}, " + f"active: {pct_active:.1f}%", + flush=True, + ) + return recall, qps + + +faiss.omp_set_num_threads(mp.cpu_count()) + +# # --- IVFPQ baseline (cached) --- +# if os.path.exists(IVFPQ_CACHE): +# print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True) +# t0 = time.time() +# ivfpq = faiss.read_index(IVFPQ_CACHE) +# print(f" Loaded in {time.time() - t0:.1f}s", flush=True) +# else: +# print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True) +# quantizer = faiss.IndexFlatL2(d) +# ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) +# t0 = time.time() +# ivfpq.train(xt) +# print(f" Training took {time.time() - t0:.1f}s", flush=True) + +# print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) +# faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE) + +# t0 = time.time() +# ivfpq.add(xb) +# print(f" Adding took {time.time() - t0:.1f}s", flush=True) + +# print(f" Saving full index to {IVFPQ_CACHE}...", flush=True) +# faiss.write_index(ivfpq, IVFPQ_CACHE) + +# faiss.omp_set_num_threads(1) +# print("\n====== IVFPQ baseline", flush=True) +# for nprobe in [1, 2, 4, 8, 16]: +# ivfpq.nprobe = nprobe +# eval_recall(ivfpq, nprobe) + +# --- IVFPQPanorama (cached) --- +faiss.omp_set_num_threads(mp.cpu_count()) + +if os.path.exists(IVFPQ_PANO_CACHE): + print(f"\nLoading cached IVFPQPanorama from {IVFPQ_PANO_CACHE}...", flush=True) + t0 = time.time() + ivfpq_pano = faiss.read_index(IVFPQ_PANO_CACHE) + print(f" Loaded in {time.time() - t0:.1f}s", flush=True) +else: + def build_panorama_from_trained(trained_index): + quantizer2 = trained_index.quantizer + trained_index.own_fields = False + + pano = 
faiss.IndexIVFPQPanorama( + quantizer2, d, nlist, M, nbits, n_levels, batch_size + ) + centroids = faiss.vector_to_array(trained_index.pq.centroids) + faiss.copy_array_to_vector(centroids, pano.pq.centroids) + pano.is_trained = True + pano.use_precomputed_table = 1 + pano.precompute_table() + return pano + + if os.path.exists(IVFPQ_TRAINED_CACHE): + print( + f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...", + flush=True, + ) + trained = faiss.read_index(IVFPQ_TRAINED_CACHE) + ivfpq_pano = build_panorama_from_trained(trained) + print(" Reused trained PQ (skipped training).", flush=True) + else: + print( + f"\nTraining IVFPQ for Panorama from scratch: nlist={nlist}, M={M}, nbits={nbits}", + flush=True, + ) + quantizer2 = faiss.IndexFlatL2(d) + trained = faiss.IndexIVFPQ(quantizer2, d, nlist, M, nbits) + t0 = time.time() + trained.train(xt) + print(f" Training took {time.time() - t0:.1f}s", flush=True) + + print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) + faiss.write_index(trained, IVFPQ_TRAINED_CACHE) + + ivfpq_pano = build_panorama_from_trained(trained) + + t0 = time.time() + ivfpq_pano.add(xb) + print(f" Adding took {time.time() - t0:.1f}s", flush=True) + + print(f" Saving IVFPQPanorama to {IVFPQ_PANO_CACHE}...", flush=True) + faiss.write_index(ivfpq_pano, IVFPQ_PANO_CACHE) + +faiss.omp_set_num_threads(1) +print("\n====== IVFPQPanorama", flush=True) +for nprobe in [1, 2, 4, 8, 16]: + ivfpq_pano.nprobe = nprobe + eval_recall(ivfpq_pano, nprobe) + +print("\nVerification complete!", flush=True) diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp index fb4c442440..22097c96d7 100644 --- a/faiss/index_factory.cpp +++ b/faiss/index_factory.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -354,6 +355,12 @@ IndexIVF* parse_IndexIVF( /*by_residual=*/true, own_il); } + if (match("PQ([0-9]+)(x[0-9]+)?Panorama([0-9]+)?")) { + int M = mres_to_int(sm[1]), nbit = mres_to_int(sm[2], 8, 
1); + int nlevels = mres_to_int(sm[3], 8); + return new IndexIVFPQPanorama( + get_q(), d, nlist, M, nbit, nlevels, 128, mt, own_il); + } if (match("PQ([0-9]+)(x[0-9]+)?(np)?")) { int M = mres_to_int(sm[1]), nbit = mres_to_int(sm[2], 8, 1); IndexIVFPQ* index_ivf = diff --git a/tests/test_factory.py b/tests/test_factory.py index 2246eb8c10..922ab14cf0 100644 --- a/tests/test_factory.py +++ b/tests/test_factory.py @@ -70,6 +70,21 @@ def test_factory_6(self): assert index.d == 128 assert index.metric_type == faiss.METRIC_L2 + def test_factory_panorama(self): + index = faiss.index_factory(64, "IVF16,PQ16x8Panorama4") + assert isinstance(index, faiss.IndexIVFPQPanorama) + assert index.n_levels == 4 + assert index.pq.M == 16 + + index = faiss.index_factory(64, "IVF16,PQ16Panorama") + assert isinstance(index, faiss.IndexIVFPQPanorama) + assert index.n_levels == 8 # default + + index = faiss.index_factory(64, "PCA64,IVF16,PQ16x8Panorama4") + ivf = faiss.downcast_index(index.index) + assert isinstance(ivf, faiss.IndexIVFPQPanorama) + assert ivf.n_levels == 4 + def test_factory_HNSW(self): index = faiss.index_factory(12, "HNSW32") assert index.storage.sa_code_size() == 12 * 4 diff --git a/tests/test_ivfpq_panorama.py b/tests/test_ivfpq_panorama.py index d8b16a128e..d2e28d78d6 100644 --- a/tests/test_ivfpq_panorama.py +++ b/tests/test_ivfpq_panorama.py @@ -559,7 +559,7 @@ def test_ratio_dims_scanned(self): ) pano_16.search(xq, k) ratio_16 = faiss.cvar.indexPanorama_stats.ratio_dims_scanned - self.assertLess(ratio_16, 0.55) + self.assertLess(ratio_16, 0.6) faiss.omp_set_num_threads(nt_threads) From 267ced5e2fdf5af17caae8f76013404a1fe41466 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sat, 21 Mar 2026 06:42:18 +0000 Subject: [PATCH 20/41] Alexis genius idea v1 --- benchs/bench_ivfpq_panorama.py | 100 +++++++++++++++++++++++++-------- 1 file changed, 77 insertions(+), 23 deletions(-) diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py index 
7c2c689b83..e761642a14 100644 --- a/benchs/bench_ivfpq_panorama.py +++ b/benchs/bench_ivfpq_panorama.py @@ -9,6 +9,8 @@ import faiss import matplotlib.pyplot as plt import numpy as np +from scipy.linalg import block_diag +from sklearn.decomposition import PCA try: from faiss.contrib.datasets_fb import DatasetGIST1M @@ -49,6 +51,7 @@ nbits = 8 nlist = 128 n_levels = 16 +nprobes = [1, 2, 4, 8, 16, 32, 64] def get_ivf_index(index): @@ -77,46 +80,97 @@ def eval_recall(index, nprobe_val): return recall, qps -def build_index(name): - index = faiss.index_factory(d, name) - - faiss.omp_set_num_threads(mp.cpu_count()) - index.train(xt) - index.add(xb) - - return index - - -def eval_and_plot(name, label=None): - index = build_index(name) +def eval_index(index, label): ivf_index = get_ivf_index(index) faiss.omp_set_num_threads(1) data = [] - print(f"====== {label or name}") + print(f"====== {label}") for nprobe in nprobes: ivf_index.nprobe = nprobe recall, qps = eval_recall(index, nprobe) data.append((recall, qps)) data = np.array(data) - plt.plot(data[:, 0], data[:, 1], "o-", label=label or name) + plt.plot(data[:, 0], data[:, 1], "o-", label=label) -nprobes = [1, 2, 4, 8, 16, 32, 64] +def build_ivfpq(M): + """Build vanilla IVFPQ (no transform) via index_factory.""" + index = faiss.index_factory(d, f"IVF{nlist},PQ{M}x{nbits}") + faiss.omp_set_num_threads(mp.cpu_count()) + index.train(xt) + index.add(xb) + return index + + +def make_pca_level_rotation_transform(xt, n_levels, seed=77): + """Build a fused PCA + per-level random rotation as a LinearTransform. + + FAISS LinearTransform applies: y = A_stored @ x + b (column-vector) + We want: y = R_block @ P @ (x - mean) + 1. Center x + 2. PCA project (P @ x_centered) + 3. 
Per-level rotation (R_block @ z_pca) + + So: A_stored = R_block @ P, b = -A_stored @ mean + """ + pca = PCA(n_components=d) + pca.fit(xt) + + P = pca.components_.astype(np.float32) # (d, d) + mean = pca.mean_.astype(np.float32) # (d,) + + block_size = d // n_levels + rng = np.random.RandomState(seed) + blocks = [] + for _ in range(n_levels): + H = rng.randn(block_size, block_size).astype(np.float32) + Q, R = np.linalg.qr(H) + Q *= np.sign(np.diag(R))[:, None] + blocks.append(Q) + A = block_diag(*blocks).astype(np.float32) # (d, d) + + combined = A @ P # (d, d) -- rotation AFTER PCA + + lt = faiss.LinearTransform(d, d, True) + faiss.copy_array_to_vector(combined.ravel(), lt.A) + faiss.copy_array_to_vector(-(combined @ mean).ravel(), lt.b) + lt.is_trained = True + lt.have_bias = True + + return lt + + +def build_ivfpq_panorama(M, n_levels): + """Build PCA + LevelRotation + IVFPQPanorama.""" + lt = make_pca_level_rotation_transform(xt, n_levels) + + quantizer = faiss.IndexFlatL2(d) + ivfpq_pano = faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, + ) + + index = faiss.IndexPreTransform(lt, ivfpq_pano) + + faiss.omp_set_num_threads(mp.cpu_count()) + index.train(xt) + index.add(xb) + + return index + plt.figure(figsize=(10, 7), dpi=80) for M in M_values: - eval_and_plot( - f"IVF{nlist},PQ{M}x{nbits}", - label=f"IVFPQ (M={M})", - ) - eval_and_plot( - f"PCA{d},IVF{nlist},PQ{M}x{nbits}Panorama{n_levels}", - label=f"PCA + IVFPQPanorama (M={M})", - ) + ivfpq = build_ivfpq(M) + eval_index(ivfpq, label=f"IVFPQ (M={M})") + del ivfpq + + pano = build_ivfpq_panorama(M, n_levels) + eval_index(pano, label=f"PCA+Rot + IVFPQPanorama (M={M})") + del pano plt.title(f"IVFPQ Panorama on GIST1M (nlist={nlist})") plt.xlabel(f"Recall@{k}") From 46d4445ddbfde100f13c2ff47996dc4e8d4d5859 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sat, 21 Mar 2026 07:49:53 +0000 Subject: [PATCH 21/41] Alexis v2 --- benchs/bench_ivfpq_panorama.py | 161 +++++++++++++++++++++++++++------ 1 
file changed, 135 insertions(+), 26 deletions(-) diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py index e761642a14..806c56fcc4 100644 --- a/benchs/bench_ivfpq_panorama.py +++ b/benchs/bench_ivfpq_panorama.py @@ -47,6 +47,7 @@ print(f"Database: {nb} x {d}, Queries: {nq}, Train: {nt}") +ALPHA = 8 M_values = [960, 480, 240] nbits = 8 nlist = 128 @@ -105,36 +106,144 @@ def build_ivfpq(M): return index -def make_pca_level_rotation_transform(xt, n_levels, seed=77): - """Build a fused PCA + per-level random rotation as a LinearTransform. +def compute_level_energies(variances, n_levels, block_size): + """Sum per-dimension variances into per-level total energies.""" + return np.array([ + np.sum(variances[l * block_size : (l + 1) * block_size]) + for l in range(n_levels) + ]) - FAISS LinearTransform applies: y = A_stored @ x + b (column-vector) - We want: y = R_block @ P @ (x - mean) - 1. Center x - 2. PCA project (P @ x_centered) - 3. Per-level rotation (R_block @ z_pca) - So: A_stored = R_block @ P, b = -A_stored @ mean +def find_n_spill(variances, level_start, block_size, max_energy_per_level, d): + """Find the smallest number of extra dimensions to spill into. + + After a random rotation over (block_size + n_spill) dims, each dim gets + uniform expected energy. The level's expected energy becomes: + block_size * total_subspace_energy / (block_size + n_spill) + + Returns the smallest n_spill >= 1 where this is <= max_energy_per_level, + or all remaining dims if the cap can't be reached. 
""" - pca = PCA(n_components=d) - pca.fit(xt) + level_end = level_start + block_size + max_extra = d - level_end + if max_extra == 0: + return 0 + + total = np.sum(variances[level_start:level_end]) + for n in range(1, max_extra + 1): + total += variances[level_end + n - 1] + if block_size * total / (block_size + n) <= max_energy_per_level: + return n + + return max_extra + + +def random_orthogonal(size, rng): + """Haar-distributed random orthogonal matrix via QR of Gaussian.""" + H = rng.randn(size, size).astype(np.float32) + Q, R = np.linalg.qr(H) + Q *= np.sign(np.diag(R))[:, None] + return Q + + +def build_energy_spill_rotation(eigenvalues, n_levels, block_size, + alpha, seed=42): + """Orthogonal matrix that caps per-level energy via localized rotations. + + Iterates over levels sequentially. When a level's effective energy + exceeds alpha * avg_energy_per_level, applies a random rotation spanning + that level plus enough subsequent dimensions to bring the expected level + energy down to the cap. + + Variances are tracked analytically: after each rotation the dims in the + rotated subspace are set to uniform expected variance. + + Returns (spill_rotation, effective_variances). 
+ """ + d = len(eigenvalues) + total_energy = float(np.sum(eigenvalues)) + max_energy_per_level = alpha * total_energy / n_levels + + variances = eigenvalues.astype(np.float32).copy() + spill_matrix = np.eye(d, dtype=np.float32) + rng = np.random.RandomState(seed) + + for level in range(n_levels): + start = level * block_size + end = start + block_size + level_energy = float(np.sum(variances[start:end])) + + if level_energy <= max_energy_per_level: + continue + + n_spill = find_n_spill( + variances, start, block_size, max_energy_per_level, d, + ) + if n_spill == 0: + continue - P = pca.components_.astype(np.float32) # (d, d) - mean = pca.mean_.astype(np.float32) # (d,) + sub_end = end + n_spill + Q = random_orthogonal(block_size + n_spill, rng) - block_size = d // n_levels + full_Q = np.eye(d, dtype=np.float32) + full_Q[start:sub_end, start:sub_end] = Q + spill_matrix = full_Q @ spill_matrix + + avg_var = float(np.sum(variances[start:sub_end])) / (block_size + n_spill) + variances[start:sub_end] = avg_var + + return spill_matrix, variances + + +def build_level_equalization_rotation(d, n_levels, block_size, seed=77): + """Block-diagonal random rotation for within-level energy equalization.""" rng = np.random.RandomState(seed) - blocks = [] - for _ in range(n_levels): - H = rng.randn(block_size, block_size).astype(np.float32) - Q, R = np.linalg.qr(H) - Q *= np.sign(np.diag(R))[:, None] - blocks.append(Q) - A = block_diag(*blocks).astype(np.float32) # (d, d) + blocks = [random_orthogonal(block_size, rng) for _ in range(n_levels)] + return block_diag(*blocks).astype(np.float32) + + +def print_energy_diagnostics(eigenvalues, effective_variances, n_levels, + block_size, alpha): + """Print per-level energy before/after the spill transform.""" + before = compute_level_energies(eigenvalues, n_levels, block_size) + after = compute_level_energies(effective_variances, n_levels, block_size) + total = float(np.sum(eigenvalues)) + cap = alpha * total / n_levels + + +def 
make_pca_level_rotation_transform(xt, n_levels, alpha=ALPHA, seed=77): + """Build PCA + energy-spill + per-level rotation as one LinearTransform. + + Pipeline: y = R_eq @ R_spill @ P @ (x - mean) + 1. Center + PCA project (P, mean) + 2. Energy spill across levels (R_spill) + 3. Within-level equalization (R_eq, block-diagonal) + + Stored as: A = R_eq @ R_spill @ P, b = -A @ mean + """ + dim = xt.shape[1] + block_size = dim // n_levels + + pca = PCA(n_components=dim) + pca.fit(xt) + P = pca.components_.astype(np.float32) + mean = pca.mean_.astype(np.float32) + eigenvalues = pca.explained_variance_.astype(np.float32) + + R_spill, effective_variances = build_energy_spill_rotation( + eigenvalues, n_levels, block_size, alpha, seed=seed, + ) + print_energy_diagnostics( + eigenvalues, effective_variances, n_levels, block_size, alpha, + ) + + R_eq = build_level_equalization_rotation( + dim, n_levels, block_size, seed=seed + 1, + ) - combined = A @ P # (d, d) -- rotation AFTER PCA + combined = (R_eq @ R_spill @ P).astype(np.float32) - lt = faiss.LinearTransform(d, d, True) + lt = faiss.LinearTransform(dim, dim, True) faiss.copy_array_to_vector(combined.ravel(), lt.A) faiss.copy_array_to_vector(-(combined @ mean).ravel(), lt.b) lt.is_trained = True @@ -143,9 +252,9 @@ def make_pca_level_rotation_transform(xt, n_levels, seed=77): return lt -def build_ivfpq_panorama(M, n_levels): - """Build PCA + LevelRotation + IVFPQPanorama.""" - lt = make_pca_level_rotation_transform(xt, n_levels) +def build_ivfpq_panorama(M, n_levels, alpha=ALPHA): + """Build PCA + EnergySpill + LevelRotation + IVFPQPanorama.""" + lt = make_pca_level_rotation_transform(xt, n_levels, alpha=alpha) quantizer = faiss.IndexFlatL2(d) ivfpq_pano = faiss.IndexIVFPQPanorama( @@ -169,7 +278,7 @@ def build_ivfpq_panorama(M, n_levels): del ivfpq pano = build_ivfpq_panorama(M, n_levels) - eval_index(pano, label=f"PCA+Rot + IVFPQPanorama (M={M})") + eval_index(pano, label=f"PCA+Spill+Rot + IVFPQPanorama (M={M})") del 
pano plt.title(f"IVFPQ Panorama on GIST1M (nlist={nlist})") From b815ea5c84db387950bdd63ce542d13df41e7f0f Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Sat, 21 Mar 2026 08:24:58 +0000 Subject: [PATCH 22/41] Remove side bench --- benchs/bench_ivfpq_panorama_test.py | 173 ---------------------------- 1 file changed, 173 deletions(-) delete mode 100644 benchs/bench_ivfpq_panorama_test.py diff --git a/benchs/bench_ivfpq_panorama_test.py b/benchs/bench_ivfpq_panorama_test.py deleted file mode 100644 index 38fe14614c..0000000000 --- a/benchs/bench_ivfpq_panorama_test.py +++ /dev/null @@ -1,173 +0,0 @@ -# Quick 10% verification of IVFPQPanorama (with index caching) - -import multiprocessing as mp -import os -import time - -import faiss -import numpy as np - -print("Compile options:", faiss.get_compile_options(), flush=True) - - -def fvecs_read(fname): - a = np.fromfile(fname, dtype="float32") - d = a[0].view("int32") - return a.reshape(-1, d + 1)[:, 1:].copy() - - -GIST_DIR = "/datasets/PCA_init" -CACHE_DIR = "/home/akash/faiss-panorama/index_cache" -os.makedirs(CACHE_DIR, exist_ok=True) - -IVFPQ_CACHE = os.path.join(CACHE_DIR, "ivfpq_10pct.index") -IVFPQ_TRAINED_CACHE = os.path.join(CACHE_DIR, "ivfpq_trained_10pct.index") -IVFPQ_PANO_CACHE = os.path.join(CACHE_DIR, "ivfpq_pano_10pct.index") - -print("Loading GIST1M data (10% subset)...", flush=True) -xb_full = fvecs_read(os.path.join(GIST_DIR, "gist1m_base.fvecs")) -xq = fvecs_read(os.path.join(GIST_DIR, "gist1m_query.fvecs")) - -nb_full, d = xb_full.shape -nb = nb_full // 10 # 10% = 100000 -xb = xb_full[:nb].copy() -del xb_full - -nq = xq.shape[0] -print(f"Database: {nb} x {d}, Queries: {nq} x {d}", flush=True) - -xt = xb[:50000].copy() - -k = 10 -M = 960 -nbits = 8 -nlist = 64 -n_levels = 16 -batch_size = 128 - -GT_PATH = os.path.join(CACHE_DIR, "gt_10pct.npy") -if os.path.exists(GT_PATH): - gt_I = np.load(GT_PATH) - print(f"Loaded cached ground truth: {gt_I.shape}", flush=True) -else: - print("Computing ground 
truth on 10% subset...", flush=True) - flat = faiss.IndexFlatL2(d) - flat.add(xb) - _, gt_I = flat.search(xq, k) - np.save(GT_PATH, gt_I) - print("Ground truth computed and cached.", flush=True) - - -def eval_recall(index, nprobe_val): - faiss.cvar.indexPanorama_stats.reset() - t0 = time.time() - _, I = index.search(xq, k=k) - t = time.time() - t0 - speed = t * 1000 / nq - qps = 1000 / speed - corrects = sum(len(set(gt_I[i]) & set(I[i])) for i in range(nq)) - recall = corrects / (nq * k) - stats = faiss.cvar.indexPanorama_stats - pct_active = stats.ratio_dims_scanned * 100 - print( - f"\tnprobe {nprobe_val:3d}, Recall@{k}: " - f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}, " - f"active: {pct_active:.1f}%", - flush=True, - ) - return recall, qps - - -faiss.omp_set_num_threads(mp.cpu_count()) - -# # --- IVFPQ baseline (cached) --- -# if os.path.exists(IVFPQ_CACHE): -# print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True) -# t0 = time.time() -# ivfpq = faiss.read_index(IVFPQ_CACHE) -# print(f" Loaded in {time.time() - t0:.1f}s", flush=True) -# else: -# print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True) -# quantizer = faiss.IndexFlatL2(d) -# ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) -# t0 = time.time() -# ivfpq.train(xt) -# print(f" Training took {time.time() - t0:.1f}s", flush=True) - -# print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) -# faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE) - -# t0 = time.time() -# ivfpq.add(xb) -# print(f" Adding took {time.time() - t0:.1f}s", flush=True) - -# print(f" Saving full index to {IVFPQ_CACHE}...", flush=True) -# faiss.write_index(ivfpq, IVFPQ_CACHE) - -# faiss.omp_set_num_threads(1) -# print("\n====== IVFPQ baseline", flush=True) -# for nprobe in [1, 2, 4, 8, 16]: -# ivfpq.nprobe = nprobe -# eval_recall(ivfpq, nprobe) - -# --- IVFPQPanorama (cached) --- -faiss.omp_set_num_threads(mp.cpu_count()) - -if os.path.exists(IVFPQ_PANO_CACHE): - 
print(f"\nLoading cached IVFPQPanorama from {IVFPQ_PANO_CACHE}...", flush=True) - t0 = time.time() - ivfpq_pano = faiss.read_index(IVFPQ_PANO_CACHE) - print(f" Loaded in {time.time() - t0:.1f}s", flush=True) -else: - def build_panorama_from_trained(trained_index): - quantizer2 = trained_index.quantizer - trained_index.own_fields = False - - pano = faiss.IndexIVFPQPanorama( - quantizer2, d, nlist, M, nbits, n_levels, batch_size - ) - centroids = faiss.vector_to_array(trained_index.pq.centroids) - faiss.copy_array_to_vector(centroids, pano.pq.centroids) - pano.is_trained = True - pano.use_precomputed_table = 1 - pano.precompute_table() - return pano - - if os.path.exists(IVFPQ_TRAINED_CACHE): - print( - f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...", - flush=True, - ) - trained = faiss.read_index(IVFPQ_TRAINED_CACHE) - ivfpq_pano = build_panorama_from_trained(trained) - print(" Reused trained PQ (skipped training).", flush=True) - else: - print( - f"\nTraining IVFPQ for Panorama from scratch: nlist={nlist}, M={M}, nbits={nbits}", - flush=True, - ) - quantizer2 = faiss.IndexFlatL2(d) - trained = faiss.IndexIVFPQ(quantizer2, d, nlist, M, nbits) - t0 = time.time() - trained.train(xt) - print(f" Training took {time.time() - t0:.1f}s", flush=True) - - print(f" Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True) - faiss.write_index(trained, IVFPQ_TRAINED_CACHE) - - ivfpq_pano = build_panorama_from_trained(trained) - - t0 = time.time() - ivfpq_pano.add(xb) - print(f" Adding took {time.time() - t0:.1f}s", flush=True) - - print(f" Saving IVFPQPanorama to {IVFPQ_PANO_CACHE}...", flush=True) - faiss.write_index(ivfpq_pano, IVFPQ_PANO_CACHE) - -faiss.omp_set_num_threads(1) -print("\n====== IVFPQPanorama", flush=True) -for nprobe in [1, 2, 4, 8, 16]: - ivfpq_pano.nprobe = nprobe - eval_recall(ivfpq_pano, nprobe) - -print("\nVerification complete!", flush=True) From 1ecda7eaa58eaf95ede4a70d4decda75973caf26 Mon Sep 17 00:00:00 2001 From: Alexis 
Schlomer Date: Sat, 21 Mar 2026 08:57:08 +0000 Subject: [PATCH 23/41] format and fix some merge bugs --- faiss/IndexFlat.h | 2 +- faiss/IndexHNSW.cpp | 2 +- faiss/IndexIVFPQPanorama.cpp | 38 ++++++++--------- faiss/impl/Panorama.h | 6 ++- faiss/impl/PanoramaPQ.cpp | 19 ++++----- faiss/impl/index_read.cpp | 3 +- faiss/impl/index_write.cpp | 2 +- .../panorama_kernels-avx2.cpp | 17 ++++---- .../panorama_kernels-avx512.cpp | 41 ++++++++----------- .../panorama_kernels-generic.cpp | 12 ++---- faiss/index_factory.cpp | 2 +- 11 files changed, 64 insertions(+), 80 deletions(-) diff --git a/faiss/IndexFlat.h b/faiss/IndexFlat.h index 632768e9ff..7e10f05b25 100644 --- a/faiss/IndexFlat.h +++ b/faiss/IndexFlat.h @@ -120,7 +120,7 @@ struct IndexFlatPanorama : IndexFlat { : IndexFlat(d_in, metric), batch_size(batch_size_in), n_levels(n_levels_in), - pano(code_size, n_levels_in, batch_size_in) { + pano(d_in, n_levels_in, batch_size_in) { FAISS_THROW_IF_NOT( metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); } diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index e99796ef5a..8f2f5f3e8a 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -680,7 +680,7 @@ IndexHNSWFlatPanorama::IndexHNSWFlatPanorama( MetricType metric) : IndexHNSWFlat(d_in, M, metric), cum_sums(), - pano(d_in * sizeof(float), num_panorama_levels_in, 1), + pano(d_in, num_panorama_levels_in, 1), num_panorama_levels(num_panorama_levels_in) { // For now, we only support L2 distance. 
// Supporting dot product and cosine distance is a trivial addition diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp index 26c9eccbd3..4848326553 100644 --- a/faiss/IndexIVFPQPanorama.cpp +++ b/faiss/IndexIVFPQPanorama.cpp @@ -49,7 +49,8 @@ IndexIVFPQPanorama::IndexIVFPQPanorama( M == code_size, "M must equal code_size for 8-bit PQ"); FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only L2 metric supported"); - auto* pano = new PanoramaPQ(d, code_size, n_levels, batch_size, &pq, quantizer); + auto* pano = + new PanoramaPQ(d, code_size, n_levels, batch_size, &pq, quantizer); this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano); this->own_invlists = own_invlists; } @@ -149,24 +150,23 @@ struct IVFPQScannerPanorama : InvertedListScanner { local_stats.reset(); for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { - size_t num_active = - pano_pq->progressive_filter_batch( - col_codes, - list_cum_sums, - list_init_dists, - sim_table_2.data(), - query_cum_norms.data(), - dis0, - list_size, - batch_no, - ids, - sel, - exact_distances, - active_indices, - bitset, - compressed_codes, - distances[0], - local_stats); + size_t num_active = pano_pq->progressive_filter_batch( + col_codes, + list_cum_sums, + list_init_dists, + sim_table_2.data(), + query_cum_norms.data(), + dis0, + list_size, + batch_no, + ids, + sel, + exact_distances, + active_indices, + bitset, + compressed_codes, + distances[0], + local_stats); // Insert surviving candidates into heap. 
for (size_t i = 0; i < num_active; i++) { diff --git a/faiss/impl/Panorama.h b/faiss/impl/Panorama.h index 8f33fc8d1a..fcf3136d44 100644 --- a/faiss/impl/Panorama.h +++ b/faiss/impl/Panorama.h @@ -94,8 +94,10 @@ struct Panorama { size_t dest_idx, size_t src_idx) const; - virtual void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) - const; + virtual void reconstruct( + idx_t key, + float* recons, + const uint8_t* codes_base) const; }; /** diff --git a/faiss/impl/PanoramaPQ.cpp b/faiss/impl/PanoramaPQ.cpp index 02e70967b0..bd80f7f81c 100644 --- a/faiss/impl/PanoramaPQ.cpp +++ b/faiss/impl/PanoramaPQ.cpp @@ -7,7 +7,6 @@ #include -#include #include #include @@ -59,9 +58,8 @@ void PanoramaPQ::reconstruct( size_t start_byte = level * cs; for (size_t ci = 0; ci < cs && (start_byte + ci) < code_size; ci++) { - recons_buffer[start_byte + ci] = - codes_base[batch_offset + level_offset + ci * bs + - pos_in_batch]; + recons_buffer[start_byte + ci] = codes_base + [batch_offset + level_offset + ci * bs + pos_in_batch]; } } } @@ -108,15 +106,14 @@ void PanoramaPQ::compute_cumulative_sums( size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1); for (size_t level = 0; level < n_levels; level++) { size_t start_idx = level * levels_size; - size_t out_offset = cumsum_batch_offset + level * batch_size + - pos_in_batch; - cumsum_base[out_offset] = start_idx < d - ? std::sqrt(suffix[start_idx]) - : 0.0f; + size_t out_offset = + cumsum_batch_offset + level * batch_size + pos_in_batch; + cumsum_base[out_offset] = + start_idx < d ? 
std::sqrt(suffix[start_idx]) : 0.0f; } - size_t last_offset = cumsum_batch_offset + n_levels * batch_size + - pos_in_batch; + size_t last_offset = + cumsum_batch_offset + n_levels * batch_size + pos_in_batch; cumsum_base[last_offset] = 0.0f; } } diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 3cd555e0d8..864cc52455 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -1508,8 +1508,7 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { if (list_size == 0) continue; size_t bs = pano_pq->batch_size; - size_t padded = - ((list_size + bs - 1) / bs) * bs; + size_t padded = ((list_size + bs - 1) / bs) * bs; storage->init_dists[list_no].resize(padded); // Reconstruct row-major codes, then compute init distances. diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 5ebae71acf..66f0ed325f 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -34,8 +34,8 @@ #include #include #include -#include #include +#include #include #include #include diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp index 46728b1cdd..5633f6c874 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp @@ -100,15 +100,15 @@ void process_chunks( __m128i raw = _mm_loadl_epi64( (__m128i*)(compressed_codes + chunk_offset + batch_idx)); __m256i codes = _mm256_cvtepu8_epi32(raw); - __m256 m_dist = _mm256_i32gather_ps( - sim_table_ptr, codes, sizeof(float)); + __m256 m_dist = + _mm256_i32gather_ps(sim_table_ptr, codes, sizeof(float)); acc = _mm256_add_ps(acc, m_dist); _mm256_storeu_ps(exact_distances + batch_idx, acc); } for (; batch_idx < num_active; batch_idx += 1) { - exact_distances[batch_idx] += sim_table_ptr - [compressed_codes[chunk_offset + batch_idx]]; + exact_distances[batch_idx] += + sim_table_ptr[compressed_codes[chunk_offset + batch_idx]]; } } } @@ -127,8 +127,7 
@@ size_t process_filtering( for (size_t i = 0; i < num_active; i++) { float exact_distance = exact_distances[i]; float cum_sum = cum_sums[active_indices[i] - batch_offset]; - float lower_bound = - exact_distance + dis0 - cum_sum * query_cum_norm; + float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm; bool keep = heap_max > lower_bound; active_indices[next_num_active] = active_indices[i]; @@ -170,8 +169,7 @@ std::pair process_code_compression( for (int g = 0; g < 8; g++) { uint64_t bytes; memcpy(&bytes, bitset + point_idx + g * 8, 8); - uint8_t bits = (uint8_t)_pext_u64( - bytes, 0x0101010101010101ULL); + uint8_t bits = (uint8_t)_pext_u64(bytes, 0x0101010101010101ULL); mask |= ((uint64_t)bits << (g * 8)); } #else @@ -196,8 +194,7 @@ std::pair process_code_compression( memcpy(&src_val, src + g * 8, 8); uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF); uint64_t byte_mask = - _pdep_u64(submask, 0x0101010101010101ULL) * - 0xFF; + _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF; uint64_t compressed_val = _pext_u64(src_val, byte_mask); int count = __builtin_popcount(submask); memcpy(dst + write_pos, &compressed_val, 8); diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp index 7733d5a6da..a73461a8dc 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -42,33 +42,29 @@ void process_chunks( for (; batch_idx + 15 < num_active; batch_idx += 16) { __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); - __m128i comp0 = - _mm_loadu_si128((__m128i*)(compressed_codes + - chunk_offset0 + batch_idx)); + __m128i comp0 = _mm_loadu_si128( + (__m128i*)(compressed_codes + chunk_offset0 + batch_idx)); __m512i codes0 = _mm512_cvtepu8_epi32(comp0); acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes0, sim_table0, sizeof(float))); - __m128i comp1 = - _mm_loadu_si128((__m128i*)(compressed_codes + - 
chunk_offset1 + batch_idx)); + __m128i comp1 = _mm_loadu_si128( + (__m128i*)(compressed_codes + chunk_offset1 + batch_idx)); __m512i codes1 = _mm512_cvtepu8_epi32(comp1); acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes1, sim_table1, sizeof(float))); - __m128i comp2 = - _mm_loadu_si128((__m128i*)(compressed_codes + - chunk_offset2 + batch_idx)); + __m128i comp2 = _mm_loadu_si128( + (__m128i*)(compressed_codes + chunk_offset2 + batch_idx)); __m512i codes2 = _mm512_cvtepu8_epi32(comp2); acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes2, sim_table2, sizeof(float))); - __m128i comp3 = - _mm_loadu_si128((__m128i*)(compressed_codes + - chunk_offset3 + batch_idx)); + __m128i comp3 = _mm_loadu_si128( + (__m128i*)(compressed_codes + chunk_offset3 + batch_idx)); __m512i codes3 = _mm512_cvtepu8_epi32(comp3); acc = _mm512_add_ps( acc, @@ -94,18 +90,18 @@ void process_chunks( size_t batch_idx = 0; for (; batch_idx + 15 < num_active; batch_idx += 16) { __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); - __m128i comp = _mm_loadu_si128(( - __m128i*)(compressed_codes + chunk_offset + batch_idx)); + __m128i comp = _mm_loadu_si128( + (__m128i*)(compressed_codes + chunk_offset + batch_idx)); __m512i codes = _mm512_cvtepu8_epi32(comp); - __m512 m_dist = _mm512_i32gather_ps( - codes, sim_table_ptr, sizeof(float)); + __m512 m_dist = + _mm512_i32gather_ps(codes, sim_table_ptr, sizeof(float)); acc = _mm512_add_ps(acc, m_dist); _mm512_storeu_ps(exact_distances + batch_idx, acc); } for (; batch_idx < num_active; batch_idx += 1) { - exact_distances[batch_idx] += sim_table_ptr - [compressed_codes[chunk_offset + batch_idx]]; + exact_distances[batch_idx] += + sim_table_ptr[compressed_codes[chunk_offset + batch_idx]]; } } } @@ -124,8 +120,7 @@ size_t process_filtering( for (size_t i = 0; i < num_active; i++) { float exact_distance = exact_distances[i]; float cum_sum = cum_sums[active_indices[i] - batch_offset]; - float lower_bound = - exact_distance + dis0 - cum_sum * 
query_cum_norm; + float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm; bool keep = heap_max > lower_bound; active_indices[next_num_active] = active_indices[i]; @@ -169,8 +164,7 @@ std::pair process_code_compression( for (int g = 0; g < 8; g++) { uint64_t bytes; memcpy(&bytes, bitset + point_idx + g * 8, 8); - uint8_t bits = (uint8_t)_pext_u64( - bytes, 0x0101010101010101ULL); + uint8_t bits = (uint8_t)_pext_u64(bytes, 0x0101010101010101ULL); mask |= ((uint64_t)bits << (g * 8)); } #else @@ -196,8 +190,7 @@ std::pair process_code_compression( memcpy(&src_val, src + g * 8, 8); uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF); uint64_t byte_mask = - _pdep_u64(submask, 0x0101010101010101ULL) * - 0xFF; + _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF; uint64_t compressed_val = _pext_u64(src_val, byte_mask); int count = __builtin_popcount(submask); memcpy(dst + write_pos, &compressed_val, 8); diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp index cfd1283c80..73d5ba24d9 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp @@ -32,8 +32,7 @@ void process_chunks( size_t chunk_offset = chunk_idx * max_batch_size; float* chunk_sim = sim_table + chunk_idx * 256; for (size_t i = 0; i < num_active; i++) { - exact_distances[i] += - chunk_sim[compressed_codes[chunk_offset + i]]; + exact_distances[i] += chunk_sim[compressed_codes[chunk_offset + i]]; } } } @@ -52,8 +51,7 @@ size_t process_filtering( for (size_t i = 0; i < num_active; i++) { float exact_distance = exact_distances[i]; float cum_sum = cum_sums[active_indices[i] - batch_offset]; - float lower_bound = - exact_distance + dis0 - cum_sum * query_cum_norm; + float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm; bool keep = heap_max > lower_bound; active_indices[next_num_active] = active_indices[i]; @@ -87,8 +85,7 @@ std::pair 
process_code_compression( for (int g = 0; g < 8; g++) { uint64_t bytes; memcpy(&bytes, bitset + point_idx + g * 8, 8); - uint8_t bits = (uint8_t)_pext_u64( - bytes, 0x0101010101010101ULL); + uint8_t bits = (uint8_t)_pext_u64(bytes, 0x0101010101010101ULL); mask |= ((uint64_t)bits << (g * 8)); } #else @@ -113,8 +110,7 @@ std::pair process_code_compression( memcpy(&src_val, src + g * 8, 8); uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF); uint64_t byte_mask = - _pdep_u64(submask, 0x0101010101010101ULL) * - 0xFF; + _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF; uint64_t compressed_val = _pext_u64(src_val, byte_mask); int count = __builtin_popcount(submask); memcpy(dst + write_pos, &compressed_val, 8); diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp index ba8050962d..8acbd02003 100644 --- a/faiss/index_factory.cpp +++ b/faiss/index_factory.cpp @@ -29,8 +29,8 @@ #include #include #include -#include #include +#include #include #include #include From 5a599c46f9cdedef3e29b790305e1ea95ade39c5 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sat, 21 Mar 2026 17:55:06 +0000 Subject: [PATCH 24/41] Fix bug --- faiss/invlists/InvertedLists.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp index 41638f2e39..63a4d3383c 100644 --- a/faiss/invlists/InvertedLists.cpp +++ b/faiss/invlists/InvertedLists.cpp @@ -363,6 +363,7 @@ ArrayInvertedListsPanorama::ArrayInvertedListsPanorama( !use_iterator, "Panorama does not support iterators"); cum_sums.resize(nlist_in); + init_dists.resize(nlist_in); } const float* ArrayInvertedListsPanorama::get_cum_sums(size_t list_no) const { From 5cf76c49717bdcdbfde0b0937c690734f29fc42d Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 00:20:24 +0000 Subject: [PATCH 25/41] Get rid of "chunk_size" --- faiss/IndexIVFPQPanorama.cpp | 5 +- faiss/IndexIVFPQPanorama.h | 3 +- faiss/impl/PanoramaPQ.cpp | 27 ++++---- faiss/impl/PanoramaPQ.h | 19 +++--- 
faiss/impl/index_read.cpp | 1 - .../panorama_kernels-avx2.cpp | 68 +++++++++---------- .../panorama_kernels-avx512.cpp | 68 +++++++++---------- .../panorama_kernels-generic.cpp | 28 ++++---- .../impl/panorama_kernels/panorama_kernels.h | 6 +- 9 files changed, 110 insertions(+), 115 deletions(-) diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp index 4848326553..92862713f7 100644 --- a/faiss/IndexIVFPQPanorama.cpp +++ b/faiss/IndexIVFPQPanorama.cpp @@ -38,7 +38,6 @@ IndexIVFPQPanorama::IndexIVFPQPanorama( : IndexIVFPQ(quantizer, d, nlist, M, nbits_per_idx, metric, false), n_levels(n_levels), batch_size(batch_size), - chunk_size(code_size / n_levels), levels_size(d / n_levels) { FAISS_THROW_IF_NOT_MSG( M % n_levels == 0, "M must be divisible by n_levels"); @@ -132,7 +131,7 @@ struct IVFPQScannerPanorama : InvertedListScanner { size_t nup = 0; const size_t bs = index.batch_size; - const size_t cs = index.chunk_size; + const size_t ls = pano_pq->level_width_bytes; const size_t n_batches = (list_size + bs - 1) / bs; const uint8_t* col_codes = storage->get_codes(list_no); @@ -143,7 +142,7 @@ struct IVFPQScannerPanorama : InvertedListScanner { std::vector exact_distances(bs); std::vector bitset(bs); std::vector active_indices(bs); - std::vector compressed_codes(bs * cs); + std::vector compressed_codes(bs * ls); float dis0 = coarse_dis; PanoramaStats local_stats; diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h index 717308bb07..74d38677e1 100644 --- a/faiss/IndexIVFPQPanorama.h +++ b/faiss/IndexIVFPQPanorama.h @@ -30,7 +30,7 @@ namespace faiss { /// Panorama transposes codes into column-major within each batch: /// for each batch of `batch_size` points, codes are stored as /// M columns of `batch_size` bytes each. 
The M columns are grouped -/// into `n_levels` levels of `chunk_size` columns, enabling incremental +/// into `n_levels` levels of `level_width_bytes` columns, enabling incremental /// distance computation level-by-level. /// /// Storage is managed by ArrayInvertedListsPanorama with a PanoramaPQ @@ -60,7 +60,6 @@ struct IndexIVFPQPanorama : public IndexIVFPQ { int n_levels; size_t batch_size; - size_t chunk_size; size_t levels_size; IndexIVFPQPanorama( diff --git a/faiss/impl/PanoramaPQ.cpp b/faiss/impl/PanoramaPQ.cpp index bd80f7f81c..f535ae9b84 100644 --- a/faiss/impl/PanoramaPQ.cpp +++ b/faiss/impl/PanoramaPQ.cpp @@ -19,7 +19,7 @@ void PanoramaPQ::copy_codes_to_level_layout( size_t offset, size_t n_entry, const uint8_t* code) { - const size_t cs = chunk_size; + const size_t ls = level_width_bytes; const size_t bs = batch_size; for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { @@ -29,13 +29,13 @@ void PanoramaPQ::copy_codes_to_level_layout( size_t batch_offset = batch_no * bs * code_size; for (size_t level = 0; level < n_levels; level++) { - size_t level_offset = level * cs * bs; - size_t start_byte = level * cs; + size_t level_offset = level * ls * bs; + size_t start_byte = level * ls; - for (size_t ci = 0; ci < cs && (start_byte + ci) < code_size; - ci++) { - codes[batch_offset + level_offset + ci * bs + pos_in_batch] = - code[entry_idx * code_size + start_byte + ci]; + for (size_t li = 0; li < ls && (start_byte + li) < code_size; + li++) { + codes[batch_offset + level_offset + li * bs + pos_in_batch] = + code[entry_idx * code_size + start_byte + li]; } } } @@ -46,7 +46,7 @@ void PanoramaPQ::reconstruct( float* recons, const uint8_t* codes_base) const { uint8_t* recons_buffer = reinterpret_cast(recons); - const size_t cs = chunk_size; + const size_t ls = level_width_bytes; const size_t bs = batch_size; size_t batch_no = key / bs; @@ -54,12 +54,12 @@ void PanoramaPQ::reconstruct( size_t batch_offset = batch_no * bs * code_size; for (size_t level = 
0; level < n_levels; level++) { - size_t level_offset = level * cs * bs; - size_t start_byte = level * cs; + size_t level_offset = level * ls * bs; + size_t start_byte = level * ls; - for (size_t ci = 0; ci < cs && (start_byte + ci) < code_size; ci++) { - recons_buffer[start_byte + ci] = codes_base - [batch_offset + level_offset + ci * bs + pos_in_batch]; + for (size_t li = 0; li < ls && (start_byte + li) < code_size; li++) { + recons_buffer[start_byte + li] = codes_base + [batch_offset + level_offset + li * bs + pos_in_batch]; } } } @@ -74,7 +74,6 @@ PanoramaPQ::PanoramaPQ( : Panorama(d, code_size, n_levels, batch_size), pq(pq), quantizer(quantizer), - chunk_size(code_size / n_levels), levels_size(d / n_levels) { FAISS_THROW_IF_NOT_MSG( code_size % n_levels == 0, diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h index a8e35a5699..a3901db795 100644 --- a/faiss/impl/PanoramaPQ.h +++ b/faiss/impl/PanoramaPQ.h @@ -28,7 +28,6 @@ namespace faiss { struct PanoramaPQ : Panorama { const ProductQuantizer* pq = nullptr; const Index* quantizer = nullptr; - size_t chunk_size = 0; size_t levels_size = 0; PanoramaPQ() = default; @@ -107,7 +106,7 @@ struct PanoramaPQ : Panorama { float threshold, PanoramaStats& local_stats) const { const size_t bs = batch_size; - const size_t cs = chunk_size; + const size_t ls = level_width_bytes; const size_t ksub = pq->ksub; size_t curr_batch_size = std::min(list_size - batch_no * bs, bs); @@ -149,12 +148,12 @@ struct PanoramaPQ : Panorama { level++) { local_stats.total_dims_scanned += next_num_active; - size_t level_sim_offset = level * ksub * cs; + size_t level_sim_offset = level * ksub * ls; float query_cum_norm = 2 * query_cum_norms[level + 1]; const float* cum_sums_level = batch_cums + bs * (level + 1); - const uint8_t* codes_level = batch_codes + bs * cs * level; + const uint8_t* codes_level = batch_codes + bs * ls * level; const float* sim_table_level = sim_table_2 + level_sim_offset; @@ -162,13 +161,13 @@ struct PanoramaPQ 
: Panorama { size_t num_active_for_filtering = 0; if (is_sparse) { - for (size_t ci = 0; ci < cs; ci++) { - size_t chunk_off = ci * bs; - const float* chunk_sim = sim_table_level + ci * ksub; + for (size_t li = 0; li < ls; li++) { + size_t byte_off = li * bs; + const float* chunk_sim = sim_table_level + li * ksub; for (size_t i = 0; i < next_num_active; i++) { size_t real_idx = active_indices[i] - batch_offset; exact_distances[i] += - chunk_sim[codes_level[chunk_off + real_idx]]; + chunk_sim[codes_level[byte_off + real_idx]]; } } num_active_for_filtering = next_num_active; @@ -176,13 +175,13 @@ struct PanoramaPQ : Panorama { auto [cc, na] = panorama_kernels::process_code_compression( next_num_active, bs, - cs, + ls, compressed_codes.data(), bitset.data(), codes_level); panorama_kernels::process_chunks( - cs, + ls, bs, na, const_cast(sim_table_level), diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 864cc52455..d9b9fceb9d 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -1485,7 +1485,6 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { read_ProductQuantizer(&ivpp->pq, f); READ1(ivpp->n_levels); READ1(ivpp->batch_size); - ivpp->chunk_size = ivpp->code_size / ivpp->n_levels; ivpp->levels_size = ivpp->d / ivpp->n_levels; read_InvertedLists(*ivpp, f, io_flags); // The "ilpn" reader creates a PanoramaFlat placeholder; replace diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp index 5633f6c874..ff5a86d678 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp @@ -23,26 +23,26 @@ namespace faiss { namespace panorama_kernels { void process_chunks( - size_t chunk_size, + size_t level_width_bytes, size_t max_batch_size, size_t num_active, float* sim_table, uint8_t* compressed_codes, float* exact_distances) { - size_t chunk_idx = 0; + size_t byte_idx = 0; // Process 4 chunks 
at a time to amortize loop overhead and keep // the accumulator in registers across chunks. - for (; chunk_idx + 3 < chunk_size; chunk_idx += 4) { - size_t chunk_offset0 = (chunk_idx + 0) * max_batch_size; - size_t chunk_offset1 = (chunk_idx + 1) * max_batch_size; - size_t chunk_offset2 = (chunk_idx + 2) * max_batch_size; - size_t chunk_offset3 = (chunk_idx + 3) * max_batch_size; + for (; byte_idx + 3 < level_width_bytes; byte_idx += 4) { + size_t byte_offset0 = (byte_idx + 0) * max_batch_size; + size_t byte_offset1 = (byte_idx + 1) * max_batch_size; + size_t byte_offset2 = (byte_idx + 2) * max_batch_size; + size_t byte_offset3 = (byte_idx + 3) * max_batch_size; - float* sim_table0 = sim_table + (chunk_idx + 0) * 256; - float* sim_table1 = sim_table + (chunk_idx + 1) * 256; - float* sim_table2 = sim_table + (chunk_idx + 2) * 256; - float* sim_table3 = sim_table + (chunk_idx + 3) * 256; + float* sim_table0 = sim_table + (byte_idx + 0) * 256; + float* sim_table1 = sim_table + (byte_idx + 1) * 256; + float* sim_table2 = sim_table + (byte_idx + 2) * 256; + float* sim_table3 = sim_table + (byte_idx + 3) * 256; size_t batch_idx = 0; for (; batch_idx + 7 < num_active; batch_idx += 8) { @@ -50,28 +50,28 @@ void process_chunks( // Load 8 byte codes, zero-extend to 32-bit indices. 
__m128i raw0 = _mm_loadl_epi64( - (__m128i*)(compressed_codes + chunk_offset0 + batch_idx)); + (__m128i*)(compressed_codes + byte_offset0 + batch_idx)); __m256i codes0 = _mm256_cvtepu8_epi32(raw0); acc = _mm256_add_ps( acc, _mm256_i32gather_ps(sim_table0, codes0, sizeof(float))); __m128i raw1 = _mm_loadl_epi64( - (__m128i*)(compressed_codes + chunk_offset1 + batch_idx)); + (__m128i*)(compressed_codes + byte_offset1 + batch_idx)); __m256i codes1 = _mm256_cvtepu8_epi32(raw1); acc = _mm256_add_ps( acc, _mm256_i32gather_ps(sim_table1, codes1, sizeof(float))); __m128i raw2 = _mm_loadl_epi64( - (__m128i*)(compressed_codes + chunk_offset2 + batch_idx)); + (__m128i*)(compressed_codes + byte_offset2 + batch_idx)); __m256i codes2 = _mm256_cvtepu8_epi32(raw2); acc = _mm256_add_ps( acc, _mm256_i32gather_ps(sim_table2, codes2, sizeof(float))); __m128i raw3 = _mm_loadl_epi64( - (__m128i*)(compressed_codes + chunk_offset3 + batch_idx)); + (__m128i*)(compressed_codes + byte_offset3 + batch_idx)); __m256i codes3 = _mm256_cvtepu8_epi32(raw3); acc = _mm256_add_ps( acc, @@ -82,23 +82,23 @@ void process_chunks( for (; batch_idx < num_active; batch_idx += 1) { float acc = exact_distances[batch_idx]; - acc += sim_table0[compressed_codes[chunk_offset0 + batch_idx]]; - acc += sim_table1[compressed_codes[chunk_offset1 + batch_idx]]; - acc += sim_table2[compressed_codes[chunk_offset2 + batch_idx]]; - acc += sim_table3[compressed_codes[chunk_offset3 + batch_idx]]; + acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]]; + acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]]; + acc += sim_table2[compressed_codes[byte_offset2 + batch_idx]]; + acc += sim_table3[compressed_codes[byte_offset3 + batch_idx]]; exact_distances[batch_idx] = acc; } } - for (; chunk_idx < chunk_size; chunk_idx++) { - size_t chunk_offset = chunk_idx * max_batch_size; - float* sim_table_ptr = sim_table + chunk_idx * 256; + for (; byte_idx < level_width_bytes; byte_idx++) { + size_t byte_offset = byte_idx * 
max_batch_size; + float* sim_table_ptr = sim_table + byte_idx * 256; size_t batch_idx = 0; for (; batch_idx + 7 < num_active; batch_idx += 8) { __m256 acc = _mm256_loadu_ps(exact_distances + batch_idx); __m128i raw = _mm_loadl_epi64( - (__m128i*)(compressed_codes + chunk_offset + batch_idx)); + (__m128i*)(compressed_codes + byte_offset + batch_idx)); __m256i codes = _mm256_cvtepu8_epi32(raw); __m256 m_dist = _mm256_i32gather_ps(sim_table_ptr, codes, sizeof(float)); @@ -108,7 +108,7 @@ void process_chunks( for (; batch_idx < num_active; batch_idx += 1) { exact_distances[batch_idx] += - sim_table_ptr[compressed_codes[chunk_offset + batch_idx]]; + sim_table_ptr[compressed_codes[byte_offset + batch_idx]]; } } } @@ -141,7 +141,7 @@ size_t process_filtering( std::pair process_code_compression( size_t next_num_active, size_t max_batch_size, - size_t chunk_size, + size_t level_width_bytes, uint8_t* compressed_codes_begin, uint8_t* bitset, const uint8_t* codes) { @@ -154,7 +154,7 @@ std::pair process_code_compression( // Compress the codes: here we don't need to process remainders // as long as `max_batch_size` is a multiple of 64 (which we // assert in the constructor). Conveniently, compressed_codes is - // allocated to `max_batch_size` * `chunk_size` elements. + // allocated to `max_batch_size` * `level_width_bytes` elements. // `num_active` is guaranteed to always be less than or equal to // `max_batch_size`. Only the last batch may be smaller than // `max_batch_size`, the caller ensures that the batch and @@ -184,10 +184,10 @@ std::pair process_code_compression( // PEXT/PDEP path: process 8 bytes at a time. PDEP // expands the per-byte mask bits into a per-byte lane // mask, then PEXT extracts only the selected bytes. 
- for (size_t ci = 0; ci < chunk_size; ci++) { - size_t chunk_offset = ci * max_batch_size; - const uint8_t* src = codes + chunk_offset + point_idx; - uint8_t* dst = compressed_codes + chunk_offset + num_active; + for (size_t ci = 0; ci < level_width_bytes; ci++) { + size_t byte_offset = ci * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; int write_pos = 0; for (int g = 0; g < 8; g++) { uint64_t src_val; @@ -204,10 +204,10 @@ std::pair process_code_compression( #else // Scalar fallback: scan set bits one by one and copy // the corresponding code byte. - for (size_t ci = 0; ci < chunk_size; ci++) { - size_t chunk_offset = ci * max_batch_size; - const uint8_t* src = codes + chunk_offset + point_idx; - uint8_t* dst = compressed_codes + chunk_offset + num_active; + for (size_t ci = 0; ci < level_width_bytes; ci++) { + size_t byte_offset = ci * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; int write_pos = 0; uint64_t m = mask; while (m) { diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp index a73461a8dc..e8cdb93af7 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -17,54 +17,54 @@ namespace faiss { namespace panorama_kernels { void process_chunks( - size_t chunk_size, + size_t level_width_bytes, size_t max_batch_size, size_t num_active, float* sim_table, uint8_t* compressed_codes, float* exact_distances) { - size_t chunk_idx = 0; + size_t byte_idx = 0; // Process 4 chunks at a time to amortize loop overhead and keep // the accumulator in registers across chunks. 
- for (; chunk_idx + 3 < chunk_size; chunk_idx += 4) { - size_t chunk_offset0 = (chunk_idx + 0) * max_batch_size; - size_t chunk_offset1 = (chunk_idx + 1) * max_batch_size; - size_t chunk_offset2 = (chunk_idx + 2) * max_batch_size; - size_t chunk_offset3 = (chunk_idx + 3) * max_batch_size; + for (; byte_idx + 3 < level_width_bytes; byte_idx += 4) { + size_t byte_offset0 = (byte_idx + 0) * max_batch_size; + size_t byte_offset1 = (byte_idx + 1) * max_batch_size; + size_t byte_offset2 = (byte_idx + 2) * max_batch_size; + size_t byte_offset3 = (byte_idx + 3) * max_batch_size; - float* sim_table0 = sim_table + (chunk_idx + 0) * 256; - float* sim_table1 = sim_table + (chunk_idx + 1) * 256; - float* sim_table2 = sim_table + (chunk_idx + 2) * 256; - float* sim_table3 = sim_table + (chunk_idx + 3) * 256; + float* sim_table0 = sim_table + (byte_idx + 0) * 256; + float* sim_table1 = sim_table + (byte_idx + 1) * 256; + float* sim_table2 = sim_table + (byte_idx + 2) * 256; + float* sim_table3 = sim_table + (byte_idx + 3) * 256; size_t batch_idx = 0; for (; batch_idx + 15 < num_active; batch_idx += 16) { __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); __m128i comp0 = _mm_loadu_si128( - (__m128i*)(compressed_codes + chunk_offset0 + batch_idx)); + (__m128i*)(compressed_codes + byte_offset0 + batch_idx)); __m512i codes0 = _mm512_cvtepu8_epi32(comp0); acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes0, sim_table0, sizeof(float))); __m128i comp1 = _mm_loadu_si128( - (__m128i*)(compressed_codes + chunk_offset1 + batch_idx)); + (__m128i*)(compressed_codes + byte_offset1 + batch_idx)); __m512i codes1 = _mm512_cvtepu8_epi32(comp1); acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes1, sim_table1, sizeof(float))); __m128i comp2 = _mm_loadu_si128( - (__m128i*)(compressed_codes + chunk_offset2 + batch_idx)); + (__m128i*)(compressed_codes + byte_offset2 + batch_idx)); __m512i codes2 = _mm512_cvtepu8_epi32(comp2); acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes2, sim_table2, 
sizeof(float))); __m128i comp3 = _mm_loadu_si128( - (__m128i*)(compressed_codes + chunk_offset3 + batch_idx)); + (__m128i*)(compressed_codes + byte_offset3 + batch_idx)); __m512i codes3 = _mm512_cvtepu8_epi32(comp3); acc = _mm512_add_ps( acc, @@ -75,23 +75,23 @@ void process_chunks( for (; batch_idx < num_active; batch_idx += 1) { float acc = exact_distances[batch_idx]; - acc += sim_table0[compressed_codes[chunk_offset0 + batch_idx]]; - acc += sim_table1[compressed_codes[chunk_offset1 + batch_idx]]; - acc += sim_table2[compressed_codes[chunk_offset2 + batch_idx]]; - acc += sim_table3[compressed_codes[chunk_offset3 + batch_idx]]; + acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]]; + acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]]; + acc += sim_table2[compressed_codes[byte_offset2 + batch_idx]]; + acc += sim_table3[compressed_codes[byte_offset3 + batch_idx]]; exact_distances[batch_idx] = acc; } } - for (; chunk_idx < chunk_size; chunk_idx++) { - size_t chunk_offset = chunk_idx * max_batch_size; - float* sim_table_ptr = sim_table + chunk_idx * 256; + for (; byte_idx < level_width_bytes; byte_idx++) { + size_t byte_offset = byte_idx * max_batch_size; + float* sim_table_ptr = sim_table + byte_idx * 256; size_t batch_idx = 0; for (; batch_idx + 15 < num_active; batch_idx += 16) { __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); __m128i comp = _mm_loadu_si128( - (__m128i*)(compressed_codes + chunk_offset + batch_idx)); + (__m128i*)(compressed_codes + byte_offset + batch_idx)); __m512i codes = _mm512_cvtepu8_epi32(comp); __m512 m_dist = _mm512_i32gather_ps(codes, sim_table_ptr, sizeof(float)); @@ -101,7 +101,7 @@ void process_chunks( for (; batch_idx < num_active; batch_idx += 1) { exact_distances[batch_idx] += - sim_table_ptr[compressed_codes[chunk_offset + batch_idx]]; + sim_table_ptr[compressed_codes[byte_offset + batch_idx]]; } } } @@ -134,7 +134,7 @@ size_t process_filtering( std::pair process_code_compression( size_t 
next_num_active, size_t max_batch_size, - size_t chunk_size, + size_t level_width_bytes, uint8_t* compressed_codes_begin, uint8_t* bitset, const uint8_t* codes) { @@ -147,7 +147,7 @@ std::pair process_code_compression( // Compress the codes: here we don't need to process remainders // as long as `max_batch_size` is a multiple of 64 (which we // assert in the constructor). Conveniently, compressed_codes is - // allocated to `max_batch_size` * `chunk_size` elements. + // allocated to `max_batch_size` * `level_width_bytes` elements. // `num_active` is guaranteed to always be less than or equal to // `max_batch_size`. Only the last batch may be smaller than // `max_batch_size`, the caller ensures that the batch and @@ -180,10 +180,10 @@ std::pair process_code_compression( // PEXT/PDEP path: process 8 bytes at a time. PDEP // expands the per-byte mask bits into a per-byte lane // mask, then PEXT extracts only the selected bytes. - for (size_t ci = 0; ci < chunk_size; ci++) { - size_t chunk_offset = ci * max_batch_size; - const uint8_t* src = codes + chunk_offset + point_idx; - uint8_t* dst = compressed_codes + chunk_offset + num_active; + for (size_t ci = 0; ci < level_width_bytes; ci++) { + size_t byte_offset = ci * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; int write_pos = 0; for (int g = 0; g < 8; g++) { uint64_t src_val; @@ -200,10 +200,10 @@ std::pair process_code_compression( #else // Scalar fallback: scan set bits one by one and copy // the corresponding code byte. 
- for (size_t ci = 0; ci < chunk_size; ci++) { - size_t chunk_offset = ci * max_batch_size; - const uint8_t* src = codes + chunk_offset + point_idx; - uint8_t* dst = compressed_codes + chunk_offset + num_active; + for (size_t ci = 0; ci < level_width_bytes; ci++) { + size_t byte_offset = ci * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; int write_pos = 0; uint64_t m = mask; while (m) { diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp index 73d5ba24d9..603e51bc73 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp @@ -22,17 +22,17 @@ namespace faiss { namespace panorama_kernels { void process_chunks( - size_t chunk_size, + size_t level_width_bytes, size_t max_batch_size, size_t num_active, float* sim_table, uint8_t* compressed_codes, float* exact_distances) { - for (size_t chunk_idx = 0; chunk_idx < chunk_size; chunk_idx++) { - size_t chunk_offset = chunk_idx * max_batch_size; - float* chunk_sim = sim_table + chunk_idx * 256; + for (size_t byte_idx = 0; byte_idx < level_width_bytes; byte_idx++) { + size_t byte_offset = byte_idx * max_batch_size; + float* chunk_sim = sim_table + byte_idx * 256; for (size_t i = 0; i < num_active; i++) { - exact_distances[i] += chunk_sim[compressed_codes[chunk_offset + i]]; + exact_distances[i] += chunk_sim[compressed_codes[byte_offset + i]]; } } } @@ -65,7 +65,7 @@ size_t process_filtering( std::pair process_code_compression( size_t next_num_active, size_t max_batch_size, - size_t chunk_size, + size_t level_width_bytes, uint8_t* compressed_codes_begin, uint8_t* bitset, const uint8_t* codes) { @@ -100,10 +100,10 @@ std::pair process_code_compression( // PEXT/PDEP path: process 8 bytes at a time. 
PDEP // expands the per-byte mask bits into a per-byte lane // mask, then PEXT extracts only the selected bytes. - for (size_t ci = 0; ci < chunk_size; ci++) { - size_t chunk_offset = ci * max_batch_size; - const uint8_t* src = codes + chunk_offset + point_idx; - uint8_t* dst = compressed_codes + chunk_offset + num_active; + for (size_t ci = 0; ci < level_width_bytes; ci++) { + size_t byte_offset = ci * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; int write_pos = 0; for (int g = 0; g < 8; g++) { uint64_t src_val; @@ -120,10 +120,10 @@ std::pair process_code_compression( #else // Scalar fallback: scan set bits one by one and copy // the corresponding code byte. - for (size_t ci = 0; ci < chunk_size; ci++) { - size_t chunk_offset = ci * max_batch_size; - const uint8_t* src = codes + chunk_offset + point_idx; - uint8_t* dst = compressed_codes + chunk_offset + num_active; + for (size_t ci = 0; ci < level_width_bytes; ci++) { + size_t byte_offset = ci * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; int write_pos = 0; uint64_t m = mask; while (m) { diff --git a/faiss/impl/panorama_kernels/panorama_kernels.h b/faiss/impl/panorama_kernels/panorama_kernels.h index aed8a87660..1ff74086e6 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels.h +++ b/faiss/impl/panorama_kernels/panorama_kernels.h @@ -35,7 +35,7 @@ namespace panorama_kernels { /// Iterates chunks first to keep the LUT slice in L1 cache. /// The AVX-512 version unrolls 4 chunks at a time. void process_chunks( - size_t chunk_size, + size_t level_width_bytes, size_t max_batch_size, size_t num_active, float* sim_table, @@ -71,7 +71,7 @@ size_t process_filtering( /// Compress the codes: here we don't need to process remainders /// as long as `max_batch_size` is a multiple of 64 (which we /// assert in the constructor). 
Conveniently, compressed_codes is -/// allocated to `max_batch_size` * `chunk_size` elements. +/// allocated to `max_batch_size` * `level_width_bytes` elements. /// `num_active` is guaranteed to always be less than or equal to /// `max_batch_size`. Only the last batch may be smaller than /// `max_batch_size`, the caller ensures that the batch and @@ -79,7 +79,7 @@ size_t process_filtering( std::pair process_code_compression( size_t next_num_active, size_t max_batch_size, - size_t chunk_size, + size_t level_width_bytes, uint8_t* compressed_codes_begin, uint8_t* bitset, const uint8_t* codes); From ed678957700bfe8378c70315b5172e58018344a9 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 00:22:02 +0000 Subject: [PATCH 26/41] Format --- faiss/IndexIVF.h | 1 - 1 file changed, 1 deletion(-) diff --git a/faiss/IndexIVF.h b/faiss/IndexIVF.h index 972a86d92e..d3a4bb891b 100644 --- a/faiss/IndexIVF.h +++ b/faiss/IndexIVF.h @@ -22,7 +22,6 @@ namespace faiss { - /** Encapsulates a quantizer object for the IndexIVF * * The class isolates the fields that are independent of the storage From 26de19a371158697b57b8e070b00d44b183845a7 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 00:39:43 +0000 Subject: [PATCH 27/41] Remove more "ci" --- faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp | 8 ++++---- faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp | 8 ++++---- faiss/impl/panorama_kernels/panorama_kernels-generic.cpp | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp index ff5a86d678..a580cc8a29 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp @@ -184,8 +184,8 @@ std::pair process_code_compression( // PEXT/PDEP path: process 8 bytes at a time. 
PDEP // expands the per-byte mask bits into a per-byte lane // mask, then PEXT extracts only the selected bytes. - for (size_t ci = 0; ci < level_width_bytes; ci++) { - size_t byte_offset = ci * max_batch_size; + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; const uint8_t* src = codes + byte_offset + point_idx; uint8_t* dst = compressed_codes + byte_offset + num_active; int write_pos = 0; @@ -204,8 +204,8 @@ std::pair process_code_compression( #else // Scalar fallback: scan set bits one by one and copy // the corresponding code byte. - for (size_t ci = 0; ci < level_width_bytes; ci++) { - size_t byte_offset = ci * max_batch_size; + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; const uint8_t* src = codes + byte_offset + point_idx; uint8_t* dst = compressed_codes + byte_offset + num_active; int write_pos = 0; diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp index e8cdb93af7..139b0e8867 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -180,8 +180,8 @@ std::pair process_code_compression( // PEXT/PDEP path: process 8 bytes at a time. PDEP // expands the per-byte mask bits into a per-byte lane // mask, then PEXT extracts only the selected bytes. - for (size_t ci = 0; ci < level_width_bytes; ci++) { - size_t byte_offset = ci * max_batch_size; + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; const uint8_t* src = codes + byte_offset + point_idx; uint8_t* dst = compressed_codes + byte_offset + num_active; int write_pos = 0; @@ -200,8 +200,8 @@ std::pair process_code_compression( #else // Scalar fallback: scan set bits one by one and copy // the corresponding code byte. 
- for (size_t ci = 0; ci < level_width_bytes; ci++) { - size_t byte_offset = ci * max_batch_size; + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; const uint8_t* src = codes + byte_offset + point_idx; uint8_t* dst = compressed_codes + byte_offset + num_active; int write_pos = 0; diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp index 603e51bc73..2c64fd22db 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp @@ -100,8 +100,8 @@ std::pair process_code_compression( // PEXT/PDEP path: process 8 bytes at a time. PDEP // expands the per-byte mask bits into a per-byte lane // mask, then PEXT extracts only the selected bytes. - for (size_t ci = 0; ci < level_width_bytes; ci++) { - size_t byte_offset = ci * max_batch_size; + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; const uint8_t* src = codes + byte_offset + point_idx; uint8_t* dst = compressed_codes + byte_offset + num_active; int write_pos = 0; @@ -120,8 +120,8 @@ std::pair process_code_compression( #else // Scalar fallback: scan set bits one by one and copy // the corresponding code byte. 
- for (size_t ci = 0; ci < level_width_bytes; ci++) { - size_t byte_offset = ci * max_batch_size; + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; const uint8_t* src = codes + byte_offset + point_idx; uint8_t* dst = compressed_codes + byte_offset + num_active; int write_pos = 0; From c8d22ecd4f231f4f38ccafc728ea2ba81e6911ae Mon Sep 17 00:00:00 2001 From: Alexis Schlomer Date: Sun, 22 Mar 2026 00:46:11 +0000 Subject: [PATCH 28/41] Fix recall on bench --- benchs/bench_ivfpq_panorama.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py index 806c56fcc4..fcd52210b9 100644 --- a/benchs/bench_ivfpq_panorama.py +++ b/benchs/bench_ivfpq_panorama.py @@ -69,8 +69,9 @@ def eval_recall(index, nprobe_val): speed = t * 1000 / nq qps = 1000 / speed - corrects = (gt == I).sum() - recall = corrects / (nq * k) + recall = np.mean( + [len(set(gt[i]) & set(I[i])) / k for i in range(nq)], + ) ratio_dims_scanned = faiss.cvar.indexPanorama_stats.ratio_dims_scanned print( f"\tnprobe {nprobe_val:3d}, Recall@{k}: " @@ -281,7 +282,9 @@ def build_ivfpq_panorama(M, n_levels, alpha=ALPHA): eval_index(pano, label=f"PCA+Spill+Rot + IVFPQPanorama (M={M})") del pano -plt.title(f"IVFPQ Panorama on GIST1M (nlist={nlist})") +plt.title( + f"IVFPQ Panorama on GIST ({SUBSET*100:.0f}% subset, nlist={nlist})", +) plt.xlabel(f"Recall@{k}") plt.ylabel("QPS") plt.yscale("log") From e91980c50564dd49a8b45b9b1697ae9570966350 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 01:14:50 +0000 Subject: [PATCH 29/41] Remove batch size magic number --- faiss/IndexIVFFlatPanorama.cpp | 2 +- faiss/IndexIVFPQPanorama.h | 3 ++- faiss/impl/Panorama.h | 2 ++ faiss/impl/index_read.cpp | 7 +++---- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/faiss/IndexIVFFlatPanorama.cpp b/faiss/IndexIVFFlatPanorama.cpp index 335be43ca7..65587b22a9 100644 --- 
a/faiss/IndexIVFFlatPanorama.cpp +++ b/faiss/IndexIVFFlatPanorama.cpp @@ -39,7 +39,7 @@ IndexIVFFlatPanorama::IndexIVFFlatPanorama( // We construct the inverted lists here so that we can use the // level-oriented storage. This does not cause a leak as we constructed // IndexIVF first, with own_invlists set to false. - auto* pano = new PanoramaFlat(d, n_levels, 128); + auto* pano = new PanoramaFlat(d, n_levels, Panorama::kDefaultBatchSize); this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano); this->own_invlists = own_invlists_in; } diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h index 74d38677e1..9d6ef0dd3b 100644 --- a/faiss/IndexIVFPQPanorama.h +++ b/faiss/IndexIVFPQPanorama.h @@ -11,6 +11,7 @@ #include #include +#include namespace faiss { @@ -69,7 +70,7 @@ struct IndexIVFPQPanorama : public IndexIVFPQ { size_t M, size_t nbits_per_idx, int n_levels, - size_t batch_size = 128, + size_t batch_size = Panorama::kDefaultBatchSize, MetricType metric = METRIC_L2, bool own_invlists = true); diff --git a/faiss/impl/Panorama.h b/faiss/impl/Panorama.h index fcf3136d44..58e889a1f0 100644 --- a/faiss/impl/Panorama.h +++ b/faiss/impl/Panorama.h @@ -46,6 +46,8 @@ namespace faiss { * for their respective code formats. 
*/ struct Panorama { + static constexpr size_t kDefaultBatchSize = 128; + size_t d = 0; size_t code_size = 0; size_t n_levels = 0; diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index d9b9fceb9d..11ba4deeb1 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -431,9 +431,9 @@ std::unique_ptr read_InvertedLists_up( FAISS_CHECK_DESERIALIZATION_LOOP_LIMIT(nlist, "ilpn nlist"); READ1(code_size); READ1(n_levels); - constexpr size_t kFlatBatchSize = 128; + constexpr size_t bs = Panorama::kDefaultBatchSize; auto* pano = new PanoramaFlat( - code_size / sizeof(float), n_levels, kFlatBatchSize); + code_size / sizeof(float), n_levels, bs); auto ailp = std::make_unique( nlist, code_size, pano); std::vector sizes(nlist); @@ -442,8 +442,7 @@ std::unique_ptr read_InvertedLists_up( for (size_t i = 0; i < nlist; i++) { ailp->ids[i].resize(sizes[i]); size_t num_elems = - ((sizes[i] + kFlatBatchSize - 1) / kFlatBatchSize) * - kFlatBatchSize; + ((sizes[i] + bs - 1) / bs) * bs; ailp->codes[i].resize(num_elems * code_size); ailp->cum_sums[i].resize(num_elems * (n_levels + 1)); } From 69552f21a6f7fa879a42ca453911edb4c114c542 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 01:18:20 +0000 Subject: [PATCH 30/41] vd_in --- faiss/IndexIVFFlatPanorama.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faiss/IndexIVFFlatPanorama.cpp b/faiss/IndexIVFFlatPanorama.cpp index 65587b22a9..d05c73a049 100644 --- a/faiss/IndexIVFFlatPanorama.cpp +++ b/faiss/IndexIVFFlatPanorama.cpp @@ -57,12 +57,12 @@ struct IVFFlatScannerPanorama : InvertedListScanner { static constexpr MetricType metric = VectorDistance::metric; IVFFlatScannerPanorama( - const VectorDistance& vd, + const VectorDistance& vd_in, const ArrayInvertedListsPanorama* storage_in, bool store_pairs_in, const IDSelector* sel_in) : InvertedListScanner(store_pairs_in, sel_in), - vd(vd), + vd(vd_in), storage(storage_in), pano_flat( dynamic_cast( From 
084381a3ab40a39e99c9512c3674ba6bc0c344fc Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 01:19:40 +0000 Subject: [PATCH 31/41] Clean diffs --- faiss/IndexIVFPQ.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp index 2d0bba4228..3af6fddff7 100644 --- a/faiss/IndexIVFPQ.cpp +++ b/faiss/IndexIVFPQ.cpp @@ -9,14 +9,13 @@ #include -#include #include #include #include -#include #include #include -#include + +#include #include #include From eb9f1b76aba963c05cd16d05455d04e43477ce40 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 01:21:03 +0000 Subject: [PATCH 32/41] Format --- faiss/impl/index_read.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index 11ba4deeb1..b7cef27a10 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -432,8 +432,7 @@ std::unique_ptr read_InvertedLists_up( READ1(code_size); READ1(n_levels); constexpr size_t bs = Panorama::kDefaultBatchSize; - auto* pano = new PanoramaFlat( - code_size / sizeof(float), n_levels, bs); + auto* pano = new PanoramaFlat(code_size / sizeof(float), n_levels, bs); auto ailp = std::make_unique( nlist, code_size, pano); std::vector sizes(nlist); @@ -441,8 +440,7 @@ std::unique_ptr read_InvertedLists_up( for (size_t i = 0; i < nlist; i++) { ailp->ids[i].resize(sizes[i]); - size_t num_elems = - ((sizes[i] + bs - 1) / bs) * bs; + size_t num_elems = ((sizes[i] + bs - 1) / bs) * bs; ailp->codes[i].resize(num_elems * code_size); ailp->cum_sums[i].resize(num_elems * (n_levels + 1)); } From 8a375c2ac33e65e40fc82c2900ef09b0a6cccd4e Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 01:38:28 +0000 Subject: [PATCH 33/41] process_level --- faiss/impl/PanoramaPQ.h | 2 +- faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp | 4 ++-- faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp | 2 +- 
faiss/impl/panorama_kernels/panorama_kernels-generic.cpp | 2 +- faiss/impl/panorama_kernels/panorama_kernels.h | 4 ++-- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h index a3901db795..b94623cb5e 100644 --- a/faiss/impl/PanoramaPQ.h +++ b/faiss/impl/PanoramaPQ.h @@ -180,7 +180,7 @@ struct PanoramaPQ : Panorama { bitset.data(), codes_level); - panorama_kernels::process_chunks( + panorama_kernels::process_level( ls, bs, na, diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp index a580cc8a29..a10a3f3c72 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp @@ -6,7 +6,7 @@ */ // AVX2 implementations of Panorama kernels. -// Uses 256-bit gather for process_chunks, scalar filtering (no +// Uses 256-bit gather for process_level, scalar filtering (no // compress instruction in AVX2), and BMI2 PEXT/PDEP for code // compression where available. 
@@ -22,7 +22,7 @@ namespace faiss { namespace panorama_kernels { -void process_chunks( +void process_level( size_t level_width_bytes, size_t max_batch_size, size_t num_active, diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp index 139b0e8867..8181327c14 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -16,7 +16,7 @@ namespace faiss { namespace panorama_kernels { -void process_chunks( +void process_level( size_t level_width_bytes, size_t max_batch_size, size_t num_active, diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp index 2c64fd22db..664485f5f9 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp @@ -21,7 +21,7 @@ namespace faiss { namespace panorama_kernels { -void process_chunks( +void process_level( size_t level_width_bytes, size_t max_batch_size, size_t num_active, diff --git a/faiss/impl/panorama_kernels/panorama_kernels.h b/faiss/impl/panorama_kernels/panorama_kernels.h index 1ff74086e6..0bbcad0eef 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels.h +++ b/faiss/impl/panorama_kernels/panorama_kernels.h @@ -12,7 +12,7 @@ * @brief Panorama search kernels with scalar and AVX-512 implementations. * * The three core kernels of the Panorama progressive filtering search: - * - process_chunks: accumulate PQ distance table lookups over chunks + * - process_level: accumulate PQ distance table lookups over chunks * - process_filtering: Cauchy-Schwarz lower bound pruning with stream * compaction * - process_code_compression: byte-level stream compaction of PQ codes @@ -34,7 +34,7 @@ namespace panorama_kernels { /// accumulates into `exact_distances[i]` for all active elements. /// Iterates chunks first to keep the LUT slice in L1 cache. 
/// The AVX-512 version unrolls 4 chunks at a time. -void process_chunks( +void process_level( size_t level_width_bytes, size_t max_batch_size, size_t num_active, From a624893d93567ca2e921a633c14e7a59706e4f1b Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 01:44:59 +0000 Subject: [PATCH 34/41] for now --- faiss/IndexIVFPQPanorama.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h index 9d6ef0dd3b..8bcc97b624 100644 --- a/faiss/IndexIVFPQPanorama.h +++ b/faiss/IndexIVFPQPanorama.h @@ -45,7 +45,7 @@ namespace faiss { /// search using the precomputed_table (no extra per-point storage). /// /// CONSTRAINTS: -/// - Only L2 metric is supported. +/// - Only L2 metric is supported (for now). /// - Only 8-bit PQ codes (nbits_per_idx == 8). /// - M must be divisible by n_levels. /// - batch_size must be a multiple of 64. From d7070225fda006beb638e7fe035a59c1f3145053 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 02:06:06 +0000 Subject: [PATCH 35/41] Reorder unrolled AVX512 instructions --- .../panorama_kernels-avx512.cpp | 23 +++++++++---------- 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp index 8181327c14..6d22358153 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -44,28 +44,27 @@ void process_level( __m128i comp0 = _mm_loadu_si128( (__m128i*)(compressed_codes + byte_offset0 + batch_idx)); + __m128i comp1 = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset1 + batch_idx)); + __m128i comp2 = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset2 + batch_idx)); + __m128i comp3 = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset3 + batch_idx)); + __m512i codes0 = _mm512_cvtepu8_epi32(comp0); + __m512i codes1 = 
_mm512_cvtepu8_epi32(comp1); + __m512i codes2 = _mm512_cvtepu8_epi32(comp2); + __m512i codes3 = _mm512_cvtepu8_epi32(comp3); + acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes0, sim_table0, sizeof(float))); - - __m128i comp1 = _mm_loadu_si128( - (__m128i*)(compressed_codes + byte_offset1 + batch_idx)); - __m512i codes1 = _mm512_cvtepu8_epi32(comp1); acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes1, sim_table1, sizeof(float))); - - __m128i comp2 = _mm_loadu_si128( - (__m128i*)(compressed_codes + byte_offset2 + batch_idx)); - __m512i codes2 = _mm512_cvtepu8_epi32(comp2); acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes2, sim_table2, sizeof(float))); - - __m128i comp3 = _mm_loadu_si128( - (__m128i*)(compressed_codes + byte_offset3 + batch_idx)); - __m512i codes3 = _mm512_cvtepu8_epi32(comp3); acc = _mm512_add_ps( acc, _mm512_i32gather_ps(codes3, sim_table3, sizeof(float))); From cc59df9da52902210486244b232558d5a249b51a Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 02:22:50 +0000 Subject: [PATCH 36/41] Remove redundant unrolling --- .../panorama_kernels-avx2.cpp | 38 +------------------ .../panorama_kernels-avx512.cpp | 4 +- 2 files changed, 3 insertions(+), 39 deletions(-) diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp index a10a3f3c72..2089ded936 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp @@ -44,43 +44,7 @@ void process_level( float* sim_table2 = sim_table + (byte_idx + 2) * 256; float* sim_table3 = sim_table + (byte_idx + 3) * 256; - size_t batch_idx = 0; - for (; batch_idx + 7 < num_active; batch_idx += 8) { - __m256 acc = _mm256_loadu_ps(exact_distances + batch_idx); - - // Load 8 byte codes, zero-extend to 32-bit indices. 
- __m128i raw0 = _mm_loadl_epi64( - (__m128i*)(compressed_codes + byte_offset0 + batch_idx)); - __m256i codes0 = _mm256_cvtepu8_epi32(raw0); - acc = _mm256_add_ps( - acc, - _mm256_i32gather_ps(sim_table0, codes0, sizeof(float))); - - __m128i raw1 = _mm_loadl_epi64( - (__m128i*)(compressed_codes + byte_offset1 + batch_idx)); - __m256i codes1 = _mm256_cvtepu8_epi32(raw1); - acc = _mm256_add_ps( - acc, - _mm256_i32gather_ps(sim_table1, codes1, sizeof(float))); - - __m128i raw2 = _mm_loadl_epi64( - (__m128i*)(compressed_codes + byte_offset2 + batch_idx)); - __m256i codes2 = _mm256_cvtepu8_epi32(raw2); - acc = _mm256_add_ps( - acc, - _mm256_i32gather_ps(sim_table2, codes2, sizeof(float))); - - __m128i raw3 = _mm_loadl_epi64( - (__m128i*)(compressed_codes + byte_offset3 + batch_idx)); - __m256i codes3 = _mm256_cvtepu8_epi32(raw3); - acc = _mm256_add_ps( - acc, - _mm256_i32gather_ps(sim_table3, codes3, sizeof(float))); - - _mm256_storeu_ps(exact_distances + batch_idx, acc); - } - - for (; batch_idx < num_active; batch_idx += 1) { + for (size_t batch_idx = 0; batch_idx < num_active; batch_idx++) { float acc = exact_distances[batch_idx]; acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]]; acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]]; diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp index 6d22358153..e8976378f3 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -72,7 +72,7 @@ void process_level( _mm512_storeu_ps(exact_distances + batch_idx, acc); } - for (; batch_idx < num_active; batch_idx += 1) { + for (; batch_idx < num_active; batch_idx++) { float acc = exact_distances[batch_idx]; acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]]; acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]]; @@ -98,7 +98,7 @@ void process_level( _mm512_storeu_ps(exact_distances + 
batch_idx, acc); } - for (; batch_idx < num_active; batch_idx += 1) { + for (; batch_idx < num_active; batch_idx++) { exact_distances[batch_idx] += sim_table_ptr[compressed_codes[byte_offset + batch_idx]]; } From 447981030ab89d32869afa9ace423d759977dd74 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 02:34:10 +0000 Subject: [PATCH 37/41] Revert "Remove redundant unrolling" This reverts commit cc59df9da52902210486244b232558d5a249b51a. --- .../panorama_kernels-avx2.cpp | 38 ++++++++++++++++++- .../panorama_kernels-avx512.cpp | 4 +- 2 files changed, 39 insertions(+), 3 deletions(-) diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp index 2089ded936..a10a3f3c72 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp @@ -44,7 +44,43 @@ void process_level( float* sim_table2 = sim_table + (byte_idx + 2) * 256; float* sim_table3 = sim_table + (byte_idx + 3) * 256; - for (size_t batch_idx = 0; batch_idx < num_active; batch_idx++) { + size_t batch_idx = 0; + for (; batch_idx + 7 < num_active; batch_idx += 8) { + __m256 acc = _mm256_loadu_ps(exact_distances + batch_idx); + + // Load 8 byte codes, zero-extend to 32-bit indices. 
+ __m128i raw0 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset0 + batch_idx)); + __m256i codes0 = _mm256_cvtepu8_epi32(raw0); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table0, codes0, sizeof(float))); + + __m128i raw1 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset1 + batch_idx)); + __m256i codes1 = _mm256_cvtepu8_epi32(raw1); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table1, codes1, sizeof(float))); + + __m128i raw2 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset2 + batch_idx)); + __m256i codes2 = _mm256_cvtepu8_epi32(raw2); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table2, codes2, sizeof(float))); + + __m128i raw3 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset3 + batch_idx)); + __m256i codes3 = _mm256_cvtepu8_epi32(raw3); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table3, codes3, sizeof(float))); + + _mm256_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; batch_idx < num_active; batch_idx += 1) { float acc = exact_distances[batch_idx]; acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]]; acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]]; diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp index e8976378f3..6d22358153 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -72,7 +72,7 @@ void process_level( _mm512_storeu_ps(exact_distances + batch_idx, acc); } - for (; batch_idx < num_active; batch_idx++) { + for (; batch_idx < num_active; batch_idx += 1) { float acc = exact_distances[batch_idx]; acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]]; acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]]; @@ -98,7 +98,7 @@ void process_level( _mm512_storeu_ps(exact_distances + batch_idx, acc); } - for (; batch_idx < num_active; batch_idx++) { + for (; 
batch_idx < num_active; batch_idx += 1) { exact_distances[batch_idx] += sim_table_ptr[compressed_codes[byte_offset + batch_idx]]; } From 7db3f1ec8fe56dc25e2d623daca61b99af15afaa Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 02:35:15 +0000 Subject: [PATCH 38/41] Clean --- faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp | 4 ++-- faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp index a10a3f3c72..a45446d545 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp @@ -80,7 +80,7 @@ void process_level( _mm256_storeu_ps(exact_distances + batch_idx, acc); } - for (; batch_idx < num_active; batch_idx += 1) { + for (; batch_idx < num_active; batch_idx++) { float acc = exact_distances[batch_idx]; acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]]; acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]]; @@ -106,7 +106,7 @@ void process_level( _mm256_storeu_ps(exact_distances + batch_idx, acc); } - for (; batch_idx < num_active; batch_idx += 1) { + for (; batch_idx < num_active; batch_idx++) { exact_distances[batch_idx] += sim_table_ptr[compressed_codes[byte_offset + batch_idx]]; } diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp index 6d22358153..e8976378f3 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -72,7 +72,7 @@ void process_level( _mm512_storeu_ps(exact_distances + batch_idx, acc); } - for (; batch_idx < num_active; batch_idx += 1) { + for (; batch_idx < num_active; batch_idx++) { float acc = exact_distances[batch_idx]; acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]]; acc += 
sim_table1[compressed_codes[byte_offset1 + batch_idx]]; @@ -98,7 +98,7 @@ void process_level( _mm512_storeu_ps(exact_distances + batch_idx, acc); } - for (; batch_idx < num_active; batch_idx += 1) { + for (; batch_idx < num_active; batch_idx++) { exact_distances[batch_idx] += sim_table_ptr[compressed_codes[byte_offset + batch_idx]]; } From 9b8ac6ecfadb2e5c76db5ba4258be748e58d171f Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 17:00:51 +0000 Subject: [PATCH 39/41] Fix build --- faiss/impl/panorama_kernels/panorama_kernels-generic.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp index 664485f5f9..3a1f592f49 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp @@ -11,6 +11,7 @@ #if !defined(COMPILE_SIMD_AVX2) && !defined(COMPILE_SIMD_AVX512) #include +#include #include From 53835d58e5fb5cd50bac66be622fa27ce92316c2 Mon Sep 17 00:00:00 2001 From: Akash Nayar Date: Sun, 22 Mar 2026 18:23:15 +0000 Subject: [PATCH 40/41] SIMD refactor and add NEON / SVE stubs --- faiss/CMakeLists.txt | 3 + .../panorama_kernels-avx2.cpp | 60 +++------- .../panorama_kernels-avx512.cpp | 77 ++++++++----- .../panorama_kernels-generic.cpp | 108 ++++++++++++------ .../panorama_kernels/panorama_kernels-inl.h | 23 ++++ .../panorama_kernels-neon.cpp | 58 ++++++++++ .../panorama_kernels/panorama_kernels-sve.cpp | 58 ++++++++++ .../impl/panorama_kernels/panorama_kernels.h | 38 ++++-- 8 files changed, 308 insertions(+), 117 deletions(-) create mode 100644 faiss/impl/panorama_kernels/panorama_kernels-inl.h create mode 100644 faiss/impl/panorama_kernels/panorama_kernels-neon.cpp create mode 100644 faiss/impl/panorama_kernels/panorama_kernels-sve.cpp diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index 5e752d9d3b..354a2ef883 100644 --- a/faiss/CMakeLists.txt +++ 
b/faiss/CMakeLists.txt @@ -25,11 +25,13 @@ set(FAISS_SIMD_AVX512_SRC ) set(FAISS_SIMD_NEON_SRC impl/fast_scan/impl-neon.cpp + impl/panorama_kernels/panorama_kernels-neon.cpp impl/scalar_quantizer/sq-neon.cpp impl/approx_topk/neon.cpp utils/simd_impl/distances_aarch64.cpp ) set(FAISS_SIMD_SVE_SRC + impl/panorama_kernels/panorama_kernels-sve.cpp impl/pq_code_distance/pq_code_distance-sve.cpp utils/simd_impl/distances_arm_sve.cpp ) @@ -286,6 +288,7 @@ set(FAISS_HEADERS impl/zerocopy_io.h utils/pq_code_distance.h impl/panorama_kernels/panorama_kernels.h + impl/panorama_kernels/panorama_kernels-inl.h impl/pq_code_distance/pq_code_distance-inl.h invlists/BlockInvertedLists.h invlists/DirectMap.h diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp index a45446d545..070475b067 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp @@ -11,18 +11,19 @@ // compression where available. #ifdef COMPILE_SIMD_AVX2 -#ifndef COMPILE_SIMD_AVX512 #include -#include +#include #include namespace faiss { namespace panorama_kernels { -void process_level( +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +void process_level_impl( size_t level_width_bytes, size_t max_batch_size, size_t num_active, @@ -51,28 +52,27 @@ void process_level( // Load 8 byte codes, zero-extend to 32-bit indices. 
__m128i raw0 = _mm_loadl_epi64( (__m128i*)(compressed_codes + byte_offset0 + batch_idx)); + __m128i raw1 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset1 + batch_idx)); + __m128i raw2 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset2 + batch_idx)); + __m128i raw3 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset3 + batch_idx)); + __m256i codes0 = _mm256_cvtepu8_epi32(raw0); + __m256i codes1 = _mm256_cvtepu8_epi32(raw1); + __m256i codes2 = _mm256_cvtepu8_epi32(raw2); + __m256i codes3 = _mm256_cvtepu8_epi32(raw3); + acc = _mm256_add_ps( acc, _mm256_i32gather_ps(sim_table0, codes0, sizeof(float))); - - __m128i raw1 = _mm_loadl_epi64( - (__m128i*)(compressed_codes + byte_offset1 + batch_idx)); - __m256i codes1 = _mm256_cvtepu8_epi32(raw1); acc = _mm256_add_ps( acc, _mm256_i32gather_ps(sim_table1, codes1, sizeof(float))); - - __m128i raw2 = _mm_loadl_epi64( - (__m128i*)(compressed_codes + byte_offset2 + batch_idx)); - __m256i codes2 = _mm256_cvtepu8_epi32(raw2); acc = _mm256_add_ps( acc, _mm256_i32gather_ps(sim_table2, codes2, sizeof(float))); - - __m128i raw3 = _mm_loadl_epi64( - (__m128i*)(compressed_codes + byte_offset3 + batch_idx)); - __m256i codes3 = _mm256_cvtepu8_epi32(raw3); acc = _mm256_add_ps( acc, _mm256_i32gather_ps(sim_table3, codes3, sizeof(float))); @@ -113,32 +113,9 @@ void process_level( } } -size_t process_filtering( - size_t num_active, - float* exact_distances, - uint32_t* active_indices, - float* cum_sums, - uint8_t* bitset, - size_t batch_offset, - float dis0, - float query_cum_norm, - float heap_max) { - size_t next_num_active = 0; - for (size_t i = 0; i < num_active; i++) { - float exact_distance = exact_distances[i]; - float cum_sum = cum_sums[active_indices[i] - batch_offset]; - float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm; - - bool keep = heap_max > lower_bound; - active_indices[next_num_active] = active_indices[i]; - exact_distances[next_num_active] = exact_distance; - 
bitset[active_indices[i] - batch_offset] = keep; - next_num_active += keep; - } - return next_num_active; -} - -std::pair process_code_compression( +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +std::pair process_code_compression_impl( size_t next_num_active, size_t max_batch_size, size_t level_width_bytes, @@ -231,5 +208,4 @@ std::pair process_code_compression( } // namespace panorama_kernels } // namespace faiss -#endif // COMPILE_SIMD_AVX512 #endif // COMPILE_SIMD_AVX2 diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp index e8976378f3..811fb34579 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -9,14 +9,16 @@ #include -#include +#include #include namespace faiss { namespace panorama_kernels { -void process_level( +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +void process_level_impl( size_t level_width_bytes, size_t max_batch_size, size_t num_active, @@ -105,32 +107,9 @@ void process_level( } } -size_t process_filtering( - size_t num_active, - float* exact_distances, - uint32_t* active_indices, - float* cum_sums, - uint8_t* bitset, - size_t batch_offset, - float dis0, - float query_cum_norm, - float heap_max) { - size_t next_num_active = 0; - for (size_t i = 0; i < num_active; i++) { - float exact_distance = exact_distances[i]; - float cum_sum = cum_sums[active_indices[i] - batch_offset]; - float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm; - - bool keep = heap_max > lower_bound; - active_indices[next_num_active] = active_indices[i]; - exact_distances[next_num_active] = exact_distance; - bitset[active_indices[i] - batch_offset] = keep; - next_num_active += keep; - } - return next_num_active; -} - -std::pair process_code_compression( +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +std::pair 
process_code_compression_impl( size_t next_num_active, size_t max_batch_size, size_t level_width_bytes, @@ -223,6 +202,48 @@ std::pair process_code_compression( return std::make_pair(compressed_codes, num_active); } +#ifdef COMPILE_SIMD_AVX512_SPR +// AVX512_SPR: Sapphire Rapids is a superset of AVX512. Reuse the +// AVX512 implementation until a dedicated SPR specialization is written. + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +void process_level_impl( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + process_level_impl( + level_width_bytes, + max_batch_size, + num_active, + sim_table, + compressed_codes, + exact_distances); +} + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +std::pair process_code_compression_impl< + SIMDLevel::AVX512_SPR>( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + return process_code_compression_impl( + next_num_active, + max_batch_size, + level_width_bytes, + compressed_codes_begin, + bitset, + codes); +} +#endif // COMPILE_SIMD_AVX512_SPR + } // namespace panorama_kernels } // namespace faiss diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp index 3a1f592f49..9601cec46d 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp +++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp @@ -5,13 +5,15 @@ * LICENSE file in the root directory of this source tree. */ -// Scalar implementations of Panorama kernels. -// Compiled only when no SIMD variant (AVX2/AVX-512) is available. +// This TU provides: +// 1. _impl specializations for NONE, using scalar code. +// 2. 
Non-templated Panorama kernel dispatch wrappers +// (process_level, process_filtering, process_code_compression) declared +// in panorama_kernels.h. These use DISPATCH_SIMDLevel to route to the +// best available SIMD implementation via the _impl function template +// specializations defined in the per-SIMD .cpp files. -#if !defined(COMPILE_SIMD_AVX2) && !defined(COMPILE_SIMD_AVX512) - -#include -#include +#include #include @@ -22,7 +24,9 @@ namespace faiss { namespace panorama_kernels { -void process_level( +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +void process_level_impl( size_t level_width_bytes, size_t max_batch_size, size_t num_active, @@ -38,32 +42,9 @@ void process_level( } } -size_t process_filtering( - size_t num_active, - float* exact_distances, - uint32_t* active_indices, - float* cum_sums, - uint8_t* bitset, - size_t batch_offset, - float dis0, - float query_cum_norm, - float heap_max) { - size_t next_num_active = 0; - for (size_t i = 0; i < num_active; i++) { - float exact_distance = exact_distances[i]; - float cum_sum = cum_sums[active_indices[i] - batch_offset]; - float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm; - - bool keep = heap_max > lower_bound; - active_indices[next_num_active] = active_indices[i]; - exact_distances[next_num_active] = exact_distance; - bitset[active_indices[i] - batch_offset] = keep; - next_num_active += keep; - } - return next_num_active; -} - -std::pair process_code_compression( +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +std::pair process_code_compression_impl( size_t next_num_active, size_t max_batch_size, size_t level_width_bytes, @@ -145,7 +126,64 @@ std::pair process_code_compression( return std::make_pair(compressed_codes, num_active); } +void process_level( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + DISPATCH_SIMDLevel( 
+ process_level_impl, + level_width_bytes, + max_batch_size, + num_active, + sim_table, + compressed_codes, + exact_distances); +} + +size_t process_filtering( + size_t num_active, + float* exact_distances, + uint32_t* active_indices, + float* cum_sums, + uint8_t* bitset, + size_t batch_offset, + float dis0, + float query_cum_norm, + float heap_max) { + size_t next_num_active = 0; + for (size_t i = 0; i < num_active; i++) { + float exact_distance = exact_distances[i]; + float cum_sum = cum_sums[active_indices[i] - batch_offset]; + float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm; + + bool keep = heap_max > lower_bound; + active_indices[next_num_active] = active_indices[i]; + exact_distances[next_num_active] = exact_distance; + bitset[active_indices[i] - batch_offset] = keep; + next_num_active += keep; + } + return next_num_active; +} + +std::pair process_code_compression( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + DISPATCH_SIMDLevel( + process_code_compression_impl, + next_num_active, + max_batch_size, + level_width_bytes, + compressed_codes_begin, + bitset, + codes); +} + } // namespace panorama_kernels } // namespace faiss - -#endif // !COMPILE_SIMD_AVX2 && !COMPILE_SIMD_AVX512 diff --git a/faiss/impl/panorama_kernels/panorama_kernels-inl.h b/faiss/impl/panorama_kernels/panorama_kernels-inl.h new file mode 100644 index 0000000000..b1ae4316db --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels-inl.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +/** + * @file panorama_kernels-inl.h + * @brief Private header for Panorama kernel SIMD implementations. + * + * This is a PRIVATE header — do not include in public APIs or user code. 
+ * Only faiss internal .cpp files (the per-SIMD implementation files and + * panorama_kernels-generic.cpp) should include this header. + * + * This header re-exports the public API (panorama_kernels.h) plus the + * simd_dispatch.h machinery needed by the implementation files. + */ + +#include +#include diff --git a/faiss/impl/panorama_kernels/panorama_kernels-neon.cpp b/faiss/impl/panorama_kernels/panorama_kernels-neon.cpp new file mode 100644 index 0000000000..88eba0b574 --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels-neon.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// ARM NEON implementations of Panorama kernels. +// TODO(@AlSchlo, @aknayar): implement NEON-optimized panorama kernels. +// Currently delegates to the scalar (NONE) implementation. + +#ifdef COMPILE_SIMD_ARM_NEON + +#include + +namespace faiss { +namespace panorama_kernels { + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +void process_level_impl( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + process_level_impl( + level_width_bytes, + max_batch_size, + num_active, + sim_table, + compressed_codes, + exact_distances); +} + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +std::pair process_code_compression_impl( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + return process_code_compression_impl( + next_num_active, + max_batch_size, + level_width_bytes, + compressed_codes_begin, + bitset, + codes); +} + +} // namespace panorama_kernels +} // namespace faiss + +#endif // COMPILE_SIMD_ARM_NEON diff --git 
a/faiss/impl/panorama_kernels/panorama_kernels-sve.cpp b/faiss/impl/panorama_kernels/panorama_kernels-sve.cpp new file mode 100644 index 0000000000..af89f6aac9 --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels-sve.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// ARM SVE implementations of Panorama kernels. +// TODO(@AlSchlo, @aknayar): implement SVE-optimized panorama kernels. +// Currently delegates to the scalar (NONE) implementation. + +#ifdef COMPILE_SIMD_ARM_SVE + +#include + +namespace faiss { +namespace panorama_kernels { + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +void process_level_impl( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + process_level_impl( + level_width_bytes, + max_batch_size, + num_active, + sim_table, + compressed_codes, + exact_distances); +} + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +std::pair process_code_compression_impl( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + return process_code_compression_impl( + next_num_active, + max_batch_size, + level_width_bytes, + compressed_codes_begin, + bitset, + codes); +} + +} // namespace panorama_kernels +} // namespace faiss + +#endif // COMPILE_SIMD_ARM_SVE diff --git a/faiss/impl/panorama_kernels/panorama_kernels.h b/faiss/impl/panorama_kernels/panorama_kernels.h index 0bbcad0eef..3a67415523 100644 --- a/faiss/impl/panorama_kernels/panorama_kernels.h +++ b/faiss/impl/panorama_kernels/panorama_kernels.h @@ -9,32 +9,50 @@ /** * @file panorama_kernels.h - * @brief Panorama search kernels with scalar and AVX-512 
implementations. + * @brief Panorama search kernels with SIMD-dispatched implementations. * * The three core kernels of the Panorama progressive filtering search: * - process_level: accumulate PQ distance table lookups over chunks * - process_filtering: Cauchy-Schwarz lower bound pruning with stream * compaction * - process_code_compression: byte-level stream compaction of PQ codes - * - * Implementations live in panorama_kernels-generic.cpp (scalar) and - * panorama_kernels-avx512.cpp (AVX-512 gather/compress + BMI2 PEXT/PDEP). */ #include #include #include +#include +#include + namespace faiss { namespace panorama_kernels { +template +void process_level_impl( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances); + +template +std::pair process_code_compression_impl( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes); + /// Accumulate PQ distance table lookups over chunks. /// /// For each chunk, looks up `sim_table[compressed_codes[i]]` and /// accumulates into `exact_distances[i]` for all active elements. /// Iterates chunks first to keep the LUT slice in L1 cache. -/// The AVX-512 version unrolls 4 chunks at a time. -void process_level( +/// The AVX2/AVX-512 versions unroll 4 chunks at a time. +FAISS_API void process_level( size_t level_width_bytes, size_t max_batch_size, size_t num_active, @@ -48,11 +66,7 @@ void process_level( /// and removes elements that cannot improve the current heap top. /// Uses stream compaction to pack surviving elements contiguously. /// Updates the bitset to reflect which elements were removed. -/// -/// Unfortunately, AVX-512 does not support a way to scatter at a -/// 1-byte granularity, so the bitset update for removed items is -/// done sequentially after compressing the indices. 
-size_t process_filtering( +FAISS_API size_t process_filtering( size_t num_active, float* exact_distances, uint32_t* active_indices, @@ -76,7 +90,7 @@ size_t process_filtering( /// `max_batch_size`. Only the last batch may be smaller than /// `max_batch_size`, the caller ensures that the batch and /// bitset are padded with zeros. -std::pair process_code_compression( +FAISS_API std::pair process_code_compression( size_t next_num_active, size_t max_batch_size, size_t level_width_bytes, From 9c4731cb245dcf69411502e9f44b8688957a4392 Mon Sep 17 00:00:00 2001 From: zoeyeye Date: Wed, 25 Mar 2026 17:07:55 -0700 Subject: [PATCH 41/41] Update index_read.cpp solve lint error. --- faiss/impl/index_read.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index c4b645a48e..19b9f4b3f8 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -483,8 +483,8 @@ std::unique_ptr read_InvertedLists_up( READ1(code_size); READ1(n_levels); constexpr size_t bs = Panorama::kDefaultBatchSize; - FAISS_THROW_IF_NOT_FMT( - n_levels > 0, "invalid ilpn n_levels %zd", n_levels); + FAISS_THROW_IF_NOT_FMT( + n_levels > 0, "invalid ilpn n_levels %zd", n_levels); auto* pano = new PanoramaFlat(code_size / sizeof(float), n_levels, bs); auto ailp = std::make_unique( nlist, code_size, pano);