From c132cc13812b6da4d8fa12728b4613453aaf95c4 Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Sun, 15 Mar 2026 06:01:51 +0000
Subject: [PATCH 01/41] Initial commit

---
 faiss/CMakeLists.txt |   2 +
 faiss/IndexIVF.h     |  38 ++++
 faiss/IndexIVFPQ.cpp | 519 ++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 557 insertions(+), 2 deletions(-)
diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
index aeef0ed6cb..5a6c37ffbd 100644
--- a/faiss/CMakeLists.txt
+++ b/faiss/CMakeLists.txt
@@ -63,6 +63,7 @@ set(FAISS_SRC
   IndexIVFFlat.cpp
   IndexIVFFlatPanorama.cpp
   IndexIVFPQ.cpp
+  IndexIVFPQPanorama.cpp
   IndexIVFFastScan.cpp
   IndexIVFAdditiveQuantizerFastScan.cpp
   IndexIVFPQFastScan.cpp
@@ -184,6 +185,7 @@ set(FAISS_HEADERS
   IndexIVFFlat.h
   IndexIVFFlatPanorama.h
   IndexIVFPQ.h
+  IndexIVFPQPanorama.h
   IndexIVFFastScan.h
   IndexIVFAdditiveQuantizerFastScan.h
   IndexIVFPQFastScan.h
diff --git a/faiss/IndexIVF.h b/faiss/IndexIVF.h
index ef744688d6..d66523d245 100644
--- a/faiss/IndexIVF.h
+++ b/faiss/IndexIVF.h
@@ -19,9 +19,12 @@
 #include <faiss/invlists/DirectMap.h>
 #include <faiss/invlists/InvertedLists.h>
 #include <faiss/utils/Heap.h>
+#include <faiss/impl/ProductQuantizer.h>
 
 namespace faiss {
 
+struct IndexIVFPQPanorama;
+
 /** Encapsulates a quantizer object for the IndexIVF
  *
  * The class isolates the fields that are independent of the storage
@@ -497,6 +500,15 @@ struct InvertedListScanner {
     /// following codes come from this inverted list
     virtual void set_list(idx_t list_no, float coarse_dis);
 
+    /// Panorama hook (no-op here): select inverted list `list_no`. Overrides
+    /// are handed a table buffer and a slot caching the list's bias term.
+    virtual void set_list_panorama(
+            idx_t /*list_no*/, float /*coarse_dis*/, float* /*sim_table*/,
+            float* /*dis0_ptr*/, bool /*update*/) {}
+
+    /// Panorama hook (no-op here): install a look-up table and its bias.
+    virtual void set_sim_table(float* /*sim_table*/, float /*dis0*/) {}
+
     /// compute a single query-to-code distance
     virtual float distance_to_code(const uint8_t* code) const = 0;
 
@@ -553,6 +565,32 @@ struct InvertedListScanner {
             const idx_t* ids,
             ResultHandler& handler) const;
 
+    virtual size_t process_batch( // Panorama hook: scan one batch of a list
+            const ProductQuantizer& pq,
+            uint8_t* compressed_codes,
+            size_t cluster_id,
+            size_t batch_no,
+            float coarse_dis_i,
+            size_t curr_batch_size,
+            size_t max_batch_size,
+            size_t chunk_size,
+            float epsilon,
+            size_t n_levels,
+            const uint8_t* codes_batch,
+            float* cums,
+            float* query_cum_norms,
+            uint32_t* active_indices,
+            uint8_t* bitset,
+            float* exact_distances,
+            const idx_t* ids,
+            float* heap_sim,
+            idx_t* heap_ids,
+            size_t k,
+            float* dis0_cache,
+            float* sim_table_cache) {
+        return 0; // base implementation does no work and reports 0
+    }
+
     virtual ~InvertedListScanner() {}
 };
 
diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp
index a909d81db9..9e074d1c29 100644
--- a/faiss/IndexIVFPQ.cpp
+++ b/faiss/IndexIVFPQ.cpp
@@ -9,13 +9,15 @@
 
 #include <faiss/IndexIVFPQ.h>
 
+#include <immintrin.h>
+#include <algorithm>
 #include <cassert>
 #include <cinttypes>
 #include <cmath>
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
-
-#include <algorithm>
+#include <utility>
 
 #include <faiss/utils/Heap.h>
 #include <faiss/utils/distances_dispatch.h>
@@ -760,6 +762,65 @@ struct QueryTables {
 
         return dis0;
     }
+
+    /// L2 Panorama variant of the per-list table setup. Fills the caller's
+    /// buffer sim_table_ptr with -2 * sim_table_2 (pq.M * pq.ksub floats),
+    /// points sim_table at it and returns coarse_dis as the bias term.
+    /// Throws unless use_precomputed_table == 1.
+    float precompute_list_tables_L2_panorama(float* sim_table_ptr) {
+        if (use_precomputed_table != 1) {
+            FAISS_THROW_MSG(
+                    "Panorama PQ only supports use_precomputed_table == 1");
+        }
+
+        const size_t table_len = pq.M * pq.ksub;
+        const float scale = -2.0f;
+        const float* src = sim_table_2;
+        float* dst = sim_table_ptr;
+
+#ifdef __AVX512F__
+        const __m512 scale_v = _mm512_set1_ps(scale);
+        const size_t tail = table_len % 16;
+        const size_t body_end = table_len - tail;
+
+        // whole 16-float vectors
+        size_t pos = 0;
+        while (pos < body_end) {
+            const __m512 v = _mm512_loadu_ps(src + pos);
+            _mm512_storeu_ps(dst + pos, _mm512_mul_ps(scale_v, v));
+            pos += 16;
+        }
+
+        // masked remainder: lanes past table_len are neither read nor written
+        if (tail != 0) {
+            const __mmask16 tail_mask = (1 << tail) - 1;
+            const __m512 v = _mm512_maskz_loadu_ps(tail_mask, src + pos);
+            _mm512_mask_storeu_ps(
+                    dst + pos, tail_mask, _mm512_mul_ps(scale_v, v));
+        }
+#else
+        for (size_t pos = 0; pos < table_len; pos++) {
+            dst[pos] = scale * src[pos];
+        }
+#endif
+
+        sim_table = sim_table_ptr;
+        return coarse_dis;
+    }
+
+    /// Per-list table setup for Panorama; returns the bias (0 if !by_residual).
+    float precompute_list_tables_panorama(float* sim_table_ptr) {
+        float bias = 0;
+        uint64_t t0;
+        TIC;
+        if (by_residual) {
+            bias = (metric_type == METRIC_INNER_PRODUCT)
+                    ? precompute_list_tables_IP()
+                    : precompute_list_tables_L2_panorama(sim_table_ptr);
+        }
+        init_list_cycles += TOC;
+        return bias;
+    }
 };
 
 template <class C, bool use_sel>
@@ -791,6 +852,39 @@ struct WrappedSearchResult {
     }
 };
 
+/// Top-k collector over a caller-provided heap, used by the Panorama scan.
+template <class C, bool use_sel>
+struct KnnSearchResultsPanorama {
+    idx_t key;             // list number; builds the id when ids is null
+    const idx_t* ids;      // ids of the scanned entries, may be nullptr
+    const IDSelector* sel; // consulted by skip_entry() when use_sel is set
+    size_t k;
+    float* heap_sim;
+    idx_t* heap_ids;
+    size_t nup; // number of heap replacements done by add()
+
+    inline bool skip_entry(idx_t j) {
+        return use_sel && !sel->is_member(ids[j]);
+    }
+
+    inline bool should_keep(float dis) {
+        return C::cmp(heap_sim[0], dis);
+    }
+
+    inline float top() {
+        return heap_sim[0];
+    }
+
+    inline void add(idx_t j, float dis) {
+        if (!should_keep(dis)) {
+            return;
+        }
+        const idx_t id = ids ? ids[j] : lo_build(key, j);
+        heap_replace_top<C>(k, heap_sim, heap_ids, dis, id);
+        nup++;
+    }
+};
+
 /*****************************************************
  * Scaning the codes.
  * The scanning functions call their favorite precompute_*
@@ -821,6 +915,26 @@ struct IVFPQScannerT : QueryTables {
         }
     }
 
+    /// Panorama list setup. Mode 2 rebuilds table and bias only when update
+    /// is set, else reuses *dis0_ptr; mode 1 calls the table-pointer setup.
+    void init_list_panorama(
+            idx_t list_no, float coarse_dis, int mode,
+            float* sim_table, float* dis0_ptr, bool update) {
+        this->key = list_no;
+        this->coarse_dis = coarse_dis;
+        switch (mode) {
+            case 2:
+                if (update) {
+                    *dis0_ptr = precompute_list_tables_panorama(sim_table);
+                }
+                dis0 = *dis0_ptr;
+                break;
+            case 1:
+                dis0 = precompute_list_table_pointers();
+                break;
+        }
+    }
+
     /*****************************************************
      * Scaning the codes: simple PQ scan.
      *****************************************************/
@@ -1207,6 +1321,407 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
         this->init_list(list_no, coarse_dis, precompute_mode);
     }
 
+    /// InvertedListScanner hook: records list_no on the scanner and forwards
+    /// to init_list_panorama() using this scanner's precompute_mode.
+    void set_list_panorama(
+            idx_t list_no,
+            float coarse_dis,
+            float* sim_table,
+            float* dis0_ptr,
+            bool update) override {
+        this->list_no = list_no;
+
+        // the table buffer and bias slot are passed through untouched
+        const int mode = precompute_mode;
+        this->init_list_panorama(
+                list_no, coarse_dis, mode, sim_table, dis0_ptr, update);
+    }
+
+    void set_sim_table(float* sim_table, float dis0) override {
+        this->sim_table = sim_table; // pointer is stored, table is not copied
+        this->dis0 = dis0; // bias used together with that table
+    }
+
+#ifdef __AVX512F__
+    inline void process_chunks( // add every chunk's table look-up to exact_distances
+            size_t chunk_size,
+            size_t max_batch_size,
+            size_t num_active,
+            float* sim_table,
+            uint8_t* compressed_codes,
+            float* exact_distances) {
+        size_t chunk_idx = 0;
+        for (; chunk_idx + 3 < chunk_size; chunk_idx += 4) { // 4 chunks per pass
+            size_t chunk_offset0 = (chunk_idx + 0) * max_batch_size;
+            size_t chunk_offset1 = (chunk_idx + 1) * max_batch_size;
+            size_t chunk_offset2 = (chunk_idx + 2) * max_batch_size;
+            size_t chunk_offset3 = (chunk_idx + 3) * max_batch_size;
+            // every chunk reads its own 256-float slice of sim_table
+            float* sim_table0 = sim_table + (chunk_idx + 0) * 256;
+            float* sim_table1 = sim_table + (chunk_idx + 1) * 256;
+            float* sim_table2 = sim_table + (chunk_idx + 2) * 256;
+            float* sim_table3 = sim_table + (chunk_idx + 3) * 256;
+            // 16 points at a time: widen 16 code bytes to 32-bit gather indices
+            size_t batch_idx = 0;
+            for (; batch_idx + 15 < num_active; batch_idx += 16) {
+                __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx);
+
+                __m128i comp0 = _mm_loadu_si128(
+                        (__m128i*)(compressed_codes + chunk_offset0 + batch_idx));
+                __m512i codes0 = _mm512_cvtepu8_epi32(comp0);
+                acc = _mm512_add_ps(
+                        acc,
+                        _mm512_i32gather_ps(codes0, sim_table0, sizeof(float)));
+
+                __m128i comp1 = _mm_loadu_si128(
+                        (__m128i*)(compressed_codes + chunk_offset1 + batch_idx));
+                __m512i codes1 = _mm512_cvtepu8_epi32(comp1);
+                acc = _mm512_add_ps(
+                        acc,
+                        _mm512_i32gather_ps(codes1, sim_table1, sizeof(float)));
+
+                __m128i comp2 = _mm_loadu_si128(
+                        (__m128i*)(compressed_codes + chunk_offset2 + batch_idx));
+                __m512i codes2 = _mm512_cvtepu8_epi32(comp2);
+                acc = _mm512_add_ps(
+                        acc,
+                        _mm512_i32gather_ps(codes2, sim_table2, sizeof(float)));
+
+                __m128i comp3 = _mm_loadu_si128(
+                        (__m128i*)(compressed_codes + chunk_offset3 + batch_idx));
+                __m512i codes3 = _mm512_cvtepu8_epi32(comp3);
+                acc = _mm512_add_ps(
+                        acc,
+                        _mm512_i32gather_ps(codes3, sim_table3, sizeof(float)));
+
+                _mm512_storeu_ps(exact_distances + batch_idx, acc);
+            }
+            // scalar remainder: fewer than 16 points left
+            for (; batch_idx < num_active; batch_idx += 1) {
+                float acc = exact_distances[batch_idx];
+                acc += sim_table0[compressed_codes[chunk_offset0 + batch_idx]];
+                acc += sim_table1[compressed_codes[chunk_offset1 + batch_idx]];
+                acc += sim_table2[compressed_codes[chunk_offset2 + batch_idx]];
+                acc += sim_table3[compressed_codes[chunk_offset3 + batch_idx]];
+                exact_distances[batch_idx] = acc;
+            }
+        }
+        // chunks left over when chunk_size is not a multiple of 4
+        for (; chunk_idx < chunk_size; chunk_idx++) {
+            size_t chunk_offset = chunk_idx * max_batch_size;
+            float* sim_table_ptr = sim_table + chunk_idx * 256;
+
+            size_t batch_idx = 0;
+            for (; batch_idx + 15 < num_active; batch_idx += 16) {
+                __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx);
+                __m128i comp = _mm_loadu_si128(
+                        (__m128i*)(compressed_codes + chunk_offset + batch_idx));
+                __m512i codes = _mm512_cvtepu8_epi32(comp);
+                __m512 m_dist = _mm512_i32gather_ps(
+                        codes, sim_table_ptr, sizeof(float));
+                acc = _mm512_add_ps(acc, m_dist);
+                _mm512_storeu_ps(exact_distances + batch_idx, acc);
+            }
+            // scalar remainder
+            for (; batch_idx < num_active; batch_idx += 1) {
+                exact_distances[batch_idx] += sim_table_ptr
+                        [compressed_codes[chunk_offset + batch_idx]];
+            }
+        }
+    }
+
+    inline size_t process_filtering( // keeps points whose lower bound is below heap_max; returns how many
+            size_t num_active,
+            float* exact_distances,
+            uint32_t* active_indices,
+            __m512i batch_offset_broadcast,
+            float* cum_sums,
+            __m512 dis0_broadcast,
+            __m512 query_cum_norm_broadcast,
+            __m512 epsilon_broadcast,
+            __m512 heap_max_broadcast,
+            uint8_t* bitset,
+            size_t batch_offset,
+            float dis0,
+            float query_cum_norm,
+            float epsilon,
+            float heap_max) {
+        size_t next_num_active = 0;
+        size_t batch_idx = 0;
+        // vector path: 16 candidates per iteration
+        for (; batch_idx + 15 < num_active; batch_idx += 16) {
+            __m512 exact_distances_batch =
+                    _mm512_loadu_ps(exact_distances + batch_idx);
+
+            __m512i active_indices_batch =
+                    _mm512_loadu_si512(active_indices + batch_idx);
+            __m512i offsetted_active_indices_batch = _mm512_sub_epi32(
+                    active_indices_batch, batch_offset_broadcast);
+            __m512 cum_sums_batch = _mm512_i32gather_ps(
+                    offsetted_active_indices_batch, cum_sums, sizeof(float));
+            // lower bound = exact + dis0 - epsilon * query_cum_norm * cum_sum
+            __m512 exact_distances_batch_dis0 =
+                    _mm512_add_ps(exact_distances_batch, dis0_broadcast);
+            __m512 cauchy_schwarz_bound =
+                    _mm512_mul_ps(query_cum_norm_broadcast, cum_sums_batch);
+            cauchy_schwarz_bound =
+                    _mm512_mul_ps(cauchy_schwarz_bound, epsilon_broadcast);
+
+            __m512 lower_bound = _mm512_sub_ps(
+                    exact_distances_batch_dis0, cauchy_schwarz_bound);
+            __mmask16 mask_should_keep = _mm512_cmp_ps_mask(
+                    lower_bound, heap_max_broadcast, _CMP_LT_OQ);
+            // pack the kept entries to the front of both arrays, in place
+            __m512i compressed_active_indices_vec = _mm512_mask_compress_epi32(
+                    _mm512_setzero_si512(),
+                    mask_should_keep,
+                    active_indices_batch);
+            _mm512_storeu_si512(
+                    active_indices + next_num_active,
+                    compressed_active_indices_vec);
+
+            __m512 compressed_exact_distances_vec = _mm512_mask_compress_ps(
+                    _mm512_setzero_ps(),
+                    mask_should_keep,
+                    exact_distances_batch);
+            _mm512_storeu_ps(
+                    exact_distances + next_num_active,
+                    compressed_exact_distances_vec);
+            // clear the bitset byte of every rejected point
+            alignas(64) uint32_t indices_to_remove[16];
+            __mmask16 mask_should_remove = ~mask_should_keep;
+            size_t num_to_remove = _mm_popcnt_u32(mask_should_remove);
+
+            __m512i compressed_indices_to_remove_vec =
+                    _mm512_mask_compress_epi32(
+                            _mm512_setzero_si512(),
+                            mask_should_remove,
+                            active_indices_batch);
+            _mm512_storeu_si512(
+                    indices_to_remove, compressed_indices_to_remove_vec);
+
+            for (size_t idx = 0; idx < num_to_remove; idx++) {
+                bitset[indices_to_remove[idx] - batch_offset] = 0;
+            }
+
+            next_num_active += _mm_popcnt_u32(mask_should_keep);
+        }
+        // scalar tail: same test, one candidate at a time
+        for (; batch_idx < num_active; batch_idx++) {
+            float exact_distance = exact_distances[batch_idx];
+
+            float cum_sum = cum_sums[active_indices[batch_idx] - batch_offset];
+            float cauchy_schwarz_bound = cum_sum * query_cum_norm;
+            float lower_bound =
+                    exact_distance - cauchy_schwarz_bound * epsilon + dis0;
+
+            uint32_t should_keep = heap_max > lower_bound;
+            active_indices[next_num_active] = active_indices[batch_idx]; // written unconditionally;
+            exact_distances[next_num_active] = exact_distance; // only counted when kept
+
+            bitset[active_indices[batch_idx] - batch_offset] = should_keep;
+
+            next_num_active += should_keep;
+        }
+
+        return next_num_active;
+    }
+
+    /// Packs the codes of the still-active points of a batch densely into
+    /// compressed_codes_begin, keeping the chunk-major layout (stride
+    /// max_batch_size per chunk). bitset holds one byte per point, non-zero
+    /// meaning active. Returns {codes to scan, number of points}; when
+    /// next_num_active >= max_batch_size the input codes are returned as-is.
+    /// The level argument is not used.
+    inline std::pair<uint8_t*, size_t> process_code_compression(
+            size_t level,
+            size_t next_num_active,
+            size_t max_batch_size,
+            size_t chunk_size,
+            uint8_t* compressed_codes_begin,
+            uint8_t* bitset,
+            const uint8_t* codes) {
+        if (next_num_active >= max_batch_size) {
+            return std::make_pair(
+                    const_cast<uint8_t*>(codes), next_num_active);
+        }
+
+        size_t n_packed = 0;
+        // 64 flag bytes are tested per iteration
+        for (size_t base = 0; base < max_batch_size; base += 64) {
+            const __m512i flags = _mm512_loadu_si512(bitset + base);
+            const __mmask64 live =
+                    _mm512_cmpneq_epi8_mask(flags, _mm512_setzero_si512());
+
+            for (size_t ci = 0; ci < chunk_size; ci++) {
+                const uint8_t* src = codes + ci * max_batch_size + base;
+                uint8_t* dst =
+                        compressed_codes_begin + ci * max_batch_size + n_packed;
+                // visit the set bits from lowest to highest
+                for (uint64_t m = (uint64_t)live; m != 0; m &= m - 1) {
+                    *dst++ = src[__builtin_ctzll(m)];
+                }
+            }
+
+            n_packed += _mm_popcnt_u64(live);
+        }
+
+        return std::make_pair(compressed_codes_begin, n_packed);
+    }
+#endif // __AVX512F__
+
+    /// Scalar accumulation for the active points, reading uncompressed codes.
+    inline void process_chunks_sparse(
+            size_t chunk_size,
+            size_t max_batch_size,
+            size_t num_active,
+            float* sim_table,
+            const uint8_t* codes,
+            float* exact_distances,
+            uint32_t* active_indices,
+            size_t batch_offset,
+            size_t ksub) {
+        const uint8_t* column = codes;
+        const float* table = sim_table;
+        for (size_t ci = 0; ci < chunk_size;
+             ci++, column += max_batch_size, table += ksub) {
+            for (size_t i = 0; i < num_active; i++) {
+                const size_t slot = active_indices[i] - batch_offset;
+                exact_distances[i] += table[column[slot]];
+            }
+        }
+    }
+
+#ifdef __AVX512F__
+    size_t process_batch( // level-by-level scan of one batch; returns candidates summed over levels
+            const ProductQuantizer& pq,
+            uint8_t* compressed_codes,
+            size_t cluster_id,
+            size_t batch_no,
+            float coarse_dis_i,
+            size_t curr_batch_size,
+            size_t max_batch_size,
+            size_t chunk_size,
+            float epsilon,
+            size_t n_levels,
+            const uint8_t* codes_batch,
+            float* cums,
+            float* query_cum_norms,
+            uint32_t* active_indices,
+            uint8_t* bitset,
+            float* exact_distances,
+            const idx_t* ids,
+            float* heap_sim,
+            idx_t* heap_ids,
+            size_t k,
+            float* dis0_cache,
+            float* sim_table_cache) override {
+        KnnSearchResultsPanorama<C, use_sel> res = {
+                this->key,
+                this->store_pairs ? nullptr : ids,
+                this->sel,
+                k,
+                heap_sim,
+                heap_ids,
+                0};
+        uint8_t* compressed_codes_begin = compressed_codes;
+        size_t total_active = 0;
+        __m512 epsilon_broadcast = _mm512_set1_ps(epsilon);
+        // all curr_batch_size points are candidates before level 0
+        size_t next_num_active = curr_batch_size;
+        float dis0 = 0;
+        size_t batch_offset = batch_no * max_batch_size;
+        __m512i batch_offset_broadcast = _mm512_set1_epi32(batch_offset);
+        for (size_t level = 0; (level < n_levels) && (next_num_active > 0);
+             level++) {
+            total_active += next_num_active;
+            // table slice of this level; update is requested only for level 0 of batch 0
+            size_t level_offset_sim_table = level * pq.ksub * chunk_size;
+            this->set_list_panorama(
+                    cluster_id,
+                    coarse_dis_i,
+                    sim_table_cache + level_offset_sim_table,
+                    dis0_cache,
+                    level == 0 && batch_no == 0);
+            this->set_sim_table(
+                    sim_table_cache + level_offset_sim_table, *dis0_cache);
+
+            dis0 = this->dis0;
+            __m512 dis0_bcast = _mm512_set1_ps(dis0);
+
+            float query_cum_norm = 2 * query_cum_norms[level + 1];
+            __m512 query_cum_norm_broadcast = _mm512_set1_ps(query_cum_norm);
+            // the heap top is sampled once per level
+            float heap_max = res.top();
+            __m512 heap_max_broadcast = _mm512_set1_ps(heap_max);
+
+            float* cum_sums = cums + curr_batch_size * level;
+            const uint8_t* codes =
+                    codes_batch + max_batch_size * chunk_size * level;
+            // below 1/16 of a full batch, index the codes directly instead of packing them
+            bool is_sparse = next_num_active < max_batch_size / 16;
+            float* sim_table = this->sim_table;
+
+            size_t num_active_for_filtering = 0;
+            if (is_sparse) {
+                process_chunks_sparse(
+                        chunk_size,
+                        max_batch_size,
+                        next_num_active,
+                        sim_table,
+                        codes,
+                        exact_distances,
+                        active_indices,
+                        batch_offset,
+                        pq.ksub);
+                num_active_for_filtering = next_num_active;
+            } else {
+                auto [cc, na] = process_code_compression(
+                        level,
+                        next_num_active,
+                        max_batch_size,
+                        chunk_size,
+                        compressed_codes_begin,
+                        bitset,
+                        codes);
+
+                process_chunks(
+                        chunk_size,
+                        max_batch_size,
+                        na,
+                        sim_table,
+                        cc,
+                        exact_distances);
+                num_active_for_filtering = na;
+            }
+
+            next_num_active = process_filtering(
+                    num_active_for_filtering,
+                    exact_distances,
+                    active_indices,
+                    batch_offset_broadcast,
+                    cum_sums,
+                    dis0_bcast,
+                    query_cum_norm_broadcast,
+                    epsilon_broadcast,
+                    heap_max_broadcast,
+                    bitset,
+                    batch_offset,
+                    dis0,
+                    query_cum_norm,
+                    epsilon,
+                    heap_max);
+        }
+        // whatever is still active after the last level is offered to the heap
+        for (size_t batch_idx = 0; batch_idx < next_num_active; batch_idx++) {
+            res.add(active_indices[batch_idx],
+                    dis0 + exact_distances[batch_idx]);
+        }
+
+        return total_active;
+    }
+#endif // __AVX512F__
+
     float distance_to_code(const uint8_t* code) const override {
         assert(precompute_mode == 2);
         float dis = this->dis0 +

From bb842d0b2f7f090eaf64152da6739003c3358276 Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Sun, 15 Mar 2026 06:02:15 +0000
Subject: [PATCH 02/41] Initial commit

---
 faiss/IndexIVFPQPanorama.cpp | 509 +++++++++++++++++++++++++++++++++++
 faiss/IndexIVFPQPanorama.h   |  70 +++++
 2 files changed, 579 insertions(+)
 create mode 100644 faiss/IndexIVFPQPanorama.cpp
 create mode 100644 faiss/IndexIVFPQPanorama.h

diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp
new file mode 100644
index 0000000000..ba54da4cb4
--- /dev/null
+++ b/faiss/IndexIVFPQPanorama.cpp
@@ -0,0 +1,509 @@
+#include <faiss/IndexIVFPQPanorama.h>
+#include <omp.h>
+#include <cstdint>
+#include <memory>
+#include <mutex>
+
+#include <algorithm>
+#include <cinttypes>
+#include <cstdio>
+#include <iostream>
+#include <limits>
+#include <numeric>
+
+#include <faiss/utils/hamming.h>
+#include <faiss/utils/utils.h>
+
+#include <faiss/IndexFlat.h>
+#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/CodePacker.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/IDSelector.h>
+
+namespace faiss {
+
+static uint64_t total_active = 0; // NOTE(review): no use visible in this part of the file - confirm it is needed
+static uint64_t total_points = 0; // NOTE(review): file-scope mutable state - check thread-safety where it is updated
+
+/// Builds an IVFPQ index with Panorama level-wise scanning parameters.
+/// Requires n_levels > 0 dividing M, 8-bit codes, batch_size a positive
+/// multiple of 64 and the L2 metric; throws FaissException otherwise.
+IndexIVFPQPanorama::IndexIVFPQPanorama(
+        Index* quantizer,
+        size_t d,
+        size_t nlist,
+        size_t M,
+        size_t nbits_per_idx,
+        int n_levels,
+        float epsilon,
+        size_t batch_size,
+        MetricType metric,
+        bool own_invlists)
+        : IndexIVFPQ(
+                  quantizer,
+                  d,
+                  nlist,
+                  M,
+                  nbits_per_idx,
+                  metric,
+                  own_invlists),
+          n_levels(n_levels),
+          added(false),
+          // max() keeps the divisions defined until n_levels is checked below
+          chunk_size(code_size / std::max(n_levels, 1)),
+          levels_size(d / std::max(n_levels, 1)),
+          nbits_per_idx(nbits_per_idx),
+          m_level_width(M / std::max(n_levels, 1)),
+          epsilon(epsilon),
+          batch_size(batch_size) {
+    FAISS_THROW_IF_NOT_MSG(n_levels > 0, "n_levels must be positive");
+    FAISS_THROW_IF_NOT_MSG(
+            M > 0 && M % n_levels == 0, "M must be a multiple of n_levels");
+    FAISS_THROW_IF_NOT_MSG(
+            batch_size > 0 && batch_size % 64 == 0,
+            "batch_size must be a positive multiple of 64");
+    FAISS_THROW_IF_NOT_MSG(nbits_per_idx == 8, "only 8-bit codes supported");
+    FAISS_THROW_IF_NOT_MSG(M == code_size, "code_size must equal M");
+    FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only METRIC_L2 supported");
+}
+
+/// Adds the vectors via IndexIVFPQ::add, then builds the Panorama side
+/// structures from the inverted lists: a batch-wise column-major copy of the
+/// PQ codes, per-level suffix norms of the decoded vectors and a per-point
+/// initial distance term. May only be called once (asserts on !added).
+void IndexIVFPQPanorama::add(idx_t n, const float* x) {
+    FAISS_ASSERT(!added);
+    added = true;
+
+    num_points = n;
+    IndexIVFPQ::add(n, x);
+
+    // Byte offset of each list in column_storage; every list is padded to a
+    // whole number of batches.
+    size_t total_bytes = 0;
+    column_offsets = new size_t[nlist];
+    for (size_t list_no = 0; list_no < nlist; list_no++) {
+        column_offsets[list_no] = total_bytes;
+        const size_t n_batches =
+                (invlists->list_size(list_no) + batch_size - 1) / batch_size;
+        total_bytes += n_batches * batch_size * code_size;
+    }
+
+    // total_bytes already counts bytes (it includes the code_size factor).
+    column_storage = new uint8_t[total_bytes];
+
+    for (size_t list_no = 0; list_no < nlist; list_no++) {
+        const size_t list_size = invlists->list_size(list_no);
+        const size_t n_batches = (list_size + batch_size - 1) / batch_size;
+        // fetched once per list and released when the scope ends
+        InvertedLists::ScopedCodes list_codes(invlists, list_no);
+        const uint8_t* codes = list_codes.get();
+
+        for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
+            const size_t batch_offset = batch_no * batch_size * code_size;
+            const size_t curr_batch_size =
+                    std::min(list_size - batch_no * batch_size, batch_size);
+            uint8_t* batch_dest =
+                    column_storage + column_offsets[list_no] + batch_offset;
+
+            for (size_t m = 0; m < pq.M; m++) {
+                uint8_t* column = batch_dest + m * batch_size;
+                // row-major (point, m) -> column-major (m, point); the
+                // padding slots of a partial batch are zero-filled
+                for (size_t point_idx = 0; point_idx < batch_size;
+                     point_idx++) {
+                    column[point_idx] = point_idx < curr_batch_size
+                            ? codes[batch_offset + point_idx * code_size + m]
+                            : 0;
+                }
+            }
+        }
+    }
+
+    cum_sums = new float[(n_levels + 1) * n];
+    cum_sum_offsets = new size_t[nlist];
+
+    init_exact_distances = new float[n];
+    init_exact_distances_offsets = new size_t[nlist];
+
+    size_t cum_size = 0;
+    size_t init_size = 0;
+    for (size_t list_no = 0; list_no < nlist; list_no++) {
+        cum_sum_offsets[list_no] = cum_size;
+        cum_size += invlists->list_size(list_no) * (n_levels + 1);
+
+        init_exact_distances_offsets[list_no] = init_size;
+        init_size += invlists->list_size(list_no);
+    }
+
+    // scratch buffers, allocated once and reused for every list / point
+    std::vector<float> centroid(d);
+    std::vector<float> decoded(d);
+    std::vector<float> suffix_sums(d + 1);
+    suffix_sums[d] = 0.0f;
+
+    for (size_t list_no = 0; list_no < nlist; list_no++) {
+        const size_t list_size = invlists->list_size(list_no);
+        const size_t n_batches = (list_size + batch_size - 1) / batch_size;
+        quantizer->reconstruct(list_no, centroid.data());
+
+        for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
+            const size_t b_offset = batch_no * batch_size;
+            const size_t curr_batch_size =
+                    std::min(list_size - b_offset, batch_size);
+
+            for (size_t point_idx = 0; point_idx < curr_batch_size;
+                 point_idx++) {
+                InvertedLists::ScopedCodes code(
+                        invlists, list_no, b_offset + point_idx);
+                pq.decode(code.get(), decoded.data());
+
+                // walk the dimensions backwards so that suffix_sums[j] is
+                // the squared norm of decoded[j..d)
+                float init_exact_distance = 0.0f;
+                for (int j = d - 1; j >= 0; j--) {
+                    const float sq = decoded[j] * decoded[j];
+                    init_exact_distance += sq + 2 * decoded[j] * centroid[j];
+                    suffix_sums[j] = suffix_sums[j + 1] + sq;
+                }
+
+                // level-major within the batch: entry (level, point); the
+                // extra level n_levels is always zero
+                float* point_cums = cum_sums + cum_sum_offsets[list_no] +
+                        b_offset * (n_levels + 1) + point_idx;
+                for (int level = 0; level < n_levels; level++) {
+                    const int start_idx = level * levels_size;
+                    point_cums[level * curr_batch_size] = start_idx < (int)d
+                            ? sqrt(suffix_sums[start_idx])
+                            : 0.0f;
+                }
+                point_cums[n_levels * curr_batch_size] = 0.0f;
+
+                init_exact_distances
+                        [init_exact_distances_offsets[list_no] + b_offset +
+                         point_idx] = init_exact_distance;
+            }
+        }
+    }
+}
+
+/// k-NN search: coarse-quantizes the queries and passes the probes to
+/// search_preassigned(). Unless parallel_mode requests otherwise, queries
+/// are split into one slice per OpenMP thread.
+void IndexIVFPQPanorama::search(
+        idx_t n,
+        const float* x,
+        idx_t k,
+        float* distances,
+        idx_t* labels,
+        const SearchParameters* params_in) const {
+    FAISS_THROW_IF_NOT(k > 0);
+    const IVFSearchParameters* params = nullptr;
+    if (params_in) {
+        params = dynamic_cast<const IVFSearchParameters*>(params_in);
+        FAISS_THROW_IF_NOT_MSG(params, "IndexIVF params have incorrect type");
+    }
+    const size_t nprobe =
+            std::min(nlist, params ? params->nprobe : this->nprobe);
+    FAISS_THROW_IF_NOT(nprobe > 0);
+
+    // Coarse-quantize one contiguous slice of queries, then scan its lists.
+    auto sub_search_func = [this, k, nprobe, params](
+                                   idx_t n,
+                                   const float* x,
+                                   float* distances,
+                                   idx_t* labels,
+                                   IndexIVFStats* ivf_stats) {
+        std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
+        std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
+
+        quantizer->search(
+                n,
+                x,
+                nprobe,
+                coarse_dis.get(),
+                idx.get(),
+                params ? params->quantizer_params : nullptr);
+
+        invlists->prefetch_lists(idx.get(), n * nprobe);
+
+        search_preassigned(
+                n, x, k, idx.get(), coarse_dis.get(), distances, labels,
+                /*store_pairs=*/false, params, ivf_stats);
+    };
+
+    if ((parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT) == 0) {
+        int nt = std::min(omp_get_max_threads(), int(n));
+        std::vector<IndexIVFStats> stats(nt);
+        std::mutex exception_mutex;
+        std::string exception_string;
+
+#pragma omp parallel for if (nt > 1)
+        for (idx_t slice = 0; slice < nt; slice++) {
+            idx_t i0 = n * slice / nt;
+            idx_t i1 = n * (slice + 1) / nt;
+            if (i1 > i0) {
+                try {
+                    sub_search_func(
+                            i1 - i0,
+                            x + i0 * d,
+                            distances + i0 * k,
+                            labels + i0 * k,
+                            &stats[slice]);
+                } catch (const std::exception& e) {
+                    // exceptions must not escape the OpenMP region
+                    std::lock_guard<std::mutex> lock(exception_mutex);
+                    exception_string = e.what();
+                }
+            }
+        }
+
+        if (!exception_string.empty()) {
+            FAISS_THROW_FMT("search error: %s", exception_string.c_str());
+        }
+
+        // fold the per-slice statistics into the global counters
+        for (const IndexIVFStats& slice_stats : stats) {
+            indexIVF_stats.add(slice_stats);
+        }
+    } else {
+        sub_search_func(n, x, distances, labels, &indexIVF_stats);
+    }
+}
+
+/// Scans the nprobe lists given per query in `keys`, one batch at a time via
+/// the scanner's process_batch(), filling `distances`/`labels` with k results
+/// per query. Only parallel_mode 0 is implemented (see the FAISS_ASSERT).
+/// NOTE(review): ivf_stats is never updated; max_codes never limits the scan.
+void IndexIVFPQPanorama::search_preassigned(
+        idx_t n,
+        const float* x,
+        idx_t k,
+        const idx_t* keys,
+        const float* coarse_dis,
+        float* distances,
+        idx_t* labels,
+        bool store_pairs,
+        const IVFSearchParameters* params,
+        IndexIVFStats* ivf_stats) const {
+    FAISS_THROW_IF_NOT(k > 0);
+
+    idx_t nprobe = params ? params->nprobe : this->nprobe;
+    nprobe = std::min((idx_t)nlist, nprobe);
+    FAISS_THROW_IF_NOT(nprobe > 0);
+
+    const idx_t unlimited_list_size = std::numeric_limits<idx_t>::max();
+    idx_t max_codes = params ? params->max_codes : this->max_codes;
+    IDSelector* sel = params ? params->sel : nullptr;
+    const IDSelectorRange* selr = dynamic_cast<const IDSelectorRange*>(sel);
+    if (selr) {
+        if (selr->assume_sorted) {
+            sel = nullptr; // NOTE(review): selr is not consulted afterwards
+        } else {
+            selr = nullptr;
+        }
+    }
+
+    FAISS_THROW_IF_NOT_MSG(
+            !(sel && store_pairs),
+            "selector and store_pairs cannot be combined");
+
+    FAISS_THROW_IF_NOT_MSG(
+            !invlists->use_iterator || (max_codes == 0 && store_pairs == false),
+            "iterable inverted lists don't support max_codes and store_pairs");
+
+    size_t nlistv = 0, ndis = 0, nheap = 0;
+
+    using HeapForIP = CMin<float, idx_t>;
+    using HeapForL2 = CMax<float, idx_t>;
+
+    bool interrupt = false;
+    std::string exception_string;
+
+    int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT;
+    bool do_heap_init = !(this->parallel_mode & PARALLEL_MODE_NO_HEAP_INIT);
+
+    FAISS_THROW_IF_NOT_MSG(
+            max_codes == 0 || pmode == 0 || pmode == 3,
+            "max_codes supported only for parallel_mode = 0 or 3");
+
+    if (max_codes == 0) {
+        max_codes = unlimited_list_size;
+    }
+
+    [[maybe_unused]] bool do_parallel = omp_get_max_threads() >= 2 &&
+            (pmode == 0           ? false
+                     : pmode == 3 ? n > 1
+                     : pmode == 1 ? nprobe > 1
+                                  : nprobe * n > 1);
+
+    // Scratch buffers are allocated once and shared by the parallel region
+    // below; this is only safe because pmode == 0 makes do_parallel false.
+    const size_t sim_table_size = pq.ksub * pq.M;
+    std::vector<float> sim_table_cache(nprobe * sim_table_size);
+    std::vector<float> dis0s_cache(nprobe);
+
+    std::vector<float> suffixSums(d + 1);
+    std::vector<float> query_cum_norms(n_levels + 1);
+    std::vector<float> exact_distances(batch_size);
+    std::vector<uint8_t> bitset(batch_size);
+    std::vector<uint32_t> active_indices(batch_size);
+    std::vector<uint8_t> compressed_codes(batch_size * chunk_size);
+
+#pragma omp parallel if (do_parallel) reduction(+ : nlistv, ndis, nheap)
+    {
+        std::unique_ptr<InvertedListScanner> scanner(
+                get_InvertedListScanner(store_pairs, sel, params));
+
+        auto init_result = [&](float* simi, idx_t* idxi) {
+            if (!do_heap_init)
+                return;
+            if (metric_type == METRIC_INNER_PRODUCT) {
+                heap_heapify<HeapForIP>(k, simi, idxi);
+            } else {
+                heap_heapify<HeapForL2>(k, simi, idxi);
+            }
+        };
+
+        auto reorder_result = [&](float* simi, idx_t* idxi) {
+            if (!do_heap_init)
+                return;
+            if (metric_type == METRIC_INNER_PRODUCT) {
+                heap_reorder<HeapForIP>(k, simi, idxi);
+            } else {
+                heap_reorder<HeapForL2>(k, simi, idxi);
+            }
+        };
+
+        FAISS_ASSERT(pmode == 0);
+        if (pmode == 0) {
+#pragma omp for
+            for (idx_t i = 0; i < n; i++) {
+                if (interrupt) {
+                    continue;
+                }
+
+                scanner->set_query(x + i * d);
+                suffixSums[d] = 0.0f;
+
+                const float* q = x + i * d;
+
+                for (int j = d - 1; j >= 0; --j) {
+                    float squaredVal = q[j] * q[j];
+                    suffixSums[j] = suffixSums[j + 1] + squaredVal;
+                }
+
+                for (int level_idx = 0; level_idx < n_levels; level_idx++) {
+                    int startIdx = level_idx * levels_size;
+                    if (startIdx < (int)d) {
+                        query_cum_norms[level_idx] = sqrt(suffixSums[startIdx]);
+                    } else {
+                        query_cum_norms[level_idx] = 0.0f;
+                    }
+                }
+                query_cum_norms[n_levels] = 0.0f;
+
+                float* simi = distances + i * k;
+                idx_t* idxi = labels + i * k;
+
+                init_result(simi, idxi);
+
+                for (size_t list_no = 0; list_no < (size_t)nprobe; list_no++) {
+                    idx_t cluster_id = keys[i * nprobe + list_no];
+                    if (cluster_id < 0) {
+                        continue; // no list assigned for this probe slot
+                    }
+                    size_t list_size = invlists->list_size(cluster_id);
+                    size_t n_batches =
+                            (list_size + batch_size - 1) / batch_size;
+
+                    // The ScopedIds must outlive every use of `ids`.
+                    InvertedLists::ScopedIds sids(invlists, cluster_id);
+                    const idx_t* ids = sids.get();
+
+                    for (size_t batch_no = 0; batch_no < n_batches;
+                         batch_no++) {
+                        size_t curr_batch_size = std::min(
+                                list_size - batch_no * batch_size, batch_size);
+                        size_t b_offset = batch_no * batch_size;
+
+                        std::iota(
+                                active_indices.begin(),
+                                active_indices.begin() + curr_batch_size,
+                                b_offset);
+                        std::fill(
+                                bitset.begin(),
+                                bitset.begin() + curr_batch_size,
+                                1);
+                        std::fill(
+                                bitset.begin() + curr_batch_size,
+                                bitset.end(),
+                                0);
+                        std::fill(
+                                compressed_codes.begin(),
+                                compressed_codes.end(),
+                                0);
+
+                        for (size_t idx = 0; idx < curr_batch_size; idx++) {
+                            exact_distances[idx] = init_exact_distances
+                                    [init_exact_distances_offsets[cluster_id] +
+                                     b_offset + idx];
+                        }
+
+                        const uint8_t* codes = column_storage +
+                                column_offsets[cluster_id] +
+                                b_offset * code_size;
+                        float* cums = cum_sums + cum_sum_offsets[cluster_id] +
+                                b_offset * (n_levels + 1);
+
+                        total_points += curr_batch_size * n_levels;
+
+                        total_active += scanner->process_batch(
+                                pq,
+                                compressed_codes.data(),
+                                cluster_id,
+                                batch_no,
+                                coarse_dis[i * nprobe + list_no],
+                                curr_batch_size,
+                                batch_size,
+                                chunk_size,
+                                epsilon,
+                                n_levels,
+                                codes,
+                                cums,
+                                query_cum_norms.data(),
+                                active_indices.data(),
+                                bitset.data(),
+                                exact_distances.data(),
+                                ids,
+                                simi,
+                                idxi,
+                                k,
+                                &dis0s_cache[list_no],
+                                sim_table_cache.data() +
+                                        list_no * sim_table_size);
+                    }
+                }
+
+                reorder_result(simi, idxi);
+
+                if (InterruptCallback::is_interrupted()) {
+                    interrupt = true;
+                }
+            }
+        }
+    }
+
+    if (interrupt) {
+        if (!exception_string.empty()) {
+            FAISS_THROW_FMT(
+                    "search interrupted with: %s", exception_string.c_str());
+        } else {
+            FAISS_THROW_MSG("computation interrupted");
+        }
+    }
+
+    printf("total_active: %f\n", (float)total_active / total_points);
+}
+
+} // namespace faiss
diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h
new file mode 100644
index 0000000000..46a19e6b09
--- /dev/null
+++ b/faiss/IndexIVFPQPanorama.h
@@ -0,0 +1,70 @@
+#ifndef FAISS_INDEX_IVFPQ_PANORAMA_H
+#define FAISS_INDEX_IVFPQ_PANORAMA_H
+
+#include <vector>
+
+#include <faiss/IndexIVFPQ.h>
+#include <faiss/impl/platform_macros.h>
+#include <faiss/utils/AlignedTable.h>
+
+namespace faiss {
+
+/// IndexIVFPQ variant that scans each list in batches, level by level.
+struct IndexIVFPQPanorama : public IndexIVFPQ {
+    const int n_levels; ///< number of levels scanned per batch
+    uint8_t* column_storage; ///< codes of all lists, read batch by batch
+    // Each *_offsets array gives, per list, where that list's data starts.
+    size_t* column_offsets;
+    float* cum_sums; ///< n_levels + 1 values per stored point
+    size_t* cum_sum_offsets;
+
+    float* init_exact_distances; ///< starting distance of each stored point
+    size_t* init_exact_distances_offsets;
+
+    const size_t chunk_size; ///< NOTE(review): presumably PQ codes per level
+    const size_t levels_size; ///< vector dimensions covered by one level
+    bool added;
+    size_t num_points;
+    size_t batch_size; ///< points per batch within an inverted list
+    size_t nbits_per_idx;
+    size_t m_level_width;
+    float epsilon; ///< forwarded to the scanner's process_batch()
+
+    IndexIVFPQPanorama(
+            Index* quantizer,
+            size_t d,
+            size_t nlist,
+            size_t M,
+            size_t nbits_per_idx,
+            int n_levels,
+            float epsilon,
+            size_t batch_size = 128,
+            MetricType metric = METRIC_L2,
+            bool own_invlists = true);
+
+    void add(idx_t n, const float* x) override;
+
+    void search(
+            idx_t n,
+            const float* x,
+            idx_t k,
+            float* distances,
+            idx_t* labels,
+            const SearchParameters* params_in = nullptr) const override;
+
+    void search_preassigned(
+            idx_t n,
+            const float* x,
+            idx_t k,
+            const idx_t* keys,
+            const float* coarse_dis,
+            float* distances,
+            idx_t* labels,
+            bool store_pairs,
+            const IVFSearchParameters* params,
+            IndexIVFStats* ivf_stats) const override;
+};
+
+} // namespace faiss
+
+#endif

From e37d2d428343e4d0e3b5b723adf4bc3863ffb29c Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Tue, 17 Mar 2026 04:15:37 +0000
Subject: [PATCH 03/41] Checkpoint with comments

---
 faiss/IndexIVFPQ.cpp | 94 +++++++++++++++++++++++++++++---------------
 1 file changed, 62 insertions(+), 32 deletions(-)

diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp
index 9e074d1c29..0db6c0470e 100644
--- a/faiss/IndexIVFPQ.cpp
+++ b/faiss/IndexIVFPQ.cpp
@@ -774,7 +774,6 @@ struct QueryTables {
             const float* b = sim_table_2;
             float* c = sim_table_ptr;
 
-#ifdef __AVX512F__
             const size_t n16 = n / 16;
             const size_t n_for_masking = n % 16;
 
@@ -793,11 +792,6 @@ struct QueryTables {
                 const __m512 abmul = _mm512_mul_ps(bfmm, bx);
                 _mm512_mask_storeu_ps(c + idx, mask, abmul);
             }
-#else
-            for (size_t idx = 0; idx < n; idx++) {
-                c[idx] = bf * b[idx];
-            }
-#endif
 
             sim_table = sim_table_ptr;
         } else {
@@ -1342,7 +1336,6 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
         this->dis0 = dis0;
     }
 
-#ifdef __AVX512F__
     inline void process_chunks(
             size_t chunk_size,
             size_t max_batch_size,
@@ -1366,29 +1359,33 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
             for (; batch_idx + 15 < num_active; batch_idx += 16) {
                 __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx);
 
-                __m128i comp0 = _mm_loadu_si128(
-                        (__m128i*)(compressed_codes + chunk_offset0 + batch_idx));
+                __m128i comp0 =
+                        _mm_loadu_si128((__m128i*)(compressed_codes +
+                                                   chunk_offset0 + batch_idx));
                 __m512i codes0 = _mm512_cvtepu8_epi32(comp0);
                 acc = _mm512_add_ps(
                         acc,
                         _mm512_i32gather_ps(codes0, sim_table0, sizeof(float)));
 
-                __m128i comp1 = _mm_loadu_si128(
-                        (__m128i*)(compressed_codes + chunk_offset1 + batch_idx));
+                __m128i comp1 =
+                        _mm_loadu_si128((__m128i*)(compressed_codes +
+                                                   chunk_offset1 + batch_idx));
                 __m512i codes1 = _mm512_cvtepu8_epi32(comp1);
                 acc = _mm512_add_ps(
                         acc,
                         _mm512_i32gather_ps(codes1, sim_table1, sizeof(float)));
 
-                __m128i comp2 = _mm_loadu_si128(
-                        (__m128i*)(compressed_codes + chunk_offset2 + batch_idx));
+                __m128i comp2 =
+                        _mm_loadu_si128((__m128i*)(compressed_codes +
+                                                   chunk_offset2 + batch_idx));
                 __m512i codes2 = _mm512_cvtepu8_epi32(comp2);
                 acc = _mm512_add_ps(
                         acc,
                         _mm512_i32gather_ps(codes2, sim_table2, sizeof(float)));
 
-                __m128i comp3 = _mm_loadu_si128(
-                        (__m128i*)(compressed_codes + chunk_offset3 + batch_idx));
+                __m128i comp3 =
+                        _mm_loadu_si128((__m128i*)(compressed_codes +
+                                                   chunk_offset3 + batch_idx));
                 __m512i codes3 = _mm512_cvtepu8_epi32(comp3);
                 acc = _mm512_add_ps(
                         acc,
@@ -1414,8 +1411,8 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
             size_t batch_idx = 0;
             for (; batch_idx + 15 < num_active; batch_idx += 16) {
                 __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx);
-                __m128i comp = _mm_loadu_si128(
-                        (__m128i*)(compressed_codes + chunk_offset + batch_idx));
+                __m128i comp = _mm_loadu_si128((
+                        __m128i*)(compressed_codes + chunk_offset + batch_idx));
                 __m512i codes = _mm512_cvtepu8_epi32(comp);
                 __m512 m_dist = _mm512_i32gather_ps(
                         codes, sim_table_ptr, sizeof(float));
@@ -1488,6 +1485,11 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
                     exact_distances + next_num_active,
                     compressed_exact_distances_vec);
 
+            // Update bitset for removed items.
+            // Unfortunately, this is not vectorized as AVX-512 does not
+            // support a way to scatter at a 1-byte granularity.
+            // However, we can use a mask to compress the indices and then
+            // sequentially set the bitset.
             alignas(64) uint32_t indices_to_remove[16];
             __mmask16 mask_should_remove = ~mask_should_keep;
             size_t num_to_remove = _mm_popcnt_u32(mask_should_remove);
@@ -1538,7 +1540,18 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
         uint8_t* compressed_codes = compressed_codes_begin;
         size_t num_active = 0;
 
+        // An important optimization is to skip the compression if all points
+        // are active, as we can just use the compressed_codes_begin
+        // pointer.
         if (next_num_active < max_batch_size) {
+            // Compress the codes: here we don't need to process remainders
+            // as long as `max_batch_size` is a multiple of 64 (which we
+            // assert in the constructor). Conveniently, compressed_codes is
+            // allocated to `max_batch_size` * `chunk_size` elements.
+            // `num_active` is guaranteed to always be less than or equal to
+            // `max_batch_size`. Only the last batch may be smaller than
+            // `max_batch_size`, the caller ensures that the batch and
+            // bitset are padded with zeros.
             compressed_codes = compressed_codes_begin;
             for (size_t point_idx = 0; point_idx < max_batch_size;
                  point_idx += 64) {
@@ -1546,17 +1559,16 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
                 __mmask64 mask = _mm512_cmpneq_epi8_mask(
                         active_byteset, _mm512_setzero_si512());
 
-                for (size_t ci = 0; ci < chunk_size; ci++) {
-                    size_t chunk_offset = ci * max_batch_size;
-                    size_t write_pos = 0;
-                    uint64_t m = (uint64_t)mask;
-                    while (m) {
-                        int bit = __builtin_ctzll(m);
-                        compressed_codes[chunk_offset + num_active + write_pos] =
-                                codes[chunk_offset + point_idx + bit];
-                        write_pos++;
-                        m &= m - 1;
-                    }
+                for (size_t chunk_idx = 0; chunk_idx < chunk_size;
+                     chunk_idx++) {
+                    size_t chunk_offset = chunk_idx * max_batch_size;
+                    __m512i codes_batch_vec = _mm512_loadu_si512(
+                            codes + chunk_offset + point_idx);
+                    __m512i compressed_batch =
+                            _mm512_maskz_compress_epi8(mask, codes_batch_vec);
+                    _mm512_storeu_si512(
+                            compressed_codes + chunk_offset + num_active,
+                            compressed_batch);
                 }
 
                 num_active += _mm_popcnt_u64(mask);
@@ -1568,7 +1580,6 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
 
         return std::make_pair(compressed_codes, num_active);
     }
-#endif // __AVX512F__
 
     inline void process_chunks_sparse(
             size_t chunk_size,
@@ -1592,7 +1603,6 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
         }
     }
 
-#ifdef __AVX512F__
     size_t process_batch(
             const ProductQuantizer& pq,
             uint8_t* compressed_codes,
@@ -1628,20 +1638,32 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
         size_t total_active = 0;
         __m512 epsilon_broadcast = _mm512_set1_ps(epsilon);
 
+        // The remaining active elements computed at the end of each level.
+        // We initialize to `curr_batch_size` for continuity.
         size_t next_num_active = curr_batch_size;
+        // For historical reasons, we initialize dis0 only at
+        // the beginning of the first level, but we need to access it after
+        // all levels have been processed, so we declare dis0 here.
         float dis0 = 0;
+        // Given that `active_indices` indexes the cluster directly, we need
+        // to offset it by the batch offset when updating the bitset and
+        // accessing the cum_sums. This way we avoid yet another layer of
+        // indirection.
         size_t batch_offset = batch_no * max_batch_size;
         __m512i batch_offset_broadcast = _mm512_set1_epi32(batch_offset);
         for (size_t level = 0; (level < n_levels) && (next_num_active > 0);
              level++) {
             total_active += next_num_active;
 
+            // This ensures the LUT is pointing to the right offset, and is
+            // properly initialized. We only compute dis0 distances once for
+            // each cluster, and cache the result.
             size_t level_offset_sim_table = level * pq.ksub * chunk_size;
             this->set_list_panorama(
                     cluster_id,
                     coarse_dis_i,
                     sim_table_cache + level_offset_sim_table,
-                    dis0_cache,
+                    dis0_cache, // Only init once for each cluster.
                     level == 0 && batch_no == 0);
             this->set_sim_table(
                     sim_table_cache + level_offset_sim_table, *dis0_cache);
@@ -1649,12 +1671,15 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
             dis0 = this->dis0;
             __m512 dis0_bcast = _mm512_set1_ps(dis0);
 
+            // We multiply by two here so we don't have to do it in the
+            // kernel.
             float query_cum_norm = 2 * query_cum_norms[level + 1];
             __m512 query_cum_norm_broadcast = _mm512_set1_ps(query_cum_norm);
 
             float heap_max = res.top();
             __m512 heap_max_broadcast = _mm512_set1_ps(heap_max);
 
+            // Codes has padding potentially, cumsum does not.
             float* cum_sums = cums + curr_batch_size * level;
             const uint8_t* codes =
                     codes_batch + max_batch_size * chunk_size * level;
@@ -1662,6 +1687,10 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
             bool is_sparse = next_num_active < max_batch_size / 16;
             float* sim_table = this->sim_table;
 
+            // Phase 1: Process all chunks and accumulate distances.
+            // We iterate over chunks first as this keeps the same LUT slice
+            // within the L1 cache. To avoid register thrashing, we unroll
+            // 4 chunks at a time.
             size_t num_active_for_filtering = 0;
             if (is_sparse) {
                 process_chunks_sparse(
@@ -1695,6 +1724,7 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
                 num_active_for_filtering = na;
             }
 
+            // Phase 2: Filtering logic using accumulated distances.
             next_num_active = process_filtering(
                     num_active_for_filtering,
                     exact_distances,
@@ -1713,6 +1743,7 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
                     heap_max);
         }
 
+        // Phase 3: Insert remaining candidates to heap.
         for (size_t batch_idx = 0; batch_idx < next_num_active; batch_idx++) {
             res.add(active_indices[batch_idx],
                     dis0 + exact_distances[batch_idx]);
@@ -1720,7 +1751,6 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
 
         return total_active;
     }
-#endif // __AVX512F__
 
     float distance_to_code(const uint8_t* code) const override {
         assert(precompute_mode == 2);

From 03e4520be1302e0c4a877d9038bb8f8a66653a88 Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Tue, 17 Mar 2026 04:33:12 +0000
Subject: [PATCH 04/41] fix compile issues

---
 faiss/IndexIVFPQPanorama.cpp | 2 +-
 faiss/python/swigfaiss.swig  | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp
index ba54da4cb4..84226c3e5f 100644
--- a/faiss/IndexIVFPQPanorama.cpp
+++ b/faiss/IndexIVFPQPanorama.cpp
@@ -503,7 +503,7 @@ void IndexIVFPQPanorama::search_preassigned(
         }
     }
 
-    printf("total_active: %f\n", (float)total_active / total_points);
+    printf("v0: total_active: %f\n", (float)total_active / total_points);
 }
 
 } // namespace faiss
diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig
index 033dc8d072..75292ecb7f 100644
--- a/faiss/python/swigfaiss.swig
+++ b/faiss/python/swigfaiss.swig
@@ -96,6 +96,7 @@ typedef uint64_t size_t;
 #include <faiss/IndexAdditiveQuantizer.h>
 #include <faiss/IndexIVF.h>
 #include <faiss/IndexIVFPQ.h>
+#include <faiss/IndexIVFPQPanorama.h>
 #include <faiss/Index2Layer.h>
 #include <faiss/IndexIVFPQR.h>
 #include <faiss/IndexIVFFlat.h>
@@ -594,6 +595,7 @@ void gpu_sync_all_devices()
 
 %ignore faiss::IndexIVFPQ::alloc_type;
 %include  <faiss/IndexIVFPQ.h>
+%include  <faiss/IndexIVFPQPanorama.h>
 %include  <faiss/IndexIVFPQR.h>
 %include  <faiss/Index2Layer.h>
 
@@ -779,6 +781,7 @@ void gpu_sync_all_devices()
     DOWNCAST ( IndexIVFRaBitQ )
     DOWNCAST ( IndexIVFRaBitQFastScan )
     DOWNCAST ( IndexIVFIndependentQuantizer)
+    DOWNCAST ( IndexIVFPQPanorama )
     DOWNCAST ( IndexIVFPQR )
     DOWNCAST ( IndexIVFPQ )
     DOWNCAST ( IndexIVFPQFastScan )

From e10bccc7f42a14b59399b7c16998235d4587570d Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Tue, 17 Mar 2026 06:13:51 +0000
Subject: [PATCH 05/41] Amazing

---
 benchs/bench_ivfpq_panorama.py                | 154 +++++++++
 faiss/CMakeLists.txt                          |  17 +-
 faiss/IndexIVFPQ.cpp                          | 303 ++----------------
 faiss/IndexIVFPQPanorama.cpp                  |   6 +-
 .../panorama_kernels-avx2.cpp                 | 239 ++++++++++++++
 .../panorama_kernels-avx512.cpp               | 238 ++++++++++++++
 .../panorama_kernels-generic.cpp              | 155 +++++++++
 .../impl/panorama_kernels/panorama_kernels.h  |  89 +++++
 8 files changed, 910 insertions(+), 291 deletions(-)
 create mode 100644 benchs/bench_ivfpq_panorama.py
 create mode 100644 faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
 create mode 100644 faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
 create mode 100644 faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
 create mode 100644 faiss/impl/panorama_kernels/panorama_kernels.h

diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py
new file mode 100644
index 0000000000..ebd1336092
--- /dev/null
+++ b/benchs/bench_ivfpq_panorama.py
@@ -0,0 +1,154 @@
+# Quick 10% verification of IVFPQPanorama (with index caching)
+
+import multiprocessing as mp
+import os
+import time
+
+import faiss
+import numpy as np
+
+print("Compile options:", faiss.get_compile_options(), flush=True)
+
+
+def fvecs_read(fname):  # load an .fvecs file as a 2-D float32 array
+    a = np.fromfile(fname, dtype="float32")
+    d = a[0].view("int32")  # first 4 bytes reinterpreted as int32, used as row width
+    return a.reshape(-1, d + 1)[:, 1:].copy()  # drop the leading column of each row
+
+
+GIST_DIR = "/home/lutex/PCA_init"
+CACHE_DIR = "/home/lutex/faiss-panorama/index_cache"
+os.makedirs(CACHE_DIR, exist_ok=True)
+
+IVFPQ_CACHE = os.path.join(CACHE_DIR, "ivfpq_10pct.index")
+IVFPQ_TRAINED_CACHE = os.path.join(CACHE_DIR, "ivfpq_trained_10pct.index")
+
+print("Loading GIST1M data (10% subset)...", flush=True)
+xb_full = fvecs_read(os.path.join(GIST_DIR, "gist1m_base.fvecs"))
+xq = fvecs_read(os.path.join(GIST_DIR, "gist1m_query.fvecs"))
+
+nb_full, d = xb_full.shape
+nb = nb_full // 10  # 10% = 100000
+xb = xb_full[:nb].copy()
+del xb_full
+
+nq = xq.shape[0]
+print(f"Database: {nb} x {d}, Queries: {nq} x {d}", flush=True)
+
+xt = xb[:50000].copy()
+
+k = 10
+M = 960
+nbits = 8
+nlist = 64
+n_levels = 8
+epsilon = 1.0
+batch_size = 128
+
+GT_PATH = os.path.join(CACHE_DIR, "gt_10pct.npy")
+if os.path.exists(GT_PATH):
+    gt_I = np.load(GT_PATH)
+    print(f"Loaded cached ground truth: {gt_I.shape}", flush=True)
+else:
+    print("Computing ground truth on 10% subset...", flush=True)
+    flat = faiss.IndexFlatL2(d)
+    flat.add(xb)
+    _, gt_I = flat.search(xq, k)
+    np.save(GT_PATH, gt_I)
+    print("Ground truth computed and cached.", flush=True)
+
+
+def eval_recall(index, nprobe_val):  # nprobe_val is only used in the printout
+    t0 = time.time()
+    _, I = index.search(xq, k=k)
+    t = time.time() - t0
+    speed = t * 1000 / nq  # milliseconds per query
+    qps = 1000 / speed
+    corrects = sum(len(set(gt_I[i]) & set(I[i])) for i in range(nq))
+    recall = corrects / (nq * k)  # fraction of ground-truth ids retrieved
+    print(
+        f"\tnprobe {nprobe_val:3d}, Recall@{k}: "
+        f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}",
+        flush=True,
+    )
+    return recall, qps
+
+
+# faiss.omp_set_num_threads(mp.cpu_count())
+
+# --- IVFPQ baseline: load the cached index, or train/add it and cache it ---
+if os.path.exists(IVFPQ_CACHE):
+    print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True)
+    t0 = time.time()
+    ivfpq = faiss.read_index(IVFPQ_CACHE)
+    print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
+else:
+    print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True)
+    quantizer = faiss.IndexFlatL2(d)
+    ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
+    t0 = time.time()
+    ivfpq.train(xt)
+    print(f"  Training took {time.time() - t0:.1f}s", flush=True)
+
+    print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
+    faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE)
+
+    t0 = time.time()
+    ivfpq.add(xb)
+    print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
+
+    print(f"  Saving full index to {IVFPQ_CACHE}...", flush=True)
+    faiss.write_index(ivfpq, IVFPQ_CACHE)
+
+faiss.omp_set_num_threads(1)  # queries are timed single-threaded
+print("\n====== IVFPQ baseline", flush=True)
+for nprobe in [1, 2, 4, 8, 16]:
+    ivfpq.nprobe = nprobe
+    eval_recall(ivfpq, nprobe)
+
+# --- IVFPQPanorama (reuse trained PQ from cache) ---
+faiss.omp_set_num_threads(mp.cpu_count())  # all cores while building/adding
+
+if os.path.exists(IVFPQ_TRAINED_CACHE):
+    print(f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...", flush=True)
+    trained = faiss.read_index(IVFPQ_TRAINED_CACHE)
+    quantizer2 = trained.quantizer
+    trained.own_fields = False  # presumably so `trained` does not free quantizer2
+
+    ivfpq_pano = faiss.IndexIVFPQPanorama(
+        quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size
+    )
+    centroids = faiss.vector_to_array(trained.pq.centroids)
+    faiss.copy_array_to_vector(centroids, ivfpq_pano.pq.centroids)
+    ivfpq_pano.is_trained = True  # PQ centroids were copied in; train() is not called
+    ivfpq_pano.use_precomputed_table = 1
+    ivfpq_pano.precompute_table()
+
+    print("  Reused trained PQ (skipped training).", flush=True)
+    t0 = time.time()
+    ivfpq_pano.add(xb)
+    print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
+else:
+    print(
+        f"\nBuilding IVFPQPanorama from scratch: nlist={nlist}, M={M}, nbits={nbits}, "
+        f"n_levels={n_levels}, epsilon={epsilon}, batch_size={batch_size}",
+        flush=True,
+    )
+    quantizer2 = faiss.IndexFlatL2(d)
+    ivfpq_pano = faiss.IndexIVFPQPanorama(
+        quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size
+    )
+    t0 = time.time()
+    ivfpq_pano.train(xt)
+    print(f"  Training took {time.time() - t0:.1f}s", flush=True)
+    t0 = time.time()
+    ivfpq_pano.add(xb)
+    print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
+
+faiss.omp_set_num_threads(1)  # queries are timed single-threaded
+print("\n====== IVFPQPanorama", flush=True)
+for nprobe in [1, 2, 4, 8, 16]:
+    ivfpq_pano.nprobe = nprobe
+    eval_recall(ivfpq_pano, nprobe)
+
+print("\nVerification complete!", flush=True)
diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
index 5a6c37ffbd..84a6eb1aac 100644
--- a/faiss/CMakeLists.txt
+++ b/faiss/CMakeLists.txt
@@ -10,6 +10,7 @@
 # =============================================================================
 set(FAISS_SIMD_AVX2_SRC
   impl/fast_scan/impl-avx2.cpp
+  impl/panorama_kernels/panorama_kernels-avx2.cpp
   impl/pq_code_distance/pq_code_distance-avx2.cpp
   impl/scalar_quantizer/sq-avx2.cpp
   impl/approx_topk/avx2.cpp
@@ -17,6 +18,7 @@ set(FAISS_SIMD_AVX2_SRC
 )
 set(FAISS_SIMD_AVX512_SRC
   impl/fast_scan/impl-avx512.cpp
+  impl/panorama_kernels/panorama_kernels-avx512.cpp
   impl/pq_code_distance/pq_code_distance-avx512.cpp
   impl/scalar_quantizer/sq-avx512.cpp
   utils/simd_impl/distances_avx512.cpp
@@ -106,6 +108,7 @@ set(FAISS_SRC
   impl/NSG.cpp
   impl/PolysemousTraining.cpp
   impl/ProductQuantizer.cpp
+  impl/panorama_kernels/panorama_kernels-generic.cpp
   impl/pq_code_distance/pq_code_distance-generic.cpp
   impl/AdditiveQuantizer.cpp
   impl/RaBitQuantizer.cpp
@@ -280,6 +283,7 @@ set(FAISS_HEADERS
   impl/fast_scan/simd_result_handlers.h
   impl/zerocopy_io.h
   utils/pq_code_distance.h
+  impl/panorama_kernels/panorama_kernels.h
   impl/pq_code_distance/pq_code_distance-inl.h
   invlists/BlockInvertedLists.h
   invlists/DirectMap.h
@@ -356,6 +360,15 @@ endif()
 # Export FAISS_HEADERS variable to parent scope.
 set(FAISS_HEADERS ${FAISS_HEADERS} PARENT_SCOPE)
 
+# Probe the compiler for -mbmi2 (PEXT/PDEP, used by the Panorama code
+# compression). FAISS_BMI2_FLAGS stays empty when the flag is rejected.
+include(CheckCXXCompilerFlag)
+set(FAISS_BMI2_FLAGS "")
+check_cxx_compiler_flag("-mbmi2" COMPILER_SUPPORTS_BMI2)
+if(COMPILER_SUPPORTS_BMI2)
+  set(FAISS_BMI2_FLAGS "-mbmi2")
+endif()
+
 add_library(faiss ${FAISS_SRC})
 
 add_library(faiss_avx2 ${FAISS_SRC})
@@ -363,7 +376,7 @@ if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512"
   set_target_properties(faiss_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE)
 endif()
 if(NOT WIN32)
-  target_compile_options(faiss_avx2 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mpopcnt>)
+  target_compile_options(faiss_avx2 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mpopcnt ${FAISS_BMI2_FLAGS}>)
 else()
   # MSVC enables FMA with /arch:AVX2; no separate flags for F16C, POPCNT
   # Ref. FMA (under /arch:AVX2): https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64
@@ -383,7 +396,7 @@ endif()
 if(NOT WIN32)
   # All modern CPUs support F, CD, VL, DQ, BW extensions.
   # Ref: https://en.wikipedia.org/wiki/AVX512
-  target_compile_options(faiss_avx512 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt>)
+  target_compile_options(faiss_avx512 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt ${FAISS_BMI2_FLAGS}>)
 else()
   target_compile_options(faiss_avx512 PRIVATE $<$<COMPILE_LANGUAGE:CXX>:/arch:AVX512>)
   # we need bigobj for the swig wrapper
diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp
index 0db6c0470e..aca27f903d 100644
--- a/faiss/IndexIVFPQ.cpp
+++ b/faiss/IndexIVFPQ.cpp
@@ -9,7 +9,6 @@
 
 #include <faiss/IndexIVFPQ.h>
 
-#include <immintrin.h>
 #include <algorithm>
 #include <cassert>
 #include <cinttypes>
@@ -33,6 +32,7 @@
 #include <faiss/impl/ProductQuantizer.h>
 #include <faiss/impl/ResultHandler.h>
 #include <faiss/impl/pq_code_distance/pq_code_distance-inl.h>
+#include <faiss/impl/panorama_kernels/panorama_kernels.h>
 #include <faiss/impl/simd_dispatch.h>
 
 namespace faiss {
@@ -774,23 +774,8 @@ struct QueryTables {
             const float* b = sim_table_2;
             float* c = sim_table_ptr;
 
-            const size_t n16 = n / 16;
-            const size_t n_for_masking = n % 16;
-
-            const __m512 bfmm = _mm512_set1_ps(bf);
-
-            size_t idx = 0;
-            for (idx = 0; idx < n16 * 16; idx += 16) {
-                const __m512 bx = _mm512_loadu_ps(b + idx);
-                const __m512 abmul = _mm512_mul_ps(bfmm, bx);
-                _mm512_storeu_ps(c + idx, abmul);
-            }
-
-            if (n_for_masking > 0) {
-                const __mmask16 mask = (1 << n_for_masking) - 1;
-                const __m512 bx = _mm512_maskz_loadu_ps(mask, b + idx);
-                const __m512 abmul = _mm512_mul_ps(bfmm, bx);
-                _mm512_mask_storeu_ps(c + idx, mask, abmul);
+            for (size_t idx = 0; idx < n; idx++) {
+                c[idx] = bf * b[idx];
             }
 
             sim_table = sim_table_ptr;
@@ -1336,250 +1321,10 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
         this->dis0 = dis0;
     }
 
-    inline void process_chunks(
-            size_t chunk_size,
-            size_t max_batch_size,
-            size_t num_active,
-            float* sim_table,
-            uint8_t* compressed_codes,
-            float* exact_distances) {
-        size_t chunk_idx = 0;
-        for (; chunk_idx + 3 < chunk_size; chunk_idx += 4) {
-            size_t chunk_offset0 = (chunk_idx + 0) * max_batch_size;
-            size_t chunk_offset1 = (chunk_idx + 1) * max_batch_size;
-            size_t chunk_offset2 = (chunk_idx + 2) * max_batch_size;
-            size_t chunk_offset3 = (chunk_idx + 3) * max_batch_size;
-
-            float* sim_table0 = sim_table + (chunk_idx + 0) * 256;
-            float* sim_table1 = sim_table + (chunk_idx + 1) * 256;
-            float* sim_table2 = sim_table + (chunk_idx + 2) * 256;
-            float* sim_table3 = sim_table + (chunk_idx + 3) * 256;
-
-            size_t batch_idx = 0;
-            for (; batch_idx + 15 < num_active; batch_idx += 16) {
-                __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx);
-
-                __m128i comp0 =
-                        _mm_loadu_si128((__m128i*)(compressed_codes +
-                                                   chunk_offset0 + batch_idx));
-                __m512i codes0 = _mm512_cvtepu8_epi32(comp0);
-                acc = _mm512_add_ps(
-                        acc,
-                        _mm512_i32gather_ps(codes0, sim_table0, sizeof(float)));
-
-                __m128i comp1 =
-                        _mm_loadu_si128((__m128i*)(compressed_codes +
-                                                   chunk_offset1 + batch_idx));
-                __m512i codes1 = _mm512_cvtepu8_epi32(comp1);
-                acc = _mm512_add_ps(
-                        acc,
-                        _mm512_i32gather_ps(codes1, sim_table1, sizeof(float)));
-
-                __m128i comp2 =
-                        _mm_loadu_si128((__m128i*)(compressed_codes +
-                                                   chunk_offset2 + batch_idx));
-                __m512i codes2 = _mm512_cvtepu8_epi32(comp2);
-                acc = _mm512_add_ps(
-                        acc,
-                        _mm512_i32gather_ps(codes2, sim_table2, sizeof(float)));
-
-                __m128i comp3 =
-                        _mm_loadu_si128((__m128i*)(compressed_codes +
-                                                   chunk_offset3 + batch_idx));
-                __m512i codes3 = _mm512_cvtepu8_epi32(comp3);
-                acc = _mm512_add_ps(
-                        acc,
-                        _mm512_i32gather_ps(codes3, sim_table3, sizeof(float)));
-
-                _mm512_storeu_ps(exact_distances + batch_idx, acc);
-            }
-
-            for (; batch_idx < num_active; batch_idx += 1) {
-                float acc = exact_distances[batch_idx];
-                acc += sim_table0[compressed_codes[chunk_offset0 + batch_idx]];
-                acc += sim_table1[compressed_codes[chunk_offset1 + batch_idx]];
-                acc += sim_table2[compressed_codes[chunk_offset2 + batch_idx]];
-                acc += sim_table3[compressed_codes[chunk_offset3 + batch_idx]];
-                exact_distances[batch_idx] = acc;
-            }
-        }
-
-        for (; chunk_idx < chunk_size; chunk_idx++) {
-            size_t chunk_offset = chunk_idx * max_batch_size;
-            float* sim_table_ptr = sim_table + chunk_idx * 256;
-
-            size_t batch_idx = 0;
-            for (; batch_idx + 15 < num_active; batch_idx += 16) {
-                __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx);
-                __m128i comp = _mm_loadu_si128((
-                        __m128i*)(compressed_codes + chunk_offset + batch_idx));
-                __m512i codes = _mm512_cvtepu8_epi32(comp);
-                __m512 m_dist = _mm512_i32gather_ps(
-                        codes, sim_table_ptr, sizeof(float));
-                acc = _mm512_add_ps(acc, m_dist);
-                _mm512_storeu_ps(exact_distances + batch_idx, acc);
-            }
-
-            for (; batch_idx < num_active; batch_idx += 1) {
-                exact_distances[batch_idx] += sim_table_ptr
-                        [compressed_codes[chunk_offset + batch_idx]];
-            }
-        }
-    }
-
-    inline size_t process_filtering(
-            size_t num_active,
-            float* exact_distances,
-            uint32_t* active_indices,
-            __m512i batch_offset_broadcast,
-            float* cum_sums,
-            __m512 dis0_broadcast,
-            __m512 query_cum_norm_broadcast,
-            __m512 epsilon_broadcast,
-            __m512 heap_max_broadcast,
-            uint8_t* bitset,
-            size_t batch_offset,
-            float dis0,
-            float query_cum_norm,
-            float epsilon,
-            float heap_max) {
-        size_t next_num_active = 0;
-        size_t batch_idx = 0;
-
-        for (; batch_idx + 15 < num_active; batch_idx += 16) {
-            __m512 exact_distances_batch =
-                    _mm512_loadu_ps(exact_distances + batch_idx);
-
-            __m512i active_indices_batch =
-                    _mm512_loadu_si512(active_indices + batch_idx);
-            __m512i offsetted_active_indices_batch = _mm512_sub_epi32(
-                    active_indices_batch, batch_offset_broadcast);
-            __m512 cum_sums_batch = _mm512_i32gather_ps(
-                    offsetted_active_indices_batch, cum_sums, sizeof(float));
-
-            __m512 exact_distances_batch_dis0 =
-                    _mm512_add_ps(exact_distances_batch, dis0_broadcast);
-            __m512 cauchy_schwarz_bound =
-                    _mm512_mul_ps(query_cum_norm_broadcast, cum_sums_batch);
-            cauchy_schwarz_bound =
-                    _mm512_mul_ps(cauchy_schwarz_bound, epsilon_broadcast);
-
-            __m512 lower_bound = _mm512_sub_ps(
-                    exact_distances_batch_dis0, cauchy_schwarz_bound);
-            __mmask16 mask_should_keep = _mm512_cmp_ps_mask(
-                    lower_bound, heap_max_broadcast, _CMP_LT_OQ);
-
-            __m512i compressed_active_indices_vec = _mm512_mask_compress_epi32(
-                    _mm512_setzero_si512(),
-                    mask_should_keep,
-                    active_indices_batch);
-            _mm512_storeu_si512(
-                    active_indices + next_num_active,
-                    compressed_active_indices_vec);
-
-            __m512 compressed_exact_distances_vec = _mm512_mask_compress_ps(
-                    _mm512_setzero_ps(),
-                    mask_should_keep,
-                    exact_distances_batch);
-            _mm512_storeu_ps(
-                    exact_distances + next_num_active,
-                    compressed_exact_distances_vec);
-
-            // Update bitset for removed items.
-            // Unfortunatelly, this is not vectorized as AVX-512 does not
-            // support a way to scatter at a 1-byte granularity.
-            // However, we can use a mask to compress the indices and then
-            // sequentially set the bitset.
-            alignas(64) uint32_t indices_to_remove[16];
-            __mmask16 mask_should_remove = ~mask_should_keep;
-            size_t num_to_remove = _mm_popcnt_u32(mask_should_remove);
-
-            __m512i compressed_indices_to_remove_vec =
-                    _mm512_mask_compress_epi32(
-                            _mm512_setzero_si512(),
-                            mask_should_remove,
-                            active_indices_batch);
-            _mm512_storeu_si512(
-                    indices_to_remove, compressed_indices_to_remove_vec);
-
-            for (size_t idx = 0; idx < num_to_remove; idx++) {
-                bitset[indices_to_remove[idx] - batch_offset] = 0;
-            }
-
-            next_num_active += _mm_popcnt_u32(mask_should_keep);
-        }
-
-        for (; batch_idx < num_active; batch_idx++) {
-            float exact_distance = exact_distances[batch_idx];
-
-            float cum_sum = cum_sums[active_indices[batch_idx] - batch_offset];
-            float cauchy_schwarz_bound = cum_sum * query_cum_norm;
-            float lower_bound =
-                    exact_distance - cauchy_schwarz_bound * epsilon + dis0;
-
-            uint32_t should_keep = heap_max > lower_bound;
-            active_indices[next_num_active] = active_indices[batch_idx];
-            exact_distances[next_num_active] = exact_distance;
-
-            bitset[active_indices[batch_idx] - batch_offset] = should_keep;
-
-            next_num_active += should_keep;
-        }
-
-        return next_num_active;
-    }
-
-    inline std::pair<uint8_t*, size_t> process_code_compression(
-            size_t level,
-            size_t next_num_active,
-            size_t max_batch_size,
-            size_t chunk_size,
-            uint8_t* compressed_codes_begin,
-            uint8_t* bitset,
-            const uint8_t* codes) {
-        uint8_t* compressed_codes = compressed_codes_begin;
-        size_t num_active = 0;
-
-        // An important optimization is to skip the compression if we all points
-        // are active, as we can just use the compressed_codes_begin
-        // pointer.
-        if (next_num_active < max_batch_size) {
-            // Compress the codes: here we don't need to process remainders
-            // as long as `max_batch_size` is a multiple of 64 (which we
-            // assert in the constructor). Conveniently, compressed_codes is
-            // allocated to `max_batch_size` * `chunk_size` elements.
-            // `num_active` is guaranteed to always be less than or equal to
-            // `max_batch_size`. Only the last batch may be smaller than
-            // `max_batch_size`, the caller ensures that the batch and
-            // bitset are padded with zeros.
-            compressed_codes = compressed_codes_begin;
-            for (size_t point_idx = 0; point_idx < max_batch_size;
-                 point_idx += 64) {
-                __m512i active_byteset = _mm512_loadu_si512(bitset + point_idx);
-                __mmask64 mask = _mm512_cmpneq_epi8_mask(
-                        active_byteset, _mm512_setzero_si512());
-
-                for (size_t chunk_idx = 0; chunk_idx < chunk_size;
-                     chunk_idx++) {
-                    size_t chunk_offset = chunk_idx * max_batch_size;
-                    __m512i codes_batch_vec = _mm512_loadu_si512(
-                            codes + chunk_offset + point_idx);
-                    __m512i compressed_batch =
-                            _mm512_maskz_compress_epi8(mask, codes_batch_vec);
-                    _mm512_storeu_si512(
-                            compressed_codes + chunk_offset + num_active,
-                            compressed_batch);
-                }
-
-                num_active += _mm_popcnt_u64(mask);
-            }
-        } else {
-            num_active = next_num_active;
-            compressed_codes = const_cast<uint8_t*>(codes);
-        }
-
-        return std::make_pair(compressed_codes, num_active);
-    }
+    // Panorama kernels (process_chunks, process_filtering,
+    // process_code_compression) are implemented in
+    // faiss/impl/panorama_kernels/ with generic, AVX2 and AVX-512 variants.
+    // COMPILE_SIMD_* guards pick the variant built into each SIMD target.
 
     inline void process_chunks_sparse(
             size_t chunk_size,
@@ -1636,7 +1381,6 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
                 0};
         uint8_t* compressed_codes_begin = compressed_codes;
         size_t total_active = 0;
-        __m512 epsilon_broadcast = _mm512_set1_ps(epsilon);
 
         // The remaining active elements computed at the end of each level.
         // We initialize to `curr_batch_size` for continuity.
@@ -1650,12 +1394,11 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
         // accessing the cum_sums. This way we avoid yet another layer of
         // indirection.
         size_t batch_offset = batch_no * max_batch_size;
-        __m512i batch_offset_broadcast = _mm512_set1_epi32(batch_offset);
         for (size_t level = 0; (level < n_levels) && (next_num_active > 0);
              level++) {
             total_active += next_num_active;
 
-            // This ensures the LUT is poitning to the right offset, and is
+            // This ensures the LUT is pointing to the right offset, and is
             // properly initialized. We only compute dis0 distances once for
             // each cluster, and cache the result.
             size_t level_offset_sim_table = level * pq.ksub * chunk_size;
@@ -1669,15 +1412,12 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
                     sim_table_cache + level_offset_sim_table, *dis0_cache);
 
             dis0 = this->dis0;
-            __m512 dis0_bcast = _mm512_set1_ps(dis0);
 
             // We multiply by two here so we don't have to do it in the
             // kernel.
             float query_cum_norm = 2 * query_cum_norms[level + 1];
-            __m512 query_cum_norm_broadcast = _mm512_set1_ps(query_cum_norm);
 
             float heap_max = res.top();
-            __m512 heap_max_broadcast = _mm512_set1_ps(heap_max);
 
             // Codes has padding potentially, cumsum does not.
             float* cum_sums = cums + curr_batch_size * level;
@@ -1705,16 +1445,16 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
                         pq.ksub);
                 num_active_for_filtering = next_num_active;
             } else {
-                auto [cc, na] = process_code_compression(
-                        level,
-                        next_num_active,
-                        max_batch_size,
-                        chunk_size,
-                        compressed_codes_begin,
-                        bitset,
-                        codes);
-
-                process_chunks(
+                auto [cc, na] =
+                        panorama_kernels::process_code_compression(
+                                next_num_active,
+                                max_batch_size,
+                                chunk_size,
+                                compressed_codes_begin,
+                                bitset,
+                                codes);
+
+                panorama_kernels::process_chunks(
                         chunk_size,
                         max_batch_size,
                         na,
@@ -1725,16 +1465,11 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
             }
 
             // Phase 2: Filtering logic using accumulated distances.
-            next_num_active = process_filtering(
+            next_num_active = panorama_kernels::process_filtering(
                     num_active_for_filtering,
                     exact_distances,
                     active_indices,
-                    batch_offset_broadcast,
                     cum_sums,
-                    dis0_bcast,
-                    query_cum_norm_broadcast,
-                    epsilon_broadcast,
-                    heap_max_broadcast,
                     bitset,
                     batch_offset,
                     dis0,
diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp
index 84226c3e5f..aae0811176 100644
--- a/faiss/IndexIVFPQPanorama.cpp
+++ b/faiss/IndexIVFPQPanorama.cpp
@@ -439,10 +439,6 @@ void IndexIVFPQPanorama::search_preassigned(
                                 bitset.begin() + curr_batch_size,
                                 bitset.end(),
                                 0);
-                        std::fill(
-                                compressed_codes.begin(),
-                                compressed_codes.end(),
-                                0);
 
                         for (size_t idx = 0; idx < curr_batch_size; idx++) {
                             exact_distances[idx] = init_exact_distances
@@ -503,7 +499,7 @@ void IndexIVFPQPanorama::search_preassigned(
         }
     }
 
-    printf("v0: total_active: %f\n", (float)total_active / total_points);
+    printf("vv: total_active: %f\n", (float)total_active / total_points);
 }
 
 } // namespace faiss
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
new file mode 100644
index 0000000000..235c5d4d78
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// AVX2 implementations of Panorama kernels.
+// Uses 256-bit gather for process_chunks, scalar filtering (no
+// compress instruction in AVX2), and BMI2 PEXT/PDEP for code
+// compression where available.
+
+#ifdef COMPILE_SIMD_AVX2
+#ifndef COMPILE_SIMD_AVX512
+
+#include <immintrin.h>
+
+#include <faiss/impl/panorama_kernels/panorama_kernels.h>
+
+#include <cstring>
+
+namespace faiss {
+namespace panorama_kernels {
+
+// Accumulates the per-chunk LUT contributions of every active point.
+//
+// For each chunk c in [0, chunk_size) and each i in [0, num_active):
+//   exact_distances[i] += lut_c[codes_c[i]]
+// where lut_c = sim_table + c * 256 (one float per byte code) and
+// codes_c = compressed_codes + c * max_batch_size (chunk-major rows).
+//
+// Both the vector body and the scalar tails add the chunks of a point
+// in increasing chunk order.
+void process_chunks(
+        size_t chunk_size,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances) {
+    // Each chunk owns 256 consecutive LUT floats, i.e. one per possible
+    // value of a byte code.
+    constexpr size_t kLutStride = 256;
+
+    // Looks up 8 consecutive points of one chunk row: load 8 byte codes,
+    // zero-extend them to 32-bit indices and gather the LUT entries.
+    const auto gather8 = [](const float* lut, const uint8_t* row) {
+        const __m128i raw =
+                _mm_loadl_epi64(reinterpret_cast<const __m128i*>(row));
+        return _mm256_i32gather_ps(
+                lut, _mm256_cvtepu8_epi32(raw), sizeof(float));
+    };
+
+    size_t chunk_idx = 0;
+
+    // Main loop, 4 chunks per pass: each distance is loaded and stored
+    // once for 4 LUT additions instead of once per chunk.
+    for (; chunk_idx + 3 < chunk_size; chunk_idx += 4) {
+        const uint8_t* row0 =
+                compressed_codes + (chunk_idx + 0) * max_batch_size;
+        const uint8_t* row1 =
+                compressed_codes + (chunk_idx + 1) * max_batch_size;
+        const uint8_t* row2 =
+                compressed_codes + (chunk_idx + 2) * max_batch_size;
+        const uint8_t* row3 =
+                compressed_codes + (chunk_idx + 3) * max_batch_size;
+
+        const float* lut0 = sim_table + (chunk_idx + 0) * kLutStride;
+        const float* lut1 = sim_table + (chunk_idx + 1) * kLutStride;
+        const float* lut2 = sim_table + (chunk_idx + 2) * kLutStride;
+        const float* lut3 = sim_table + (chunk_idx + 3) * kLutStride;
+
+        // Vector body: 8 points per iteration.
+        size_t i = 0;
+        for (; i + 7 < num_active; i += 8) {
+            // One load, four gather-adds, one store.
+            __m256 acc = _mm256_loadu_ps(exact_distances + i);
+            acc = _mm256_add_ps(acc, gather8(lut0, row0 + i));
+            acc = _mm256_add_ps(acc, gather8(lut1, row1 + i));
+            acc = _mm256_add_ps(acc, gather8(lut2, row2 + i));
+            acc = _mm256_add_ps(acc, gather8(lut3, row3 + i));
+            _mm256_storeu_ps(exact_distances + i, acc);
+        }
+
+        // Scalar tail for the last (num_active % 8) points.
+        for (; i < num_active; i++) {
+            float acc = exact_distances[i];
+            acc += lut0[row0[i]];
+            acc += lut1[row1[i]];
+            acc += lut2[row2[i]];
+            acc += lut3[row3[i]];
+            exact_distances[i] = acc;
+        }
+    }
+
+    // Remaining (chunk_size % 4) chunks, one at a time.
+    for (; chunk_idx < chunk_size; chunk_idx++) {
+        const uint8_t* row = compressed_codes + chunk_idx * max_batch_size;
+        const float* lut = sim_table + chunk_idx * kLutStride;
+
+        // Vector body: 8 points per iteration.
+        size_t i = 0;
+        for (; i + 7 < num_active; i += 8) {
+            __m256 acc = _mm256_loadu_ps(exact_distances + i);
+            acc = _mm256_add_ps(acc, gather8(lut, row + i));
+            _mm256_storeu_ps(exact_distances + i, acc);
+        }
+
+        // Scalar tail.
+        for (; i < num_active; i++) {
+            exact_distances[i] += lut[row[i]];
+        }
+    }
+}
+
+// Scalar pruning: a point survives iff its lower bound is below `heap_max`
+// (verdict stored in `bitset`); survivors are compacted, count returned.
+size_t process_filtering(
+        size_t num_active,
+        float* exact_distances,
+        uint32_t* active_indices,
+        float* cum_sums,
+        uint8_t* bitset,
+        size_t batch_offset,
+        float dis0,
+        float query_cum_norm,
+        float epsilon,
+        float heap_max) {
+    size_t kept = 0;
+    for (size_t pos = 0; pos < num_active; ++pos) {
+        const uint32_t point = active_indices[pos];
+        const float lower_bound = exact_distances[pos] + dis0 -
+                cum_sums[point - batch_offset] * query_cum_norm * epsilon;
+        const bool survives = heap_max > lower_bound;
+        active_indices[kept] = point;
+        exact_distances[kept] = exact_distances[pos];
+        bitset[point - batch_offset] = survives;
+        kept += survives;
+    }
+    return kept;
+}
+
+// Packs the codes of the still-active points of one batch to the front
+// of every chunk row.
+//
+// `bitset` holds one activity byte per point; `codes` and the scratch
+// buffer `compressed_codes_begin` are chunk-major with a row stride of
+// `max_batch_size`. Returns the buffer to scan and the number of active
+// points. When `next_num_active` already equals `max_batch_size`
+// nothing was pruned: `codes` itself is returned and no copy is made.
+//
+// There is a BMI2 (PEXT/PDEP) path and a portable path. The portable
+// path uses no compiler-specific builtins, so the file also builds with
+// compilers that do not define __BMI2__ (e.g. MSVC).
+std::pair<uint8_t*, size_t> process_code_compression(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t chunk_size,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    // Fast path: all points are active, scan the input in place.
+    if (next_num_active >= max_batch_size) {
+        return std::make_pair(const_cast<uint8_t*>(codes), next_num_active);
+    }
+
+    // Points are processed in blocks of 64 with no remainder handling.
+    // This relies on `max_batch_size` being a multiple of 64 (asserted in
+    // the constructor) and on the caller zero-padding the batch and
+    // `bitset` when the last batch is short. `num_active <= point_idx`
+    // holds at the start of every block.
+    size_t num_active = 0;
+    for (size_t point_idx = 0; point_idx < max_batch_size; point_idx += 64) {
+        // Number of active points found in this 64-point block.
+        size_t block_active = 0;
+#ifdef __BMI2__
+        // The GCC/Clang builtins below are only compiled when __BMI2__
+        // is defined, which MSVC does not do.
+        // Collect bit 0 of the 64 activity bytes (each byte is 0 or 1)
+        // into one mask, 8 bytes per PEXT.
+        uint64_t mask = 0;
+        for (int g = 0; g < 8; g++) {
+            uint64_t bytes;
+            memcpy(&bytes, bitset + point_idx + g * 8, 8);
+            mask |= _pext_u64(bytes, 0x0101010101010101ULL) << (g * 8);
+        }
+        block_active = __builtin_popcountll(mask);
+
+        // PDEP widens each mask bit to a full 0xFF byte lane, PEXT then
+        // squeezes the selected code bytes together, 8 codes at a time.
+        for (size_t ci = 0; ci < chunk_size; ci++) {
+            const size_t chunk_offset = ci * max_batch_size;
+            const uint8_t* src = codes + chunk_offset + point_idx;
+            uint8_t* dst = compressed_codes_begin + chunk_offset + num_active;
+            for (int g = 0; g < 8; g++) {
+                uint64_t src_val;
+                memcpy(&src_val, src + g * 8, 8);
+                const uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF);
+                const uint64_t byte_mask =
+                        _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF;
+                const uint64_t packed = _pext_u64(src_val, byte_mask);
+                // Always stores 8 bytes. The store ends at most at row
+                // offset point_idx + 8 * (g + 1), so it stays in the row;
+                // surplus bytes are overwritten later or lie past the count.
+                memcpy(dst, &packed, 8);
+                dst += __builtin_popcount(submask);
+            }
+        }
+#else
+        // List the active lanes of the block once, then copy those lanes
+        // for every chunk row.
+        uint8_t active_lanes[64];
+        for (int b = 0; b < 64; b++) {
+            if (bitset[point_idx + b]) {
+                active_lanes[block_active++] = (uint8_t)b;
+            }
+        }
+
+        for (size_t ci = 0; ci < chunk_size; ci++) {
+            const size_t chunk_offset = ci * max_batch_size;
+            const uint8_t* src = codes + chunk_offset + point_idx;
+            uint8_t* dst = compressed_codes_begin + chunk_offset + num_active;
+            for (size_t k = 0; k < block_active; k++) {
+                dst[k] = src[active_lanes[k]];
+            }
+        }
+#endif
+        // Advance the write cursor shared by all chunk rows.
+        num_active += block_active;
+    }
+
+    return std::make_pair(compressed_codes_begin, num_active);
+}
+
+} // namespace panorama_kernels
+} // namespace faiss
+
+#endif // COMPILE_SIMD_AVX512
+#endif // COMPILE_SIMD_AVX2
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
new file mode 100644
index 0000000000..6c6f0f24db
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifdef COMPILE_SIMD_AVX512
+
+#include <immintrin.h>
+
+#include <faiss/impl/panorama_kernels/panorama_kernels.h>
+
+#include <cstring>
+
+namespace faiss {
+namespace panorama_kernels {
+
+void process_chunks(
+        size_t chunk_size,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances) {
+    size_t chunk_idx = 0;
+    // For each chunk c and slot i < num_active, exact_distances[i] gains
+    // sim_table[c * 256 + compressed_codes[c * max_batch_size + i]].
+    // Main loop: 4 chunks per pass, so acc is loaded/stored once per 4 gathers.
+    for (; chunk_idx + 3 < chunk_size; chunk_idx += 4) {
+        size_t chunk_offset0 = (chunk_idx + 0) * max_batch_size;
+        size_t chunk_offset1 = (chunk_idx + 1) * max_batch_size;
+        size_t chunk_offset2 = (chunk_idx + 2) * max_batch_size;
+        size_t chunk_offset3 = (chunk_idx + 3) * max_batch_size;
+
+        float* sim_table0 = sim_table + (chunk_idx + 0) * 256;
+        float* sim_table1 = sim_table + (chunk_idx + 1) * 256;
+        float* sim_table2 = sim_table + (chunk_idx + 2) * 256;
+        float* sim_table3 = sim_table + (chunk_idx + 3) * 256;
+
+        size_t batch_idx = 0;
+        for (; batch_idx + 15 < num_active; batch_idx += 16) {
+            __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx);
+            // Load 16 byte codes, zero-extend to 32 bits, gather LUT entries.
+            __m128i comp0 =
+                    _mm_loadu_si128((__m128i*)(compressed_codes +
+                                               chunk_offset0 + batch_idx));
+            __m512i codes0 = _mm512_cvtepu8_epi32(comp0);
+            acc = _mm512_add_ps(
+                    acc,
+                    _mm512_i32gather_ps(codes0, sim_table0, sizeof(float)));
+
+            __m128i comp1 =
+                    _mm_loadu_si128((__m128i*)(compressed_codes +
+                                               chunk_offset1 + batch_idx));
+            __m512i codes1 = _mm512_cvtepu8_epi32(comp1);
+            acc = _mm512_add_ps(
+                    acc,
+                    _mm512_i32gather_ps(codes1, sim_table1, sizeof(float)));
+
+            __m128i comp2 =
+                    _mm_loadu_si128((__m128i*)(compressed_codes +
+                                               chunk_offset2 + batch_idx));
+            __m512i codes2 = _mm512_cvtepu8_epi32(comp2);
+            acc = _mm512_add_ps(
+                    acc,
+                    _mm512_i32gather_ps(codes2, sim_table2, sizeof(float)));
+
+            __m128i comp3 =
+                    _mm_loadu_si128((__m128i*)(compressed_codes +
+                                               chunk_offset3 + batch_idx));
+            __m512i codes3 = _mm512_cvtepu8_epi32(comp3);
+            acc = _mm512_add_ps(
+                    acc,
+                    _mm512_i32gather_ps(codes3, sim_table3, sizeof(float)));
+
+            _mm512_storeu_ps(exact_distances + batch_idx, acc);
+        }
+        // Scalar tail for the final num_active % 16 slots.
+        for (; batch_idx < num_active; batch_idx += 1) {
+            float acc = exact_distances[batch_idx];
+            acc += sim_table0[compressed_codes[chunk_offset0 + batch_idx]];
+            acc += sim_table1[compressed_codes[chunk_offset1 + batch_idx]];
+            acc += sim_table2[compressed_codes[chunk_offset2 + batch_idx]];
+            acc += sim_table3[compressed_codes[chunk_offset3 + batch_idx]];
+            exact_distances[batch_idx] = acc;
+        }
+    }
+    // Remaining chunk_size % 4 chunks, one chunk per pass.
+    for (; chunk_idx < chunk_size; chunk_idx++) {
+        size_t chunk_offset = chunk_idx * max_batch_size;
+        float* sim_table_ptr = sim_table + chunk_idx * 256;
+
+        size_t batch_idx = 0;
+        for (; batch_idx + 15 < num_active; batch_idx += 16) {
+            __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx);
+            __m128i comp = _mm_loadu_si128((
+                    __m128i*)(compressed_codes + chunk_offset + batch_idx));
+            __m512i codes = _mm512_cvtepu8_epi32(comp);
+            __m512 m_dist = _mm512_i32gather_ps(
+                    codes, sim_table_ptr, sizeof(float));
+            acc = _mm512_add_ps(acc, m_dist);
+            _mm512_storeu_ps(exact_distances + batch_idx, acc);
+        }
+
+        for (; batch_idx < num_active; batch_idx += 1) {
+            exact_distances[batch_idx] += sim_table_ptr
+                    [compressed_codes[chunk_offset + batch_idx]];
+        }
+    }
+}
+
+size_t process_filtering(
+        size_t num_active, // entries to test
+        float* exact_distances, // compacted in place; survivors first
+        uint32_t* active_indices, // compacted in place; survivors first
+        float* cum_sums, // read at [active index - batch_offset]
+        uint8_t* bitset, // written at [active index - batch_offset]
+        size_t batch_offset,
+        float dis0,
+        float query_cum_norm,
+        float epsilon,
+        float heap_max) { // returns the number of survivors
+    size_t next_num_active = 0;
+    for (size_t i = 0; i < num_active; i++) {
+        float exact_distance = exact_distances[i];
+        float cum_sum = cum_sums[active_indices[i] - batch_offset];
+        float lower_bound =
+                exact_distance + dis0 - cum_sum * query_cum_norm * epsilon;
+        // Unconditional writes; the cursor only advances for kept entries.
+        bool keep = heap_max > lower_bound;
+        active_indices[next_num_active] = active_indices[i];
+        exact_distances[next_num_active] = exact_distance;
+        bitset[active_indices[i] - batch_offset] = keep;
+        next_num_active += keep;
+    }
+    return next_num_active;
+}
+
+std::pair<uint8_t*, size_t> process_code_compression(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t chunk_size,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    uint8_t* compressed_codes = compressed_codes_begin;
+    size_t num_active = 0;
+    // Returns {pointer to the code buffer, number of active points in it}.
+    // An important optimization is to skip the compression if all points
+    // are active: the original `codes` pointer is then returned as-is.
+    if (next_num_active < max_batch_size) {
+        // Compress the codes: here we don't need to process remainders
+        // as long as `max_batch_size` is a multiple of 64 (which we
+        // assert in the constructor). Conveniently, compressed_codes is
+        // allocated to `max_batch_size` * `chunk_size` elements.
+        // `num_active` is guaranteed to always be less than or equal to
+        // `max_batch_size`. Only the last batch may be smaller than
+        // `max_batch_size`, the caller ensures that the batch and
+        // bitset are padded with zeros.
+        compressed_codes = compressed_codes_begin; // same as initial value
+        for (size_t point_idx = 0; point_idx < max_batch_size;
+             point_idx += 64) {
+            // Build a 64-bit mask from the byteset: each byte is
+            // 0 or 1, collect into a single bitmask.
+            uint64_t mask = 0;
+#ifdef __BMI2__
+            // PEXT path: extract the LSB of each byte into a
+            // single bit, producing a 64-bit bitmask.
+            for (int g = 0; g < 8; g++) {
+                uint64_t bytes;
+                memcpy(&bytes, bitset + point_idx + g * 8, 8);
+                uint8_t bits = (uint8_t)_pext_u64(
+                        bytes, 0x0101010101010101ULL);
+                mask |= ((uint64_t)bits << (g * 8));
+            }
+#else
+            for (int b = 0; b < 64; b++) {
+                if (bitset[point_idx + b])
+                    mask |= (1ULL << b);
+            }
+#endif
+
+            // Byte-level stream compaction (replaces
+            // _mm512_maskz_compress_epi8 which requires VBMI2).
+#ifdef __BMI2__
+            // PEXT/PDEP path, 8 codes per step: PDEP * 0xFF turns 8 mask bits
+            // into a 0x00/0xFF byte mask and PEXT packs the kept bytes low. A
+            // full 8-byte store is issued; write_pos advances by popcount only.
+            for (size_t ci = 0; ci < chunk_size; ci++) {
+                size_t chunk_offset = ci * max_batch_size;
+                const uint8_t* src = codes + chunk_offset + point_idx;
+                uint8_t* dst = compressed_codes + chunk_offset + num_active;
+                int write_pos = 0;
+                for (int g = 0; g < 8; g++) {
+                    uint64_t src_val;
+                    memcpy(&src_val, src + g * 8, 8);
+                    uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF);
+                    uint64_t byte_mask =
+                            _pdep_u64(submask, 0x0101010101010101ULL) *
+                            0xFF;
+                    uint64_t compressed_val = _pext_u64(src_val, byte_mask);
+                    int count = __builtin_popcount(submask);
+                    memcpy(dst + write_pos, &compressed_val, 8);
+                    write_pos += count;
+                }
+            }
+#else
+            // Scalar fallback: scan set bits one by one and copy
+            // the corresponding code byte.
+            for (size_t ci = 0; ci < chunk_size; ci++) {
+                size_t chunk_offset = ci * max_batch_size;
+                const uint8_t* src = codes + chunk_offset + point_idx;
+                uint8_t* dst = compressed_codes + chunk_offset + num_active;
+                int write_pos = 0;
+                uint64_t m = mask;
+                while (m) {
+                    int bit = __builtin_ctzll(m);
+                    dst[write_pos++] = src[bit];
+                    m &= m - 1;
+                }
+            }
+#endif
+            // The next 64-point group appends after the codes kept so far.
+            num_active += __builtin_popcountll(mask);
+        }
+    } else {
+        num_active = next_num_active;
+        compressed_codes = const_cast<uint8_t*>(codes);
+    }
+
+    return std::make_pair(compressed_codes, num_active);
+}
+
+} // namespace panorama_kernels
+} // namespace faiss
+
+#endif // COMPILE_SIMD_AVX512
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
new file mode 100644
index 0000000000..ab9f7acb57
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Scalar implementations of Panorama kernels.
+// Compiled only when no SIMD variant (AVX2/AVX-512) is available.
+
+#if !defined(COMPILE_SIMD_AVX2) && !defined(COMPILE_SIMD_AVX512)
+
+#include <faiss/impl/panorama_kernels/panorama_kernels.h>
+
+#include <cstring>
+
+#ifdef __BMI2__
+#include <immintrin.h>
+#endif
+
+namespace faiss {
+namespace panorama_kernels {
+
+void process_chunks(
+        size_t chunk_size, // number of chunks (outer loop count)
+        size_t max_batch_size, // stride between chunks in compressed_codes
+        size_t num_active, // slots updated in exact_distances
+        float* sim_table, // read as 256 floats per chunk
+        uint8_t* compressed_codes,
+        float* exact_distances) { // += one LUT entry per chunk, per slot
+    for (size_t chunk_idx = 0; chunk_idx < chunk_size; chunk_idx++) {
+        size_t chunk_offset = chunk_idx * max_batch_size; // code row start
+        float* chunk_sim = sim_table + chunk_idx * 256; // this chunk's LUT
+        for (size_t i = 0; i < num_active; i++) {
+            exact_distances[i] +=
+                    chunk_sim[compressed_codes[chunk_offset + i]];
+        }
+    }
+}
+
+size_t process_filtering(
+        size_t num_active, // entries to test
+        float* exact_distances, // compacted in place; survivors first
+        uint32_t* active_indices, // compacted in place; survivors first
+        float* cum_sums, // read at [active index - batch_offset]
+        uint8_t* bitset, // written at [active index - batch_offset]
+        size_t batch_offset,
+        float dis0,
+        float query_cum_norm,
+        float epsilon,
+        float heap_max) { // returns the number of survivors
+    size_t next_num_active = 0;
+    for (size_t i = 0; i < num_active; i++) {
+        float exact_distance = exact_distances[i];
+        float cum_sum = cum_sums[active_indices[i] - batch_offset];
+        float lower_bound =
+                exact_distance + dis0 - cum_sum * query_cum_norm * epsilon;
+        // Unconditional writes; the cursor only advances for kept entries.
+        bool keep = heap_max > lower_bound;
+        active_indices[next_num_active] = active_indices[i];
+        exact_distances[next_num_active] = exact_distance;
+        bitset[active_indices[i] - batch_offset] = keep;
+        next_num_active += keep;
+    }
+    return next_num_active;
+}
+
+std::pair<uint8_t*, size_t> process_code_compression(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t chunk_size,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    uint8_t* compressed_codes = compressed_codes_begin;
+    size_t num_active = 0;
+    // Returns {pointer to the code buffer, number of active points in it}.
+    // An important optimization is to skip the compression if all points
+    // are active: the original `codes` pointer is then returned as-is.
+    if (next_num_active < max_batch_size) {
+        compressed_codes = compressed_codes_begin; // same as initial value
+        for (size_t point_idx = 0; point_idx < max_batch_size;
+             point_idx += 64) {
+            // Build a 64-bit mask from the byteset: each byte is
+            // 0 or 1, collect into a single bitmask.
+            uint64_t mask = 0;
+#ifdef __BMI2__
+            for (int g = 0; g < 8; g++) {
+                uint64_t bytes;
+                memcpy(&bytes, bitset + point_idx + g * 8, 8);
+                uint8_t bits = (uint8_t)_pext_u64(
+                        bytes, 0x0101010101010101ULL);
+                mask |= ((uint64_t)bits << (g * 8));
+            }
+#else
+            for (int b = 0; b < 64; b++) {
+                if (bitset[point_idx + b])
+                    mask |= (1ULL << b);
+            }
+#endif
+
+            // Byte-level stream compaction.
+#ifdef __BMI2__
+            // PEXT/PDEP path, 8 codes per step: PDEP * 0xFF turns 8 mask bits
+            // into a 0x00/0xFF byte mask and PEXT packs the kept bytes low. A
+            // full 8-byte store is issued; write_pos advances by popcount only.
+            for (size_t ci = 0; ci < chunk_size; ci++) {
+                size_t chunk_offset = ci * max_batch_size;
+                const uint8_t* src = codes + chunk_offset + point_idx;
+                uint8_t* dst = compressed_codes + chunk_offset + num_active;
+                int write_pos = 0;
+                for (int g = 0; g < 8; g++) {
+                    uint64_t src_val;
+                    memcpy(&src_val, src + g * 8, 8);
+                    uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF);
+                    uint64_t byte_mask =
+                            _pdep_u64(submask, 0x0101010101010101ULL) *
+                            0xFF;
+                    uint64_t compressed_val = _pext_u64(src_val, byte_mask);
+                    int count = __builtin_popcount(submask);
+                    memcpy(dst + write_pos, &compressed_val, 8);
+                    write_pos += count;
+                }
+            }
+#else
+            // Scalar fallback: scan set bits one by one and copy
+            // the corresponding code byte.
+            for (size_t ci = 0; ci < chunk_size; ci++) {
+                size_t chunk_offset = ci * max_batch_size;
+                const uint8_t* src = codes + chunk_offset + point_idx;
+                uint8_t* dst = compressed_codes + chunk_offset + num_active;
+                int write_pos = 0;
+                uint64_t m = mask;
+                while (m) {
+                    int bit = __builtin_ctzll(m);
+                    dst[write_pos++] = src[bit];
+                    m &= m - 1;
+                }
+            }
+#endif
+            // The next 64-point group appends after the codes kept so far.
+            num_active += __builtin_popcountll(mask);
+        }
+    } else {
+        num_active = next_num_active;
+        compressed_codes = const_cast<uint8_t*>(codes);
+    }
+
+    return std::make_pair(compressed_codes, num_active);
+}
+
+} // namespace panorama_kernels
+} // namespace faiss
+
+#endif // !COMPILE_SIMD_AVX2 && !COMPILE_SIMD_AVX512
diff --git a/faiss/impl/panorama_kernels/panorama_kernels.h b/faiss/impl/panorama_kernels/panorama_kernels.h
new file mode 100644
index 0000000000..6c8d007ddd
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+/**
+ * @file panorama_kernels.h
+ * @brief Panorama search kernels with scalar and AVX-512 implementations.
+ *
+ * The three core kernels of the Panorama progressive filtering search:
+ * - process_chunks: accumulate PQ distance table lookups over chunks
+ * - process_filtering: Cauchy-Schwarz lower bound pruning with stream
+ *   compaction
+ * - process_code_compression: byte-level stream compaction of PQ codes
+ *
+ * Implementations live in panorama_kernels-generic.cpp (scalar) and
+ * panorama_kernels-avx512.cpp (AVX-512 gather; BMI2 PEXT/PDEP compaction).
+ */
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+
+namespace faiss {
+namespace panorama_kernels {
+
+/// Accumulate PQ distance table lookups over chunks.
+///
+/// For each chunk c < `chunk_size` and each i < `num_active`, adds
+/// `sim_table[c * 256 + compressed_codes[c * max_batch_size + i]]`
+/// to `exact_distances[i]`. Chunks are the outer loop to keep the LUT
+/// slice in L1 cache; the AVX-512 version unrolls 4 chunks at a time.
+void process_chunks(
+        size_t chunk_size,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances);
+
+/// Filter active elements using Cauchy-Schwarz lower bound pruning.
+///
+/// An entry survives while `heap_max` is strictly greater than
+/// `exact_distance + dis0 - cum_sum * query_cum_norm * epsilon`.
+/// Survivors are packed to the front of `exact_distances` and
+/// `active_indices` (stream compaction); the return value is their
+/// count. `bitset[index - batch_offset]` is written for every entry
+/// tested: 1 if kept, 0 if removed. AVX-512 does not support a way to
+/// scatter at a 1-byte granularity, so the bitset is updated with
+/// sequential scalar stores.
+size_t process_filtering(
+        size_t num_active,
+        float* exact_distances,
+        uint32_t* active_indices,
+        float* cum_sums,
+        uint8_t* bitset,
+        size_t batch_offset,
+        float dis0,
+        float query_cum_norm,
+        float epsilon,
+        float heap_max);
+
+/// Byte-level stream compaction of PQ codes using the active bitset.
+///
+/// If all points are active (`next_num_active >= max_batch_size`) the
+/// compression is skipped and `{codes, next_num_active}` is returned.
+/// Otherwise each chunk row of `codes` is compacted into
+/// `compressed_codes_begin` and `{compressed_codes_begin, n}` is
+/// returned, where n counts the set entries of `bitset`.
+///
+/// Rows are processed 64 points at a time, so no remainder handling is
+/// needed as long as `max_batch_size` is a multiple of 64 (which we
+/// assert in the constructor). `compressed_codes_begin` is allocated to
+/// `max_batch_size` * `chunk_size` elements. Only the last batch may be
+/// smaller; the caller ensures the batch and bitset are zero-padded.
+std::pair<uint8_t*, size_t> process_code_compression(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t chunk_size,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes);
+
+} // namespace panorama_kernels
+} // namespace faiss

From a1f7274725de36dcfa187d1b8440a1b0f2b7ebba Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Wed, 18 Mar 2026 04:21:49 +0000
Subject: [PATCH 06/41] Unread but fast

---
 benchs/bench_ivfpq_panorama.py |  55 +--
 faiss/IndexIVF.h               |  37 --
 faiss/IndexIVFPQ.cpp           | 279 --------------
 faiss/IndexIVFPQPanorama.cpp   | 674 +++++++++++++++------------------
 faiss/IndexIVFPQPanorama.h     |  98 +++--
 5 files changed, 390 insertions(+), 753 deletions(-)

diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py
index ebd1336092..7c965d54fe 100644
--- a/benchs/bench_ivfpq_panorama.py
+++ b/benchs/bench_ivfpq_panorama.py
@@ -16,7 +16,7 @@ def fvecs_read(fname):
     return a.reshape(-1, d + 1)[:, 1:].copy()
 
 
-GIST_DIR = "/home/lutex/PCA_init"
+GIST_DIR = "/datasets/PCA_init"
 CACHE_DIR = "/home/lutex/faiss-panorama/index_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 
@@ -74,7 +74,7 @@ def eval_recall(index, nprobe_val):
     return recall, qps
 
 
-# faiss.omp_set_num_threads(mp.cpu_count())
+faiss.omp_set_num_threads(mp.cpu_count())
 
 # --- IVFPQ baseline (cached) ---
 if os.path.exists(IVFPQ_CACHE):
@@ -109,41 +109,46 @@ def eval_recall(index, nprobe_val):
 # --- IVFPQPanorama (reuse trained PQ from cache) ---
 faiss.omp_set_num_threads(mp.cpu_count())
 
-if os.path.exists(IVFPQ_TRAINED_CACHE):
-    print(f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...", flush=True)
-    trained = faiss.read_index(IVFPQ_TRAINED_CACHE)
-    quantizer2 = trained.quantizer
-    trained.own_fields = False
 
-    ivfpq_pano = faiss.IndexIVFPQPanorama(
+def build_panorama_from_trained(trained_index):
+    quantizer2 = trained_index.quantizer
+    trained_index.own_fields = False
+
+    pano = faiss.IndexIVFPQPanorama(
         quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size
     )
-    centroids = faiss.vector_to_array(trained.pq.centroids)
-    faiss.copy_array_to_vector(centroids, ivfpq_pano.pq.centroids)
-    ivfpq_pano.is_trained = True
-    ivfpq_pano.use_precomputed_table = 1
-    ivfpq_pano.precompute_table()
+    centroids = faiss.vector_to_array(trained_index.pq.centroids)
+    faiss.copy_array_to_vector(centroids, pano.pq.centroids)
+    pano.is_trained = True
+    pano.use_precomputed_table = 1
+    pano.precompute_table()
+    return pano
 
+
+if os.path.exists(IVFPQ_TRAINED_CACHE):
+    print(f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...", flush=True)
+    trained = faiss.read_index(IVFPQ_TRAINED_CACHE)
+    ivfpq_pano = build_panorama_from_trained(trained)
     print("  Reused trained PQ (skipped training).", flush=True)
-    t0 = time.time()
-    ivfpq_pano.add(xb)
-    print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
 else:
     print(
-        f"\nBuilding IVFPQPanorama from scratch: nlist={nlist}, M={M}, nbits={nbits}, "
-        f"n_levels={n_levels}, epsilon={epsilon}, batch_size={batch_size}",
+        f"\nTraining IVFPQ for Panorama from scratch: nlist={nlist}, M={M}, nbits={nbits}",
         flush=True,
     )
     quantizer2 = faiss.IndexFlatL2(d)
-    ivfpq_pano = faiss.IndexIVFPQPanorama(
-        quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size
-    )
+    trained = faiss.IndexIVFPQ(quantizer2, d, nlist, M, nbits)
     t0 = time.time()
-    ivfpq_pano.train(xt)
+    trained.train(xt)
     print(f"  Training took {time.time() - t0:.1f}s", flush=True)
-    t0 = time.time()
-    ivfpq_pano.add(xb)
-    print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
+
+    print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
+    faiss.write_index(trained, IVFPQ_TRAINED_CACHE)
+
+    ivfpq_pano = build_panorama_from_trained(trained)
+
+t0 = time.time()
+ivfpq_pano.add(xb)
+print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
 
 faiss.omp_set_num_threads(1)
 print("\n====== IVFPQPanorama", flush=True)
diff --git a/faiss/IndexIVF.h b/faiss/IndexIVF.h
index d66523d245..a02665452f 100644
--- a/faiss/IndexIVF.h
+++ b/faiss/IndexIVF.h
@@ -19,11 +19,9 @@
 #include <faiss/invlists/DirectMap.h>
 #include <faiss/invlists/InvertedLists.h>
 #include <faiss/utils/Heap.h>
-#include <faiss/impl/ProductQuantizer.h>
 
 namespace faiss {
 
-struct IndexIVFPQPanorama;
 
 /** Encapsulates a quantizer object for the IndexIVF
  *
@@ -500,15 +498,6 @@ struct InvertedListScanner {
     /// following codes come from this inverted list
     virtual void set_list(idx_t list_no, float coarse_dis);
 
-    virtual void set_list_panorama(
-            idx_t list_no,
-            float coarse_dis,
-            float* sim_table,
-            float* dis0_ptr,
-            bool update) {}
-
-    virtual void set_sim_table(float* sim_table, float dis0_ptr) {}
-
     /// compute a single query-to-code distance
     virtual float distance_to_code(const uint8_t* code) const = 0;
 
@@ -565,32 +554,6 @@ struct InvertedListScanner {
             const idx_t* ids,
             ResultHandler& handler) const;
 
-    virtual size_t process_batch(
-            const ProductQuantizer& pq,
-            uint8_t* compressed_codes,
-            size_t cluster_id,
-            size_t batch_no,
-            float coarse_dis_i,
-            size_t curr_batch_size,
-            size_t max_batch_size,
-            size_t chunk_size,
-            float epsilon,
-            size_t n_levels,
-            const uint8_t* codes_batch,
-            float* cums,
-            float* query_cum_norms,
-            uint32_t* active_indices,
-            uint8_t* bitset,
-            float* exact_distances,
-            const idx_t* ids,
-            float* heap_sim,
-            idx_t* heap_ids,
-            size_t k,
-            float* dis0_cache,
-            float* sim_table_cache) {
-        return 0;
-    }
-
     virtual ~InvertedListScanner() {}
 };
 
diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp
index aca27f903d..270c092740 100644
--- a/faiss/IndexIVFPQ.cpp
+++ b/faiss/IndexIVFPQ.cpp
@@ -32,7 +32,6 @@
 #include <faiss/impl/ProductQuantizer.h>
 #include <faiss/impl/ResultHandler.h>
 #include <faiss/impl/pq_code_distance/pq_code_distance-inl.h>
-#include <faiss/impl/panorama_kernels/panorama_kernels.h>
 #include <faiss/impl/simd_dispatch.h>
 
 namespace faiss {
@@ -762,44 +761,6 @@ struct QueryTables {
 
         return dis0;
     }
-
-    float precompute_list_tables_L2_panorama(float* sim_table_ptr) {
-        float dis0 = 0;
-
-        if (use_precomputed_table == 1) {
-            dis0 = coarse_dis;
-
-            const size_t n = pq.M * pq.ksub;
-            const float bf = -2.0f;
-            const float* b = sim_table_2;
-            float* c = sim_table_ptr;
-
-            for (size_t idx = 0; idx < n; idx++) {
-                c[idx] = bf * b[idx];
-            }
-
-            sim_table = sim_table_ptr;
-        } else {
-            FAISS_THROW_MSG(
-                    "Panorama PQ only supports use_precomputed_table == 1");
-        }
-
-        return dis0;
-    }
-
-    float precompute_list_tables_panorama(float* sim_table_ptr) {
-        float dis0 = 0;
-        uint64_t t0;
-        TIC;
-        if (by_residual) {
-            if (metric_type == METRIC_INNER_PRODUCT)
-                dis0 = precompute_list_tables_IP();
-            else
-                dis0 = precompute_list_tables_L2_panorama(sim_table_ptr);
-        }
-        init_list_cycles += TOC;
-        return dis0;
-    }
 };
 
 template <class C, bool use_sel>
@@ -831,39 +792,6 @@ struct WrappedSearchResult {
     }
 };
 
-template <class C, bool use_sel>
-struct KnnSearchResultsPanorama {
-    idx_t key;
-    const idx_t* ids;
-    const IDSelector* sel;
-
-    size_t k;
-    float* heap_sim;
-    idx_t* heap_ids;
-
-    size_t nup;
-
-    inline bool skip_entry(idx_t j) {
-        return use_sel && !sel->is_member(ids[j]);
-    }
-
-    inline bool should_keep(float dis) {
-        return C::cmp(heap_sim[0], dis);
-    }
-
-    inline float top() {
-        return heap_sim[0];
-    }
-
-    inline void add(idx_t j, float dis) {
-        if (C::cmp(heap_sim[0], dis)) {
-            idx_t id = ids ? ids[j] : lo_build(key, j);
-            heap_replace_top<C>(k, heap_sim, heap_ids, dis, id);
-            nup++;
-        }
-    }
-};
-
 /*****************************************************
  * Scaning the codes.
  * The scanning functions call their favorite precompute_*
@@ -894,26 +822,6 @@ struct IVFPQScannerT : QueryTables {
         }
     }
 
-    void init_list_panorama(
-            idx_t list_no,
-            float coarse_dis,
-            int mode,
-            float* sim_table,
-            float* dis0_ptr,
-            bool update) {
-        this->key = list_no;
-        this->coarse_dis = coarse_dis;
-
-        if (mode == 2) {
-            if (update) {
-                *dis0_ptr = precompute_list_tables_panorama(sim_table);
-            }
-            dis0 = *dis0_ptr;
-        } else if (mode == 1) {
-            dis0 = precompute_list_table_pointers();
-        }
-    }
-
     /*****************************************************
      * Scaning the codes: simple PQ scan.
      *****************************************************/
@@ -1300,193 +1208,6 @@ struct IVFPQScanner : IVFPQScannerT<idx_t, METRIC_TYPE, PQCodeDist>,
         this->init_list(list_no, coarse_dis, precompute_mode);
     }
 
-    void set_list_panorama(
-            idx_t list_no,
-            float coarse_dis,
-            float* sim_table,
-            float* dis0_ptr,
-            bool update) override {
-        this->list_no = list_no;
-        this->init_list_panorama(
-                list_no,
-                coarse_dis,
-                precompute_mode,
-                sim_table,
-                dis0_ptr,
-                update);
-    }
-
-    void set_sim_table(float* sim_table, float dis0) override {
-        this->sim_table = sim_table;
-        this->dis0 = dis0;
-    }
-
-    // Panorama kernels (process_chunks, process_filtering,
-    // process_code_compression) are implemented in
-    // faiss/impl/panorama_kernels/ with scalar and AVX-512 variants.
-    // The linker selects the right one based on the SIMD compile target.
-
-    inline void process_chunks_sparse(
-            size_t chunk_size,
-            size_t max_batch_size,
-            size_t num_active,
-            float* sim_table,
-            const uint8_t* codes,
-            float* exact_distances,
-            uint32_t* active_indices,
-            size_t batch_offset,
-            size_t ksub) {
-        for (size_t ci = 0; ci < chunk_size; ci++) {
-            size_t chunk_offset = ci * max_batch_size;
-            float* chunk_sim_table = sim_table + ci * ksub;
-
-            for (size_t batch_idx = 0; batch_idx < num_active; batch_idx++) {
-                size_t real_idx = active_indices[batch_idx] - batch_offset;
-                uint8_t code = codes[chunk_offset + real_idx];
-                exact_distances[batch_idx] += chunk_sim_table[code];
-            }
-        }
-    }
-
-    size_t process_batch(
-            const ProductQuantizer& pq,
-            uint8_t* compressed_codes,
-            size_t cluster_id,
-            size_t batch_no,
-            float coarse_dis_i,
-            size_t curr_batch_size,
-            size_t max_batch_size,
-            size_t chunk_size,
-            float epsilon,
-            size_t n_levels,
-            const uint8_t* codes_batch,
-            float* cums,
-            float* query_cum_norms,
-            uint32_t* active_indices,
-            uint8_t* bitset,
-            float* exact_distances,
-            const idx_t* ids,
-            float* heap_sim,
-            idx_t* heap_ids,
-            size_t k,
-            float* dis0_cache,
-            float* sim_table_cache) override {
-        KnnSearchResultsPanorama<C, use_sel> res = {
-                this->key,
-                this->store_pairs ? nullptr : ids,
-                this->sel,
-                k,
-                heap_sim,
-                heap_ids,
-                0};
-        uint8_t* compressed_codes_begin = compressed_codes;
-        size_t total_active = 0;
-
-        // The remaining active elements computed at the end of each level.
-        // We initialize to `curr_batch_size` for continuity.
-        size_t next_num_active = curr_batch_size;
-        // For historical reasons, we initialize dis0 only at
-        // the beginning of the first level, but we need to access it after
-        // all levels have been processed, so we declare dis0 here.
-        float dis0 = 0;
-        // Given that `active_indices` indexes the cluster directly, we need
-        // to offset it by the batch offset when updating the bitset and
-        // accessing the cum_sums. This way we avoid yet another layer of
-        // indirection.
-        size_t batch_offset = batch_no * max_batch_size;
-        for (size_t level = 0; (level < n_levels) && (next_num_active > 0);
-             level++) {
-            total_active += next_num_active;
-
-            // This ensures the LUT is pointing to the right offset, and is
-            // properly initialized. We only compute dis0 distances once for
-            // each cluster, and cache the result.
-            size_t level_offset_sim_table = level * pq.ksub * chunk_size;
-            this->set_list_panorama(
-                    cluster_id,
-                    coarse_dis_i,
-                    sim_table_cache + level_offset_sim_table,
-                    dis0_cache, // Only init once for each cluster.
-                    level == 0 && batch_no == 0);
-            this->set_sim_table(
-                    sim_table_cache + level_offset_sim_table, *dis0_cache);
-
-            dis0 = this->dis0;
-
-            // We multiply by two here so we don't have to do it in the
-            // kernel.
-            float query_cum_norm = 2 * query_cum_norms[level + 1];
-
-            float heap_max = res.top();
-
-            // Codes has padding potentially, cumsum does not.
-            float* cum_sums = cums + curr_batch_size * level;
-            const uint8_t* codes =
-                    codes_batch + max_batch_size * chunk_size * level;
-
-            bool is_sparse = next_num_active < max_batch_size / 16;
-            float* sim_table = this->sim_table;
-
-            // Phase 1: Process all chunks and accumulate distances.
-            // We iterate over chunks first as this keeps the same LUT slice
-            // within the L1 cache. To avoid register thrashing, we unroll
-            // 4 chunks at a time.
-            size_t num_active_for_filtering = 0;
-            if (is_sparse) {
-                process_chunks_sparse(
-                        chunk_size,
-                        max_batch_size,
-                        next_num_active,
-                        sim_table,
-                        codes,
-                        exact_distances,
-                        active_indices,
-                        batch_offset,
-                        pq.ksub);
-                num_active_for_filtering = next_num_active;
-            } else {
-                auto [cc, na] =
-                        panorama_kernels::process_code_compression(
-                                next_num_active,
-                                max_batch_size,
-                                chunk_size,
-                                compressed_codes_begin,
-                                bitset,
-                                codes);
-
-                panorama_kernels::process_chunks(
-                        chunk_size,
-                        max_batch_size,
-                        na,
-                        sim_table,
-                        cc,
-                        exact_distances);
-                num_active_for_filtering = na;
-            }
-
-            // Phase 2: Filtering logic using accumulated distances.
-            next_num_active = panorama_kernels::process_filtering(
-                    num_active_for_filtering,
-                    exact_distances,
-                    active_indices,
-                    cum_sums,
-                    bitset,
-                    batch_offset,
-                    dis0,
-                    query_cum_norm,
-                    epsilon,
-                    heap_max);
-        }
-
-        // Phase 3: Insert remaining candidates to heap.
-        for (size_t batch_idx = 0; batch_idx < next_num_active; batch_idx++) {
-            res.add(active_indices[batch_idx],
-                    dis0 + exact_distances[batch_idx]);
-        }
-
-        return total_active;
-    }
-
     float distance_to_code(const uint8_t* code) const override {
         assert(precompute_mode == 2);
         float dis = this->dis0 +
diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp
index aae0811176..8ac3df210a 100644
--- a/faiss/IndexIVFPQPanorama.cpp
+++ b/faiss/IndexIVFPQPanorama.cpp
@@ -1,29 +1,27 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
 #include <faiss/IndexIVFPQPanorama.h>
-#include <omp.h>
-#include <cstdint>
-#include <memory>
-#include <mutex>
 
 #include <algorithm>
-#include <cinttypes>
+#include <cmath>
 #include <cstdio>
-#include <iostream>
-#include <limits>
+#include <cstring>
 #include <numeric>
 
-#include <faiss/utils/hamming.h>
-#include <faiss/utils/utils.h>
-
-#include <faiss/IndexFlat.h>
-#include <faiss/impl/AuxIndexStructures.h>
-#include <faiss/impl/CodePacker.h>
 #include <faiss/impl/FaissAssert.h>
-#include <faiss/impl/IDSelector.h>
+#include <faiss/impl/panorama_kernels/panorama_kernels.h>
+#include <faiss/utils/Heap.h>
 
 namespace faiss {
 
-static uint64_t total_active = 0;
-static uint64_t total_points = 0;
+/*****************************************
+ * Constructor
+ ******************************************/
 
 IndexIVFPQPanorama::IndexIVFPQPanorama(
         Index* quantizer,
@@ -45,78 +43,65 @@ IndexIVFPQPanorama::IndexIVFPQPanorama(
                   metric,
                   own_invlists),
           n_levels(n_levels),
-          added(false),
+          epsilon(epsilon),
+          batch_size(batch_size),
           chunk_size(code_size / n_levels),
           levels_size(d / n_levels),
-          nbits_per_idx(nbits_per_idx),
-          m_level_width(M / n_levels),
-          epsilon(epsilon),
-          batch_size(batch_size) {
-    FAISS_ASSERT(M % n_levels == 0);
-    FAISS_ASSERT(batch_size % 64 == 0);
-
-    printf("N levels = %d\n", n_levels);
-    printf("M = code_size = %zu\n", M);
-    printf("Nbits per idx = %u (fixed)\n", 8);
-    printf("Nlist = %zu\n", nlist);
-    printf("Batch size = %zuB\n", batch_size);
-
-    FAISS_ASSERT(m_level_width > 0);
-    FAISS_ASSERT(nbits_per_idx == 8);
-    FAISS_ASSERT(M == code_size);
-    FAISS_ASSERT(metric == METRIC_L2);
+          m_level_width(M / n_levels) {
+    FAISS_THROW_IF_NOT_MSG(M % n_levels == 0, "M must be divisible by n_levels");
+    FAISS_THROW_IF_NOT_MSG(batch_size % 64 == 0, "batch_size must be multiple of 64");
+    FAISS_THROW_IF_NOT_MSG(nbits_per_idx == 8, "only 8-bit PQ codes supported");
+    FAISS_THROW_IF_NOT_MSG(M == code_size, "M must equal code_size for 8-bit PQ");
+    FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only L2 metric supported");
 }
 
+/*****************************************
+ * add — transpose codes into column-major layout and precompute norms
+ ******************************************/
+
 void IndexIVFPQPanorama::add(idx_t n, const float* x) {
-    FAISS_ASSERT(!added);
+    FAISS_THROW_IF_NOT_MSG(!added, "IndexIVFPQPanorama only supports a single add() call");
     added = true;
-
     num_points = n;
+
     IndexIVFPQ::add(n, x);
 
-    size_t new_n = 0;
+    // Compute column offsets (each list padded to a multiple of batch_size).
+    size_t total_column_bytes = 0;
     column_offsets = new size_t[nlist];
     for (size_t i = 0; i < nlist; i++) {
-        column_offsets[i] = new_n;
-        size_t batch_n = (invlists->list_size(i) + batch_size - 1) / batch_size;
-        size_t rounded_n = batch_n * batch_size;
-        new_n += rounded_n * code_size;
+        column_offsets[i] = total_column_bytes;
+        size_t n_batches =
+                (invlists->list_size(i) + batch_size - 1) / batch_size;
+        total_column_bytes += n_batches * batch_size * code_size;
     }
 
-    column_storage = new uint8_t[code_size * new_n];
-
+    // Transpose codes from row-major [point0_code, point1_code, ...] into
+    // column-major within each batch: M columns of batch_size bytes each.
+    column_storage = new uint8_t[total_column_bytes]();
     for (size_t list_no = 0; list_no < nlist; list_no++) {
         size_t col_offset = column_offsets[list_no];
         size_t list_size = invlists->list_size(list_no);
         size_t n_batches = (list_size + batch_size - 1) / batch_size;
+        const uint8_t* row_codes = invlists->get_codes(list_no);
+
         for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
             size_t batch_offset = batch_no * batch_size * code_size;
             size_t curr_batch_size =
                     std::min(list_size - batch_no * batch_size, batch_size);
             for (size_t m = 0; m < pq.M; m++) {
-                size_t m_offset = m * batch_size;
-                for (size_t point_idx = 0; point_idx < batch_size;
-                     point_idx++) {
-                    uint8_t* dest = column_storage + col_offset + batch_offset +
-                            m_offset + point_idx;
-                    const uint8_t* codes = invlists->get_codes(list_no);
-
-                    if (point_idx < curr_batch_size) {
-                        const uint8_t* src = codes + batch_offset +
-                                point_idx * code_size + m;
-                        memcpy(dest, src, 1);
-                    } else {
-                        *dest = 0;
-                    }
+                for (size_t p = 0; p < curr_batch_size; p++) {
+                    column_storage[col_offset + batch_offset +
+                                   m * batch_size + p] =
+                            row_codes[batch_no * batch_size * code_size +
+                                      p * code_size + m];
                 }
             }
         }
     }
 
-    cum_sums = new float[(n_levels + 1) * n];
+    // Precompute per-level suffix norms and initial exact distances per code.
     cum_sum_offsets = new size_t[nlist];
-
-    init_exact_distances = new float[n];
     init_exact_distances_offsets = new size_t[nlist];
 
     size_t cum_size = 0;
@@ -124,13 +109,14 @@ void IndexIVFPQPanorama::add(idx_t n, const float* x) {
     for (size_t list_no = 0; list_no < nlist; list_no++) {
         cum_sum_offsets[list_no] = cum_size;
         cum_size += invlists->list_size(list_no) * (n_levels + 1);
-
         init_exact_distances_offsets[list_no] = init_size;
         init_size += invlists->list_size(list_no);
     }
 
+    cum_sums = new float[cum_size];
+    init_exact_distances = new float[init_size];
+
     for (size_t list_no = 0; list_no < nlist; list_no++) {
-        const idx_t* idx = invlists->get_ids(list_no);
         size_t list_size = invlists->list_size(list_no);
 
         std::vector<float> centroid(d);
@@ -141,365 +127,299 @@ void IndexIVFPQPanorama::add(idx_t n, const float* x) {
         for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
             size_t b_offset = batch_no * batch_size;
             size_t curr_batch_size =
-                    std::min(list_size - batch_no * batch_size, batch_size);
+                    std::min(list_size - b_offset, batch_size);
 
-            for (size_t point_idx = 0; point_idx < curr_batch_size;
-                 point_idx++) {
-                float init_exact_distance = 0.0f;
-
-                std::vector<float> vector(d);
+            for (size_t p = 0; p < curr_batch_size; p++) {
+                std::vector<float> vec(d);
                 const uint8_t* code =
-                        invlists->get_single_code(list_no, b_offset + point_idx);
-                pq.decode(code, vector.data());
-
-                std::vector<float> suffix_sums(d + 1);
-                suffix_sums[d] = 0.0f;
+                        invlists->get_single_code(list_no, b_offset + p);
+                pq.decode(code, vec.data());
 
+                float init_dist = 0.0f;
+                std::vector<float> suffix(d + 1, 0.0f);
                 for (int j = d - 1; j >= 0; j--) {
-                    init_exact_distance +=
-                            vector[j] * vector[j] + 2 * vector[j] * centroid[j];
-                    float squaredVal = vector[j] * vector[j];
-                    suffix_sums[j] = suffix_sums[j + 1] + squaredVal;
+                    init_dist += vec[j] * vec[j] + 2 * vec[j] * centroid[j];
+                    suffix[j] = suffix[j + 1] + vec[j] * vec[j];
                 }
 
                 for (int level = 0; level < n_levels; level++) {
                     int start_idx = level * levels_size;
                     size_t offset = cum_sum_offsets[list_no] +
                             b_offset * (n_levels + 1) +
-                            level * curr_batch_size + point_idx;
-                    if (start_idx < (int)d) {
-                        cum_sums[offset] = sqrt(suffix_sums[start_idx]);
-                    } else {
-                        cum_sums[offset] = 0.0f;
-                    }
+                            level * curr_batch_size + p;
+                    cum_sums[offset] = start_idx < (int)d
+                            ? std::sqrt(suffix[start_idx])
+                            : 0.0f;
                 }
 
-                size_t offset = cum_sum_offsets[list_no] +
+                size_t last_offset = cum_sum_offsets[list_no] +
                         b_offset * (n_levels + 1) +
-                        n_levels * curr_batch_size + point_idx;
-                cum_sums[offset] = 0.0f;
+                        n_levels * curr_batch_size + p;
+                cum_sums[last_offset] = 0.0f;
 
-                size_t init_offset = init_exact_distances_offsets[list_no];
-                init_exact_distances[init_offset + b_offset + point_idx] =
-                        init_exact_distance;
+                init_exact_distances
+                        [init_exact_distances_offsets[list_no] + b_offset + p] =
+                                init_dist;
             }
         }
     }
 }
 
-void IndexIVFPQPanorama::search(
-        idx_t n,
-        const float* x,
-        idx_t k,
-        float* distances,
-        idx_t* labels,
-        const SearchParameters* params_in) const {
-    FAISS_THROW_IF_NOT(k > 0);
-    const IVFSearchParameters* params = nullptr;
-    if (params_in) {
-        params = dynamic_cast<const IVFSearchParameters*>(params_in);
-        FAISS_THROW_IF_NOT_MSG(params, "IndexIVF params have incorrect type");
-    }
-    const size_t nprobe =
-            std::min(nlist, params ? params->nprobe : this->nprobe);
-    FAISS_THROW_IF_NOT(nprobe > 0);
-
-    auto sub_search_func = [this, k, nprobe, params](
-                                   idx_t n,
-                                   const float* x,
-                                   float* distances,
-                                   idx_t* labels,
-                                   IndexIVFStats* ivf_stats) {
-        std::unique_ptr<idx_t[]> idx(new idx_t[n * nprobe]);
-        std::unique_ptr<float[]> coarse_dis(new float[n * nprobe]);
-
-        quantizer->search(
-                n,
-                x,
-                nprobe,
-                coarse_dis.get(),
-                idx.get(),
-                params ? params->quantizer_params : nullptr);
-
-        invlists->prefetch_lists(idx.get(), n * nprobe);
-
-        search_preassigned(
-                n,
-                x,
-                k,
-                idx.get(),
-                coarse_dis.get(),
-                distances,
-                labels,
-                false,
-                params,
-                ivf_stats);
-    };
-
-    if ((parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT) == 0) {
-        int nt = std::min(omp_get_max_threads(), int(n));
-        std::vector<IndexIVFStats> stats(nt);
-        std::mutex exception_mutex;
-        std::string exception_string;
-
-#pragma omp parallel for if (nt > 1)
-        for (idx_t slice = 0; slice < nt; slice++) {
-            IndexIVFStats local_stats;
-            idx_t i0 = n * slice / nt;
-            idx_t i1 = n * (slice + 1) / nt;
-            if (i1 > i0) {
-                try {
-                    sub_search_func(
-                            i1 - i0,
-                            x + i0 * d,
-                            distances + i0 * k,
-                            labels + i0 * k,
-                            &stats[slice]);
-                } catch (const std::exception& e) {
-                    std::lock_guard<std::mutex> lock(exception_mutex);
-                    exception_string = e.what();
-                }
-            }
-        }
-
-        if (!exception_string.empty()) {
-            FAISS_THROW_FMT(
-                    "search error: %s", exception_string.c_str());
-        }
-    } else {
-        sub_search_func(n, x, distances, labels, &indexIVF_stats);
+/*****************************************
+ * Panorama scanner — overrides scan_codes with batch processing
+ ******************************************/
+
+namespace {
+
+using idx_t = faiss::idx_t;
+
+template <class C, bool use_sel>
+struct IVFPQScannerPanorama : InvertedListScanner {
+    const IndexIVFPQPanorama& index;
+    const ProductQuantizer& pq;
+
+    // Query state
+    const float* qi = nullptr;
+    std::vector<float> query_cum_norms;
+    std::vector<float> sim_table_2;
+
+    // Per-list state
+    float coarse_dis = 0;
+
+    IVFPQScannerPanorama(
+            const IndexIVFPQPanorama& index,
+            bool store_pairs,
+            const IDSelector* sel)
+            : InvertedListScanner(store_pairs, sel),
+              index(index),
+              pq(index.pq) {
+        this->keep_max = is_similarity_metric(index.metric_type);
+        this->code_size = pq.code_size;
+        query_cum_norms.resize(index.n_levels + 1);
+        sim_table_2.resize(pq.M * pq.ksub);
     }
-}
-
-void IndexIVFPQPanorama::search_preassigned(
-        idx_t n,
-        const float* x,
-        idx_t k,
-        const idx_t* keys,
-        const float* coarse_dis,
-        float* distances,
-        idx_t* labels,
-        bool store_pairs,
-        const IVFSearchParameters* params,
-        IndexIVFStats* ivf_stats) const {
-    FAISS_THROW_IF_NOT(k > 0);
-
-    idx_t nprobe = params ? params->nprobe : this->nprobe;
-    nprobe = std::min((idx_t)nlist, nprobe);
-    FAISS_THROW_IF_NOT(nprobe > 0);
-
-    const idx_t unlimited_list_size = std::numeric_limits<idx_t>::max();
-    idx_t max_codes = params ? params->max_codes : this->max_codes;
-    IDSelector* sel = params ? params->sel : nullptr;
-    const IDSelectorRange* selr = dynamic_cast<const IDSelectorRange*>(sel);
-    if (selr) {
-        if (selr->assume_sorted) {
-            sel = nullptr;
-        } else {
-            selr = nullptr;
-        }
-    }
-
-    FAISS_THROW_IF_NOT_MSG(
-            !(sel && store_pairs),
-            "selector and store_pairs cannot be combined");
 
-    FAISS_THROW_IF_NOT_MSG(
-            !invlists->use_iterator || (max_codes == 0 && store_pairs == false),
-            "iterable inverted lists don't support max_codes and store_pairs");
-
-    size_t nlistv = 0, ndis = 0, nheap = 0;
-
-    using HeapForIP = CMin<float, idx_t>;
-    using HeapForL2 = CMax<float, idx_t>;
-
-    bool interrupt = false;
-    std::mutex exception_mutex;
-    std::string exception_string;
+    void set_query(const float* query) override {
+        this->qi = query;
 
-    int pmode = this->parallel_mode & ~PARALLEL_MODE_NO_HEAP_INIT;
-    bool do_heap_init = !(this->parallel_mode & PARALLEL_MODE_NO_HEAP_INIT);
+        FAISS_ASSERT(index.by_residual);
+        FAISS_ASSERT(index.use_precomputed_table == 1);
 
-    FAISS_THROW_IF_NOT_MSG(
-            max_codes == 0 || pmode == 0 || pmode == 3,
-            "max_codes supported only for parallel_mode = 0 or 3");
+        pq.compute_inner_prod_table(qi, sim_table_2.data());
 
-    if (max_codes == 0) {
-        max_codes = unlimited_list_size;
+        // Compute query suffix sums → cum norms per level.
+        std::vector<float> suffix(index.d + 1, 0.0f);
+        for (int j = index.d - 1; j >= 0; j--) {
+            suffix[j] = suffix[j + 1] + qi[j] * qi[j];
+        }
+        for (int level = 0; level < index.n_levels; level++) {
+            int start = level * index.levels_size;
+            query_cum_norms[level] =
+                    start < (int)index.d ? std::sqrt(suffix[start]) : 0.0f;
+        }
+        query_cum_norms[index.n_levels] = 0.0f;
     }
 
-    [[maybe_unused]] bool do_parallel = omp_get_max_threads() >= 2 &&
-            (pmode == 0           ? false
-                     : pmode == 3 ? n > 1
-                     : pmode == 1 ? nprobe > 1
-                                  : nprobe * n > 1);
-
-    void* inverted_list_context =
-            params ? params->inverted_list_context : nullptr;
-
-    const size_t sim_table_size = pq.ksub * pq.M;
-    std::vector<float> sim_table_cache(nprobe * sim_table_size);
-    std::vector<float> dis0s_cache(nprobe);
-
-    std::vector<float> suffixSums(d + 1);
-    std::vector<float> query_cum_norms(n_levels + 1);
-    std::vector<float> query(d);
-    std::vector<float> exact_distances(batch_size);
-    std::vector<uint8_t> bitset(batch_size);
-    std::vector<uint32_t> active_indices(batch_size);
-    std::vector<uint8_t> compressed_codes(batch_size * chunk_size);
-
-#pragma omp parallel if (do_parallel) reduction(+ : nlistv, ndis, nheap)
-    {
-        std::unique_ptr<InvertedListScanner> scanner(
-                get_InvertedListScanner(store_pairs, sel, params));
-
-        auto init_result = [&](float* simi, idx_t* idxi) {
-            if (!do_heap_init)
-                return;
-            if (metric_type == METRIC_INNER_PRODUCT) {
-                heap_heapify<HeapForIP>(k, simi, idxi);
-            } else {
-                heap_heapify<HeapForL2>(k, simi, idxi);
-            }
-        };
-
-        auto reorder_result = [&](float* simi, idx_t* idxi) {
-            if (!do_heap_init)
-                return;
-            if (metric_type == METRIC_INNER_PRODUCT) {
-                heap_reorder<HeapForIP>(k, simi, idxi);
-            } else {
-                heap_reorder<HeapForL2>(k, simi, idxi);
-            }
-        };
-
-        FAISS_ASSERT(pmode == 0);
-        if (pmode == 0) {
-#pragma omp for
-            for (idx_t i = 0; i < n; i++) {
-                if (interrupt) {
-                    continue;
-                }
+    void set_list(idx_t list_no, float coarse_dis) override {
+        this->list_no = list_no;
+        this->coarse_dis = coarse_dis;
+    }
 
-                scanner->set_query(x + i * d);
-                suffixSums[d] = 0.0f;
+    float distance_to_code(const uint8_t* code) const override {
+        FAISS_THROW_MSG(
+                "IndexIVFPQPanorama does not support distance_to_code");
+    }
 
-                const float* q = x + i * d;
+    size_t scan_codes(
+            size_t list_size,
+            const uint8_t* /* codes (row-major, unused) */,
+            const idx_t* ids,
+            float* distances,
+            idx_t* labels,
+            size_t k) const override {
+        size_t nup = 0;
+
+        const size_t bs = index.batch_size;
+        const size_t cs = index.chunk_size;
+        const int n_levels = index.n_levels;
+        const float epsilon = index.epsilon;
+
+        const size_t n_batches = (list_size + bs - 1) / bs;
+        const size_t sim_table_size = pq.ksub * pq.M;
+
+        // Panorama column-major codes for this list.
+        const uint8_t* col_codes =
+                index.column_storage + index.column_offsets[list_no];
+        const float* list_cum_sums =
+                index.cum_sums + index.cum_sum_offsets[list_no];
+        const float* list_init_dists =
+                index.init_exact_distances +
+                index.init_exact_distances_offsets[list_no];
+
+        // Scratch buffers.
+        std::vector<float> exact_distances(bs);
+        std::vector<uint8_t> bitset(bs);
+        std::vector<uint32_t> active_indices(bs);
+        std::vector<uint8_t> compressed_codes(bs * cs);
+        std::vector<float> sim_table_cache(sim_table_size);
+        float dis0_cache = 0;
 
-                for (int j = d - 1; j >= 0; --j) {
-                    float squaredVal = q[j] * q[j];
-                    suffixSums[j] = suffixSums[j + 1] + squaredVal;
-                }
+        for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
+            size_t curr_batch_size =
+                    std::min(list_size - batch_no * bs, bs);
+            size_t b_offset = batch_no * bs;
+
+            // Initialize active set.
+            std::iota(
+                    active_indices.begin(),
+                    active_indices.begin() + curr_batch_size,
+                    b_offset);
+            std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1);
+            std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0);
+
+            for (size_t idx = 0; idx < curr_batch_size; idx++) {
+                exact_distances[idx] = list_init_dists[b_offset + idx];
+            }
 
-                for (int level_idx = 0; level_idx < n_levels; level_idx++) {
-                    int startIdx = level_idx * levels_size;
-                    if (startIdx < (int)d) {
-                        query_cum_norms[level_idx] = sqrt(suffixSums[startIdx]);
-                    } else {
-                        query_cum_norms[level_idx] = 0.0f;
+            const uint8_t* batch_codes = col_codes + b_offset * code_size;
+            const float* batch_cums =
+                    list_cum_sums + b_offset * (n_levels + 1);
+
+            size_t next_num_active = curr_batch_size;
+            float dis0 = 0;
+            size_t batch_offset = batch_no * bs;
+
+            for (int level = 0;
+                 level < n_levels && next_num_active > 0;
+                 level++) {
+                // Offset of this level's slice in the LUT. The whole LUT is
+                // built once per list (level 0 of batch 0) and then reused.
+                size_t level_sim_offset = level * pq.ksub * cs;
+
+                if (level == 0 && batch_no == 0) {
+                    // Build LUT: sim_table = -2 * sim_table_2. dis0 is only the
+                    // coarse distance; per-code terms are in list_init_dists.
+                    dis0_cache = coarse_dis;
+                    const size_t n = pq.M * pq.ksub;
+                    for (size_t i = 0; i < n; i++) {
+                        sim_table_cache[i] = -2.0f * sim_table_2[i];
                     }
                 }
-                query_cum_norms[n_levels] = 0.0f;
-
-                float* simi = distances + i * k;
-                idx_t* idxi = labels + i * k;
-
-                init_result(simi, idxi);
-
-                idx_t nscan = 0;
-
-                for (size_t list_no = 0; list_no < (size_t)nprobe; list_no++) {
-                    idx_t cluster_id = keys[i * nprobe + list_no];
-                    size_t list_size = invlists->list_size(cluster_id);
-                    size_t n_batches =
-                            (list_size + batch_size - 1) / batch_size;
-
-                    std::unique_ptr<InvertedLists::ScopedIds> sids;
-                    const idx_t* ids =
-                            std::make_unique<InvertedLists::ScopedIds>(
-                                    invlists, cluster_id)
-                                    ->get();
-
-                    for (size_t batch_no = 0; batch_no < n_batches;
-                         batch_no++) {
-                        size_t curr_batch_size = std::min(
-                                list_size - batch_no * batch_size, batch_size);
-                        size_t b_offset = batch_no * batch_size;
-
-                        std::iota(
-                                active_indices.begin(),
-                                active_indices.begin() + curr_batch_size,
-                                b_offset);
-                        std::fill(
-                                bitset.begin(),
-                                bitset.begin() + curr_batch_size,
-                                1);
-                        std::fill(
-                                bitset.begin() + curr_batch_size,
-                                bitset.end(),
-                                0);
-
-                        for (size_t idx = 0; idx < curr_batch_size; idx++) {
-                            exact_distances[idx] = init_exact_distances
-                                    [init_exact_distances_offsets[cluster_id] +
-                                     b_offset + idx];
+                dis0 = dis0_cache;
+
+                float query_cum_norm =
+                        2 * query_cum_norms[level + 1];
+                float heap_max = distances[0];
+
+                const float* cum_sums_level =
+                        batch_cums + curr_batch_size * level;
+                const uint8_t* codes_level =
+                        batch_codes + bs * cs * level;
+
+                float* sim_table_level =
+                        sim_table_cache.data() + level_sim_offset;
+
+                bool is_sparse = next_num_active < bs / 16;
+
+                size_t num_active_for_filtering = 0;
+                if (is_sparse) {
+                    // Sparse path: use active_indices for indirection.
+                    for (size_t ci = 0; ci < cs; ci++) {
+                        size_t chunk_off = ci * bs;
+                        float* chunk_sim = sim_table_level + ci * pq.ksub;
+                        for (size_t i = 0; i < next_num_active; i++) {
+                            size_t real_idx =
+                                    active_indices[i] - batch_offset;
+                            exact_distances[i] +=
+                                    chunk_sim[codes_level[chunk_off + real_idx]];
                         }
-
-                        const uint8_t* codes = column_storage +
-                                column_offsets[cluster_id] +
-                                b_offset * code_size;
-                        float* cums = cum_sums + cum_sum_offsets[cluster_id] +
-                                b_offset * (n_levels + 1);
-
-                        total_points += curr_batch_size * n_levels;
-
-                        total_active += scanner->process_batch(
-                                pq,
-                                compressed_codes.data(),
-                                cluster_id,
-                                batch_no,
-                                coarse_dis[i * nprobe + list_no],
-                                curr_batch_size,
-                                batch_size,
-                                chunk_size,
-                                epsilon,
-                                n_levels,
-                                codes,
-                                cums,
-                                query_cum_norms.data(),
-                                active_indices.data(),
-                                bitset.data(),
-                                exact_distances.data(),
-                                ids,
-                                simi,
-                                idxi,
-                                k,
-                                &dis0s_cache[list_no],
-                                sim_table_cache.data() +
-                                        list_no * sim_table_size);
                     }
+                    num_active_for_filtering = next_num_active;
+                } else {
+                    auto [cc, na] =
+                            panorama_kernels::process_code_compression(
+                                    next_num_active,
+                                    bs,
+                                    cs,
+                                    compressed_codes.data(),
+                                    bitset.data(),
+                                    codes_level);
+
+                    panorama_kernels::process_chunks(
+                            cs, bs, na, sim_table_level, cc,
+                            exact_distances.data());
+                    num_active_for_filtering = na;
                 }
 
-                reorder_result(simi, idxi);
+                next_num_active = panorama_kernels::process_filtering(
+                        num_active_for_filtering,
+                        exact_distances.data(),
+                        active_indices.data(),
+                        const_cast<float*>(cum_sums_level),
+                        bitset.data(),
+                        batch_offset,
+                        dis0,
+                        query_cum_norm,
+                        epsilon,
+                        heap_max);
+            }
 
-                if (InterruptCallback::is_interrupted()) {
-                    interrupt = true;
+            // Insert surviving candidates into heap.
+            for (size_t i = 0; i < next_num_active; i++) {
+                float dis = dis0 + exact_distances[i];
+                if (C::cmp(distances[0], dis)) {
+                    idx_t id = store_pairs
+                            ? lo_build(list_no, active_indices[i])
+                            : ids[active_indices[i]];
+                    heap_replace_top<C>(k, distances, labels, dis, id);
+                    nup++;
                 }
             }
         }
+
+        return nup;
     }
 
-    if (interrupt) {
-        if (!exception_string.empty()) {
-            FAISS_THROW_FMT(
-                    "search interrupted with: %s", exception_string.c_str());
-        } else {
-            FAISS_THROW_MSG("computation interrupted");
-        }
+    size_t scan_codes( // ResultHandler overload; deliberately unsupported
+            size_t n,
+            const uint8_t* codes,
+            const idx_t* ids,
+            ResultHandler& handler) const override {
+        FAISS_THROW_MSG( // no scanning happens: all arguments are ignored
+                "IndexIVFPQPanorama: ResultHandler scan_codes not supported");
     }
+};
 
-    printf("vv: total_active: %f\n", (float)total_active / total_points);
+} // anonymous namespace
+
+/*****************************************
+ * get_InvertedListScanner: validate the index setup, then build a scanner
+ ******************************************/
+
+InvertedListScanner* IndexIVFPQPanorama::get_InvertedListScanner(
+        bool store_pairs,
+        const IDSelector* sel,
+        const IVFSearchParameters*) const { // unnamed: not read here
+    FAISS_THROW_IF_NOT_MSG( // each check below rejects an unsupported setup
+            metric_type == METRIC_L2, "only L2 metric supported");
+    FAISS_THROW_IF_NOT_MSG(
+            use_precomputed_table == 1,
+            "Panorama PQ requires use_precomputed_table == 1");
+    FAISS_THROW_IF_NOT_MSG(
+            pq.nbits == 8, "only 8-bit PQ codes supported");
+    FAISS_THROW_IF_NOT_MSG(
+            by_residual, "Panorama PQ requires by_residual");
+    FAISS_THROW_IF_NOT_MSG(
+            polysemous_ht == 0,
+            "Panorama PQ does not support polysemous");
+
+    if (sel) { // bool template argument tracks whether sel was given
+        return new IVFPQScannerPanorama<CMax<float, idx_t>, true>(
+                *this, store_pairs, sel);
+    } else {
+        return new IVFPQScannerPanorama<CMax<float, idx_t>, false>(
+                *this, store_pairs, sel);
+    }
 }
 
 } // namespace faiss
diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h
index 46a19e6b09..9fa3d34e7a 100644
--- a/faiss/IndexIVFPQPanorama.h
+++ b/faiss/IndexIVFPQPanorama.h
@@ -1,34 +1,75 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
 #ifndef FAISS_INDEX_IVFPQ_PANORAMA_H
 #define FAISS_INDEX_IVFPQ_PANORAMA_H
 
 #include <vector>
 
 #include <faiss/IndexIVFPQ.h>
-#include <faiss/impl/platform_macros.h>
-#include <faiss/utils/AlignedTable.h>
 
 namespace faiss {
 
+/// Panorama adaptation of IndexIVFPQ following
+/// https://www.arxiv.org/pdf/2510.00566.
+///
+/// IDEA:
+/// Panorama adapts the storage layout within each cluster and uses
+/// Cauchy-Schwarz pruning to skip unnecessary distance computations.
+/// Combined with orthogonal transforms upstream that concentrate signal
+/// energy in the early PQ subquantizers (like PCA), Panorama can prune
+/// the majority of candidates after computing only a fraction of the
+/// full PQ distance.
+///
+/// STORAGE LAYOUT:
+/// Standard IVFPQ stores codes row-major: [point0_code, point1_code, ...].
+/// Panorama transposes codes into column-major within each batch:
+/// for each batch of `batch_size` points, codes are stored as
+/// M columns of `batch_size` bytes each. The M columns are grouped
+/// into `n_levels` levels of `chunk_size` columns, enabling incremental
+/// distance computation level-by-level.
+///
+/// OVERHEAD:
+/// Panorama precomputes per-point cumulative residual norms and initial
+/// exact distances at insertion time. Storage overhead is
+/// (n_levels + 1) floats per point for cum_sums, plus 1 float per
+/// point for init_exact_distances.
+///
+/// CONSTRAINTS:
+/// - Only L2 metric is supported.
+/// - Only 8-bit PQ codes (nbits_per_idx == 8).
+/// - M must be divisible by n_levels.
+/// - batch_size must be a multiple of 64.
+/// - use_precomputed_table must be 1, by_residual set, polysemous_ht == 0.
+///
+/// NOTE:
+/// We inherit from IndexIVFPQ and override only get_InvertedListScanner()
+/// and add(). The base IndexIVF::search_preassigned() handles all search
+/// orchestration — no search code is duplicated.
 struct IndexIVFPQPanorama : public IndexIVFPQ {
-    const int n_levels;
-    uint8_t* column_storage;
+    int n_levels;
+    float epsilon;
+    size_t batch_size;
 
-    size_t* column_offsets;
-    float* cum_sums;
-    size_t* cum_sum_offsets;
+    size_t chunk_size;
+    size_t levels_size;
+    size_t m_level_width;
 
-    float* init_exact_distances;
-    size_t* init_exact_distances_offsets;
+    bool added = false;
+    size_t num_points = 0;
 
-    const size_t chunk_size;
-    const size_t levels_size;
-    bool added;
-    size_t num_points;
-    size_t batch_size;
-    size_t nbits_per_idx;
-    size_t m_level_width;
+    uint8_t* column_storage = nullptr;
+    size_t* column_offsets = nullptr;
 
-    float epsilon;
+    float* cum_sums = nullptr;
+    size_t* cum_sum_offsets = nullptr;
+
+    float* init_exact_distances = nullptr;
+    size_t* init_exact_distances_offsets = nullptr;
 
     IndexIVFPQPanorama(
             Index* quantizer,
@@ -42,27 +83,14 @@ struct IndexIVFPQPanorama : public IndexIVFPQ {
             MetricType metric = METRIC_L2,
             bool own_invlists = true);
 
+    IndexIVFPQPanorama() = default;
+
     void add(idx_t n, const float* x) override;
 
-    void search(
-            idx_t n,
-            const float* x,
-            idx_t k,
-            float* distances,
-            idx_t* labels,
-            const SearchParameters* params_in) const;
-
-    void search_preassigned(
-            idx_t n,
-            const float* x,
-            idx_t k,
-            const idx_t* keys,
-            const float* coarse_dis,
-            float* distances,
-            idx_t* labels,
+    InvertedListScanner* get_InvertedListScanner(
             bool store_pairs,
-            const IVFSearchParameters* params,
-            IndexIVFStats* ivf_stats) const override;
+            const IDSelector* sel,
+            const IVFSearchParameters* params) const override;
 };
 
 } // namespace faiss

From 3f51144f5a071c3c647f3dee49c00f8e5e1ba58c Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Wed, 18 Mar 2026 05:43:54 +0000
Subject: [PATCH 07/41] Bench fixed

---
 benchs/bench_ivfpq_panorama.py | 87 +++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 38 deletions(-)

diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py
index 7c965d54fe..fcb40b466e 100644
--- a/benchs/bench_ivfpq_panorama.py
+++ b/benchs/bench_ivfpq_panorama.py
@@ -22,6 +22,7 @@ def fvecs_read(fname):
 
 IVFPQ_CACHE = os.path.join(CACHE_DIR, "ivfpq_10pct.index")
 IVFPQ_TRAINED_CACHE = os.path.join(CACHE_DIR, "ivfpq_trained_10pct.index")
+IVFPQ_PANO_CACHE = os.path.join(CACHE_DIR, "ivfpq_pano_10pct.index")
 
 print("Loading GIST1M data (10% subset)...", flush=True)
 xb_full = fvecs_read(os.path.join(GIST_DIR, "gist1m_base.fvecs"))
@@ -106,49 +107,59 @@ def eval_recall(index, nprobe_val):
     ivfpq.nprobe = nprobe
     eval_recall(ivfpq, nprobe)
 
-# --- IVFPQPanorama (reuse trained PQ from cache) ---
+# --- IVFPQPanorama (cached separately) ---
 faiss.omp_set_num_threads(mp.cpu_count())
 
-
-def build_panorama_from_trained(trained_index):
-    quantizer2 = trained_index.quantizer
-    trained_index.own_fields = False
-
-    pano = faiss.IndexIVFPQPanorama(
-        quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size
-    )
-    centroids = faiss.vector_to_array(trained_index.pq.centroids)
-    faiss.copy_array_to_vector(centroids, pano.pq.centroids)
-    pano.is_trained = True
-    pano.use_precomputed_table = 1
-    pano.precompute_table()
-    return pano
-
-
-if os.path.exists(IVFPQ_TRAINED_CACHE):
-    print(f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...", flush=True)
-    trained = faiss.read_index(IVFPQ_TRAINED_CACHE)
-    ivfpq_pano = build_panorama_from_trained(trained)
-    print("  Reused trained PQ (skipped training).", flush=True)
-else:
-    print(
-        f"\nTraining IVFPQ for Panorama from scratch: nlist={nlist}, M={M}, nbits={nbits}",
-        flush=True,
-    )
-    quantizer2 = faiss.IndexFlatL2(d)
-    trained = faiss.IndexIVFPQ(quantizer2, d, nlist, M, nbits)
+if os.path.exists(IVFPQ_PANO_CACHE):
+    print(f"\nLoading cached IVFPQPanorama from {IVFPQ_PANO_CACHE}...", flush=True)
     t0 = time.time()
-    trained.train(xt)
-    print(f"  Training took {time.time() - t0:.1f}s", flush=True)
-
-    print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
-    faiss.write_index(trained, IVFPQ_TRAINED_CACHE)
+    ivfpq_pano = faiss.read_index(IVFPQ_PANO_CACHE)
+    print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
+else:
+    def build_panorama_from_trained(trained_index):
+        quantizer2 = trained_index.quantizer
+        trained_index.own_fields = False
+
+        pano = faiss.IndexIVFPQPanorama(
+            quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size
+        )
+        centroids = faiss.vector_to_array(trained_index.pq.centroids)
+        faiss.copy_array_to_vector(centroids, pano.pq.centroids)
+        pano.is_trained = True
+        pano.use_precomputed_table = 1
+        pano.precompute_table()
+        return pano
+
+    if os.path.exists(IVFPQ_TRAINED_CACHE):
+        print(
+            f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...",
+            flush=True,
+        )
+        trained = faiss.read_index(IVFPQ_TRAINED_CACHE)
+        ivfpq_pano = build_panorama_from_trained(trained)
+        print("  Reused trained PQ (skipped training).", flush=True)
+    else:
+        print(
+            f"\nTraining IVFPQ for Panorama from scratch: nlist={nlist}, M={M}, nbits={nbits}",
+            flush=True,
+        )
+        quantizer2 = faiss.IndexFlatL2(d)
+        trained = faiss.IndexIVFPQ(quantizer2, d, nlist, M, nbits)
+        t0 = time.time()
+        trained.train(xt)
+        print(f"  Training took {time.time() - t0:.1f}s", flush=True)
+
+        print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
+        faiss.write_index(trained, IVFPQ_TRAINED_CACHE)
+
+        ivfpq_pano = build_panorama_from_trained(trained)
 
-    ivfpq_pano = build_panorama_from_trained(trained)
+    t0 = time.time()
+    ivfpq_pano.add(xb)
+    print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
 
-t0 = time.time()
-ivfpq_pano.add(xb)
-print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
+    print(f"  Saving IVFPQPanorama to {IVFPQ_PANO_CACHE}...", flush=True)
+    faiss.write_index(ivfpq_pano, IVFPQ_PANO_CACHE)
 
 faiss.omp_set_num_threads(1)
 print("\n====== IVFPQPanorama", flush=True)

From bb46ee3edbbdb86a65810827211c58eff4a52b02 Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Wed, 18 Mar 2026 05:49:53 +0000
Subject: [PATCH 08/41] Remove epsilon

---
 benchs/bench_ivfpq_panorama.py                | 55 +++++++++----------
 faiss/IndexIVFPQPanorama.cpp                  |  4 --
 faiss/IndexIVFPQPanorama.h                    |  2 -
 .../panorama_kernels-avx2.cpp                 |  3 +-
 .../panorama_kernels-avx512.cpp               |  3 +-
 .../panorama_kernels-generic.cpp              |  3 +-
 .../impl/panorama_kernels/panorama_kernels.h  |  1 -
 7 files changed, 30 insertions(+), 41 deletions(-)

diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py
index fcb40b466e..615ffe01b5 100644
--- a/benchs/bench_ivfpq_panorama.py
+++ b/benchs/bench_ivfpq_panorama.py
@@ -43,7 +43,6 @@ def fvecs_read(fname):
 nbits = 8
 nlist = 64
 n_levels = 8
-epsilon = 1.0
 batch_size = 128
 
 GT_PATH = os.path.join(CACHE_DIR, "gt_10pct.npy")
@@ -75,37 +74,37 @@ def eval_recall(index, nprobe_val):
     return recall, qps
 
 
-faiss.omp_set_num_threads(mp.cpu_count())
+# faiss.omp_set_num_threads(mp.cpu_count())
 
-# --- IVFPQ baseline (cached) ---
-if os.path.exists(IVFPQ_CACHE):
-    print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True)
-    t0 = time.time()
-    ivfpq = faiss.read_index(IVFPQ_CACHE)
-    print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
-else:
-    print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True)
-    quantizer = faiss.IndexFlatL2(d)
-    ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
-    t0 = time.time()
-    ivfpq.train(xt)
-    print(f"  Training took {time.time() - t0:.1f}s", flush=True)
+# # --- IVFPQ baseline (cached) ---
+# if os.path.exists(IVFPQ_CACHE):
+#     print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True)
+#     t0 = time.time()
+#     ivfpq = faiss.read_index(IVFPQ_CACHE)
+#     print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
+# else:
+#     print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True)
+#     quantizer = faiss.IndexFlatL2(d)
+#     ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
+#     t0 = time.time()
+#     ivfpq.train(xt)
+#     print(f"  Training took {time.time() - t0:.1f}s", flush=True)
 
-    print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
-    faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE)
+#     print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
+#     faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE)
 
-    t0 = time.time()
-    ivfpq.add(xb)
-    print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
+#     t0 = time.time()
+#     ivfpq.add(xb)
+#     print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
 
-    print(f"  Saving full index to {IVFPQ_CACHE}...", flush=True)
-    faiss.write_index(ivfpq, IVFPQ_CACHE)
+#     print(f"  Saving full index to {IVFPQ_CACHE}...", flush=True)
+#     faiss.write_index(ivfpq, IVFPQ_CACHE)
 
-faiss.omp_set_num_threads(1)
-print("\n====== IVFPQ baseline", flush=True)
-for nprobe in [1, 2, 4, 8, 16]:
-    ivfpq.nprobe = nprobe
-    eval_recall(ivfpq, nprobe)
+# faiss.omp_set_num_threads(1)
+# print("\n====== IVFPQ baseline", flush=True)
+# for nprobe in [1, 2, 4, 8, 16]:
+#     ivfpq.nprobe = nprobe
+#     eval_recall(ivfpq, nprobe)
 
 # --- IVFPQPanorama (cached separately) ---
 faiss.omp_set_num_threads(mp.cpu_count())
@@ -121,7 +120,7 @@ def build_panorama_from_trained(trained_index):
         trained_index.own_fields = False
 
         pano = faiss.IndexIVFPQPanorama(
-            quantizer2, d, nlist, M, nbits, n_levels, epsilon, batch_size
+            quantizer2, d, nlist, M, nbits, n_levels, batch_size
         )
         centroids = faiss.vector_to_array(trained_index.pq.centroids)
         faiss.copy_array_to_vector(centroids, pano.pq.centroids)
diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp
index 8ac3df210a..3104820e19 100644
--- a/faiss/IndexIVFPQPanorama.cpp
+++ b/faiss/IndexIVFPQPanorama.cpp
@@ -30,7 +30,6 @@ IndexIVFPQPanorama::IndexIVFPQPanorama(
         size_t M,
         size_t nbits_per_idx,
         int n_levels,
-        float epsilon,
         size_t batch_size,
         MetricType metric,
         bool own_invlists)
@@ -43,7 +42,6 @@ IndexIVFPQPanorama::IndexIVFPQPanorama(
                   metric,
                   own_invlists),
           n_levels(n_levels),
-          epsilon(epsilon),
           batch_size(batch_size),
           chunk_size(code_size / n_levels),
           levels_size(d / n_levels),
@@ -242,7 +240,6 @@ struct IVFPQScannerPanorama : InvertedListScanner {
         const size_t bs = index.batch_size;
         const size_t cs = index.chunk_size;
         const int n_levels = index.n_levels;
-        const float epsilon = index.epsilon;
 
         const size_t n_batches = (list_size + bs - 1) / bs;
         const size_t sim_table_size = pq.ksub * pq.M;
@@ -360,7 +357,6 @@ struct IVFPQScannerPanorama : InvertedListScanner {
                         batch_offset,
                         dis0,
                         query_cum_norm,
-                        epsilon,
                         heap_max);
             }
 
diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h
index 9fa3d34e7a..aaee470e25 100644
--- a/faiss/IndexIVFPQPanorama.h
+++ b/faiss/IndexIVFPQPanorama.h
@@ -52,7 +52,6 @@ namespace faiss {
 /// orchestration — no search code is duplicated.
 struct IndexIVFPQPanorama : public IndexIVFPQ {
     int n_levels;
-    float epsilon;
     size_t batch_size;
 
     size_t chunk_size;
@@ -78,7 +77,6 @@ struct IndexIVFPQPanorama : public IndexIVFPQ {
             size_t M,
             size_t nbits_per_idx,
             int n_levels,
-            float epsilon,
             size_t batch_size = 128,
             MetricType metric = METRIC_L2,
             bool own_invlists = true);
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
index 235c5d4d78..46728b1cdd 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
@@ -122,14 +122,13 @@ size_t process_filtering(
         size_t batch_offset,
         float dis0,
         float query_cum_norm,
-        float epsilon,
         float heap_max) {
     size_t next_num_active = 0;
     for (size_t i = 0; i < num_active; i++) {
         float exact_distance = exact_distances[i];
         float cum_sum = cum_sums[active_indices[i] - batch_offset];
         float lower_bound =
-                exact_distance + dis0 - cum_sum * query_cum_norm * epsilon;
+                exact_distance + dis0 - cum_sum * query_cum_norm;
 
         bool keep = heap_max > lower_bound;
         active_indices[next_num_active] = active_indices[i];
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
index 6c6f0f24db..7733d5a6da 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
@@ -119,14 +119,13 @@ size_t process_filtering(
         size_t batch_offset,
         float dis0,
         float query_cum_norm,
-        float epsilon,
         float heap_max) {
     size_t next_num_active = 0;
     for (size_t i = 0; i < num_active; i++) {
         float exact_distance = exact_distances[i];
         float cum_sum = cum_sums[active_indices[i] - batch_offset];
         float lower_bound =
-                exact_distance + dis0 - cum_sum * query_cum_norm * epsilon;
+                exact_distance + dis0 - cum_sum * query_cum_norm;
 
         bool keep = heap_max > lower_bound;
         active_indices[next_num_active] = active_indices[i];
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
index ab9f7acb57..cfd1283c80 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
@@ -47,14 +47,13 @@ size_t process_filtering(
         size_t batch_offset,
         float dis0,
         float query_cum_norm,
-        float epsilon,
         float heap_max) {
     size_t next_num_active = 0;
     for (size_t i = 0; i < num_active; i++) {
         float exact_distance = exact_distances[i];
         float cum_sum = cum_sums[active_indices[i] - batch_offset];
         float lower_bound =
-                exact_distance + dis0 - cum_sum * query_cum_norm * epsilon;
+                exact_distance + dis0 - cum_sum * query_cum_norm;
 
         bool keep = heap_max > lower_bound;
         active_indices[next_num_active] = active_indices[i];
diff --git a/faiss/impl/panorama_kernels/panorama_kernels.h b/faiss/impl/panorama_kernels/panorama_kernels.h
index 6c8d007ddd..aed8a87660 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels.h
+++ b/faiss/impl/panorama_kernels/panorama_kernels.h
@@ -61,7 +61,6 @@ size_t process_filtering(
         size_t batch_offset,
         float dis0,
         float query_cum_norm,
-        float epsilon,
         float heap_max);
 
 /// Byte-level stream compaction of PQ codes using the active bitset.

From f21aac14df599b1eb983c66e604461dc92636c49 Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Wed, 18 Mar 2026 06:35:40 +0000
Subject: [PATCH 09/41] Fix the LUT

---
 faiss/IndexIVFPQPanorama.cpp | 29 ++++++++++-------------------
 1 file changed, 10 insertions(+), 19 deletions(-)

diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp
index 3104820e19..01f5b0a7c5 100644
--- a/faiss/IndexIVFPQPanorama.cpp
+++ b/faiss/IndexIVFPQPanorama.cpp
@@ -205,6 +205,13 @@ struct IVFPQScannerPanorama : InvertedListScanner {
 
         pq.compute_inner_prod_table(qi, sim_table_2.data());
 
+        // The PQ distance LUT is -2 * inner_prod_table; apply in-place
+        // so scan_codes() can use sim_table_2 directly.
+        const size_t n = pq.M * pq.ksub;
+        for (size_t i = 0; i < n; i++) {
+            sim_table_2[i] *= -2.0f;
+        }
+
         // Compute query suffix sums → cum norms per level.
         std::vector<float> suffix(index.d + 1, 0.0f);
         for (int j = index.d - 1; j >= 0; j--) {
@@ -242,8 +249,6 @@ struct IVFPQScannerPanorama : InvertedListScanner {
         const int n_levels = index.n_levels;
 
         const size_t n_batches = (list_size + bs - 1) / bs;
-        const size_t sim_table_size = pq.ksub * pq.M;
-
         // Panorama column-major codes for this list.
         const uint8_t* col_codes =
                 index.column_storage + index.column_offsets[list_no];
@@ -258,8 +263,7 @@ struct IVFPQScannerPanorama : InvertedListScanner {
         std::vector<uint8_t> bitset(bs);
         std::vector<uint32_t> active_indices(bs);
         std::vector<uint8_t> compressed_codes(bs * cs);
-        std::vector<float> sim_table_cache(sim_table_size);
-        float dis0_cache = 0;
+        float dis0 = coarse_dis;
 
         for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
             size_t curr_batch_size =
@@ -283,27 +287,13 @@ struct IVFPQScannerPanorama : InvertedListScanner {
                     list_cum_sums + b_offset * (n_levels + 1);
 
             size_t next_num_active = curr_batch_size;
-            float dis0 = 0;
             size_t batch_offset = batch_no * bs;
 
             for (int level = 0;
                  level < n_levels && next_num_active > 0;
                  level++) {
-                // Compute sim table for this level (cached across batches
-                // within same list, only for first batch).
                 size_t level_sim_offset = level * pq.ksub * cs;
 
-                if (level == 0 && batch_no == 0) {
-                    // Precompute LUT: sim_table = -2 * sim_table_2
-                    // (the precomputed_table term is added via dis0).
-                    dis0_cache = coarse_dis;
-                    const size_t n = pq.M * pq.ksub;
-                    for (size_t i = 0; i < n; i++) {
-                        sim_table_cache[i] = -2.0f * sim_table_2[i];
-                    }
-                }
-                dis0 = dis0_cache;
-
                 float query_cum_norm =
                         2 * query_cum_norms[level + 1];
                 float heap_max = distances[0];
@@ -314,7 +304,8 @@ struct IVFPQScannerPanorama : InvertedListScanner {
                         batch_codes + bs * cs * level;
 
                 float* sim_table_level =
-                        sim_table_cache.data() + level_sim_offset;
+                        const_cast<float*>(sim_table_2.data()) +
+                        level_sim_offset;
 
                 bool is_sparse = next_num_active < bs / 16;
 

From 0a6966d2b2aba3de074911e298e2742ed9c841bf Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Wed, 18 Mar 2026 06:49:19 +0000
Subject: [PATCH 10/41] Use precomputed table and remove init exact dist

---
 faiss/IndexIVFPQPanorama.cpp | 123 +++++++++++++++++------------------
 faiss/IndexIVFPQPanorama.h   |  11 ++--
 2 files changed, 63 insertions(+), 71 deletions(-)

diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp
index 01f5b0a7c5..f9c6b6e7dd 100644
--- a/faiss/IndexIVFPQPanorama.cpp
+++ b/faiss/IndexIVFPQPanorama.cpp
@@ -46,10 +46,13 @@ IndexIVFPQPanorama::IndexIVFPQPanorama(
           chunk_size(code_size / n_levels),
           levels_size(d / n_levels),
           m_level_width(M / n_levels) {
-    FAISS_THROW_IF_NOT_MSG(M % n_levels == 0, "M must be divisible by n_levels");
-    FAISS_THROW_IF_NOT_MSG(batch_size % 64 == 0, "batch_size must be multiple of 64");
+    FAISS_THROW_IF_NOT_MSG(
+            M % n_levels == 0, "M must be divisible by n_levels");
+    FAISS_THROW_IF_NOT_MSG(
+            batch_size % 64 == 0, "batch_size must be multiple of 64");
     FAISS_THROW_IF_NOT_MSG(nbits_per_idx == 8, "only 8-bit PQ codes supported");
-    FAISS_THROW_IF_NOT_MSG(M == code_size, "M must equal code_size for 8-bit PQ");
+    FAISS_THROW_IF_NOT_MSG(
+            M == code_size, "M must equal code_size for 8-bit PQ");
     FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only L2 metric supported");
 }
 
@@ -58,7 +61,8 @@ IndexIVFPQPanorama::IndexIVFPQPanorama(
  ******************************************/
 
 void IndexIVFPQPanorama::add(idx_t n, const float* x) {
-    FAISS_THROW_IF_NOT_MSG(!added, "IndexIVFPQPanorama only supports a single add() call");
+    FAISS_THROW_IF_NOT_MSG(
+            !added, "IndexIVFPQPanorama only supports a single add() call");
     added = true;
     num_points = n;
 
@@ -89,43 +93,39 @@ void IndexIVFPQPanorama::add(idx_t n, const float* x) {
                     std::min(list_size - batch_no * batch_size, batch_size);
             for (size_t m = 0; m < pq.M; m++) {
                 for (size_t p = 0; p < curr_batch_size; p++) {
-                    column_storage[col_offset + batch_offset +
-                                   m * batch_size + p] =
-                            row_codes[batch_no * batch_size * code_size +
-                                      p * code_size + m];
+                    column_storage
+                            [col_offset + batch_offset + m * batch_size + p] =
+                                    row_codes
+                                            [batch_no * batch_size * code_size +
+                                             p * code_size + m];
                 }
             }
         }
     }
 
-    // Precompute cumulative residual norms and initial exact distances.
+    // Precompute cumulative residual norms (suffix sums of ||y_R||^2).
+    // init_exact_distances are computed on-the-fly during search using
+    // the precomputed_table, so we only need cum_sums here.
     cum_sum_offsets = new size_t[nlist];
-    init_exact_distances_offsets = new size_t[nlist];
 
     size_t cum_size = 0;
-    size_t init_size = 0;
     for (size_t list_no = 0; list_no < nlist; list_no++) {
         cum_sum_offsets[list_no] = cum_size;
         cum_size += invlists->list_size(list_no) * (n_levels + 1);
-        init_exact_distances_offsets[list_no] = init_size;
-        init_size += invlists->list_size(list_no);
     }
 
     cum_sums = new float[cum_size];
-    init_exact_distances = new float[init_size];
 
     for (size_t list_no = 0; list_no < nlist; list_no++) {
         size_t list_size = invlists->list_size(list_no);
-
-        std::vector<float> centroid(d);
-        quantizer->reconstruct(list_no, centroid.data());
+        if (list_size == 0)
+            continue;
 
         size_t n_batches = (list_size + batch_size - 1) / batch_size;
 
         for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
             size_t b_offset = batch_no * batch_size;
-            size_t curr_batch_size =
-                    std::min(list_size - b_offset, batch_size);
+            size_t curr_batch_size = std::min(list_size - b_offset, batch_size);
 
             for (size_t p = 0; p < curr_batch_size; p++) {
                 std::vector<float> vec(d);
@@ -133,10 +133,8 @@ void IndexIVFPQPanorama::add(idx_t n, const float* x) {
                         invlists->get_single_code(list_no, b_offset + p);
                 pq.decode(code, vec.data());
 
-                float init_dist = 0.0f;
                 std::vector<float> suffix(d + 1, 0.0f);
                 for (int j = d - 1; j >= 0; j--) {
-                    init_dist += vec[j] * vec[j] + 2 * vec[j] * centroid[j];
                     suffix[j] = suffix[j + 1] + vec[j] * vec[j];
                 }
 
@@ -151,13 +149,9 @@ void IndexIVFPQPanorama::add(idx_t n, const float* x) {
                 }
 
                 size_t last_offset = cum_sum_offsets[list_no] +
-                        b_offset * (n_levels + 1) +
-                        n_levels * curr_batch_size + p;
+                        b_offset * (n_levels + 1) + n_levels * curr_batch_size +
+                        p;
                 cum_sums[last_offset] = 0.0f;
-
-                init_exact_distances
-                        [init_exact_distances_offsets[list_no] + b_offset + p] =
-                                init_dist;
             }
         }
     }
@@ -231,8 +225,7 @@ struct IVFPQScannerPanorama : InvertedListScanner {
     }
 
     float distance_to_code(const uint8_t* code) const override {
-        FAISS_THROW_MSG(
-                "IndexIVFPQPanorama does not support distance_to_code");
+        FAISS_THROW_MSG("IndexIVFPQPanorama does not support distance_to_code");
     }
 
     size_t scan_codes(
@@ -254,9 +247,8 @@ struct IVFPQScannerPanorama : InvertedListScanner {
                 index.column_storage + index.column_offsets[list_no];
         const float* list_cum_sums =
                 index.cum_sums + index.cum_sum_offsets[list_no];
-        const float* list_init_dists =
-                index.init_exact_distances +
-                index.init_exact_distances_offsets[list_no];
+        const float* precomp =
+                index.precomputed_table.data() + list_no * pq.M * pq.ksub;
 
         // Scratch buffers.
         std::vector<float> exact_distances(bs);
@@ -266,8 +258,7 @@ struct IVFPQScannerPanorama : InvertedListScanner {
         float dis0 = coarse_dis;
 
         for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
-            size_t curr_batch_size =
-                    std::min(list_size - batch_no * bs, bs);
+            size_t curr_batch_size = std::min(list_size - batch_no * bs, bs);
             size_t b_offset = batch_no * bs;
 
             // Initialize active set.
@@ -278,30 +269,35 @@ struct IVFPQScannerPanorama : InvertedListScanner {
             std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1);
             std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0);
 
+            const uint8_t* batch_codes = col_codes + b_offset * code_size;
+
+            // Compute init_exact_distance on-the-fly from the
+            // precomputed table: sum_m(precomp[m * ksub + code[m]]).
+            // Codes are column-major: point p's code for subquantizer
+            // m is at batch_codes[m * bs + p].
             for (size_t idx = 0; idx < curr_batch_size; idx++) {
-                exact_distances[idx] = list_init_dists[b_offset + idx];
+                float init_dist = 0.0f;
+                for (size_t m = 0; m < pq.M; m++) {
+                    uint8_t code_val = batch_codes[m * bs + idx];
+                    init_dist += precomp[m * pq.ksub + code_val];
+                }
+                exact_distances[idx] = init_dist;
             }
-
-            const uint8_t* batch_codes = col_codes + b_offset * code_size;
-            const float* batch_cums =
-                    list_cum_sums + b_offset * (n_levels + 1);
+            const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1);
 
             size_t next_num_active = curr_batch_size;
             size_t batch_offset = batch_no * bs;
 
-            for (int level = 0;
-                 level < n_levels && next_num_active > 0;
+            for (int level = 0; level < n_levels && next_num_active > 0;
                  level++) {
                 size_t level_sim_offset = level * pq.ksub * cs;
 
-                float query_cum_norm =
-                        2 * query_cum_norms[level + 1];
+                float query_cum_norm = 2 * query_cum_norms[level + 1];
                 float heap_max = distances[0];
 
                 const float* cum_sums_level =
                         batch_cums + curr_batch_size * level;
-                const uint8_t* codes_level =
-                        batch_codes + bs * cs * level;
+                const uint8_t* codes_level = batch_codes + bs * cs * level;
 
                 float* sim_table_level =
                         const_cast<float*>(sim_table_2.data()) +
@@ -316,25 +312,27 @@ struct IVFPQScannerPanorama : InvertedListScanner {
                         size_t chunk_off = ci * bs;
                         float* chunk_sim = sim_table_level + ci * pq.ksub;
                         for (size_t i = 0; i < next_num_active; i++) {
-                            size_t real_idx =
-                                    active_indices[i] - batch_offset;
-                            exact_distances[i] +=
-                                    chunk_sim[codes_level[chunk_off + real_idx]];
+                            size_t real_idx = active_indices[i] - batch_offset;
+                            exact_distances[i] += chunk_sim
+                                    [codes_level[chunk_off + real_idx]];
                         }
                     }
                     num_active_for_filtering = next_num_active;
                 } else {
-                    auto [cc, na] =
-                            panorama_kernels::process_code_compression(
-                                    next_num_active,
-                                    bs,
-                                    cs,
-                                    compressed_codes.data(),
-                                    bitset.data(),
-                                    codes_level);
+                    auto [cc, na] = panorama_kernels::process_code_compression(
+                            next_num_active,
+                            bs,
+                            cs,
+                            compressed_codes.data(),
+                            bitset.data(),
+                            codes_level);
 
                     panorama_kernels::process_chunks(
-                            cs, bs, na, sim_table_level, cc,
+                            cs,
+                            bs,
+                            na,
+                            sim_table_level,
+                            cc,
                             exact_distances.data());
                     num_active_for_filtering = na;
                 }
@@ -392,13 +390,10 @@ InvertedListScanner* IndexIVFPQPanorama::get_InvertedListScanner(
     FAISS_THROW_IF_NOT_MSG(
             use_precomputed_table == 1,
             "Panorama PQ requires use_precomputed_table == 1");
+    FAISS_THROW_IF_NOT_MSG(pq.nbits == 8, "only 8-bit PQ codes supported");
+    FAISS_THROW_IF_NOT_MSG(by_residual, "Panorama PQ requires by_residual");
     FAISS_THROW_IF_NOT_MSG(
-            pq.nbits == 8, "only 8-bit PQ codes supported");
-    FAISS_THROW_IF_NOT_MSG(
-            by_residual, "Panorama PQ requires by_residual");
-    FAISS_THROW_IF_NOT_MSG(
-            polysemous_ht == 0,
-            "Panorama PQ does not support polysemous");
+            polysemous_ht == 0, "Panorama PQ does not support polysemous");
 
     if (sel) {
         return new IVFPQScannerPanorama<CMax<float, idx_t>, true>(
diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h
index aaee470e25..a97107f54d 100644
--- a/faiss/IndexIVFPQPanorama.h
+++ b/faiss/IndexIVFPQPanorama.h
@@ -34,10 +34,10 @@ namespace faiss {
 /// distance computation level-by-level.
 ///
 /// OVERHEAD:
-/// Panorama precomputes per-point cumulative residual norms and initial
-/// exact distances at insertion time. Storage overhead is
-/// (n_levels + 1) floats per point for cum_sums, plus 1 float per
-/// point for init_exact_distances.
+/// Panorama precomputes per-point cumulative residual norms at insertion
+/// time. Storage overhead is (n_levels + 1) floats per point for
+/// cum_sums. Initial exact distances are computed on-the-fly during
+/// search using the precomputed_table (no extra per-point storage).
 ///
 /// CONSTRAINTS:
 /// - Only L2 metric is supported.
@@ -67,9 +67,6 @@ struct IndexIVFPQPanorama : public IndexIVFPQ {
     float* cum_sums = nullptr;
     size_t* cum_sum_offsets = nullptr;
 
-    float* init_exact_distances = nullptr;
-    size_t* init_exact_distances_offsets = nullptr;
-
     IndexIVFPQPanorama(
             Index* quantizer,
             size_t d,

From 4bf6785654abac00b3548633634dfbb16c06419c Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Thu, 19 Mar 2026 07:12:31 +0000
Subject: [PATCH 11/41] clean but slow

---
 benchs/bench_ivfpq_panorama.py   |  60 +++----
 faiss/CMakeLists.txt             |   2 +
 faiss/IndexFlat.cpp              |  12 +-
 faiss/IndexFlat.h                |   4 +-
 faiss/IndexHNSW.cpp              |   5 +-
 faiss/IndexHNSW.h                |   2 +-
 faiss/IndexIVFFlatPanorama.cpp   |  18 ++-
 faiss/IndexIVFPQPanorama.cpp     | 269 ++++++-------------------------
 faiss/IndexIVFPQPanorama.h       |  22 +--
 faiss/impl/HNSW.cpp              |   6 +-
 faiss/impl/Panorama.cpp          | 111 +++++++------
 faiss/impl/Panorama.h            |  64 ++++++--
 faiss/impl/index_read.cpp        |  40 ++++-
 faiss/impl/index_write.cpp       |  15 +-
 faiss/invlists/InvertedLists.cpp |  39 ++---
 faiss/invlists/InvertedLists.h   |  12 +-
 faiss/python/swigfaiss.swig      |   1 +
 17 files changed, 298 insertions(+), 384 deletions(-)

diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py
index 615ffe01b5..5ae4fc2152 100644
--- a/benchs/bench_ivfpq_panorama.py
+++ b/benchs/bench_ivfpq_panorama.py
@@ -59,6 +59,7 @@ def fvecs_read(fname):
 
 
 def eval_recall(index, nprobe_val):
+    faiss.cvar.indexPanorama_stats.reset()
     t0 = time.time()
     _, I = index.search(xq, k=k)
     t = time.time() - t0
@@ -66,47 +67,50 @@ def eval_recall(index, nprobe_val):
     qps = 1000 / speed
     corrects = sum(len(set(gt_I[i]) & set(I[i])) for i in range(nq))
     recall = corrects / (nq * k)
+    stats = faiss.cvar.indexPanorama_stats
+    pct_active = stats.ratio_dims_scanned * 100
     print(
         f"\tnprobe {nprobe_val:3d}, Recall@{k}: "
-        f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}",
+        f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}, "
+        f"active: {pct_active:.1f}%",
         flush=True,
     )
     return recall, qps
 
 
-# faiss.omp_set_num_threads(mp.cpu_count())
+faiss.omp_set_num_threads(mp.cpu_count())
 
-# # --- IVFPQ baseline (cached) ---
-# if os.path.exists(IVFPQ_CACHE):
-#     print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True)
-#     t0 = time.time()
-#     ivfpq = faiss.read_index(IVFPQ_CACHE)
-#     print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
-# else:
-#     print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True)
-#     quantizer = faiss.IndexFlatL2(d)
-#     ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
-#     t0 = time.time()
-#     ivfpq.train(xt)
-#     print(f"  Training took {time.time() - t0:.1f}s", flush=True)
+# --- IVFPQ baseline (cached) ---
+if os.path.exists(IVFPQ_CACHE):
+    print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True)
+    t0 = time.time()
+    ivfpq = faiss.read_index(IVFPQ_CACHE)
+    print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
+else:
+    print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True)
+    quantizer = faiss.IndexFlatL2(d)
+    ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
+    t0 = time.time()
+    ivfpq.train(xt)
+    print(f"  Training took {time.time() - t0:.1f}s", flush=True)
 
-#     print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
-#     faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE)
+    print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
+    faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE)
 
-#     t0 = time.time()
-#     ivfpq.add(xb)
-#     print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
+    t0 = time.time()
+    ivfpq.add(xb)
+    print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
 
-#     print(f"  Saving full index to {IVFPQ_CACHE}...", flush=True)
-#     faiss.write_index(ivfpq, IVFPQ_CACHE)
+    print(f"  Saving full index to {IVFPQ_CACHE}...", flush=True)
+    faiss.write_index(ivfpq, IVFPQ_CACHE)
 
-# faiss.omp_set_num_threads(1)
-# print("\n====== IVFPQ baseline", flush=True)
-# for nprobe in [1, 2, 4, 8, 16]:
-#     ivfpq.nprobe = nprobe
-#     eval_recall(ivfpq, nprobe)
+faiss.omp_set_num_threads(1)
+print("\n====== IVFPQ baseline", flush=True)
+for nprobe in [1, 2, 4, 8, 16]:
+    ivfpq.nprobe = nprobe
+    eval_recall(ivfpq, nprobe)
 
-# --- IVFPQPanorama (cached separately) ---
+# --- IVFPQPanorama (cached) ---
 faiss.omp_set_num_threads(mp.cpu_count())
 
 if os.path.exists(IVFPQ_PANO_CACHE):
diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
index 84a6eb1aac..5e752d9d3b 100644
--- a/faiss/CMakeLists.txt
+++ b/faiss/CMakeLists.txt
@@ -131,6 +131,7 @@ set(FAISS_SRC
   impl/zerocopy_io.cpp
   impl/NNDescent.cpp
   impl/Panorama.cpp
+  impl/PanoramaPQ.cpp
   impl/PanoramaStats.cpp
   invlists/BlockInvertedLists.cpp
   invlists/DirectMap.cpp
@@ -241,6 +242,7 @@ set(FAISS_HEADERS
   impl/NNDescent.h
   impl/NSG.h
   impl/Panorama.h
+  impl/PanoramaPQ.h
   impl/PanoramaStats.h
   impl/PolysemousTraining.h
   impl/ProductQuantizer-inl.h
diff --git a/faiss/IndexFlat.cpp b/faiss/IndexFlat.cpp
index 599d5b1e6c..27ed1090da 100644
--- a/faiss/IndexFlat.cpp
+++ b/faiss/IndexFlat.cpp
@@ -695,7 +695,7 @@ void IndexFlatPanorama::add(idx_t n, const float* x) {
 
     const uint8_t* code = reinterpret_cast<const uint8_t*>(x);
     pano.copy_codes_to_level_layout(codes.data(), offset, n, code);
-    pano.compute_cumulative_sums(cum_sums.data(), offset, n, x);
+    pano.compute_cumulative_sums(cum_sums.data(), offset, n, code);
 }
 
 void IndexFlatPanorama::search(
@@ -892,12 +892,12 @@ void IndexFlatPanorama::search_subset(
                         bool pruned = false;
                         for (size_t level = 0; level < n_levels; level++) {
                             local_stats.total_dims_scanned +=
-                                    pano.level_width_floats;
+                                    pano.level_width_dims;
 
                             // Refine distance
                             size_t actual_level_width = std::min(
-                                    pano.level_width_floats,
-                                    d - level * pano.level_width_floats);
+                                    pano.level_width_dims,
+                                    d - level * pano.level_width_dims);
                             float dot_product = fvec_inner_product<SL>(
                                     x_ptr, p_ptr, actual_level_width);
                             if constexpr (is_sim) {
@@ -930,8 +930,8 @@ void IndexFlatPanorama::search_subset(
                             }
 
                             cum_sum_offset++;
-                            x_ptr += pano.level_width_floats;
-                            p_ptr += pano.level_width_floats;
+                            x_ptr += pano.level_width_dims;
+                            p_ptr += pano.level_width_dims;
                         }
 
                         if (!pruned) {
diff --git a/faiss/IndexFlat.h b/faiss/IndexFlat.h
index f5870166ee..ccc0126d28 100644
--- a/faiss/IndexFlat.h
+++ b/faiss/IndexFlat.h
@@ -104,7 +104,7 @@ struct IndexFlatPanorama : IndexFlat {
     const size_t batch_size;
     const size_t n_levels;
     std::vector<float> cum_sums;
-    Panorama pano;
+    PanoramaFlat pano;
 
     /**
      * @param d dimensionality of the input vectors
@@ -120,7 +120,7 @@ struct IndexFlatPanorama : IndexFlat {
             : IndexFlat(d, metric),
               batch_size(batch_size),
               n_levels(n_levels),
-              pano(code_size, n_levels, batch_size) {
+              pano(d, n_levels, batch_size) {
         FAISS_THROW_IF_NOT(
                 metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT);
     }
diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index fb02433778..fadea613c9 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -672,7 +672,7 @@ IndexHNSWFlatPanorama::IndexHNSWFlatPanorama(
         MetricType metric)
         : IndexHNSWFlat(d, M, metric),
           cum_sums(),
-          pano(d * sizeof(float), num_panorama_levels, 1),
+          pano(d, num_panorama_levels, 1),
           num_panorama_levels(num_panorama_levels) {
     // For now, we only support L2 distance.
     // Supporting dot product and cosine distance is a trivial addition
@@ -688,7 +688,8 @@ IndexHNSWFlatPanorama::IndexHNSWFlatPanorama(
 void IndexHNSWFlatPanorama::add(idx_t n, const float* x) {
     idx_t n0 = ntotal;
     cum_sums.resize((ntotal + n) * (pano.n_levels + 1));
-    pano.compute_cumulative_sums(cum_sums.data(), n0, n, x);
+    pano.compute_cumulative_sums(
+            cum_sums.data(), n0, n, reinterpret_cast<const uint8_t*>(x));
     IndexHNSWFlat::add(n, x);
 }
 
diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h
index a43828d428..17d54ecaa2 100644
--- a/faiss/IndexHNSW.h
+++ b/faiss/IndexHNSW.h
@@ -179,7 +179,7 @@ struct IndexHNSWFlatPanorama : IndexHNSWFlat {
     }
 
     std::vector<float> cum_sums;
-    Panorama pano;
+    PanoramaFlat pano;
     const size_t num_panorama_levels;
 };
 
diff --git a/faiss/IndexIVFFlatPanorama.cpp b/faiss/IndexIVFFlatPanorama.cpp
index 01a548b412..5e678be28c 100644
--- a/faiss/IndexIVFFlatPanorama.cpp
+++ b/faiss/IndexIVFFlatPanorama.cpp
@@ -38,7 +38,9 @@ IndexIVFFlatPanorama::IndexIVFFlatPanorama(
     // We construct the inverted lists here so that we can use the
     // level-oriented storage. This does not cause a leak as we constructed
     // IndexIVF first, with own_invlists set to false.
-    this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, n_levels);
+    auto* pano = new PanoramaFlat(
+            d, n_levels, ArrayInvertedListsPanorama::kBatchSize);
+    this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano);
     this->own_invlists = own_invlists;
 }
 
@@ -50,6 +52,7 @@ template <typename VectorDistance, bool use_sel>
 struct IVFFlatScannerPanorama : InvertedListScanner {
     VectorDistance vd;
     const ArrayInvertedListsPanorama* storage;
+    const PanoramaFlat* pano_flat; // storage->pano viewed as PanoramaFlat
     using C = typename VectorDistance::C;
     static constexpr MetricType metric = VectorDistance::metric;
 
@@ -58,10 +61,15 @@ struct IVFFlatScannerPanorama : InvertedListScanner {
             const ArrayInvertedListsPanorama* storage,
             bool store_pairs,
             const IDSelector* sel)
-            : InvertedListScanner(store_pairs, sel), vd(vd), storage(storage) {
+            : InvertedListScanner(store_pairs, sel),
+              vd(vd),
+              storage(storage), // NOTE(review): dereferenced below unchecked
+              pano_flat(
+                      dynamic_cast<const PanoramaFlat*>(storage->pano.get())) {
+        FAISS_THROW_IF_NOT(pano_flat); // storage->pano must be PanoramaFlat
         keep_max = vd.is_similarity;
         code_size = vd.d * sizeof(float);
-        cum_sums.resize(storage->n_levels + 1);
+        cum_sums.resize(pano_flat->n_levels + 1);
     }
 
     const float* xi = nullptr;
@@ -69,7 +77,7 @@ struct IVFFlatScannerPanorama : InvertedListScanner {
     float q_norm = 0.0f;
     void set_query(const float* query) override {
         this->xi = query;
-        this->storage->pano.compute_query_cum_sums(query, cum_sums.data());
+        pano_flat->compute_query_cum_sums(query, cum_sums.data());
         q_norm = cum_sums[0] * cum_sums[0];
     }
 
@@ -107,7 +115,7 @@ struct IVFFlatScannerPanorama : InvertedListScanner {
             size_t batch_start = batch_no * storage->kBatchSize;
 
             size_t num_active = with_metric_type(metric, [&]<MetricType M>() {
-                return storage->pano.progressive_filter_batch<C, M>(
+                return pano_flat->progressive_filter_batch<C, M>(
                         codes,
                         cum_sums_data,
                         xi,
diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp
index f9c6b6e7dd..504958b544 100644
--- a/faiss/IndexIVFPQPanorama.cpp
+++ b/faiss/IndexIVFPQPanorama.cpp
@@ -14,7 +14,9 @@
 #include <numeric>
 
 #include <faiss/impl/FaissAssert.h>
-#include <faiss/impl/panorama_kernels/panorama_kernels.h>
+#include <faiss/impl/PanoramaPQ.h>
+#include <faiss/impl/PanoramaStats.h>
+#include <faiss/invlists/InvertedLists.h>
 #include <faiss/utils/Heap.h>
 
 namespace faiss {
@@ -33,19 +35,11 @@ IndexIVFPQPanorama::IndexIVFPQPanorama(
         size_t batch_size,
         MetricType metric,
         bool own_invlists)
-        : IndexIVFPQ(
-                  quantizer,
-                  d,
-                  nlist,
-                  M,
-                  nbits_per_idx,
-                  metric,
-                  own_invlists),
+        : IndexIVFPQ(quantizer, d, nlist, M, nbits_per_idx, metric, false),
           n_levels(n_levels),
           batch_size(batch_size),
           chunk_size(code_size / n_levels),
-          levels_size(d / n_levels),
-          m_level_width(M / n_levels) {
+          levels_size(d / n_levels) {
     FAISS_THROW_IF_NOT_MSG(
             M % n_levels == 0, "M must be divisible by n_levels");
     FAISS_THROW_IF_NOT_MSG(
@@ -54,107 +48,10 @@ IndexIVFPQPanorama::IndexIVFPQPanorama(
     FAISS_THROW_IF_NOT_MSG(
             M == code_size, "M must equal code_size for 8-bit PQ");
     FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only L2 metric supported");
-}
-
-/*****************************************
- * add — transpose codes into column-major layout and precompute norms
- ******************************************/
-
-void IndexIVFPQPanorama::add(idx_t n, const float* x) {
-    FAISS_THROW_IF_NOT_MSG(
-            !added, "IndexIVFPQPanorama only supports a single add() call");
-    added = true;
-    num_points = n;
-
-    IndexIVFPQ::add(n, x);
-
-    // Compute column offsets (each list rounded up to batch_size).
-    size_t total_column_bytes = 0;
-    column_offsets = new size_t[nlist];
-    for (size_t i = 0; i < nlist; i++) {
-        column_offsets[i] = total_column_bytes;
-        size_t n_batches =
-                (invlists->list_size(i) + batch_size - 1) / batch_size;
-        total_column_bytes += n_batches * batch_size * code_size;
-    }
-
-    // Transpose codes from row-major [point0_code, point1_code, ...] into
-    // column-major within each batch: M columns of batch_size bytes each.
-    column_storage = new uint8_t[total_column_bytes]();
-    for (size_t list_no = 0; list_no < nlist; list_no++) {
-        size_t col_offset = column_offsets[list_no];
-        size_t list_size = invlists->list_size(list_no);
-        size_t n_batches = (list_size + batch_size - 1) / batch_size;
-        const uint8_t* row_codes = invlists->get_codes(list_no);
-
-        for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
-            size_t batch_offset = batch_no * batch_size * code_size;
-            size_t curr_batch_size =
-                    std::min(list_size - batch_no * batch_size, batch_size);
-            for (size_t m = 0; m < pq.M; m++) {
-                for (size_t p = 0; p < curr_batch_size; p++) {
-                    column_storage
-                            [col_offset + batch_offset + m * batch_size + p] =
-                                    row_codes
-                                            [batch_no * batch_size * code_size +
-                                             p * code_size + m];
-                }
-            }
-        }
-    }
-
-    // Precompute cumulative residual norms (suffix sums of ||y_R||^2).
-    // init_exact_distances are computed on-the-fly during search using
-    // the precomputed_table, so we only need cum_sums here.
-    cum_sum_offsets = new size_t[nlist];
-
-    size_t cum_size = 0;
-    for (size_t list_no = 0; list_no < nlist; list_no++) {
-        cum_sum_offsets[list_no] = cum_size;
-        cum_size += invlists->list_size(list_no) * (n_levels + 1);
-    }
-
-    cum_sums = new float[cum_size];
-
-    for (size_t list_no = 0; list_no < nlist; list_no++) {
-        size_t list_size = invlists->list_size(list_no);
-        if (list_size == 0)
-            continue;
 
-        size_t n_batches = (list_size + batch_size - 1) / batch_size;
-
-        for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
-            size_t b_offset = batch_no * batch_size;
-            size_t curr_batch_size = std::min(list_size - b_offset, batch_size);
-
-            for (size_t p = 0; p < curr_batch_size; p++) {
-                std::vector<float> vec(d);
-                const uint8_t* code =
-                        invlists->get_single_code(list_no, b_offset + p);
-                pq.decode(code, vec.data());
-
-                std::vector<float> suffix(d + 1, 0.0f);
-                for (int j = d - 1; j >= 0; j--) {
-                    suffix[j] = suffix[j + 1] + vec[j] * vec[j];
-                }
-
-                for (int level = 0; level < n_levels; level++) {
-                    int start_idx = level * levels_size;
-                    size_t offset = cum_sum_offsets[list_no] +
-                            b_offset * (n_levels + 1) +
-                            level * curr_batch_size + p;
-                    cum_sums[offset] = start_idx < (int)d
-                            ? std::sqrt(suffix[start_idx])
-                            : 0.0f;
-                }
-
-                size_t last_offset = cum_sum_offsets[list_no] +
-                        b_offset * (n_levels + 1) + n_levels * curr_batch_size +
-                        p;
-                cum_sums[last_offset] = 0.0f;
-            }
-        }
-    }
+    auto* pano = new PanoramaPQ(d, code_size, n_levels, batch_size, &pq);
+    this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano);
+    this->own_invlists = own_invlists; // base ctor was passed false above
 }
 
 /*****************************************
@@ -169,6 +66,8 @@ template <class C, bool use_sel>
 struct IVFPQScannerPanorama : InvertedListScanner {
     const IndexIVFPQPanorama& index;
     const ProductQuantizer& pq;
+    const ArrayInvertedListsPanorama* storage; // provides codes, cum sums
+    const PanoramaPQ* pano_pq; // storage->pano viewed as PanoramaPQ
 
     // Query state
     const float* qi = nullptr;
@@ -180,11 +79,15 @@ struct IVFPQScannerPanorama : InvertedListScanner {
 
     IVFPQScannerPanorama(
             const IndexIVFPQPanorama& index,
+            const ArrayInvertedListsPanorama* storage,
             bool store_pairs,
             const IDSelector* sel)
             : InvertedListScanner(store_pairs, sel),
               index(index),
-              pq(index.pq) {
+              pq(index.pq),
+              storage(storage), // NOTE(review): dereferenced below unchecked
+              pano_pq(dynamic_cast<const PanoramaPQ*>(storage->pano.get())) {
+        FAISS_THROW_IF_NOT(pano_pq); // storage->pano must be a PanoramaPQ
         this->keep_max = is_similarity_metric(index.metric_type);
         this->code_size = pq.code_size;
         query_cum_norms.resize(index.n_levels + 1);
@@ -206,17 +109,7 @@ struct IVFPQScannerPanorama : InvertedListScanner {
             sim_table_2[i] *= -2.0f;
         }
 
-        // Compute query suffix sums → cum norms per level.
-        std::vector<float> suffix(index.d + 1, 0.0f);
-        for (int j = index.d - 1; j >= 0; j--) {
-            suffix[j] = suffix[j + 1] + qi[j] * qi[j];
-        }
-        for (int level = 0; level < index.n_levels; level++) {
-            int start = level * index.levels_size;
-            query_cum_norms[level] =
-                    start < (int)index.d ? std::sqrt(suffix[start]) : 0.0f;
-        }
-        query_cum_norms[index.n_levels] = 0.0f;
+        pano_pq->compute_query_cum_sums(qi, query_cum_norms.data());
     }
 
     void set_list(idx_t list_no, float coarse_dis) override {
@@ -230,7 +123,7 @@ struct IVFPQScannerPanorama : InvertedListScanner {
 
     size_t scan_codes(
             size_t list_size,
-            const uint8_t* /* codes (row-major, unused) */,
+            const uint8_t* /* codes (column-major in storage) */,
             const idx_t* ids,
             float* distances,
             idx_t* labels,
@@ -239,14 +132,10 @@ struct IVFPQScannerPanorama : InvertedListScanner {
 
         const size_t bs = index.batch_size;
         const size_t cs = index.chunk_size;
-        const int n_levels = index.n_levels;
 
         const size_t n_batches = (list_size + bs - 1) / bs;
-        // Panorama column-major codes for this list.
-        const uint8_t* col_codes =
-                index.column_storage + index.column_offsets[list_no];
-        const float* list_cum_sums =
-                index.cum_sums + index.cum_sum_offsets[list_no];
+        const uint8_t* col_codes = storage->get_codes(list_no);
+        const float* list_cum_sums = storage->get_cum_sums(list_no);
         const float* precomp =
                 index.precomputed_table.data() + list_no * pq.M * pq.ksub;
 
@@ -257,100 +146,28 @@ struct IVFPQScannerPanorama : InvertedListScanner {
         std::vector<uint8_t> compressed_codes(bs * cs);
         float dis0 = coarse_dis;
 
-        for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
-            size_t curr_batch_size = std::min(list_size - batch_no * bs, bs);
-            size_t b_offset = batch_no * bs;
-
-            // Initialize active set.
-            std::iota(
-                    active_indices.begin(),
-                    active_indices.begin() + curr_batch_size,
-                    b_offset);
-            std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1);
-            std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0);
-
-            const uint8_t* batch_codes = col_codes + b_offset * code_size;
-
-            // Compute init_exact_distance on-the-fly from the
-            // precomputed table: sum_m(precomp[m * ksub + code[m]]).
-            // Codes are column-major: point p's code for subquantizer
-            // m is at batch_codes[m * bs + p].
-            for (size_t idx = 0; idx < curr_batch_size; idx++) {
-                float init_dist = 0.0f;
-                for (size_t m = 0; m < pq.M; m++) {
-                    uint8_t code_val = batch_codes[m * bs + idx];
-                    init_dist += precomp[m * pq.ksub + code_val];
-                }
-                exact_distances[idx] = init_dist;
-            }
-            const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1);
-
-            size_t next_num_active = curr_batch_size;
-            size_t batch_offset = batch_no * bs;
-
-            for (int level = 0; level < n_levels && next_num_active > 0;
-                 level++) {
-                size_t level_sim_offset = level * pq.ksub * cs;
-
-                float query_cum_norm = 2 * query_cum_norms[level + 1];
-                float heap_max = distances[0];
-
-                const float* cum_sums_level =
-                        batch_cums + curr_batch_size * level;
-                const uint8_t* codes_level = batch_codes + bs * cs * level;
-
-                float* sim_table_level =
-                        const_cast<float*>(sim_table_2.data()) +
-                        level_sim_offset;
-
-                bool is_sparse = next_num_active < bs / 16;
-
-                size_t num_active_for_filtering = 0;
-                if (is_sparse) {
-                    // Sparse path: use active_indices for indirection.
-                    for (size_t ci = 0; ci < cs; ci++) {
-                        size_t chunk_off = ci * bs;
-                        float* chunk_sim = sim_table_level + ci * pq.ksub;
-                        for (size_t i = 0; i < next_num_active; i++) {
-                            size_t real_idx = active_indices[i] - batch_offset;
-                            exact_distances[i] += chunk_sim
-                                    [codes_level[chunk_off + real_idx]];
-                        }
-                    }
-                    num_active_for_filtering = next_num_active;
-                } else {
-                    auto [cc, na] = panorama_kernels::process_code_compression(
-                            next_num_active,
-                            bs,
-                            cs,
-                            compressed_codes.data(),
-                            bitset.data(),
-                            codes_level);
-
-                    panorama_kernels::process_chunks(
-                            cs,
-                            bs,
-                            na,
-                            sim_table_level,
-                            cc,
-                            exact_distances.data());
-                    num_active_for_filtering = na;
-                }
+        PanoramaStats local_stats;
+        local_stats.reset();
 
-                next_num_active = panorama_kernels::process_filtering(
-                        num_active_for_filtering,
-                        exact_distances.data(),
-                        active_indices.data(),
-                        const_cast<float*>(cum_sums_level),
-                        bitset.data(),
-                        batch_offset,
-                        dis0,
-                        query_cum_norm,
-                        heap_max);
-            }
+        for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
+            size_t num_active = pano_pq->progressive_filter_batch<C>(
+                    col_codes,
+                    list_cum_sums,
+                    precomp,
+                    sim_table_2.data(),
+                    query_cum_norms.data(),
+                    dis0,
+                    list_size,
+                    batch_no,
+                    exact_distances,
+                    active_indices,
+                    bitset,
+                    compressed_codes,
+                    distances[0],
+                    local_stats);
 
             // Insert surviving candidates into heap.
-            for (size_t i = 0; i < next_num_active; i++) {
+            for (size_t i = 0; i < num_active; i++) {
                 float dis = dis0 + exact_distances[i];
                 if (C::cmp(distances[0], dis)) {
                     idx_t id = store_pairs
@@ -362,6 +179,7 @@ struct IVFPQScannerPanorama : InvertedListScanner {
             }
         }
 
+        indexPanorama_stats.add(local_stats);
         return nup;
     }
 
@@ -395,12 +213,17 @@ InvertedListScanner* IndexIVFPQPanorama::get_InvertedListScanner(
     FAISS_THROW_IF_NOT_MSG(
             polysemous_ht == 0, "Panorama PQ does not support polysemous");
 
+    const auto* storage =
+            dynamic_cast<const ArrayInvertedListsPanorama*>(invlists);
+    FAISS_THROW_IF_NOT_MSG(
+            storage, "IndexIVFPQPanorama requires ArrayInvertedListsPanorama");
+
     if (sel) {
         return new IVFPQScannerPanorama<CMax<float, idx_t>, true>(
-                *this, store_pairs, sel);
+                *this, storage, store_pairs, sel);
     } else {
         return new IVFPQScannerPanorama<CMax<float, idx_t>, false>(
-                *this, store_pairs, sel);
+                *this, storage, store_pairs, sel);
     }
 }
 
diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h
index a97107f54d..717308bb07 100644
--- a/faiss/IndexIVFPQPanorama.h
+++ b/faiss/IndexIVFPQPanorama.h
@@ -33,6 +33,10 @@ namespace faiss {
 /// into `n_levels` levels of `chunk_size` columns, enabling incremental
 /// distance computation level-by-level.
 ///
+/// Storage is managed by ArrayInvertedListsPanorama with a PanoramaPQ
+/// instance that handles code transposition and cumulative sum computation
+/// (via PQ decoding) on insertion.
+///
 /// OVERHEAD:
 /// Panorama precomputes per-point cumulative residual norms at insertion
 /// time. Storage overhead is (n_levels + 1) floats per point for
@@ -47,25 +51,17 @@ namespace faiss {
 /// - use_precomputed_table must be 1.
 ///
 /// NOTE:
-/// We inherit from IndexIVFPQ and override only get_InvertedListScanner()
-/// and add(). The base IndexIVF::search_preassigned() handles all search
+/// We inherit from IndexIVFPQ and override only get_InvertedListScanner().
+/// The base IndexIVF::search_preassigned() handles all search
 /// orchestration — no search code is duplicated.
+/// Storage (transposition + cum_sums) is handled by
+/// ArrayInvertedListsPanorama, so no add() override is needed.
 struct IndexIVFPQPanorama : public IndexIVFPQ {
     int n_levels;
     size_t batch_size;
 
     size_t chunk_size;
     size_t levels_size;
-    size_t m_level_width;
-
-    bool added = false;
-    size_t num_points = 0;
-
-    uint8_t* column_storage = nullptr;
-    size_t* column_offsets = nullptr;
-
-    float* cum_sums = nullptr;
-    size_t* cum_sum_offsets = nullptr;
 
     IndexIVFPQPanorama(
             Index* quantizer,
@@ -80,8 +76,6 @@ struct IndexIVFPQPanorama : public IndexIVFPQ {
 
     IndexIVFPQPanorama() = default;
 
-    void add(idx_t n, const float* x) override;
-
     InvertedListScanner* get_InvertedListScanner(
             bool store_pairs,
             const IDSelector* sel,
diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp
index 0191ef152f..ea204d2450 100644
--- a/faiss/impl/HNSW.cpp
+++ b/faiss/impl/HNSW.cpp
@@ -851,10 +851,10 @@ int search_from_candidates_panorama(
         while (curr_panorama_level < num_panorama_levels && batch_size > 0) {
             float query_cum_norm = query_cum_sums[curr_panorama_level + 1];
 
-            size_t start_dim = curr_panorama_level *
-                    panorama_index->pano.level_width_floats;
+            size_t start_dim =
+                    curr_panorama_level * panorama_index->pano.level_width_dims;
             size_t end_dim = (curr_panorama_level + 1) *
-                    panorama_index->pano.level_width_floats;
+                    panorama_index->pano.level_width_dims;
             end_dim = std::min(end_dim, static_cast<size_t>(panorama_index->d));
 
             size_t i = 0;
diff --git a/faiss/impl/Panorama.cpp b/faiss/impl/Panorama.cpp
index 970a0cefa6..a3928e4fce 100644
--- a/faiss/impl/Panorama.cpp
+++ b/faiss/impl/Panorama.cpp
@@ -26,7 +26,7 @@ inline void compute_cum_sums_impl(
         float* output,
         size_t d,
         size_t n_levels,
-        size_t level_width_floats,
+        size_t level_width_dims,
         OffsetFunc&& get_offset) {
     // Iterate backwards through levels, accumulating sum as we go.
     // This avoids computing the suffix sum for each vector, which takes
@@ -34,9 +34,9 @@ inline void compute_cum_sums_impl(
     float sum = 0.0f;
 
     for (int level = n_levels - 1; level >= 0; level--) {
-        size_t start_idx = level * level_width_floats;
+        size_t start_idx = level * level_width_dims;
         size_t end_idx = std::min(
-                (level + 1) * level_width_floats, static_cast<size_t>(d));
+                (level + 1) * level_width_dims, static_cast<size_t>(d));
 
         for (size_t j = start_idx; j < end_idx; j++) {
             sum += vector[j] * vector[j];
@@ -51,19 +51,24 @@ inline void compute_cum_sums_impl(
 } // namespace
 
 /**************************************************************
- * Panorama structure implementation
+ * Panorama base class implementation
  **************************************************************/
 
-Panorama::Panorama(size_t code_size, size_t n_levels, size_t batch_size)
-        : code_size(code_size), n_levels(n_levels), batch_size(batch_size) {
+Panorama::Panorama(
+        size_t d,
+        size_t code_size,
+        size_t n_levels,
+        size_t batch_size)
+        : d(d),
+          code_size(code_size),
+          n_levels(n_levels),
+          batch_size(batch_size) {
     set_derived_values();
 }
 
 void Panorama::set_derived_values() {
     FAISS_THROW_IF_NOT_MSG(n_levels > 0, "Panorama: n_levels must be > 0");
-    this->d = code_size / sizeof(float);
-    this->level_width_floats = ((d + n_levels - 1) / n_levels);
-    this->level_width = this->level_width_floats * sizeof(float);
+    level_width_bytes = (code_size + n_levels - 1) / n_levels;
 }
 
 /**
@@ -88,10 +93,10 @@ void Panorama::copy_codes_to_level_layout(
         // Copy entry into level-oriented layout for this batch.
         size_t batch_offset = batch_no * batch_size * code_size;
         for (size_t level = 0; level < n_levels; level++) {
-            size_t level_offset = level * level_width * batch_size;
-            size_t start_byte = level * level_width;
-            size_t actual_level_width =
-                    std::min(level_width, code_size - level * level_width);
+            size_t level_offset = level * level_width_bytes * batch_size;
+            size_t start_byte = level * level_width_bytes;
+            size_t actual_level_width = std::min(
+                    level_width_bytes, code_size - level * level_width_bytes);
 
             const uint8_t* src = code + entry_idx * code_size + start_byte;
             uint8_t* dest = codes + batch_offset + level_offset +
@@ -102,38 +107,12 @@ void Panorama::copy_codes_to_level_layout(
     }
 }
 
-void Panorama::compute_cumulative_sums(
-        float* cumsum_base,
-        size_t offset,
-        size_t n_entry,
-        const float* vectors) const {
-    for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) {
-        size_t current_pos = offset + entry_idx;
-        size_t batch_no = current_pos / batch_size;
-        size_t pos_in_batch = current_pos % batch_size;
-
-        const float* vector = vectors + entry_idx * d;
-        size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1);
-
-        auto get_offset = [&](size_t level) {
-            return cumsum_batch_offset + level * batch_size + pos_in_batch;
-        };
-
-        compute_cum_sums_impl(
-                vector,
-                cumsum_base,
-                d,
-                n_levels,
-                level_width_floats,
-                get_offset);
-    }
-}
-
 void Panorama::compute_query_cum_sums(const float* query, float* query_cum_sums)
         const {
+    size_t level_dims = (d + n_levels - 1) / n_levels;
     auto get_offset = [](size_t level) { return level; };
     compute_cum_sums_impl(
-            query, query_cum_sums, d, n_levels, level_width_floats, get_offset);
+            query, query_cum_sums, d, n_levels, level_dims, get_offset);
 }
 
 void Panorama::reconstruct(idx_t key, float* recons, const uint8_t* codes_base)
@@ -145,12 +124,12 @@ void Panorama::reconstruct(idx_t key, float* recons, const uint8_t* codes_base)
     size_t batch_offset = batch_no * batch_size * code_size;
 
     for (size_t level = 0; level < n_levels; level++) {
-        size_t level_offset = level * level_width * batch_size;
+        size_t level_offset = level * level_width_bytes * batch_size;
         const uint8_t* src = codes_base + batch_offset + level_offset +
-                pos_in_batch * level_width;
-        uint8_t* dest = recons_buffer + level * level_width;
-        size_t copy_size =
-                std::min(level_width, code_size - level * level_width);
+                pos_in_batch * level_width_bytes;
+        uint8_t* dest = recons_buffer + level * level_width_bytes;
+        size_t copy_size = std::min(
+                level_width_bytes, code_size - level * level_width_bytes);
         memcpy(dest, src, copy_size);
     }
 }
@@ -177,9 +156,9 @@ void Panorama::copy_entry(
 
     for (size_t level = 0; level < n_levels; level++) {
         // Copy code
-        size_t level_offset = level * level_width * batch_size;
-        size_t actual_level_width =
-                std::min(level_width, code_size - level * level_width);
+        size_t level_offset = level * level_width_bytes * batch_size;
+        size_t actual_level_width = std::min(
+                level_width_bytes, code_size - level * level_width_bytes);
 
         const uint8_t* src = src_codes + src_batch_offset + level_offset +
                 src_pos_in_batch * actual_level_width;
@@ -197,4 +176,38 @@ void Panorama::copy_entry(
         dest_cum_sums[dest_offset] = src_cum_sums[src_offset];
     }
 }
+
+/**************************************************************
+ * PanoramaFlat implementation
+ **************************************************************/
+
+PanoramaFlat::PanoramaFlat(size_t d, size_t n_levels, size_t batch_size)
+        : Panorama(d, d * sizeof(float), n_levels, batch_size) {
+    level_width_dims = (d + n_levels - 1) / n_levels;
+    level_width_bytes = level_width_dims * sizeof(float);
+}
+
+void PanoramaFlat::compute_cumulative_sums(
+        float* cumsum_base,
+        size_t offset,
+        size_t n_entry,
+        const uint8_t* code) const {
+    const float* vectors = reinterpret_cast<const float*>(code);
+    for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) {
+        size_t current_pos = offset + entry_idx;
+        size_t batch_no = current_pos / batch_size;
+        size_t pos_in_batch = current_pos % batch_size;
+
+        const float* vector = vectors + entry_idx * d;
+        size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1);
+
+        auto get_offset = [&](size_t level) {
+            return cumsum_batch_offset + level * batch_size + pos_in_batch;
+        };
+
+        compute_cum_sums_impl(
+                vector, cumsum_base, d, n_levels, level_width_dims, get_offset);
+    }
+}
+
 } // namespace faiss
diff --git a/faiss/impl/Panorama.h b/faiss/impl/Panorama.h
index 79a23a64a7..8f33fc8d1a 100644
--- a/faiss/impl/Panorama.h
+++ b/faiss/impl/Panorama.h
@@ -40,35 +40,46 @@ namespace faiss {
  * Coupled with the appropriate orthogonal PreTransform (e.g. PCA, Cayley,
  * etc.), Panorama can prune the vast majority of dimensions, greatly
  * accelerating the refinement stage.
+ *
+ * This is the abstract base class. Concrete subclasses (PanoramaFlat,
+ * PanoramaPQ) implement compute_cumulative_sums and progressive_filter_batch
+ * for their respective code formats.
  */
 struct Panorama {
     size_t d = 0;
     size_t code_size = 0;
     size_t n_levels = 0;
-    size_t level_width = 0;
-    size_t level_width_floats = 0;
+    size_t level_width_bytes = 0;
     size_t batch_size = 0;
 
-    explicit Panorama(size_t code_size, size_t n_levels, size_t batch_size);
+    Panorama() = default;
+    Panorama(size_t d, size_t code_size, size_t n_levels, size_t batch_size);
+
+    virtual ~Panorama() = default;
 
     void set_derived_values();
 
     /// Helper method to copy codes into level-oriented batch layout at a given
     /// offset in the list.
-    void copy_codes_to_level_layout(
+    /// PanoramaFlat uses row-major within each level (point bytes contiguous).
+    /// PanoramaPQ overrides to use column-major (subquantizer columns
+    /// contiguous).
+    virtual void copy_codes_to_level_layout(
             uint8_t* codes,
             size_t offset,
             size_t n_entry,
             const uint8_t* code);
 
-    /// Helper method to compute the cumulative sums of the codes.
-    /// The cumsums also follow the level-oriented batch layout to minimize the
+    /// Compute the cumulative sums (suffix norms) for database vectors.
+    /// The cumsums follow the level-oriented batch layout to minimize the
     /// number of random memory accesses.
-    void compute_cumulative_sums(
+    /// Subclasses interpret the raw code bytes according to their format:
+    /// PanoramaFlat reinterprets as float*, PanoramaPQ decodes via PQ.
+    virtual void compute_cumulative_sums(
             float* cumsum_base,
             size_t offset,
             size_t n_entry,
-            const float* vectors) const;
+            const uint8_t* code) const = 0;
 
     /// Compute the cumulative sums of the query vector.
     void compute_query_cum_sums(const float* query, float* query_cum_sums)
@@ -83,7 +94,30 @@ struct Panorama {
             size_t dest_idx,
             size_t src_idx) const;
 
-    /// Panorama's core progressive filtering algorithm:
+    virtual void reconstruct(idx_t key, float* recons, const uint8_t* codes_base)
+            const;
+};
+
+/**
+ * Panorama for flat (uncompressed) float vectors.
+ *
+ * Codes are raw float vectors (code_size = d * sizeof(float)).
+ * compute_cumulative_sums interprets codes as floats.
+ * progressive_filter_batch computes dot products on raw float storage.
+ */
+struct PanoramaFlat : Panorama {
+    size_t level_width_dims = 0;
+
+    PanoramaFlat() = default;
+    PanoramaFlat(size_t d, size_t n_levels, size_t batch_size);
+
+    void compute_cumulative_sums(
+            float* cumsum_base,
+            size_t offset,
+            size_t n_entry,
+            const uint8_t* code) const override;
+
+    /// Panorama's core progressive filtering algorithm for flat codes:
     /// Process vectors in batches for cache efficiency. For each batch:
     /// 1. Apply ID selection filter and initialize distances
     /// (||y||^2 + ||x||^2).
@@ -113,12 +147,10 @@ struct Panorama {
             std::vector<float>& exact_distances,
             float threshold,
             PanoramaStats& local_stats) const;
-
-    void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) const;
 };
 
 template <typename C, MetricType M>
-size_t Panorama::progressive_filter_batch(
+size_t PanoramaFlat::progressive_filter_batch(
         const uint8_t* codes_base,
         const float* cum_sums,
         const float* query,
@@ -173,18 +205,18 @@ size_t Panorama::progressive_filter_batch(
 
         float query_cum_norm = query_cum_sums[level + 1];
 
-        size_t level_offset = level * level_width * batch_size;
+        size_t level_offset = level * level_width_bytes * batch_size;
         const float* level_storage =
                 (const float*)(storage_base + level_offset);
 
         size_t next_active = 0;
         for (size_t i = 0; i < num_active; i++) {
             uint32_t idx = active_indices[i];
-            size_t actual_level_width = std::min(
-                    level_width_floats, d - level * level_width_floats);
+            size_t actual_level_width =
+                    std::min(level_width_dims, d - level * level_width_dims);
 
             const float* yj = level_storage + idx * actual_level_width;
-            const float* query_level = query + level * level_width_floats;
+            const float* query_level = query + level * level_width_dims;
 
             float dot_product =
                     fvec_inner_product(query_level, yj, actual_level_width);
diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index 49e0458631..7cc9ee9d08 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -38,6 +38,7 @@
 #include <faiss/IndexIVFIndependentQuantizer.h>
 #include <faiss/IndexIVFPQ.h>
 #include <faiss/IndexIVFPQFastScan.h>
+#include <faiss/IndexIVFPQPanorama.h>
 #include <faiss/IndexIVFPQR.h>
 #include <faiss/IndexIVFRaBitQ.h>
 #include <faiss/IndexIVFRaBitQFastScan.h>
@@ -53,6 +54,7 @@
 #include <faiss/IndexRaBitQFastScan.h>
 #include <faiss/IndexRefine.h>
 #include <faiss/IndexRowwiseMinMax.h>
+#include <faiss/impl/PanoramaPQ.h>
 #ifdef FAISS_ENABLE_SVS
 #include <faiss/impl/svs_io.h>
 #include <faiss/svs/IndexSVSFlat.h>
@@ -400,8 +402,12 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
         FAISS_CHECK_DESERIALIZATION_LOOP_LIMIT(nlist, "ilpn nlist");
         READ1(code_size);
         READ1(n_levels);
+        auto* pano = new PanoramaFlat(
+                code_size / sizeof(float),
+                n_levels,
+                ArrayInvertedListsPanorama::kBatchSize);
         auto ailp = std::make_unique<ArrayInvertedListsPanorama>(
-                nlist, code_size, n_levels);
+                nlist, code_size, pano);
         std::vector<size_t> sizes(nlist);
         read_ArrayInvertedLists_sizes(f, sizes);
         for (size_t i = 0; i < nlist; i++) {
@@ -1365,6 +1371,42 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         READVECTOR(ivsp->trained);
         read_InvertedLists(*ivsp, f, io_flags);
         idx = std::move(ivsp);
+    } else if (h == fourcc("IwPP")) {
+        auto ivpp = std::make_unique<IndexIVFPQPanorama>();
+        read_ivf_header(ivpp.get(), f);
+        READ1(ivpp->by_residual);
+        READ1(ivpp->code_size);
+        read_ProductQuantizer(&ivpp->pq, f);
+        READ1(ivpp->n_levels);
+        READ1(ivpp->batch_size);
+        // Reject corrupt headers up front: n_levels is a divisor just below
+        // and batch_size is a divisor in the Panorama batch layout.
+        // NOTE(review): list storage is sized with kBatchSize; confirm that
+        // batch_size is required to equal it.
+        FAISS_THROW_IF_NOT_MSG(
+                ivpp->n_levels > 0, "IwPP: n_levels must be > 0");
+        FAISS_THROW_IF_NOT_MSG(
+                ivpp->batch_size > 0, "IwPP: batch_size must be > 0");
+        ivpp->chunk_size = ivpp->code_size / ivpp->n_levels;
+        ivpp->levels_size = ivpp->d / ivpp->n_levels;
+        read_InvertedLists(*ivpp, f, io_flags);
+        // The "ilpn" reader creates a PanoramaFlat placeholder; replace
+        // it with PanoramaPQ now that we have the ProductQuantizer.
+        auto* storage =
+                dynamic_cast<ArrayInvertedListsPanorama*>(ivpp->invlists);
+        if (storage) {
+            storage->pano.reset(new PanoramaPQ(
+                    ivpp->d,
+                    ivpp->code_size,
+                    ivpp->n_levels,
+                    ivpp->batch_size,
+                    &ivpp->pq));
+        }
+        if (ivpp->is_trained) {
+            ivpp->use_precomputed_table = 1;
+            ivpp->precompute_table();
+        }
+        idx = std::move(ivpp);
     } else if (
             h == fourcc("IvPQ") || h == fourcc("IvQR") || h == fourcc("IwPQ") ||
             h == fourcc("IwQR")) {
@@ -1496,8 +1538,8 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
             size_t nlevels;
             READ1(nlevels);
             const_cast<size_t&>(idx_panorama->num_panorama_levels) = nlevels;
-            const_cast<Panorama&>(idx_panorama->pano) =
-                    Panorama(idx_panorama->d * sizeof(float), nlevels, 1);
+            const_cast<PanoramaFlat&>(idx_panorama->pano) =
+                    PanoramaFlat(idx_panorama->d, nlevels, 1);
             READVECTOR(idx_panorama->cum_sums);
         }
         if (h == fourcc("IHNc") || h == fourcc("IHc2")) {
diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp
index 04257c76a6..02d0870bbc 100644
--- a/faiss/impl/index_write.cpp
+++ b/faiss/impl/index_write.cpp
@@ -34,6 +34,7 @@
 #include <faiss/IndexIVFFlatPanorama.h>
 #include <faiss/IndexIVFIndependentQuantizer.h>
 #include <faiss/IndexIVFPQ.h>
+#include <faiss/IndexIVFPQPanorama.h>
 #include <faiss/IndexIVFPQFastScan.h>
 #include <faiss/IndexIVFPQR.h>
 #include <faiss/IndexIVFRaBitQ.h>
@@ -273,7 +274,7 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) {
         WRITE1(h);
         WRITE1(ailp->nlist);
         WRITE1(ailp->code_size);
-        WRITE1(ailp->n_levels);
+        WRITE1(ailp->pano->n_levels);
         uint32_t list_type = fourcc("full");
         WRITE1(list_type);
         std::vector<size_t> sizes;
@@ -774,6 +775,18 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) {
         WRITE1(ivsp->threshold_type);
         WRITEVECTOR(ivsp->trained);
         write_InvertedLists(ivsp->invlists, f);
+    } else if (
+            const IndexIVFPQPanorama* ivpp =
+                    dynamic_cast<const IndexIVFPQPanorama*>(idx)) {
+        uint32_t h = fourcc("IwPP");
+        WRITE1(h);
+        write_ivf_header(ivpp, f);
+        WRITE1(ivpp->by_residual);
+        WRITE1(ivpp->code_size);
+        write_ProductQuantizer(&ivpp->pq, f);
+        WRITE1(ivpp->n_levels);
+        WRITE1(ivpp->batch_size);
+        write_InvertedLists(ivpp->invlists, f);
     } else if (const IndexIVFPQ* ivpq = dynamic_cast<const IndexIVFPQ*>(idx)) {
         const IndexIVFPQR* ivfpqr = dynamic_cast<const IndexIVFPQR*>(idx);
 
diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp
index 448b969736..313228301c 100644
--- a/faiss/invlists/InvertedLists.cpp
+++ b/faiss/invlists/InvertedLists.cpp
@@ -353,19 +353,12 @@ ArrayInvertedLists::~ArrayInvertedLists() {}
 ArrayInvertedListsPanorama::ArrayInvertedListsPanorama(
         size_t nlist,
         size_t code_size,
-        size_t n_levels)
-        : ArrayInvertedLists(nlist, code_size),
-          n_levels(n_levels),
-          level_width(
-                  (((code_size / sizeof(float)) + n_levels - 1) / n_levels) *
-                  sizeof(float)),
-          pano(code_size, n_levels, kBatchSize) {
-    FAISS_THROW_IF_NOT(n_levels > 0);
-    FAISS_THROW_IF_NOT(code_size % sizeof(float) == 0);
+        Panorama* pano)
+        : ArrayInvertedLists(nlist, code_size), pano(pano) {
+    FAISS_THROW_IF_NOT(pano != nullptr);
+    FAISS_THROW_IF_NOT(pano->n_levels > 0);
     FAISS_THROW_IF_NOT_MSG(
-            !use_iterator,
-            "IndexIVFFlatPanorama does not support iterators, use vanilla IndexIVFFlat instead");
-    FAISS_ASSERT(level_width % sizeof(float) == 0);
+            !use_iterator, "Panorama does not support iterators");
 
     cum_sums.resize(nlist);
 }
@@ -389,13 +382,10 @@ size_t ArrayInvertedListsPanorama::add_entries(
     size_t new_size = o + n_entry;
     size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize;
     codes[list_no].resize(num_batches * kBatchSize * code_size);
-    cum_sums[list_no].resize(num_batches * kBatchSize * (n_levels + 1));
+    cum_sums[list_no].resize(num_batches * kBatchSize * (pano->n_levels + 1));
 
-    // Cast to float* is safe here as we guarantee codes are always float
-    // vectors for `IndexIVFFlatPanorama` (verified by the constructor).
-    const float* vectors = reinterpret_cast<const float*>(code);
-    pano.copy_codes_to_level_layout(codes[list_no].data(), o, n_entry, code);
-    pano.compute_cumulative_sums(cum_sums[list_no].data(), o, n_entry, vectors);
+    pano->copy_codes_to_level_layout(codes[list_no].data(), o, n_entry, code);
+    pano->compute_cumulative_sums(cum_sums[list_no].data(), o, n_entry, code);
 
     return o;
 }
@@ -411,13 +401,10 @@ void ArrayInvertedListsPanorama::update_entries(
 
     memcpy(&ids[list_no][offset], ids_in, sizeof(ids_in[0]) * n_entry);
 
-    // Cast to float* is safe here as we guarantee codes are always float
-    // vectors for `IndexIVFFlatPanorama` (verified by the constructor).
-    const float* vectors = reinterpret_cast<const float*>(code);
-    pano.copy_codes_to_level_layout(
+    pano->copy_codes_to_level_layout(
             codes[list_no].data(), offset, n_entry, code);
-    pano.compute_cumulative_sums(
-            cum_sums[list_no].data(), offset, n_entry, vectors);
+    pano->compute_cumulative_sums(
+            cum_sums[list_no].data(), offset, n_entry, code);
 }
 
 void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) {
@@ -425,7 +412,7 @@ void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) {
 
     size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize;
     codes[list_no].resize(num_batches * kBatchSize * code_size);
-    cum_sums[list_no].resize(num_batches * kBatchSize * (n_levels + 1));
+    cum_sums[list_no].resize(num_batches * kBatchSize * (pano->n_levels + 1));
 }
 
 const uint8_t* ArrayInvertedListsPanorama::get_single_code(
@@ -437,7 +424,7 @@ const uint8_t* ArrayInvertedListsPanorama::get_single_code(
     uint8_t* recons_buffer = new uint8_t[code_size];
 
     float* recons = reinterpret_cast<float*>(recons_buffer);
-    pano.reconstruct(offset, recons, codes[list_no].data());
+    pano->reconstruct(offset, recons, codes[list_no].data());
 
     return recons_buffer;
 }
diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h
index 43c1ecc0c5..edc29995b9 100644
--- a/faiss/invlists/InvertedLists.h
+++ b/faiss/invlists/InvertedLists.h
@@ -15,6 +15,7 @@
  * the interface.
  */
 
+#include <memory>
 #include <vector>
 
 #include <faiss/MetricType.h>
@@ -277,16 +278,17 @@ struct ArrayInvertedLists : InvertedLists {
     ~ArrayInvertedLists() override;
 };
 
-/// Level-oriented storage as defined in the IVFFlat section of Panorama
+/// Level-oriented storage as defined in the Panorama paper
 /// (https://www.arxiv.org/pdf/2510.00566).
+/// Works with both flat codes (PanoramaFlat) and PQ codes (PanoramaPQ)
+/// via the virtual Panorama interface.
 struct ArrayInvertedListsPanorama : ArrayInvertedLists {
     static constexpr size_t kBatchSize = 128;
     std::vector<MaybeOwnedVector<float>> cum_sums;
-    const size_t n_levels;
-    const size_t level_width; // in code units
-    Panorama pano;
+    std::unique_ptr<Panorama> pano;
 
-    ArrayInvertedListsPanorama(size_t nlist, size_t code_size, size_t n_levels);
+    /// Takes ownership of the provided Panorama*.
+    ArrayInvertedListsPanorama(size_t nlist, size_t code_size, Panorama* pano);
 
     const float* get_cum_sums(size_t list_no) const;
 
diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig
index 75292ecb7f..371dabdf7e 100644
--- a/faiss/python/swigfaiss.swig
+++ b/faiss/python/swigfaiss.swig
@@ -542,6 +542,7 @@ void gpu_sync_all_devices()
 %include  <faiss/IndexAdditiveQuantizer.h>
 %include  <faiss/impl/io.h>
 
+%ignore faiss::ArrayInvertedListsPanorama::pano;
 %include  <faiss/invlists/InvertedLists.h>
 %include  <faiss/invlists/InvertedListsIOHook.h>
 %ignore BlockInvertedListsIOHook;

From a8878f07e39041de4494db8d3e0a1018bfc7e568 Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Fri, 20 Mar 2026 04:18:01 +0000
Subject: [PATCH 12/41] Add missing files

---
 faiss/impl/PanoramaPQ.cpp | 122 ++++++++++++++++++++++
 faiss/impl/PanoramaPQ.h   | 212 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 334 insertions(+)
 create mode 100644 faiss/impl/PanoramaPQ.cpp
 create mode 100644 faiss/impl/PanoramaPQ.h

diff --git a/faiss/impl/PanoramaPQ.cpp b/faiss/impl/PanoramaPQ.cpp
new file mode 100644
index 0000000000..832bfaf91d
--- /dev/null
+++ b/faiss/impl/PanoramaPQ.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <faiss/impl/PanoramaPQ.h>
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+#include <faiss/impl/FaissAssert.h>
+
+namespace faiss {
+
+void PanoramaPQ::copy_codes_to_level_layout(
+        uint8_t* codes,
+        size_t offset,
+        size_t n_entry,
+        const uint8_t* code) {
+    // Scatters row-major codes into the batched layout: inside a batch,
+    // byte k of the entry in slot p is stored at index k * batch_size + p,
+    // so each level's bytes are grouped together for the whole batch.
+    for (size_t i = 0; i < n_entry; ++i) {
+        const uint8_t* src = code + i * code_size;
+        const size_t pos = offset + i;
+        // Points at this entry's slot in the first byte row of its batch.
+        uint8_t* dst = codes + (pos / batch_size) * batch_size * code_size +
+                pos % batch_size;
+
+        for (size_t level = 0; level < n_levels; ++level) {
+            // Byte range owned by this level, clipped to the code length.
+            const size_t first_byte = level * chunk_size;
+            const size_t last_byte =
+                    std::min(first_byte + chunk_size, code_size);
+            for (size_t b = first_byte; b < last_byte; ++b) {
+                dst[b * batch_size] = src[b];
+            }
+        }
+    }
+}
+
+void PanoramaPQ::reconstruct(
+        idx_t key,
+        float* recons,
+        const uint8_t* codes_base) const {
+    // Gathers the raw code bytes of entry `key` back into row-major order.
+    // Note that the output is written as bytes, not as decoded floats.
+    uint8_t* out = reinterpret_cast<uint8_t*>(recons);
+    const size_t pos = static_cast<size_t>(key);
+
+    // Slot of the entry inside its batch of the column-major layout.
+    const uint8_t* src = codes_base +
+            (pos / batch_size) * batch_size * code_size + pos % batch_size;
+
+    for (size_t level = 0; level < n_levels; ++level) {
+        const size_t first_byte = level * chunk_size;
+        const size_t last_byte =
+                std::min(first_byte + chunk_size, code_size);
+        // Byte b of the code sits b * batch_size past the slot start.
+        for (size_t b = first_byte; b < last_byte; ++b) {
+            out[b] = src[b * batch_size];
+        }
+    }
+}
+
+PanoramaPQ::PanoramaPQ(
+        size_t d,
+        size_t code_size,
+        size_t n_levels,
+        size_t batch_size,
+        const ProductQuantizer* pq)
+        : Panorama(d, code_size, n_levels, batch_size),
+          pq(pq),
+          chunk_size(code_size / n_levels),
+          levels_size(d / n_levels) {
+    FAISS_THROW_IF_NOT_MSG(
+            code_size % n_levels == 0,
+            "PanoramaPQ: code_size must be divisible by n_levels");
+    FAISS_THROW_IF_NOT_MSG(pq != nullptr, "PanoramaPQ: pq must not be null");
+}
+
+void PanoramaPQ::compute_cumulative_sums(
+        float* cumsum_base,
+        size_t offset,
+        size_t n_entry,
+        const uint8_t* code) const {
+    // Scratch buffers are allocated once and reused for every entry.
+    // suffix[j] = sum of vec[i]^2 for i >= j; suffix[d] stays 0 throughout.
+    std::vector<float> vec(d);
+    std::vector<float> suffix(d + 1, 0.0f);
+
+    for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) {
+        size_t current_pos = offset + entry_idx;
+        size_t batch_no = current_pos / batch_size;
+        size_t pos_in_batch = current_pos % batch_size;
+
+        // Decode PQ code to float vector.
+        pq->decode(code + entry_idx * code_size, vec.data());
+
+        // Compute suffix sums of squared norms (unsigned countdown).
+        for (size_t j = d; j-- > 0;) {
+            suffix[j] = suffix[j + 1] + vec[j] * vec[j];
+        }
+
+        // Write into batch-oriented layout: one row of batch_size floats
+        // per level, holding the norm of the dimensions from that level on.
+        float* out = cumsum_base + batch_no * batch_size * (n_levels + 1) +
+                pos_in_batch;
+        for (size_t level = 0; level < n_levels; level++) {
+            size_t start_idx = level * levels_size;
+            out[level * batch_size] =
+                    start_idx < d ? std::sqrt(suffix[start_idx]) : 0.0f;
+        }
+        // Past the last level nothing remains, so the norm is zero.
+        out[n_levels * batch_size] = 0.0f;
+    }
+}
+
+} // namespace faiss
diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h
new file mode 100644
index 0000000000..1ddc0c3897
--- /dev/null
+++ b/faiss/impl/PanoramaPQ.h
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <faiss/impl/Panorama.h>
+#include <faiss/impl/ProductQuantizer.h>
+#include <faiss/impl/panorama_kernels/panorama_kernels.h>
+#include <faiss/utils/Heap.h>
+#include <cmath>
+
+#include <numeric>
+
+namespace faiss {
+
+/**
+ * Panorama for PQ-compressed vectors.
+ *
+ * Codes are PQ codes (code_size = M bytes for 8-bit PQ).
+ * compute_cumulative_sums decodes via PQ then computes suffix norms.
+ * progressive_filter_batch uses LUT accumulation with panorama_kernels.
+ */
+struct PanoramaPQ : Panorama {
+    const ProductQuantizer* pq = nullptr;
+    size_t chunk_size = 0;
+    size_t levels_size = 0;
+
+    PanoramaPQ() = default;
+    PanoramaPQ(
+            size_t d,
+            size_t code_size,
+            size_t n_levels,
+            size_t batch_size,
+            const ProductQuantizer* pq);
+
+    void copy_codes_to_level_layout(
+            uint8_t* codes,
+            size_t offset,
+            size_t n_entry,
+            const uint8_t* code) override;
+
+    void reconstruct(idx_t key, float* recons, const uint8_t* codes_base)
+            const override;
+
+    void compute_cumulative_sums(
+            float* cumsum_base,
+            size_t offset,
+            size_t n_entry,
+            const uint8_t* code) const override;
+
+    /// Progressive filtering for PQ codes: processes one batch.
+    ///
+    /// Follows the same pattern as PanoramaFlat: initializes exact_distances
+    /// with squared norms (||r||^2 from stored cum_sums + dis0), then
+    /// processes the inner-product contribution level-by-level with pruning.
+    /// The SIMD-optimized process_chunks kernel handles the init phase
+    /// (precomp table over all M subquantizers) in a single vectorized pass.
+    ///
+    /// @param col_codes       Column-major codes for this inverted list.
+    /// @param list_cum_sums   Cumulative sums for this inverted list.
+    /// @param precomp         Precomputed table slice for this list.
+    /// @param sim_table_2     -2 * inner_prod_table (query-specific LUT).
+    /// @param query_cum_norms Query suffix norms per level.
+    /// @param coarse_dis      Coarse distance (dis0) for this list.
+    /// @param list_size       Total number of vectors in this list.
+    /// @param batch_no        Which batch to process.
+    /// @param exact_distances [out] Scratch buffer for partial distances.
+    /// @param active_indices  [out] Scratch buffer for survivor indices.
+    /// @param bitset          Scratch buffer for code compression.
+    /// @param compressed_codes Scratch buffer for compressed codes.
+    /// @param threshold       Current heap threshold for pruning.
+    /// @param local_stats     [out] Accumulated pruning statistics.
+    /// @return Number of surviving candidates in active_indices.
+    template <typename C>
+    size_t progressive_filter_batch(
+            const uint8_t* col_codes,
+            const float* list_cum_sums,
+            const float* precomp,
+            const float* sim_table_2,
+            const float* query_cum_norms,
+            float coarse_dis,
+            size_t list_size,
+            size_t batch_no,
+            std::vector<float>& exact_distances,
+            std::vector<uint32_t>& active_indices,
+            std::vector<uint8_t>& bitset,
+            std::vector<uint8_t>& compressed_codes,
+            float threshold,
+            PanoramaStats& local_stats) const;
+};
+
+template <typename C>
+size_t PanoramaPQ::progressive_filter_batch(
+        const uint8_t* col_codes,
+        const float* list_cum_sums,
+        const float* precomp,
+        const float* sim_table_2,
+        const float* query_cum_norms,
+        float coarse_dis,
+        size_t list_size,
+        size_t batch_no,
+        std::vector<float>& exact_distances,
+        std::vector<uint32_t>& active_indices,
+        std::vector<uint8_t>& bitset,
+        std::vector<uint8_t>& compressed_codes,
+        float threshold,
+        PanoramaStats& local_stats) const {
+    const size_t bs = batch_size;
+    const size_t cs = chunk_size;
+    const size_t M = pq->M;
+    const size_t ksub = pq->ksub;
+
+    size_t curr_batch_size = std::min(list_size - batch_no * bs, bs);
+    size_t b_offset = batch_no * bs;
+
+    // Initialize active set.
+    std::iota(
+            active_indices.begin(),
+            active_indices.begin() + curr_batch_size,
+            b_offset);
+    std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1);
+    std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0);
+
+    const uint8_t* batch_codes = col_codes + b_offset * code_size;
+
+    // SIMD init: compute precomp distances for all M subquantizers.
+    // process_chunks naturally handles column-major codes and does
+    // cache-friendly 1KB-at-a-time table lookups with AVX-512 gathers.
+    std::fill(
+            exact_distances.begin(),
+            exact_distances.begin() + curr_batch_size,
+            0.0f);
+    panorama_kernels::process_chunks(
+            M,
+            bs,
+            curr_batch_size,
+            const_cast<float*>(precomp),
+            const_cast<uint8_t*>(batch_codes),
+            exact_distances.data());
+
+    const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1);
+
+    size_t next_num_active = curr_batch_size;
+    size_t batch_offset = batch_no * bs;
+    size_t total_active = next_num_active;
+
+    for (size_t level = 0; level < n_levels && next_num_active > 0; level++) {
+        local_stats.total_dims_scanned += next_num_active;
+        local_stats.total_dims += total_active;
+
+        size_t level_sim_offset = level * ksub * cs;
+
+        float query_cum_norm = 2 * query_cum_norms[level + 1];
+
+        const float* cum_sums_level = batch_cums + bs * (level + 1);
+        const uint8_t* codes_level = batch_codes + bs * cs * level;
+
+        const float* sim_table_level = sim_table_2 + level_sim_offset;
+
+        bool is_sparse = next_num_active < bs / 16;
+
+        size_t num_active_for_filtering = 0;
+        if (is_sparse) {
+            for (size_t ci = 0; ci < cs; ci++) {
+                size_t chunk_off = ci * bs;
+                const float* chunk_sim = sim_table_level + ci * ksub;
+                for (size_t i = 0; i < next_num_active; i++) {
+                    size_t real_idx = active_indices[i] - batch_offset;
+                    exact_distances[i] +=
+                            chunk_sim[codes_level[chunk_off + real_idx]];
+                }
+            }
+            num_active_for_filtering = next_num_active;
+        } else {
+            auto [cc, na] = panorama_kernels::process_code_compression(
+                    next_num_active,
+                    bs,
+                    cs,
+                    compressed_codes.data(),
+                    bitset.data(),
+                    codes_level);
+
+            panorama_kernels::process_chunks(
+                    cs,
+                    bs,
+                    na,
+                    const_cast<float*>(sim_table_level),
+                    cc,
+                    exact_distances.data());
+            num_active_for_filtering = na;
+        }
+
+        next_num_active = panorama_kernels::process_filtering(
+                num_active_for_filtering,
+                exact_distances.data(),
+                active_indices.data(),
+                const_cast<float*>(cum_sums_level),
+                bitset.data(),
+                batch_offset,
+                coarse_dis,
+                query_cum_norm,
+                threshold);
+    }
+
+    return next_num_active;
+}
+
+} // namespace faiss

From fcb0cdf60ee6fe23da2ae46e8685f85ea37a82a4 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Fri, 20 Mar 2026 05:01:10 +0000
Subject: [PATCH 13/41] Precompute per-point init distances for PanoramaPQ

---
 benchs/bench_ivfpq_panorama.py   | 60 ++++++++++++++++----------------
 faiss/IndexIVFPQPanorama.cpp     |  7 ++--
 faiss/impl/PanoramaPQ.cpp        | 31 ++++++++++++++++-
 faiss/impl/PanoramaPQ.h          | 49 +++++++++++++-------------
 faiss/impl/index_read.cpp        | 15 +++++++-
 faiss/impl/index_write.cpp       | 11 +++++-
 faiss/invlists/InvertedLists.cpp | 34 +++++++++++++++---
 faiss/invlists/InvertedLists.h   |  2 ++
 8 files changed, 143 insertions(+), 66 deletions(-)

diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py
index 5ae4fc2152..eafeebb7e8 100644
--- a/benchs/bench_ivfpq_panorama.py
+++ b/benchs/bench_ivfpq_panorama.py
@@ -17,7 +17,7 @@ def fvecs_read(fname):
 
 
 GIST_DIR = "/datasets/PCA_init"
-CACHE_DIR = "/home/lutex/faiss-panorama/index_cache"
+CACHE_DIR = "/home/akash/faiss-panorama/index_cache"
 os.makedirs(CACHE_DIR, exist_ok=True)
 
 IVFPQ_CACHE = os.path.join(CACHE_DIR, "ivfpq_10pct.index")
@@ -80,35 +80,35 @@ def eval_recall(index, nprobe_val):
 
 faiss.omp_set_num_threads(mp.cpu_count())
 
-# --- IVFPQ baseline (cached) ---
-if os.path.exists(IVFPQ_CACHE):
-    print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True)
-    t0 = time.time()
-    ivfpq = faiss.read_index(IVFPQ_CACHE)
-    print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
-else:
-    print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True)
-    quantizer = faiss.IndexFlatL2(d)
-    ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
-    t0 = time.time()
-    ivfpq.train(xt)
-    print(f"  Training took {time.time() - t0:.1f}s", flush=True)
-
-    print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
-    faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE)
-
-    t0 = time.time()
-    ivfpq.add(xb)
-    print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
-
-    print(f"  Saving full index to {IVFPQ_CACHE}...", flush=True)
-    faiss.write_index(ivfpq, IVFPQ_CACHE)
-
-faiss.omp_set_num_threads(1)
-print("\n====== IVFPQ baseline", flush=True)
-for nprobe in [1, 2, 4, 8, 16]:
-    ivfpq.nprobe = nprobe
-    eval_recall(ivfpq, nprobe)
+# # --- IVFPQ baseline (cached) ---
+# if os.path.exists(IVFPQ_CACHE):
+#     print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True)
+#     t0 = time.time()
+#     ivfpq = faiss.read_index(IVFPQ_CACHE)
+#     print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
+# else:
+#     print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True)
+#     quantizer = faiss.IndexFlatL2(d)
+#     ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
+#     t0 = time.time()
+#     ivfpq.train(xt)
+#     print(f"  Training took {time.time() - t0:.1f}s", flush=True)
+
+#     print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
+#     faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE)
+
+#     t0 = time.time()
+#     ivfpq.add(xb)
+#     print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
+
+#     print(f"  Saving full index to {IVFPQ_CACHE}...", flush=True)
+#     faiss.write_index(ivfpq, IVFPQ_CACHE)
+
+# faiss.omp_set_num_threads(1)
+# print("\n====== IVFPQ baseline", flush=True)
+# for nprobe in [1, 2, 4, 8, 16]:
+#     ivfpq.nprobe = nprobe
+#     eval_recall(ivfpq, nprobe)
 
 # --- IVFPQPanorama (cached) ---
 faiss.omp_set_num_threads(mp.cpu_count())
diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp
index 504958b544..6dcbdad261 100644
--- a/faiss/IndexIVFPQPanorama.cpp
+++ b/faiss/IndexIVFPQPanorama.cpp
@@ -49,7 +49,7 @@ IndexIVFPQPanorama::IndexIVFPQPanorama(
             M == code_size, "M must equal code_size for 8-bit PQ");
     FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only L2 metric supported");
 
-    auto* pano = new PanoramaPQ(d, code_size, n_levels, batch_size, &pq);
+    auto* pano = new PanoramaPQ(d, code_size, n_levels, batch_size, &pq, quantizer);
     this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano);
     this->own_invlists = own_invlists;
 }
@@ -136,8 +136,7 @@ struct IVFPQScannerPanorama : InvertedListScanner {
         const size_t n_batches = (list_size + bs - 1) / bs;
         const uint8_t* col_codes = storage->get_codes(list_no);
         const float* list_cum_sums = storage->get_cum_sums(list_no);
-        const float* precomp =
-                index.precomputed_table.data() + list_no * pq.M * pq.ksub;
+        const float* list_init_dists = storage->get_init_dists(list_no);
 
         // Scratch buffers.
         std::vector<float> exact_distances(bs);
@@ -153,7 +152,7 @@ struct IVFPQScannerPanorama : InvertedListScanner {
             size_t num_active = pano_pq->progressive_filter_batch<C>(
                     col_codes,
                     list_cum_sums,
-                    precomp,
+                    list_init_dists,
                     sim_table_2.data(),
                     query_cum_norms.data(),
                     dis0,
diff --git a/faiss/impl/PanoramaPQ.cpp b/faiss/impl/PanoramaPQ.cpp
index 832bfaf91d..02e70967b0 100644
--- a/faiss/impl/PanoramaPQ.cpp
+++ b/faiss/impl/PanoramaPQ.cpp
@@ -71,9 +71,11 @@ PanoramaPQ::PanoramaPQ(
         size_t code_size,
         size_t n_levels,
         size_t batch_size,
-        const ProductQuantizer* pq)
+        const ProductQuantizer* pq,
+        const Index* quantizer)
         : Panorama(d, code_size, n_levels, batch_size),
           pq(pq),
+          quantizer(quantizer),
           chunk_size(code_size / n_levels),
           levels_size(d / n_levels) {
     FAISS_THROW_IF_NOT_MSG(
@@ -119,4 +121,31 @@ void PanoramaPQ::compute_cumulative_sums(
     }
 }
 
+void PanoramaPQ::compute_init_distances(
+        float* init_dists_base,
+        size_t list_no,
+        size_t offset,
+        size_t n_entry,
+        const uint8_t* code) const {
+    FAISS_THROW_IF_NOT_MSG(
+            quantizer != nullptr,
+            "PanoramaPQ: quantizer required for compute_init_distances");
+
+    // c: centroid rebuilt from the quantizer; r: PQ-decoded vector (reused).
+    std::vector<float> centroid(d);
+    std::vector<float> vec(d);
+    quantizer->reconstruct(list_no, centroid.data());
+
+    for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) {
+        pq->decode(code + entry_idx * code_size, vec.data());
+
+        // ||r||^2 + 2<r, c>, accumulated one dimension at a time.
+        float init_dist = 0.0f;
+        for (size_t j = 0; j < d; j++) {
+            init_dist += vec[j] * vec[j] + 2 * vec[j] * centroid[j];
+        }
+        init_dists_base[offset + entry_idx] = init_dist;
+    }
+}
+
 } // namespace faiss
diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h
index 1ddc0c3897..91d443b840 100644
--- a/faiss/impl/PanoramaPQ.h
+++ b/faiss/impl/PanoramaPQ.h
@@ -7,6 +7,7 @@
 
 #pragma once
 
+#include <faiss/Index.h>
 #include <faiss/impl/Panorama.h>
 #include <faiss/impl/ProductQuantizer.h>
 #include <faiss/impl/panorama_kernels/panorama_kernels.h>
@@ -26,6 +27,7 @@ namespace faiss {
  */
 struct PanoramaPQ : Panorama {
     const ProductQuantizer* pq = nullptr;
+    const Index* quantizer = nullptr;
     size_t chunk_size = 0;
     size_t levels_size = 0;
 
@@ -35,7 +37,8 @@ struct PanoramaPQ : Panorama {
             size_t code_size,
             size_t n_levels,
             size_t batch_size,
-            const ProductQuantizer* pq);
+            const ProductQuantizer* pq,
+            const Index* quantizer = nullptr);
 
     void copy_codes_to_level_layout(
             uint8_t* codes,
@@ -52,17 +55,25 @@ struct PanoramaPQ : Panorama {
             size_t n_entry,
             const uint8_t* code) const override;
 
+    /// Precompute per-point init distances: ||r||^2 + 2<r, c>.
+    /// Requires quantizer to be set. Layout is flat per-list,
+    /// padded to batch_size boundaries.
+    void compute_init_distances(
+            float* init_dists_base,
+            size_t list_no,
+            size_t offset,
+            size_t n_entry,
+            const uint8_t* code) const;
+
     /// Progressive filtering for PQ codes: processes one batch.
     ///
-    /// Follows the same pattern as PanoramaFlat: initializes exact_distances
-    /// with squared norms (||r||^2 from stored cum_sums + dis0), then
-    /// processes the inner-product contribution level-by-level with pruning.
-    /// The SIMD-optimized process_chunks kernel handles the init phase
-    /// (precomp table over all M subquantizers) in a single vectorized pass.
+    /// Initializes exact_distances from precomputed init_dists
+    /// (||r||^2 + 2<r, c>), then refines with the query-specific
+    /// sim_table_2 level-by-level with Cauchy-Schwarz pruning.
     ///
     /// @param col_codes       Column-major codes for this inverted list.
     /// @param list_cum_sums   Cumulative sums for this inverted list.
-    /// @param precomp         Precomputed table slice for this list.
+    /// @param init_dists      Precomputed init distances for this list.
     /// @param sim_table_2     -2 * inner_prod_table (query-specific LUT).
     /// @param query_cum_norms Query suffix norms per level.
     /// @param coarse_dis      Coarse distance (dis0) for this list.
@@ -79,7 +90,7 @@ struct PanoramaPQ : Panorama {
     size_t progressive_filter_batch(
             const uint8_t* col_codes,
             const float* list_cum_sums,
-            const float* precomp,
+            const float* init_dists,
             const float* sim_table_2,
             const float* query_cum_norms,
             float coarse_dis,
@@ -97,7 +108,7 @@ template <typename C>
 size_t PanoramaPQ::progressive_filter_batch(
         const uint8_t* col_codes,
         const float* list_cum_sums,
-        const float* precomp,
+        const float* init_dists,
         const float* sim_table_2,
         const float* query_cum_norms,
         float coarse_dis,
@@ -111,7 +122,6 @@ size_t PanoramaPQ::progressive_filter_batch(
         PanoramaStats& local_stats) const {
     const size_t bs = batch_size;
     const size_t cs = chunk_size;
-    const size_t M = pq->M;
     const size_t ksub = pq->ksub;
 
     size_t curr_batch_size = std::min(list_size - batch_no * bs, bs);
@@ -127,26 +137,15 @@ size_t PanoramaPQ::progressive_filter_batch(
 
     const uint8_t* batch_codes = col_codes + b_offset * code_size;
 
-    // SIMD init: compute precomp distances for all M subquantizers.
-    // process_chunks naturally handles column-major codes and does
-    // cache-friendly 1KB-at-a-time table lookups with AVX-512 gathers.
-    std::fill(
-            exact_distances.begin(),
-            exact_distances.begin() + curr_batch_size,
-            0.0f);
-    panorama_kernels::process_chunks(
-            M,
-            bs,
-            curr_batch_size,
-            const_cast<float*>(precomp),
-            const_cast<uint8_t*>(batch_codes),
-            exact_distances.data());
+    // Load precomputed init distances (||r||^2 + 2<r, c>).
+    const float* batch_init = init_dists + b_offset;
+    std::copy(batch_init, batch_init + curr_batch_size, exact_distances.begin());
 
     const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1);
 
     size_t next_num_active = curr_batch_size;
     size_t batch_offset = batch_no * bs;
-    size_t total_active = next_num_active;
+    const size_t total_active = next_num_active;
 
     for (size_t level = 0; level < n_levels && next_num_active > 0; level++) {
         local_stats.total_dims_scanned += next_num_active;
diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index 7cc9ee9d08..88d07845e8 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -410,6 +410,9 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
                 nlist, code_size, pano);
         std::vector<size_t> sizes(nlist);
         read_ArrayInvertedLists_sizes(f, sizes);
+
+        bool has_init_dists;
+        READ1(has_init_dists);
         for (size_t i = 0; i < nlist; i++) {
             ailp->ids[i].resize(sizes[i]);
             size_t num_elems =
@@ -418,6 +421,9 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
                     ArrayInvertedListsPanorama::kBatchSize;
             ailp->codes[i].resize(num_elems * code_size);
             ailp->cum_sums[i].resize(num_elems * (n_levels + 1));
+            if (has_init_dists) {
+                ailp->init_dists[i].resize(num_elems);
+            }
         }
         for (size_t i = 0; i < nlist; i++) {
             size_t n = sizes[i];
@@ -427,6 +433,12 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
                 read_vector_with_known_size(ailp->ids[i], f, n);
                 read_vector_with_known_size(
                         ailp->cum_sums[i], f, ailp->cum_sums[i].size());
+                if (has_init_dists) {
+                    read_vector_with_known_size(
+                            ailp->init_dists[i],
+                            f,
+                            ailp->init_dists[i].size());
+                }
             }
         }
         return ailp;
@@ -1392,7 +1404,8 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
                     ivpp->code_size,
                     ivpp->n_levels,
                     ivpp->batch_size,
-                    &ivpp->pq));
+                    &ivpp->pq,
+                    ivpp->quantizer));
         }
         if (ivpp->is_trained) {
             ivpp->use_precomputed_table = 1;
diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp
index 02d0870bbc..2f6e1d52f7 100644
--- a/faiss/impl/index_write.cpp
+++ b/faiss/impl/index_write.cpp
@@ -284,7 +284,11 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) {
         }
         WRITEVECTOR(sizes);
 
-        // Write codes, ids, and cum_sums
+        bool has_init_dists = !ailp->init_dists.empty() &&
+                ailp->init_dists[0].size() > 0;
+        WRITE1(has_init_dists);
+
+        // Write codes, ids, cum_sums, and optionally init_dists
         for (size_t i = 0; i < ailp->nlist; i++) {
             size_t n = ailp->ids[i].size();
             if (n > 0) {
@@ -292,6 +296,11 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) {
                 WRITEANDCHECK(ailp->ids[i].data(), n);
                 WRITEANDCHECK(
                         ailp->cum_sums[i].data(), ailp->cum_sums[i].size());
+                if (has_init_dists) {
+                    WRITEANDCHECK(
+                            ailp->init_dists[i].data(),
+                            ailp->init_dists[i].size());
+                }
             }
         }
     } else if (
diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp
index 313228301c..b0256d7073 100644
--- a/faiss/invlists/InvertedLists.cpp
+++ b/faiss/invlists/InvertedLists.cpp
@@ -11,6 +11,7 @@
 #include <memory>
 
 #include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/PanoramaPQ.h>
 #include <faiss/utils/utils.h>
 
 namespace faiss {
@@ -361,6 +362,7 @@ ArrayInvertedListsPanorama::ArrayInvertedListsPanorama(
             !use_iterator, "Panorama does not support iterators");
 
     cum_sums.resize(nlist);
+    init_dists.resize(nlist);
 }
 
 const float* ArrayInvertedListsPanorama::get_cum_sums(size_t list_no) const {
@@ -368,6 +370,11 @@ const float* ArrayInvertedListsPanorama::get_cum_sums(size_t list_no) const {
     return cum_sums[list_no].data();
 }
 
+const float* ArrayInvertedListsPanorama::get_init_dists(size_t list_no) const {
+    assert(list_no < nlist); // bounds check, compiled out under NDEBUG
+    return init_dists[list_no].data(); // borrowed; list keeps ownership
+}
+
 size_t ArrayInvertedListsPanorama::add_entries(
         size_t list_no,
         size_t n_entry,
@@ -381,12 +388,20 @@ size_t ArrayInvertedListsPanorama::add_entries(
 
     size_t new_size = o + n_entry;
     size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize;
-    codes[list_no].resize(num_batches * kBatchSize * code_size);
-    cum_sums[list_no].resize(num_batches * kBatchSize * (pano->n_levels + 1));
+    size_t padded = num_batches * kBatchSize;
+    codes[list_no].resize(padded * code_size);
+    cum_sums[list_no].resize(padded * (pano->n_levels + 1));
 
     pano->copy_codes_to_level_layout(codes[list_no].data(), o, n_entry, code);
     pano->compute_cumulative_sums(cum_sums[list_no].data(), o, n_entry, code);
 
+    auto* pano_pq = dynamic_cast<PanoramaPQ*>(pano.get());
+    if (pano_pq) {
+        init_dists[list_no].resize(padded);
+        pano_pq->compute_init_distances(
+                init_dists[list_no].data(), list_no, o, n_entry, code);
+    }
+
     return o;
 }
 
@@ -405,14 +420,25 @@ void ArrayInvertedListsPanorama::update_entries(
             codes[list_no].data(), offset, n_entry, code);
     pano->compute_cumulative_sums(
             cum_sums[list_no].data(), offset, n_entry, code);
+
+    auto* pano_pq = dynamic_cast<PanoramaPQ*>(pano.get());
+    if (pano_pq) {
+        pano_pq->compute_init_distances(
+                init_dists[list_no].data(), list_no, offset, n_entry, code);
+    }
 }
 
 void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) {
     ids[list_no].resize(new_size);
 
     size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize;
-    codes[list_no].resize(num_batches * kBatchSize * code_size);
-    cum_sums[list_no].resize(num_batches * kBatchSize * (pano->n_levels + 1));
+    size_t padded = num_batches * kBatchSize;
+    codes[list_no].resize(padded * code_size);
+    cum_sums[list_no].resize(padded * (pano->n_levels + 1));
+    // PQ lists need init_dists sized even when the list was empty before.
+    if (dynamic_cast<PanoramaPQ*>(pano.get()) != nullptr) {
+        init_dists[list_no].resize(padded);
+    }
 }
 
 const uint8_t* ArrayInvertedListsPanorama::get_single_code(
diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h
index edc29995b9..842344e50c 100644
--- a/faiss/invlists/InvertedLists.h
+++ b/faiss/invlists/InvertedLists.h
@@ -285,12 +285,14 @@ struct ArrayInvertedLists : InvertedLists {
 struct ArrayInvertedListsPanorama : ArrayInvertedLists {
     static constexpr size_t kBatchSize = 128;
     std::vector<MaybeOwnedVector<float>> cum_sums;
+    std::vector<MaybeOwnedVector<float>> init_dists;
     std::unique_ptr<Panorama> pano;
 
     /// Takes ownership of the provided Panorama*.
     ArrayInvertedListsPanorama(size_t nlist, size_t code_size, Panorama* pano);
 
     const float* get_cum_sums(size_t list_no) const;
+    const float* get_init_dists(size_t list_no) const;
 
     size_t add_entries(
             size_t list_no,

From f1ea5963a9153f5a3a8e55827c04b15cdcfe8b7c Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Fri, 20 Mar 2026 05:11:38 +0000
Subject: [PATCH 14/41] Fix levels as well

---
 faiss/impl/PanoramaPQ.h | 186 +++++++++++++++++++---------------------
 1 file changed, 87 insertions(+), 99 deletions(-)

diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h
index 91d443b840..fcf2d59c61 100644
--- a/faiss/impl/PanoramaPQ.h
+++ b/faiss/impl/PanoramaPQ.h
@@ -101,111 +101,99 @@ struct PanoramaPQ : Panorama {
             std::vector<uint8_t>& bitset,
             std::vector<uint8_t>& compressed_codes,
             float threshold,
-            PanoramaStats& local_stats) const;
-};
+            PanoramaStats& local_stats) const {
+        const size_t bs = batch_size;
+        const size_t cs = chunk_size;
+        const size_t ksub = pq->ksub;
+
+        size_t curr_batch_size = std::min(list_size - batch_no * bs, bs);
+        size_t b_offset = batch_no * bs;
+
+        // Initialize active set.
+        std::iota(
+                active_indices.begin(),
+                active_indices.begin() + curr_batch_size,
+                b_offset);
+        std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1);
+        std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0);
+
+        const uint8_t* batch_codes = col_codes + b_offset * code_size;
+
+        // Load precomputed init distances (||r||^2 + 2<r, c>).
+        const float* batch_init = init_dists + b_offset;
+        std::copy(
+                batch_init,
+                batch_init + curr_batch_size,
+                exact_distances.begin());
+
+        const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1);
+
+        size_t next_num_active = curr_batch_size;
+        size_t batch_offset = batch_no * bs;
+        const size_t total_active = next_num_active;
+
+        local_stats.total_dims += total_active * n_levels;
+
+        for (size_t level = 0; level < n_levels && next_num_active > 0;
+             level++) {
+            local_stats.total_dims_scanned += next_num_active;
 
-template <typename C>
-size_t PanoramaPQ::progressive_filter_batch(
-        const uint8_t* col_codes,
-        const float* list_cum_sums,
-        const float* init_dists,
-        const float* sim_table_2,
-        const float* query_cum_norms,
-        float coarse_dis,
-        size_t list_size,
-        size_t batch_no,
-        std::vector<float>& exact_distances,
-        std::vector<uint32_t>& active_indices,
-        std::vector<uint8_t>& bitset,
-        std::vector<uint8_t>& compressed_codes,
-        float threshold,
-        PanoramaStats& local_stats) const {
-    const size_t bs = batch_size;
-    const size_t cs = chunk_size;
-    const size_t ksub = pq->ksub;
-
-    size_t curr_batch_size = std::min(list_size - batch_no * bs, bs);
-    size_t b_offset = batch_no * bs;
-
-    // Initialize active set.
-    std::iota(
-            active_indices.begin(),
-            active_indices.begin() + curr_batch_size,
-            b_offset);
-    std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1);
-    std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0);
-
-    const uint8_t* batch_codes = col_codes + b_offset * code_size;
-
-    // Load precomputed init distances (||r||^2 + 2<r, c>).
-    const float* batch_init = init_dists + b_offset;
-    std::copy(batch_init, batch_init + curr_batch_size, exact_distances.begin());
-
-    const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1);
-
-    size_t next_num_active = curr_batch_size;
-    size_t batch_offset = batch_no * bs;
-    const size_t total_active = next_num_active;
-
-    for (size_t level = 0; level < n_levels && next_num_active > 0; level++) {
-        local_stats.total_dims_scanned += next_num_active;
-        local_stats.total_dims += total_active;
-
-        size_t level_sim_offset = level * ksub * cs;
-
-        float query_cum_norm = 2 * query_cum_norms[level + 1];
-
-        const float* cum_sums_level = batch_cums + bs * (level + 1);
-        const uint8_t* codes_level = batch_codes + bs * cs * level;
-
-        const float* sim_table_level = sim_table_2 + level_sim_offset;
-
-        bool is_sparse = next_num_active < bs / 16;
-
-        size_t num_active_for_filtering = 0;
-        if (is_sparse) {
-            for (size_t ci = 0; ci < cs; ci++) {
-                size_t chunk_off = ci * bs;
-                const float* chunk_sim = sim_table_level + ci * ksub;
-                for (size_t i = 0; i < next_num_active; i++) {
-                    size_t real_idx = active_indices[i] - batch_offset;
-                    exact_distances[i] +=
-                            chunk_sim[codes_level[chunk_off + real_idx]];
+            size_t level_sim_offset = level * ksub * cs;
+
+            float query_cum_norm = 2 * query_cum_norms[level + 1];
+
+            const float* cum_sums_level = batch_cums + bs * (level + 1);
+            const uint8_t* codes_level = batch_codes + bs * cs * level;
+
+            const float* sim_table_level = sim_table_2 + level_sim_offset;
+
+            bool is_sparse = next_num_active < bs / 16;
+
+            size_t num_active_for_filtering = 0;
+            if (is_sparse) {
+                for (size_t ci = 0; ci < cs; ci++) {
+                    size_t chunk_off = ci * bs;
+                    const float* chunk_sim = sim_table_level + ci * ksub;
+                    for (size_t i = 0; i < next_num_active; i++) {
+                        size_t real_idx = active_indices[i] - batch_offset;
+                        exact_distances[i] +=
+                                chunk_sim[codes_level[chunk_off + real_idx]];
+                    }
                 }
+                num_active_for_filtering = next_num_active;
+            } else {
+                auto [cc, na] = panorama_kernels::process_code_compression(
+                        next_num_active,
+                        bs,
+                        cs,
+                        compressed_codes.data(),
+                        bitset.data(),
+                        codes_level);
+
+                panorama_kernels::process_chunks(
+                        cs,
+                        bs,
+                        na,
+                        const_cast<float*>(sim_table_level),
+                        cc,
+                        exact_distances.data());
+                num_active_for_filtering = na;
             }
-            num_active_for_filtering = next_num_active;
-        } else {
-            auto [cc, na] = panorama_kernels::process_code_compression(
-                    next_num_active,
-                    bs,
-                    cs,
-                    compressed_codes.data(),
+
+            next_num_active = panorama_kernels::process_filtering(
+                    num_active_for_filtering,
+                    exact_distances.data(),
+                    active_indices.data(),
+                    const_cast<float*>(cum_sums_level),
                     bitset.data(),
-                    codes_level);
-
-            panorama_kernels::process_chunks(
-                    cs,
-                    bs,
-                    na,
-                    const_cast<float*>(sim_table_level),
-                    cc,
-                    exact_distances.data());
-            num_active_for_filtering = na;
+                    batch_offset,
+                    coarse_dis,
+                    query_cum_norm,
+                    threshold);
         }
 
-        next_num_active = panorama_kernels::process_filtering(
-                num_active_for_filtering,
-                exact_distances.data(),
-                active_indices.data(),
-                const_cast<float*>(cum_sums_level),
-                bitset.data(),
-                batch_offset,
-                coarse_dis,
-                query_cum_norm,
-                threshold);
+        return next_num_active;
     }
-
-    return next_num_active;
-}
+};
 
 } // namespace faiss

From a34ceed5a456762743906cf19e843c35993be701 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Fri, 20 Mar 2026 05:28:00 +0000
Subject: [PATCH 15/41] More cleanup

---
 faiss/impl/index_read.cpp        | 44 ++++++++++++++++++++++----------
 faiss/impl/index_write.cpp       | 11 +-------
 faiss/invlists/InvertedLists.cpp |  2 +-
 3 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index 88d07845e8..21d76e35c4 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -411,8 +411,6 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
         std::vector<size_t> sizes(nlist);
         read_ArrayInvertedLists_sizes(f, sizes);
 
-        bool has_init_dists;
-        READ1(has_init_dists);
         for (size_t i = 0; i < nlist; i++) {
             ailp->ids[i].resize(sizes[i]);
             size_t num_elems =
@@ -421,9 +419,6 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
                     ArrayInvertedListsPanorama::kBatchSize;
             ailp->codes[i].resize(num_elems * code_size);
             ailp->cum_sums[i].resize(num_elems * (n_levels + 1));
-            if (has_init_dists) {
-                ailp->init_dists[i].resize(num_elems);
-            }
         }
         for (size_t i = 0; i < nlist; i++) {
             size_t n = sizes[i];
@@ -433,12 +428,6 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
                 read_vector_with_known_size(ailp->ids[i], f, n);
                 read_vector_with_known_size(
                         ailp->cum_sums[i], f, ailp->cum_sums[i].size());
-                if (has_init_dists) {
-                    read_vector_with_known_size(
-                            ailp->init_dists[i],
-                            f,
-                            ailp->init_dists[i].size());
-                }
             }
         }
         return ailp;
@@ -1399,13 +1388,42 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         auto* storage =
                 dynamic_cast<ArrayInvertedListsPanorama*>(ivpp->invlists);
         if (storage) {
-            storage->pano.reset(new PanoramaPQ(
+            auto* pano_pq = new PanoramaPQ(
                     ivpp->d,
                     ivpp->code_size,
                     ivpp->n_levels,
                     ivpp->batch_size,
                     &ivpp->pq,
-                    ivpp->quantizer));
+                    ivpp->quantizer);
+            storage->pano.reset(pano_pq);
+
+            // Recompute init_dists from stored codes + quantizer.
+            for (size_t list_no = 0; list_no < ivpp->nlist; list_no++) {
+                size_t list_size = storage->ids[list_no].size();
+                if (list_size == 0)
+                    continue;
+                size_t padded =
+                        ((list_size +
+                          ArrayInvertedListsPanorama::kBatchSize - 1) /
+                         ArrayInvertedListsPanorama::kBatchSize) *
+                        ArrayInvertedListsPanorama::kBatchSize;
+                storage->init_dists[list_no].resize(padded);
+
+                // Reconstruct row-major codes, then compute init distances.
+                std::vector<uint8_t> row_code(ivpp->code_size);
+                for (size_t i = 0; i < list_size; i++) {
+                    pano_pq->reconstruct(
+                            i,
+                            reinterpret_cast<float*>(row_code.data()),
+                            storage->codes[list_no].data());
+                    pano_pq->compute_init_distances(
+                            storage->init_dists[list_no].data(),
+                            list_no,
+                            i,
+                            1,
+                            row_code.data());
+                }
+            }
         }
         if (ivpp->is_trained) {
             ivpp->use_precomputed_table = 1;
diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp
index 2f6e1d52f7..02d0870bbc 100644
--- a/faiss/impl/index_write.cpp
+++ b/faiss/impl/index_write.cpp
@@ -284,11 +284,7 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) {
         }
         WRITEVECTOR(sizes);
 
-        bool has_init_dists = !ailp->init_dists.empty() &&
-                ailp->init_dists[0].size() > 0;
-        WRITE1(has_init_dists);
-
-        // Write codes, ids, cum_sums, and optionally init_dists
+        // Write codes, ids, and cum_sums
         for (size_t i = 0; i < ailp->nlist; i++) {
             size_t n = ailp->ids[i].size();
             if (n > 0) {
@@ -296,11 +292,6 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) {
                 WRITEANDCHECK(ailp->ids[i].data(), n);
                 WRITEANDCHECK(
                         ailp->cum_sums[i].data(), ailp->cum_sums[i].size());
-                if (has_init_dists) {
-                    WRITEANDCHECK(
-                            ailp->init_dists[i].data(),
-                            ailp->init_dists[i].size());
-                }
             }
         }
     } else if (
diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp
index b0256d7073..e9f137f6b7 100644
--- a/faiss/invlists/InvertedLists.cpp
+++ b/faiss/invlists/InvertedLists.cpp
@@ -436,7 +436,7 @@ void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) {
     codes[list_no].resize(padded * code_size);
     cum_sums[list_no].resize(padded * (pano->n_levels + 1));
 
-    if (init_dists[list_no].size() > 0) {
+    if (dynamic_cast<PanoramaPQ*>(pano.get())) {
         init_dists[list_no].resize(padded);
     }
 }

From 2a6b0f78c4c7f0a14c4dfc4d1c2c50dd784878f7 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sat, 21 Mar 2026 04:34:14 +0000
Subject: [PATCH 16/41] Batch size

---
 faiss/IndexIVFFlatPanorama.cpp   | 13 ++++++-------
 faiss/impl/index_read.cpp        | 16 ++++++----------
 faiss/invlists/InvertedLists.cpp | 10 ++++++----
 faiss/invlists/InvertedLists.h   |  1 -
 4 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/faiss/IndexIVFFlatPanorama.cpp b/faiss/IndexIVFFlatPanorama.cpp
index 5e678be28c..2dc7cd5594 100644
--- a/faiss/IndexIVFFlatPanorama.cpp
+++ b/faiss/IndexIVFFlatPanorama.cpp
@@ -38,8 +38,7 @@ IndexIVFFlatPanorama::IndexIVFFlatPanorama(
     // We construct the inverted lists here so that we can use the
     // level-oriented storage. This does not cause a leak as we constructed
     // IndexIVF first, with own_invlists set to false.
-    auto* pano = new PanoramaFlat(
-            d, n_levels, ArrayInvertedListsPanorama::kBatchSize);
+    auto* pano = new PanoramaFlat(d, n_levels, 128);
     this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano);
     this->own_invlists = own_invlists;
 }
@@ -100,19 +99,19 @@ struct IVFFlatScannerPanorama : InvertedListScanner {
             ResultHandler& handler) const override {
         size_t nup = 0;
 
-        const size_t n_batches =
-                (list_size + storage->kBatchSize - 1) / storage->kBatchSize;
+        const size_t bs = pano_flat->batch_size;
+        const size_t n_batches = (list_size + bs - 1) / bs;
 
         const float* cum_sums_data = storage->get_cum_sums(list_no);
 
-        std::vector<float> exact_distances(storage->kBatchSize);
-        std::vector<uint32_t> active_indices(storage->kBatchSize);
+        std::vector<float> exact_distances(bs);
+        std::vector<uint32_t> active_indices(bs);
 
         PanoramaStats local_stats;
         local_stats.reset();
 
         for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
-            size_t batch_start = batch_no * storage->kBatchSize;
+            size_t batch_start = batch_no * bs;
 
             size_t num_active = with_metric_type(metric, [&]<MetricType M>() {
                 return pano_flat->progressive_filter_batch<C, M>(
diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index 21d76e35c4..e5632c9d84 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -402,10 +402,9 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
         FAISS_CHECK_DESERIALIZATION_LOOP_LIMIT(nlist, "ilpn nlist");
         READ1(code_size);
         READ1(n_levels);
+        constexpr size_t kFlatBatchSize = 128;
         auto* pano = new PanoramaFlat(
-                code_size / sizeof(float),
-                n_levels,
-                ArrayInvertedListsPanorama::kBatchSize);
+                code_size / sizeof(float), n_levels, kFlatBatchSize);
         auto ailp = std::make_unique<ArrayInvertedListsPanorama>(
                 nlist, code_size, pano);
         std::vector<size_t> sizes(nlist);
@@ -414,9 +413,8 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
         for (size_t i = 0; i < nlist; i++) {
             ailp->ids[i].resize(sizes[i]);
             size_t num_elems =
-                    ((sizes[i] + ArrayInvertedListsPanorama::kBatchSize - 1) /
-                     ArrayInvertedListsPanorama::kBatchSize) *
-                    ArrayInvertedListsPanorama::kBatchSize;
+                    ((sizes[i] + kFlatBatchSize - 1) / kFlatBatchSize) *
+                    kFlatBatchSize;
             ailp->codes[i].resize(num_elems * code_size);
             ailp->cum_sums[i].resize(num_elems * (n_levels + 1));
         }
@@ -1402,11 +1400,9 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
                 size_t list_size = storage->ids[list_no].size();
                 if (list_size == 0)
                     continue;
+                size_t bs = pano_pq->batch_size;
                 size_t padded =
-                        ((list_size +
-                          ArrayInvertedListsPanorama::kBatchSize - 1) /
-                         ArrayInvertedListsPanorama::kBatchSize) *
-                        ArrayInvertedListsPanorama::kBatchSize;
+                        ((list_size + bs - 1) / bs) * bs;
                 storage->init_dists[list_no].resize(padded);
 
                 // Reconstruct row-major codes, then compute init distances.
diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp
index e9f137f6b7..0cdb7a2b07 100644
--- a/faiss/invlists/InvertedLists.cpp
+++ b/faiss/invlists/InvertedLists.cpp
@@ -387,8 +387,9 @@ size_t ArrayInvertedListsPanorama::add_entries(
     memcpy(&ids[list_no][o], ids_in, sizeof(ids_in[0]) * n_entry);
 
     size_t new_size = o + n_entry;
-    size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize;
-    size_t padded = num_batches * kBatchSize;
+    size_t bs = pano->batch_size;
+    size_t num_batches = (new_size + bs - 1) / bs;
+    size_t padded = num_batches * bs;
     codes[list_no].resize(padded * code_size);
     cum_sums[list_no].resize(padded * (pano->n_levels + 1));
 
@@ -431,8 +432,9 @@ void ArrayInvertedListsPanorama::update_entries(
 void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) {
     ids[list_no].resize(new_size);
 
-    size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize;
-    size_t padded = num_batches * kBatchSize;
+    size_t bs = pano->batch_size;
+    size_t num_batches = (new_size + bs - 1) / bs;
+    size_t padded = num_batches * bs;
     codes[list_no].resize(padded * code_size);
     cum_sums[list_no].resize(padded * (pano->n_levels + 1));
 
diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h
index 842344e50c..620c0cbbff 100644
--- a/faiss/invlists/InvertedLists.h
+++ b/faiss/invlists/InvertedLists.h
@@ -283,7 +283,6 @@ struct ArrayInvertedLists : InvertedLists {
 /// Works with both flat codes (PanoramaFlat) and PQ codes (PanoramaPQ)
 /// via the virtual Panorama interface.
 struct ArrayInvertedListsPanorama : ArrayInvertedLists {
-    static constexpr size_t kBatchSize = 128;
     std::vector<MaybeOwnedVector<float>> cum_sums;
     std::vector<MaybeOwnedVector<float>> init_dists;
     std::unique_ptr<Panorama> pano;

From 53033507f509cfede4d4c810d87ecabdc7169b0d Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sat, 21 Mar 2026 05:01:36 +0000
Subject: [PATCH 17/41] selector

---
 faiss/IndexIVFPQPanorama.cpp | 33 ++++++++++++++-------------
 faiss/impl/PanoramaPQ.h      | 43 ++++++++++++++++++++++--------------
 faiss/python/__init__.py     |  1 +
 3 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp
index 6dcbdad261..26c9eccbd3 100644
--- a/faiss/IndexIVFPQPanorama.cpp
+++ b/faiss/IndexIVFPQPanorama.cpp
@@ -149,21 +149,24 @@ struct IVFPQScannerPanorama : InvertedListScanner {
         local_stats.reset();
 
         for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
-            size_t num_active = pano_pq->progressive_filter_batch<C>(
-                    col_codes,
-                    list_cum_sums,
-                    list_init_dists,
-                    sim_table_2.data(),
-                    query_cum_norms.data(),
-                    dis0,
-                    list_size,
-                    batch_no,
-                    exact_distances,
-                    active_indices,
-                    bitset,
-                    compressed_codes,
-                    distances[0],
-                    local_stats);
+            size_t num_active =
+                    pano_pq->progressive_filter_batch<C, use_sel>(
+                            col_codes,
+                            list_cum_sums,
+                            list_init_dists,
+                            sim_table_2.data(),
+                            query_cum_norms.data(),
+                            dis0,
+                            list_size,
+                            batch_no,
+                            ids,
+                            sel,
+                            exact_distances,
+                            active_indices,
+                            bitset,
+                            compressed_codes,
+                            distances[0],
+                            local_stats);
 
             // Insert surviving candidates into heap.
             for (size_t i = 0; i < num_active; i++) {
diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h
index fcf2d59c61..a8e35a5699 100644
--- a/faiss/impl/PanoramaPQ.h
+++ b/faiss/impl/PanoramaPQ.h
@@ -79,6 +79,8 @@ struct PanoramaPQ : Panorama {
     /// @param coarse_dis      Coarse distance (dis0) for this list.
     /// @param list_size       Total number of vectors in this list.
     /// @param batch_no        Which batch to process.
+    /// @param ids             ID array for the inverted list.
+    /// @param sel             ID selector for filtering (may be nullptr).
     /// @param exact_distances [out] Scratch buffer for partial distances.
     /// @param active_indices  [out] Scratch buffer for survivor indices.
     /// @param bitset          Scratch buffer for code compression.
@@ -86,7 +88,7 @@ struct PanoramaPQ : Panorama {
     /// @param threshold       Current heap threshold for pruning.
     /// @param local_stats     [out] Accumulated pruning statistics.
     /// @return Number of surviving candidates in active_indices.
-    template <typename C>
+    template <typename C, bool use_sel>
     size_t progressive_filter_batch(
             const uint8_t* col_codes,
             const float* list_cum_sums,
@@ -96,6 +98,8 @@ struct PanoramaPQ : Panorama {
             float coarse_dis,
             size_t list_size,
             size_t batch_no,
+            const idx_t* ids,
+            const IDSelector* sel,
             std::vector<float>& exact_distances,
             std::vector<uint32_t>& active_indices,
             std::vector<uint8_t>& bitset,
@@ -109,26 +113,33 @@ struct PanoramaPQ : Panorama {
         size_t curr_batch_size = std::min(list_size - batch_no * bs, bs);
         size_t b_offset = batch_no * bs;
 
-        // Initialize active set.
-        std::iota(
-                active_indices.begin(),
-                active_indices.begin() + curr_batch_size,
-                b_offset);
-        std::fill(bitset.begin(), bitset.begin() + curr_batch_size, 1);
-        std::fill(bitset.begin() + curr_batch_size, bitset.end(), 0);
+        // Initialize active set with ID-filtered vectors.
+        std::fill(bitset.begin(), bitset.end(), 0);
+        size_t num_active = 0;
+        const float* batch_init = init_dists + b_offset;
+        for (size_t i = 0; i < curr_batch_size; i++) {
+            size_t global_idx = b_offset + i;
+            // Compile-time switch: branch is discarded when !use_sel.
+            if constexpr (use_sel) {
+                if (!sel->is_member(ids[global_idx])) {
+                    continue;
+                }
+            }
+            active_indices[num_active] = global_idx;
+            exact_distances[num_active] = batch_init[i];
+            bitset[i] = 1;
+            num_active++;
+        }
 
-        const uint8_t* batch_codes = col_codes + b_offset * code_size;
+        if (num_active == 0) {
+            return 0;
+        }
 
-        // Load precomputed init distances (||r||^2 + 2<r, c>).
-        const float* batch_init = init_dists + b_offset;
-        std::copy(
-                batch_init,
-                batch_init + curr_batch_size,
-                exact_distances.begin());
+        const uint8_t* batch_codes = col_codes + b_offset * code_size;
 
         const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1);
 
-        size_t next_num_active = curr_batch_size;
+        size_t next_num_active = num_active;
         size_t batch_offset = batch_no * bs;
         const size_t total_active = next_num_active;
 
diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py
index 05b376efce..1e82912ca4 100644
--- a/faiss/python/__init__.py
+++ b/faiss/python/__init__.py
@@ -177,6 +177,7 @@ def replacement_function(*args):
 add_ref_in_constructor(IndexPreTransform, {2: [0, 1], 1: [0]})
 add_ref_in_method(IndexPreTransform, 'prepend_transform', 0)
 add_ref_in_constructor(IndexIVFPQ, 0)
+add_ref_in_constructor(IndexIVFPQPanorama, 0)
 add_ref_in_constructor(IndexIVFPQR, 0)
 add_ref_in_constructor(IndexIVFPQFastScan, 0)
 add_ref_in_constructor(IndexIVFResidualQuantizer, 0)

From a6e55288c0f7248ce79bb517dcb1d659a5397d3e Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sat, 21 Mar 2026 05:05:08 +0000
Subject: [PATCH 18/41] First pass at tests

---
 tests/test_ivfpq_panorama.py | 617 +++++++++++++++++++++++++++++++++++
 1 file changed, 617 insertions(+)
 create mode 100644 tests/test_ivfpq_panorama.py

diff --git a/tests/test_ivfpq_panorama.py b/tests/test_ivfpq_panorama.py
new file mode 100644
index 0000000000..d8b16a128e
--- /dev/null
+++ b/tests/test_ivfpq_panorama.py
@@ -0,0 +1,617 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+"""
+Comprehensive test suite for IndexIVFPQPanorama.
+
+Panorama is an adaptation of IndexIVFPQ that uses level-oriented storage
+and progressive filtering with Cauchy-Schwarz bounds to achieve significant
+speedups when combined with PCA or Cayley transforms, with zero loss in
+accuracy.
+
+Paper: https://www.arxiv.org/pdf/2510.00566
+
+Constraints specific to IndexIVFPQPanorama:
+  - Only L2 metric is supported.
+  - Only 8-bit PQ codes (nbits == 8).
+  - M must be divisible by n_levels.
+  - batch_size must be a multiple of 64.
+  - use_precomputed_table must be 1.
+"""
+
+import unittest
+
+import faiss
+import numpy as np
+from faiss.contrib.datasets import SyntheticDataset
+
+
+class TestIndexIVFPQPanorama(unittest.TestCase):
+    """Test Suite for IndexIVFPQPanorama."""
+
+    # Helper methods for index creation and data generation
+
+    def generate_data(self, d, nt, nb, nq, seed=42):
+        ds = SyntheticDataset(d, nt, nb, nq, seed=seed)
+        return ds.get_train(), ds.get_database(), ds.get_queries()
+
+    def create_ivfpq(self, d, nlist, M, nbits, xt, xb=None, nprobe=None):
+        """Create and train a standard IndexIVFPQ (L2 only)."""
+        quantizer = faiss.IndexFlatL2(d)
+        index = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
+        index.train(xt)
+        if xb is not None:
+            index.add(xb)
+        if nprobe is not None:
+            index.nprobe = nprobe
+        return index
+
+    def create_panorama(
+        self, d, nlist, M, nbits, n_levels, xt, xb=None,
+        nprobe=None, batch_size=128,
+    ):
+        """Create IndexIVFPQPanorama from a freshly trained IVFPQ.
+
+        Trains a temporary IndexIVFPQ, reuses its quantizer object and copies
+        its PQ centroids into the Panorama index, then precomputes tables.
+        """
+        quantizer = faiss.IndexFlatL2(d)
+        trained = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
+        trained.train(xt)
+
+        trained.own_fields = False
+        pano = faiss.IndexIVFPQPanorama(
+            quantizer, d, nlist, M, nbits, n_levels, batch_size,
+        )
+        centroids = faiss.vector_to_array(trained.pq.centroids)
+        faiss.copy_array_to_vector(centroids, pano.pq.centroids)
+        pano.is_trained = True
+        pano.use_precomputed_table = 1
+        pano.precompute_table()
+
+        if xb is not None:
+            pano.add(xb)
+        if nprobe is not None:
+            pano.nprobe = nprobe
+        return pano
+
+    def create_pair(
+        self, d, nlist, M, nbits, n_levels, xt, xb=None,
+        nprobe=None, batch_size=128,
+    ):
+        """Create an IVFPQ and an IVFPQPanorama sharing the same training.
+
+        Both indexes use the same quantizer centroids and PQ codebook,
+        so search results should be identical.
+        """
+        quantizer = faiss.IndexFlatL2(d)
+        trained = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
+        trained.train(xt)
+
+        # Build the IVFPQ baseline from the trained state.
+        ivfpq = faiss.clone_index(trained)
+
+        # Build the Panorama from the same trained state.
+        trained.own_fields = False
+        pano = faiss.IndexIVFPQPanorama(
+            quantizer, d, nlist, M, nbits, n_levels, batch_size,
+        )
+        centroids = faiss.vector_to_array(trained.pq.centroids)
+        faiss.copy_array_to_vector(centroids, pano.pq.centroids)
+        pano.is_trained = True
+        pano.use_precomputed_table = 1
+        pano.precompute_table()
+
+        if xb is not None:
+            ivfpq.add(xb)
+            pano.add(xb)
+        if nprobe is not None:
+            ivfpq.nprobe = nprobe
+            pano.nprobe = nprobe
+        return ivfpq, pano
+
+    def assert_search_results_equal(
+        self,
+        D_regular,
+        I_regular,
+        D_panorama,
+        I_panorama,
+        rtol=1e-4,
+        atol=1e-6,
+        otol=1e-3,
+    ):
+        overlap_rate = np.mean(I_regular == I_panorama)
+
+        self.assertGreater(
+            overlap_rate,
+            1 - otol,
+            f"Overlap rate {overlap_rate:.6f} is not > {1 - otol:.3f}. ",
+        )
+        np.testing.assert_allclose(
+            D_regular,
+            D_panorama,
+            rtol=rtol,
+            atol=atol,
+            err_msg="Distances mismatch",
+        )
+
+    # Core functionality tests
+
+    def test_exact_match_with_ivfpq(self):
+        """Core test: Panorama must return identical results to IndexIVFPQ"""
+        d, nb, nt, nq = 64, 50000, 60000, 500
+        nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 20
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=42)
+
+        for nprobe in [1, 4, 16, 64]:
+            with self.subTest(nprobe=nprobe):
+                ivfpq, pano = self.create_pair(
+                    d, nlist, M, nbits, n_levels, xt, xb, nprobe,
+                )
+                D_regular, I_regular = ivfpq.search(xq, k)
+                D_panorama, I_panorama = pano.search(xq, k)
+
+                self.assert_search_results_equal(
+                    D_regular, I_regular, D_panorama, I_panorama
+                )
+
+    def test_exact_match_with_ivfpq_medium(self):
+        """Core test at medium scale (d=32, nb=10000, nlist=32)"""
+        d, nb, nt, nq = 32, 10000, 15000, 200
+        nlist, M, nbits, n_levels, k = 32, 8, 8, 4, 10
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=42)
+
+        for nprobe in [1, 4, 8, nlist]:
+            with self.subTest(nprobe=nprobe):
+                ivfpq, pano = self.create_pair(
+                    d, nlist, M, nbits, n_levels, xt, xb, nprobe,
+                )
+                D_regular, I_regular = ivfpq.search(xq, k)
+                D_panorama, I_panorama = pano.search(xq, k)
+
+                self.assert_search_results_equal(
+                    D_regular, I_regular, D_panorama, I_panorama
+                )
+
+    # Parameter variation tests
+
+    def test_different_n_levels(self):
+        """Match IVFPQ at every n_levels; dims-scanned ratio must decrease"""
+        d, nb, nt, nq = 64, 25000, 40000, 200
+        nlist, M, nbits, k = 64, 16, 8, 15
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=456)
+
+        # Train IVFPQ once for the baseline.
+        ivfpq = self.create_ivfpq(d, nlist, M, nbits, xt, xb, nprobe=16)
+        D_base, I_base = ivfpq.search(xq, k)
+
+        # Search single-threaded; addCleanup restores the previous thread
+        # count even when an assertion aborts the test early.
+        self.addCleanup(faiss.omp_set_num_threads, faiss.omp_get_max_threads())
+        faiss.omp_set_num_threads(1)
+
+        prev_ratio = float("inf")
+        # n_levels must divide M=16.
+        for n_levels in [1, 2, 4, 8, 16]:
+            with self.subTest(n_levels=n_levels):
+                faiss.cvar.indexPanorama_stats.reset()
+
+                pano = self.create_panorama(
+                    d, nlist, M, nbits, n_levels, xt, xb, nprobe=16,
+                )
+                D, I = pano.search(xq, k)
+                self.assert_search_results_equal(D_base, I_base, D, I)
+
+                ratio = faiss.cvar.indexPanorama_stats.ratio_dims_scanned
+                self.assertLess(ratio, prev_ratio)
+                prev_ratio = ratio
+
+    def test_different_M_and_n_levels(self):
+        """Compare against IVFPQ for several (d, M, n_levels) combinations"""
+        test_cases = [  # (d, M, n_levels); chunk = M / n_levels
+            (32, 8, 2),   # M=8,  n_levels=2, chunk=4
+            (64, 16, 4),  # M=16, n_levels=4, chunk=4
+            (64, 32, 8),  # M=32, n_levels=8, chunk=4
+        ]
+        for d, M, n_levels in test_cases:
+            with self.subTest(d=d, M=M, n_levels=n_levels):
+                nb, nt, nq, nlist, nbits, k = 10000, 15000, 100, 32, 8, 10
+                xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=789)
+
+                ivfpq, pano = self.create_pair(
+                    d, nlist, M, nbits, n_levels, xt, xb, nprobe=8,
+                )
+                D_regular, I_regular = ivfpq.search(xq, k)
+                D_panorama, I_panorama = pano.search(xq, k)
+
+                self.assert_search_results_equal(
+                    D_regular, I_regular, D_panorama, I_panorama
+                )
+
+    def test_single_level(self):
+        """Edge case n_levels=1: results must still match IndexIVFPQ"""
+        d, nb, nt, nq = 32, 5000, 7000, 50
+        nlist, M, nbits, n_levels, k = 16, 8, 8, 1, 5  # n_levels == 1
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=333)
+
+        ivfpq, pano = self.create_pair(
+            d, nlist, M, nbits, n_levels, xt, xb, nprobe=4,
+        )
+        D_regular, I_regular = ivfpq.search(xq, k)
+        D_panorama, I_panorama = pano.search(xq, k)
+
+        self.assert_search_results_equal(
+            D_regular, I_regular, D_panorama, I_panorama
+        )
+
+    def test_max_levels(self):
+        """Edge case n_levels=M: results must still match IndexIVFPQ"""
+        d, nb, nt, nq = 64, 5000, 7000, 50
+        nlist, M, nbits, n_levels, k = 16, 16, 8, 16, 5  # n_levels == M == 16
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=444)
+
+        ivfpq, pano = self.create_pair(
+            d, nlist, M, nbits, n_levels, xt, xb, nprobe=4,
+        )
+        D_regular, I_regular = ivfpq.search(xq, k)
+        D_panorama, I_panorama = pano.search(xq, k)
+
+        self.assert_search_results_equal(
+            D_regular, I_regular, D_panorama, I_panorama
+        )
+
+    # ID selector tests
+
+    def test_id_selector_range(self):
+        """Range selector: ids stay in [10000, 30000) and match IVFPQ"""
+        d, nb, nt, nq = 64, 50000, 60000, 300
+        nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 20
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=321)
+
+        ivfpq, pano = self.create_pair(
+            d, nlist, M, nbits, n_levels, xt, xb, nprobe=16,
+        )
+
+        params = faiss.SearchParametersIVF()
+        params.sel = faiss.IDSelectorRange(10000, 30000)
+
+        D_regular, I_regular = ivfpq.search(xq, k, params=params)
+        D_panorama, I_panorama = pano.search(xq, k, params=params)
+
+        valid = I_panorama[I_panorama >= 0]  # negative ids are not checked
+        self.assertTrue(np.all(valid >= 10000))
+        self.assertTrue(np.all(valid < 30000))
+
+        np.testing.assert_array_equal(I_regular, I_panorama)
+        np.testing.assert_allclose(D_regular, D_panorama, rtol=1e-4)
+
+    def test_id_selector_batch(self):
+        """Batch selector: results may only hold ids 0, 50, ..., 24950 or -1"""
+        d, nb, nt, nq = 64, 30000, 45000, 200
+        nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 20
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=654)
+
+        ivfpq, pano = self.create_pair(
+            d, nlist, M, nbits, n_levels, xt, xb, nprobe=16,
+        )
+
+        allowed_ids = np.array([i * 50 for i in range(500)], dtype=np.int64)
+        params = faiss.SearchParametersIVF()
+        params.sel = faiss.IDSelectorBatch(allowed_ids)
+
+        D_regular, I_regular = ivfpq.search(xq, k, params=params)
+        D_panorama, I_panorama = pano.search(xq, k, params=params)
+
+        allowed_set = set(allowed_ids) | {-1}  # -1 is accepted as well
+        for id_val in I_panorama.flatten():
+            self.assertIn(int(id_val), allowed_set)
+
+        np.testing.assert_array_equal(I_regular, I_panorama)
+        np.testing.assert_allclose(D_regular, D_panorama, rtol=1e-4)
+
+    def test_selector_excludes_all(self):
+        """Selector range lies beyond nb: every returned id must be -1"""
+        d, nb, nt, nq = 32, 3000, 5000, 5
+        nlist, M, nbits, n_levels, k = 8, 8, 8, 4, 10
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=999)
+
+        pano = self.create_panorama(
+            d, nlist, M, nbits, n_levels, xt, xb, nprobe=nlist,
+        )
+
+        params = faiss.SearchParametersIVF()
+        params.sel = faiss.IDSelectorRange(nb + 100, nb + 200)
+
+        D, I = pano.search(xq, k, params=params)
+        self.assertTrue(np.all(I == -1))
+
+    # Batch size and edge case tests
+
+    def test_batch_boundaries(self):
+        """Test correctness at various database sizes relative to batch_size"""
+        d, nq = 64, 50
+        nlist, M, nbits, n_levels, k = 16, 16, 8, 4, 10
+        # Seeded generator so xq does not depend on RNG state of earlier tests.
+        xq = np.random.RandomState(1234).rand(nq, d).astype("float32")
+        batch_size = 128
+        test_sizes = [
+            batch_size - 1,
+            batch_size,
+            batch_size + 1,
+            batch_size * 2,
+            batch_size * 3 - 1,
+        ]
+        for nb in test_sizes:
+            with self.subTest(nb=nb):
+                nt = max(nb, 500)
+                np.random.seed(987)
+                xt = np.random.rand(nt, d).astype("float32")
+                xb = np.random.rand(nb, d).astype("float32")
+
+                ivfpq, pano = self.create_pair(
+                    d, nlist, M, nbits, n_levels, xt, xb,
+                    nprobe=nlist, batch_size=batch_size,
+                )
+                D_regular, I_regular = ivfpq.search(xq, k)
+                D_panorama, I_panorama = pano.search(xq, k)
+
+                self.assert_search_results_equal(
+                    D_regular, I_regular, D_panorama, I_panorama
+                )
+
+    def test_different_batch_sizes(self):
+        """Results must match one IVFPQ baseline for every batch_size tried"""
+        d, nb, nt, nq = 64, 10000, 15000, 50
+        nlist, M, nbits, n_levels, k = 32, 16, 8, 4, 10
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=4242)
+
+        ivfpq = self.create_ivfpq(d, nlist, M, nbits, xt, xb, nprobe=8)
+        D_base, I_base = ivfpq.search(xq, k)
+
+        for bs in [64, 128, 256, 512, 1024]:  # all multiples of 64
+            with self.subTest(batch_size=bs):
+                pano = self.create_panorama(
+                    d, nlist, M, nbits, n_levels, xt, xb,
+                    nprobe=8, batch_size=bs,
+                )
+                D, I = pano.search(xq, k)
+                self.assert_search_results_equal(D_base, I_base, D, I)
+
+    def test_very_small_dataset(self):
+        """Test with dataset smaller than batch size"""
+        test_cases = [10, 50, 100]  # database sizes (nb)
+
+        for nb in test_cases:
+            with self.subTest(nb=nb):
+                d, nlist, M, nbits, n_levels = 32, 4, 4, 8, 2
+                nt, nq = max(nb, 1500), 5
+                k = min(3, nb)  # never ask for more neighbors than vectors
+                xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=666 + nb)
+
+                ivfpq, pano = self.create_pair(
+                    d, nlist, M, nbits, n_levels, xt, xb, nprobe=nlist,
+                )
+                D_regular, I_regular = ivfpq.search(xq, k)
+                D_panorama, I_panorama = pano.search(xq, k)
+
+                self.assert_search_results_equal(
+                    D_regular, I_regular, D_panorama, I_panorama
+                )
+
+    def test_single_vector_per_cluster(self):
+        """Test extreme case where clusters have very few vectors"""
+        d, nb, nt, nq = 32, 20, 3000, 5  # only nb=20 vectors are added
+        nlist, M, nbits, n_levels, k = 16, 4, 8, 2, 3  # ... across 16 lists
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=1313)
+
+        ivfpq, pano = self.create_pair(
+            d, nlist, M, nbits, n_levels, xt, xb, nprobe=nlist,
+        )
+        D_regular, I_regular = ivfpq.search(xq, k)
+        D_panorama, I_panorama = pano.search(xq, k)
+
+        self.assert_search_results_equal(
+            D_regular, I_regular, D_panorama, I_panorama
+        )
+
+    def test_empty_result_handling(self):
+        """Shifted queries with nprobe=1: only the output shapes are checked"""
+        d, nb, nt, nq = 32, 100, 3000, 10
+        nlist, M, nbits, n_levels, k = 8, 4, 8, 2, 10
+        xt, xb, _ = self.generate_data(d, nt, nb, nq, seed=111)
+        xq = np.random.rand(nq, d).astype("float32") + 10.0  # shifted by +10
+
+        pano = self.create_panorama(
+            d, nlist, M, nbits, n_levels, xt, xb, nprobe=1,
+        )
+        D, I = pano.search(xq, k)
+
+        self.assertEqual(D.shape, (nq, k))
+        self.assertEqual(I.shape, (nq, k))
+
+    # Dynamic operations tests
+
+    def test_incremental_add(self):
+        """Test adding vectors incrementally in multiple batches"""
+        d, nt, nq = 64, 20000, 100
+        nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 15
+        np.random.seed(777)  # fixed seed: data must not depend on test order
+        xt = np.random.rand(nt, d).astype("float32")
+
+        ivfpq, pano = self.create_pair(
+            d, nlist, M, nbits, n_levels, xt, nprobe=16,
+        )
+
+        for batch_nb in [5000, 10000, 15000]:
+            xb_batch = np.random.rand(batch_nb, d).astype("float32")
+            ivfpq.add(xb_batch)
+            pano.add(xb_batch)
+
+        xq = np.random.rand(nq, d).astype("float32")
+
+        D_regular, I_regular = ivfpq.search(xq, k)
+        D_panorama, I_panorama = pano.search(xq, k)
+
+        self.assert_search_results_equal(
+            D_regular, I_regular, D_panorama, I_panorama
+        )
+
+    def test_add_search_add_search(self):
+        """Add 200, search, add 300 more, search again; compare to IVFPQ"""
+        d, nt = 32, 500
+        nlist, M, nbits, n_levels, k = 8, 8, 8, 4, 5
+        np.random.seed(555)
+        xt = np.random.rand(nt, d).astype("float32")
+
+        ivfpq, pano = self.create_pair(
+            d, nlist, M, nbits, n_levels, xt, nprobe=4,
+        )
+
+        xb1 = np.random.rand(200, d).astype("float32")
+        ivfpq.add(xb1)
+        pano.add(xb1)
+
+        xq1 = np.random.rand(10, d).astype("float32")
+        D_reg_1, I_reg_1 = ivfpq.search(xq1, k)
+        D_pan_1, I_pan_1 = pano.search(xq1, k)
+        self.assert_search_results_equal(D_reg_1, I_reg_1, D_pan_1, I_pan_1)
+
+        xb2 = np.random.rand(300, d).astype("float32")  # second batch
+        ivfpq.add(xb2)
+        pano.add(xb2)
+
+        xq2 = np.random.rand(10, d).astype("float32")
+        D_reg_2, I_reg_2 = ivfpq.search(xq2, k)
+        D_pan_2, I_pan_2 = pano.search(xq2, k)
+        self.assert_search_results_equal(D_reg_2, I_reg_2, D_pan_2, I_pan_2)
+
+    # Serialization tests
+
+    def test_serialization(self):
+        """serialize_index/deserialize_index round trip keeps search results"""
+        d, nb, nt, nq = 64, 10000, 15000, 100
+        nlist, M, nbits, n_levels, k = 32, 16, 8, 4, 20
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=2024)
+
+        pano = self.create_panorama(
+            d, nlist, M, nbits, n_levels, xt, xb, nprobe=8,
+        )
+
+        D_before, I_before = pano.search(xq, k)
+        pano_after = faiss.deserialize_index(faiss.serialize_index(pano))
+        D_after, I_after = pano_after.search(xq, k)
+
+        np.testing.assert_array_equal(I_before, I_after)  # ids must be exact
+        np.testing.assert_allclose(D_before, D_after, rtol=1e-5)
+
+    def test_serialization_preserves_params(self):
+        """Round trip must keep n_levels and batch_size (128 expected here)"""
+        d, nb, nt, nq = 64, 10000, 15000, 50
+        nlist, M, nbits, n_levels, k = 32, 16, 8, 4, 10
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=2025)
+
+        pano = self.create_panorama(
+            d, nlist, M, nbits, n_levels, xt, xb, nprobe=4,
+        )
+        D_before, I_before = pano.search(xq, k)
+
+        pano_after = faiss.deserialize_index(
+            faiss.serialize_index(pano)
+        )
+        self.assertEqual(pano_after.batch_size, 128)
+        self.assertEqual(pano_after.n_levels, n_levels)
+
+        D_after, I_after = pano_after.search(xq, k)
+        np.testing.assert_array_equal(I_before, I_after)  # ids must be exact
+        np.testing.assert_allclose(D_before, D_after, rtol=1e-5)
+
+    # Statistics tests
+
+    def test_ratio_dims_scanned(self):
+        """Test that ratio_dims_scanned is 1.0 at n_levels=1 and strictly
+        less for higher n_levels.
+
+        Unlike IndexFlatPanorama, PQ quantization error prevents achieving
+        the ideal 1/n_levels ratio even on synthetic data. We verify that
+        n_levels=1 gives ratio=1.0 (exhaustive) and that multi-level
+        pruning is effective (ratio well below 1.0).
+        """
+        d, nb, nt, nq = 64, 25000, 40000, 10
+        nlist, M, nbits, k = 32, 16, 8, 1
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=5678)
+
+        # addCleanup restores the thread count even if an assertion fails.
+        nt_threads = faiss.omp_get_max_threads()
+        self.addCleanup(faiss.omp_set_num_threads, nt_threads)
+        faiss.omp_set_num_threads(1)
+
+        faiss.cvar.indexPanorama_stats.reset()
+        pano_1 = self.create_panorama(
+            d, nlist, M, nbits, 1, xt, xb, nprobe=8,
+        )
+        pano_1.search(xq, k)
+        ratio_1 = faiss.cvar.indexPanorama_stats.ratio_dims_scanned
+        self.assertAlmostEqual(ratio_1, 1.0, places=3)
+
+        faiss.cvar.indexPanorama_stats.reset()
+        pano_16 = self.create_panorama(
+            d, nlist, M, nbits, 16, xt, xb, nprobe=8,
+        )
+        pano_16.search(xq, k)
+        ratio_16 = faiss.cvar.indexPanorama_stats.ratio_dims_scanned
+        self.assertLess(ratio_16, 0.55)
+
+    def test_pruning_improves_with_n_levels(self):
+        """Test that increasing n_levels reduces the fraction scanned"""
+        d, nb, nt, nq = 64, 25000, 40000, 50
+        nlist, M, nbits, k = 32, 16, 8, 10
+        xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=1234)
+
+        # addCleanup restores the thread count even if an assertion fails.
+        nt_threads = faiss.omp_get_max_threads()
+        self.addCleanup(faiss.omp_set_num_threads, nt_threads)
+        faiss.omp_set_num_threads(1)
+
+        prev_ratio = float("inf")
+        for n_levels in [1, 2, 4, 8, 16]:
+            with self.subTest(n_levels=n_levels):
+                faiss.cvar.indexPanorama_stats.reset()
+                pano = self.create_panorama(
+                    d, nlist, M, nbits, n_levels, xt, xb, nprobe=8,
+                )
+                pano.search(xq, k)
+                ratio = faiss.cvar.indexPanorama_stats.ratio_dims_scanned
+                self.assertLessEqual(ratio, prev_ratio)
+                prev_ratio = ratio
+
+    # Constraint validation tests
+
+    def test_rejects_non_l2_metric(self):
+        """Constructing with METRIC_INNER_PRODUCT must raise RuntimeError"""
+        d, nlist, M, nbits, n_levels = 32, 8, 8, 8, 4
+        quantizer = faiss.IndexFlatIP(d)
+        with self.assertRaises(RuntimeError):
+            faiss.IndexIVFPQPanorama(
+                quantizer, d, nlist, M, nbits, n_levels, 128,
+                faiss.METRIC_INNER_PRODUCT,
+            )
+
+    def test_rejects_invalid_batch_size(self):
+        """Verify that non-multiple-of-64 batch_size is rejected"""
+        d, nlist, M, nbits, n_levels = 32, 8, 8, 8, 4
+        quantizer = faiss.IndexFlatL2(d)
+        with self.assertRaises(RuntimeError):
+            faiss.IndexIVFPQPanorama(
+                quantizer, d, nlist, M, nbits, n_levels, 100,  # 100 % 64 != 0
+            )
+
+    def test_rejects_m_not_divisible_by_n_levels(self):
+        """Verify that M not divisible by n_levels is rejected"""
+        d, nlist, M, nbits, n_levels = 32, 8, 8, 8, 3  # M % n_levels == 2
+        quantizer = faiss.IndexFlatL2(d)
+        with self.assertRaises(RuntimeError):
+            faiss.IndexIVFPQPanorama(
+                quantizer, d, nlist, M, nbits, n_levels, 128,
+            )

From 0be0e1f09a0351b2105ce5dc514479418ed61751 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sat, 21 Mar 2026 06:01:51 +0000
Subject: [PATCH 19/41] Bench first pass

---
 benchs/bench_ivfpq_panorama.py      | 208 +++++++++++-----------------
 benchs/bench_ivfpq_panorama_test.py | 173 +++++++++++++++++++++++
 faiss/index_factory.cpp             |   7 +
 tests/test_factory.py               |  15 ++
 tests/test_ivfpq_panorama.py        |   2 +-
 5 files changed, 277 insertions(+), 128 deletions(-)
 create mode 100644 benchs/bench_ivfpq_panorama_test.py

diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py
index eafeebb7e8..7c2c689b83 100644
--- a/benchs/bench_ivfpq_panorama.py
+++ b/benchs/bench_ivfpq_panorama.py
@@ -1,61 +1,60 @@
-# Quick 10% verification of IVFPQPanorama (with index caching)
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
 
 import multiprocessing as mp
-import os
 import time
 
 import faiss
+import matplotlib.pyplot as plt
 import numpy as np
 
-print("Compile options:", faiss.get_compile_options(), flush=True)
+try:
+    from faiss.contrib.datasets_fb import DatasetGIST1M
+except ImportError:
+    from faiss.contrib.datasets import DatasetGIST1M
 
+ds = DatasetGIST1M()
 
-def fvecs_read(fname):
-    a = np.fromfile(fname, dtype="float32")
-    d = a[0].view("int32")
-    return a.reshape(-1, d + 1)[:, 1:].copy()
+SUBSET = 0.1  # Set to 1.0 for full dataset
 
-
-GIST_DIR = "/datasets/PCA_init"
-CACHE_DIR = "/home/akash/faiss-panorama/index_cache"
-os.makedirs(CACHE_DIR, exist_ok=True)
-
-IVFPQ_CACHE = os.path.join(CACHE_DIR, "ivfpq_10pct.index")
-IVFPQ_TRAINED_CACHE = os.path.join(CACHE_DIR, "ivfpq_trained_10pct.index")
-IVFPQ_PANO_CACHE = os.path.join(CACHE_DIR, "ivfpq_pano_10pct.index")
-
-print("Loading GIST1M data (10% subset)...", flush=True)
-xb_full = fvecs_read(os.path.join(GIST_DIR, "gist1m_base.fvecs"))
-xq = fvecs_read(os.path.join(GIST_DIR, "gist1m_query.fvecs"))
-
-nb_full, d = xb_full.shape
-nb = nb_full // 10  # 10% = 100000
+xq = ds.get_queries()
+xb_full = ds.get_database()
+nb_full = xb_full.shape[0]
+nb = int(nb_full * SUBSET)
 xb = xb_full[:nb].copy()
 del xb_full
 
-nq = xq.shape[0]
-print(f"Database: {nb} x {d}, Queries: {nq} x {d}", flush=True)
+gt = ds.get_groundtruth() if SUBSET == 1.0 else None
+xt = ds.get_train()[:max(nb // 2, 50000)]
 
-xt = xb[:50000].copy()
+nb, d = xb.shape
+nq = xq.shape[0]
+nt = xt.shape[0]
 
 k = 10
-M = 960
-nbits = 8
-nlist = 64
-n_levels = 8
-batch_size = 128
-
-GT_PATH = os.path.join(CACHE_DIR, "gt_10pct.npy")
-if os.path.exists(GT_PATH):
-    gt_I = np.load(GT_PATH)
-    print(f"Loaded cached ground truth: {gt_I.shape}", flush=True)
-else:
-    print("Computing ground truth on 10% subset...", flush=True)
+
+if gt is None:
+    print(f"Computing ground truth for {SUBSET*100:.0f}% subset ({nb} vectors)...")
     flat = faiss.IndexFlatL2(d)
     flat.add(xb)
-    _, gt_I = flat.search(xq, k)
-    np.save(GT_PATH, gt_I)
-    print("Ground truth computed and cached.", flush=True)
+    _, gt = flat.search(xq, k)
+else:
+    gt = gt[:, :k]
+
+print(f"Database: {nb} x {d}, Queries: {nq}, Train: {nt}")
+
+M_values = [960, 480, 240]
+nbits = 8
+nlist = 128
+n_levels = 16
+
+
+def get_ivf_index(index):
+    if isinstance(index, faiss.IndexPreTransform):
+        return faiss.downcast_index(index.index)  # unwrap the inner index
+    return index
 
 
 def eval_recall(index, nprobe_val):
@@ -65,109 +64,64 @@ def eval_recall(index, nprobe_val):
     t = time.time() - t0
     speed = t * 1000 / nq
     qps = 1000 / speed
-    corrects = sum(len(set(gt_I[i]) & set(I[i])) for i in range(nq))
+    # Recall@k is the size of the id-set overlap, independent of rank order.
+    corrects = sum(np.intersect1d(gt[i], I[i]).size for i in range(nq))
     recall = corrects / (nq * k)
-    stats = faiss.cvar.indexPanorama_stats
-    pct_active = stats.ratio_dims_scanned * 100
+    ratio_dims_scanned = faiss.cvar.indexPanorama_stats.ratio_dims_scanned
     print(
         f"\tnprobe {nprobe_val:3d}, Recall@{k}: "
         f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}, "
-        f"active: {pct_active:.1f}%",
-        flush=True,
+        f"dims scanned: {ratio_dims_scanned * 100:.1f}%"
     )
+
     return recall, qps
 
 
-faiss.omp_set_num_threads(mp.cpu_count())
+def build_index(name):
+    index = faiss.index_factory(d, name)
 
-# # --- IVFPQ baseline (cached) ---
-# if os.path.exists(IVFPQ_CACHE):
-#     print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True)
-#     t0 = time.time()
-#     ivfpq = faiss.read_index(IVFPQ_CACHE)
-#     print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
-# else:
-#     print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True)
-#     quantizer = faiss.IndexFlatL2(d)
-#     ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
-#     t0 = time.time()
-#     ivfpq.train(xt)
-#     print(f"  Training took {time.time() - t0:.1f}s", flush=True)
+    faiss.omp_set_num_threads(mp.cpu_count())
+    index.train(xt)
+    index.add(xb)
 
-#     print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
-#     faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE)
+    return index
 
-#     t0 = time.time()
-#     ivfpq.add(xb)
-#     print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
 
-#     print(f"  Saving full index to {IVFPQ_CACHE}...", flush=True)
-#     faiss.write_index(ivfpq, IVFPQ_CACHE)
+def eval_and_plot(name, label=None):
+    index = build_index(name)
+    ivf_index = get_ivf_index(index)
 
-# faiss.omp_set_num_threads(1)
-# print("\n====== IVFPQ baseline", flush=True)
-# for nprobe in [1, 2, 4, 8, 16]:
-#     ivfpq.nprobe = nprobe
-#     eval_recall(ivfpq, nprobe)
+    faiss.omp_set_num_threads(1)
 
-# --- IVFPQPanorama (cached) ---
-faiss.omp_set_num_threads(mp.cpu_count())
+    data = []
+    print(f"====== {label or name}")
+    for nprobe in nprobes:
+        ivf_index.nprobe = nprobe
+        recall, qps = eval_recall(index, nprobe)
+        data.append((recall, qps))
 
-if os.path.exists(IVFPQ_PANO_CACHE):
-    print(f"\nLoading cached IVFPQPanorama from {IVFPQ_PANO_CACHE}...", flush=True)
-    t0 = time.time()
-    ivfpq_pano = faiss.read_index(IVFPQ_PANO_CACHE)
-    print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
-else:
-    def build_panorama_from_trained(trained_index):
-        quantizer2 = trained_index.quantizer
-        trained_index.own_fields = False
-
-        pano = faiss.IndexIVFPQPanorama(
-            quantizer2, d, nlist, M, nbits, n_levels, batch_size
-        )
-        centroids = faiss.vector_to_array(trained_index.pq.centroids)
-        faiss.copy_array_to_vector(centroids, pano.pq.centroids)
-        pano.is_trained = True
-        pano.use_precomputed_table = 1
-        pano.precompute_table()
-        return pano
-
-    if os.path.exists(IVFPQ_TRAINED_CACHE):
-        print(
-            f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...",
-            flush=True,
-        )
-        trained = faiss.read_index(IVFPQ_TRAINED_CACHE)
-        ivfpq_pano = build_panorama_from_trained(trained)
-        print("  Reused trained PQ (skipped training).", flush=True)
-    else:
-        print(
-            f"\nTraining IVFPQ for Panorama from scratch: nlist={nlist}, M={M}, nbits={nbits}",
-            flush=True,
-        )
-        quantizer2 = faiss.IndexFlatL2(d)
-        trained = faiss.IndexIVFPQ(quantizer2, d, nlist, M, nbits)
-        t0 = time.time()
-        trained.train(xt)
-        print(f"  Training took {time.time() - t0:.1f}s", flush=True)
-
-        print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
-        faiss.write_index(trained, IVFPQ_TRAINED_CACHE)
-
-        ivfpq_pano = build_panorama_from_trained(trained)
+    data = np.array(data)
+    plt.plot(data[:, 0], data[:, 1], "o-", label=label or name)
 
-    t0 = time.time()
-    ivfpq_pano.add(xb)
-    print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
 
-    print(f"  Saving IVFPQPanorama to {IVFPQ_PANO_CACHE}...", flush=True)
-    faiss.write_index(ivfpq_pano, IVFPQ_PANO_CACHE)
+nprobes = [1, 2, 4, 8, 16, 32, 64]
+
+plt.figure(figsize=(10, 7), dpi=80)
 
-faiss.omp_set_num_threads(1)
-print("\n====== IVFPQPanorama", flush=True)
-for nprobe in [1, 2, 4, 8, 16]:
-    ivfpq_pano.nprobe = nprobe
-    eval_recall(ivfpq_pano, nprobe)
+for M in M_values:
+    eval_and_plot(
+        f"IVF{nlist},PQ{M}x{nbits}",
+        label=f"IVFPQ (M={M})",
+    )
+    eval_and_plot(
+        f"PCA{d},IVF{nlist},PQ{M}x{nbits}Panorama{n_levels}",
+        label=f"PCA + IVFPQPanorama (M={M})",
+    )
 
-print("\nVerification complete!", flush=True)
+plt.title(f"IVFPQ Panorama on GIST1M (nlist={nlist})")
+plt.xlabel(f"Recall@{k}")
+plt.ylabel("QPS")
+plt.yscale("log")
+plt.legend(bbox_to_anchor=(1.02, 0.1), loc="upper left", borderaxespad=0)
+plt.savefig("bench_ivfpq_panorama.png", bbox_inches="tight")
+print("\nBenchmark complete! Plot saved to bench_ivfpq_panorama.png")
diff --git a/benchs/bench_ivfpq_panorama_test.py b/benchs/bench_ivfpq_panorama_test.py
new file mode 100644
index 0000000000..38fe14614c
--- /dev/null
+++ b/benchs/bench_ivfpq_panorama_test.py
@@ -0,0 +1,173 @@
+# Quick 10% verification of IVFPQPanorama (with index caching)
+
+import multiprocessing as mp
+import os
+import time
+
+import faiss
+import numpy as np
+
+print("Compile options:", faiss.get_compile_options(), flush=True)
+
+
+def fvecs_read(fname):
+    a = np.fromfile(fname, dtype="float32")
+    d = a[0].view("int32")  # first 4 bytes reinterpreted as the int32 dim
+    return a.reshape(-1, d + 1)[:, 1:].copy()  # drop the leading dim column
+
+
+GIST_DIR = "/datasets/PCA_init"
+CACHE_DIR = "/home/akash/faiss-panorama/index_cache"
+os.makedirs(CACHE_DIR, exist_ok=True)
+
+IVFPQ_CACHE = os.path.join(CACHE_DIR, "ivfpq_10pct.index")
+IVFPQ_TRAINED_CACHE = os.path.join(CACHE_DIR, "ivfpq_trained_10pct.index")
+IVFPQ_PANO_CACHE = os.path.join(CACHE_DIR, "ivfpq_pano_10pct.index")
+
+print("Loading GIST1M data (10% subset)...", flush=True)
+xb_full = fvecs_read(os.path.join(GIST_DIR, "gist1m_base.fvecs"))
+xq = fvecs_read(os.path.join(GIST_DIR, "gist1m_query.fvecs"))
+
+nb_full, d = xb_full.shape
+nb = nb_full // 10  # 10% = 100000
+xb = xb_full[:nb].copy()
+del xb_full
+
+nq = xq.shape[0]
+print(f"Database: {nb} x {d}, Queries: {nq} x {d}", flush=True)
+
+xt = xb[:50000].copy()
+
+k = 10
+M = 960
+nbits = 8
+nlist = 64
+n_levels = 16
+batch_size = 128
+
+GT_PATH = os.path.join(CACHE_DIR, "gt_10pct.npy")
+if os.path.exists(GT_PATH):
+    gt_I = np.load(GT_PATH)
+    print(f"Loaded cached ground truth: {gt_I.shape}", flush=True)
+else:
+    print("Computing ground truth on 10% subset...", flush=True)
+    flat = faiss.IndexFlatL2(d)
+    flat.add(xb)
+    _, gt_I = flat.search(xq, k)
+    np.save(GT_PATH, gt_I)
+    print("Ground truth computed and cached.", flush=True)
+
+
+def eval_recall(index, nprobe_val):
+    faiss.cvar.indexPanorama_stats.reset()
+    t0 = time.time()
+    _, I = index.search(xq, k=k)
+    t = time.time() - t0
+    speed = t * 1000 / nq  # milliseconds per query
+    qps = 1000 / speed  # queries per second
+    corrects = sum(len(set(gt_I[i]) & set(I[i])) for i in range(nq))
+    recall = corrects / (nq * k)  # fraction of ground-truth ids retrieved
+    stats = faiss.cvar.indexPanorama_stats
+    pct_active = stats.ratio_dims_scanned * 100
+    print(
+        f"\tnprobe {nprobe_val:3d}, Recall@{k}: "
+        f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}, "
+        f"active: {pct_active:.1f}%",
+        flush=True,
+    )
+    return recall, qps
+
+
+faiss.omp_set_num_threads(mp.cpu_count())
+
+# # --- IVFPQ baseline (cached) ---
+# if os.path.exists(IVFPQ_CACHE):
+#     print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True)
+#     t0 = time.time()
+#     ivfpq = faiss.read_index(IVFPQ_CACHE)
+#     print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
+# else:
+#     print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True)
+#     quantizer = faiss.IndexFlatL2(d)
+#     ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
+#     t0 = time.time()
+#     ivfpq.train(xt)
+#     print(f"  Training took {time.time() - t0:.1f}s", flush=True)
+
+#     print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
+#     faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE)
+
+#     t0 = time.time()
+#     ivfpq.add(xb)
+#     print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
+
+#     print(f"  Saving full index to {IVFPQ_CACHE}...", flush=True)
+#     faiss.write_index(ivfpq, IVFPQ_CACHE)
+
+# faiss.omp_set_num_threads(1)
+# print("\n====== IVFPQ baseline", flush=True)
+# for nprobe in [1, 2, 4, 8, 16]:
+#     ivfpq.nprobe = nprobe
+#     eval_recall(ivfpq, nprobe)
+
+# --- IVFPQPanorama (cached) ---
+faiss.omp_set_num_threads(mp.cpu_count())
+
+if os.path.exists(IVFPQ_PANO_CACHE):
+    print(f"\nLoading cached IVFPQPanorama from {IVFPQ_PANO_CACHE}...", flush=True)
+    t0 = time.time()
+    ivfpq_pano = faiss.read_index(IVFPQ_PANO_CACHE)
+    print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
+else:
+    def build_panorama_from_trained(trained_index):
+        quantizer2 = trained_index.quantizer
+        trained_index.own_fields = False
+        # NOTE(review): presumably stops trained_index freeing quantizer2.
+        pano = faiss.IndexIVFPQPanorama(
+            quantizer2, d, nlist, M, nbits, n_levels, batch_size
+        )
+        centroids = faiss.vector_to_array(trained_index.pq.centroids)
+        faiss.copy_array_to_vector(centroids, pano.pq.centroids)
+        pano.is_trained = True  # PQ centroids were copied, not retrained
+        pano.use_precomputed_table = 1
+        pano.precompute_table()
+        return pano
+
+    if os.path.exists(IVFPQ_TRAINED_CACHE):
+        print(
+            f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...",
+            flush=True,
+        )
+        trained = faiss.read_index(IVFPQ_TRAINED_CACHE)
+        ivfpq_pano = build_panorama_from_trained(trained)
+        print("  Reused trained PQ (skipped training).", flush=True)
+    else:
+        print(
+            f"\nTraining IVFPQ for Panorama from scratch: nlist={nlist}, M={M}, nbits={nbits}",
+            flush=True,
+        )
+        quantizer2 = faiss.IndexFlatL2(d)
+        trained = faiss.IndexIVFPQ(quantizer2, d, nlist, M, nbits)
+        t0 = time.time()
+        trained.train(xt)
+        print(f"  Training took {time.time() - t0:.1f}s", flush=True)
+
+        print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
+        faiss.write_index(trained, IVFPQ_TRAINED_CACHE)
+
+        ivfpq_pano = build_panorama_from_trained(trained)
+
+    t0 = time.time()
+    ivfpq_pano.add(xb)
+    print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
+
+    print(f"  Saving IVFPQPanorama to {IVFPQ_PANO_CACHE}...", flush=True)
+    faiss.write_index(ivfpq_pano, IVFPQ_PANO_CACHE)
+
+faiss.omp_set_num_threads(1)
+print("\n====== IVFPQPanorama", flush=True)
+for nprobe in [1, 2, 4, 8, 16]:
+    ivfpq_pano.nprobe = nprobe
+    eval_recall(ivfpq_pano, nprobe)
+
+print("\nVerification complete!", flush=True)
diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp
index fb4c442440..22097c96d7 100644
--- a/faiss/index_factory.cpp
+++ b/faiss/index_factory.cpp
@@ -29,6 +29,7 @@
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/IndexIVFFlatPanorama.h>
 #include <faiss/IndexIVFPQ.h>
+#include <faiss/IndexIVFPQPanorama.h>
 #include <faiss/IndexIVFPQFastScan.h>
 #include <faiss/IndexIVFPQR.h>
 #include <faiss/IndexIVFRaBitQ.h>
@@ -354,6 +355,12 @@ IndexIVF* parse_IndexIVF(
                 /*by_residual=*/true,
                 own_il);
     }
+    if (match("PQ([0-9]+)(x[0-9]+)?Panorama([0-9]+)?")) {
+        int M = mres_to_int(sm[1]), nbit = mres_to_int(sm[2], 8, 1);
+        int nlevels = mres_to_int(sm[3], 8);
+        return new IndexIVFPQPanorama(
+                get_q(), d, nlist, M, nbit, nlevels, 128, mt, own_il);
+    }
     if (match("PQ([0-9]+)(x[0-9]+)?(np)?")) {
         int M = mres_to_int(sm[1]), nbit = mres_to_int(sm[2], 8, 1);
         IndexIVFPQ* index_ivf =
diff --git a/tests/test_factory.py b/tests/test_factory.py
index 2246eb8c10..922ab14cf0 100644
--- a/tests/test_factory.py
+++ b/tests/test_factory.py
@@ -70,6 +70,21 @@ def test_factory_6(self):
         assert index.d == 128
         assert index.metric_type == faiss.METRIC_L2
 
+    def test_factory_panorama(self):
+        index = faiss.index_factory(64, "IVF16,PQ16x8Panorama4")
+        assert isinstance(index, faiss.IndexIVFPQPanorama)
+        assert index.n_levels == 4
+        assert index.pq.M == 16
+
+        index = faiss.index_factory(64, "IVF16,PQ16Panorama")
+        assert isinstance(index, faiss.IndexIVFPQPanorama)
+        assert index.n_levels == 8  # default
+
+        index = faiss.index_factory(64, "PCA64,IVF16,PQ16x8Panorama4")
+        ivf = faiss.downcast_index(index.index)
+        assert isinstance(ivf, faiss.IndexIVFPQPanorama)
+        assert ivf.n_levels == 4
+
     def test_factory_HNSW(self):
         index = faiss.index_factory(12, "HNSW32")
         assert index.storage.sa_code_size() == 12 * 4
diff --git a/tests/test_ivfpq_panorama.py b/tests/test_ivfpq_panorama.py
index d8b16a128e..d2e28d78d6 100644
--- a/tests/test_ivfpq_panorama.py
+++ b/tests/test_ivfpq_panorama.py
@@ -559,7 +559,7 @@ def test_ratio_dims_scanned(self):
         )
         pano_16.search(xq, k)
         ratio_16 = faiss.cvar.indexPanorama_stats.ratio_dims_scanned
-        self.assertLess(ratio_16, 0.55)
+        self.assertLess(ratio_16, 0.6)
 
         faiss.omp_set_num_threads(nt_threads)
 

From 267ced5e2fdf5af17caae8f76013404a1fe41466 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sat, 21 Mar 2026 06:42:18 +0000
Subject: [PATCH 20/41] bench: fused PCA + per-level random rotation for IVFPQPanorama (Alexis genius idea v1)

---
 benchs/bench_ivfpq_panorama.py | 100 +++++++++++++++++++++++++--------
 1 file changed, 77 insertions(+), 23 deletions(-)

diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py
index 7c2c689b83..e761642a14 100644
--- a/benchs/bench_ivfpq_panorama.py
+++ b/benchs/bench_ivfpq_panorama.py
@@ -9,6 +9,8 @@
 import faiss
 import matplotlib.pyplot as plt
 import numpy as np
+from scipy.linalg import block_diag
+from sklearn.decomposition import PCA
 
 try:
     from faiss.contrib.datasets_fb import DatasetGIST1M
@@ -49,6 +51,7 @@
 nbits = 8
 nlist = 128
 n_levels = 16
+nprobes = [1, 2, 4, 8, 16, 32, 64]
 
 
 def get_ivf_index(index):
@@ -77,46 +80,97 @@ def eval_recall(index, nprobe_val):
     return recall, qps
 
 
-def build_index(name):
-    index = faiss.index_factory(d, name)
-
-    faiss.omp_set_num_threads(mp.cpu_count())
-    index.train(xt)
-    index.add(xb)
-
-    return index
-
-
-def eval_and_plot(name, label=None):
-    index = build_index(name)
+def eval_index(index, label):
     ivf_index = get_ivf_index(index)
 
     faiss.omp_set_num_threads(1)
 
     data = []
-    print(f"====== {label or name}")
+    print(f"====== {label}")
     for nprobe in nprobes:
         ivf_index.nprobe = nprobe
         recall, qps = eval_recall(index, nprobe)
         data.append((recall, qps))
 
     data = np.array(data)
-    plt.plot(data[:, 0], data[:, 1], "o-", label=label or name)
+    plt.plot(data[:, 0], data[:, 1], "o-", label=label)
 
 
-nprobes = [1, 2, 4, 8, 16, 32, 64]
+def build_ivfpq(M):
+    """Build vanilla IVFPQ (no transform) via index_factory."""
+    index = faiss.index_factory(d, f"IVF{nlist},PQ{M}x{nbits}")
+    faiss.omp_set_num_threads(mp.cpu_count())
+    index.train(xt)
+    index.add(xb)
+    return index
+
+
+def make_pca_level_rotation_transform(xt, n_levels, seed=77):
+    """Build a fused PCA + per-level random rotation as a LinearTransform.
+
+    FAISS LinearTransform applies: y = A_stored @ x + b  (column-vector)
+    We want: y = R_block @ P @ (x - mean)
+      1. Center x
+      2. PCA project (P @ x_centered)
+      3. Per-level rotation (R_block @ z_pca)
+
+    So: A_stored = R_block @ P,  b = -A_stored @ mean
+    """
+    pca = PCA(n_components=d)
+    pca.fit(xt)
+
+    P = pca.components_.astype(np.float32)  # (d, d)
+    mean = pca.mean_.astype(np.float32)     # (d,)
+
+    block_size = d // n_levels
+    rng = np.random.RandomState(seed)
+    blocks = []
+    for _ in range(n_levels):
+        H = rng.randn(block_size, block_size).astype(np.float32)
+        Q, R = np.linalg.qr(H)
+        Q *= np.sign(np.diag(R))[:, None]
+        blocks.append(Q)
+    A = block_diag(*blocks).astype(np.float32)  # (d, d)
+
+    combined = A @ P  # (d, d)  -- rotation AFTER PCA
+
+    lt = faiss.LinearTransform(d, d, True)
+    faiss.copy_array_to_vector(combined.ravel(), lt.A)
+    faiss.copy_array_to_vector(-(combined @ mean).ravel(), lt.b)
+    lt.is_trained = True
+    lt.have_bias = True
+
+    return lt
+
+
+def build_ivfpq_panorama(M, n_levels):
+    """Build PCA + LevelRotation + IVFPQPanorama."""
+    lt = make_pca_level_rotation_transform(xt, n_levels)
+
+    quantizer = faiss.IndexFlatL2(d)
+    ivfpq_pano = faiss.IndexIVFPQPanorama(
+        quantizer, d, nlist, M, nbits, n_levels,
+    )
+
+    index = faiss.IndexPreTransform(lt, ivfpq_pano)
+
+    faiss.omp_set_num_threads(mp.cpu_count())
+    index.train(xt)
+    index.add(xb)
+
+    return index
+
 
 plt.figure(figsize=(10, 7), dpi=80)
 
 for M in M_values:
-    eval_and_plot(
-        f"IVF{nlist},PQ{M}x{nbits}",
-        label=f"IVFPQ (M={M})",
-    )
-    eval_and_plot(
-        f"PCA{d},IVF{nlist},PQ{M}x{nbits}Panorama{n_levels}",
-        label=f"PCA + IVFPQPanorama (M={M})",
-    )
+    ivfpq = build_ivfpq(M)
+    eval_index(ivfpq, label=f"IVFPQ (M={M})")
+    del ivfpq
+
+    pano = build_ivfpq_panorama(M, n_levels)
+    eval_index(pano, label=f"PCA+Rot + IVFPQPanorama (M={M})")
+    del pano
 
 plt.title(f"IVFPQ Panorama on GIST1M (nlist={nlist})")
 plt.xlabel(f"Recall@{k}")

From 46d4445ddbfde100f13c2ff47996dc4e8d4d5859 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sat, 21 Mar 2026 07:49:53 +0000
Subject: [PATCH 21/41] bench: add energy-spill rotation before per-level equalization (Alexis v2)

---
 benchs/bench_ivfpq_panorama.py | 161 +++++++++++++++++++++++++++------
 1 file changed, 135 insertions(+), 26 deletions(-)

diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py
index e761642a14..806c56fcc4 100644
--- a/benchs/bench_ivfpq_panorama.py
+++ b/benchs/bench_ivfpq_panorama.py
@@ -47,6 +47,7 @@
 
 print(f"Database: {nb} x {d}, Queries: {nq}, Train: {nt}")
 
+ALPHA = 8
 M_values = [960, 480, 240]
 nbits = 8
 nlist = 128
@@ -105,36 +106,144 @@ def build_ivfpq(M):
     return index
 
 
-def make_pca_level_rotation_transform(xt, n_levels, seed=77):
-    """Build a fused PCA + per-level random rotation as a LinearTransform.
+def compute_level_energies(variances, n_levels, block_size):
+    """Sum per-dimension variances into per-level total energies."""
+    return np.array([
+        np.sum(variances[l * block_size : (l + 1) * block_size])
+        for l in range(n_levels)
+    ])
 
-    FAISS LinearTransform applies: y = A_stored @ x + b  (column-vector)
-    We want: y = R_block @ P @ (x - mean)
-      1. Center x
-      2. PCA project (P @ x_centered)
-      3. Per-level rotation (R_block @ z_pca)
 
-    So: A_stored = R_block @ P,  b = -A_stored @ mean
+def find_n_spill(variances, level_start, block_size, max_energy_per_level, d):
+    """Find the smallest number of extra dimensions to spill into.
+
+    After a random rotation over (block_size + n_spill) dims, each dim gets
+    uniform expected energy.  The level's expected energy becomes:
+        block_size * total_subspace_energy / (block_size + n_spill)
+
+    Returns the smallest n_spill >= 1 where this is <= max_energy_per_level,
+    or all remaining dims if the cap can't be reached.
     """
-    pca = PCA(n_components=d)
-    pca.fit(xt)
+    level_end = level_start + block_size
+    max_extra = d - level_end
+    if max_extra == 0:
+        return 0
+
+    total = np.sum(variances[level_start:level_end])
+    for n in range(1, max_extra + 1):
+        total += variances[level_end + n - 1]
+        if block_size * total / (block_size + n) <= max_energy_per_level:
+            return n
+
+    return max_extra
+
+
+def random_orthogonal(size, rng):
+    """Haar-distributed random orthogonal matrix via sign-fixed Gaussian QR."""
+    H = rng.randn(size, size).astype(np.float32)
+    Q, R = np.linalg.qr(H)
+    Q *= np.sign(np.diag(R))[None, :]  # scale COLUMNS so that diag(R) > 0
+    return Q
+
+
+def build_energy_spill_rotation(eigenvalues, n_levels, block_size,
+                                alpha, seed=42):
+    """Orthogonal matrix that caps per-level energy via localized rotations.
+
+    Iterates over levels sequentially.  When a level's effective energy
+    exceeds alpha * avg_energy_per_level, applies a random rotation spanning
+    that level plus enough subsequent dimensions to bring the expected level
+    energy down to the cap.
+
+    Variances are tracked analytically: after each rotation the dims in the
+    rotated subspace are set to uniform expected variance.
+
+    Returns (spill_rotation, effective_variances).
+    """
+    d = len(eigenvalues)
+    total_energy = float(np.sum(eigenvalues))
+    max_energy_per_level = alpha * total_energy / n_levels
+
+    variances = eigenvalues.astype(np.float32).copy()
+    spill_matrix = np.eye(d, dtype=np.float32)
+    rng = np.random.RandomState(seed)
+
+    for level in range(n_levels):
+        start = level * block_size
+        end = start + block_size
+        level_energy = float(np.sum(variances[start:end]))
+
+        if level_energy <= max_energy_per_level:
+            continue
+
+        n_spill = find_n_spill(
+            variances, start, block_size, max_energy_per_level, d,
+        )
+        if n_spill == 0:
+            continue
 
-    P = pca.components_.astype(np.float32)  # (d, d)
-    mean = pca.mean_.astype(np.float32)     # (d,)
+        sub_end = end + n_spill
+        Q = random_orthogonal(block_size + n_spill, rng)
 
-    block_size = d // n_levels
+        full_Q = np.eye(d, dtype=np.float32)
+        full_Q[start:sub_end, start:sub_end] = Q
+        spill_matrix = full_Q @ spill_matrix
+
+        avg_var = float(np.sum(variances[start:sub_end])) / (block_size + n_spill)
+        variances[start:sub_end] = avg_var
+
+    return spill_matrix, variances
+
+
+def build_level_equalization_rotation(d, n_levels, block_size, seed=77):
+    """Block-diagonal random rotation for within-level energy equalization."""
     rng = np.random.RandomState(seed)
-    blocks = []
-    for _ in range(n_levels):
-        H = rng.randn(block_size, block_size).astype(np.float32)
-        Q, R = np.linalg.qr(H)
-        Q *= np.sign(np.diag(R))[:, None]
-        blocks.append(Q)
-    A = block_diag(*blocks).astype(np.float32)  # (d, d)
+    blocks = [random_orthogonal(block_size, rng) for _ in range(n_levels)]
+    return block_diag(*blocks).astype(np.float32)
+
+
+def print_energy_diagnostics(eigenvalues, effective_variances, n_levels,
+                             block_size, alpha):
+    """Print per-level energy before/after the spill transform and the cap."""
+    before = compute_level_energies(eigenvalues, n_levels, block_size)
+    after = compute_level_energies(effective_variances, n_levels, block_size)
+    cap = alpha * float(np.sum(eigenvalues)) / n_levels
+    print(f"  level energy cap={cap:.4g}\n  before={before}\n  after={after}")
+
+
+def make_pca_level_rotation_transform(xt, n_levels, alpha=ALPHA, seed=77):
+    """Build PCA + energy-spill + per-level rotation as one LinearTransform.
+
+    Pipeline:  y = R_eq @ R_spill @ P @ (x - mean)
+      1. Center + PCA project           (P, mean)
+      2. Energy spill across levels      (R_spill)
+      3. Within-level equalization       (R_eq, block-diagonal)
+
+    Stored as:  A = R_eq @ R_spill @ P,  b = -A @ mean
+    """
+    dim = xt.shape[1]
+    block_size = dim // n_levels
+
+    pca = PCA(n_components=dim)
+    pca.fit(xt)
+    P = pca.components_.astype(np.float32)
+    mean = pca.mean_.astype(np.float32)
+    eigenvalues = pca.explained_variance_.astype(np.float32)
+
+    R_spill, effective_variances = build_energy_spill_rotation(
+        eigenvalues, n_levels, block_size, alpha, seed=seed,
+    )
+    print_energy_diagnostics(
+        eigenvalues, effective_variances, n_levels, block_size, alpha,
+    )
+
+    R_eq = build_level_equalization_rotation(
+        dim, n_levels, block_size, seed=seed + 1,
+    )
 
-    combined = A @ P  # (d, d)  -- rotation AFTER PCA
+    combined = (R_eq @ R_spill @ P).astype(np.float32)
 
-    lt = faiss.LinearTransform(d, d, True)
+    lt = faiss.LinearTransform(dim, dim, True)
     faiss.copy_array_to_vector(combined.ravel(), lt.A)
     faiss.copy_array_to_vector(-(combined @ mean).ravel(), lt.b)
     lt.is_trained = True
@@ -143,9 +252,9 @@ def make_pca_level_rotation_transform(xt, n_levels, seed=77):
     return lt
 
 
-def build_ivfpq_panorama(M, n_levels):
-    """Build PCA + LevelRotation + IVFPQPanorama."""
-    lt = make_pca_level_rotation_transform(xt, n_levels)
+def build_ivfpq_panorama(M, n_levels, alpha=ALPHA):
+    """Build PCA + EnergySpill + LevelRotation + IVFPQPanorama."""
+    lt = make_pca_level_rotation_transform(xt, n_levels, alpha=alpha)
 
     quantizer = faiss.IndexFlatL2(d)
     ivfpq_pano = faiss.IndexIVFPQPanorama(
@@ -169,7 +278,7 @@ def build_ivfpq_panorama(M, n_levels):
     del ivfpq
 
     pano = build_ivfpq_panorama(M, n_levels)
-    eval_index(pano, label=f"PCA+Rot + IVFPQPanorama (M={M})")
+    eval_index(pano, label=f"PCA+Spill+Rot + IVFPQPanorama (M={M})")
     del pano
 
 plt.title(f"IVFPQ Panorama on GIST1M (nlist={nlist})")

From b815ea5c84db387950bdd63ce542d13df41e7f0f Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Sat, 21 Mar 2026 08:24:58 +0000
Subject: [PATCH 22/41] Remove side bench

---
 benchs/bench_ivfpq_panorama_test.py | 173 ----------------------------
 1 file changed, 173 deletions(-)
 delete mode 100644 benchs/bench_ivfpq_panorama_test.py

diff --git a/benchs/bench_ivfpq_panorama_test.py b/benchs/bench_ivfpq_panorama_test.py
deleted file mode 100644
index 38fe14614c..0000000000
--- a/benchs/bench_ivfpq_panorama_test.py
+++ /dev/null
@@ -1,173 +0,0 @@
-# Quick 10% verification of IVFPQPanorama (with index caching)
-
-import multiprocessing as mp
-import os
-import time
-
-import faiss
-import numpy as np
-
-print("Compile options:", faiss.get_compile_options(), flush=True)
-
-
-def fvecs_read(fname):
-    a = np.fromfile(fname, dtype="float32")
-    d = a[0].view("int32")
-    return a.reshape(-1, d + 1)[:, 1:].copy()
-
-
-GIST_DIR = "/datasets/PCA_init"
-CACHE_DIR = "/home/akash/faiss-panorama/index_cache"
-os.makedirs(CACHE_DIR, exist_ok=True)
-
-IVFPQ_CACHE = os.path.join(CACHE_DIR, "ivfpq_10pct.index")
-IVFPQ_TRAINED_CACHE = os.path.join(CACHE_DIR, "ivfpq_trained_10pct.index")
-IVFPQ_PANO_CACHE = os.path.join(CACHE_DIR, "ivfpq_pano_10pct.index")
-
-print("Loading GIST1M data (10% subset)...", flush=True)
-xb_full = fvecs_read(os.path.join(GIST_DIR, "gist1m_base.fvecs"))
-xq = fvecs_read(os.path.join(GIST_DIR, "gist1m_query.fvecs"))
-
-nb_full, d = xb_full.shape
-nb = nb_full // 10  # 10% = 100000
-xb = xb_full[:nb].copy()
-del xb_full
-
-nq = xq.shape[0]
-print(f"Database: {nb} x {d}, Queries: {nq} x {d}", flush=True)
-
-xt = xb[:50000].copy()
-
-k = 10
-M = 960
-nbits = 8
-nlist = 64
-n_levels = 16
-batch_size = 128
-
-GT_PATH = os.path.join(CACHE_DIR, "gt_10pct.npy")
-if os.path.exists(GT_PATH):
-    gt_I = np.load(GT_PATH)
-    print(f"Loaded cached ground truth: {gt_I.shape}", flush=True)
-else:
-    print("Computing ground truth on 10% subset...", flush=True)
-    flat = faiss.IndexFlatL2(d)
-    flat.add(xb)
-    _, gt_I = flat.search(xq, k)
-    np.save(GT_PATH, gt_I)
-    print("Ground truth computed and cached.", flush=True)
-
-
-def eval_recall(index, nprobe_val):
-    faiss.cvar.indexPanorama_stats.reset()
-    t0 = time.time()
-    _, I = index.search(xq, k=k)
-    t = time.time() - t0
-    speed = t * 1000 / nq
-    qps = 1000 / speed
-    corrects = sum(len(set(gt_I[i]) & set(I[i])) for i in range(nq))
-    recall = corrects / (nq * k)
-    stats = faiss.cvar.indexPanorama_stats
-    pct_active = stats.ratio_dims_scanned * 100
-    print(
-        f"\tnprobe {nprobe_val:3d}, Recall@{k}: "
-        f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}, "
-        f"active: {pct_active:.1f}%",
-        flush=True,
-    )
-    return recall, qps
-
-
-faiss.omp_set_num_threads(mp.cpu_count())
-
-# # --- IVFPQ baseline (cached) ---
-# if os.path.exists(IVFPQ_CACHE):
-#     print(f"\nLoading cached IVFPQ from {IVFPQ_CACHE}...", flush=True)
-#     t0 = time.time()
-#     ivfpq = faiss.read_index(IVFPQ_CACHE)
-#     print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
-# else:
-#     print(f"\nBuilding IVFPQ: nlist={nlist}, M={M}, nbits={nbits}", flush=True)
-#     quantizer = faiss.IndexFlatL2(d)
-#     ivfpq = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits)
-#     t0 = time.time()
-#     ivfpq.train(xt)
-#     print(f"  Training took {time.time() - t0:.1f}s", flush=True)
-
-#     print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
-#     faiss.write_index(ivfpq, IVFPQ_TRAINED_CACHE)
-
-#     t0 = time.time()
-#     ivfpq.add(xb)
-#     print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
-
-#     print(f"  Saving full index to {IVFPQ_CACHE}...", flush=True)
-#     faiss.write_index(ivfpq, IVFPQ_CACHE)
-
-# faiss.omp_set_num_threads(1)
-# print("\n====== IVFPQ baseline", flush=True)
-# for nprobe in [1, 2, 4, 8, 16]:
-#     ivfpq.nprobe = nprobe
-#     eval_recall(ivfpq, nprobe)
-
-# --- IVFPQPanorama (cached) ---
-faiss.omp_set_num_threads(mp.cpu_count())
-
-if os.path.exists(IVFPQ_PANO_CACHE):
-    print(f"\nLoading cached IVFPQPanorama from {IVFPQ_PANO_CACHE}...", flush=True)
-    t0 = time.time()
-    ivfpq_pano = faiss.read_index(IVFPQ_PANO_CACHE)
-    print(f"  Loaded in {time.time() - t0:.1f}s", flush=True)
-else:
-    def build_panorama_from_trained(trained_index):
-        quantizer2 = trained_index.quantizer
-        trained_index.own_fields = False
-
-        pano = faiss.IndexIVFPQPanorama(
-            quantizer2, d, nlist, M, nbits, n_levels, batch_size
-        )
-        centroids = faiss.vector_to_array(trained_index.pq.centroids)
-        faiss.copy_array_to_vector(centroids, pano.pq.centroids)
-        pano.is_trained = True
-        pano.use_precomputed_table = 1
-        pano.precompute_table()
-        return pano
-
-    if os.path.exists(IVFPQ_TRAINED_CACHE):
-        print(
-            f"\nLoading trained IVFPQ for Panorama from {IVFPQ_TRAINED_CACHE}...",
-            flush=True,
-        )
-        trained = faiss.read_index(IVFPQ_TRAINED_CACHE)
-        ivfpq_pano = build_panorama_from_trained(trained)
-        print("  Reused trained PQ (skipped training).", flush=True)
-    else:
-        print(
-            f"\nTraining IVFPQ for Panorama from scratch: nlist={nlist}, M={M}, nbits={nbits}",
-            flush=True,
-        )
-        quantizer2 = faiss.IndexFlatL2(d)
-        trained = faiss.IndexIVFPQ(quantizer2, d, nlist, M, nbits)
-        t0 = time.time()
-        trained.train(xt)
-        print(f"  Training took {time.time() - t0:.1f}s", flush=True)
-
-        print(f"  Saving trained state to {IVFPQ_TRAINED_CACHE}...", flush=True)
-        faiss.write_index(trained, IVFPQ_TRAINED_CACHE)
-
-        ivfpq_pano = build_panorama_from_trained(trained)
-
-    t0 = time.time()
-    ivfpq_pano.add(xb)
-    print(f"  Adding took {time.time() - t0:.1f}s", flush=True)
-
-    print(f"  Saving IVFPQPanorama to {IVFPQ_PANO_CACHE}...", flush=True)
-    faiss.write_index(ivfpq_pano, IVFPQ_PANO_CACHE)
-
-faiss.omp_set_num_threads(1)
-print("\n====== IVFPQPanorama", flush=True)
-for nprobe in [1, 2, 4, 8, 16]:
-    ivfpq_pano.nprobe = nprobe
-    eval_recall(ivfpq_pano, nprobe)
-
-print("\nVerification complete!", flush=True)

From 1ecda7eaa58eaf95ede4a70d4decda75973caf26 Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Sat, 21 Mar 2026 08:57:08 +0000
Subject: [PATCH 23/41] format and fix some merge bugs

---
 faiss/IndexFlat.h                             |  2 +-
 faiss/IndexHNSW.cpp                           |  2 +-
 faiss/IndexIVFPQPanorama.cpp                  | 38 ++++++++---------
 faiss/impl/Panorama.h                         |  6 ++-
 faiss/impl/PanoramaPQ.cpp                     | 19 ++++-----
 faiss/impl/index_read.cpp                     |  3 +-
 faiss/impl/index_write.cpp                    |  2 +-
 .../panorama_kernels-avx2.cpp                 | 17 ++++----
 .../panorama_kernels-avx512.cpp               | 41 ++++++++-----------
 .../panorama_kernels-generic.cpp              | 12 ++----
 faiss/index_factory.cpp                       |  2 +-
 11 files changed, 64 insertions(+), 80 deletions(-)

diff --git a/faiss/IndexFlat.h b/faiss/IndexFlat.h
index 632768e9ff..7e10f05b25 100644
--- a/faiss/IndexFlat.h
+++ b/faiss/IndexFlat.h
@@ -120,7 +120,7 @@ struct IndexFlatPanorama : IndexFlat {
             : IndexFlat(d_in, metric),
               batch_size(batch_size_in),
               n_levels(n_levels_in),
-              pano(code_size, n_levels_in, batch_size_in) {
+              pano(d_in, n_levels_in, batch_size_in) {
         FAISS_THROW_IF_NOT(
                 metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT);
     }
diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp
index e99796ef5a..8f2f5f3e8a 100644
--- a/faiss/IndexHNSW.cpp
+++ b/faiss/IndexHNSW.cpp
@@ -680,7 +680,7 @@ IndexHNSWFlatPanorama::IndexHNSWFlatPanorama(
         MetricType metric)
         : IndexHNSWFlat(d_in, M, metric),
           cum_sums(),
-          pano(d_in * sizeof(float), num_panorama_levels_in, 1),
+          pano(d_in, num_panorama_levels_in, 1),
           num_panorama_levels(num_panorama_levels_in) {
     // For now, we only support L2 distance.
     // Supporting dot product and cosine distance is a trivial addition
diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp
index 26c9eccbd3..4848326553 100644
--- a/faiss/IndexIVFPQPanorama.cpp
+++ b/faiss/IndexIVFPQPanorama.cpp
@@ -49,7 +49,8 @@ IndexIVFPQPanorama::IndexIVFPQPanorama(
             M == code_size, "M must equal code_size for 8-bit PQ");
     FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only L2 metric supported");
 
-    auto* pano = new PanoramaPQ(d, code_size, n_levels, batch_size, &pq, quantizer);
+    auto* pano =
+            new PanoramaPQ(d, code_size, n_levels, batch_size, &pq, quantizer);
     this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano);
     this->own_invlists = own_invlists;
 }
@@ -149,24 +150,23 @@ struct IVFPQScannerPanorama : InvertedListScanner {
         local_stats.reset();
 
         for (size_t batch_no = 0; batch_no < n_batches; batch_no++) {
-            size_t num_active =
-                    pano_pq->progressive_filter_batch<C, use_sel>(
-                            col_codes,
-                            list_cum_sums,
-                            list_init_dists,
-                            sim_table_2.data(),
-                            query_cum_norms.data(),
-                            dis0,
-                            list_size,
-                            batch_no,
-                            ids,
-                            sel,
-                            exact_distances,
-                            active_indices,
-                            bitset,
-                            compressed_codes,
-                            distances[0],
-                            local_stats);
+            size_t num_active = pano_pq->progressive_filter_batch<C, use_sel>(
+                    col_codes,
+                    list_cum_sums,
+                    list_init_dists,
+                    sim_table_2.data(),
+                    query_cum_norms.data(),
+                    dis0,
+                    list_size,
+                    batch_no,
+                    ids,
+                    sel,
+                    exact_distances,
+                    active_indices,
+                    bitset,
+                    compressed_codes,
+                    distances[0],
+                    local_stats);
 
             // Insert surviving candidates into heap.
             for (size_t i = 0; i < num_active; i++) {
diff --git a/faiss/impl/Panorama.h b/faiss/impl/Panorama.h
index 8f33fc8d1a..fcf3136d44 100644
--- a/faiss/impl/Panorama.h
+++ b/faiss/impl/Panorama.h
@@ -94,8 +94,10 @@ struct Panorama {
             size_t dest_idx,
             size_t src_idx) const;
 
-    virtual void reconstruct(idx_t key, float* recons, const uint8_t* codes_base)
-            const;
+    virtual void reconstruct(
+            idx_t key,
+            float* recons,
+            const uint8_t* codes_base) const;
 };
 
 /**
diff --git a/faiss/impl/PanoramaPQ.cpp b/faiss/impl/PanoramaPQ.cpp
index 02e70967b0..bd80f7f81c 100644
--- a/faiss/impl/PanoramaPQ.cpp
+++ b/faiss/impl/PanoramaPQ.cpp
@@ -7,7 +7,6 @@
 
 #include <faiss/impl/PanoramaPQ.h>
 
-#include <algorithm>
 #include <cmath>
 #include <vector>
 
@@ -59,9 +58,8 @@ void PanoramaPQ::reconstruct(
         size_t start_byte = level * cs;
 
         for (size_t ci = 0; ci < cs && (start_byte + ci) < code_size; ci++) {
-            recons_buffer[start_byte + ci] =
-                    codes_base[batch_offset + level_offset + ci * bs +
-                               pos_in_batch];
+            recons_buffer[start_byte + ci] = codes_base
+                    [batch_offset + level_offset + ci * bs + pos_in_batch];
         }
     }
 }
@@ -108,15 +106,14 @@ void PanoramaPQ::compute_cumulative_sums(
         size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1);
         for (size_t level = 0; level < n_levels; level++) {
             size_t start_idx = level * levels_size;
-            size_t out_offset = cumsum_batch_offset + level * batch_size +
-                    pos_in_batch;
-            cumsum_base[out_offset] = start_idx < d
-                    ? std::sqrt(suffix[start_idx])
-                    : 0.0f;
+            size_t out_offset =
+                    cumsum_batch_offset + level * batch_size + pos_in_batch;
+            cumsum_base[out_offset] =
+                    start_idx < d ? std::sqrt(suffix[start_idx]) : 0.0f;
         }
 
-        size_t last_offset = cumsum_batch_offset + n_levels * batch_size +
-                pos_in_batch;
+        size_t last_offset =
+                cumsum_batch_offset + n_levels * batch_size + pos_in_batch;
         cumsum_base[last_offset] = 0.0f;
     }
 }
diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index 3cd555e0d8..864cc52455 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -1508,8 +1508,7 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
                 if (list_size == 0)
                     continue;
                 size_t bs = pano_pq->batch_size;
-                size_t padded =
-                        ((list_size + bs - 1) / bs) * bs;
+                size_t padded = ((list_size + bs - 1) / bs) * bs;
                 storage->init_dists[list_no].resize(padded);
 
                 // Reconstruct row-major codes, then compute init distances.
diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp
index 5ebae71acf..66f0ed325f 100644
--- a/faiss/impl/index_write.cpp
+++ b/faiss/impl/index_write.cpp
@@ -34,8 +34,8 @@
 #include <faiss/IndexIVFFlatPanorama.h>
 #include <faiss/IndexIVFIndependentQuantizer.h>
 #include <faiss/IndexIVFPQ.h>
-#include <faiss/IndexIVFPQPanorama.h>
 #include <faiss/IndexIVFPQFastScan.h>
+#include <faiss/IndexIVFPQPanorama.h>
 #include <faiss/IndexIVFPQR.h>
 #include <faiss/IndexIVFRaBitQ.h>
 #include <faiss/IndexIVFRaBitQFastScan.h>
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
index 46728b1cdd..5633f6c874 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
@@ -100,15 +100,15 @@ void process_chunks(
             __m128i raw = _mm_loadl_epi64(
                     (__m128i*)(compressed_codes + chunk_offset + batch_idx));
             __m256i codes = _mm256_cvtepu8_epi32(raw);
-            __m256 m_dist = _mm256_i32gather_ps(
-                    sim_table_ptr, codes, sizeof(float));
+            __m256 m_dist =
+                    _mm256_i32gather_ps(sim_table_ptr, codes, sizeof(float));
             acc = _mm256_add_ps(acc, m_dist);
             _mm256_storeu_ps(exact_distances + batch_idx, acc);
         }
 
         for (; batch_idx < num_active; batch_idx += 1) {
-            exact_distances[batch_idx] += sim_table_ptr
-                    [compressed_codes[chunk_offset + batch_idx]];
+            exact_distances[batch_idx] +=
+                    sim_table_ptr[compressed_codes[chunk_offset + batch_idx]];
         }
     }
 }
@@ -127,8 +127,7 @@ size_t process_filtering(
     for (size_t i = 0; i < num_active; i++) {
         float exact_distance = exact_distances[i];
         float cum_sum = cum_sums[active_indices[i] - batch_offset];
-        float lower_bound =
-                exact_distance + dis0 - cum_sum * query_cum_norm;
+        float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm;
 
         bool keep = heap_max > lower_bound;
         active_indices[next_num_active] = active_indices[i];
@@ -170,8 +169,7 @@ std::pair<uint8_t*, size_t> process_code_compression(
             for (int g = 0; g < 8; g++) {
                 uint64_t bytes;
                 memcpy(&bytes, bitset + point_idx + g * 8, 8);
-                uint8_t bits = (uint8_t)_pext_u64(
-                        bytes, 0x0101010101010101ULL);
+                uint8_t bits = (uint8_t)_pext_u64(bytes, 0x0101010101010101ULL);
                 mask |= ((uint64_t)bits << (g * 8));
             }
 #else
@@ -196,8 +194,7 @@ std::pair<uint8_t*, size_t> process_code_compression(
                     memcpy(&src_val, src + g * 8, 8);
                     uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF);
                     uint64_t byte_mask =
-                            _pdep_u64(submask, 0x0101010101010101ULL) *
-                            0xFF;
+                            _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF;
                     uint64_t compressed_val = _pext_u64(src_val, byte_mask);
                     int count = __builtin_popcount(submask);
                     memcpy(dst + write_pos, &compressed_val, 8);
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
index 7733d5a6da..a73461a8dc 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
@@ -42,33 +42,29 @@ void process_chunks(
         for (; batch_idx + 15 < num_active; batch_idx += 16) {
             __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx);
 
-            __m128i comp0 =
-                    _mm_loadu_si128((__m128i*)(compressed_codes +
-                                               chunk_offset0 + batch_idx));
+            __m128i comp0 = _mm_loadu_si128(
+                    (__m128i*)(compressed_codes + chunk_offset0 + batch_idx));
             __m512i codes0 = _mm512_cvtepu8_epi32(comp0);
             acc = _mm512_add_ps(
                     acc,
                     _mm512_i32gather_ps(codes0, sim_table0, sizeof(float)));
 
-            __m128i comp1 =
-                    _mm_loadu_si128((__m128i*)(compressed_codes +
-                                               chunk_offset1 + batch_idx));
+            __m128i comp1 = _mm_loadu_si128(
+                    (__m128i*)(compressed_codes + chunk_offset1 + batch_idx));
             __m512i codes1 = _mm512_cvtepu8_epi32(comp1);
             acc = _mm512_add_ps(
                     acc,
                     _mm512_i32gather_ps(codes1, sim_table1, sizeof(float)));
 
-            __m128i comp2 =
-                    _mm_loadu_si128((__m128i*)(compressed_codes +
-                                               chunk_offset2 + batch_idx));
+            __m128i comp2 = _mm_loadu_si128(
+                    (__m128i*)(compressed_codes + chunk_offset2 + batch_idx));
             __m512i codes2 = _mm512_cvtepu8_epi32(comp2);
             acc = _mm512_add_ps(
                     acc,
                     _mm512_i32gather_ps(codes2, sim_table2, sizeof(float)));
 
-            __m128i comp3 =
-                    _mm_loadu_si128((__m128i*)(compressed_codes +
-                                               chunk_offset3 + batch_idx));
+            __m128i comp3 = _mm_loadu_si128(
+                    (__m128i*)(compressed_codes + chunk_offset3 + batch_idx));
             __m512i codes3 = _mm512_cvtepu8_epi32(comp3);
             acc = _mm512_add_ps(
                     acc,
@@ -94,18 +90,18 @@ void process_chunks(
         size_t batch_idx = 0;
         for (; batch_idx + 15 < num_active; batch_idx += 16) {
             __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx);
-            __m128i comp = _mm_loadu_si128((
-                    __m128i*)(compressed_codes + chunk_offset + batch_idx));
+            __m128i comp = _mm_loadu_si128(
+                    (__m128i*)(compressed_codes + chunk_offset + batch_idx));
             __m512i codes = _mm512_cvtepu8_epi32(comp);
-            __m512 m_dist = _mm512_i32gather_ps(
-                    codes, sim_table_ptr, sizeof(float));
+            __m512 m_dist =
+                    _mm512_i32gather_ps(codes, sim_table_ptr, sizeof(float));
             acc = _mm512_add_ps(acc, m_dist);
             _mm512_storeu_ps(exact_distances + batch_idx, acc);
         }
 
         for (; batch_idx < num_active; batch_idx += 1) {
-            exact_distances[batch_idx] += sim_table_ptr
-                    [compressed_codes[chunk_offset + batch_idx]];
+            exact_distances[batch_idx] +=
+                    sim_table_ptr[compressed_codes[chunk_offset + batch_idx]];
         }
     }
 }
@@ -124,8 +120,7 @@ size_t process_filtering(
     for (size_t i = 0; i < num_active; i++) {
         float exact_distance = exact_distances[i];
         float cum_sum = cum_sums[active_indices[i] - batch_offset];
-        float lower_bound =
-                exact_distance + dis0 - cum_sum * query_cum_norm;
+        float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm;
 
         bool keep = heap_max > lower_bound;
         active_indices[next_num_active] = active_indices[i];
@@ -169,8 +164,7 @@ std::pair<uint8_t*, size_t> process_code_compression(
             for (int g = 0; g < 8; g++) {
                 uint64_t bytes;
                 memcpy(&bytes, bitset + point_idx + g * 8, 8);
-                uint8_t bits = (uint8_t)_pext_u64(
-                        bytes, 0x0101010101010101ULL);
+                uint8_t bits = (uint8_t)_pext_u64(bytes, 0x0101010101010101ULL);
                 mask |= ((uint64_t)bits << (g * 8));
             }
 #else
@@ -196,8 +190,7 @@ std::pair<uint8_t*, size_t> process_code_compression(
                     memcpy(&src_val, src + g * 8, 8);
                     uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF);
                     uint64_t byte_mask =
-                            _pdep_u64(submask, 0x0101010101010101ULL) *
-                            0xFF;
+                            _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF;
                     uint64_t compressed_val = _pext_u64(src_val, byte_mask);
                     int count = __builtin_popcount(submask);
                     memcpy(dst + write_pos, &compressed_val, 8);
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
index cfd1283c80..73d5ba24d9 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
@@ -32,8 +32,7 @@ void process_chunks(
         size_t chunk_offset = chunk_idx * max_batch_size;
         float* chunk_sim = sim_table + chunk_idx * 256;
         for (size_t i = 0; i < num_active; i++) {
-            exact_distances[i] +=
-                    chunk_sim[compressed_codes[chunk_offset + i]];
+            exact_distances[i] += chunk_sim[compressed_codes[chunk_offset + i]];
         }
     }
 }
@@ -52,8 +51,7 @@ size_t process_filtering(
     for (size_t i = 0; i < num_active; i++) {
         float exact_distance = exact_distances[i];
         float cum_sum = cum_sums[active_indices[i] - batch_offset];
-        float lower_bound =
-                exact_distance + dis0 - cum_sum * query_cum_norm;
+        float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm;
 
         bool keep = heap_max > lower_bound;
         active_indices[next_num_active] = active_indices[i];
@@ -87,8 +85,7 @@ std::pair<uint8_t*, size_t> process_code_compression(
             for (int g = 0; g < 8; g++) {
                 uint64_t bytes;
                 memcpy(&bytes, bitset + point_idx + g * 8, 8);
-                uint8_t bits = (uint8_t)_pext_u64(
-                        bytes, 0x0101010101010101ULL);
+                uint8_t bits = (uint8_t)_pext_u64(bytes, 0x0101010101010101ULL);
                 mask |= ((uint64_t)bits << (g * 8));
             }
 #else
@@ -113,8 +110,7 @@ std::pair<uint8_t*, size_t> process_code_compression(
                     memcpy(&src_val, src + g * 8, 8);
                     uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF);
                     uint64_t byte_mask =
-                            _pdep_u64(submask, 0x0101010101010101ULL) *
-                            0xFF;
+                            _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF;
                     uint64_t compressed_val = _pext_u64(src_val, byte_mask);
                     int count = __builtin_popcount(submask);
                     memcpy(dst + write_pos, &compressed_val, 8);
diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp
index ba8050962d..8acbd02003 100644
--- a/faiss/index_factory.cpp
+++ b/faiss/index_factory.cpp
@@ -29,8 +29,8 @@
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/IndexIVFFlatPanorama.h>
 #include <faiss/IndexIVFPQ.h>
-#include <faiss/IndexIVFPQPanorama.h>
 #include <faiss/IndexIVFPQFastScan.h>
+#include <faiss/IndexIVFPQPanorama.h>
 #include <faiss/IndexIVFPQR.h>
 #include <faiss/IndexIVFRaBitQ.h>
 #include <faiss/IndexIVFRaBitQFastScan.h>

From 5a599c46f9cdedef3e29b790305e1ea95ade39c5 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sat, 21 Mar 2026 17:55:06 +0000
Subject: [PATCH 24/41] Fix bug: resize init_dists in ArrayInvertedListsPanorama constructor

---
 faiss/invlists/InvertedLists.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp
index 41638f2e39..63a4d3383c 100644
--- a/faiss/invlists/InvertedLists.cpp
+++ b/faiss/invlists/InvertedLists.cpp
@@ -363,6 +363,7 @@ ArrayInvertedListsPanorama::ArrayInvertedListsPanorama(
             !use_iterator, "Panorama does not support iterators");
 
     cum_sums.resize(nlist_in);
+    init_dists.resize(nlist_in);
 }
 
 const float* ArrayInvertedListsPanorama::get_cum_sums(size_t list_no) const {

From 5cf76c49717bdcdbfde0b0937c690734f29fc42d Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 00:20:24 +0000
Subject: [PATCH 25/41] Replace "chunk_size" with "level_width_bytes"

---
 faiss/IndexIVFPQPanorama.cpp                  |  5 +-
 faiss/IndexIVFPQPanorama.h                    |  3 +-
 faiss/impl/PanoramaPQ.cpp                     | 27 ++++----
 faiss/impl/PanoramaPQ.h                       | 19 +++---
 faiss/impl/index_read.cpp                     |  1 -
 .../panorama_kernels-avx2.cpp                 | 68 +++++++++----------
 .../panorama_kernels-avx512.cpp               | 68 +++++++++----------
 .../panorama_kernels-generic.cpp              | 28 ++++----
 .../impl/panorama_kernels/panorama_kernels.h  |  6 +-
 9 files changed, 110 insertions(+), 115 deletions(-)

diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp
index 4848326553..92862713f7 100644
--- a/faiss/IndexIVFPQPanorama.cpp
+++ b/faiss/IndexIVFPQPanorama.cpp
@@ -38,7 +38,6 @@ IndexIVFPQPanorama::IndexIVFPQPanorama(
         : IndexIVFPQ(quantizer, d, nlist, M, nbits_per_idx, metric, false),
           n_levels(n_levels),
           batch_size(batch_size),
-          chunk_size(code_size / n_levels),
           levels_size(d / n_levels) {
     FAISS_THROW_IF_NOT_MSG(
             M % n_levels == 0, "M must be divisible by n_levels");
@@ -132,7 +131,7 @@ struct IVFPQScannerPanorama : InvertedListScanner {
         size_t nup = 0;
 
         const size_t bs = index.batch_size;
-        const size_t cs = index.chunk_size;
+        const size_t ls = pano_pq->level_width_bytes;
 
         const size_t n_batches = (list_size + bs - 1) / bs;
         const uint8_t* col_codes = storage->get_codes(list_no);
@@ -143,7 +142,7 @@ struct IVFPQScannerPanorama : InvertedListScanner {
         std::vector<float> exact_distances(bs);
         std::vector<uint8_t> bitset(bs);
         std::vector<uint32_t> active_indices(bs);
-        std::vector<uint8_t> compressed_codes(bs * cs);
+        std::vector<uint8_t> compressed_codes(bs * ls);
         float dis0 = coarse_dis;
 
         PanoramaStats local_stats;
diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h
index 717308bb07..74d38677e1 100644
--- a/faiss/IndexIVFPQPanorama.h
+++ b/faiss/IndexIVFPQPanorama.h
@@ -30,7 +30,7 @@ namespace faiss {
 /// Panorama transposes codes into column-major within each batch:
 /// for each batch of `batch_size` points, codes are stored as
 /// M columns of `batch_size` bytes each. The M columns are grouped
-/// into `n_levels` levels of `chunk_size` columns, enabling incremental
+/// into `n_levels` levels of `level_width_bytes` columns, enabling incremental
 /// distance computation level-by-level.
 ///
 /// Storage is managed by ArrayInvertedListsPanorama with a PanoramaPQ
@@ -60,7 +60,6 @@ struct IndexIVFPQPanorama : public IndexIVFPQ {
     int n_levels;
     size_t batch_size;
 
-    size_t chunk_size;
     size_t levels_size;
 
     IndexIVFPQPanorama(
diff --git a/faiss/impl/PanoramaPQ.cpp b/faiss/impl/PanoramaPQ.cpp
index bd80f7f81c..f535ae9b84 100644
--- a/faiss/impl/PanoramaPQ.cpp
+++ b/faiss/impl/PanoramaPQ.cpp
@@ -19,7 +19,7 @@ void PanoramaPQ::copy_codes_to_level_layout(
         size_t offset,
         size_t n_entry,
         const uint8_t* code) {
-    const size_t cs = chunk_size;
+    const size_t ls = level_width_bytes;
     const size_t bs = batch_size;
 
     for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) {
@@ -29,13 +29,13 @@ void PanoramaPQ::copy_codes_to_level_layout(
         size_t batch_offset = batch_no * bs * code_size;
 
         for (size_t level = 0; level < n_levels; level++) {
-            size_t level_offset = level * cs * bs;
-            size_t start_byte = level * cs;
+            size_t level_offset = level * ls * bs;
+            size_t start_byte = level * ls;
 
-            for (size_t ci = 0; ci < cs && (start_byte + ci) < code_size;
-                 ci++) {
-                codes[batch_offset + level_offset + ci * bs + pos_in_batch] =
-                        code[entry_idx * code_size + start_byte + ci];
+            for (size_t li = 0; li < ls && (start_byte + li) < code_size;
+                 li++) {
+                codes[batch_offset + level_offset + li * bs + pos_in_batch] =
+                        code[entry_idx * code_size + start_byte + li];
             }
         }
     }
@@ -46,7 +46,7 @@ void PanoramaPQ::reconstruct(
         float* recons,
         const uint8_t* codes_base) const {
     uint8_t* recons_buffer = reinterpret_cast<uint8_t*>(recons);
-    const size_t cs = chunk_size;
+    const size_t ls = level_width_bytes;
     const size_t bs = batch_size;
 
     size_t batch_no = key / bs;
@@ -54,12 +54,12 @@ void PanoramaPQ::reconstruct(
     size_t batch_offset = batch_no * bs * code_size;
 
     for (size_t level = 0; level < n_levels; level++) {
-        size_t level_offset = level * cs * bs;
-        size_t start_byte = level * cs;
+        size_t level_offset = level * ls * bs;
+        size_t start_byte = level * ls;
 
-        for (size_t ci = 0; ci < cs && (start_byte + ci) < code_size; ci++) {
-            recons_buffer[start_byte + ci] = codes_base
-                    [batch_offset + level_offset + ci * bs + pos_in_batch];
+        for (size_t li = 0; li < ls && (start_byte + li) < code_size; li++) {
+            recons_buffer[start_byte + li] = codes_base
+                    [batch_offset + level_offset + li * bs + pos_in_batch];
         }
     }
 }
@@ -74,7 +74,6 @@ PanoramaPQ::PanoramaPQ(
         : Panorama(d, code_size, n_levels, batch_size),
           pq(pq),
           quantizer(quantizer),
-          chunk_size(code_size / n_levels),
           levels_size(d / n_levels) {
     FAISS_THROW_IF_NOT_MSG(
             code_size % n_levels == 0,
diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h
index a8e35a5699..a3901db795 100644
--- a/faiss/impl/PanoramaPQ.h
+++ b/faiss/impl/PanoramaPQ.h
@@ -28,7 +28,6 @@ namespace faiss {
 struct PanoramaPQ : Panorama {
     const ProductQuantizer* pq = nullptr;
     const Index* quantizer = nullptr;
-    size_t chunk_size = 0;
     size_t levels_size = 0;
 
     PanoramaPQ() = default;
@@ -107,7 +106,7 @@ struct PanoramaPQ : Panorama {
             float threshold,
             PanoramaStats& local_stats) const {
         const size_t bs = batch_size;
-        const size_t cs = chunk_size;
+        const size_t ls = level_width_bytes;
         const size_t ksub = pq->ksub;
 
         size_t curr_batch_size = std::min(list_size - batch_no * bs, bs);
@@ -149,12 +148,12 @@ struct PanoramaPQ : Panorama {
              level++) {
             local_stats.total_dims_scanned += next_num_active;
 
-            size_t level_sim_offset = level * ksub * cs;
+            size_t level_sim_offset = level * ksub * ls;
 
             float query_cum_norm = 2 * query_cum_norms[level + 1];
 
             const float* cum_sums_level = batch_cums + bs * (level + 1);
-            const uint8_t* codes_level = batch_codes + bs * cs * level;
+            const uint8_t* codes_level = batch_codes + bs * ls * level;
 
             const float* sim_table_level = sim_table_2 + level_sim_offset;
 
@@ -162,13 +161,13 @@ struct PanoramaPQ : Panorama {
 
             size_t num_active_for_filtering = 0;
             if (is_sparse) {
-                for (size_t ci = 0; ci < cs; ci++) {
-                    size_t chunk_off = ci * bs;
-                    const float* chunk_sim = sim_table_level + ci * ksub;
+                for (size_t li = 0; li < ls; li++) {
+                    size_t byte_off = li * bs;
+                    const float* chunk_sim = sim_table_level + li * ksub;
                     for (size_t i = 0; i < next_num_active; i++) {
                         size_t real_idx = active_indices[i] - batch_offset;
                         exact_distances[i] +=
-                                chunk_sim[codes_level[chunk_off + real_idx]];
+                                chunk_sim[codes_level[byte_off + real_idx]];
                     }
                 }
                 num_active_for_filtering = next_num_active;
@@ -176,13 +175,13 @@ struct PanoramaPQ : Panorama {
                 auto [cc, na] = panorama_kernels::process_code_compression(
                         next_num_active,
                         bs,
-                        cs,
+                        ls,
                         compressed_codes.data(),
                         bitset.data(),
                         codes_level);
 
                 panorama_kernels::process_chunks(
-                        cs,
+                        ls,
                         bs,
                         na,
                         const_cast<float*>(sim_table_level),
diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index 864cc52455..d9b9fceb9d 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -1485,7 +1485,6 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         read_ProductQuantizer(&ivpp->pq, f);
         READ1(ivpp->n_levels);
         READ1(ivpp->batch_size);
-        ivpp->chunk_size = ivpp->code_size / ivpp->n_levels;
         ivpp->levels_size = ivpp->d / ivpp->n_levels;
         read_InvertedLists(*ivpp, f, io_flags);
         // The "ilpn" reader creates a PanoramaFlat placeholder; replace
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
index 5633f6c874..ff5a86d678 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
@@ -23,26 +23,26 @@ namespace faiss {
 namespace panorama_kernels {
 
 void process_chunks(
-        size_t chunk_size,
+        size_t level_width_bytes,
         size_t max_batch_size,
         size_t num_active,
         float* sim_table,
         uint8_t* compressed_codes,
         float* exact_distances) {
-    size_t chunk_idx = 0;
+    size_t byte_idx = 0;
 
     // Process 4 chunks at a time to amortize loop overhead and keep
     // the accumulator in registers across chunks.
-    for (; chunk_idx + 3 < chunk_size; chunk_idx += 4) {
-        size_t chunk_offset0 = (chunk_idx + 0) * max_batch_size;
-        size_t chunk_offset1 = (chunk_idx + 1) * max_batch_size;
-        size_t chunk_offset2 = (chunk_idx + 2) * max_batch_size;
-        size_t chunk_offset3 = (chunk_idx + 3) * max_batch_size;
+    for (; byte_idx + 3 < level_width_bytes; byte_idx += 4) {
+        size_t byte_offset0 = (byte_idx + 0) * max_batch_size;
+        size_t byte_offset1 = (byte_idx + 1) * max_batch_size;
+        size_t byte_offset2 = (byte_idx + 2) * max_batch_size;
+        size_t byte_offset3 = (byte_idx + 3) * max_batch_size;
 
-        float* sim_table0 = sim_table + (chunk_idx + 0) * 256;
-        float* sim_table1 = sim_table + (chunk_idx + 1) * 256;
-        float* sim_table2 = sim_table + (chunk_idx + 2) * 256;
-        float* sim_table3 = sim_table + (chunk_idx + 3) * 256;
+        float* sim_table0 = sim_table + (byte_idx + 0) * 256;
+        float* sim_table1 = sim_table + (byte_idx + 1) * 256;
+        float* sim_table2 = sim_table + (byte_idx + 2) * 256;
+        float* sim_table3 = sim_table + (byte_idx + 3) * 256;
 
         size_t batch_idx = 0;
         for (; batch_idx + 7 < num_active; batch_idx += 8) {
@@ -50,28 +50,28 @@ void process_chunks(
 
             // Load 8 byte codes, zero-extend to 32-bit indices.
             __m128i raw0 = _mm_loadl_epi64(
-                    (__m128i*)(compressed_codes + chunk_offset0 + batch_idx));
+                    (__m128i*)(compressed_codes + byte_offset0 + batch_idx));
             __m256i codes0 = _mm256_cvtepu8_epi32(raw0);
             acc = _mm256_add_ps(
                     acc,
                     _mm256_i32gather_ps(sim_table0, codes0, sizeof(float)));
 
             __m128i raw1 = _mm_loadl_epi64(
-                    (__m128i*)(compressed_codes + chunk_offset1 + batch_idx));
+                    (__m128i*)(compressed_codes + byte_offset1 + batch_idx));
             __m256i codes1 = _mm256_cvtepu8_epi32(raw1);
             acc = _mm256_add_ps(
                     acc,
                     _mm256_i32gather_ps(sim_table1, codes1, sizeof(float)));
 
             __m128i raw2 = _mm_loadl_epi64(
-                    (__m128i*)(compressed_codes + chunk_offset2 + batch_idx));
+                    (__m128i*)(compressed_codes + byte_offset2 + batch_idx));
             __m256i codes2 = _mm256_cvtepu8_epi32(raw2);
             acc = _mm256_add_ps(
                     acc,
                     _mm256_i32gather_ps(sim_table2, codes2, sizeof(float)));
 
             __m128i raw3 = _mm_loadl_epi64(
-                    (__m128i*)(compressed_codes + chunk_offset3 + batch_idx));
+                    (__m128i*)(compressed_codes + byte_offset3 + batch_idx));
             __m256i codes3 = _mm256_cvtepu8_epi32(raw3);
             acc = _mm256_add_ps(
                     acc,
@@ -82,23 +82,23 @@ void process_chunks(
 
         for (; batch_idx < num_active; batch_idx += 1) {
             float acc = exact_distances[batch_idx];
-            acc += sim_table0[compressed_codes[chunk_offset0 + batch_idx]];
-            acc += sim_table1[compressed_codes[chunk_offset1 + batch_idx]];
-            acc += sim_table2[compressed_codes[chunk_offset2 + batch_idx]];
-            acc += sim_table3[compressed_codes[chunk_offset3 + batch_idx]];
+            acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]];
+            acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]];
+            acc += sim_table2[compressed_codes[byte_offset2 + batch_idx]];
+            acc += sim_table3[compressed_codes[byte_offset3 + batch_idx]];
             exact_distances[batch_idx] = acc;
         }
     }
 
-    for (; chunk_idx < chunk_size; chunk_idx++) {
-        size_t chunk_offset = chunk_idx * max_batch_size;
-        float* sim_table_ptr = sim_table + chunk_idx * 256;
+    for (; byte_idx < level_width_bytes; byte_idx++) {
+        size_t byte_offset = byte_idx * max_batch_size;
+        float* sim_table_ptr = sim_table + byte_idx * 256;
 
         size_t batch_idx = 0;
         for (; batch_idx + 7 < num_active; batch_idx += 8) {
             __m256 acc = _mm256_loadu_ps(exact_distances + batch_idx);
             __m128i raw = _mm_loadl_epi64(
-                    (__m128i*)(compressed_codes + chunk_offset + batch_idx));
+                    (__m128i*)(compressed_codes + byte_offset + batch_idx));
             __m256i codes = _mm256_cvtepu8_epi32(raw);
             __m256 m_dist =
                     _mm256_i32gather_ps(sim_table_ptr, codes, sizeof(float));
@@ -108,7 +108,7 @@ void process_chunks(
 
         for (; batch_idx < num_active; batch_idx += 1) {
             exact_distances[batch_idx] +=
-                    sim_table_ptr[compressed_codes[chunk_offset + batch_idx]];
+                    sim_table_ptr[compressed_codes[byte_offset + batch_idx]];
         }
     }
 }
@@ -141,7 +141,7 @@ size_t process_filtering(
 std::pair<uint8_t*, size_t> process_code_compression(
         size_t next_num_active,
         size_t max_batch_size,
-        size_t chunk_size,
+        size_t level_width_bytes,
         uint8_t* compressed_codes_begin,
         uint8_t* bitset,
         const uint8_t* codes) {
@@ -154,7 +154,7 @@ std::pair<uint8_t*, size_t> process_code_compression(
         // Compress the codes: here we don't need to process remainders
         // as long as `max_batch_size` is a multiple of 64 (which we
         // assert in the constructor). Conveniently, compressed_codes is
-        // allocated to `max_batch_size` * `chunk_size` elements.
+        // allocated to `max_batch_size` * `level_width_bytes` elements.
         // `num_active` is guaranteed to always be less than or equal to
         // `max_batch_size`. Only the last batch may be smaller than
         // `max_batch_size`, the caller ensures that the batch and
@@ -184,10 +184,10 @@ std::pair<uint8_t*, size_t> process_code_compression(
             // PEXT/PDEP path: process 8 bytes at a time. PDEP
             // expands the per-byte mask bits into a per-byte lane
             // mask, then PEXT extracts only the selected bytes.
-            for (size_t ci = 0; ci < chunk_size; ci++) {
-                size_t chunk_offset = ci * max_batch_size;
-                const uint8_t* src = codes + chunk_offset + point_idx;
-                uint8_t* dst = compressed_codes + chunk_offset + num_active;
+            for (size_t ci = 0; ci < level_width_bytes; ci++) {
+                size_t byte_offset = ci * max_batch_size;
+                const uint8_t* src = codes + byte_offset + point_idx;
+                uint8_t* dst = compressed_codes + byte_offset + num_active;
                 int write_pos = 0;
                 for (int g = 0; g < 8; g++) {
                     uint64_t src_val;
@@ -204,10 +204,10 @@ std::pair<uint8_t*, size_t> process_code_compression(
 #else
             // Scalar fallback: scan set bits one by one and copy
             // the corresponding code byte.
-            for (size_t ci = 0; ci < chunk_size; ci++) {
-                size_t chunk_offset = ci * max_batch_size;
-                const uint8_t* src = codes + chunk_offset + point_idx;
-                uint8_t* dst = compressed_codes + chunk_offset + num_active;
+            for (size_t ci = 0; ci < level_width_bytes; ci++) {
+                size_t byte_offset = ci * max_batch_size;
+                const uint8_t* src = codes + byte_offset + point_idx;
+                uint8_t* dst = compressed_codes + byte_offset + num_active;
                 int write_pos = 0;
                 uint64_t m = mask;
                 while (m) {
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
index a73461a8dc..e8cdb93af7 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
@@ -17,54 +17,54 @@ namespace faiss {
 namespace panorama_kernels {
 
 void process_chunks(
-        size_t chunk_size,
+        size_t level_width_bytes,
         size_t max_batch_size,
         size_t num_active,
         float* sim_table,
         uint8_t* compressed_codes,
         float* exact_distances) {
-    size_t chunk_idx = 0;
+    size_t byte_idx = 0;
 
     // Process 4 chunks at a time to amortize loop overhead and keep
     // the accumulator in registers across chunks.
-    for (; chunk_idx + 3 < chunk_size; chunk_idx += 4) {
-        size_t chunk_offset0 = (chunk_idx + 0) * max_batch_size;
-        size_t chunk_offset1 = (chunk_idx + 1) * max_batch_size;
-        size_t chunk_offset2 = (chunk_idx + 2) * max_batch_size;
-        size_t chunk_offset3 = (chunk_idx + 3) * max_batch_size;
+    for (; byte_idx + 3 < level_width_bytes; byte_idx += 4) {
+        size_t byte_offset0 = (byte_idx + 0) * max_batch_size;
+        size_t byte_offset1 = (byte_idx + 1) * max_batch_size;
+        size_t byte_offset2 = (byte_idx + 2) * max_batch_size;
+        size_t byte_offset3 = (byte_idx + 3) * max_batch_size;
 
-        float* sim_table0 = sim_table + (chunk_idx + 0) * 256;
-        float* sim_table1 = sim_table + (chunk_idx + 1) * 256;
-        float* sim_table2 = sim_table + (chunk_idx + 2) * 256;
-        float* sim_table3 = sim_table + (chunk_idx + 3) * 256;
+        float* sim_table0 = sim_table + (byte_idx + 0) * 256;
+        float* sim_table1 = sim_table + (byte_idx + 1) * 256;
+        float* sim_table2 = sim_table + (byte_idx + 2) * 256;
+        float* sim_table3 = sim_table + (byte_idx + 3) * 256;
 
         size_t batch_idx = 0;
         for (; batch_idx + 15 < num_active; batch_idx += 16) {
             __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx);
 
             __m128i comp0 = _mm_loadu_si128(
-                    (__m128i*)(compressed_codes + chunk_offset0 + batch_idx));
+                    (__m128i*)(compressed_codes + byte_offset0 + batch_idx));
             __m512i codes0 = _mm512_cvtepu8_epi32(comp0);
             acc = _mm512_add_ps(
                     acc,
                     _mm512_i32gather_ps(codes0, sim_table0, sizeof(float)));
 
             __m128i comp1 = _mm_loadu_si128(
-                    (__m128i*)(compressed_codes + chunk_offset1 + batch_idx));
+                    (__m128i*)(compressed_codes + byte_offset1 + batch_idx));
             __m512i codes1 = _mm512_cvtepu8_epi32(comp1);
             acc = _mm512_add_ps(
                     acc,
                     _mm512_i32gather_ps(codes1, sim_table1, sizeof(float)));
 
             __m128i comp2 = _mm_loadu_si128(
-                    (__m128i*)(compressed_codes + chunk_offset2 + batch_idx));
+                    (__m128i*)(compressed_codes + byte_offset2 + batch_idx));
             __m512i codes2 = _mm512_cvtepu8_epi32(comp2);
             acc = _mm512_add_ps(
                     acc,
                     _mm512_i32gather_ps(codes2, sim_table2, sizeof(float)));
 
             __m128i comp3 = _mm_loadu_si128(
-                    (__m128i*)(compressed_codes + chunk_offset3 + batch_idx));
+                    (__m128i*)(compressed_codes + byte_offset3 + batch_idx));
             __m512i codes3 = _mm512_cvtepu8_epi32(comp3);
             acc = _mm512_add_ps(
                     acc,
@@ -75,23 +75,23 @@ void process_chunks(
 
         for (; batch_idx < num_active; batch_idx += 1) {
             float acc = exact_distances[batch_idx];
-            acc += sim_table0[compressed_codes[chunk_offset0 + batch_idx]];
-            acc += sim_table1[compressed_codes[chunk_offset1 + batch_idx]];
-            acc += sim_table2[compressed_codes[chunk_offset2 + batch_idx]];
-            acc += sim_table3[compressed_codes[chunk_offset3 + batch_idx]];
+            acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]];
+            acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]];
+            acc += sim_table2[compressed_codes[byte_offset2 + batch_idx]];
+            acc += sim_table3[compressed_codes[byte_offset3 + batch_idx]];
             exact_distances[batch_idx] = acc;
         }
     }
 
-    for (; chunk_idx < chunk_size; chunk_idx++) {
-        size_t chunk_offset = chunk_idx * max_batch_size;
-        float* sim_table_ptr = sim_table + chunk_idx * 256;
+    for (; byte_idx < level_width_bytes; byte_idx++) {
+        size_t byte_offset = byte_idx * max_batch_size;
+        float* sim_table_ptr = sim_table + byte_idx * 256;
 
         size_t batch_idx = 0;
         for (; batch_idx + 15 < num_active; batch_idx += 16) {
             __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx);
             __m128i comp = _mm_loadu_si128(
-                    (__m128i*)(compressed_codes + chunk_offset + batch_idx));
+                    (__m128i*)(compressed_codes + byte_offset + batch_idx));
             __m512i codes = _mm512_cvtepu8_epi32(comp);
             __m512 m_dist =
                     _mm512_i32gather_ps(codes, sim_table_ptr, sizeof(float));
@@ -101,7 +101,7 @@ void process_chunks(
 
         for (; batch_idx < num_active; batch_idx += 1) {
             exact_distances[batch_idx] +=
-                    sim_table_ptr[compressed_codes[chunk_offset + batch_idx]];
+                    sim_table_ptr[compressed_codes[byte_offset + batch_idx]];
         }
     }
 }
@@ -134,7 +134,7 @@ size_t process_filtering(
 std::pair<uint8_t*, size_t> process_code_compression(
         size_t next_num_active,
         size_t max_batch_size,
-        size_t chunk_size,
+        size_t level_width_bytes,
         uint8_t* compressed_codes_begin,
         uint8_t* bitset,
         const uint8_t* codes) {
@@ -147,7 +147,7 @@ std::pair<uint8_t*, size_t> process_code_compression(
         // Compress the codes: here we don't need to process remainders
         // as long as `max_batch_size` is a multiple of 64 (which we
         // assert in the constructor). Conveniently, compressed_codes is
-        // allocated to `max_batch_size` * `chunk_size` elements.
+        // allocated to `max_batch_size` * `level_width_bytes` elements.
         // `num_active` is guaranteed to always be less than or equal to
         // `max_batch_size`. Only the last batch may be smaller than
         // `max_batch_size`, the caller ensures that the batch and
@@ -180,10 +180,10 @@ std::pair<uint8_t*, size_t> process_code_compression(
             // PEXT/PDEP path: process 8 bytes at a time. PDEP
             // expands the per-byte mask bits into a per-byte lane
             // mask, then PEXT extracts only the selected bytes.
-            for (size_t ci = 0; ci < chunk_size; ci++) {
-                size_t chunk_offset = ci * max_batch_size;
-                const uint8_t* src = codes + chunk_offset + point_idx;
-                uint8_t* dst = compressed_codes + chunk_offset + num_active;
+            for (size_t ci = 0; ci < level_width_bytes; ci++) {
+                size_t byte_offset = ci * max_batch_size;
+                const uint8_t* src = codes + byte_offset + point_idx;
+                uint8_t* dst = compressed_codes + byte_offset + num_active;
                 int write_pos = 0;
                 for (int g = 0; g < 8; g++) {
                     uint64_t src_val;
@@ -200,10 +200,10 @@ std::pair<uint8_t*, size_t> process_code_compression(
 #else
             // Scalar fallback: scan set bits one by one and copy
             // the corresponding code byte.
-            for (size_t ci = 0; ci < chunk_size; ci++) {
-                size_t chunk_offset = ci * max_batch_size;
-                const uint8_t* src = codes + chunk_offset + point_idx;
-                uint8_t* dst = compressed_codes + chunk_offset + num_active;
+            for (size_t ci = 0; ci < level_width_bytes; ci++) {
+                size_t byte_offset = ci * max_batch_size;
+                const uint8_t* src = codes + byte_offset + point_idx;
+                uint8_t* dst = compressed_codes + byte_offset + num_active;
                 int write_pos = 0;
                 uint64_t m = mask;
                 while (m) {
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
index 73d5ba24d9..603e51bc73 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
@@ -22,17 +22,17 @@ namespace faiss {
 namespace panorama_kernels {
 
 void process_chunks(
-        size_t chunk_size,
+        size_t level_width_bytes,
         size_t max_batch_size,
         size_t num_active,
         float* sim_table,
         uint8_t* compressed_codes,
         float* exact_distances) {
-    for (size_t chunk_idx = 0; chunk_idx < chunk_size; chunk_idx++) {
-        size_t chunk_offset = chunk_idx * max_batch_size;
-        float* chunk_sim = sim_table + chunk_idx * 256;
+    for (size_t byte_idx = 0; byte_idx < level_width_bytes; byte_idx++) {
+        size_t byte_offset = byte_idx * max_batch_size;
+        float* chunk_sim = sim_table + byte_idx * 256;
         for (size_t i = 0; i < num_active; i++) {
-            exact_distances[i] += chunk_sim[compressed_codes[chunk_offset + i]];
+            exact_distances[i] += chunk_sim[compressed_codes[byte_offset + i]];
         }
     }
 }
@@ -65,7 +65,7 @@ size_t process_filtering(
 std::pair<uint8_t*, size_t> process_code_compression(
         size_t next_num_active,
         size_t max_batch_size,
-        size_t chunk_size,
+        size_t level_width_bytes,
         uint8_t* compressed_codes_begin,
         uint8_t* bitset,
         const uint8_t* codes) {
@@ -100,10 +100,10 @@ std::pair<uint8_t*, size_t> process_code_compression(
             // PEXT/PDEP path: process 8 bytes at a time. PDEP
             // expands the per-byte mask bits into a per-byte lane
             // mask, then PEXT extracts only the selected bytes.
-            for (size_t ci = 0; ci < chunk_size; ci++) {
-                size_t chunk_offset = ci * max_batch_size;
-                const uint8_t* src = codes + chunk_offset + point_idx;
-                uint8_t* dst = compressed_codes + chunk_offset + num_active;
+            for (size_t ci = 0; ci < level_width_bytes; ci++) {
+                size_t byte_offset = ci * max_batch_size;
+                const uint8_t* src = codes + byte_offset + point_idx;
+                uint8_t* dst = compressed_codes + byte_offset + num_active;
                 int write_pos = 0;
                 for (int g = 0; g < 8; g++) {
                     uint64_t src_val;
@@ -120,10 +120,10 @@ std::pair<uint8_t*, size_t> process_code_compression(
 #else
             // Scalar fallback: scan set bits one by one and copy
             // the corresponding code byte.
-            for (size_t ci = 0; ci < chunk_size; ci++) {
-                size_t chunk_offset = ci * max_batch_size;
-                const uint8_t* src = codes + chunk_offset + point_idx;
-                uint8_t* dst = compressed_codes + chunk_offset + num_active;
+            for (size_t ci = 0; ci < level_width_bytes; ci++) {
+                size_t byte_offset = ci * max_batch_size;
+                const uint8_t* src = codes + byte_offset + point_idx;
+                uint8_t* dst = compressed_codes + byte_offset + num_active;
                 int write_pos = 0;
                 uint64_t m = mask;
                 while (m) {
diff --git a/faiss/impl/panorama_kernels/panorama_kernels.h b/faiss/impl/panorama_kernels/panorama_kernels.h
index aed8a87660..1ff74086e6 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels.h
+++ b/faiss/impl/panorama_kernels/panorama_kernels.h
@@ -35,7 +35,7 @@ namespace panorama_kernels {
 /// Iterates chunks first to keep the LUT slice in L1 cache.
 /// The AVX-512 version unrolls 4 chunks at a time.
 void process_chunks(
-        size_t chunk_size,
+        size_t level_width_bytes,
         size_t max_batch_size,
         size_t num_active,
         float* sim_table,
@@ -71,7 +71,7 @@ size_t process_filtering(
 /// Compress the codes: here we don't need to process remainders
 /// as long as `max_batch_size` is a multiple of 64 (which we
 /// assert in the constructor). Conveniently, compressed_codes is
-/// allocated to `max_batch_size` * `chunk_size` elements.
+/// allocated to `max_batch_size` * `level_width_bytes` elements.
 /// `num_active` is guaranteed to always be less than or equal to
 /// `max_batch_size`. Only the last batch may be smaller than
 /// `max_batch_size`, the caller ensures that the batch and
@@ -79,7 +79,7 @@ size_t process_filtering(
 std::pair<uint8_t*, size_t> process_code_compression(
         size_t next_num_active,
         size_t max_batch_size,
-        size_t chunk_size,
+        size_t level_width_bytes,
         uint8_t* compressed_codes_begin,
         uint8_t* bitset,
         const uint8_t* codes);

From ed678957700bfe8378c70315b5172e58018344a9 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 00:22:02 +0000
Subject: [PATCH 26/41] Format

---
 faiss/IndexIVF.h | 1 -
 1 file changed, 1 deletion(-)

diff --git a/faiss/IndexIVF.h b/faiss/IndexIVF.h
index 972a86d92e..d3a4bb891b 100644
--- a/faiss/IndexIVF.h
+++ b/faiss/IndexIVF.h
@@ -22,7 +22,6 @@
 
 namespace faiss {
 
-
 /** Encapsulates a quantizer object for the IndexIVF
  *
  * The class isolates the fields that are independent of the storage

From 26de19a371158697b57b8e070b00d44b183845a7 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 00:39:43 +0000
Subject: [PATCH 27/41] Rename remaining "ci" loop indices to "li" in Panorama kernels

---
 faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp    | 8 ++++----
 faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp  | 8 ++++----
 faiss/impl/panorama_kernels/panorama_kernels-generic.cpp | 8 ++++----
 3 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
index ff5a86d678..a580cc8a29 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
@@ -184,8 +184,8 @@ std::pair<uint8_t*, size_t> process_code_compression(
             // PEXT/PDEP path: process 8 bytes at a time. PDEP
             // expands the per-byte mask bits into a per-byte lane
             // mask, then PEXT extracts only the selected bytes.
-            for (size_t ci = 0; ci < level_width_bytes; ci++) {
-                size_t byte_offset = ci * max_batch_size;
+            for (size_t li = 0; li < level_width_bytes; li++) {
+                size_t byte_offset = li * max_batch_size;
                 const uint8_t* src = codes + byte_offset + point_idx;
                 uint8_t* dst = compressed_codes + byte_offset + num_active;
                 int write_pos = 0;
@@ -204,8 +204,8 @@ std::pair<uint8_t*, size_t> process_code_compression(
 #else
             // Scalar fallback: scan set bits one by one and copy
             // the corresponding code byte.
-            for (size_t ci = 0; ci < level_width_bytes; ci++) {
-                size_t byte_offset = ci * max_batch_size;
+            for (size_t li = 0; li < level_width_bytes; li++) {
+                size_t byte_offset = li * max_batch_size;
                 const uint8_t* src = codes + byte_offset + point_idx;
                 uint8_t* dst = compressed_codes + byte_offset + num_active;
                 int write_pos = 0;
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
index e8cdb93af7..139b0e8867 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
@@ -180,8 +180,8 @@ std::pair<uint8_t*, size_t> process_code_compression(
             // PEXT/PDEP path: process 8 bytes at a time. PDEP
             // expands the per-byte mask bits into a per-byte lane
             // mask, then PEXT extracts only the selected bytes.
-            for (size_t ci = 0; ci < level_width_bytes; ci++) {
-                size_t byte_offset = ci * max_batch_size;
+            for (size_t li = 0; li < level_width_bytes; li++) {
+                size_t byte_offset = li * max_batch_size;
                 const uint8_t* src = codes + byte_offset + point_idx;
                 uint8_t* dst = compressed_codes + byte_offset + num_active;
                 int write_pos = 0;
@@ -200,8 +200,8 @@ std::pair<uint8_t*, size_t> process_code_compression(
 #else
             // Scalar fallback: scan set bits one by one and copy
             // the corresponding code byte.
-            for (size_t ci = 0; ci < level_width_bytes; ci++) {
-                size_t byte_offset = ci * max_batch_size;
+            for (size_t li = 0; li < level_width_bytes; li++) {
+                size_t byte_offset = li * max_batch_size;
                 const uint8_t* src = codes + byte_offset + point_idx;
                 uint8_t* dst = compressed_codes + byte_offset + num_active;
                 int write_pos = 0;
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
index 603e51bc73..2c64fd22db 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
@@ -100,8 +100,8 @@ std::pair<uint8_t*, size_t> process_code_compression(
             // PEXT/PDEP path: process 8 bytes at a time. PDEP
             // expands the per-byte mask bits into a per-byte lane
             // mask, then PEXT extracts only the selected bytes.
-            for (size_t ci = 0; ci < level_width_bytes; ci++) {
-                size_t byte_offset = ci * max_batch_size;
+            for (size_t li = 0; li < level_width_bytes; li++) {
+                size_t byte_offset = li * max_batch_size;
                 const uint8_t* src = codes + byte_offset + point_idx;
                 uint8_t* dst = compressed_codes + byte_offset + num_active;
                 int write_pos = 0;
@@ -120,8 +120,8 @@ std::pair<uint8_t*, size_t> process_code_compression(
 #else
             // Scalar fallback: scan set bits one by one and copy
             // the corresponding code byte.
-            for (size_t ci = 0; ci < level_width_bytes; ci++) {
-                size_t byte_offset = ci * max_batch_size;
+            for (size_t li = 0; li < level_width_bytes; li++) {
+                size_t byte_offset = li * max_batch_size;
                 const uint8_t* src = codes + byte_offset + point_idx;
                 uint8_t* dst = compressed_codes + byte_offset + num_active;
                 int write_pos = 0;

From c8d22ecd4f231f4f38ccafc728ea2ba81e6911ae Mon Sep 17 00:00:00 2001
From: Alexis Schlomer <alexis_schlomer@hotmail.com>
Date: Sun, 22 Mar 2026 00:46:11 +0000
Subject: [PATCH 28/41] Fix recall on bench

---
 benchs/bench_ivfpq_panorama.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py
index 806c56fcc4..fcd52210b9 100644
--- a/benchs/bench_ivfpq_panorama.py
+++ b/benchs/bench_ivfpq_panorama.py
@@ -69,8 +69,9 @@ def eval_recall(index, nprobe_val):
     speed = t * 1000 / nq
     qps = 1000 / speed
 
-    corrects = (gt == I).sum()
-    recall = corrects / (nq * k)
+    recall = np.mean(
+        [len(set(gt[i]) & set(I[i])) / k for i in range(nq)],
+    )
     ratio_dims_scanned = faiss.cvar.indexPanorama_stats.ratio_dims_scanned
     print(
         f"\tnprobe {nprobe_val:3d}, Recall@{k}: "
@@ -281,7 +282,9 @@ def build_ivfpq_panorama(M, n_levels, alpha=ALPHA):
     eval_index(pano, label=f"PCA+Spill+Rot + IVFPQPanorama (M={M})")
     del pano
 
-plt.title(f"IVFPQ Panorama on GIST1M (nlist={nlist})")
+plt.title(
+    f"IVFPQ Panorama on GIST ({SUBSET*100:.0f}% subset, nlist={nlist})",
+)
 plt.xlabel(f"Recall@{k}")
 plt.ylabel("QPS")
 plt.yscale("log")

From e91980c50564dd49a8b45b9b1697ae9570966350 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 01:14:50 +0000
Subject: [PATCH 29/41] Remove batch size magic number

---
 faiss/IndexIVFFlatPanorama.cpp | 2 +-
 faiss/IndexIVFPQPanorama.h     | 3 ++-
 faiss/impl/Panorama.h          | 2 ++
 faiss/impl/index_read.cpp      | 7 +++----
 4 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/faiss/IndexIVFFlatPanorama.cpp b/faiss/IndexIVFFlatPanorama.cpp
index 335be43ca7..65587b22a9 100644
--- a/faiss/IndexIVFFlatPanorama.cpp
+++ b/faiss/IndexIVFFlatPanorama.cpp
@@ -39,7 +39,7 @@ IndexIVFFlatPanorama::IndexIVFFlatPanorama(
     // We construct the inverted lists here so that we can use the
     // level-oriented storage. This does not cause a leak as we constructed
     // IndexIVF first, with own_invlists set to false.
-    auto* pano = new PanoramaFlat(d, n_levels, 128);
+    auto* pano = new PanoramaFlat(d, n_levels, Panorama::kDefaultBatchSize);
     this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano);
     this->own_invlists = own_invlists_in;
 }
diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h
index 74d38677e1..9d6ef0dd3b 100644
--- a/faiss/IndexIVFPQPanorama.h
+++ b/faiss/IndexIVFPQPanorama.h
@@ -11,6 +11,7 @@
 #include <vector>
 
 #include <faiss/IndexIVFPQ.h>
+#include <faiss/impl/Panorama.h>
 
 namespace faiss {
 
@@ -69,7 +70,7 @@ struct IndexIVFPQPanorama : public IndexIVFPQ {
             size_t M,
             size_t nbits_per_idx,
             int n_levels,
-            size_t batch_size = 128,
+            size_t batch_size = Panorama::kDefaultBatchSize,
             MetricType metric = METRIC_L2,
             bool own_invlists = true);
 
diff --git a/faiss/impl/Panorama.h b/faiss/impl/Panorama.h
index fcf3136d44..58e889a1f0 100644
--- a/faiss/impl/Panorama.h
+++ b/faiss/impl/Panorama.h
@@ -46,6 +46,8 @@ namespace faiss {
  * for their respective code formats.
  */
 struct Panorama {
+    static constexpr size_t kDefaultBatchSize = 128;
+
     size_t d = 0;
     size_t code_size = 0;
     size_t n_levels = 0;
diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index d9b9fceb9d..11ba4deeb1 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -431,9 +431,9 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
         FAISS_CHECK_DESERIALIZATION_LOOP_LIMIT(nlist, "ilpn nlist");
         READ1(code_size);
         READ1(n_levels);
-        constexpr size_t kFlatBatchSize = 128;
+        constexpr size_t bs = Panorama::kDefaultBatchSize;
         auto* pano = new PanoramaFlat(
-                code_size / sizeof(float), n_levels, kFlatBatchSize);
+                code_size / sizeof(float), n_levels, bs);
         auto ailp = std::make_unique<ArrayInvertedListsPanorama>(
                 nlist, code_size, pano);
         std::vector<size_t> sizes(nlist);
@@ -442,8 +442,7 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
         for (size_t i = 0; i < nlist; i++) {
             ailp->ids[i].resize(sizes[i]);
             size_t num_elems =
-                    ((sizes[i] + kFlatBatchSize - 1) / kFlatBatchSize) *
-                    kFlatBatchSize;
+                    ((sizes[i] + bs - 1) / bs) * bs;
             ailp->codes[i].resize(num_elems * code_size);
             ailp->cum_sums[i].resize(num_elems * (n_levels + 1));
         }

From 69552f21a6f7fa879a42ca453911edb4c114c542 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 01:18:20 +0000
Subject: [PATCH 30/41] Rename IVFFlatScannerPanorama ctor parameter vd to vd_in

---
 faiss/IndexIVFFlatPanorama.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/faiss/IndexIVFFlatPanorama.cpp b/faiss/IndexIVFFlatPanorama.cpp
index 65587b22a9..d05c73a049 100644
--- a/faiss/IndexIVFFlatPanorama.cpp
+++ b/faiss/IndexIVFFlatPanorama.cpp
@@ -57,12 +57,12 @@ struct IVFFlatScannerPanorama : InvertedListScanner {
     static constexpr MetricType metric = VectorDistance::metric;
 
     IVFFlatScannerPanorama(
-            const VectorDistance& vd,
+            const VectorDistance& vd_in,
             const ArrayInvertedListsPanorama* storage_in,
             bool store_pairs_in,
             const IDSelector* sel_in)
             : InvertedListScanner(store_pairs_in, sel_in),
-              vd(vd),
+              vd(vd_in),
               storage(storage_in),
               pano_flat(
                       dynamic_cast<const PanoramaFlat*>(

From 084381a3ab40a39e99c9512c3674ba6bc0c344fc Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 01:19:40 +0000
Subject: [PATCH 31/41] Clean diffs

---
 faiss/IndexIVFPQ.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/faiss/IndexIVFPQ.cpp b/faiss/IndexIVFPQ.cpp
index 2d0bba4228..3af6fddff7 100644
--- a/faiss/IndexIVFPQ.cpp
+++ b/faiss/IndexIVFPQ.cpp
@@ -9,14 +9,13 @@
 
 #include <faiss/IndexIVFPQ.h>
 
-#include <algorithm>
 #include <cassert>
 #include <cinttypes>
 #include <cmath>
-#include <cstddef>
 #include <cstdint>
 #include <cstdio>
-#include <utility>
+
+#include <algorithm>
 
 #include <faiss/utils/Heap.h>
 #include <faiss/utils/distances_dispatch.h>

From eb9f1b76aba963c05cd16d05455d04e43477ce40 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 01:21:03 +0000
Subject: [PATCH 32/41] Format

---
 faiss/impl/index_read.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index 11ba4deeb1..b7cef27a10 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -432,8 +432,7 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
         READ1(code_size);
         READ1(n_levels);
         constexpr size_t bs = Panorama::kDefaultBatchSize;
-        auto* pano = new PanoramaFlat(
-                code_size / sizeof(float), n_levels, bs);
+        auto* pano = new PanoramaFlat(code_size / sizeof(float), n_levels, bs);
         auto ailp = std::make_unique<ArrayInvertedListsPanorama>(
                 nlist, code_size, pano);
         std::vector<size_t> sizes(nlist);
@@ -441,8 +440,7 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
 
         for (size_t i = 0; i < nlist; i++) {
             ailp->ids[i].resize(sizes[i]);
-            size_t num_elems =
-                    ((sizes[i] + bs - 1) / bs) * bs;
+            size_t num_elems = ((sizes[i] + bs - 1) / bs) * bs;
             ailp->codes[i].resize(num_elems * code_size);
             ailp->cum_sums[i].resize(num_elems * (n_levels + 1));
         }

From 8a375c2ac33e65e40fc82c2900ef09b0a6cccd4e Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 01:38:28 +0000
Subject: [PATCH 33/41] Rename process_chunks kernel to process_level

---
 faiss/impl/PanoramaPQ.h                                  | 2 +-
 faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp    | 4 ++--
 faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp  | 2 +-
 faiss/impl/panorama_kernels/panorama_kernels-generic.cpp | 2 +-
 faiss/impl/panorama_kernels/panorama_kernels.h           | 4 ++--
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h
index a3901db795..b94623cb5e 100644
--- a/faiss/impl/PanoramaPQ.h
+++ b/faiss/impl/PanoramaPQ.h
@@ -180,7 +180,7 @@ struct PanoramaPQ : Panorama {
                         bitset.data(),
                         codes_level);
 
-                panorama_kernels::process_chunks(
+                panorama_kernels::process_level(
                         ls,
                         bs,
                         na,
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
index a580cc8a29..a10a3f3c72 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
@@ -6,7 +6,7 @@
  */
 
 // AVX2 implementations of Panorama kernels.
-// Uses 256-bit gather for process_chunks, scalar filtering (no
+// Uses 256-bit gather for process_level, scalar filtering (no
 // compress instruction in AVX2), and BMI2 PEXT/PDEP for code
 // compression where available.
 
@@ -22,7 +22,7 @@
 namespace faiss {
 namespace panorama_kernels {
 
-void process_chunks(
+void process_level(
         size_t level_width_bytes,
         size_t max_batch_size,
         size_t num_active,
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
index 139b0e8867..8181327c14 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
@@ -16,7 +16,7 @@
 namespace faiss {
 namespace panorama_kernels {
 
-void process_chunks(
+void process_level(
         size_t level_width_bytes,
         size_t max_batch_size,
         size_t num_active,
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
index 2c64fd22db..664485f5f9 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
@@ -21,7 +21,7 @@
 namespace faiss {
 namespace panorama_kernels {
 
-void process_chunks(
+void process_level(
         size_t level_width_bytes,
         size_t max_batch_size,
         size_t num_active,
diff --git a/faiss/impl/panorama_kernels/panorama_kernels.h b/faiss/impl/panorama_kernels/panorama_kernels.h
index 1ff74086e6..0bbcad0eef 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels.h
+++ b/faiss/impl/panorama_kernels/panorama_kernels.h
@@ -12,7 +12,7 @@
  * @brief Panorama search kernels with scalar and AVX-512 implementations.
  *
  * The three core kernels of the Panorama progressive filtering search:
- * - process_chunks: accumulate PQ distance table lookups over chunks
+ * - process_level: accumulate PQ distance table lookups over chunks
  * - process_filtering: Cauchy-Schwarz lower bound pruning with stream
  *   compaction
  * - process_code_compression: byte-level stream compaction of PQ codes
@@ -34,7 +34,7 @@ namespace panorama_kernels {
 /// accumulates into `exact_distances[i]` for all active elements.
 /// Iterates chunks first to keep the LUT slice in L1 cache.
 /// The AVX-512 version unrolls 4 chunks at a time.
-void process_chunks(
+void process_level(
         size_t level_width_bytes,
         size_t max_batch_size,
         size_t num_active,

From a624893d93567ca2e921a633c14e7a59706e4f1b Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 01:44:59 +0000
Subject: [PATCH 34/41] Mark L2-only metric constraint as "for now" in IndexIVFPQPanorama docs

---
 faiss/IndexIVFPQPanorama.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h
index 9d6ef0dd3b..8bcc97b624 100644
--- a/faiss/IndexIVFPQPanorama.h
+++ b/faiss/IndexIVFPQPanorama.h
@@ -45,7 +45,7 @@ namespace faiss {
 /// search using the precomputed_table (no extra per-point storage).
 ///
 /// CONSTRAINTS:
-/// - Only L2 metric is supported.
+/// - Only L2 metric is supported (for now).
 /// - Only 8-bit PQ codes (nbits_per_idx == 8).
 /// - M must be divisible by n_levels.
 /// - batch_size must be a multiple of 64.

From d7070225fda006beb638e7fe035a59c1f3145053 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 02:06:06 +0000
Subject: [PATCH 35/41] Reorder unrolled AVX512 instructions

---
 .../panorama_kernels-avx512.cpp               | 23 +++++++++----------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
index 8181327c14..6d22358153 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
@@ -44,28 +44,27 @@ void process_level(
 
             __m128i comp0 = _mm_loadu_si128(
                     (__m128i*)(compressed_codes + byte_offset0 + batch_idx));
+            __m128i comp1 = _mm_loadu_si128(
+                    (__m128i*)(compressed_codes + byte_offset1 + batch_idx));
+            __m128i comp2 = _mm_loadu_si128(
+                    (__m128i*)(compressed_codes + byte_offset2 + batch_idx));
+            __m128i comp3 = _mm_loadu_si128(
+                    (__m128i*)(compressed_codes + byte_offset3 + batch_idx));
+
             __m512i codes0 = _mm512_cvtepu8_epi32(comp0);
+            __m512i codes1 = _mm512_cvtepu8_epi32(comp1);
+            __m512i codes2 = _mm512_cvtepu8_epi32(comp2);
+            __m512i codes3 = _mm512_cvtepu8_epi32(comp3);
+
             acc = _mm512_add_ps(
                     acc,
                     _mm512_i32gather_ps(codes0, sim_table0, sizeof(float)));
-
-            __m128i comp1 = _mm_loadu_si128(
-                    (__m128i*)(compressed_codes + byte_offset1 + batch_idx));
-            __m512i codes1 = _mm512_cvtepu8_epi32(comp1);
             acc = _mm512_add_ps(
                     acc,
                     _mm512_i32gather_ps(codes1, sim_table1, sizeof(float)));
-
-            __m128i comp2 = _mm_loadu_si128(
-                    (__m128i*)(compressed_codes + byte_offset2 + batch_idx));
-            __m512i codes2 = _mm512_cvtepu8_epi32(comp2);
             acc = _mm512_add_ps(
                     acc,
                     _mm512_i32gather_ps(codes2, sim_table2, sizeof(float)));
-
-            __m128i comp3 = _mm_loadu_si128(
-                    (__m128i*)(compressed_codes + byte_offset3 + batch_idx));
-            __m512i codes3 = _mm512_cvtepu8_epi32(comp3);
             acc = _mm512_add_ps(
                     acc,
                     _mm512_i32gather_ps(codes3, sim_table3, sizeof(float)));

From cc59df9da52902210486244b232558d5a249b51a Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 02:22:50 +0000
Subject: [PATCH 36/41] Remove redundant unrolling

---
 .../panorama_kernels-avx2.cpp                 | 38 +------------------
 .../panorama_kernels-avx512.cpp               |  4 +-
 2 files changed, 3 insertions(+), 39 deletions(-)

diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
index a10a3f3c72..2089ded936 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
@@ -44,43 +44,7 @@ void process_level(
         float* sim_table2 = sim_table + (byte_idx + 2) * 256;
         float* sim_table3 = sim_table + (byte_idx + 3) * 256;
 
-        size_t batch_idx = 0;
-        for (; batch_idx + 7 < num_active; batch_idx += 8) {
-            __m256 acc = _mm256_loadu_ps(exact_distances + batch_idx);
-
-            // Load 8 byte codes, zero-extend to 32-bit indices.
-            __m128i raw0 = _mm_loadl_epi64(
-                    (__m128i*)(compressed_codes + byte_offset0 + batch_idx));
-            __m256i codes0 = _mm256_cvtepu8_epi32(raw0);
-            acc = _mm256_add_ps(
-                    acc,
-                    _mm256_i32gather_ps(sim_table0, codes0, sizeof(float)));
-
-            __m128i raw1 = _mm_loadl_epi64(
-                    (__m128i*)(compressed_codes + byte_offset1 + batch_idx));
-            __m256i codes1 = _mm256_cvtepu8_epi32(raw1);
-            acc = _mm256_add_ps(
-                    acc,
-                    _mm256_i32gather_ps(sim_table1, codes1, sizeof(float)));
-
-            __m128i raw2 = _mm_loadl_epi64(
-                    (__m128i*)(compressed_codes + byte_offset2 + batch_idx));
-            __m256i codes2 = _mm256_cvtepu8_epi32(raw2);
-            acc = _mm256_add_ps(
-                    acc,
-                    _mm256_i32gather_ps(sim_table2, codes2, sizeof(float)));
-
-            __m128i raw3 = _mm_loadl_epi64(
-                    (__m128i*)(compressed_codes + byte_offset3 + batch_idx));
-            __m256i codes3 = _mm256_cvtepu8_epi32(raw3);
-            acc = _mm256_add_ps(
-                    acc,
-                    _mm256_i32gather_ps(sim_table3, codes3, sizeof(float)));
-
-            _mm256_storeu_ps(exact_distances + batch_idx, acc);
-        }
-
-        for (; batch_idx < num_active; batch_idx += 1) {
+        for (size_t batch_idx = 0; batch_idx < num_active; batch_idx++) {
             float acc = exact_distances[batch_idx];
             acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]];
             acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]];
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
index 6d22358153..e8976378f3 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
@@ -72,7 +72,7 @@ void process_level(
             _mm512_storeu_ps(exact_distances + batch_idx, acc);
         }
 
-        for (; batch_idx < num_active; batch_idx += 1) {
+        for (; batch_idx < num_active; batch_idx++) {
             float acc = exact_distances[batch_idx];
             acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]];
             acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]];
@@ -98,7 +98,7 @@ void process_level(
             _mm512_storeu_ps(exact_distances + batch_idx, acc);
         }
 
-        for (; batch_idx < num_active; batch_idx += 1) {
+        for (; batch_idx < num_active; batch_idx++) {
             exact_distances[batch_idx] +=
                     sim_table_ptr[compressed_codes[byte_offset + batch_idx]];
         }

From 447981030ab89d32869afa9ace423d759977dd74 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 02:34:10 +0000
Subject: [PATCH 37/41] Revert "Remove redundant unrolling"

This reverts commit cc59df9da52902210486244b232558d5a249b51a.
---
 .../panorama_kernels-avx2.cpp                 | 38 ++++++++++++++++++-
 .../panorama_kernels-avx512.cpp               |  4 +-
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
index 2089ded936..a10a3f3c72 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
@@ -44,7 +44,43 @@ void process_level(
         float* sim_table2 = sim_table + (byte_idx + 2) * 256;
         float* sim_table3 = sim_table + (byte_idx + 3) * 256;
 
-        for (size_t batch_idx = 0; batch_idx < num_active; batch_idx++) {
+        size_t batch_idx = 0;
+        for (; batch_idx + 7 < num_active; batch_idx += 8) {
+            __m256 acc = _mm256_loadu_ps(exact_distances + batch_idx);
+
+            // Load 8 byte codes, zero-extend to 32-bit indices.
+            __m128i raw0 = _mm_loadl_epi64(
+                    (__m128i*)(compressed_codes + byte_offset0 + batch_idx));
+            __m256i codes0 = _mm256_cvtepu8_epi32(raw0);
+            acc = _mm256_add_ps(
+                    acc,
+                    _mm256_i32gather_ps(sim_table0, codes0, sizeof(float)));
+
+            __m128i raw1 = _mm_loadl_epi64(
+                    (__m128i*)(compressed_codes + byte_offset1 + batch_idx));
+            __m256i codes1 = _mm256_cvtepu8_epi32(raw1);
+            acc = _mm256_add_ps(
+                    acc,
+                    _mm256_i32gather_ps(sim_table1, codes1, sizeof(float)));
+
+            __m128i raw2 = _mm_loadl_epi64(
+                    (__m128i*)(compressed_codes + byte_offset2 + batch_idx));
+            __m256i codes2 = _mm256_cvtepu8_epi32(raw2);
+            acc = _mm256_add_ps(
+                    acc,
+                    _mm256_i32gather_ps(sim_table2, codes2, sizeof(float)));
+
+            __m128i raw3 = _mm_loadl_epi64(
+                    (__m128i*)(compressed_codes + byte_offset3 + batch_idx));
+            __m256i codes3 = _mm256_cvtepu8_epi32(raw3);
+            acc = _mm256_add_ps(
+                    acc,
+                    _mm256_i32gather_ps(sim_table3, codes3, sizeof(float)));
+
+            _mm256_storeu_ps(exact_distances + batch_idx, acc);
+        }
+
+        for (; batch_idx < num_active; batch_idx += 1) {
             float acc = exact_distances[batch_idx];
             acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]];
             acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]];
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
index e8976378f3..6d22358153 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
@@ -72,7 +72,7 @@ void process_level(
             _mm512_storeu_ps(exact_distances + batch_idx, acc);
         }
 
-        for (; batch_idx < num_active; batch_idx++) {
+        for (; batch_idx < num_active; batch_idx += 1) {
             float acc = exact_distances[batch_idx];
             acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]];
             acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]];
@@ -98,7 +98,7 @@ void process_level(
             _mm512_storeu_ps(exact_distances + batch_idx, acc);
         }
 
-        for (; batch_idx < num_active; batch_idx++) {
+        for (; batch_idx < num_active; batch_idx += 1) {
             exact_distances[batch_idx] +=
                     sim_table_ptr[compressed_codes[byte_offset + batch_idx]];
         }

From 7db3f1ec8fe56dc25e2d623daca61b99af15afaa Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 02:35:15 +0000
Subject: [PATCH 38/41] Clean

---
 faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp   | 4 ++--
 faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
index a10a3f3c72..a45446d545 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
@@ -80,7 +80,7 @@ void process_level(
             _mm256_storeu_ps(exact_distances + batch_idx, acc);
         }
 
-        for (; batch_idx < num_active; batch_idx += 1) {
+        for (; batch_idx < num_active; batch_idx++) {
             float acc = exact_distances[batch_idx];
             acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]];
             acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]];
@@ -106,7 +106,7 @@ void process_level(
             _mm256_storeu_ps(exact_distances + batch_idx, acc);
         }
 
-        for (; batch_idx < num_active; batch_idx += 1) {
+        for (; batch_idx < num_active; batch_idx++) {
             exact_distances[batch_idx] +=
                     sim_table_ptr[compressed_codes[byte_offset + batch_idx]];
         }
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
index 6d22358153..e8976378f3 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
@@ -72,7 +72,7 @@ void process_level(
             _mm512_storeu_ps(exact_distances + batch_idx, acc);
         }
 
-        for (; batch_idx < num_active; batch_idx += 1) {
+        for (; batch_idx < num_active; batch_idx++) {
             float acc = exact_distances[batch_idx];
             acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]];
             acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]];
@@ -98,7 +98,7 @@ void process_level(
             _mm512_storeu_ps(exact_distances + batch_idx, acc);
         }
 
-        for (; batch_idx < num_active; batch_idx += 1) {
+        for (; batch_idx < num_active; batch_idx++) {
             exact_distances[batch_idx] +=
                     sim_table_ptr[compressed_codes[byte_offset + batch_idx]];
         }

From 9b8ac6ecfadb2e5c76db5ba4258be748e58d171f Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 17:00:51 +0000
Subject: [PATCH 39/41] Fix build

---
 faiss/impl/panorama_kernels/panorama_kernels-generic.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
index 664485f5f9..3a1f592f49 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
@@ -11,6 +11,7 @@
 #if !defined(COMPILE_SIMD_AVX2) && !defined(COMPILE_SIMD_AVX512)
 
 #include <faiss/impl/panorama_kernels/panorama_kernels.h>
+#include <faiss/impl/platform_macros.h>
 
 #include <cstring>
 

From 53835d58e5fb5cd50bac66be622fa27ce92316c2 Mon Sep 17 00:00:00 2001
From: Akash Nayar <akashknayar5@gmail.com>
Date: Sun, 22 Mar 2026 18:23:15 +0000
Subject: [PATCH 40/41] SIMD refactor and add NEON / SVE stubs

---
 faiss/CMakeLists.txt                          |   3 +
 .../panorama_kernels-avx2.cpp                 |  60 +++-------
 .../panorama_kernels-avx512.cpp               |  77 ++++++++-----
 .../panorama_kernels-generic.cpp              | 108 ++++++++++++------
 .../panorama_kernels/panorama_kernels-inl.h   |  23 ++++
 .../panorama_kernels-neon.cpp                 |  58 ++++++++++
 .../panorama_kernels/panorama_kernels-sve.cpp |  58 ++++++++++
 .../impl/panorama_kernels/panorama_kernels.h  |  38 ++++--
 8 files changed, 308 insertions(+), 117 deletions(-)
 create mode 100644 faiss/impl/panorama_kernels/panorama_kernels-inl.h
 create mode 100644 faiss/impl/panorama_kernels/panorama_kernels-neon.cpp
 create mode 100644 faiss/impl/panorama_kernels/panorama_kernels-sve.cpp

diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
index 5e752d9d3b..354a2ef883 100644
--- a/faiss/CMakeLists.txt
+++ b/faiss/CMakeLists.txt
@@ -25,11 +25,13 @@ set(FAISS_SIMD_AVX512_SRC
 )
 set(FAISS_SIMD_NEON_SRC
   impl/fast_scan/impl-neon.cpp
+  impl/panorama_kernels/panorama_kernels-neon.cpp
   impl/scalar_quantizer/sq-neon.cpp
   impl/approx_topk/neon.cpp
   utils/simd_impl/distances_aarch64.cpp
 )
 set(FAISS_SIMD_SVE_SRC
+  impl/panorama_kernels/panorama_kernels-sve.cpp
   impl/pq_code_distance/pq_code_distance-sve.cpp
   utils/simd_impl/distances_arm_sve.cpp
 )
@@ -286,6 +288,7 @@ set(FAISS_HEADERS
   impl/zerocopy_io.h
   utils/pq_code_distance.h
   impl/panorama_kernels/panorama_kernels.h
+  impl/panorama_kernels/panorama_kernels-inl.h
   impl/pq_code_distance/pq_code_distance-inl.h
   invlists/BlockInvertedLists.h
   invlists/DirectMap.h
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
index a45446d545..070475b067 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp
@@ -11,18 +11,19 @@
 // compression where available.
 
 #ifdef COMPILE_SIMD_AVX2
-#ifndef COMPILE_SIMD_AVX512
 
 #include <immintrin.h>
 
-#include <faiss/impl/panorama_kernels/panorama_kernels.h>
+#include <faiss/impl/panorama_kernels/panorama_kernels-inl.h>
 
 #include <cstring>
 
 namespace faiss {
 namespace panorama_kernels {
 
-void process_level(
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+void process_level_impl<SIMDLevel::AVX2>(
         size_t level_width_bytes,
         size_t max_batch_size,
         size_t num_active,
@@ -51,28 +52,27 @@ void process_level(
             // Load 8 byte codes, zero-extend to 32-bit indices.
             __m128i raw0 = _mm_loadl_epi64(
                     (__m128i*)(compressed_codes + byte_offset0 + batch_idx));
+            __m128i raw1 = _mm_loadl_epi64(
+                    (__m128i*)(compressed_codes + byte_offset1 + batch_idx));
+            __m128i raw2 = _mm_loadl_epi64(
+                    (__m128i*)(compressed_codes + byte_offset2 + batch_idx));
+            __m128i raw3 = _mm_loadl_epi64(
+                    (__m128i*)(compressed_codes + byte_offset3 + batch_idx));
+
             __m256i codes0 = _mm256_cvtepu8_epi32(raw0);
+            __m256i codes1 = _mm256_cvtepu8_epi32(raw1);
+            __m256i codes2 = _mm256_cvtepu8_epi32(raw2);
+            __m256i codes3 = _mm256_cvtepu8_epi32(raw3);
+
             acc = _mm256_add_ps(
                     acc,
                     _mm256_i32gather_ps(sim_table0, codes0, sizeof(float)));
-
-            __m128i raw1 = _mm_loadl_epi64(
-                    (__m128i*)(compressed_codes + byte_offset1 + batch_idx));
-            __m256i codes1 = _mm256_cvtepu8_epi32(raw1);
             acc = _mm256_add_ps(
                     acc,
                     _mm256_i32gather_ps(sim_table1, codes1, sizeof(float)));
-
-            __m128i raw2 = _mm_loadl_epi64(
-                    (__m128i*)(compressed_codes + byte_offset2 + batch_idx));
-            __m256i codes2 = _mm256_cvtepu8_epi32(raw2);
             acc = _mm256_add_ps(
                     acc,
                     _mm256_i32gather_ps(sim_table2, codes2, sizeof(float)));
-
-            __m128i raw3 = _mm_loadl_epi64(
-                    (__m128i*)(compressed_codes + byte_offset3 + batch_idx));
-            __m256i codes3 = _mm256_cvtepu8_epi32(raw3);
             acc = _mm256_add_ps(
                     acc,
                     _mm256_i32gather_ps(sim_table3, codes3, sizeof(float)));
@@ -113,32 +113,9 @@ void process_level(
     }
 }
 
-size_t process_filtering(
-        size_t num_active,
-        float* exact_distances,
-        uint32_t* active_indices,
-        float* cum_sums,
-        uint8_t* bitset,
-        size_t batch_offset,
-        float dis0,
-        float query_cum_norm,
-        float heap_max) {
-    size_t next_num_active = 0;
-    for (size_t i = 0; i < num_active; i++) {
-        float exact_distance = exact_distances[i];
-        float cum_sum = cum_sums[active_indices[i] - batch_offset];
-        float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm;
-
-        bool keep = heap_max > lower_bound;
-        active_indices[next_num_active] = active_indices[i];
-        exact_distances[next_num_active] = exact_distance;
-        bitset[active_indices[i] - batch_offset] = keep;
-        next_num_active += keep;
-    }
-    return next_num_active;
-}
-
-std::pair<uint8_t*, size_t> process_code_compression(
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+std::pair<uint8_t*, size_t> process_code_compression_impl<SIMDLevel::AVX2>(
         size_t next_num_active,
         size_t max_batch_size,
         size_t level_width_bytes,
@@ -231,5 +208,4 @@ std::pair<uint8_t*, size_t> process_code_compression(
 } // namespace panorama_kernels
 } // namespace faiss
 
-#endif // COMPILE_SIMD_AVX512
 #endif // COMPILE_SIMD_AVX2
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
index e8976378f3..811fb34579 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp
@@ -9,14 +9,16 @@
 
 #include <immintrin.h>
 
-#include <faiss/impl/panorama_kernels/panorama_kernels.h>
+#include <faiss/impl/panorama_kernels/panorama_kernels-inl.h>
 
 #include <cstring>
 
 namespace faiss {
 namespace panorama_kernels {
 
-void process_level(
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+void process_level_impl<SIMDLevel::AVX512>(
         size_t level_width_bytes,
         size_t max_batch_size,
         size_t num_active,
@@ -105,32 +107,9 @@ void process_level(
     }
 }
 
-size_t process_filtering(
-        size_t num_active,
-        float* exact_distances,
-        uint32_t* active_indices,
-        float* cum_sums,
-        uint8_t* bitset,
-        size_t batch_offset,
-        float dis0,
-        float query_cum_norm,
-        float heap_max) {
-    size_t next_num_active = 0;
-    for (size_t i = 0; i < num_active; i++) {
-        float exact_distance = exact_distances[i];
-        float cum_sum = cum_sums[active_indices[i] - batch_offset];
-        float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm;
-
-        bool keep = heap_max > lower_bound;
-        active_indices[next_num_active] = active_indices[i];
-        exact_distances[next_num_active] = exact_distance;
-        bitset[active_indices[i] - batch_offset] = keep;
-        next_num_active += keep;
-    }
-    return next_num_active;
-}
-
-std::pair<uint8_t*, size_t> process_code_compression(
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+std::pair<uint8_t*, size_t> process_code_compression_impl<SIMDLevel::AVX512>(
         size_t next_num_active,
         size_t max_batch_size,
         size_t level_width_bytes,
@@ -223,6 +202,48 @@ std::pair<uint8_t*, size_t> process_code_compression(
     return std::make_pair(compressed_codes, num_active);
 }
 
+#ifdef COMPILE_SIMD_AVX512_SPR
+// AVX512_SPR: Sapphire Rapids is a superset of AVX512. Reuse the
+// AVX512 implementation until a dedicated SPR specialization is written.
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+void process_level_impl<SIMDLevel::AVX512_SPR>(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances) {
+    process_level_impl<SIMDLevel::AVX512>(
+            level_width_bytes,
+            max_batch_size,
+            num_active,
+            sim_table,
+            compressed_codes,
+            exact_distances);
+}
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+std::pair<uint8_t*, size_t> process_code_compression_impl<
+        SIMDLevel::AVX512_SPR>(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    return process_code_compression_impl<SIMDLevel::AVX512>(
+            next_num_active,
+            max_batch_size,
+            level_width_bytes,
+            compressed_codes_begin,
+            bitset,
+            codes);
+}
+#endif // COMPILE_SIMD_AVX512_SPR
+
 } // namespace panorama_kernels
 } // namespace faiss
 
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
index 3a1f592f49..9601cec46d 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
+++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
@@ -5,13 +5,15 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-// Scalar implementations of Panorama kernels.
-// Compiled only when no SIMD variant (AVX2/AVX-512) is available.
+// This TU provides:
+// 1. The scalar SIMDLevel::NONE specializations of the _impl templates.
+// 2. The non-templated kernels declared in panorama_kernels.h.
+//    process_level and process_code_compression use DISPATCH_SIMDLevel to
+//    forward to an _impl specialization (the NONE one in this file or one
+//    from a per-SIMD .cpp file). process_filtering is not dispatched: it
+//    has a single scalar implementation, defined here.
 
-#if !defined(COMPILE_SIMD_AVX2) && !defined(COMPILE_SIMD_AVX512)
-
-#include <faiss/impl/panorama_kernels/panorama_kernels.h>
-#include <faiss/impl/platform_macros.h>
+#include <faiss/impl/panorama_kernels/panorama_kernels-inl.h>
 
 #include <cstring>
 
@@ -22,7 +24,9 @@
 namespace faiss {
 namespace panorama_kernels {
 
-void process_level(
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+void process_level_impl<SIMDLevel::NONE>(
         size_t level_width_bytes,
         size_t max_batch_size,
         size_t num_active,
@@ -38,32 +42,9 @@ void process_level(
     }
 }
 
-size_t process_filtering(
-        size_t num_active,
-        float* exact_distances,
-        uint32_t* active_indices,
-        float* cum_sums,
-        uint8_t* bitset,
-        size_t batch_offset,
-        float dis0,
-        float query_cum_norm,
-        float heap_max) {
-    size_t next_num_active = 0;
-    for (size_t i = 0; i < num_active; i++) {
-        float exact_distance = exact_distances[i];
-        float cum_sum = cum_sums[active_indices[i] - batch_offset];
-        float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm;
-
-        bool keep = heap_max > lower_bound;
-        active_indices[next_num_active] = active_indices[i];
-        exact_distances[next_num_active] = exact_distance;
-        bitset[active_indices[i] - batch_offset] = keep;
-        next_num_active += keep;
-    }
-    return next_num_active;
-}
-
-std::pair<uint8_t*, size_t> process_code_compression(
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+std::pair<uint8_t*, size_t> process_code_compression_impl<SIMDLevel::NONE>(
         size_t next_num_active,
         size_t max_batch_size,
         size_t level_width_bytes,
@@ -145,7 +126,64 @@ std::pair<uint8_t*, size_t> process_code_compression(
     return std::make_pair(compressed_codes, num_active);
 }
 
+void process_level(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances) {
+    DISPATCH_SIMDLevel(
+            process_level_impl,
+            level_width_bytes,
+            max_batch_size,
+            num_active,
+            sim_table,
+            compressed_codes,
+            exact_distances);
+}
+
+size_t process_filtering(
+        size_t num_active,
+        float* exact_distances,
+        uint32_t* active_indices,
+        float* cum_sums,
+        uint8_t* bitset,
+        size_t batch_offset,
+        float dis0,
+        float query_cum_norm,
+        float heap_max) {
+    size_t next_num_active = 0; // next free slot == survivors so far
+    for (size_t i = 0; i < num_active; i++) {
+        float exact_distance = exact_distances[i];
+        float cum_sum = cum_sums[active_indices[i] - batch_offset];
+        float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm;
+
+        bool keep = heap_max > lower_bound; // strict: ties are pruned
+        active_indices[next_num_active] = active_indices[i];
+        exact_distances[next_num_active] = exact_distance;
+        bitset[active_indices[i] - batch_offset] = keep; // 0 marks pruned
+        next_num_active += keep; // slot is overwritten unless kept
+    }
+    return next_num_active;
+}
+
+std::pair<uint8_t*, size_t> process_code_compression(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    DISPATCH_SIMDLevel(
+            process_code_compression_impl,
+            next_num_active,
+            max_batch_size,
+            level_width_bytes,
+            compressed_codes_begin,
+            bitset,
+            codes); // NOTE(review): no explicit return; macro must supply it
+}
+
 } // namespace panorama_kernels
 } // namespace faiss
-
-#endif // !COMPILE_SIMD_AVX2 && !COMPILE_SIMD_AVX512
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-inl.h b/faiss/impl/panorama_kernels/panorama_kernels-inl.h
new file mode 100644
index 0000000000..b1ae4316db
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels-inl.h
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+/**
+ * @file panorama_kernels-inl.h
+ * @brief Private header for Panorama kernel SIMD implementations.
+ *
+ * This is a PRIVATE header — do not include in public APIs or user code.
+ * Only faiss internal .cpp files (the per-SIMD implementation files and
+ * panorama_kernels-generic.cpp) should include this header.
+ *
+ * This header re-exports the public API (panorama_kernels.h) plus the
+ * simd_dispatch.h machinery needed by the implementation files.
+ */
+
+#include <faiss/impl/panorama_kernels/panorama_kernels.h>
+#include <faiss/impl/simd_dispatch.h>
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-neon.cpp b/faiss/impl/panorama_kernels/panorama_kernels-neon.cpp
new file mode 100644
index 0000000000..88eba0b574
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels-neon.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// ARM NEON specializations of the Panorama kernel templates (stubs).
+// TODO(@AlSchlo, @aknayar): implement NEON-optimized panorama kernels.
+// For now each specialization forwards to the SIMDLevel::NONE version.
+
+#ifdef COMPILE_SIMD_ARM_NEON
+
+#include <faiss/impl/panorama_kernels/panorama_kernels-inl.h>
+
+namespace faiss {
+namespace panorama_kernels {
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+void process_level_impl<SIMDLevel::ARM_NEON>(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances) {
+    process_level_impl<SIMDLevel::NONE>(
+            level_width_bytes,
+            max_batch_size,
+            num_active,
+            sim_table,
+            compressed_codes,
+            exact_distances);
+}
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+std::pair<uint8_t*, size_t> process_code_compression_impl<SIMDLevel::ARM_NEON>(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    return process_code_compression_impl<SIMDLevel::NONE>(
+            next_num_active,
+            max_batch_size,
+            level_width_bytes,
+            compressed_codes_begin,
+            bitset,
+            codes);
+}
+
+} // namespace panorama_kernels
+} // namespace faiss
+
+#endif // COMPILE_SIMD_ARM_NEON
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-sve.cpp b/faiss/impl/panorama_kernels/panorama_kernels-sve.cpp
new file mode 100644
index 0000000000..af89f6aac9
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels-sve.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// ARM SVE specializations of the Panorama kernel templates (stubs).
+// TODO(@AlSchlo, @aknayar): implement SVE-optimized panorama kernels.
+// For now each specialization forwards to the SIMDLevel::NONE version.
+
+#ifdef COMPILE_SIMD_ARM_SVE
+
+#include <faiss/impl/panorama_kernels/panorama_kernels-inl.h>
+
+namespace faiss {
+namespace panorama_kernels {
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+void process_level_impl<SIMDLevel::ARM_SVE>(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances) {
+    process_level_impl<SIMDLevel::NONE>(
+            level_width_bytes,
+            max_batch_size,
+            num_active,
+            sim_table,
+            compressed_codes,
+            exact_distances);
+}
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+std::pair<uint8_t*, size_t> process_code_compression_impl<SIMDLevel::ARM_SVE>(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    return process_code_compression_impl<SIMDLevel::NONE>(
+            next_num_active,
+            max_batch_size,
+            level_width_bytes,
+            compressed_codes_begin,
+            bitset,
+            codes);
+}
+
+} // namespace panorama_kernels
+} // namespace faiss
+
+#endif // COMPILE_SIMD_ARM_SVE
diff --git a/faiss/impl/panorama_kernels/panorama_kernels.h b/faiss/impl/panorama_kernels/panorama_kernels.h
index 0bbcad0eef..3a67415523 100644
--- a/faiss/impl/panorama_kernels/panorama_kernels.h
+++ b/faiss/impl/panorama_kernels/panorama_kernels.h
@@ -9,32 +9,50 @@
 
 /**
  * @file panorama_kernels.h
- * @brief Panorama search kernels with scalar and AVX-512 implementations.
+ * @brief Panorama search kernels with SIMD-dispatched implementations.
  *
  * The three core kernels of the Panorama progressive filtering search:
  * - process_level: accumulate PQ distance table lookups over chunks
  * - process_filtering: Cauchy-Schwarz lower bound pruning with stream
  *   compaction
  * - process_code_compression: byte-level stream compaction of PQ codes
- *
- * Implementations live in panorama_kernels-generic.cpp (scalar) and
- * panorama_kernels-avx512.cpp (AVX-512 gather/compress + BMI2 PEXT/PDEP).
  */
 
 #include <cstddef>
 #include <cstdint>
 #include <utility>
 
+#include <faiss/impl/platform_macros.h>
+#include <faiss/utils/simd_levels.h>
+
 namespace faiss {
 namespace panorama_kernels {
 
+template <SIMDLevel SL> // declared only; specialized in per-SIMD .cpp files
+void process_level_impl(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances);
+
+template <SIMDLevel SL> // declared only; specialized in per-SIMD .cpp files
+std::pair<uint8_t*, size_t> process_code_compression_impl(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes);
+
 /// Accumulate PQ distance table lookups over chunks.
 ///
 /// For each chunk, looks up `sim_table[compressed_codes[i]]` and
 /// accumulates into `exact_distances[i]` for all active elements.
 /// Iterates chunks first to keep the LUT slice in L1 cache.
-/// The AVX-512 version unrolls 4 chunks at a time.
-void process_level(
+/// The AVX2/AVX-512 versions unroll 4 chunks at a time.
+FAISS_API void process_level(
         size_t level_width_bytes,
         size_t max_batch_size,
         size_t num_active,
@@ -48,11 +66,7 @@ void process_level(
 /// and removes elements that cannot improve the current heap top.
 /// Uses stream compaction to pack surviving elements contiguously.
 /// Updates the bitset to reflect which elements were removed.
-///
-/// Unfortunately, AVX-512 does not support a way to scatter at a
-/// 1-byte granularity, so the bitset update for removed items is
-/// done sequentially after compressing the indices.
-size_t process_filtering(
+FAISS_API size_t process_filtering(
         size_t num_active,
         float* exact_distances,
         uint32_t* active_indices,
@@ -76,7 +90,7 @@ size_t process_filtering(
 /// `max_batch_size`. Only the last batch may be smaller than
 /// `max_batch_size`, the caller ensures that the batch and
 /// bitset are padded with zeros.
-std::pair<uint8_t*, size_t> process_code_compression(
+FAISS_API std::pair<uint8_t*, size_t> process_code_compression(
         size_t next_num_active,
         size_t max_batch_size,
         size_t level_width_bytes,

From 9c4731cb245dcf69411502e9f44b8688957a4392 Mon Sep 17 00:00:00 2001
From: zoeyeye <zoeyeye@meta.com>
Date: Wed, 25 Mar 2026 17:07:55 -0700
Subject: [PATCH 41/41] Update index_read.cpp

solve lint error.
---
 faiss/impl/index_read.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
index c4b645a48e..19b9f4b3f8 100644
--- a/faiss/impl/index_read.cpp
+++ b/faiss/impl/index_read.cpp
@@ -483,8 +483,8 @@ std::unique_ptr<InvertedLists> read_InvertedLists_up(
         READ1(code_size);
         READ1(n_levels);
         constexpr size_t bs = Panorama::kDefaultBatchSize;
-      FAISS_THROW_IF_NOT_FMT(
-                n_levels > 0, "invalid ilpn n_levels %zd", n_levels);
+        FAISS_THROW_IF_NOT_FMT(
+                n_levels > 0, "invalid ilpn n_levels %zd", n_levels);
         auto* pano = new PanoramaFlat(code_size / sizeof(float), n_levels, bs);
         auto ailp = std::make_unique<ArrayInvertedListsPanorama>(
                 nlist, code_size, pano);