diff --git a/benchs/bench_ivfpq_panorama.py b/benchs/bench_ivfpq_panorama.py new file mode 100644 index 0000000000..fcd52210b9 --- /dev/null +++ b/benchs/bench_ivfpq_panorama.py @@ -0,0 +1,293 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +import multiprocessing as mp +import time + +import faiss +import matplotlib.pyplot as plt +import numpy as np +from scipy.linalg import block_diag +from sklearn.decomposition import PCA + +try: + from faiss.contrib.datasets_fb import DatasetGIST1M +except ImportError: + from faiss.contrib.datasets import DatasetGIST1M + +ds = DatasetGIST1M() + +SUBSET = 0.1 # Set to 1.0 for full dataset + +xq = ds.get_queries() +xb_full = ds.get_database() +nb_full = xb_full.shape[0] +nb = int(nb_full * SUBSET) +xb = xb_full[:nb].copy() +del xb_full + +gt = ds.get_groundtruth() if SUBSET == 1.0 else None +xt = ds.get_train()[:max(nb // 2, 50000)] + +nb, d = xb.shape +nq = xq.shape[0] +nt = xt.shape[0] + +k = 10 + +if gt is None: + print(f"Computing ground truth for {SUBSET*100:.0f}% subset ({nb} vectors)...") + flat = faiss.IndexFlatL2(d) + flat.add(xb) + _, gt = flat.search(xq, k) +else: + gt = gt[:, :k] + +print(f"Database: {nb} x {d}, Queries: {nq}, Train: {nt}") + +ALPHA = 8 +M_values = [960, 480, 240] +nbits = 8 +nlist = 128 +n_levels = 16 +nprobes = [1, 2, 4, 8, 16, 32, 64] + + +def get_ivf_index(index): + if isinstance(index, faiss.IndexPreTransform): + return faiss.downcast_index(index.index) + return index + + +def eval_recall(index, nprobe_val): + faiss.cvar.indexPanorama_stats.reset() + t0 = time.time() + _, I = index.search(xq, k=k) + t = time.time() - t0 + speed = t * 1000 / nq + qps = 1000 / speed + + recall = np.mean( + [len(set(gt[i]) & set(I[i])) / k for i in range(nq)], + ) + ratio_dims_scanned = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + print( + f"\tnprobe {nprobe_val:3d}, 
Recall@{k}: " + f"{recall:.6f}, speed: {speed:.6f} ms/query, QPS: {qps:.1f}, " + f"dims scanned: {ratio_dims_scanned * 100:.1f}%" + ) + + return recall, qps + + +def eval_index(index, label): + ivf_index = get_ivf_index(index) + + faiss.omp_set_num_threads(1) + + data = [] + print(f"====== {label}") + for nprobe in nprobes: + ivf_index.nprobe = nprobe + recall, qps = eval_recall(index, nprobe) + data.append((recall, qps)) + + data = np.array(data) + plt.plot(data[:, 0], data[:, 1], "o-", label=label) + + +def build_ivfpq(M): + """Build vanilla IVFPQ (no transform) via index_factory.""" + index = faiss.index_factory(d, f"IVF{nlist},PQ{M}x{nbits}") + faiss.omp_set_num_threads(mp.cpu_count()) + index.train(xt) + index.add(xb) + return index + + +def compute_level_energies(variances, n_levels, block_size): + """Sum per-dimension variances into per-level total energies.""" + return np.array([ + np.sum(variances[l * block_size : (l + 1) * block_size]) + for l in range(n_levels) + ]) + + +def find_n_spill(variances, level_start, block_size, max_energy_per_level, d): + """Find the smallest number of extra dimensions to spill into. + + After a random rotation over (block_size + n_spill) dims, each dim gets + uniform expected energy. The level's expected energy becomes: + block_size * total_subspace_energy / (block_size + n_spill) + + Returns the smallest n_spill >= 1 where this is <= max_energy_per_level, + or all remaining dims if the cap can't be reached. 
+ """ + level_end = level_start + block_size + max_extra = d - level_end + if max_extra == 0: + return 0 + + total = np.sum(variances[level_start:level_end]) + for n in range(1, max_extra + 1): + total += variances[level_end + n - 1] + if block_size * total / (block_size + n) <= max_energy_per_level: + return n + + return max_extra + + +def random_orthogonal(size, rng): + """Haar-distributed random orthogonal matrix via QR of Gaussian.""" + H = rng.randn(size, size).astype(np.float32) + Q, R = np.linalg.qr(H) + Q *= np.sign(np.diag(R))[:, None] + return Q + + +def build_energy_spill_rotation(eigenvalues, n_levels, block_size, + alpha, seed=42): + """Orthogonal matrix that caps per-level energy via localized rotations. + + Iterates over levels sequentially. When a level's effective energy + exceeds alpha * avg_energy_per_level, applies a random rotation spanning + that level plus enough subsequent dimensions to bring the expected level + energy down to the cap. + + Variances are tracked analytically: after each rotation the dims in the + rotated subspace are set to uniform expected variance. + + Returns (spill_rotation, effective_variances). 
+ """ + d = len(eigenvalues) + total_energy = float(np.sum(eigenvalues)) + max_energy_per_level = alpha * total_energy / n_levels + + variances = eigenvalues.astype(np.float32).copy() + spill_matrix = np.eye(d, dtype=np.float32) + rng = np.random.RandomState(seed) + + for level in range(n_levels): + start = level * block_size + end = start + block_size + level_energy = float(np.sum(variances[start:end])) + + if level_energy <= max_energy_per_level: + continue + + n_spill = find_n_spill( + variances, start, block_size, max_energy_per_level, d, + ) + if n_spill == 0: + continue + + sub_end = end + n_spill + Q = random_orthogonal(block_size + n_spill, rng) + + full_Q = np.eye(d, dtype=np.float32) + full_Q[start:sub_end, start:sub_end] = Q + spill_matrix = full_Q @ spill_matrix + + avg_var = float(np.sum(variances[start:sub_end])) / (block_size + n_spill) + variances[start:sub_end] = avg_var + + return spill_matrix, variances + + +def build_level_equalization_rotation(d, n_levels, block_size, seed=77): + """Block-diagonal random rotation for within-level energy equalization.""" + rng = np.random.RandomState(seed) + blocks = [random_orthogonal(block_size, rng) for _ in range(n_levels)] + return block_diag(*blocks).astype(np.float32) + + +def print_energy_diagnostics(eigenvalues, effective_variances, n_levels, + block_size, alpha): + """Print per-level energy before/after the spill transform.""" + before = compute_level_energies(eigenvalues, n_levels, block_size) + after = compute_level_energies(effective_variances, n_levels, block_size) + total = float(np.sum(eigenvalues)) + cap = alpha * total / n_levels + + +def make_pca_level_rotation_transform(xt, n_levels, alpha=ALPHA, seed=77): + """Build PCA + energy-spill + per-level rotation as one LinearTransform. + + Pipeline: y = R_eq @ R_spill @ P @ (x - mean) + 1. Center + PCA project (P, mean) + 2. Energy spill across levels (R_spill) + 3. 
Within-level equalization (R_eq, block-diagonal) + + Stored as: A = R_eq @ R_spill @ P, b = -A @ mean + """ + dim = xt.shape[1] + block_size = dim // n_levels + + pca = PCA(n_components=dim) + pca.fit(xt) + P = pca.components_.astype(np.float32) + mean = pca.mean_.astype(np.float32) + eigenvalues = pca.explained_variance_.astype(np.float32) + + R_spill, effective_variances = build_energy_spill_rotation( + eigenvalues, n_levels, block_size, alpha, seed=seed, + ) + print_energy_diagnostics( + eigenvalues, effective_variances, n_levels, block_size, alpha, + ) + + R_eq = build_level_equalization_rotation( + dim, n_levels, block_size, seed=seed + 1, + ) + + combined = (R_eq @ R_spill @ P).astype(np.float32) + + lt = faiss.LinearTransform(dim, dim, True) + faiss.copy_array_to_vector(combined.ravel(), lt.A) + faiss.copy_array_to_vector(-(combined @ mean).ravel(), lt.b) + lt.is_trained = True + lt.have_bias = True + + return lt + + +def build_ivfpq_panorama(M, n_levels, alpha=ALPHA): + """Build PCA + EnergySpill + LevelRotation + IVFPQPanorama.""" + lt = make_pca_level_rotation_transform(xt, n_levels, alpha=alpha) + + quantizer = faiss.IndexFlatL2(d) + ivfpq_pano = faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, + ) + + index = faiss.IndexPreTransform(lt, ivfpq_pano) + + faiss.omp_set_num_threads(mp.cpu_count()) + index.train(xt) + index.add(xb) + + return index + + +plt.figure(figsize=(10, 7), dpi=80) + +for M in M_values: + ivfpq = build_ivfpq(M) + eval_index(ivfpq, label=f"IVFPQ (M={M})") + del ivfpq + + pano = build_ivfpq_panorama(M, n_levels) + eval_index(pano, label=f"PCA+Spill+Rot + IVFPQPanorama (M={M})") + del pano + +plt.title( + f"IVFPQ Panorama on GIST ({SUBSET*100:.0f}% subset, nlist={nlist})", +) +plt.xlabel(f"Recall@{k}") +plt.ylabel("QPS") +plt.yscale("log") +plt.legend(bbox_to_anchor=(1.02, 0.1), loc="upper left", borderaxespad=0) +plt.savefig("bench_ivfpq_panorama.png", bbox_inches="tight") +print("\nBenchmark complete! 
Plot saved to bench_ivfpq_panorama.png") diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index 42e6dd14eb..fdd25d4500 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -10,6 +10,7 @@ # ============================================================================= set(FAISS_SIMD_AVX2_SRC impl/fast_scan/impl-avx2.cpp + impl/panorama_kernels/panorama_kernels-avx2.cpp impl/pq_code_distance/pq_code_distance-avx2.cpp impl/scalar_quantizer/sq-avx2.cpp impl/approx_topk/avx2.cpp @@ -17,17 +18,20 @@ set(FAISS_SIMD_AVX2_SRC ) set(FAISS_SIMD_AVX512_SRC impl/fast_scan/impl-avx512.cpp + impl/panorama_kernels/panorama_kernels-avx512.cpp impl/pq_code_distance/pq_code_distance-avx512.cpp impl/scalar_quantizer/sq-avx512.cpp utils/simd_impl/distances_avx512.cpp ) set(FAISS_SIMD_NEON_SRC impl/fast_scan/impl-neon.cpp + impl/panorama_kernels/panorama_kernels-neon.cpp impl/scalar_quantizer/sq-neon.cpp impl/approx_topk/neon.cpp utils/simd_impl/distances_aarch64.cpp ) set(FAISS_SIMD_SVE_SRC + impl/panorama_kernels/panorama_kernels-sve.cpp impl/pq_code_distance/pq_code_distance-sve.cpp utils/simd_impl/distances_arm_sve.cpp ) @@ -63,6 +67,7 @@ set(FAISS_SRC IndexIVFFlat.cpp IndexIVFFlatPanorama.cpp IndexIVFPQ.cpp + IndexIVFPQPanorama.cpp IndexIVFFastScan.cpp IndexIVFAdditiveQuantizerFastScan.cpp IndexIVFPQFastScan.cpp @@ -105,6 +110,7 @@ set(FAISS_SRC impl/NSG.cpp impl/PolysemousTraining.cpp impl/ProductQuantizer.cpp + impl/panorama_kernels/panorama_kernels-generic.cpp impl/pq_code_distance/pq_code_distance-generic.cpp impl/AdditiveQuantizer.cpp impl/RaBitQuantizer.cpp @@ -127,6 +133,7 @@ set(FAISS_SRC impl/zerocopy_io.cpp impl/NNDescent.cpp impl/Panorama.cpp + impl/PanoramaPQ.cpp impl/PanoramaStats.cpp invlists/BlockInvertedLists.cpp invlists/DirectMap.cpp @@ -184,6 +191,7 @@ set(FAISS_HEADERS IndexIVFFlat.h IndexIVFFlatPanorama.h IndexIVFPQ.h + IndexIVFPQPanorama.h IndexIVFFastScan.h IndexIVFAdditiveQuantizerFastScan.h IndexIVFPQFastScan.h @@ -236,6 +244,7 @@ 
set(FAISS_HEADERS impl/NNDescent.h impl/NSG.h impl/Panorama.h + impl/PanoramaPQ.h impl/PanoramaStats.h impl/PolysemousTraining.h impl/ProductQuantizer-inl.h @@ -278,6 +287,8 @@ set(FAISS_HEADERS impl/fast_scan/simd_result_handlers.h impl/zerocopy_io.h utils/pq_code_distance.h + impl/panorama_kernels/panorama_kernels.h + impl/panorama_kernels/panorama_kernels-inl.h impl/pq_code_distance/pq_code_distance-inl.h invlists/BlockInvertedLists.h invlists/DirectMap.h @@ -354,6 +365,15 @@ endif() # Export FAISS_HEADERS variable to parent scope. set(FAISS_HEADERS ${FAISS_HEADERS} PARENT_SCOPE) +# Detect BMI2 compiler support (PEXT/PDEP used in Panorama code compression). +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-mbmi2" COMPILER_SUPPORTS_BMI2) +if(COMPILER_SUPPORTS_BMI2) + set(FAISS_BMI2_FLAGS "-mbmi2") +else() + set(FAISS_BMI2_FLAGS "") +endif() + add_library(faiss ${FAISS_SRC}) add_library(faiss_avx2 ${FAISS_SRC}) @@ -361,7 +381,7 @@ if(NOT FAISS_OPT_LEVEL STREQUAL "avx2" AND NOT FAISS_OPT_LEVEL STREQUAL "avx512" set_target_properties(faiss_avx2 PROPERTIES EXCLUDE_FROM_ALL TRUE) endif() if(NOT WIN32) - target_compile_options(faiss_avx2 PRIVATE $<$:-mavx2 -mfma -mf16c -mpopcnt>) + target_compile_options(faiss_avx2 PRIVATE $<$:-mavx2 -mfma -mf16c -mpopcnt ${FAISS_BMI2_FLAGS}>) else() # MSVC enables FMA with /arch:AVX2; no separate flags for F16C, POPCNT # Ref. FMA (under /arch:AVX2): https://docs.microsoft.com/en-us/cpp/build/reference/arch-x64 @@ -381,7 +401,7 @@ endif() if(NOT WIN32) # All modern CPUs support F, CD, VL, DQ, BW extensions. 
# Ref: https://en.wikipedia.org/wiki/AVX512 - target_compile_options(faiss_avx512 PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt>) + target_compile_options(faiss_avx512 PRIVATE $<$:-mavx2 -mfma -mf16c -mavx512f -mavx512cd -mavx512vl -mavx512dq -mavx512bw -mpopcnt ${FAISS_BMI2_FLAGS}>) else() target_compile_options(faiss_avx512 PRIVATE $<$:/arch:AVX512>) # we need bigobj for the swig wrapper diff --git a/faiss/IndexFlat.cpp b/faiss/IndexFlat.cpp index 7eb76958ad..a17215737d 100644 --- a/faiss/IndexFlat.cpp +++ b/faiss/IndexFlat.cpp @@ -696,7 +696,7 @@ void IndexFlatPanorama::add(idx_t n, const float* x) { const uint8_t* code = reinterpret_cast(x); pano.copy_codes_to_level_layout(codes.data(), offset, n, code); - pano.compute_cumulative_sums(cum_sums.data(), offset, n, x); + pano.compute_cumulative_sums(cum_sums.data(), offset, n, code); } void IndexFlatPanorama::search( @@ -893,12 +893,12 @@ void IndexFlatPanorama::search_subset( bool pruned = false; for (size_t level = 0; level < n_levels; level++) { local_stats.total_dims_scanned += - pano.level_width_floats; + pano.level_width_dims; // Refine distance size_t actual_level_width = std::min( - pano.level_width_floats, - d - level * pano.level_width_floats); + pano.level_width_dims, + d - level * pano.level_width_dims); float dot_product = fvec_inner_product( x_ptr, p_ptr, actual_level_width); if constexpr (is_sim) { @@ -931,8 +931,8 @@ void IndexFlatPanorama::search_subset( } cum_sum_offset++; - x_ptr += pano.level_width_floats; - p_ptr += pano.level_width_floats; + x_ptr += pano.level_width_dims; + p_ptr += pano.level_width_dims; } if (!pruned) { diff --git a/faiss/IndexFlat.h b/faiss/IndexFlat.h index 4cdc7d4386..7e10f05b25 100644 --- a/faiss/IndexFlat.h +++ b/faiss/IndexFlat.h @@ -104,7 +104,7 @@ struct IndexFlatPanorama : IndexFlat { const size_t batch_size; const size_t n_levels; std::vector cum_sums; - Panorama pano; + PanoramaFlat pano; /** * @param d 
dimensionality of the input vectors @@ -120,7 +120,7 @@ struct IndexFlatPanorama : IndexFlat { : IndexFlat(d_in, metric), batch_size(batch_size_in), n_levels(n_levels_in), - pano(code_size, n_levels_in, batch_size_in) { + pano(d_in, n_levels_in, batch_size_in) { FAISS_THROW_IF_NOT( metric == METRIC_L2 || metric == METRIC_INNER_PRODUCT); } diff --git a/faiss/IndexHNSW.cpp b/faiss/IndexHNSW.cpp index f87003a76c..8f2f5f3e8a 100644 --- a/faiss/IndexHNSW.cpp +++ b/faiss/IndexHNSW.cpp @@ -680,7 +680,7 @@ IndexHNSWFlatPanorama::IndexHNSWFlatPanorama( MetricType metric) : IndexHNSWFlat(d_in, M, metric), cum_sums(), - pano(d_in * sizeof(float), num_panorama_levels_in, 1), + pano(d_in, num_panorama_levels_in, 1), num_panorama_levels(num_panorama_levels_in) { // For now, we only support L2 distance. // Supporting dot product and cosine distance is a trivial addition @@ -696,7 +696,8 @@ IndexHNSWFlatPanorama::IndexHNSWFlatPanorama( void IndexHNSWFlatPanorama::add(idx_t n, const float* x) { idx_t n0 = ntotal; cum_sums.resize((ntotal + n) * (pano.n_levels + 1)); - pano.compute_cumulative_sums(cum_sums.data(), n0, n, x); + pano.compute_cumulative_sums( + cum_sums.data(), n0, n, reinterpret_cast(x)); IndexHNSWFlat::add(n, x); } diff --git a/faiss/IndexHNSW.h b/faiss/IndexHNSW.h index a43828d428..17d54ecaa2 100644 --- a/faiss/IndexHNSW.h +++ b/faiss/IndexHNSW.h @@ -179,7 +179,7 @@ struct IndexHNSWFlatPanorama : IndexHNSWFlat { } std::vector cum_sums; - Panorama pano; + PanoramaFlat pano; const size_t num_panorama_levels; }; diff --git a/faiss/IndexIVFFlatPanorama.cpp b/faiss/IndexIVFFlatPanorama.cpp index 7bda245275..d05c73a049 100644 --- a/faiss/IndexIVFFlatPanorama.cpp +++ b/faiss/IndexIVFFlatPanorama.cpp @@ -39,7 +39,8 @@ IndexIVFFlatPanorama::IndexIVFFlatPanorama( // We construct the inverted lists here so that we can use the // level-oriented storage. This does not cause a leak as we constructed // IndexIVF first, with own_invlists set to false. 
- this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, n_levels); + auto* pano = new PanoramaFlat(d, n_levels, Panorama::kDefaultBatchSize); + this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano); this->own_invlists = own_invlists_in; } @@ -51,6 +52,7 @@ template struct IVFFlatScannerPanorama : InvertedListScanner { VectorDistance vd; const ArrayInvertedListsPanorama* storage; + const PanoramaFlat* pano_flat; using C = typename VectorDistance::C; static constexpr MetricType metric = VectorDistance::metric; @@ -61,10 +63,14 @@ struct IVFFlatScannerPanorama : InvertedListScanner { const IDSelector* sel_in) : InvertedListScanner(store_pairs_in, sel_in), vd(vd_in), - storage(storage_in) { + storage(storage_in), + pano_flat( + dynamic_cast( + storage_in->pano.get())) { + FAISS_THROW_IF_NOT(pano_flat); keep_max = vd.is_similarity; code_size = vd.d * sizeof(float); - cum_sums.resize(storage->n_levels + 1); + cum_sums.resize(pano_flat->n_levels + 1); } const float* xi = nullptr; @@ -72,7 +78,7 @@ struct IVFFlatScannerPanorama : InvertedListScanner { float q_norm = 0.0f; void set_query(const float* query) override { this->xi = query; - this->storage->pano.compute_query_cum_sums(query, cum_sums.data()); + pano_flat->compute_query_cum_sums(query, cum_sums.data()); q_norm = cum_sums[0] * cum_sums[0]; } @@ -96,22 +102,22 @@ struct IVFFlatScannerPanorama : InvertedListScanner { ResultHandler& handler) const override { size_t nup = 0; - const size_t n_batches = - (list_size + storage->kBatchSize - 1) / storage->kBatchSize; + const size_t bs = pano_flat->batch_size; + const size_t n_batches = (list_size + bs - 1) / bs; const float* cum_sums_data = storage->get_cum_sums(list_no); - std::vector exact_distances(storage->kBatchSize); - std::vector active_indices(storage->kBatchSize); + std::vector exact_distances(bs); + std::vector active_indices(bs); PanoramaStats local_stats; local_stats.reset(); for (size_t batch_no = 0; batch_no < n_batches; 
batch_no++) { - size_t batch_start = batch_no * storage->kBatchSize; + size_t batch_start = batch_no * bs; size_t num_active = with_metric_type(metric, [&]() { - return storage->pano.progressive_filter_batch( + return pano_flat->progressive_filter_batch( codes, cum_sums_data, xi, diff --git a/faiss/IndexIVFPQPanorama.cpp b/faiss/IndexIVFPQPanorama.cpp new file mode 100644 index 0000000000..92862713f7 --- /dev/null +++ b/faiss/IndexIVFPQPanorama.cpp @@ -0,0 +1,231 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace faiss { + +/***************************************** + * Constructor + ******************************************/ + +IndexIVFPQPanorama::IndexIVFPQPanorama( + Index* quantizer, + size_t d, + size_t nlist, + size_t M, + size_t nbits_per_idx, + int n_levels, + size_t batch_size, + MetricType metric, + bool own_invlists) + : IndexIVFPQ(quantizer, d, nlist, M, nbits_per_idx, metric, false), + n_levels(n_levels), + batch_size(batch_size), + levels_size(d / n_levels) { + FAISS_THROW_IF_NOT_MSG( + M % n_levels == 0, "M must be divisible by n_levels"); + FAISS_THROW_IF_NOT_MSG( + batch_size % 64 == 0, "batch_size must be multiple of 64"); + FAISS_THROW_IF_NOT_MSG(nbits_per_idx == 8, "only 8-bit PQ codes supported"); + FAISS_THROW_IF_NOT_MSG( + M == code_size, "M must equal code_size for 8-bit PQ"); + FAISS_THROW_IF_NOT_MSG(metric == METRIC_L2, "only L2 metric supported"); + + auto* pano = + new PanoramaPQ(d, code_size, n_levels, batch_size, &pq, quantizer); + this->invlists = new ArrayInvertedListsPanorama(nlist, code_size, pano); + this->own_invlists = own_invlists; +} + +/***************************************** + * Panorama scanner — overrides scan_codes with batch processing + 
******************************************/ + +namespace { + +using idx_t = faiss::idx_t; + +template +struct IVFPQScannerPanorama : InvertedListScanner { + const IndexIVFPQPanorama& index; + const ProductQuantizer& pq; + const ArrayInvertedListsPanorama* storage; + const PanoramaPQ* pano_pq; + + // Query state + const float* qi = nullptr; + std::vector query_cum_norms; + std::vector sim_table_2; + + // Per-list state + float coarse_dis = 0; + + IVFPQScannerPanorama( + const IndexIVFPQPanorama& index, + const ArrayInvertedListsPanorama* storage, + bool store_pairs, + const IDSelector* sel) + : InvertedListScanner(store_pairs, sel), + index(index), + pq(index.pq), + storage(storage), + pano_pq(dynamic_cast(storage->pano.get())) { + FAISS_THROW_IF_NOT(pano_pq); + this->keep_max = is_similarity_metric(index.metric_type); + this->code_size = pq.code_size; + query_cum_norms.resize(index.n_levels + 1); + sim_table_2.resize(pq.M * pq.ksub); + } + + void set_query(const float* query) override { + this->qi = query; + + FAISS_ASSERT(index.by_residual); + FAISS_ASSERT(index.use_precomputed_table == 1); + + pq.compute_inner_prod_table(qi, sim_table_2.data()); + + // The PQ distance LUT is -2 * inner_prod_table; apply in-place + // so scan_codes() can use sim_table_2 directly. 
+ const size_t n = pq.M * pq.ksub; + for (size_t i = 0; i < n; i++) { + sim_table_2[i] *= -2.0f; + } + + pano_pq->compute_query_cum_sums(qi, query_cum_norms.data()); + } + + void set_list(idx_t list_no, float coarse_dis) override { + this->list_no = list_no; + this->coarse_dis = coarse_dis; + } + + float distance_to_code(const uint8_t* code) const override { + FAISS_THROW_MSG("IndexIVFPQPanorama does not support distance_to_code"); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* /* codes (column-major in storage) */, + const idx_t* ids, + float* distances, + idx_t* labels, + size_t k) const override { + size_t nup = 0; + + const size_t bs = index.batch_size; + const size_t ls = pano_pq->level_width_bytes; + + const size_t n_batches = (list_size + bs - 1) / bs; + const uint8_t* col_codes = storage->get_codes(list_no); + const float* list_cum_sums = storage->get_cum_sums(list_no); + const float* list_init_dists = storage->get_init_dists(list_no); + + // Scratch buffers. + std::vector exact_distances(bs); + std::vector bitset(bs); + std::vector active_indices(bs); + std::vector compressed_codes(bs * ls); + float dis0 = coarse_dis; + + PanoramaStats local_stats; + local_stats.reset(); + + for (size_t batch_no = 0; batch_no < n_batches; batch_no++) { + size_t num_active = pano_pq->progressive_filter_batch( + col_codes, + list_cum_sums, + list_init_dists, + sim_table_2.data(), + query_cum_norms.data(), + dis0, + list_size, + batch_no, + ids, + sel, + exact_distances, + active_indices, + bitset, + compressed_codes, + distances[0], + local_stats); + + // Insert surviving candidates into heap. + for (size_t i = 0; i < num_active; i++) { + float dis = dis0 + exact_distances[i]; + if (C::cmp(distances[0], dis)) { + idx_t id = store_pairs + ? 
lo_build(list_no, active_indices[i]) + : ids[active_indices[i]]; + heap_replace_top(k, distances, labels, dis, id); + nup++; + } + } + } + + indexPanorama_stats.add(local_stats); + return nup; + } + + size_t scan_codes( + size_t n, + const uint8_t* codes, + const idx_t* ids, + ResultHandler& handler) const override { + FAISS_THROW_MSG( + "IndexIVFPQPanorama: ResultHandler scan_codes not supported"); + } +}; + +} // anonymous namespace + +/***************************************** + * get_InvertedListScanner + ******************************************/ + +InvertedListScanner* IndexIVFPQPanorama::get_InvertedListScanner( + bool store_pairs, + const IDSelector* sel, + const IVFSearchParameters*) const { + FAISS_THROW_IF_NOT_MSG( + metric_type == METRIC_L2, "only L2 metric supported"); + FAISS_THROW_IF_NOT_MSG( + use_precomputed_table == 1, + "Panorama PQ requires use_precomputed_table == 1"); + FAISS_THROW_IF_NOT_MSG(pq.nbits == 8, "only 8-bit PQ codes supported"); + FAISS_THROW_IF_NOT_MSG(by_residual, "Panorama PQ requires by_residual"); + FAISS_THROW_IF_NOT_MSG( + polysemous_ht == 0, "Panorama PQ does not support polysemous"); + + const auto* storage = + dynamic_cast(invlists); + FAISS_THROW_IF_NOT_MSG( + storage, "IndexIVFPQPanorama requires ArrayInvertedListsPanorama"); + + if (sel) { + return new IVFPQScannerPanorama, true>( + *this, storage, store_pairs, sel); + } else { + return new IVFPQScannerPanorama, false>( + *this, storage, store_pairs, sel); + } +} + +} // namespace faiss diff --git a/faiss/IndexIVFPQPanorama.h b/faiss/IndexIVFPQPanorama.h new file mode 100644 index 0000000000..8bcc97b624 --- /dev/null +++ b/faiss/IndexIVFPQPanorama.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifndef FAISS_INDEX_IVFPQ_PANORAMA_H +#define FAISS_INDEX_IVFPQ_PANORAMA_H + +#include + +#include +#include + +namespace faiss { + +/// Panorama adaptation of IndexIVFPQ following +/// https://www.arxiv.org/pdf/2510.00566. +/// +/// IDEA: +/// Panorama adapts the storage layout within each cluster and uses +/// Cauchy-Schwarz pruning to skip unnecessary distance computations. +/// Combined with orthogonal transforms upstream that concentrate signal +/// energy in the early PQ subquantizers (like PCA), Panorama can prune +/// the majority of candidates after computing only a fraction of the +/// full PQ distance. +/// +/// STORAGE LAYOUT: +/// Standard IVFPQ stores codes row-major: [point0_code, point1_code, ...]. +/// Panorama transposes codes into column-major within each batch: +/// for each batch of `batch_size` points, codes are stored as +/// M columns of `batch_size` bytes each. The M columns are grouped +/// into `n_levels` levels of `level_width_bytes` columns, enabling incremental +/// distance computation level-by-level. +/// +/// Storage is managed by ArrayInvertedListsPanorama with a PanoramaPQ +/// instance that handles code transposition and cumulative sum computation +/// (via PQ decoding) on insertion. +/// +/// OVERHEAD: +/// Panorama precomputes per-point cumulative residual norms at insertion +/// time. Storage overhead is (n_levels + 1) floats per point for +/// cum_sums. Initial exact distances are computed on-the-fly during +/// search using the precomputed_table (no extra per-point storage). +/// +/// CONSTRAINTS: +/// - Only L2 metric is supported (for now). +/// - Only 8-bit PQ codes (nbits_per_idx == 8). +/// - M must be divisible by n_levels. +/// - batch_size must be a multiple of 64. +/// - use_precomputed_table must be 1. +/// +/// NOTE: +/// We inherit from IndexIVFPQ and override only get_InvertedListScanner(). +/// The base IndexIVF::search_preassigned() handles all search +/// orchestration — no search code is duplicated. 
+/// Storage (transposition + cum_sums) is handled by +/// ArrayInvertedListsPanorama, so no add() override is needed. +struct IndexIVFPQPanorama : public IndexIVFPQ { + int n_levels; + size_t batch_size; + + size_t levels_size; + + IndexIVFPQPanorama( + Index* quantizer, + size_t d, + size_t nlist, + size_t M, + size_t nbits_per_idx, + int n_levels, + size_t batch_size = Panorama::kDefaultBatchSize, + MetricType metric = METRIC_L2, + bool own_invlists = true); + + IndexIVFPQPanorama() = default; + + InvertedListScanner* get_InvertedListScanner( + bool store_pairs, + const IDSelector* sel, + const IVFSearchParameters* params) const override; +}; + +} // namespace faiss + +#endif diff --git a/faiss/impl/HNSW.cpp b/faiss/impl/HNSW.cpp index 381adff002..0dd07a374a 100644 --- a/faiss/impl/HNSW.cpp +++ b/faiss/impl/HNSW.cpp @@ -864,10 +864,10 @@ int search_from_candidates_panorama( while (curr_panorama_level < num_panorama_levels && batch_size > 0) { float query_cum_norm = query_cum_sums[curr_panorama_level + 1]; - size_t start_dim = curr_panorama_level * - panorama_index->pano.level_width_floats; + size_t start_dim = + curr_panorama_level * panorama_index->pano.level_width_dims; size_t end_dim = (curr_panorama_level + 1) * - panorama_index->pano.level_width_floats; + panorama_index->pano.level_width_dims; end_dim = std::min(end_dim, static_cast(panorama_index->d)); size_t i = 0; diff --git a/faiss/impl/Panorama.cpp b/faiss/impl/Panorama.cpp index 34211eceba..dd105f5ec4 100644 --- a/faiss/impl/Panorama.cpp +++ b/faiss/impl/Panorama.cpp @@ -26,7 +26,7 @@ inline void compute_cum_sums_impl( float* output, size_t d, size_t n_levels, - size_t level_width_floats, + size_t level_width_dims, OffsetFunc&& get_offset) { // Iterate backwards through levels, accumulating sum as we go. 
// This avoids computing the suffix sum for each vector, which takes @@ -34,9 +34,9 @@ inline void compute_cum_sums_impl( float sum = 0.0f; for (int level = n_levels - 1; level >= 0; level--) { - size_t start_idx = level * level_width_floats; + size_t start_idx = level * level_width_dims; size_t end_idx = std::min( - (level + 1) * level_width_floats, static_cast(d)); + (level + 1) * level_width_dims, static_cast(d)); for (size_t j = start_idx; j < end_idx; j++) { sum += vector[j] * vector[j]; @@ -51,14 +51,16 @@ inline void compute_cum_sums_impl( } // namespace /************************************************************** - * Panorama structure implementation + * Panorama base class implementation **************************************************************/ Panorama::Panorama( + size_t d_in, size_t code_size_in, size_t n_levels_in, size_t batch_size_in) - : code_size(code_size_in), + : d(d_in), + code_size(code_size_in), n_levels(n_levels_in), batch_size(batch_size_in) { set_derived_values(); @@ -66,9 +68,7 @@ Panorama::Panorama( void Panorama::set_derived_values() { FAISS_THROW_IF_NOT_MSG(n_levels > 0, "Panorama: n_levels must be > 0"); - this->d = code_size / sizeof(float); - this->level_width_floats = ((d + n_levels - 1) / n_levels); - this->level_width = this->level_width_floats * sizeof(float); + level_width_bytes = (code_size + n_levels - 1) / n_levels; } /** @@ -93,10 +93,10 @@ void Panorama::copy_codes_to_level_layout( // Copy entry into level-oriented layout for this batch. 
size_t batch_offset = batch_no * batch_size * code_size; for (size_t level = 0; level < n_levels; level++) { - size_t level_offset = level * level_width * batch_size; - size_t start_byte = level * level_width; - size_t actual_level_width = - std::min(level_width, code_size - level * level_width); + size_t level_offset = level * level_width_bytes * batch_size; + size_t start_byte = level * level_width_bytes; + size_t actual_level_width = std::min( + level_width_bytes, code_size - level * level_width_bytes); const uint8_t* src = code + entry_idx * code_size + start_byte; uint8_t* dest = codes + batch_offset + level_offset + @@ -107,38 +107,12 @@ void Panorama::copy_codes_to_level_layout( } } -void Panorama::compute_cumulative_sums( - float* cumsum_base, - size_t offset, - size_t n_entry, - const float* vectors) const { - for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { - size_t current_pos = offset + entry_idx; - size_t batch_no = current_pos / batch_size; - size_t pos_in_batch = current_pos % batch_size; - - const float* vector = vectors + entry_idx * d; - size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1); - - auto get_offset = [&](size_t level) { - return cumsum_batch_offset + level * batch_size + pos_in_batch; - }; - - compute_cum_sums_impl( - vector, - cumsum_base, - d, - n_levels, - level_width_floats, - get_offset); - } -} - void Panorama::compute_query_cum_sums(const float* query, float* query_cum_sums) const { + size_t level_dims = (d + n_levels - 1) / n_levels; auto get_offset = [](size_t level) { return level; }; compute_cum_sums_impl( - query, query_cum_sums, d, n_levels, level_width_floats, get_offset); + query, query_cum_sums, d, n_levels, level_dims, get_offset); } void Panorama::reconstruct(idx_t key, float* recons, const uint8_t* codes_base) @@ -150,12 +124,12 @@ void Panorama::reconstruct(idx_t key, float* recons, const uint8_t* codes_base) size_t batch_offset = batch_no * batch_size * code_size; for (size_t level = 0; 
level < n_levels; level++) { - size_t level_offset = level * level_width * batch_size; + size_t level_offset = level * level_width_bytes * batch_size; const uint8_t* src = codes_base + batch_offset + level_offset + - pos_in_batch * level_width; - uint8_t* dest = recons_buffer + level * level_width; - size_t copy_size = - std::min(level_width, code_size - level * level_width); + pos_in_batch * level_width_bytes; + uint8_t* dest = recons_buffer + level * level_width_bytes; + size_t copy_size = std::min( + level_width_bytes, code_size - level * level_width_bytes); memcpy(dest, src, copy_size); } } @@ -182,9 +156,9 @@ void Panorama::copy_entry( for (size_t level = 0; level < n_levels; level++) { // Copy code - size_t level_offset = level * level_width * batch_size; - size_t actual_level_width = - std::min(level_width, code_size - level * level_width); + size_t level_offset = level * level_width_bytes * batch_size; + size_t actual_level_width = std::min( + level_width_bytes, code_size - level * level_width_bytes); const uint8_t* src = src_codes + src_batch_offset + level_offset + src_pos_in_batch * actual_level_width; @@ -202,4 +176,38 @@ void Panorama::copy_entry( dest_cum_sums[dest_offset] = src_cum_sums[src_offset]; } } + +/************************************************************** + * PanoramaFlat implementation + **************************************************************/ + +PanoramaFlat::PanoramaFlat(size_t d, size_t n_levels, size_t batch_size) + : Panorama(d, d * sizeof(float), n_levels, batch_size) { + level_width_dims = (d + n_levels - 1) / n_levels; + level_width_bytes = level_width_dims * sizeof(float); +} + +void PanoramaFlat::compute_cumulative_sums( + float* cumsum_base, + size_t offset, + size_t n_entry, + const uint8_t* code) const { + const float* vectors = reinterpret_cast(code); + for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { + size_t current_pos = offset + entry_idx; + size_t batch_no = current_pos / batch_size; + size_t 
pos_in_batch = current_pos % batch_size; + + const float* vector = vectors + entry_idx * d; + size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1); + + auto get_offset = [&](size_t level) { + return cumsum_batch_offset + level * batch_size + pos_in_batch; + }; + + compute_cum_sums_impl( + vector, cumsum_base, d, n_levels, level_width_dims, get_offset); + } +} + } // namespace faiss diff --git a/faiss/impl/Panorama.h b/faiss/impl/Panorama.h index 79a23a64a7..58e889a1f0 100644 --- a/faiss/impl/Panorama.h +++ b/faiss/impl/Panorama.h @@ -40,35 +40,48 @@ namespace faiss { * Coupled with the appropriate orthogonal PreTransform (e.g. PCA, Cayley, * etc.), Panorama can prune the vast majority of dimensions, greatly * accelerating the refinement stage. + * + * This is the abstract base class. Concrete subclasses (PanoramaFlat, + * PanoramaPQ) implement compute_cumulative_sums and progressive_filter_batch + * for their respective code formats. */ struct Panorama { + static constexpr size_t kDefaultBatchSize = 128; + size_t d = 0; size_t code_size = 0; size_t n_levels = 0; - size_t level_width = 0; - size_t level_width_floats = 0; + size_t level_width_bytes = 0; size_t batch_size = 0; - explicit Panorama(size_t code_size, size_t n_levels, size_t batch_size); + Panorama() = default; + Panorama(size_t d, size_t code_size, size_t n_levels, size_t batch_size); + + virtual ~Panorama() = default; void set_derived_values(); /// Helper method to copy codes into level-oriented batch layout at a given /// offset in the list. - void copy_codes_to_level_layout( + /// PanoramaFlat uses row-major within each level (point bytes contiguous). + /// PanoramaPQ overrides to use column-major (subquantizer columns + /// contiguous). + virtual void copy_codes_to_level_layout( uint8_t* codes, size_t offset, size_t n_entry, const uint8_t* code); - /// Helper method to compute the cumulative sums of the codes. 
- /// The cumsums also follow the level-oriented batch layout to minimize the + /// Compute the cumulative sums (suffix norms) for database vectors. + /// The cumsums follow the level-oriented batch layout to minimize the /// number of random memory accesses. - void compute_cumulative_sums( + /// Subclasses interpret the raw code bytes according to their format: + /// PanoramaFlat reinterprets as float*, PanoramaPQ decodes via PQ. + virtual void compute_cumulative_sums( float* cumsum_base, size_t offset, size_t n_entry, - const float* vectors) const; + const uint8_t* code) const = 0; /// Compute the cumulative sums of the query vector. void compute_query_cum_sums(const float* query, float* query_cum_sums) @@ -83,7 +96,32 @@ struct Panorama { size_t dest_idx, size_t src_idx) const; - /// Panorama's core progressive filtering algorithm: + virtual void reconstruct( + idx_t key, + float* recons, + const uint8_t* codes_base) const; +}; + +/** + * Panorama for flat (uncompressed) float vectors. + * + * Codes are raw float vectors (code_size = d * sizeof(float)). + * compute_cumulative_sums interprets codes as floats. + * progressive_filter_batch computes dot products on raw float storage. + */ +struct PanoramaFlat : Panorama { + size_t level_width_dims = 0; + + PanoramaFlat() = default; + PanoramaFlat(size_t d, size_t n_levels, size_t batch_size); + + void compute_cumulative_sums( + float* cumsum_base, + size_t offset, + size_t n_entry, + const uint8_t* code) const override; + + /// Panorama's core progressive filtering algorithm for flat codes: /// Process vectors in batches for cache efficiency. For each batch: /// 1. Apply ID selection filter and initialize distances /// (||y||^2 + ||x||^2). 
@@ -113,12 +151,10 @@ struct Panorama { std::vector& exact_distances, float threshold, PanoramaStats& local_stats) const; - - void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) const; }; template -size_t Panorama::progressive_filter_batch( +size_t PanoramaFlat::progressive_filter_batch( const uint8_t* codes_base, const float* cum_sums, const float* query, @@ -173,18 +209,18 @@ size_t Panorama::progressive_filter_batch( float query_cum_norm = query_cum_sums[level + 1]; - size_t level_offset = level * level_width * batch_size; + size_t level_offset = level * level_width_bytes * batch_size; const float* level_storage = (const float*)(storage_base + level_offset); size_t next_active = 0; for (size_t i = 0; i < num_active; i++) { uint32_t idx = active_indices[i]; - size_t actual_level_width = std::min( - level_width_floats, d - level * level_width_floats); + size_t actual_level_width = + std::min(level_width_dims, d - level * level_width_dims); const float* yj = level_storage + idx * actual_level_width; - const float* query_level = query + level * level_width_floats; + const float* query_level = query + level * level_width_dims; float dot_product = fvec_inner_product(query_level, yj, actual_level_width); diff --git a/faiss/impl/PanoramaPQ.cpp b/faiss/impl/PanoramaPQ.cpp new file mode 100644 index 0000000000..f535ae9b84 --- /dev/null +++ b/faiss/impl/PanoramaPQ.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include + +#include + +namespace faiss { + +void PanoramaPQ::copy_codes_to_level_layout( + uint8_t* codes, + size_t offset, + size_t n_entry, + const uint8_t* code) { + const size_t ls = level_width_bytes; + const size_t bs = batch_size; + + for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { + size_t current_pos = offset + entry_idx; + size_t batch_no = current_pos / bs; + size_t pos_in_batch = current_pos % bs; + size_t batch_offset = batch_no * bs * code_size; + + for (size_t level = 0; level < n_levels; level++) { + size_t level_offset = level * ls * bs; + size_t start_byte = level * ls; + + for (size_t li = 0; li < ls && (start_byte + li) < code_size; + li++) { + codes[batch_offset + level_offset + li * bs + pos_in_batch] = + code[entry_idx * code_size + start_byte + li]; + } + } + } +} + +void PanoramaPQ::reconstruct( + idx_t key, + float* recons, + const uint8_t* codes_base) const { + uint8_t* recons_buffer = reinterpret_cast(recons); + const size_t ls = level_width_bytes; + const size_t bs = batch_size; + + size_t batch_no = key / bs; + size_t pos_in_batch = key % bs; + size_t batch_offset = batch_no * bs * code_size; + + for (size_t level = 0; level < n_levels; level++) { + size_t level_offset = level * ls * bs; + size_t start_byte = level * ls; + + for (size_t li = 0; li < ls && (start_byte + li) < code_size; li++) { + recons_buffer[start_byte + li] = codes_base + [batch_offset + level_offset + li * bs + pos_in_batch]; + } + } +} + +PanoramaPQ::PanoramaPQ( + size_t d, + size_t code_size, + size_t n_levels, + size_t batch_size, + const ProductQuantizer* pq, + const Index* quantizer) + : Panorama(d, code_size, n_levels, batch_size), + pq(pq), + quantizer(quantizer), + levels_size(d / n_levels) { + FAISS_THROW_IF_NOT_MSG( + code_size % n_levels == 0, + "PanoramaPQ: code_size must be divisible by n_levels"); + FAISS_THROW_IF_NOT_MSG(pq != nullptr, "PanoramaPQ: pq must not be null"); +} + +void 
PanoramaPQ::compute_cumulative_sums( + float* cumsum_base, + size_t offset, + size_t n_entry, + const uint8_t* code) const { + for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { + size_t current_pos = offset + entry_idx; + size_t batch_no = current_pos / batch_size; + size_t pos_in_batch = current_pos % batch_size; + + // Decode PQ code to float vector. + std::vector vec(d); + pq->decode(code + entry_idx * code_size, vec.data()); + + // Compute suffix sums of squared norms. + std::vector suffix(d + 1, 0.0f); + for (int j = d - 1; j >= 0; j--) { + suffix[j] = suffix[j + 1] + vec[j] * vec[j]; + } + + // Write into batch-oriented layout. + size_t cumsum_batch_offset = batch_no * batch_size * (n_levels + 1); + for (size_t level = 0; level < n_levels; level++) { + size_t start_idx = level * levels_size; + size_t out_offset = + cumsum_batch_offset + level * batch_size + pos_in_batch; + cumsum_base[out_offset] = + start_idx < d ? std::sqrt(suffix[start_idx]) : 0.0f; + } + + size_t last_offset = + cumsum_batch_offset + n_levels * batch_size + pos_in_batch; + cumsum_base[last_offset] = 0.0f; + } +} + +void PanoramaPQ::compute_init_distances( + float* init_dists_base, + size_t list_no, + size_t offset, + size_t n_entry, + const uint8_t* code) const { + FAISS_THROW_IF_NOT_MSG( + quantizer != nullptr, + "PanoramaPQ: quantizer required for compute_init_distances"); + + std::vector centroid(d); + quantizer->reconstruct(list_no, centroid.data()); + + for (size_t entry_idx = 0; entry_idx < n_entry; entry_idx++) { + std::vector vec(d); + pq->decode(code + entry_idx * code_size, vec.data()); + + float init_dist = 0.0f; + for (size_t j = 0; j < d; j++) { + init_dist += vec[j] * vec[j] + 2 * vec[j] * centroid[j]; + } + + size_t point_idx = offset + entry_idx; + init_dists_base[point_idx] = init_dist; + } +} + +} // namespace faiss diff --git a/faiss/impl/PanoramaPQ.h b/faiss/impl/PanoramaPQ.h new file mode 100644 index 0000000000..b94623cb5e --- /dev/null +++ 
b/faiss/impl/PanoramaPQ.h @@ -0,0 +1,209 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include + +namespace faiss { + +/** + * Panorama for PQ-compressed vectors. + * + * Codes are PQ codes (code_size = M bytes for 8-bit PQ). + * compute_cumulative_sums decodes via PQ then computes suffix norms. + * progressive_filter_batch uses LUT accumulation with panorama_kernels. + */ +struct PanoramaPQ : Panorama { + const ProductQuantizer* pq = nullptr; + const Index* quantizer = nullptr; + size_t levels_size = 0; + + PanoramaPQ() = default; + PanoramaPQ( + size_t d, + size_t code_size, + size_t n_levels, + size_t batch_size, + const ProductQuantizer* pq, + const Index* quantizer = nullptr); + + void copy_codes_to_level_layout( + uint8_t* codes, + size_t offset, + size_t n_entry, + const uint8_t* code) override; + + void reconstruct(idx_t key, float* recons, const uint8_t* codes_base) + const override; + + void compute_cumulative_sums( + float* cumsum_base, + size_t offset, + size_t n_entry, + const uint8_t* code) const override; + + /// Precompute per-point init distances: ||r||^2 + 2. + /// Requires quantizer to be set. Layout is flat per-list, + /// padded to batch_size boundaries. + void compute_init_distances( + float* init_dists_base, + size_t list_no, + size_t offset, + size_t n_entry, + const uint8_t* code) const; + + /// Progressive filtering for PQ codes: processes one batch. + /// + /// Initializes exact_distances from precomputed init_dists + /// (||r||^2 + 2), then refines with the query-specific + /// sim_table_2 level-by-level with Cauchy-Schwarz pruning. + /// + /// @param col_codes Column-major codes for this inverted list. + /// @param list_cum_sums Cumulative sums for this inverted list. 
+ /// @param init_dists Precomputed init distances for this list. + /// @param sim_table_2 -2 * inner_prod_table (query-specific LUT). + /// @param query_cum_norms Query suffix norms per level. + /// @param coarse_dis Coarse distance (dis0) for this list. + /// @param list_size Total number of vectors in this list. + /// @param batch_no Which batch to process. + /// @param ids ID array for the inverted list. + /// @param sel ID selector for filtering (may be nullptr). + /// @param exact_distances [out] Scratch buffer for partial distances. + /// @param active_indices [out] Scratch buffer for survivor indices. + /// @param bitset Scratch buffer for code compression. + /// @param compressed_codes Scratch buffer for compressed codes. + /// @param threshold Current heap threshold for pruning. + /// @param local_stats [out] Accumulated pruning statistics. + /// @return Number of surviving candidates in active_indices. + template + size_t progressive_filter_batch( + const uint8_t* col_codes, + const float* list_cum_sums, + const float* init_dists, + const float* sim_table_2, + const float* query_cum_norms, + float coarse_dis, + size_t list_size, + size_t batch_no, + const idx_t* ids, + const IDSelector* sel, + std::vector& exact_distances, + std::vector& active_indices, + std::vector& bitset, + std::vector& compressed_codes, + float threshold, + PanoramaStats& local_stats) const { + const size_t bs = batch_size; + const size_t ls = level_width_bytes; + const size_t ksub = pq->ksub; + + size_t curr_batch_size = std::min(list_size - batch_no * bs, bs); + size_t b_offset = batch_no * bs; + + // Initialize active set with ID-filtered vectors. 
+ std::fill(bitset.begin(), bitset.end(), 0); + size_t num_active = 0; + const float* batch_init = init_dists + b_offset; + for (size_t i = 0; i < curr_batch_size; i++) { + size_t global_idx = b_offset + i; + if (use_sel) { + idx_t id = ids[global_idx]; + if (!sel->is_member(id)) { + continue; + } + } + active_indices[num_active] = global_idx; + exact_distances[num_active] = batch_init[i]; + bitset[i] = 1; + num_active++; + } + + if (num_active == 0) { + return 0; + } + + const uint8_t* batch_codes = col_codes + b_offset * code_size; + + const float* batch_cums = list_cum_sums + b_offset * (n_levels + 1); + + size_t next_num_active = num_active; + size_t batch_offset = batch_no * bs; + const size_t total_active = next_num_active; + + local_stats.total_dims += total_active * n_levels; + + for (size_t level = 0; level < n_levels && next_num_active > 0; + level++) { + local_stats.total_dims_scanned += next_num_active; + + size_t level_sim_offset = level * ksub * ls; + + float query_cum_norm = 2 * query_cum_norms[level + 1]; + + const float* cum_sums_level = batch_cums + bs * (level + 1); + const uint8_t* codes_level = batch_codes + bs * ls * level; + + const float* sim_table_level = sim_table_2 + level_sim_offset; + + bool is_sparse = next_num_active < bs / 16; + + size_t num_active_for_filtering = 0; + if (is_sparse) { + for (size_t li = 0; li < ls; li++) { + size_t byte_off = li * bs; + const float* chunk_sim = sim_table_level + li * ksub; + for (size_t i = 0; i < next_num_active; i++) { + size_t real_idx = active_indices[i] - batch_offset; + exact_distances[i] += + chunk_sim[codes_level[byte_off + real_idx]]; + } + } + num_active_for_filtering = next_num_active; + } else { + auto [cc, na] = panorama_kernels::process_code_compression( + next_num_active, + bs, + ls, + compressed_codes.data(), + bitset.data(), + codes_level); + + panorama_kernels::process_level( + ls, + bs, + na, + const_cast(sim_table_level), + cc, + exact_distances.data()); + 
num_active_for_filtering = na; + } + + next_num_active = panorama_kernels::process_filtering( + num_active_for_filtering, + exact_distances.data(), + active_indices.data(), + const_cast(cum_sums_level), + bitset.data(), + batch_offset, + coarse_dis, + query_cum_norm, + threshold); + } + + return next_num_active; + } +}; + +} // namespace faiss diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp index f0c433f544..19b9f4b3f8 100644 --- a/faiss/impl/index_read.cpp +++ b/faiss/impl/index_read.cpp @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -54,6 +55,7 @@ #include #include #include +#include #ifdef FAISS_ENABLE_SVS #include #include @@ -480,18 +482,18 @@ std::unique_ptr read_InvertedLists_up( FAISS_CHECK_DESERIALIZATION_LOOP_LIMIT(nlist, "ilpn nlist"); READ1(code_size); READ1(n_levels); + constexpr size_t bs = Panorama::kDefaultBatchSize; FAISS_THROW_IF_NOT_FMT( - n_levels > 0, "invalid ilpn n_levels %zd", n_levels); + n_levels > 0, "invalid ilpn n_levels %zd", n_levels); + auto* pano = new PanoramaFlat(code_size / sizeof(float), n_levels, bs); auto ailp = std::make_unique( - nlist, code_size, n_levels); + nlist, code_size, pano); std::vector sizes(nlist); read_ArrayInvertedLists_sizes(f, sizes); + for (size_t i = 0; i < nlist; i++) { ailp->ids[i].resize(sizes[i]); - size_t num_elems = - ((sizes[i] + ArrayInvertedListsPanorama::kBatchSize - 1) / - ArrayInvertedListsPanorama::kBatchSize) * - ArrayInvertedListsPanorama::kBatchSize; + size_t num_elems = ((sizes[i] + bs - 1) / bs) * bs; ailp->codes[i].resize(num_elems * code_size); ailp->cum_sums[i].resize(num_elems * (n_levels + 1)); } @@ -1535,6 +1537,60 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { READVECTOR(ivsp->trained); read_InvertedLists(*ivsp, f, io_flags); idx = std::move(ivsp); + } else if (h == fourcc("IwPP")) { + auto ivpp = std::make_unique(); + read_ivf_header(ivpp.get(), f); + READ1(ivpp->by_residual); + READ1(ivpp->code_size); + 
read_ProductQuantizer(&ivpp->pq, f); + READ1(ivpp->n_levels); + READ1(ivpp->batch_size); + ivpp->levels_size = ivpp->d / ivpp->n_levels; + read_InvertedLists(*ivpp, f, io_flags); + // The "ilpn" reader creates a PanoramaFlat placeholder; replace + // it with PanoramaPQ now that we have the ProductQuantizer. + auto* storage = + dynamic_cast(ivpp->invlists); + if (storage) { + auto* pano_pq = new PanoramaPQ( + ivpp->d, + ivpp->code_size, + ivpp->n_levels, + ivpp->batch_size, + &ivpp->pq, + ivpp->quantizer); + storage->pano.reset(pano_pq); + + // Recompute init_dists from stored codes + quantizer. + for (size_t list_no = 0; list_no < ivpp->nlist; list_no++) { + size_t list_size = storage->ids[list_no].size(); + if (list_size == 0) + continue; + size_t bs = pano_pq->batch_size; + size_t padded = ((list_size + bs - 1) / bs) * bs; + storage->init_dists[list_no].resize(padded); + + // Reconstruct row-major codes, then compute init distances. + std::vector row_code(ivpp->code_size); + for (size_t i = 0; i < list_size; i++) { + pano_pq->reconstruct( + i, + reinterpret_cast(row_code.data()), + storage->codes[list_no].data()); + pano_pq->compute_init_distances( + storage->init_dists[list_no].data(), + list_no, + i, + 1, + row_code.data()); + } + } + } + if (ivpp->is_trained) { + ivpp->use_precomputed_table = 1; + ivpp->precompute_table(); + } + idx = std::move(ivpp); } else if ( h == fourcc("IvPQ") || h == fourcc("IvQR") || h == fourcc("IwPQ") || h == fourcc("IwQR")) { @@ -1666,8 +1722,8 @@ std::unique_ptr read_index_up(IOReader* f, int io_flags) { size_t nlevels; READ1(nlevels); const_cast(idx_panorama->num_panorama_levels) = nlevels; - const_cast(idx_panorama->pano) = - Panorama(idx_panorama->d * sizeof(float), nlevels, 1); + const_cast(idx_panorama->pano) = + PanoramaFlat(idx_panorama->d, nlevels, 1); READVECTOR(idx_panorama->cum_sums); } if (h == fourcc("IHNc") || h == fourcc("IHc2")) { diff --git a/faiss/impl/index_write.cpp b/faiss/impl/index_write.cpp index 
03cc9bdd69..66f0ed325f 100644 --- a/faiss/impl/index_write.cpp +++ b/faiss/impl/index_write.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -273,7 +274,7 @@ void write_InvertedLists(const InvertedLists* ils, IOWriter* f) { WRITE1(h); WRITE1(ailp->nlist); WRITE1(ailp->code_size); - WRITE1(ailp->n_levels); + WRITE1(ailp->pano->n_levels); uint32_t list_type = fourcc("full"); WRITE1(list_type); std::vector sizes; @@ -774,6 +775,18 @@ void write_index(const Index* idx, IOWriter* f, int io_flags) { WRITE1(ivsp->threshold_type); WRITEVECTOR(ivsp->trained); write_InvertedLists(ivsp->invlists, f); + } else if ( + const IndexIVFPQPanorama* ivpp = + dynamic_cast(idx)) { + uint32_t h = fourcc("IwPP"); + WRITE1(h); + write_ivf_header(ivpp, f); + WRITE1(ivpp->by_residual); + WRITE1(ivpp->code_size); + write_ProductQuantizer(&ivpp->pq, f); + WRITE1(ivpp->n_levels); + WRITE1(ivpp->batch_size); + write_InvertedLists(ivpp->invlists, f); } else if (const IndexIVFPQ* ivpq = dynamic_cast(idx)) { const IndexIVFPQR* ivfpqr = dynamic_cast(idx); diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp new file mode 100644 index 0000000000..070475b067 --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx2.cpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// AVX2 implementations of Panorama kernels. +// Uses 256-bit gather for process_level, scalar filtering (no +// compress instruction in AVX2), and BMI2 PEXT/PDEP for code +// compression where available. 
+ +#ifdef COMPILE_SIMD_AVX2 + +#include + +#include + +#include + +namespace faiss { +namespace panorama_kernels { + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +void process_level_impl( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + size_t byte_idx = 0; + + // Process 4 chunks at a time to amortize loop overhead and keep + // the accumulator in registers across chunks. + for (; byte_idx + 3 < level_width_bytes; byte_idx += 4) { + size_t byte_offset0 = (byte_idx + 0) * max_batch_size; + size_t byte_offset1 = (byte_idx + 1) * max_batch_size; + size_t byte_offset2 = (byte_idx + 2) * max_batch_size; + size_t byte_offset3 = (byte_idx + 3) * max_batch_size; + + float* sim_table0 = sim_table + (byte_idx + 0) * 256; + float* sim_table1 = sim_table + (byte_idx + 1) * 256; + float* sim_table2 = sim_table + (byte_idx + 2) * 256; + float* sim_table3 = sim_table + (byte_idx + 3) * 256; + + size_t batch_idx = 0; + for (; batch_idx + 7 < num_active; batch_idx += 8) { + __m256 acc = _mm256_loadu_ps(exact_distances + batch_idx); + + // Load 8 byte codes, zero-extend to 32-bit indices. 
+ __m128i raw0 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset0 + batch_idx)); + __m128i raw1 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset1 + batch_idx)); + __m128i raw2 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset2 + batch_idx)); + __m128i raw3 = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset3 + batch_idx)); + + __m256i codes0 = _mm256_cvtepu8_epi32(raw0); + __m256i codes1 = _mm256_cvtepu8_epi32(raw1); + __m256i codes2 = _mm256_cvtepu8_epi32(raw2); + __m256i codes3 = _mm256_cvtepu8_epi32(raw3); + + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table0, codes0, sizeof(float))); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table1, codes1, sizeof(float))); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table2, codes2, sizeof(float))); + acc = _mm256_add_ps( + acc, + _mm256_i32gather_ps(sim_table3, codes3, sizeof(float))); + + _mm256_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; batch_idx < num_active; batch_idx++) { + float acc = exact_distances[batch_idx]; + acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]]; + acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]]; + acc += sim_table2[compressed_codes[byte_offset2 + batch_idx]]; + acc += sim_table3[compressed_codes[byte_offset3 + batch_idx]]; + exact_distances[batch_idx] = acc; + } + } + + for (; byte_idx < level_width_bytes; byte_idx++) { + size_t byte_offset = byte_idx * max_batch_size; + float* sim_table_ptr = sim_table + byte_idx * 256; + + size_t batch_idx = 0; + for (; batch_idx + 7 < num_active; batch_idx += 8) { + __m256 acc = _mm256_loadu_ps(exact_distances + batch_idx); + __m128i raw = _mm_loadl_epi64( + (__m128i*)(compressed_codes + byte_offset + batch_idx)); + __m256i codes = _mm256_cvtepu8_epi32(raw); + __m256 m_dist = + _mm256_i32gather_ps(sim_table_ptr, codes, sizeof(float)); + acc = _mm256_add_ps(acc, m_dist); + _mm256_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; 
batch_idx < num_active; batch_idx++) { + exact_distances[batch_idx] += + sim_table_ptr[compressed_codes[byte_offset + batch_idx]]; + } + } +} + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +std::pair process_code_compression_impl( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + uint8_t* compressed_codes = compressed_codes_begin; + size_t num_active = 0; + + // An important optimization is to skip the compression if all points + // are active, as we can just use the compressed_codes_begin pointer. + if (next_num_active < max_batch_size) { + // Compress the codes: here we don't need to process remainders + // as long as `max_batch_size` is a multiple of 64 (which we + // assert in the constructor). Conveniently, compressed_codes is + // allocated to `max_batch_size` * `level_width_bytes` elements. + // `num_active` is guaranteed to always be less than or equal to + // `max_batch_size`. Only the last batch may be smaller than + // `max_batch_size`, the caller ensures that the batch and + // bitset are padded with zeros. + compressed_codes = compressed_codes_begin; + for (size_t point_idx = 0; point_idx < max_batch_size; + point_idx += 64) { + // Build a 64-bit mask from the byteset: each byte is + // 0 or 1, collect into a single bitmask. + uint64_t mask = 0; +#ifdef __BMI2__ + for (int g = 0; g < 8; g++) { + uint64_t bytes; + memcpy(&bytes, bitset + point_idx + g * 8, 8); + uint8_t bits = (uint8_t)_pext_u64(bytes, 0x0101010101010101ULL); + mask |= ((uint64_t)bits << (g * 8)); + } +#else + for (int b = 0; b < 64; b++) { + if (bitset[point_idx + b]) + mask |= (1ULL << b); + } +#endif + + // Byte-level stream compaction. +#ifdef __BMI2__ + // PEXT/PDEP path: process 8 bytes at a time. PDEP + // expands the per-byte mask bits into a per-byte lane + // mask, then PEXT extracts only the selected bytes. 
+ for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; + int write_pos = 0; + for (int g = 0; g < 8; g++) { + uint64_t src_val; + memcpy(&src_val, src + g * 8, 8); + uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF); + uint64_t byte_mask = + _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF; + uint64_t compressed_val = _pext_u64(src_val, byte_mask); + int count = __builtin_popcount(submask); + memcpy(dst + write_pos, &compressed_val, 8); + write_pos += count; + } + } +#else + // Scalar fallback: scan set bits one by one and copy + // the corresponding code byte. + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; + int write_pos = 0; + uint64_t m = mask; + while (m) { + int bit = __builtin_ctzll(m); + dst[write_pos++] = src[bit]; + m &= m - 1; + } + } +#endif + + num_active += __builtin_popcountll(mask); + } + } else { + num_active = next_num_active; + compressed_codes = const_cast(codes); + } + + return std::make_pair(compressed_codes, num_active); +} + +} // namespace panorama_kernels +} // namespace faiss + +#endif // COMPILE_SIMD_AVX2 diff --git a/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp new file mode 100644 index 0000000000..811fb34579 --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels-avx512.cpp @@ -0,0 +1,250 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifdef COMPILE_SIMD_AVX512 + +#include + +#include + +#include + +namespace faiss { +namespace panorama_kernels { + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +void process_level_impl( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + size_t byte_idx = 0; + + // Process 4 chunks at a time to amortize loop overhead and keep + // the accumulator in registers across chunks. + for (; byte_idx + 3 < level_width_bytes; byte_idx += 4) { + size_t byte_offset0 = (byte_idx + 0) * max_batch_size; + size_t byte_offset1 = (byte_idx + 1) * max_batch_size; + size_t byte_offset2 = (byte_idx + 2) * max_batch_size; + size_t byte_offset3 = (byte_idx + 3) * max_batch_size; + + float* sim_table0 = sim_table + (byte_idx + 0) * 256; + float* sim_table1 = sim_table + (byte_idx + 1) * 256; + float* sim_table2 = sim_table + (byte_idx + 2) * 256; + float* sim_table3 = sim_table + (byte_idx + 3) * 256; + + size_t batch_idx = 0; + for (; batch_idx + 15 < num_active; batch_idx += 16) { + __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); + + __m128i comp0 = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset0 + batch_idx)); + __m128i comp1 = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset1 + batch_idx)); + __m128i comp2 = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset2 + batch_idx)); + __m128i comp3 = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset3 + batch_idx)); + + __m512i codes0 = _mm512_cvtepu8_epi32(comp0); + __m512i codes1 = _mm512_cvtepu8_epi32(comp1); + __m512i codes2 = _mm512_cvtepu8_epi32(comp2); + __m512i codes3 = _mm512_cvtepu8_epi32(comp3); + + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes0, sim_table0, sizeof(float))); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes1, sim_table1, sizeof(float))); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes2, 
sim_table2, sizeof(float))); + acc = _mm512_add_ps( + acc, + _mm512_i32gather_ps(codes3, sim_table3, sizeof(float))); + + _mm512_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; batch_idx < num_active; batch_idx++) { + float acc = exact_distances[batch_idx]; + acc += sim_table0[compressed_codes[byte_offset0 + batch_idx]]; + acc += sim_table1[compressed_codes[byte_offset1 + batch_idx]]; + acc += sim_table2[compressed_codes[byte_offset2 + batch_idx]]; + acc += sim_table3[compressed_codes[byte_offset3 + batch_idx]]; + exact_distances[batch_idx] = acc; + } + } + + for (; byte_idx < level_width_bytes; byte_idx++) { + size_t byte_offset = byte_idx * max_batch_size; + float* sim_table_ptr = sim_table + byte_idx * 256; + + size_t batch_idx = 0; + for (; batch_idx + 15 < num_active; batch_idx += 16) { + __m512 acc = _mm512_loadu_ps(exact_distances + batch_idx); + __m128i comp = _mm_loadu_si128( + (__m128i*)(compressed_codes + byte_offset + batch_idx)); + __m512i codes = _mm512_cvtepu8_epi32(comp); + __m512 m_dist = + _mm512_i32gather_ps(codes, sim_table_ptr, sizeof(float)); + acc = _mm512_add_ps(acc, m_dist); + _mm512_storeu_ps(exact_distances + batch_idx, acc); + } + + for (; batch_idx < num_active; batch_idx++) { + exact_distances[batch_idx] += + sim_table_ptr[compressed_codes[byte_offset + batch_idx]]; + } + } +} + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +std::pair process_code_compression_impl( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + uint8_t* compressed_codes = compressed_codes_begin; + size_t num_active = 0; + + // An important optimization is to skip the compression if all points + // are active, as we can just use the compressed_codes_begin pointer. 
+ if (next_num_active < max_batch_size) { + // Compress the codes: here we don't need to process remainders + // as long as `max_batch_size` is a multiple of 64 (which we + // assert in the constructor). Conveniently, compressed_codes is + // allocated to `max_batch_size` * `level_width_bytes` elements. + // `num_active` is guaranteed to always be less than or equal to + // `max_batch_size`. Only the last batch may be smaller than + // `max_batch_size`, the caller ensures that the batch and + // bitset are padded with zeros. + compressed_codes = compressed_codes_begin; + for (size_t point_idx = 0; point_idx < max_batch_size; + point_idx += 64) { + // Build a 64-bit mask from the byteset: each byte is + // 0 or 1, collect into a single bitmask. + uint64_t mask = 0; +#ifdef __BMI2__ + // PEXT path: extract the LSB of each byte into a + // single bit, producing a 64-bit bitmask. + for (int g = 0; g < 8; g++) { + uint64_t bytes; + memcpy(&bytes, bitset + point_idx + g * 8, 8); + uint8_t bits = (uint8_t)_pext_u64(bytes, 0x0101010101010101ULL); + mask |= ((uint64_t)bits << (g * 8)); + } +#else + for (int b = 0; b < 64; b++) { + if (bitset[point_idx + b]) + mask |= (1ULL << b); + } +#endif + + // Byte-level stream compaction (replaces + // _mm512_maskz_compress_epi8 which requires VBMI2). +#ifdef __BMI2__ + // PEXT/PDEP path: process 8 bytes at a time. PDEP + // expands the per-byte mask bits into a per-byte lane + // mask, then PEXT extracts only the selected bytes. 
+ for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; + int write_pos = 0; + for (int g = 0; g < 8; g++) { + uint64_t src_val; + memcpy(&src_val, src + g * 8, 8); + uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF); + uint64_t byte_mask = + _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF; + uint64_t compressed_val = _pext_u64(src_val, byte_mask); + int count = __builtin_popcount(submask); + memcpy(dst + write_pos, &compressed_val, 8); + write_pos += count; + } + } +#else + // Scalar fallback: scan set bits one by one and copy + // the corresponding code byte. + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; + int write_pos = 0; + uint64_t m = mask; + while (m) { + int bit = __builtin_ctzll(m); + dst[write_pos++] = src[bit]; + m &= m - 1; + } + } +#endif + + num_active += __builtin_popcountll(mask); + } + } else { + num_active = next_num_active; + compressed_codes = const_cast(codes); + } + + return std::make_pair(compressed_codes, num_active); +} + +#ifdef COMPILE_SIMD_AVX512_SPR +// AVX512_SPR: Sapphire Rapids is a superset of AVX512. Reuse the +// AVX512 implementation until a dedicated SPR specialization is written. 
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+void process_level_impl<SIMDLevel::AVX512_SPR>(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances) {
+    process_level_impl<SIMDLevel::AVX512>(
+            level_width_bytes,
+            max_batch_size,
+            num_active,
+            sim_table,
+            compressed_codes,
+            exact_distances);
+}
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+std::pair<uint8_t*, size_t> process_code_compression_impl<
+        SIMDLevel::AVX512_SPR>(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    return process_code_compression_impl<SIMDLevel::AVX512>(
+            next_num_active,
+            max_batch_size,
+            level_width_bytes,
+            compressed_codes_begin,
+            bitset,
+            codes);
+}
+#endif // COMPILE_SIMD_AVX512_SPR
+
+} // namespace panorama_kernels
+} // namespace faiss
+
+#endif // COMPILE_SIMD_AVX512
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
new file mode 100644
index 0000000000..9601cec46d
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels-generic.cpp
@@ -0,0 +1,189 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// This TU provides:
+// 1. _impl specializations for NONE, using scalar code.
+// 2. Non-templated Panorama kernel dispatch wrappers
+//    (process_level, process_filtering, process_code_compression) declared
+//    in panorama_kernels.h. These use DISPATCH_SIMDLevel to route to the
+//    best available SIMD implementation via the _impl function template
+//    specializations defined in the per-SIMD .cpp files.
+ +#include + +#include + +#ifdef __BMI2__ +#include +#endif + +namespace faiss { +namespace panorama_kernels { + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +void process_level_impl( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + for (size_t byte_idx = 0; byte_idx < level_width_bytes; byte_idx++) { + size_t byte_offset = byte_idx * max_batch_size; + float* chunk_sim = sim_table + byte_idx * 256; + for (size_t i = 0; i < num_active; i++) { + exact_distances[i] += chunk_sim[compressed_codes[byte_offset + i]]; + } + } +} + +// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization) +template <> +std::pair process_code_compression_impl( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + uint8_t* compressed_codes = compressed_codes_begin; + size_t num_active = 0; + + // An important optimization is to skip the compression if all points + // are active, as we can just use the compressed_codes_begin pointer. + if (next_num_active < max_batch_size) { + compressed_codes = compressed_codes_begin; + for (size_t point_idx = 0; point_idx < max_batch_size; + point_idx += 64) { + // Build a 64-bit mask from the byteset: each byte is + // 0 or 1, collect into a single bitmask. + uint64_t mask = 0; +#ifdef __BMI2__ + for (int g = 0; g < 8; g++) { + uint64_t bytes; + memcpy(&bytes, bitset + point_idx + g * 8, 8); + uint8_t bits = (uint8_t)_pext_u64(bytes, 0x0101010101010101ULL); + mask |= ((uint64_t)bits << (g * 8)); + } +#else + for (int b = 0; b < 64; b++) { + if (bitset[point_idx + b]) + mask |= (1ULL << b); + } +#endif + + // Byte-level stream compaction. +#ifdef __BMI2__ + // PEXT/PDEP path: process 8 bytes at a time. 
PDEP + // expands the per-byte mask bits into a per-byte lane + // mask, then PEXT extracts only the selected bytes. + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; + int write_pos = 0; + for (int g = 0; g < 8; g++) { + uint64_t src_val; + memcpy(&src_val, src + g * 8, 8); + uint8_t submask = (uint8_t)((mask >> (g * 8)) & 0xFF); + uint64_t byte_mask = + _pdep_u64(submask, 0x0101010101010101ULL) * 0xFF; + uint64_t compressed_val = _pext_u64(src_val, byte_mask); + int count = __builtin_popcount(submask); + memcpy(dst + write_pos, &compressed_val, 8); + write_pos += count; + } + } +#else + // Scalar fallback: scan set bits one by one and copy + // the corresponding code byte. + for (size_t li = 0; li < level_width_bytes; li++) { + size_t byte_offset = li * max_batch_size; + const uint8_t* src = codes + byte_offset + point_idx; + uint8_t* dst = compressed_codes + byte_offset + num_active; + int write_pos = 0; + uint64_t m = mask; + while (m) { + int bit = __builtin_ctzll(m); + dst[write_pos++] = src[bit]; + m &= m - 1; + } + } +#endif + + num_active += __builtin_popcountll(mask); + } + } else { + num_active = next_num_active; + compressed_codes = const_cast(codes); + } + + return std::make_pair(compressed_codes, num_active); +} + +void process_level( + size_t level_width_bytes, + size_t max_batch_size, + size_t num_active, + float* sim_table, + uint8_t* compressed_codes, + float* exact_distances) { + DISPATCH_SIMDLevel( + process_level_impl, + level_width_bytes, + max_batch_size, + num_active, + sim_table, + compressed_codes, + exact_distances); +} + +size_t process_filtering( + size_t num_active, + float* exact_distances, + uint32_t* active_indices, + float* cum_sums, + uint8_t* bitset, + size_t batch_offset, + float dis0, + float query_cum_norm, + float heap_max) { + size_t next_num_active = 0; + for 
(size_t i = 0; i < num_active; i++) { + float exact_distance = exact_distances[i]; + float cum_sum = cum_sums[active_indices[i] - batch_offset]; + float lower_bound = exact_distance + dis0 - cum_sum * query_cum_norm; + + bool keep = heap_max > lower_bound; + active_indices[next_num_active] = active_indices[i]; + exact_distances[next_num_active] = exact_distance; + bitset[active_indices[i] - batch_offset] = keep; + next_num_active += keep; + } + return next_num_active; +} + +std::pair process_code_compression( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes) { + DISPATCH_SIMDLevel( + process_code_compression_impl, + next_num_active, + max_batch_size, + level_width_bytes, + compressed_codes_begin, + bitset, + codes); +} + +} // namespace panorama_kernels +} // namespace faiss diff --git a/faiss/impl/panorama_kernels/panorama_kernels-inl.h b/faiss/impl/panorama_kernels/panorama_kernels-inl.h new file mode 100644 index 0000000000..b1ae4316db --- /dev/null +++ b/faiss/impl/panorama_kernels/panorama_kernels-inl.h @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#pragma once + +/** + * @file panorama_kernels-inl.h + * @brief Private header for Panorama kernel SIMD implementations. + * + * This is a PRIVATE header — do not include in public APIs or user code. + * Only faiss internal .cpp files (the per-SIMD implementation files and + * panorama_kernels-generic.cpp) should include this header. + * + * This header re-exports the public API (panorama_kernels.h) plus the + * simd_dispatch.h machinery needed by the implementation files. 
+ */
+
+#include <faiss/impl/panorama_kernels/panorama_kernels.h>
+#include <faiss/impl/simd_dispatch.h>
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-neon.cpp b/faiss/impl/panorama_kernels/panorama_kernels-neon.cpp
new file mode 100644
index 0000000000..88eba0b574
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels-neon.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// ARM NEON implementations of Panorama kernels.
+// TODO(@AlSchlo, @aknayar): implement NEON-optimized panorama kernels.
+// Currently delegates to the scalar (NONE) implementation.
+
+#ifdef COMPILE_SIMD_ARM_NEON
+
+#include <faiss/impl/panorama_kernels/panorama_kernels-inl.h>
+
+namespace faiss {
+namespace panorama_kernels {
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+void process_level_impl<SIMDLevel::ARM_NEON>(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances) {
+    process_level_impl<SIMDLevel::NONE>(
+            level_width_bytes,
+            max_batch_size,
+            num_active,
+            sim_table,
+            compressed_codes,
+            exact_distances);
+}
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+std::pair<uint8_t*, size_t> process_code_compression_impl<SIMDLevel::ARM_NEON>(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    return process_code_compression_impl<SIMDLevel::NONE>(
+            next_num_active,
+            max_batch_size,
+            level_width_bytes,
+            compressed_codes_begin,
+            bitset,
+            codes);
+}
+
+} // namespace panorama_kernels
+} // namespace faiss
+
+#endif // COMPILE_SIMD_ARM_NEON
diff --git a/faiss/impl/panorama_kernels/panorama_kernels-sve.cpp b/faiss/impl/panorama_kernels/panorama_kernels-sve.cpp
new file mode 100644
index 0000000000..af89f6aac9
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels-sve.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// ARM SVE implementations of Panorama kernels.
+// TODO(@AlSchlo, @aknayar): implement SVE-optimized panorama kernels.
+// Currently delegates to the scalar (NONE) implementation.
+
+#ifdef COMPILE_SIMD_ARM_SVE
+
+#include <faiss/impl/panorama_kernels/panorama_kernels-inl.h>
+
+namespace faiss {
+namespace panorama_kernels {
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+void process_level_impl<SIMDLevel::ARM_SVE>(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances) {
+    process_level_impl<SIMDLevel::NONE>(
+            level_width_bytes,
+            max_batch_size,
+            num_active,
+            sim_table,
+            compressed_codes,
+            exact_distances);
+}
+
+// NOLINTNEXTLINE(facebook-hte-MisplacedTemplateSpecialization)
+template <>
+std::pair<uint8_t*, size_t> process_code_compression_impl<SIMDLevel::ARM_SVE>(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes) {
+    return process_code_compression_impl<SIMDLevel::NONE>(
+            next_num_active,
+            max_batch_size,
+            level_width_bytes,
+            compressed_codes_begin,
+            bitset,
+            codes);
+}
+
+} // namespace panorama_kernels
+} // namespace faiss
+
+#endif // COMPILE_SIMD_ARM_SVE
diff --git a/faiss/impl/panorama_kernels/panorama_kernels.h b/faiss/impl/panorama_kernels/panorama_kernels.h
new file mode 100644
index 0000000000..3a67415523
--- /dev/null
+++ b/faiss/impl/panorama_kernels/panorama_kernels.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+/**
+ * @file panorama_kernels.h
+ * @brief Panorama search kernels with SIMD-dispatched implementations.
+ *
+ * The three core kernels of the Panorama progressive filtering search:
+ * - process_level: accumulate PQ distance table lookups over chunks
+ * - process_filtering: Cauchy-Schwarz lower bound pruning with stream
+ *   compaction
+ * - process_code_compression: byte-level stream compaction of PQ codes
+ */
+
+#include <cstddef>
+#include <cstdint>
+#include <utility>
+
+#include <faiss/impl/platform_macros.h>
+#include <faiss/impl/simd_dispatch.h>
+
+namespace faiss {
+namespace panorama_kernels {
+
+template <SIMDLevel level>
+void process_level_impl(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances);
+
+template <SIMDLevel level>
+std::pair<uint8_t*, size_t> process_code_compression_impl(
+        size_t next_num_active,
+        size_t max_batch_size,
+        size_t level_width_bytes,
+        uint8_t* compressed_codes_begin,
+        uint8_t* bitset,
+        const uint8_t* codes);
+
+/// Accumulate PQ distance table lookups over chunks.
+///
+/// For each chunk, looks up `sim_table[compressed_codes[i]]` and
+/// accumulates into `exact_distances[i]` for all active elements.
+/// Iterates chunks first to keep the LUT slice in L1 cache.
+/// The AVX2/AVX-512 versions unroll 4 chunks at a time.
+FAISS_API void process_level(
+        size_t level_width_bytes,
+        size_t max_batch_size,
+        size_t num_active,
+        float* sim_table,
+        uint8_t* compressed_codes,
+        float* exact_distances);
+
+/// Filter active elements using Cauchy-Schwarz lower bound pruning.
+///
+/// Computes a lower bound on the true distance for each active element
+/// and removes elements that cannot improve the current heap top.
+/// Uses stream compaction to pack surviving elements contiguously.
+/// Updates the bitset to reflect which elements were removed.
+FAISS_API size_t process_filtering(
+        size_t num_active,
+        float* exact_distances,
+        uint32_t* active_indices,
+        float* cum_sums,
+        uint8_t* bitset,
+        size_t batch_offset,
+        float dis0,
+        float query_cum_norm,
+        float heap_max);
+
+/// Byte-level stream compaction of PQ codes using the active bitset.
+/// +/// An important optimization is to skip the compression if all points +/// are active, as we can just use the original codes pointer. +/// +/// Compress the codes: here we don't need to process remainders +/// as long as `max_batch_size` is a multiple of 64 (which we +/// assert in the constructor). Conveniently, compressed_codes is +/// allocated to `max_batch_size` * `level_width_bytes` elements. +/// `num_active` is guaranteed to always be less than or equal to +/// `max_batch_size`. Only the last batch may be smaller than +/// `max_batch_size`, the caller ensures that the batch and +/// bitset are padded with zeros. +FAISS_API std::pair process_code_compression( + size_t next_num_active, + size_t max_batch_size, + size_t level_width_bytes, + uint8_t* compressed_codes_begin, + uint8_t* bitset, + const uint8_t* codes); + +} // namespace panorama_kernels +} // namespace faiss diff --git a/faiss/index_factory.cpp b/faiss/index_factory.cpp index 1e66227aea..8acbd02003 100644 --- a/faiss/index_factory.cpp +++ b/faiss/index_factory.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -355,6 +356,12 @@ IndexIVF* parse_IndexIVF( /*by_residual=*/true, own_il); } + if (match("PQ([0-9]+)(x[0-9]+)?Panorama([0-9]+)?")) { + int M = mres_to_int(sm[1]), nbit = mres_to_int(sm[2], 8, 1); + int nlevels = mres_to_int(sm[3], 8); + return new IndexIVFPQPanorama( + get_q(), d, nlist, M, nbit, nlevels, 128, mt, own_il); + } if (match("PQ([0-9]+)(x[0-9]+)?(np)?")) { int M = mres_to_int(sm[1]), nbit = mres_to_int(sm[2], 8, 1); IndexIVFPQ* index_ivf = diff --git a/faiss/invlists/InvertedLists.cpp b/faiss/invlists/InvertedLists.cpp index 6f8e88ef35..63a4d3383c 100644 --- a/faiss/invlists/InvertedLists.cpp +++ b/faiss/invlists/InvertedLists.cpp @@ -11,6 +11,7 @@ #include #include +#include #include namespace faiss { @@ -354,22 +355,15 @@ ArrayInvertedLists::~ArrayInvertedLists() {} ArrayInvertedListsPanorama::ArrayInvertedListsPanorama( size_t 
nlist_in, size_t code_size_in, - size_t n_levels_in) - : ArrayInvertedLists(nlist_in, code_size_in), - n_levels(n_levels_in), - level_width( - (((code_size_in / sizeof(float)) + n_levels_in - 1) / - n_levels_in) * - sizeof(float)), - pano(code_size_in, n_levels_in, kBatchSize) { - FAISS_THROW_IF_NOT(n_levels_in > 0); - FAISS_THROW_IF_NOT(code_size_in % sizeof(float) == 0); + Panorama* pano_in) + : ArrayInvertedLists(nlist_in, code_size_in), pano(pano_in) { + FAISS_THROW_IF_NOT(pano != nullptr); + FAISS_THROW_IF_NOT(pano_in->n_levels > 0); FAISS_THROW_IF_NOT_MSG( - !use_iterator, - "IndexIVFFlatPanorama does not support iterators, use vanilla IndexIVFFlat instead"); - FAISS_ASSERT(level_width % sizeof(float) == 0); + !use_iterator, "Panorama does not support iterators"); cum_sums.resize(nlist_in); + init_dists.resize(nlist_in); } const float* ArrayInvertedListsPanorama::get_cum_sums(size_t list_no) const { @@ -377,6 +371,11 @@ const float* ArrayInvertedListsPanorama::get_cum_sums(size_t list_no) const { return cum_sums[list_no].data(); } +const float* ArrayInvertedListsPanorama::get_init_dists(size_t list_no) const { + assert(list_no < nlist); + return init_dists[list_no].data(); +} + size_t ArrayInvertedListsPanorama::add_entries( size_t list_no, size_t n_entry, @@ -389,15 +388,21 @@ size_t ArrayInvertedListsPanorama::add_entries( memcpy(&ids[list_no][o], ids_in, sizeof(ids_in[0]) * n_entry); size_t new_size = o + n_entry; - size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize; - codes[list_no].resize(num_batches * kBatchSize * code_size); - cum_sums[list_no].resize(num_batches * kBatchSize * (n_levels + 1)); + size_t bs = pano->batch_size; + size_t num_batches = (new_size + bs - 1) / bs; + size_t padded = num_batches * bs; + codes[list_no].resize(padded * code_size); + cum_sums[list_no].resize(padded * (pano->n_levels + 1)); + + pano->copy_codes_to_level_layout(codes[list_no].data(), o, n_entry, code); + 
pano->compute_cumulative_sums(cum_sums[list_no].data(), o, n_entry, code); - // Cast to float* is safe here as we guarantee codes are always float - // vectors for `IndexIVFFlatPanorama` (verified by the constructor). - const float* vectors = reinterpret_cast(code); - pano.copy_codes_to_level_layout(codes[list_no].data(), o, n_entry, code); - pano.compute_cumulative_sums(cum_sums[list_no].data(), o, n_entry, vectors); + auto* pano_pq = dynamic_cast(pano.get()); + if (pano_pq) { + init_dists[list_no].resize(padded); + pano_pq->compute_init_distances( + init_dists[list_no].data(), list_no, o, n_entry, code); + } return o; } @@ -413,21 +418,30 @@ void ArrayInvertedListsPanorama::update_entries( memcpy(&ids[list_no][offset], ids_in, sizeof(ids_in[0]) * n_entry); - // Cast to float* is safe here as we guarantee codes are always float - // vectors for `IndexIVFFlatPanorama` (verified by the constructor). - const float* vectors = reinterpret_cast(code); - pano.copy_codes_to_level_layout( + pano->copy_codes_to_level_layout( codes[list_no].data(), offset, n_entry, code); - pano.compute_cumulative_sums( - cum_sums[list_no].data(), offset, n_entry, vectors); + pano->compute_cumulative_sums( + cum_sums[list_no].data(), offset, n_entry, code); + + auto* pano_pq = dynamic_cast(pano.get()); + if (pano_pq) { + pano_pq->compute_init_distances( + init_dists[list_no].data(), list_no, offset, n_entry, code); + } } void ArrayInvertedListsPanorama::resize(size_t list_no, size_t new_size) { ids[list_no].resize(new_size); - size_t num_batches = (new_size + kBatchSize - 1) / kBatchSize; - codes[list_no].resize(num_batches * kBatchSize * code_size); - cum_sums[list_no].resize(num_batches * kBatchSize * (n_levels + 1)); + size_t bs = pano->batch_size; + size_t num_batches = (new_size + bs - 1) / bs; + size_t padded = num_batches * bs; + codes[list_no].resize(padded * code_size); + cum_sums[list_no].resize(padded * (pano->n_levels + 1)); + + if (dynamic_cast(pano.get())) { + 
init_dists[list_no].resize(padded); + } } const uint8_t* ArrayInvertedListsPanorama::get_single_code( @@ -439,7 +453,7 @@ const uint8_t* ArrayInvertedListsPanorama::get_single_code( uint8_t* recons_buffer = new uint8_t[code_size]; float* recons = reinterpret_cast(recons_buffer); - pano.reconstruct(offset, recons, codes[list_no].data()); + pano->reconstruct(offset, recons, codes[list_no].data()); return recons_buffer; } diff --git a/faiss/invlists/InvertedLists.h b/faiss/invlists/InvertedLists.h index 743bcad62d..f6ea588efd 100644 --- a/faiss/invlists/InvertedLists.h +++ b/faiss/invlists/InvertedLists.h @@ -15,6 +15,7 @@ * the interface. */ +#include #include #include @@ -277,21 +278,23 @@ struct ArrayInvertedLists : InvertedLists { ~ArrayInvertedLists() override; }; -/// Level-oriented storage as defined in the IVFFlat section of Panorama +/// Level-oriented storage as defined in the Panorama paper /// (https://www.arxiv.org/pdf/2510.00566). +/// Works with both flat codes (PanoramaFlat) and PQ codes (PanoramaPQ) +/// via the virtual Panorama interface. struct ArrayInvertedListsPanorama : ArrayInvertedLists { - static constexpr size_t kBatchSize = 128; std::vector> cum_sums; - const size_t n_levels; - const size_t level_width; // in code units - Panorama pano; + std::vector> init_dists; + std::unique_ptr pano; + /// Takes ownership of the provided Panorama*. 
ArrayInvertedListsPanorama( size_t nlist_in, size_t code_size_in, - size_t n_levels_in); + Panorama* pano_in); const float* get_cum_sums(size_t list_no) const; + const float* get_init_dists(size_t list_no) const; size_t add_entries( size_t list_no, diff --git a/faiss/python/__init__.py b/faiss/python/__init__.py index 05b376efce..1e82912ca4 100644 --- a/faiss/python/__init__.py +++ b/faiss/python/__init__.py @@ -177,6 +177,7 @@ def replacement_function(*args): add_ref_in_constructor(IndexPreTransform, {2: [0, 1], 1: [0]}) add_ref_in_method(IndexPreTransform, 'prepend_transform', 0) add_ref_in_constructor(IndexIVFPQ, 0) +add_ref_in_constructor(IndexIVFPQPanorama, 0) add_ref_in_constructor(IndexIVFPQR, 0) add_ref_in_constructor(IndexIVFPQFastScan, 0) add_ref_in_constructor(IndexIVFResidualQuantizer, 0) diff --git a/faiss/python/swigfaiss.swig b/faiss/python/swigfaiss.swig index af9b87e981..a7ce6a3ae4 100644 --- a/faiss/python/swigfaiss.swig +++ b/faiss/python/swigfaiss.swig @@ -96,6 +96,7 @@ typedef uint64_t size_t; #include #include #include +#include #include #include #include @@ -541,6 +542,7 @@ void gpu_sync_all_devices() %include %include +%ignore faiss::ArrayInvertedListsPanorama::pano; %include %include %ignore BlockInvertedListsIOHook; @@ -594,6 +596,7 @@ void gpu_sync_all_devices() %ignore faiss::IndexIVFPQ::alloc_type; %include +%include %include %include @@ -779,6 +782,7 @@ void gpu_sync_all_devices() DOWNCAST ( IndexIVFRaBitQ ) DOWNCAST ( IndexIVFRaBitQFastScan ) DOWNCAST ( IndexIVFIndependentQuantizer) + DOWNCAST ( IndexIVFPQPanorama ) DOWNCAST ( IndexIVFPQR ) DOWNCAST ( IndexIVFPQ ) DOWNCAST ( IndexIVFPQFastScan ) diff --git a/tests/test_factory.py b/tests/test_factory.py index 2246eb8c10..922ab14cf0 100644 --- a/tests/test_factory.py +++ b/tests/test_factory.py @@ -70,6 +70,21 @@ def test_factory_6(self): assert index.d == 128 assert index.metric_type == faiss.METRIC_L2 + def test_factory_panorama(self): + index = faiss.index_factory(64, 
"IVF16,PQ16x8Panorama4") + assert isinstance(index, faiss.IndexIVFPQPanorama) + assert index.n_levels == 4 + assert index.pq.M == 16 + + index = faiss.index_factory(64, "IVF16,PQ16Panorama") + assert isinstance(index, faiss.IndexIVFPQPanorama) + assert index.n_levels == 8 # default + + index = faiss.index_factory(64, "PCA64,IVF16,PQ16x8Panorama4") + ivf = faiss.downcast_index(index.index) + assert isinstance(ivf, faiss.IndexIVFPQPanorama) + assert ivf.n_levels == 4 + def test_factory_HNSW(self): index = faiss.index_factory(12, "HNSW32") assert index.storage.sa_code_size() == 12 * 4 diff --git a/tests/test_ivfpq_panorama.py b/tests/test_ivfpq_panorama.py new file mode 100644 index 0000000000..d2e28d78d6 --- /dev/null +++ b/tests/test_ivfpq_panorama.py @@ -0,0 +1,617 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# +# This source code is licensed under the MIT license found in the +# LICENSE file in the root directory of this source tree. + +""" +Comprehensive test suite for IndexIVFPQPanorama. + +Panorama is an adaptation of IndexIVFPQ that uses level-oriented storage +and progressive filtering with Cauchy-Schwarz bounds to achieve significant +speedups when combined with PCA or Cayley transforms, with zero loss in +accuracy. + +Paper: https://www.arxiv.org/pdf/2510.00566 + +Constraints specific to IndexIVFPQPanorama: + - Only L2 metric is supported. + - Only 8-bit PQ codes (nbits == 8). + - M must be divisible by n_levels. + - batch_size must be a multiple of 64. + - use_precomputed_table must be 1. 
+""" + +import unittest + +import faiss +import numpy as np +from faiss.contrib.datasets import SyntheticDataset + + +class TestIndexIVFPQPanorama(unittest.TestCase): + """Test Suite for IndexIVFPQPanorama.""" + + # Helper methods for index creation and data generation + + def generate_data(self, d, nt, nb, nq, seed=42): + ds = SyntheticDataset(d, nt, nb, nq, seed=seed) + return ds.get_train(), ds.get_database(), ds.get_queries() + + def create_ivfpq(self, d, nlist, M, nbits, xt, xb=None, nprobe=None): + """Create and train a standard IndexIVFPQ (L2 only).""" + quantizer = faiss.IndexFlatL2(d) + index = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) + index.train(xt) + if xb is not None: + index.add(xb) + if nprobe is not None: + index.nprobe = nprobe + return index + + def create_panorama( + self, d, nlist, M, nbits, n_levels, xt, xb=None, + nprobe=None, batch_size=128, + ): + """Create IndexIVFPQPanorama from a freshly trained IVFPQ. + + Trains a temporary IndexIVFPQ, copies PQ centroids and quantizer + into the Panorama index, then sets up precomputed tables. + """ + quantizer = faiss.IndexFlatL2(d) + trained = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) + trained.train(xt) + + trained.own_fields = False + pano = faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, batch_size, + ) + centroids = faiss.vector_to_array(trained.pq.centroids) + faiss.copy_array_to_vector(centroids, pano.pq.centroids) + pano.is_trained = True + pano.use_precomputed_table = 1 + pano.precompute_table() + + if xb is not None: + pano.add(xb) + if nprobe is not None: + pano.nprobe = nprobe + return pano + + def create_pair( + self, d, nlist, M, nbits, n_levels, xt, xb=None, + nprobe=None, batch_size=128, + ): + """Create an IVFPQ and an IVFPQPanorama sharing the same training. + + Both indexes use the same quantizer centroids and PQ codebook, + so search results should be identical. 
+ """ + quantizer = faiss.IndexFlatL2(d) + trained = faiss.IndexIVFPQ(quantizer, d, nlist, M, nbits) + trained.train(xt) + + # Build the IVFPQ baseline from the trained state. + ivfpq = faiss.clone_index(trained) + + # Build the Panorama from the same trained state. + trained.own_fields = False + pano = faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, batch_size, + ) + centroids = faiss.vector_to_array(trained.pq.centroids) + faiss.copy_array_to_vector(centroids, pano.pq.centroids) + pano.is_trained = True + pano.use_precomputed_table = 1 + pano.precompute_table() + + if xb is not None: + ivfpq.add(xb) + pano.add(xb) + if nprobe is not None: + ivfpq.nprobe = nprobe + pano.nprobe = nprobe + return ivfpq, pano + + def assert_search_results_equal( + self, + D_regular, + I_regular, + D_panorama, + I_panorama, + rtol=1e-4, + atol=1e-6, + otol=1e-3, + ): + overlap_rate = np.mean(I_regular == I_panorama) + + self.assertGreater( + overlap_rate, + 1 - otol, + f"Overlap rate {overlap_rate:.6f} is not > {1 - otol:.3f}. 
", + ) + np.testing.assert_allclose( + D_regular, + D_panorama, + rtol=rtol, + atol=atol, + err_msg="Distances mismatch", + ) + + # Core functionality tests + + def test_exact_match_with_ivfpq(self): + """Core test: Panorama must return identical results to IndexIVFPQ""" + d, nb, nt, nq = 64, 50000, 60000, 500 + nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 20 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=42) + + for nprobe in [1, 4, 16, 64]: + with self.subTest(nprobe=nprobe): + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_exact_match_with_ivfpq_medium(self): + """Core test: Medium scale version""" + d, nb, nt, nq = 32, 10000, 15000, 200 + nlist, M, nbits, n_levels, k = 32, 8, 8, 4, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=42) + + for nprobe in [1, 4, 8, nlist]: + with self.subTest(nprobe=nprobe): + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + # Parameter variation tests + + def test_different_n_levels(self): + """Test correctness with various n_levels parameter values""" + d, nb, nt, nq = 64, 25000, 40000, 200 + nlist, M, nbits, k = 64, 16, 8, 15 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=456) + + # Train IVFPQ once for the baseline. + ivfpq = self.create_ivfpq(d, nlist, M, nbits, xt, xb, nprobe=16) + D_base, I_base = ivfpq.search(xq, k) + + nt = faiss.omp_get_max_threads() + faiss.omp_set_num_threads(1) + + prev_ratio = float("inf") + # n_levels must divide M=16. 
+ for n_levels in [1, 2, 4, 8, 16]: + with self.subTest(n_levels=n_levels): + faiss.cvar.indexPanorama_stats.reset() + + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=16, + ) + D, I = pano.search(xq, k) + self.assert_search_results_equal(D_base, I_base, D, I) + + ratio = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + self.assertLess(ratio, prev_ratio) + prev_ratio = ratio + + faiss.omp_set_num_threads(nt) + + def test_different_M_and_n_levels(self): + """Test various M / n_levels combinations""" + test_cases = [ + (32, 8, 2), # M=8, n_levels=2, chunk=4 + (64, 16, 4), # M=16, n_levels=4, chunk=4 + (64, 32, 8), # M=32, n_levels=8, chunk=4 + ] + for d, M, n_levels in test_cases: + with self.subTest(d=d, M=M, n_levels=n_levels): + nb, nt, nq, nlist, nbits, k = 10000, 15000, 100, 32, 8, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=789) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=8, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_single_level(self): + """Test edge case with n_levels=1 (no pruning, equivalent to IVFPQ)""" + d, nb, nt, nq = 32, 5000, 7000, 50 + nlist, M, nbits, n_levels, k = 16, 8, 8, 1, 5 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=333) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=4, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_max_levels(self): + """Test edge case with n_levels=M (each level is one subquantizer)""" + d, nb, nt, nq = 64, 5000, 7000, 50 + nlist, M, nbits, n_levels, k = 16, 16, 8, 16, 5 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=444) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, 
xt, xb, nprobe=4, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + # ID selector tests + + def test_id_selector_range(self): + """Test ID filtering with range selector""" + d, nb, nt, nq = 64, 50000, 60000, 300 + nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 20 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=321) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=16, + ) + + params = faiss.SearchParametersIVF() + params.sel = faiss.IDSelectorRange(10000, 30000) + + D_regular, I_regular = ivfpq.search(xq, k, params=params) + D_panorama, I_panorama = pano.search(xq, k, params=params) + + valid = I_panorama[I_panorama >= 0] + self.assertTrue(np.all(valid >= 10000)) + self.assertTrue(np.all(valid < 30000)) + + np.testing.assert_array_equal(I_regular, I_panorama) + np.testing.assert_allclose(D_regular, D_panorama, rtol=1e-4) + + def test_id_selector_batch(self): + """Test ID filtering with batch selector""" + d, nb, nt, nq = 64, 30000, 45000, 200 + nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 20 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=654) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=16, + ) + + allowed_ids = np.array([i * 50 for i in range(500)], dtype=np.int64) + params = faiss.SearchParametersIVF() + params.sel = faiss.IDSelectorBatch(allowed_ids) + + D_regular, I_regular = ivfpq.search(xq, k, params=params) + D_panorama, I_panorama = pano.search(xq, k, params=params) + + allowed_set = set(allowed_ids) | {-1} + for id_val in I_panorama.flatten(): + self.assertIn(int(id_val), allowed_set) + + np.testing.assert_array_equal(I_regular, I_panorama) + np.testing.assert_allclose(D_regular, D_panorama, rtol=1e-4) + + def test_selector_excludes_all(self): + """Test selector that excludes all results""" + d, nb, nt, nq = 32, 3000, 5000, 5 + nlist, M, 
nbits, n_levels, k = 8, 8, 8, 4, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=999) + + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=nlist, + ) + + params = faiss.SearchParametersIVF() + params.sel = faiss.IDSelectorRange(nb + 100, nb + 200) + + D, I = pano.search(xq, k, params=params) + self.assertTrue(np.all(I == -1)) + + # Batch size and edge case tests + + def test_batch_boundaries(self): + """Test correctness at various database sizes relative to batch_size""" + d, nq = 64, 50 + nlist, M, nbits, n_levels, k = 16, 16, 8, 4, 10 + xq = np.random.rand(nq, d).astype("float32") + + batch_size = 128 + test_sizes = [ + batch_size - 1, + batch_size, + batch_size + 1, + batch_size * 2, + batch_size * 3 - 1, + ] + for nb in test_sizes: + with self.subTest(nb=nb): + nt = max(nb, 500) + np.random.seed(987) + xt = np.random.rand(nt, d).astype("float32") + xb = np.random.rand(nb, d).astype("float32") + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, + nprobe=nlist, batch_size=batch_size, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_different_batch_sizes(self): + """Test correctness across different internal batch sizes""" + d, nb, nt, nq = 64, 10000, 15000, 50 + nlist, M, nbits, n_levels, k = 32, 16, 8, 4, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=4242) + + ivfpq = self.create_ivfpq(d, nlist, M, nbits, xt, xb, nprobe=8) + D_base, I_base = ivfpq.search(xq, k) + + for bs in [64, 128, 256, 512, 1024]: + with self.subTest(batch_size=bs): + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, + nprobe=8, batch_size=bs, + ) + D, I = pano.search(xq, k) + self.assert_search_results_equal(D_base, I_base, D, I) + + def test_very_small_dataset(self): + """Test with dataset smaller than batch size""" + test_cases = [10, 50, 100] + + for nb 
in test_cases: + with self.subTest(nb=nb): + d, nlist, M, nbits, n_levels = 32, 4, 4, 8, 2 + nt, nq = max(nb, 1500), 5 + k = min(3, nb) + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=666 + nb) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=nlist, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_single_vector_per_cluster(self): + """Test extreme case where clusters have very few vectors""" + d, nb, nt, nq = 32, 20, 3000, 5 + nlist, M, nbits, n_levels, k = 16, 4, 8, 2, 3 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=1313) + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=nlist, + ) + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_empty_result_handling(self): + """Test handling of empty search results (shapes only)""" + d, nb, nt, nq = 32, 100, 3000, 10 + nlist, M, nbits, n_levels, k = 8, 4, 8, 2, 10 + xt, xb, _ = self.generate_data(d, nt, nb, nq, seed=111) + xq = np.random.rand(nq, d).astype("float32") + 10.0 + + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=1, + ) + D, I = pano.search(xq, k) + + self.assertEqual(D.shape, (nq, k)) + self.assertEqual(I.shape, (nq, k)) + + # Dynamic operations tests + + def test_incremental_add(self): + """Test adding vectors incrementally in multiple batches""" + d, nt = 64, 20000 + nlist, M, nbits, n_levels, k = 64, 16, 8, 4, 15 + xt = np.random.rand(nt, d).astype("float32") + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, nprobe=16, + ) + + for batch_nb in [5000, 10000, 15000]: + xb_batch = np.random.rand(batch_nb, d).astype("float32") + ivfpq.add(xb_batch) + pano.add(xb_batch) + + nq = 100 + xq = 
np.random.rand(nq, d).astype("float32") + + D_regular, I_regular = ivfpq.search(xq, k) + D_panorama, I_panorama = pano.search(xq, k) + + self.assert_search_results_equal( + D_regular, I_regular, D_panorama, I_panorama + ) + + def test_add_search_add_search(self): + """Test interleaved add and search operations""" + d, nt = 32, 500 + nlist, M, nbits, n_levels, k = 8, 8, 8, 4, 5 + np.random.seed(555) + xt = np.random.rand(nt, d).astype("float32") + + ivfpq, pano = self.create_pair( + d, nlist, M, nbits, n_levels, xt, nprobe=4, + ) + + xb1 = np.random.rand(200, d).astype("float32") + ivfpq.add(xb1) + pano.add(xb1) + + xq1 = np.random.rand(10, d).astype("float32") + D_reg_1, I_reg_1 = ivfpq.search(xq1, k) + D_pan_1, I_pan_1 = pano.search(xq1, k) + self.assert_search_results_equal(D_reg_1, I_reg_1, D_pan_1, I_pan_1) + + xb2 = np.random.rand(300, d).astype("float32") + ivfpq.add(xb2) + pano.add(xb2) + + xq2 = np.random.rand(10, d).astype("float32") + D_reg_2, I_reg_2 = ivfpq.search(xq2, k) + D_pan_2, I_pan_2 = pano.search(xq2, k) + self.assert_search_results_equal(D_reg_2, I_reg_2, D_pan_2, I_pan_2) + + # Serialization tests + + def test_serialization(self): + """Test write/read preserves search results""" + d, nb, nt, nq = 64, 10000, 15000, 100 + nlist, M, nbits, n_levels, k = 32, 16, 8, 4, 20 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=2024) + + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=8, + ) + + D_before, I_before = pano.search(xq, k) + pano_after = faiss.deserialize_index(faiss.serialize_index(pano)) + D_after, I_after = pano_after.search(xq, k) + + np.testing.assert_array_equal(I_before, I_after) + np.testing.assert_allclose(D_before, D_after, rtol=1e-5) + + def test_serialization_preserves_params(self): + """Test serialization preserves n_levels and batch_size correctly""" + d, nb, nt, nq = 64, 10000, 15000, 50 + nlist, M, nbits, n_levels, k = 32, 16, 8, 4, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=2025) + 
+ pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=4, + ) + D_before, I_before = pano.search(xq, k) + + pano_after = faiss.deserialize_index( + faiss.serialize_index(pano) + ) + self.assertEqual(pano_after.batch_size, 128) + self.assertEqual(pano_after.n_levels, n_levels) + + D_after, I_after = pano_after.search(xq, k) + np.testing.assert_array_equal(I_before, I_after) + np.testing.assert_allclose(D_before, D_after, rtol=1e-5) + + # Statistics tests + + def test_ratio_dims_scanned(self): + """Test that ratio_dims_scanned is 1.0 at n_levels=1 and strictly + less for higher n_levels. + + Unlike IndexFlatPanorama, PQ quantization error prevents achieving + the ideal 1/n_levels ratio even on synthetic data. We verify that + n_levels=1 gives ratio=1.0 (exhaustive) and that multi-level + pruning is effective (ratio well below 1.0). + """ + d, nb, nt, nq = 64, 25000, 40000, 10 + nlist, M, nbits, k = 32, 16, 8, 1 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=5678) + + nt_threads = faiss.omp_get_max_threads() + faiss.omp_set_num_threads(1) + + faiss.cvar.indexPanorama_stats.reset() + pano_1 = self.create_panorama( + d, nlist, M, nbits, 1, xt, xb, nprobe=8, + ) + pano_1.search(xq, k) + ratio_1 = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + self.assertAlmostEqual(ratio_1, 1.0, places=3) + + faiss.cvar.indexPanorama_stats.reset() + pano_16 = self.create_panorama( + d, nlist, M, nbits, 16, xt, xb, nprobe=8, + ) + pano_16.search(xq, k) + ratio_16 = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + self.assertLess(ratio_16, 0.6) + + faiss.omp_set_num_threads(nt_threads) + + def test_pruning_improves_with_n_levels(self): + """Test that increasing n_levels reduces the fraction scanned""" + d, nb, nt, nq = 64, 25000, 40000, 50 + nlist, M, nbits, k = 32, 16, 8, 10 + xt, xb, xq = self.generate_data(d, nt, nb, nq, seed=1234) + + nt_threads = faiss.omp_get_max_threads() + faiss.omp_set_num_threads(1) + + prev_ratio = float("inf") + for 
n_levels in [1, 2, 4, 8, 16]: + with self.subTest(n_levels=n_levels): + faiss.cvar.indexPanorama_stats.reset() + pano = self.create_panorama( + d, nlist, M, nbits, n_levels, xt, xb, nprobe=8, + ) + pano.search(xq, k) + ratio = faiss.cvar.indexPanorama_stats.ratio_dims_scanned + self.assertLessEqual(ratio, prev_ratio) + prev_ratio = ratio + + faiss.omp_set_num_threads(nt_threads) + + # Constraint validation tests + + def test_rejects_non_l2_metric(self): + """Verify that non-L2 metrics are rejected""" + d, nlist, M, nbits, n_levels = 32, 8, 8, 8, 4 + quantizer = faiss.IndexFlatIP(d) + with self.assertRaises(RuntimeError): + faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, 128, + faiss.METRIC_INNER_PRODUCT, + ) + + def test_rejects_invalid_batch_size(self): + """Verify that non-multiple-of-64 batch_size is rejected""" + d, nlist, M, nbits, n_levels = 32, 8, 8, 8, 4 + quantizer = faiss.IndexFlatL2(d) + with self.assertRaises(RuntimeError): + faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, 100, + ) + + def test_rejects_m_not_divisible_by_n_levels(self): + """Verify that M not divisible by n_levels is rejected""" + d, nlist, M, nbits, n_levels = 32, 8, 8, 8, 3 + quantizer = faiss.IndexFlatL2(d) + with self.assertRaises(RuntimeError): + faiss.IndexIVFPQPanorama( + quantizer, d, nlist, M, nbits, n_levels, 128, + )