Skip to content

Commit dd3003e

Browse files
aknayarmeta-codesync[bot]
authored andcommitted
Fix Panorama segfault (#5200)
Summary: In the current code you can segfault Panorama indexes by loading a dataset with 960 dims and setting `nlevels` to 128. In the case where `nlevels` does not evenly divide `d`, Panorama currently calculates `level_width = ceil(d / nlevels)`. We cannot change this behavior due to backwards compatibility. However, for our case of 960 dims and 128 levels, this would mean each level is 8 dims. Thus, the last 8 levels have 0 dims each. Since we calculate `actual_level_width` using `size_t` params, it overflows and produces 8 when it should produce 0, leading to out-of-bounds memory accesses. The fix is to simply truncate `nlevels` in the `Panorama` object when this would happen. A debug log is emitted to the user when this happens. In the aforementioned example, the user would see `WARNING truncating nlevels from 128 to 120`. This PR also fixes a similar latent bug in `IndexFlatPanorama::reconstruct` where we would perform out-of-bounds memory accesses when `nlevels` doesn't evenly divide `d`. Existing test cases are strengthened to ensure bugs are fixed and to avoid any future regressions. Pull Request resolved: #5200 Reviewed By: junjieqi Differential Revision: D104764705 Pulled By: mnorris11 fbshipit-source-id: e55577b8ce5508516ee5dd4758f31acb4a28179a
1 parent 086bd74 commit dd3003e

7 files changed

Lines changed: 62 additions & 21 deletions

File tree

faiss/IndexFlat.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -627,7 +627,7 @@ inline void flat_pano_search_core(
627627
{
628628
SingleResultHandler res(handler);
629629

630-
std::vector<float> query_cum_norms(index.n_levels + 1);
630+
std::vector<float> query_cum_norms(index.pano.n_levels + 1);
631631
std::vector<uint32_t> active_indices(index.batch_size);
632632
std::vector<uint8_t> active_byteset(index.batch_size);
633633
std::vector<float> exact_distances(index.batch_size);
@@ -698,7 +698,7 @@ void IndexFlatPanorama::add(idx_t n, const float* x) {
698698
size_t num_batches = (ntotal + batch_size - 1) / batch_size;
699699

700700
codes.resize(num_batches * batch_size * code_size);
701-
cum_sums.resize(num_batches * batch_size * (n_levels + 1));
701+
cum_sums.resize(num_batches * batch_size * (pano.n_levels + 1));
702702

703703
const uint8_t* code = reinterpret_cast<const uint8_t*>(x);
704704
pano.copy_codes_to_level_layout(codes.data(), offset, n, code);
@@ -771,7 +771,7 @@ size_t IndexFlatPanorama::remove_ids(const IDSelector& sel) {
771771
ntotal = j;
772772
size_t num_batches = (ntotal + batch_size - 1) / batch_size;
773773
codes.resize(num_batches * batch_size * code_size);
774-
cum_sums.resize(num_batches * batch_size * (n_levels + 1));
774+
cum_sums.resize(num_batches * batch_size * (pano.n_levels + 1));
775775
}
776776
return nremove;
777777
}
@@ -843,7 +843,7 @@ void IndexFlatPanorama::search_subset(
843843
{
844844
SingleResultHandler res(handler);
845845

846-
std::vector<float> query_cum_norms(n_levels + 1);
846+
std::vector<float> query_cum_norms(pano.n_levels + 1);
847847

848848
// Panorama's optimized point-wise refinement (Algorithm 2):
849849
// Batch-wise Panorama, as implemented in Panorama.h, incurs
@@ -881,7 +881,7 @@ void IndexFlatPanorama::search_subset(
881881
continue;
882882
}
883883

884-
size_t cum_sum_offset = (n_levels + 1) * idx;
884+
size_t cum_sum_offset = (pano.n_levels + 1) * idx;
885885
float cum_sum = cum_sums[cum_sum_offset];
886886
float exact_distance = 0.0f;
887887
if constexpr (!is_sim) {
@@ -897,7 +897,7 @@ void IndexFlatPanorama::search_subset(
897897
local_stats.total_dims += d;
898898

899899
bool pruned = false;
900-
for (size_t level = 0; level < n_levels; level++) {
900+
for (size_t level = 0; level < pano.n_levels; level++) {
901901
local_stats.total_dims_scanned +=
902902
pano.level_width_floats;
903903

faiss/IndexHNSW.cpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -707,7 +707,10 @@ IndexHNSWFlat::IndexHNSWFlat(int d_in, int M, MetricType metric)
707707
**************************************************************/
708708

709709
IndexHNSWFlatPanorama::IndexHNSWFlatPanorama()
710-
: IndexHNSWFlat(), cum_sums(), pano(0, 1, 1), num_panorama_levels(0) {}
710+
: IndexHNSWFlat(),
711+
cum_sums(),
712+
pano(sizeof(float), 1, 1),
713+
num_panorama_levels(0) {}
711714

712715
IndexHNSWFlatPanorama::IndexHNSWFlatPanorama(
713716
int d_in,

faiss/impl/Panorama.cpp

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -67,8 +67,20 @@ Panorama::Panorama(
6767
void Panorama::set_derived_values() {
6868
FAISS_THROW_IF_NOT_MSG(n_levels > 0, "Panorama: n_levels must be > 0");
6969
this->d = code_size / sizeof(float);
70+
FAISS_THROW_IF_NOT_MSG(n_levels <= d, "Panorama: n_levels must be <= d");
7071
this->level_width_floats = ((d + n_levels - 1) / n_levels);
7172
this->level_width = this->level_width_floats * sizeof(float);
73+
size_t n_real_levels = d / level_width_floats;
74+
if (d > n_real_levels * level_width_floats) {
75+
n_real_levels++;
76+
}
77+
if (this->n_levels != n_real_levels) {
78+
fprintf(stderr,
79+
"WARNING truncating nlevels from %zu to %zu\n",
80+
this->n_levels,
81+
n_real_levels);
82+
this->n_levels = n_real_levels;
83+
}
7284
}
7385

7486
/**
@@ -151,12 +163,12 @@ void Panorama::reconstruct(idx_t key, float* recons, const uint8_t* codes_base)
151163

152164
for (size_t level = 0; level < n_levels; level++) {
153165
size_t level_offset = level * level_width * batch_size;
166+
size_t actual_level_width =
167+
std::min(level_width, code_size - level * level_width);
154168
const uint8_t* src = codes_base + batch_offset + level_offset +
155-
pos_in_batch * level_width;
169+
pos_in_batch * actual_level_width;
156170
uint8_t* dest = recons_buffer + level * level_width;
157-
size_t copy_size =
158-
std::min(level_width, code_size - level * level_width);
159-
memcpy(dest, src, copy_size);
171+
memcpy(dest, src, actual_level_width);
160172
}
161173
}
162174

tests/test_flat_panorama.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -536,7 +536,7 @@ def test_add_search_add_search(self):
536536

537537
def test_reconstruct(self):
538538
"""Test reconstruct and reconstruct_n return original vectors"""
539-
d, nb, nt, nq, nlevels = 128, 10000, 15000, 10, 8
539+
d, nb, nt, nq, nlevels = 964, 1000, 15000, 10, 128
540540
_, xb, _ = self.generate_data(d, nt, nb, nq, seed=2025)
541541

542542
for metric in self.METRICS:
@@ -558,7 +558,7 @@ def test_reconstruct(self):
558558

559559
def test_remove_ids_then_add(self):
560560
"""Test removing vectors with remove_ids() then adding more vectors"""
561-
d, nb, nt, nq, nlevels, k = 128, 500000, 0, 10, 9, 15
561+
d, nb, nt, nq, nlevels, k = 964, 50000, 0, 10, 128, 15
562562
_, xb, xq = self.generate_data(d, nt, nb, nq, seed=2026)
563563

564564
xb1 = xb[:nb // 2]
@@ -603,7 +603,7 @@ def test_remove_ids_then_add(self):
603603

604604
def test_merge_from(self):
605605
"""Test merging indexes with merge_from()"""
606-
d, nb, nt, nq, nlevels, k, batch_size = 128, 500000, 0, 10, 9, 15, 16
606+
d, nb, nt, nq, nlevels, k, batch_size = 964, 50000, 0, 10, 128, 15, 16
607607
_, xb, xq = self.generate_data(d, nt, nb, nq, seed=2027)
608608

609609
# Split data and create two separate indexes
@@ -637,7 +637,7 @@ def test_merge_from(self):
637637

638638
def test_permute_entries(self):
639639
"""Test permuting entries with permute_entries()"""
640-
d, nb, nt, nq, nlevels, k = 128, 500000, 0, 10, 8, 15
640+
d, nb, nt, nq, nlevels, k = 964, 50000, 0, 20, 128, 10
641641
_, xb, xq = self.generate_data(d, nt, nb, nq, seed=2028)
642642

643643
for metric in self.METRICS:
@@ -664,7 +664,7 @@ def test_permute_entries(self):
664664

665665
def test_serialization(self):
666666
"""Test write/read Panorama indexes preserves search results"""
667-
d, nb, nt, nq, nlevels, k = 128, 10000, 15000, 100, 8, 20
667+
d, nb, nt, nq, nlevels, k = 964, 10000, 15000, 100, 128, 20
668668
_, xb, xq = self.generate_data(d, nt, nb, nq, seed=2024)
669669

670670
for metric in self.METRICS:

tests/test_hnsw_panorama.py

Lines changed: 29 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,31 @@ def test_different_panorama_levels(self):
156156
# More levels should still maintain reasonable recall
157157
self.assertGreaterEqual(recall, 0.80)
158158

159+
def test_uneven_dimension_division(self):
160+
"""Test when n_levels doesn't evenly divide dimension."""
161+
test_cases = [(65, 4), (63, 8), (100, 7), (960, 128), (964, 128)]
162+
163+
nb, nt, nq, k = 1000, 700, 20, 5
164+
165+
for d, nlevels in test_cases:
166+
with self.subTest(d=d, nlevels=nlevels):
167+
_, xb, xq = self.generate_data(d, nt, nb, nq, seed=789)
168+
169+
index = faiss.IndexHNSWFlatPanorama(d, 32, nlevels)
170+
index.hnsw.efSearch = 64
171+
index.add(xb)
172+
173+
_, I = index.search(xq, k)
174+
175+
_, gt_I = self.compute_ground_truth(xb, xq, k)
176+
177+
recall = self.compute_recall(gt_I, I)
178+
self.assertGreaterEqual(
179+
recall, 0.80,
180+
f"Recall too low for d={d}, nlevels={nlevels}: "
181+
f"{recall:.4f}",
182+
)
183+
159184
def test_consistency(self):
160185
"""Test that search results are consistent across multiple searches."""
161186
d = 64
@@ -180,16 +205,17 @@ def test_consistency(self):
180205

181206
def test_io(self):
182207
"""Test serialization and deserialization."""
183-
d = 64
208+
d = 964
184209
nb = 500
185210
nt = 700
186211
nq = 10
187212
k = 5
213+
nlevels = 128
188214

189215
# Generate data
190216
_, xb, xq = self.generate_data(d, nt, nb, nq, seed=2024)
191217

192-
index = faiss.IndexHNSWFlatPanorama(d, 16, 8)
218+
index = faiss.IndexHNSWFlatPanorama(d, 16, nlevels)
193219
index.add(xb)
194220

195221
# Get search results before saving
@@ -204,7 +230,7 @@ def test_io(self):
204230
self.assertIsInstance(loaded_index, faiss.IndexHNSWFlatPanorama)
205231
self.assertEqual(loaded_index.d, d)
206232
self.assertEqual(loaded_index.ntotal, nb)
207-
self.assertEqual(loaded_index.num_panorama_levels, 8)
233+
self.assertEqual(loaded_index.num_panorama_levels, nlevels)
208234

209235
# Search after loading
210236
D_after, I_after = loaded_index.search(xq, k)

tests/test_ivf_flat_panorama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -280,7 +280,7 @@ def test_different_n_levels(self):
280280

281281
def test_uneven_dimension_division(self):
282282
"""Test when n_levels doesn't evenly divide dimension"""
283-
test_cases = [(65, 4), (63, 8), (100, 7)]
283+
test_cases = [(65, 4), (63, 8), (100, 7), (960, 128), (964, 128)]
284284

285285
# TODO(aknayar): Test functions like get_single_code().
286286

tests/test_refine_panorama.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -167,7 +167,7 @@ def test_different_n_levels(self):
167167

168168
def test_uneven_dimension_division(self):
169169
"""Test when n_levels doesn't evenly divide dimension"""
170-
test_cases = [(65, 4), (63, 8), (100, 7)]
170+
test_cases = [(65, 4), (63, 8), (100, 7), (960, 128), (964, 128)]
171171

172172
for metric in self.METRICS:
173173
for d, nlevels in test_cases:

0 commit comments

Comments
 (0)