Validate code_size during deserialization to prevent oversized allocations (#5151)

scsiguy · meta-codesync[bot] · commit 6c70444d0843 · 2026-04-28T15:29:49.000-07:00
Summary: Pull Request resolved: #5151 Several index types read code_size directly from the serialized stream independently of the quantizer parameters that determine its correct value. When the stored code_size is corrupt but ntotal is 0, the existing consistency check (codes.size() == ntotal * code_size) passes trivially. A subsequent search then allocates (code_size * sizeof(float)) bytes in GenericFlatCodesDistanceComputer, which can trigger an OOM exception. Two layers of protection: 1. Cross-validate the deserialized code_size against the quantizer-derived value for all index types that read code_size from the stream: IndexResidualQuantizer, IndexLocalSearchQuantizer, IndexProductResidualQuantizer, IndexProductLocalSearchQuantizer, IndexIVFAdditiveQuantizer, IndexIVFScalarQuantizer, IndexLSH, and Index2Layer. The quantizer code_size is computed from validated parameters via set_derived_values() and is always authoritative. 2. For IndexLattice, where code_size is derived from constructor parameters (scale_nbit, lattice_nbit, nsq) rather than read from the stream, validate that code_size does not exceed the uncompressed vector size (d * sizeof(float)). IndexLattice is a lossy compressor, so its code_size must always be smaller than the uncompressed representation. A corrupt scale_nbit can overflow the total_nbit computation, producing a code_size that wraps to a huge value; this bound catches that before any allocation is attempted. Reviewed By: mnorris11 Differential Revision: D102360605 fbshipit-source-id: 1f0e7262a0e0e4566d7813b7da1bf0102e6fd9bf
diff --git a/faiss/impl/index_read.cpp b/faiss/impl/index_read.cpp
@@ -855,6 +855,18 @@ static void validate_aq_dimension_match(
             idx_d);
 }
 
+// Throws (FAISS_THROW_IF_NOT_FMT) when the stored code_size differs from the
+// quantizer-derived one; sizes are printed with %zu since size_t is unsigned.
+static void validate_code_size_match(
+        size_t stored, size_t expected, const char* index_type) {
+    FAISS_THROW_IF_NOT_FMT(
+            stored == expected,
+            "%s code_size mismatch: stored %zu vs derived %zu",
+            index_type,
+            stored,
+            expected);
+}
+
 static void read_ResidualQuantizer(
         ResidualQuantizer& rq,
         IOReader* f,
@@ -1493,6 +1505,10 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         READVECTOR(idxl->thresholds);
         int code_size_i;
         READ1(code_size_i);
+        FAISS_THROW_IF_NOT_FMT(
+                code_size_i >= 0,
+                "IndexLSH invalid code_size %d (must be >= 0)",
+                code_size_i);
         idxl->code_size = code_size_i;
         if (h == fourcc("IxHE")) {
             FAISS_THROW_IF_NOT_FMT(
@@ -1503,6 +1519,8 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
             // leak
             idxl->code_size *= 8;
         }
+        validate_code_size_match(
+                idxl->code_size, (idxl->nbits + 7) / 8, "IndexLSH");
         {
             // Read, dereference, discard.
             auto sub_vt = read_VectorTransform_up(f);
@@ -1550,6 +1568,8 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         validate_aq_dimension_match(
                 idxr->rq, idxr->d, "IndexResidualQuantizer");
         READ1(idxr->code_size);
+        validate_code_size_match(
+                idxr->code_size, idxr->rq.code_size, "IndexResidualQuantizer");
         read_vector(idxr->codes, f);
         FAISS_THROW_IF_NOT(
                 idxr->codes.size() == idxr->ntotal * idxr->code_size);
@@ -1561,6 +1581,10 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         validate_aq_dimension_match(
                 idxr->lsq, idxr->d, "IndexLocalSearchQuantizer");
         READ1(idxr->code_size);
+        validate_code_size_match(
+                idxr->code_size,
+                idxr->lsq.code_size,
+                "IndexLocalSearchQuantizer");
         read_vector(idxr->codes, f);
         FAISS_THROW_IF_NOT(
                 idxr->codes.size() == idxr->ntotal * idxr->code_size);
@@ -1572,6 +1596,10 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         validate_aq_dimension_match(
                 idxpr->prq, idxpr->d, "IndexProductResidualQuantizer");
         READ1(idxpr->code_size);
+        validate_code_size_match(
+                idxpr->code_size,
+                idxpr->prq.code_size,
+                "IndexProductResidualQuantizer");
         read_vector(idxpr->codes, f);
         FAISS_THROW_IF_NOT(
                 idxpr->codes.size() == idxpr->ntotal * idxpr->code_size);
@@ -1583,6 +1611,10 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         validate_aq_dimension_match(
                 idxpl->plsq, idxpl->d, "IndexProductLocalSearchQuantizer");
         READ1(idxpl->code_size);
+        validate_code_size_match(
+                idxpl->code_size,
+                idxpl->plsq.code_size,
+                "IndexProductLocalSearchQuantizer");
         read_vector(idxpl->codes, f);
         FAISS_THROW_IF_NOT(
                 idxpl->codes.size() == idxpl->ntotal * idxpl->code_size);
@@ -1847,6 +1879,27 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
                 nsq,
                 dsq);
         auto idxl = std::make_unique<IndexLattice>(d, nsq, scale_nbit, r2);
+        // IndexLattice is a lossy compressor: code_size must not
+        // exceed the uncompressed vector (d floats). A corrupt
+        // scale_nbit can overflow the total_nbit computation, producing
+        // a code_size that wraps to a huge value.
+        {
+            size_t max_code_size = mul_no_overflow(
+                    static_cast<size_t>(d),
+                    sizeof(float),
+                    "IndexLattice uncompressed vector size");
+            FAISS_THROW_IF_NOT_FMT(
+                    idxl->code_size <= max_code_size,
+                    "IndexLattice code_size %zd exceeds uncompressed "
+                    "vector size %zd (likely corrupt scale_nbit=%d, "
+                    "d=%d, nsq=%d, r2=%d)",
+                    idxl->code_size,
+                    max_code_size,
+                    scale_nbit,
+                    d,
+                    nsq,
+                    r2);
+        }
         read_index_header(*idxl, f);
         READVECTOR(idxl->trained);
         idx = std::move(idxl);
@@ -1856,6 +1909,8 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         read_ivf_header(ivsc.get(), f, &ids);
         read_ScalarQuantizer(&ivsc->sq, f, *ivsc);
         READ1(ivsc->code_size);
+        validate_code_size_match(
+                ivsc->code_size, ivsc->sq.code_size, "IndexIVFScalarQuantizer");
         ArrayInvertedLists* ail = set_array_invlist(ivsc.get(), ids);
         for (size_t i = 0; i < ivsc->nlist; i++)
             READVECTOR(ail->codes[i]);
@@ -1865,6 +1920,8 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         read_ivf_header(ivsc.get(), f);
         read_ScalarQuantizer(&ivsc->sq, f, *ivsc);
         READ1(ivsc->code_size);
+        validate_code_size_match(
+                ivsc->code_size, ivsc->sq.code_size, "IndexIVFScalarQuantizer");
         if (h == fourcc("IwSQ")) {
             ivsc->by_residual = true;
         } else {
@@ -1903,6 +1960,10 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         }
         validate_aq_dimension_match(
                 *iva->aq, iva->d, "IndexIVFAdditiveQuantizer");
+        validate_code_size_match(
+                iva->code_size,
+                iva->aq->code_size,
+                "IndexIVFAdditiveQuantizer");
         READ1(iva->by_residual);
         READ1(iva->use_precomputed_table);
         read_InvertedLists(*iva, f, io_flags);
@@ -2068,6 +2129,14 @@ std::unique_ptr<Index> read_index_up(IOReader* f, int io_flags) {
         READ1(idxp->code_size_1);
         READ1(idxp->code_size_2);
         READ1(idxp->code_size);
+        validate_code_size_match(
+                idxp->code_size_2,
+                idxp->pq.code_size,
+                "Index2Layer code_size_2");
+        validate_code_size_match(
+                idxp->code_size,
+                idxp->code_size_1 + idxp->code_size_2,
+                "Index2Layer");
         read_vector(idxp->codes, f);
         idx = std::move(idxp);
     } else if (
diff --git a/tests/test_read_index_deserialize.cpp b/tests/test_read_index_deserialize.cpp
@@ -15,18 +15,21 @@
 
 #include <faiss/Index.h>
 #include <faiss/Index2Layer.h>
+#include <faiss/IndexAdditiveQuantizer.h>
 #include <faiss/IndexAdditiveQuantizerFastScan.h>
 #include <faiss/IndexBinary.h>
 #include <faiss/IndexBinaryHNSW.h>
 #include <faiss/IndexBinaryIVF.h>
 #include <faiss/IndexFlat.h>
 #include <faiss/IndexHNSW.h>
+#include <faiss/IndexIVFAdditiveQuantizer.h>
 #include <faiss/IndexIVFAdditiveQuantizerFastScan.h>
 #include <faiss/IndexIVFFlat.h>
 #include <faiss/IndexIVFIndependentQuantizer.h>
 #include <faiss/IndexIVFPQ.h>
 #include <faiss/IndexIVFPQR.h>
 #include <faiss/IndexRaBitQFastScan.h>
+#include <faiss/IndexScalarQuantizer.h>
 #include <faiss/VectorTransform.h>
 #include <faiss/impl/FaissException.h>
 #include <faiss/impl/ScalarQuantizer.h>
@@ -3584,6 +3587,217 @@ TEST(ReadIndexDeserialize, IndexRQFastScanAQDimensionMismatch) {
     expect_read_throws_with(buf, "does not match index d");
 }
 
+// ============================================================
+// code_size cross-validation against quantizer-derived values.
+//
+// Several index types read code_size from the serialized stream
+// independently of the quantizer parameters. A corrupt code_size
+// that passes the codes.size() == ntotal * code_size check (e.g.
+// when ntotal == 0) can cause excessive allocations in
+// GenericFlatCodesDistanceComputer during search.
+// ============================================================
+
+// Locate the byte offset of code_size in a serialized index by
+// diffing two serializations: one with the real code_size and one
+// with a probe value. This avoids false matches from other fields
+// that happen to share the same numeric value.
+static ssize_t find_code_size_offset(
+        Index* index,
+        size_t* code_size_ptr,
+        size_t real_cs) {
+    const size_t probe_cs = real_cs ^ 0xDEAD; // differs in the low 16 bits
+    VectorIOWriter w1; // serialization carrying the real code_size
+    write_index(index, &w1);
+    VectorIOWriter w2; // serialization carrying the probe value
+    *code_size_ptr = probe_cs;
+    write_index(index, &w2);
+    *code_size_ptr = real_cs; // put the caller's value back
+
+    EXPECT_EQ(w1.data.size(), w2.data.size());
+    if (w1.data.size() != w2.data.size()) {
+        return -1;
+    }
+
+    ssize_t found = -1; // when several positions match, the last one wins
+    const size_t width = sizeof(size_t);
+    for (size_t pos = 0; pos + width <= w1.data.size(); ++pos) {
+        if (w1.data[pos] == w2.data[pos]) {
+            continue;
+        }
+        size_t before = 0, after = 0;
+        memcpy(&before, w1.data.data() + pos, width);
+        memcpy(&after, w2.data.data() + pos, width);
+        const bool hit = before == real_cs && after == probe_cs;
+        found = hit ? static_cast<ssize_t>(pos) : found;
+        pos += width - 1; // skip the rest of the word just compared
+    }
+    return found;
+}
+
+// Serialize a valid index, locate the exact byte offset of its
+// code_size field via double-serialization diffing, corrupt it,
+// and verify deserialization rejects it.
+static void corrupt_code_size_and_expect_throw(
+        Index* index,
+        size_t* code_size_ptr,
+        const std::string& expected_substr) {
+    const size_t genuine = *code_size_ptr;
+    ssize_t offset = find_code_size_offset(index, code_size_ptr, genuine);
+    ASSERT_GE(offset, 0) << "could not locate code_size field in "
+                         << "serialized data via double-serialization diff";
+
+    // Serialize once more, then overwrite the located field in the copy.
+    VectorIOWriter pristine;
+    write_index(index, &pristine);
+    VectorIOReader tampered;
+    tampered.data = pristine.data;
+    const size_t bogus = genuine + 999;
+    memcpy(tampered.data.data() + offset, &bogus, sizeof(bogus));
+
+    try {
+        std::unique_ptr<Index> loaded(read_index(&tampered));
+        FAIL() << "expected FaissException containing '" << expected_substr
+               << "'";
+    } catch (const FaissException& e) {
+        EXPECT_NE(
+                std::string(e.what()).find(expected_substr), std::string::npos)
+                << "expected '" << expected_substr << "' in: " << e.what();
+    }
+}
+
+static std::vector<float> make_random_data(int d, int n, int seed = 42) {
+    std::mt19937 engine(seed); // same seed => same sequence of draws
+    std::uniform_real_distribution<float> unit;
+    std::vector<float> values(d * n);
+    for (size_t i = 0; i < values.size(); ++i) {
+        values[i] = unit(engine);
+    }
+    return values;
+}
+
+TEST(ReadIndexDeserialize, ResidualQuantizerCodeSizeMismatch) {
+    const int dim = 8, n_train = 256;
+    const auto samples = make_random_data(dim, n_train); // fixed default seed
+    IndexResidualQuantizer rq_index(dim, 2, 4);
+    rq_index.train(n_train, samples.data());
+    corrupt_code_size_and_expect_throw( // tamper with the stored code_size
+            &rq_index, &rq_index.code_size, "code_size mismatch");
+}
+
+TEST(ReadIndexDeserialize, LocalSearchQuantizerCodeSizeMismatch) {
+    const int dim = 8, n_train = 256;
+    const auto samples = make_random_data(dim, n_train); // fixed default seed
+    IndexLocalSearchQuantizer lsq_index(dim, 2, 4);
+    lsq_index.train(n_train, samples.data());
+    corrupt_code_size_and_expect_throw( // tamper with the stored code_size
+            &lsq_index, &lsq_index.code_size, "code_size mismatch");
+}
+
+TEST(ReadIndexDeserialize, ProductResidualQuantizerCodeSizeMismatch) {
+    const int dim = 16, n_train = 512;
+    const auto samples = make_random_data(dim, n_train); // fixed default seed
+    IndexProductResidualQuantizer prq_index(dim, 2, 4, 8);
+    prq_index.train(n_train, samples.data());
+    corrupt_code_size_and_expect_throw( // tamper with the stored code_size
+            &prq_index, &prq_index.code_size, "code_size mismatch");
+}
+
+TEST(ReadIndexDeserialize, ProductLocalSearchQuantizerCodeSizeMismatch) {
+    const int dim = 8, n_train = 256;
+    const auto samples = make_random_data(dim, n_train); // fixed default seed
+    IndexProductLocalSearchQuantizer plsq_index(dim, 2, 2, 4);
+    plsq_index.train(n_train, samples.data());
+    corrupt_code_size_and_expect_throw( // tamper with the stored code_size
+            &plsq_index, &plsq_index.code_size, "code_size mismatch");
+}
+
+// IndexLSH code_size is validated against (nbits + 7) / 8. Use a
+// crafted payload with a mismatched code_size to exercise the check.
+TEST(ReadIndexDeserialize, LSHCodeSizeMismatch) {
+    const int dim = 8, n_bits = 64; // (n_bits + 7) / 8 == 8 bytes expected
+    std::vector<uint8_t> payload;
+    push_fourcc(payload, "IxHe"); // IndexLSH, new-format tag
+    push_index_header(payload, dim, 0);
+    push_val<int>(payload, n_bits);  // nbits
+    push_val<bool>(payload, false);  // rotate_data
+    push_val<bool>(payload, false);  // train_thresholds
+    push_vector<float>(payload, {}); // thresholds (empty)
+    push_val<int>(payload, 99);      // code_size: deliberately wrong
+    push_fourcc(payload, "rrot");    // RandomRotationMatrix section follows
+    push_val<int>(payload, dim);       // d_in
+    push_val<int>(payload, n_bits);    // d_out
+    push_val<bool>(payload, false);    // is_trained
+    push_val<bool>(payload, false);    // have_bias
+    push_vector<float>(payload, {});   // A
+    push_vector<float>(payload, {});   // b
+    push_vector<uint8_t>(payload, {}); // codes
+
+    expect_read_throws_with(payload, "code_size mismatch");
+}
+
+TEST(ReadIndexDeserialize, IVFScalarQuantizerCodeSizeMismatch) {
+    const int dim = 8, n_train = 256, n_lists = 4;
+    const auto samples = make_random_data(dim, n_train);
+    // coarse is a local object, so the index is told not to own it.
+    IndexFlatL2 coarse(dim);
+    IndexIVFScalarQuantizer ivf(
+            &coarse, dim, n_lists, ScalarQuantizer::QT_fp16);
+    ivf.own_fields = false;
+    ivf.train(n_train, samples.data());
+    // Only "code_size" is matched: the corruption may be reported by
+    // validate_code_size_match or by the InvertedLists consistency check.
+    corrupt_code_size_and_expect_throw(&ivf, &ivf.code_size, "code_size");
+}
+
+TEST(ReadIndexDeserialize, IVFAdditiveQuantizerCodeSizeMismatch) {
+    const int dim = 16, n_train = 512, n_lists = 4;
+    const auto samples = make_random_data(dim, n_train);
+    IndexFlatL2 coarse(dim); // local object, hence own_fields = false below
+    IndexIVFResidualQuantizer ivf(&coarse, dim, n_lists, 2, 8);
+    ivf.own_fields = false;
+    ivf.train(n_train, samples.data());
+    corrupt_code_size_and_expect_throw(&ivf, &ivf.code_size, "code_size");
+}
+
+TEST(ReadIndexDeserialize, Index2LayerCodeSize2Mismatch) {
+    const int dim = 8, n_vecs = 256, n_lists = 4;
+    const auto samples = make_random_data(dim, n_vecs);
+    auto coarse = std::make_unique<IndexFlatL2>(dim);
+    Index2Layer two_layer(coarse.release(), n_lists, 4);
+    two_layer.q1.own_fields = true; // pointer was released to the index above
+    two_layer.train(n_vecs, samples.data());
+    two_layer.add(n_vecs, samples.data());
+    corrupt_code_size_and_expect_throw( // corrupt only the code_size_2 field
+            &two_layer, &two_layer.code_size_2, "code_size mismatch");
+}
+
+TEST(ReadIndexDeserialize, Index2LayerCodeSizeSumMismatch) {
+    const int dim = 8, n_vecs = 256, n_lists = 4;
+    const auto samples = make_random_data(dim, n_vecs);
+    auto coarse = std::make_unique<IndexFlatL2>(dim);
+    Index2Layer two_layer(coarse.release(), n_lists, 4);
+    two_layer.q1.own_fields = true; // pointer was released to the index above
+    two_layer.train(n_vecs, samples.data());
+    two_layer.add(n_vecs, samples.data());
+    corrupt_code_size_and_expect_throw( // corrupt only the total code_size
+            &two_layer, &two_layer.code_size, "code_size mismatch");
+}
+
+// IndexLattice code_size is derived from scale_nbit, lattice_nbit,
+// and nsq. A corrupt scale_nbit (e.g. negative) causes integer
+// overflow in the total_nbit → code_size computation, producing
+// a huge code_size that is rejected at deserialization.
+TEST(ReadIndexDeserialize, IndexLatticeCodeSizeTooLarge) {
+    const int bad_scale_nbit = -100; // negative, so the derived size wraps
+    std::vector<uint8_t> payload;
+    push_fourcc(payload, "IxLa");
+    push_val<int>(payload, 4);              // d
+    push_val<int>(payload, 2);              // nsq, i.e. dsq = 4 / 2 = 2
+    push_val<int>(payload, bad_scale_nbit); // scale_nbit
+    push_val<int>(payload, 1);              // r2
+    expect_read_throws_with(payload, "code_size");
+}
+
 // ============================================================
 // SVS fourcc rejection / deserialization safety (Group F: T262015608)
 // ============================================================