Skip to content

Commit 1ff228e

Browse files
committed
Add TurboQuant scalar quantizer SIMD backends and training coverage
2 parents b068fd9 + 85074f0 commit 1ff228e

18 files changed

+1256
-18
lines changed

faiss/factory_tools.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,11 @@ const std::map<faiss::ScalarQuantizer::QuantizerType, std::string> sq_types = {
3838
{faiss::ScalarQuantizer::QT_bf16, "SQbf16"},
3939
{faiss::ScalarQuantizer::QT_8bit_direct_signed, "SQ8_direct_signed"},
4040
{faiss::ScalarQuantizer::QT_8bit_direct, "SQ8_direct"},
41+
{faiss::ScalarQuantizer::QT_1bit_tqmse, "SQtqmse1"},
42+
{faiss::ScalarQuantizer::QT_2bit_tqmse, "SQtqmse2"},
43+
{faiss::ScalarQuantizer::QT_3bit_tqmse, "SQtqmse3"},
44+
{faiss::ScalarQuantizer::QT_4bit_tqmse, "SQtqmse4"},
45+
{faiss::ScalarQuantizer::QT_8bit_tqmse, "SQtqmse8"},
4146
};
4247

4348
int get_hnsw_M(const faiss::IndexHNSW* index) {

faiss/impl/ScalarQuantizer.cpp

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,15 +38,29 @@ ScalarQuantizer::ScalarQuantizer() {}
3838

3939
void ScalarQuantizer::set_derived_sizes() {
4040
switch (qtype) {
41+
case QT_1bit_tqmse:
42+
code_size = (d + 7) / 8;
43+
bits = 1;
44+
break;
45+
case QT_2bit_tqmse:
46+
code_size = (d * 2 + 7) / 8;
47+
bits = 2;
48+
break;
49+
case QT_3bit_tqmse:
50+
code_size = (d * 3 + 7) / 8;
51+
bits = 3;
52+
break;
4153
case QT_8bit:
4254
case QT_8bit_uniform:
4355
case QT_8bit_direct:
4456
case QT_8bit_direct_signed:
57+
case QT_8bit_tqmse:
4558
code_size = d;
4659
bits = 8;
4760
break;
4861
case QT_4bit:
4962
case QT_4bit_uniform:
63+
case QT_4bit_tqmse:
5064
code_size = (d + 1) / 2;
5165
bits = 4;
5266
break;
@@ -107,6 +121,21 @@ void ScalarQuantizer::train(size_t n, const float* x) {
107121
case QT_8bit_direct_signed:
108122
// no training necessary
109123
break;
124+
case QT_1bit_tqmse:
125+
scalar_quantizer::train_TurboQuantMSE(d, 1, trained);
126+
break;
127+
case QT_2bit_tqmse:
128+
scalar_quantizer::train_TurboQuantMSE(d, 2, trained);
129+
break;
130+
case QT_3bit_tqmse:
131+
scalar_quantizer::train_TurboQuantMSE(d, 3, trained);
132+
break;
133+
case QT_4bit_tqmse:
134+
scalar_quantizer::train_TurboQuantMSE(d, 4, trained);
135+
break;
136+
case QT_8bit_tqmse:
137+
scalar_quantizer::train_TurboQuantMSE(d, 8, trained);
138+
break;
110139
default:
111140
break;
112141
}

faiss/impl/ScalarQuantizer.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ struct ScalarQuantizer : Quantizer {
3333
QT_bf16,
3434
QT_8bit_direct_signed, ///< fast indexing of signed int8s ranging from
3535
///< [-128 to 127]
36+
    QT_1bit_tqmse, ///< TurboQuant MSE-optimized, n bits per component
                   ///< (n = 1, 2, 3, 4, 8 for the _tqmse variants below)
37+
QT_2bit_tqmse,
38+
QT_3bit_tqmse,
39+
QT_4bit_tqmse,
40+
QT_8bit_tqmse,
3641
QT_count
3742
};
3843

faiss/impl/index_read.cpp

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -868,7 +868,7 @@ void read_ScalarQuantizer(
868868
READ1(qtype_int);
869869
FAISS_THROW_IF_NOT_FMT(
870870
qtype_int >= ScalarQuantizer::QT_8bit &&
871-
qtype_int <= ScalarQuantizer::QT_8bit_direct_signed,
871+
qtype_int < ScalarQuantizer::QT_count,
872872
"invalid ScalarQuantizer qtype %d",
873873
qtype_int);
874874
ivsc->qtype = static_cast<ScalarQuantizer::QuantizerType>(qtype_int);
@@ -906,6 +906,21 @@ void read_ScalarQuantizer(
906906
case ScalarQuantizer::QT_count:
907907
expected = 0;
908908
break;
909+
case ScalarQuantizer::QT_1bit_tqmse:
910+
expected = 2 + 1; // 2^bits centroids + (2^bits - 1) boundaries
911+
break;
912+
case ScalarQuantizer::QT_2bit_tqmse:
913+
expected = 4 + 3;
914+
break;
915+
case ScalarQuantizer::QT_3bit_tqmse:
916+
expected = 8 + 7;
917+
break;
918+
case ScalarQuantizer::QT_4bit_tqmse:
919+
expected = 16 + 15;
920+
break;
921+
case ScalarQuantizer::QT_8bit_tqmse:
922+
expected = 256 + 255;
923+
break;
909924
}
910925
if (ivsc->trained.empty() && expected > 0) {
911926
// Empty trained is only valid for untrained indices.

faiss/impl/scalar_quantizer/quantizers.h

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,9 @@
77

88
#pragma once
99

10+
#include <algorithm>
11+
12+
#include <faiss/impl/FaissAssert.h>
1013
#include <faiss/impl/ScalarQuantizer.h>
1114
#include <faiss/impl/simdlib/simdlib_dispatch.h>
1215
#include <faiss/utils/bf16.h>
@@ -113,6 +116,90 @@ struct QuantizerTemplate<
113116
}
114117
};
115118

119+
/*******************************************************************
120+
* TurboQuant MSE quantizer
121+
*******************************************************************/
122+
template <int NBits, SIMDLevel SL>
123+
struct QuantizerTurboQuantMSE;
124+
125+
template <int NBits>
126+
struct QuantizerTurboQuantMSE<NBits, SIMDLevel::NONE>
127+
: ScalarQuantizer::SQuantizer {
128+
static_assert(NBits >= 1 && NBits <= 8);
129+
130+
static constexpr size_t kCentroidsCount = size_t(1) << NBits;
131+
static constexpr uint16_t kIndexMask =
132+
static_cast<uint16_t>((1u << NBits) - 1);
133+
134+
const size_t d;
135+
const float* centroids;
136+
const float* boundaries;
137+
138+
QuantizerTurboQuantMSE(size_t d_in, const std::vector<float>& trained)
139+
: d(d_in), centroids(nullptr), boundaries(nullptr) {
140+
FAISS_THROW_IF_NOT(trained.size() == 2 * kCentroidsCount - 1);
141+
centroids = trained.data();
142+
boundaries = trained.data() + kCentroidsCount;
143+
}
144+
145+
FAISS_ALWAYS_INLINE uint8_t select_index(float x) const {
146+
return static_cast<uint8_t>(
147+
std::upper_bound(
148+
boundaries, boundaries + (kCentroidsCount - 1), x) -
149+
boundaries);
150+
}
151+
152+
FAISS_ALWAYS_INLINE void encode_index(uint8_t idx, uint8_t* code, size_t i)
153+
const {
154+
const size_t bit_offset = i * NBits;
155+
const size_t byte_offset = bit_offset >> 3;
156+
const size_t bit_shift = bit_offset & 7;
157+
const uint16_t packed = static_cast<uint16_t>(idx & kIndexMask)
158+
<< bit_shift;
159+
code[byte_offset] |= packed & 0xff;
160+
if (bit_shift + NBits > 8) {
161+
code[byte_offset + 1] |= packed >> 8;
162+
}
163+
}
164+
165+
FAISS_ALWAYS_INLINE uint8_t
166+
decode_index(const uint8_t* code, size_t i) const {
167+
const size_t bit_offset = i * NBits;
168+
const size_t byte_offset = bit_offset >> 3;
169+
const size_t bit_shift = bit_offset & 7;
170+
171+
uint16_t packed = code[byte_offset];
172+
if (bit_shift + NBits > 8) {
173+
packed |= static_cast<uint16_t>(code[byte_offset + 1]) << 8;
174+
}
175+
return static_cast<uint8_t>((packed >> bit_shift) & kIndexMask);
176+
}
177+
178+
void encode_vector(const float* x, uint8_t* code) const final {
179+
for (size_t i = 0; i < d; i++) {
180+
encode_index(select_index(x[i]), code, i);
181+
}
182+
}
183+
184+
void decode_vector(const uint8_t* code, float* x) const final {
185+
for (size_t i = 0; i < d; i++) {
186+
x[i] = centroids[decode_index(code, i)];
187+
}
188+
}
189+
190+
FAISS_ALWAYS_INLINE float reconstruct_component(
191+
const uint8_t* code,
192+
size_t i) const {
193+
return centroids[decode_index(code, i)];
194+
}
195+
};
196+
197+
template <int NBits, SIMDLevel SL>
198+
struct QuantizerTurboQuantMSE : QuantizerTurboQuantMSE<NBits, SIMDLevel::NONE> {
199+
using QuantizerTurboQuantMSE<NBits, SIMDLevel::NONE>::
200+
QuantizerTurboQuantMSE;
201+
};
202+
116203
/*******************************************************************
117204
* FP16 quantizer
118205
*******************************************************************/

faiss/impl/scalar_quantizer/sq-avx2.cpp

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
#include <faiss/impl/simdlib/simdlib_avx2.h>
1111

12+
#include <cstring>
13+
1214
#include <faiss/impl/scalar_quantizer/codecs.h>
1315
#include <faiss/impl/scalar_quantizer/distance_computers.h>
1416
#include <faiss/impl/scalar_quantizer/quantizers.h>
@@ -21,6 +23,61 @@ namespace scalar_quantizer {
2123

2224
using simd8float32 = faiss::simd8float32_tpl<SIMDLevel::AVX2>;
2325

26+
namespace {
27+
28+
FAISS_ALWAYS_INLINE uint16_t load_u16(const uint8_t* ptr) {
29+
uint16_t value;
30+
std::memcpy(&value, ptr, sizeof(value));
31+
return value;
32+
}
33+
34+
FAISS_ALWAYS_INLINE uint32_t load_u32(const uint8_t* ptr) {
35+
uint32_t value;
36+
std::memcpy(&value, ptr, sizeof(value));
37+
return value;
38+
}
39+
40+
FAISS_ALWAYS_INLINE uint32_t load_u24(const uint8_t* ptr) {
41+
return static_cast<uint32_t>(ptr[0]) |
42+
(static_cast<uint32_t>(ptr[1]) << 8) |
43+
(static_cast<uint32_t>(ptr[2]) << 16);
44+
}
45+
46+
FAISS_ALWAYS_INLINE __m256i unpack_8x1bit_to_u32(const uint8_t* code, int i) {
47+
const uint32_t packed = code[static_cast<size_t>(i) >> 3];
48+
const __m256i shifts = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
49+
const __m256i indices =
50+
_mm256_srlv_epi32(_mm256_set1_epi32(packed), shifts);
51+
return _mm256_and_si256(indices, _mm256_set1_epi32(0x1));
52+
}
53+
54+
FAISS_ALWAYS_INLINE __m256i unpack_8x2bit_to_u32(const uint8_t* code, int i) {
55+
const uint32_t packed = load_u16(code + (static_cast<size_t>(i) >> 2));
56+
const __m256i shifts = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
57+
const __m256i indices =
58+
_mm256_srlv_epi32(_mm256_set1_epi32(packed), shifts);
59+
return _mm256_and_si256(indices, _mm256_set1_epi32(0x3));
60+
}
61+
62+
FAISS_ALWAYS_INLINE __m256i unpack_8x3bit_to_u32(const uint8_t* code, int i) {
63+
const uint32_t packed =
64+
load_u24(code + ((static_cast<size_t>(i) >> 3) * 3));
65+
const __m256i shifts = _mm256_setr_epi32(0, 3, 6, 9, 12, 15, 18, 21);
66+
const __m256i indices =
67+
_mm256_srlv_epi32(_mm256_set1_epi32(packed), shifts);
68+
return _mm256_and_si256(indices, _mm256_set1_epi32(0x7));
69+
}
70+
71+
FAISS_ALWAYS_INLINE __m256i unpack_8x4bit_to_u32(const uint8_t* code, int i) {
72+
const uint32_t packed = load_u32(code + (static_cast<size_t>(i) >> 1));
73+
const __m256i shifts = _mm256_setr_epi32(0, 4, 8, 12, 16, 20, 24, 28);
74+
const __m256i indices =
75+
_mm256_srlv_epi32(_mm256_set1_epi32(packed), shifts);
76+
return _mm256_and_si256(indices, _mm256_set1_epi32(0xf));
77+
}
78+
79+
} // namespace
80+
2481
/**********************************************************
2582
* Codecs
2683
**********************************************************/
@@ -168,6 +225,56 @@ struct QuantizerTemplate<
168225
}
169226
};
170227

228+
/**********************************************************
229+
* TurboQuant MSE quantizer
230+
**********************************************************/
231+
232+
#define DEFINE_TQMSE_AVX2_SPECIALIZATION(NBITS, INDEX_EXPR) \
233+
template <> \
234+
struct QuantizerTurboQuantMSE<NBITS, SIMDLevel::AVX2> \
235+
: QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE> { \
236+
using Base = QuantizerTurboQuantMSE<NBITS, SIMDLevel::NONE>; \
237+
\
238+
QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained) \
239+
: Base(d, trained) { \
240+
assert(d % 8 == 0); \
241+
} \
242+
\
243+
FAISS_ALWAYS_INLINE simd8float32 \
244+
reconstruct_8_components(const uint8_t* code, int i) const { \
245+
const __m256i indices = (INDEX_EXPR); \
246+
return simd8float32(_mm256_i32gather_ps( \
247+
this->centroids, indices, sizeof(float))); \
248+
} \
249+
}
250+
251+
DEFINE_TQMSE_AVX2_SPECIALIZATION(1, unpack_8x1bit_to_u32(code, i));
252+
DEFINE_TQMSE_AVX2_SPECIALIZATION(2, unpack_8x2bit_to_u32(code, i));
253+
DEFINE_TQMSE_AVX2_SPECIALIZATION(3, unpack_8x3bit_to_u32(code, i));
254+
DEFINE_TQMSE_AVX2_SPECIALIZATION(4, unpack_8x4bit_to_u32(code, i));
255+
256+
#undef DEFINE_TQMSE_AVX2_SPECIALIZATION
257+
258+
template <>
259+
struct QuantizerTurboQuantMSE<8, SIMDLevel::AVX2>
260+
: QuantizerTurboQuantMSE<8, SIMDLevel::NONE> {
261+
using Base = QuantizerTurboQuantMSE<8, SIMDLevel::NONE>;
262+
263+
QuantizerTurboQuantMSE(size_t d, const std::vector<float>& trained)
264+
: Base(d, trained) {
265+
assert(d % 8 == 0);
266+
}
267+
268+
FAISS_ALWAYS_INLINE simd8float32
269+
reconstruct_8_components(const uint8_t* code, int i) const {
270+
const __m128i packed = _mm_loadl_epi64(
271+
(const __m128i*)(code + static_cast<size_t>(i)));
272+
const __m256i indices = _mm256_cvtepu8_epi32(packed);
273+
return simd8float32(
274+
_mm256_i32gather_ps(this->centroids, indices, sizeof(float)));
275+
}
276+
};
277+
171278
/**********************************************************
172279
* FP16 Quantizer
173280
**********************************************************/

0 commit comments

Comments
 (0)