diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index cf870cc159..c5c320257e 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -14,7 +14,9 @@ set(FAISS_SIMD_AVX2_SRC impl/pq_code_distance/pq_code_distance-avx2.cpp impl/scalar_quantizer/sq-avx2.cpp impl/approx_topk/avx2.cpp + impl/binary_hamming/avx2.cpp utils/simd_impl/distances_avx2.cpp + utils/simd_impl/hamming_avx2.cpp utils/simd_impl/partitioning_avx2.cpp utils/distances_fused/simdlib_based.cpp utils/simd_impl/rabitq_avx2.cpp @@ -32,7 +34,9 @@ set(FAISS_SIMD_NEON_SRC impl/fast_scan/impl-neon.cpp impl/scalar_quantizer/sq-neon.cpp impl/approx_topk/neon.cpp + impl/binary_hamming/neon.cpp utils/simd_impl/distances_aarch64.cpp + utils/simd_impl/hamming_neon.cpp utils/simd_impl/partitioning_neon.cpp utils/distances_fused/simdlib_based_neon.cpp utils/simd_impl/rabitq_neon.cpp @@ -342,6 +346,7 @@ set(FAISS_HEADERS utils/hamming_distance/avx512-inl.h utils/simd_impl/distances_autovec-inl.h utils/simd_impl/distances_simdlib256.h + utils/simd_impl/hamming_impl.h utils/simd_impl/exhaustive_L2sqr_blas_cmax.h utils/simd_impl/IVFFlatScanner-inl.h utils/simd_impl/distances_sse-inl.h diff --git a/faiss/IndexBinaryHNSW.cpp b/faiss/IndexBinaryHNSW.cpp index 21f10e53f3..80925e6c75 100644 --- a/faiss/IndexBinaryHNSW.cpp +++ b/faiss/IndexBinaryHNSW.cpp @@ -27,8 +27,16 @@ #include #include +#include + #include +// Scalar (NONE) fallback for dynamic dispatch +#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { /************************************************************** @@ -280,50 +288,15 @@ void IndexBinaryHNSW::reconstruct(idx_t key, uint8_t* recons) const { storage->reconstruct(key, recons); } -namespace { - -template -struct FlatHammingDis : DistanceComputer { - const int code_size; - const uint8_t* b; - HammingComputer hc; - - float operator()(idx_t i) override { - return hc.hamming(b + i * code_size); - } - - float 
symmetric_dis(idx_t i, idx_t j) override { - return HammingComputerDefault(b + j * code_size, code_size) - .hamming(b + i * code_size); - } - - explicit FlatHammingDis(const IndexBinaryFlat& storage) - : code_size(storage.code_size), b(storage.xb.data()), hc() {} - - // NOTE: Pointers are cast from float in order to reuse the floating-point - // DistanceComputer. - void set_query(const float* x) override { - hc.set((uint8_t*)x, code_size); - } -}; - -struct BuildDistanceComputer { - using T = DistanceComputer*; - template - DistanceComputer* f(IndexBinaryFlat* flat_storage) { - return new FlatHammingDis(*flat_storage); - } -}; - -} // namespace - DistanceComputer* IndexBinaryHNSW::get_distance_computer() const { IndexBinaryFlat* flat_storage = dynamic_cast(storage); FAISS_THROW_IF_NOT_MSG( flat_storage != nullptr, "IndexBinaryHNSW requires IndexBinaryFlat storage"); - BuildDistanceComputer bd; - return dispatch_HammingComputer(code_size, bd, flat_storage); + return with_simd_level_256bit([&]() { + return make_binary_hnsw_distance_computer_dispatch( + code_size, flat_storage); + }); } /************************************************************** diff --git a/faiss/IndexBinaryHash.cpp b/faiss/IndexBinaryHash.cpp index b8c0092eae..8a253a3a93 100644 --- a/faiss/IndexBinaryHash.cpp +++ b/faiss/IndexBinaryHash.cpp @@ -20,6 +20,14 @@ #include #include +#include + +// Scalar (NONE) fallback for dynamic dispatch +#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { void IndexBinaryHash::InvertedList::add( @@ -64,139 +72,8 @@ void IndexBinaryHash::add_with_ids( ntotal += n; } -namespace { - -/** Enumerate all bit vectors of size nbit with up to maxflip 1s - * test in P127257851 P127258235 - */ -struct FlipEnumerator { - int nbit, nflip, maxflip; - uint64_t mask, x; - - FlipEnumerator(int nbit_, int maxflip_) : nbit(nbit_), maxflip(maxflip_) { - nflip = 0; - mask = 0; - x = 0; - } - - bool 
next() { - if (x == mask) { - if (nflip == maxflip) { - return false; - } - // increase Hamming radius - nflip++; - mask = (((uint64_t)1 << nflip) - 1); - x = mask << (nbit - nflip); - return true; - } - - int i = __builtin_ctzll(x); - - if (i > 0) { - x ^= (uint64_t)3 << (i - 1); - } else { - // nb of LSB 1s - int n1 = __builtin_ctzll(~x); - // clear them - x &= ((uint64_t)(-1) << n1); - int n2 = __builtin_ctzll(x); - x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1); - } - return true; - } -}; - -struct RangeSearchResults { - int radius; - RangeQueryResult& qres; - - inline void add(float dis, idx_t id) { - if (dis < radius) { - qres.add(dis, id); - } - } -}; - -struct KnnSearchResults { - // heap params - idx_t k; - int32_t* heap_sim; - idx_t* heap_ids; - - using C = CMax; - - inline void add(float dis, idx_t id) { - if (dis < heap_sim[0]) { - heap_replace_top(k, heap_sim, heap_ids, dis, id); - } - } -}; - -template -void search_single_query_template( - const IndexBinaryHash& index, - const uint8_t* q, - SearchResults& res, - size_t& n0, - size_t& nlist, - size_t& ndis) { - size_t code_size = index.code_size; - BitstringReader br(q, code_size); - uint64_t qhash = br.read(index.b); - HammingComputer hc(q, code_size); - FlipEnumerator fe(index.b, index.nflip); - - // loop over neighbors that are at most at nflip bits - do { - uint64_t hash = qhash ^ fe.x; - auto it = index.invlists.find(hash); - - if (it == index.invlists.end()) { - continue; - } - - const IndexBinaryHash::InvertedList& il = it->second; - - size_t nv = il.ids.size(); - - if (nv == 0) { - n0++; - } else { - const uint8_t* codes = il.vecs.data(); - for (size_t i = 0; i < nv; i++) { - int dis = hc.hamming(codes); - res.add(dis, il.ids[i]); - codes += code_size; - } - ndis += nv; - nlist++; - } - } while (fe.next()); -} - -struct Run_search_single_query { - using T = void; - template - T f(Types*... 
args) { - search_single_query_template(*args...); - } -}; - -template -void search_single_query( - const IndexBinaryHash& index, - const uint8_t* q, - SearchResults& res, - size_t& n0, - size_t& nlist, - size_t& ndis) { - Run_search_single_query r; - dispatch_HammingComputer( - index.code_size, r, &index, &q, &res, &n0, &nlist, &ndis); -} - -} // anonymous namespace +// search_single_query_template and helpers are now in +// impl/binary_hamming/IndexBinaryHash_impl.h (compiled per-ISA) void IndexBinaryHash::range_search( idx_t n, @@ -215,10 +92,12 @@ void IndexBinaryHash::range_search( #pragma omp for for (idx_t i = 0; i < n; i++) { // loop queries RangeQueryResult& qres = pres.new_result(i); - RangeSearchResults res = {radius, qres}; const uint8_t* q = x + i * code_size; - search_single_query(*this, q, res, n0, nlist, ndis); + with_simd_level_256bit([&]() { + binary_hash_range_search_dispatch( + *this, q, radius, qres, n0, nlist, ndis); + }); } pres.finalize(); } @@ -248,10 +127,12 @@ void IndexBinaryHash::search( idx_t* idxi = labels + k * i; heap_heapify(k, simi, idxi); - KnnSearchResults res = {k, simi, idxi}; const uint8_t* q = x + i * code_size; - search_single_query(*this, q, res, n0, nlist, ndis); + with_simd_level_256bit([&]() { + binary_hash_knn_search_dispatch( + *this, q, k, simi, idxi, n0, nlist, ndis); + }); heap_reorder(k, simi, idxi); } @@ -324,75 +205,8 @@ void IndexBinaryMultiHash::add(idx_t n, const uint8_t* x) { ntotal += n; } -namespace { - -template -static void verify_shortlist( - const IndexBinaryFlat* index, - const uint8_t* q, - const std::unordered_set& shortlist, - SearchResults& res) { - size_t code_size = index->code_size; - - HammingComputer hc(q, code_size); - const uint8_t* codes = index->xb.data(); - - for (auto i : shortlist) { - int dis = hc.hamming(codes + i * code_size); - res.add(dis, i); - } -} - -struct Run_verify_shortlist { - using T = void; - template - void f(Types... 
args) { - verify_shortlist(args...); - } -}; - -template -void search_1_query_multihash( - const IndexBinaryMultiHash& index, - const uint8_t* xi, - SearchResults& res, - size_t& n0, - size_t& nlist, - size_t& ndis) { - std::unordered_set shortlist; - int b = index.b; - - BitstringReader br(xi, index.code_size); - for (int h = 0; h < index.nhash; h++) { - uint64_t qhash = br.read(b); - const IndexBinaryMultiHash::Map& map = index.maps[h]; - - FlipEnumerator fe(index.b, index.nflip); - // loop over neighbors that are at most at nflip bits - do { - uint64_t hash = qhash ^ fe.x; - auto it = map.find(hash); - - if (it != map.end()) { - const std::vector& v = it->second; - for (auto i : v) { - shortlist.insert(i); - } - nlist++; - } else { - n0++; - } - } while (fe.next()); - } - ndis += shortlist.size(); - - // verify shortlist - Run_verify_shortlist r; - dispatch_HammingComputer( - index.code_size, r, index.storage, xi, shortlist, res); -} - -} // anonymous namespace +// verify_shortlist and search_1_query_multihash are now in +// impl/binary_hamming/IndexBinaryHash_impl.h (compiled per-ISA) void IndexBinaryMultiHash::range_search( idx_t n, @@ -411,10 +225,12 @@ void IndexBinaryMultiHash::range_search( #pragma omp for for (idx_t i = 0; i < n; i++) { // loop queries RangeQueryResult& qres = pres.new_result(i); - RangeSearchResults res = {radius, qres}; const uint8_t* q = x + i * code_size; - search_1_query_multihash(*this, q, res, n0, nlist, ndis); + with_simd_level_256bit([&]() { + binary_multihash_range_search_dispatch( + *this, q, radius, qres, n0, nlist, ndis); + }); } pres.finalize(); } @@ -444,10 +260,12 @@ void IndexBinaryMultiHash::search( idx_t* idxi = labels + k * i; heap_heapify(k, simi, idxi); - KnnSearchResults res = {k, simi, idxi}; const uint8_t* q = x + i * code_size; - search_1_query_multihash(*this, q, res, n0, nlist, ndis); + with_simd_level_256bit([&]() { + binary_multihash_knn_search_dispatch( + *this, q, k, simi, idxi, n0, nlist, ndis); + }); 
heap_reorder(k, simi, idxi); } diff --git a/faiss/IndexBinaryIVF.cpp b/faiss/IndexBinaryIVF.cpp index 90c5ecc301..c53cdb3dc1 100644 --- a/faiss/IndexBinaryIVF.cpp +++ b/faiss/IndexBinaryIVF.cpp @@ -25,6 +25,14 @@ #include #include +#include + +// Scalar (NONE) fallback for dynamic dispatch +#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { IndexBinaryIVF::IndexBinaryIVF( @@ -339,68 +347,11 @@ void IndexBinaryIVF::replace_invlists(InvertedLists* il, bool own) { own_invlists = own; } -namespace { - -template -struct IVFBinaryScannerL2 : BinaryInvertedListScanner { - HammingComputer hc; - size_t code_size; - bool store_pairs; - - IVFBinaryScannerL2(size_t code_size_, bool store_pairs_) - : code_size(code_size_), store_pairs(store_pairs_) {} - - void set_query(const uint8_t* query_vector) override { - hc.set(query_vector, code_size); - } - - idx_t list_no; - void set_list(idx_t list_no_2, uint8_t /* coarse_dis */) override { - this->list_no = list_no_2; - } - - uint32_t distance_to_code(const uint8_t* code) const override { - return hc.hamming(code); - } - - size_t scan_codes( - size_t n, - const uint8_t* __restrict codes, - const idx_t* __restrict ids, - int32_t* __restrict simi, - idx_t* __restrict idxi, - size_t k) const override { - using C = CMax; - - size_t nup = 0; - for (size_t j = 0; j < n; j++) { - uint32_t dis = hc.hamming(codes); - if (dis < static_cast(simi[0])) { - idx_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; - heap_replace_top(k, simi, idxi, dis, id); - nup++; - } - codes += code_size; - } - return nup; - } +// IVFBinaryScannerL2, search_knn_hamming_count, BlockSearch, +// BlockSearchVariableK, search_knn_hamming_per_invlist are now in +// impl/binary_hamming/IndexBinaryIVF_impl.h (compiled per-ISA) - void scan_codes_range( - size_t n, - const uint8_t* __restrict codes, - const idx_t* __restrict ids, - int radius, - RangeQueryResult& result) const override { - for (size_t j = 0; j < n; j++) { - uint32_t dis = hc.hamming(codes); - if (dis < static_cast(radius)) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - result.add(dis, id); - } - codes += code_size; - } - } -}; +namespace { void search_knn_hamming_heap( const IndexBinaryIVF* ivf, @@ -507,317 +458,17 @@ void search_knn_hamming_heap( indexIVF_stats.nheap_updates += nheap; } -template -void search_knn_hamming_count( - const IndexBinaryIVF* ivf, - size_t nx, - const uint8_t* __restrict x, - const idx_t* __restrict keys, - int k, - int32_t* __restrict distances, - idx_t* __restrict labels, - const IVFSearchParameters* params) { - const int nBuckets = ivf->d + 1; - std::vector all_counters(nx * nBuckets, 0); - std::unique_ptr all_ids_per_dis(new idx_t[nx * nBuckets * k]); - - idx_t nprobe = params ? params->nprobe : ivf->nprobe; - nprobe = std::min((idx_t)ivf->nlist, nprobe); - idx_t max_codes = params ? 
params->max_codes : ivf->max_codes; - - std::vector> cs; - for (size_t i = 0; i < nx; ++i) { - cs.push_back( - HCounterState( - all_counters.data() + i * nBuckets, - all_ids_per_dis.get() + i * nBuckets * k, - x + i * ivf->code_size, - ivf->d, - k)); - } - - size_t nlistv = 0, ndis = 0; - -#pragma omp parallel for reduction(+ : nlistv, ndis) - for (int64_t i = 0; i < static_cast(nx); i++) { - const idx_t* keysi = keys + i * nprobe; - HCounterState& csi = cs[i]; - - size_t nscan = 0; - - for (idx_t ik = 0; ik < nprobe; ik++) { - idx_t key = keysi[ik]; /* select the list */ - if (key < 0) { - // not enough centroids for multiprobe - continue; - } - FAISS_THROW_IF_NOT_FMT( - key < (idx_t)ivf->nlist, - "Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n", - key, - static_cast(ik), - ivf->nlist); - - nlistv++; - size_t list_size = ivf->invlists->list_size(key); - size_t list_size_max = static_cast(max_codes) - nscan; - if (list_size > list_size_max) { - list_size = list_size_max; - } - InvertedLists::ScopedCodes scodes(ivf->invlists, key); - const uint8_t* list_vecs = scodes.get(); - const idx_t* ids = - store_pairs ? nullptr : ivf->invlists->get_ids(key); - - for (size_t j = 0; j < list_size; j++) { - const uint8_t* yj = list_vecs + ivf->code_size * j; - - idx_t id = store_pairs ? 
(key << 32 | j) : ids[j]; - csi.update_counter(yj, id); - } - if (ids) { - ivf->invlists->release_ids(key, ids); - } - - nscan += list_size; - if (nscan >= static_cast(max_codes)) { - break; - } - } - ndis += nscan; - - int nres = 0; - for (int b = 0; b < nBuckets && nres < k; b++) { - for (int l = 0; l < csi.counters[b] && nres < k; l++) { - labels[i * k + nres] = csi.ids_per_dis[b * k + l]; - distances[i * k + nres] = b; - nres++; - } - } - while (nres < k) { - labels[i * k + nres] = -1; - distances[i * k + nres] = std::numeric_limits::max(); - ++nres; - } - } - - indexIVF_stats.nq += nx; - indexIVF_stats.nlist += nlistv; - indexIVF_stats.ndis += ndis; -} - -/* Manages NQ queries at a time, stores results */ -template -struct BlockSearch { - HammingComputer hcs[NQ]; - // heaps to update for each query - int32_t* distances[NQ]; - idx_t* labels[NQ]; - // curent top of heap - int32_t heap_tops[NQ]; - - BlockSearch( - size_t code_size, - const uint8_t* __restrict x, - const int32_t* __restrict keys, - int32_t* __restrict all_distances, - idx_t* __restrict all_labels) { - for (idx_t q = 0; q < NQ; q++) { - idx_t qno = keys[q]; - hcs[q] = HammingComputer(x + qno * code_size, code_size); - distances[q] = all_distances + qno * K; - labels[q] = all_labels + qno * K; - heap_tops[q] = distances[q][0]; - } - } - - void add_bcode(const uint8_t* bcode, idx_t id) { - using C = CMax; - for (int q = 0; q < NQ; q++) { - int dis = hcs[q].hamming(bcode); - if (dis < heap_tops[q]) { - heap_replace_top(K, distances[q], labels[q], dis, id); - heap_tops[q] = distances[q][0]; - } - } - } -}; - -template -struct BlockSearchVariableK { - int k; - HammingComputer hcs[NQ]; - // heaps to update for each query - int32_t* distances[NQ]; - idx_t* labels[NQ]; - // curent top of heap - int32_t heap_tops[NQ]; - - BlockSearchVariableK( - size_t code_size, - int k_, - const uint8_t* __restrict x, - const int32_t* __restrict keys, - int32_t* __restrict all_distances, - idx_t* __restrict all_labels) - 
: k(k_) { - for (idx_t q = 0; q < NQ; q++) { - idx_t qno = keys[q]; - hcs[q] = HammingComputer(x + qno * code_size, code_size); - distances[q] = all_distances + qno * k; - labels[q] = all_labels + qno * k; - heap_tops[q] = distances[q][0]; - } - } - - void add_bcode(const uint8_t* bcode, idx_t id) { - using C = CMax; - for (int q = 0; q < NQ; q++) { - int dis = hcs[q].hamming(bcode); - if (dis < heap_tops[q]) { - heap_replace_top(k, distances[q], labels[q], dis, id); - heap_tops[q] = distances[q][0]; - } - } - } -}; - -template -void search_knn_hamming_per_invlist( - const IndexBinaryIVF* ivf, - size_t n, - const uint8_t* __restrict x, - idx_t k, - const idx_t* __restrict keys_in, - const int32_t* __restrict /* coarse_dis */, - int32_t* __restrict distances, - idx_t* __restrict labels, - bool store_pairs, - const IVFSearchParameters* params) { - idx_t nprobe = params ? params->nprobe : ivf->nprobe; - nprobe = std::min((idx_t)ivf->nlist, nprobe); - idx_t max_codes = params ? params->max_codes : ivf->max_codes; - FAISS_THROW_IF_NOT(max_codes == 0); - FAISS_THROW_IF_NOT(!store_pairs); - - // reorder buckets - std::vector lims(n + 1); - int32_t* keys = new int32_t[n * nprobe]; - std::unique_ptr delete_keys(keys); - for (size_t i = 0; i < n * static_cast(nprobe); i++) { - keys[i] = keys_in[i]; - } - matrix_bucket_sort_inplace(n, nprobe, keys, ivf->nlist, lims.data(), 0); - - using C = CMax; - heap_heapify(n * k, distances, labels); - const size_t code_size = ivf->code_size; - - for (size_t l = 0; l < ivf->nlist; l++) { - idx_t l0 = lims[l], nq = lims[l + 1] - l0; - - InvertedLists::ScopedCodes scodes(ivf->invlists, l); - InvertedLists::ScopedIds sidx(ivf->invlists, l); - idx_t nb = ivf->invlists->list_size(l); - const uint8_t* bcodes = scodes.get(); - const idx_t* ids = sidx.get(); - - idx_t i = 0; - - // process as much as possible by blocks - constexpr int BS = 4; - - if (k == 1) { - for (; i + BS <= nq; i += BS) { - BlockSearch bc( - code_size, x, keys + l0 + i, 
distances, labels); - for (idx_t j = 0; j < nb; j++) { - bc.add_bcode(bcodes + j * code_size, ids[j]); - } - } - } else if (k == 2) { - for (; i + BS <= nq; i += BS) { - BlockSearch bc( - code_size, x, keys + l0 + i, distances, labels); - for (idx_t j = 0; j < nb; j++) { - bc.add_bcode(bcodes + j * code_size, ids[j]); - } - } - } else if (k == 4) { - for (; i + BS <= nq; i += BS) { - BlockSearch bc( - code_size, x, keys + l0 + i, distances, labels); - for (idx_t j = 0; j < nb; j++) { - bc.add_bcode(bcodes + j * code_size, ids[j]); - } - } - } else { - for (; i + BS <= nq; i += BS) { - BlockSearchVariableK bc( - code_size, k, x, keys + l0 + i, distances, labels); - for (idx_t j = 0; j < nb; j++) { - bc.add_bcode(bcodes + j * code_size, ids[j]); - } - } - } - - // leftovers - for (; i < nq; i++) { - idx_t qno = keys[l0 + i]; - HammingComputer hc(x + qno * code_size, code_size); - idx_t* __restrict idxi = labels + qno * k; - int32_t* __restrict simi = distances + qno * k; - int32_t simi0 = simi[0]; - for (idx_t j = 0; j < nb; j++) { - int dis = hc.hamming(bcodes + j * code_size); - - if (dis < simi0) { - idx_t id = store_pairs ? lo_build(l, j) : ids[j]; - heap_replace_top(k, simi, idxi, dis, id); - simi0 = simi[0]; - } - } - } - } - for (size_t i = 0; i < n; i++) { - heap_reorder(k, distances + i * k, labels + i * k); - } -} - -struct Run_search_knn_hamming_per_invlist { - using T = void; - - template - void f(Types... args) { - search_knn_hamming_per_invlist(args...); - } -}; - -template -struct Run_search_knn_hamming_count { - using T = void; - - template - void f(Types... args) { - search_knn_hamming_count(args...); - } -}; - -struct BuildScanner { - using T = BinaryInvertedListScanner*; - - template - T f(size_t code_size, bool store_pairs) { - return new IVFBinaryScannerL2(code_size, store_pairs); - } -}; - } // anonymous namespace +// The remaining template code (search_knn_hamming_count, +// search_knn_hamming_per_invlist, etc.) 
has been moved to +// impl/binary_hamming/IndexBinaryIVF_impl.h + BinaryInvertedListScanner* IndexBinaryIVF::get_InvertedListScanner( bool store_pairs) const { - BuildScanner bs; - return dispatch_HammingComputer(code_size, bs, code_size, store_pairs); + return with_simd_level_256bit([&]() { + return make_binary_ivf_scanner_dispatch(code_size, store_pairs); + }); } void IndexBinaryIVF::search_preassigned( @@ -831,23 +482,37 @@ void IndexBinaryIVF::search_preassigned( bool store_pairs, const IVFSearchParameters* params) const { if (per_invlist_search) { - Run_search_knn_hamming_per_invlist r; - // clang-format off - dispatch_HammingComputer( - code_size, r, this, n, x, k, - cidx, cdis, dis, idx, store_pairs, params); - // clang-format on + with_simd_level_256bit([&]() { + search_knn_hamming_per_invlist_dispatch( + code_size, + this, + n, + x, + k, + cidx, + cdis, + dis, + idx, + store_pairs, + params); + }); } else if (use_heap) { search_knn_hamming_heap( this, n, x, k, cidx, cdis, dis, idx, store_pairs, params); - } else if (store_pairs) { // !use_heap && store_pairs - Run_search_knn_hamming_count r; - dispatch_HammingComputer( - code_size, r, this, n, x, cidx, k, dis, idx, params); - } else { // !use_heap && !store_pairs - Run_search_knn_hamming_count r; - dispatch_HammingComputer( - code_size, r, this, n, x, cidx, k, dis, idx, params); + } else { + with_simd_level_256bit([&]() { + search_knn_hamming_count_dispatch( + code_size, + store_pairs, + this, + n, + x, + cidx, + k, + dis, + idx, + params); + }); } } diff --git a/faiss/IndexIVFSpectralHash.cpp b/faiss/IndexIVFSpectralHash.cpp index f0e53d36ea..22f97e1a5f 100644 --- a/faiss/IndexIVFSpectralHash.cpp +++ b/faiss/IndexIVFSpectralHash.cpp @@ -20,6 +20,14 @@ #include #include +#include + +// Scalar (NONE) fallback for dynamic dispatch +#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { 
IndexIVFSpectralHash::IndexIVFSpectralHash( @@ -207,115 +215,15 @@ void IndexIVFSpectralHash::encode_vectors( } } -namespace { - -template -struct IVFScanner : InvertedListScanner { - using InvertedListScanner::scan_codes; - // copied from index structure - const IndexIVFSpectralHash* index; - size_t nbit; - - float period, freq; - std::vector q; - std::vector zero; - std::vector qcode; - HammingComputer hc; - - IVFScanner(const IndexIVFSpectralHash* index_in, bool store_pairs_in) - : index(index_in), - nbit(index_in->nbit), - period(index_in->period), - freq(2.0 / index_in->period), - q(nbit), - zero(nbit), - qcode(index_in->code_size), - hc(qcode.data(), index_in->code_size) { - this->store_pairs = store_pairs_in; - this->code_size = index->code_size; - this->keep_max = is_similarity_metric(index->metric_type); - } - - void set_query(const float* query) override { - FAISS_THROW_IF_NOT(query); - FAISS_THROW_IF_NOT(q.size() == static_cast(nbit)); - index->vt->apply_noalloc(1, query, q.data()); - - if (index->threshold_type == IndexIVFSpectralHash::Thresh_global) { - binarize_with_freq(nbit, freq, q.data(), zero.data(), qcode.data()); - hc.set(qcode.data(), code_size); - } - } - - void set_list(idx_t list_no_in, float /*coarse_dis*/) override { - this->list_no = list_no_in; - if (index->threshold_type != IndexIVFSpectralHash::Thresh_global) { - const float* c = index->trained.data() + list_no_in * nbit; - binarize_with_freq(nbit, freq, q.data(), c, qcode.data()); - hc.set(qcode.data(), code_size); - } - } - - float distance_to_code(const uint8_t* code) const final { - return hc.hamming(code); - } - - size_t scan_codes( - size_t list_size, - const uint8_t* codes, - const idx_t* ids, - float* simi, - idx_t* idxi, - size_t k) const override { - size_t nup = 0; - for (size_t j = 0; j < list_size; j++) { - float dis = hc.hamming(codes); - - if (dis < simi[0]) { - int64_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; - maxheap_replace_top(k, simi, idxi, dis, id); - nup++; - } - codes += code_size; - } - return nup; - } - - void scan_codes_range( - size_t list_size, - const uint8_t* codes, - const idx_t* ids, - float radius, - RangeQueryResult& res) const override { - for (size_t j = 0; j < list_size; j++) { - float dis = hc.hamming(codes); - if (dis < radius) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - res.add(dis, id); - } - codes += code_size; - } - } -}; - -struct BuildScanner { - using T = InvertedListScanner*; - - template - static T f(const IndexIVFSpectralHash* index, bool store_pairs) { - return new IVFScanner(index, store_pairs); - } -}; - -} // anonymous namespace - InvertedListScanner* IndexIVFSpectralHash::get_InvertedListScanner( bool store_pairs, const IDSelector* sel, const IVFSearchParameters*) const { FAISS_THROW_IF_NOT(!sel); - BuildScanner bs; - return dispatch_HammingComputer(code_size, bs, this, store_pairs); + return with_simd_level_256bit([&]() { + return make_spectral_hash_scanner_dispatch( + code_size, this, store_pairs); + }); } void IndexIVFSpectralHash::replace_vt(VectorTransform* vt_in, bool own) { diff --git a/faiss/IndexPQ.cpp b/faiss/IndexPQ.cpp index a6a5ed4634..b97f924d2c 100644 --- a/faiss/IndexPQ.cpp +++ b/faiss/IndexPQ.cpp @@ -22,6 +22,12 @@ #include #include +// Scalar (NONE) fallback for dynamic dispatch +#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { /********************************************************* @@ -275,59 +281,8 @@ void IndexPQStats::reset() { IndexPQStats indexPQ_stats; -namespace { - -template -size_t polysemous_inner_loop( - const IndexPQ* index, - const float* dis_table_qi, - const uint8_t* q_code, - size_t k, - float* heap_dis, - int64_t* heap_ids, - int ht) { - size_t M = index->pq.M; - size_t code_size = index->pq.code_size; - size_t ksub = index->pq.ksub; - size_t ntotal = 
index->ntotal; - - const uint8_t* b_code = index->codes.data(); - - size_t n_pass_i = 0; - - HammingComputer hc(q_code, code_size); - - for (int64_t bi = 0; bi < static_cast(ntotal); bi++) { - int hd = hc.hamming(b_code); - - if (hd < ht) { - n_pass_i++; - - float dis = 0; - const float* dis_table = dis_table_qi; - for (size_t m = 0; m < M; m++) { - dis += dis_table[b_code[m]]; - dis_table += ksub; - } - - if (dis < heap_dis[0]) { - maxheap_replace_top(k, heap_dis, heap_ids, dis, bi); - } - } - b_code += code_size; - } - return n_pass_i; -} - -struct Run_polysemous_inner_loop { - using T = size_t; - template - size_t f(Types... args) { - return polysemous_inner_loop(args...); - } -}; - -} // anonymous namespace +// polysemous_inner_loop template code is now in +// impl/binary_hamming/IndexPQ_impl.h (compiled per-ISA) void IndexPQ::search_core_polysemous( idx_t n, @@ -377,17 +332,17 @@ void IndexPQ::search_core_polysemous( maxheap_heapify(k, heap_dis, heap_ids); if (!generalized_hamming) { - Run_polysemous_inner_loop r; - n_pass += dispatch_HammingComputer( - pq.code_size, - r, - this, - dis_table_qi, - q_code, - k, - heap_dis, - heap_ids, - param_polysemous_ht); + n_pass += with_simd_level_256bit([&]() { + return polysemous_inner_loop_dispatch( + pq.code_size, + this, + dis_table_qi, + q_code, + k, + heap_dis, + heap_ids, + param_polysemous_ht); + }); } else { // generalized hamming switch (pq.code_size) { diff --git a/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h b/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h new file mode 100644 index 0000000000..be16cd0733 --- /dev/null +++ b/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Per-ISA implementation of Hamming distance computation for + * IndexBinaryHNSW. 
Included once per SIMD TU with THE_SIMD_LEVEL + * set to the desired SIMDLevel. + */ + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "THE_SIMD_LEVEL must be defined before including this file" +#endif + +// hamdis-inl.h MUST be included first — the ISA ladder checks __AVX2__ etc. +// which are set by per-ISA TU compilation flags. +#include + +#include +#include +#include +#include + +namespace faiss { + +namespace { + +template +struct FlatHammingDis : DistanceComputer { + const int code_size; + const uint8_t* b; + HammingComputer hc; + + float operator()(idx_t i) override { + return hc.hamming(b + i * code_size); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return HammingComputerDefault(b + j * code_size, code_size) + .hamming(b + i * code_size); + } + + explicit FlatHammingDis(const IndexBinaryFlat& storage) + : code_size(storage.code_size), b(storage.xb.data()), hc() {} + + // NOTE: Pointers are cast from float in order to reuse the floating-point + // DistanceComputer. + void set_query(const float* x) override { + hc.set((uint8_t*)x, code_size); + } +}; + +struct BuildDistanceComputer { + using T = DistanceComputer*; + template + DistanceComputer* f(IndexBinaryFlat* flat_storage) { + return new FlatHammingDis(*flat_storage); + } +}; + +} // anonymous namespace + +template <> +DistanceComputer* make_binary_hnsw_distance_computer_dispatch( + int code_size, + IndexBinaryFlat* flat_storage) { + BuildDistanceComputer bd; + return dispatch_HammingComputer(code_size, bd, flat_storage); +} + +} // namespace faiss diff --git a/faiss/impl/binary_hamming/IndexBinaryHash_impl.h b/faiss/impl/binary_hamming/IndexBinaryHash_impl.h new file mode 100644 index 0000000000..330ed9b104 --- /dev/null +++ b/faiss/impl/binary_hamming/IndexBinaryHash_impl.h @@ -0,0 +1,280 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +/* + * Per-ISA implementation of Hamming distance computation for + * IndexBinaryHash and IndexBinaryMultiHash. Included once per SIMD TU + * with THE_SIMD_LEVEL set to the desired SIMDLevel. + */ + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "THE_SIMD_LEVEL must be defined before including this file" +#endif + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace faiss { + +namespace { + +/** Enumerate all bit vectors of size nbit with up to maxflip 1s */ +struct FlipEnumerator { + int nbit, nflip, maxflip; + uint64_t mask, x; + + FlipEnumerator(int nbit_, int maxflip_) : nbit(nbit_), maxflip(maxflip_) { + nflip = 0; + mask = 0; + x = 0; + } + + bool next() { + if (x == mask) { + if (nflip == maxflip) { + return false; + } + // increase Hamming radius + nflip++; + mask = (((uint64_t)1 << nflip) - 1); + x = mask << (nbit - nflip); + return true; + } + + int i = __builtin_ctzll(x); + + if (i > 0) { + x ^= (uint64_t)3 << (i - 1); + } else { + // nb of LSB 1s + int n1 = __builtin_ctzll(~x); + // clear them + x &= ((uint64_t)(-1) << n1); + int n2 = __builtin_ctzll(x); + x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1); + } + return true; + } +}; + +struct RangeSearchResults { + int radius; + RangeQueryResult& qres; + + inline void add(float dis, idx_t id) { + if (dis < radius) { + qres.add(dis, id); + } + } +}; + +struct KnnSearchResults { + // heap params + idx_t k; + int32_t* heap_sim; + idx_t* heap_ids; + + using C = CMax; + + inline void add(float dis, idx_t id) { + if (dis < heap_sim[0]) { + heap_replace_top(k, heap_sim, heap_ids, dis, id); + } + } +}; + +template +void search_single_query_template( + const IndexBinaryHash& index, + const uint8_t* q, + SearchResults& res, + size_t& n0, + size_t& nlist, + size_t& ndis) { + size_t code_size = index.code_size; + BitstringReader br(q, code_size); + uint64_t qhash = br.read(index.b); + HammingComputer hc(q, code_size); + FlipEnumerator fe(index.b, 
index.nflip); + + // loop over neighbors that are at most at nflip bits + do { + uint64_t hash = qhash ^ fe.x; + auto it = index.invlists.find(hash); + + if (it == index.invlists.end()) { + continue; + } + + const IndexBinaryHash::InvertedList& il = it->second; + + size_t nv = il.ids.size(); + + if (nv == 0) { + n0++; + } else { + const uint8_t* codes = il.vecs.data(); + for (size_t i = 0; i < nv; i++) { + int dis = hc.hamming(codes); + res.add(dis, il.ids[i]); + codes += code_size; + } + ndis += nv; + nlist++; + } + } while (fe.next()); +} + +struct Run_search_single_query { + using T = void; + template + T f(Types*... args) { + search_single_query_template(*args...); + } +}; + +template +static void verify_shortlist( + const IndexBinaryFlat* index, + const uint8_t* q, + const std::unordered_set& shortlist, + SearchResults& res) { + size_t code_size = index->code_size; + + HammingComputer hc(q, code_size); + const uint8_t* codes = index->xb.data(); + + for (auto i : shortlist) { + int dis = hc.hamming(codes + i * code_size); + res.add(dis, i); + } +} + +struct Run_verify_shortlist { + using T = void; + template + void f(Types... 
args) { + verify_shortlist(args...); + } +}; + +template +void search_1_query_multihash( + const IndexBinaryMultiHash& index, + const uint8_t* xi, + SearchResults& res, + size_t& n0, + size_t& nlist, + size_t& ndis) { + std::unordered_set shortlist; + int b = index.b; + + BitstringReader br(xi, index.code_size); + for (int h = 0; h < index.nhash; h++) { + uint64_t qhash = br.read(b); + const IndexBinaryMultiHash::Map& map = index.maps[h]; + + FlipEnumerator fe(index.b, index.nflip); + // loop over neighbors that are at most at nflip bits + do { + uint64_t hash = qhash ^ fe.x; + auto it = map.find(hash); + + if (it != map.end()) { + const std::vector& v = it->second; + for (auto i : v) { + shortlist.insert(i); + } + nlist++; + } else { + n0++; + } + } while (fe.next()); + } + ndis += shortlist.size(); + + // verify shortlist + Run_verify_shortlist r; + dispatch_HammingComputer( + index.code_size, r, index.storage, xi, shortlist, res); +} + +} // anonymous namespace + +// --- IndexBinaryHash entry points --- + +template <> +void binary_hash_knn_search_dispatch( + const IndexBinaryHash& index, + const uint8_t* q, + idx_t k, + int32_t* heap_sim, + idx_t* heap_ids, + size_t& n0, + size_t& nlist, + size_t& ndis) { + KnnSearchResults res = {k, heap_sim, heap_ids}; + Run_search_single_query r; + dispatch_HammingComputer( + index.code_size, r, &index, &q, &res, &n0, &nlist, &ndis); +} + +template <> +void binary_hash_range_search_dispatch( + const IndexBinaryHash& index, + const uint8_t* q, + int radius, + RangeQueryResult& qres, + size_t& n0, + size_t& nlist, + size_t& ndis) { + RangeSearchResults res = {radius, qres}; + Run_search_single_query r; + dispatch_HammingComputer( + index.code_size, r, &index, &q, &res, &n0, &nlist, &ndis); +} + +// --- IndexBinaryMultiHash entry points --- + +template <> +void binary_multihash_knn_search_dispatch( + const IndexBinaryMultiHash& index, + const uint8_t* q, + idx_t k, + int32_t* heap_sim, + idx_t* heap_ids, + size_t& n0, + size_t& 
nlist, + size_t& ndis) { + KnnSearchResults res = {k, heap_sim, heap_ids}; + search_1_query_multihash(index, q, res, n0, nlist, ndis); +} + +template <> +void binary_multihash_range_search_dispatch( + const IndexBinaryMultiHash& index, + const uint8_t* q, + int radius, + RangeQueryResult& qres, + size_t& n0, + size_t& nlist, + size_t& ndis) { + RangeSearchResults res = {radius, qres}; + search_1_query_multihash(index, q, res, n0, nlist, ndis); +} + +} // namespace faiss diff --git a/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h b/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h new file mode 100644 index 0000000000..55c19ad20e --- /dev/null +++ b/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h @@ -0,0 +1,472 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Per-ISA implementation of Hamming distance computation for + * IndexBinaryIVF. Included once per SIMD TU with THE_SIMD_LEVEL + * set to the desired SIMDLevel. + * + * Contains: IVFBinaryScannerL2, search_knn_hamming_count, + * BlockSearch, BlockSearchVariableK, search_knn_hamming_per_invlist. 
+ */ + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "THE_SIMD_LEVEL must be defined before including this file" +#endif + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { + +namespace { + +template +struct IVFBinaryScannerL2 : BinaryInvertedListScanner { + HammingComputer hc; + size_t code_size; + bool store_pairs; + + IVFBinaryScannerL2(size_t code_size_, bool store_pairs_) + : code_size(code_size_), store_pairs(store_pairs_) {} + + void set_query(const uint8_t* query_vector) override { + hc.set(query_vector, code_size); + } + + idx_t list_no; + void set_list(idx_t list_no_2, uint8_t /* coarse_dis */) override { + this->list_no = list_no_2; + } + + uint32_t distance_to_code(const uint8_t* code) const override { + return hc.hamming(code); + } + + size_t scan_codes( + size_t n, + const uint8_t* __restrict codes, + const idx_t* __restrict ids, + int32_t* __restrict simi, + idx_t* __restrict idxi, + size_t k) const override { + using C = CMax; + + size_t nup = 0; + for (size_t j = 0; j < n; j++) { + uint32_t dis = hc.hamming(codes); + if (dis < static_cast(simi[0])) { + idx_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + heap_replace_top(k, simi, idxi, dis, id); + nup++; + } + codes += code_size; + } + return nup; + } + + void scan_codes_range( + size_t n, + const uint8_t* __restrict codes, + const idx_t* __restrict ids, + int radius, + RangeQueryResult& result) const override { + for (size_t j = 0; j < n; j++) { + uint32_t dis = hc.hamming(codes); + if (dis < static_cast(radius)) { + int64_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; + result.add(dis, id); + } + codes += code_size; + } + } +}; + +template +void search_knn_hamming_count( + const IndexBinaryIVF* ivf, + size_t nx, + const uint8_t* __restrict x, + const idx_t* __restrict keys, + int k, + int32_t* __restrict distances, + idx_t* __restrict labels, + const IVFSearchParameters* params) { + const int nBuckets = ivf->d + 1; + std::vector all_counters(nx * nBuckets, 0); + std::unique_ptr all_ids_per_dis(new idx_t[nx * nBuckets * k]); + + idx_t nprobe = params ? params->nprobe : ivf->nprobe; + nprobe = std::min((idx_t)ivf->nlist, nprobe); + idx_t max_codes = params ? params->max_codes : ivf->max_codes; + + std::vector> cs; + for (size_t i = 0; i < nx; ++i) { + cs.push_back( + HCounterState( + all_counters.data() + i * nBuckets, + all_ids_per_dis.get() + i * nBuckets * k, + x + i * ivf->code_size, + ivf->d, + k)); + } + + size_t nlistv = 0, ndis = 0; + +#pragma omp parallel for reduction(+ : nlistv, ndis) + for (int64_t i = 0; i < static_cast(nx); i++) { + const idx_t* keysi = keys + i * nprobe; + HCounterState& csi = cs[i]; + + size_t nscan = 0; + + for (idx_t ik = 0; ik < nprobe; ik++) { + idx_t key = keysi[ik]; /* select the list */ + if (key < 0) { + // not enough centroids for multiprobe + continue; + } + FAISS_THROW_IF_NOT_FMT( + key < (idx_t)ivf->nlist, + "Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n", + key, + static_cast(ik), + ivf->nlist); + + nlistv++; + size_t list_size = ivf->invlists->list_size(key); + size_t list_size_max = static_cast(max_codes) - nscan; + if (list_size > list_size_max) { + list_size = list_size_max; + } + InvertedLists::ScopedCodes scodes(ivf->invlists, key); + const uint8_t* list_vecs = scodes.get(); + const idx_t* ids = + store_pairs ? nullptr : ivf->invlists->get_ids(key); + + for (size_t j = 0; j < list_size; j++) { + const uint8_t* yj = list_vecs + ivf->code_size * j; + + idx_t id = store_pairs ? 
(key << 32 | j) : ids[j]; + csi.update_counter(yj, id); + } + if (ids) { + ivf->invlists->release_ids(key, ids); + } + + nscan += list_size; + if (nscan >= static_cast(max_codes)) { + break; + } + } + ndis += nscan; + + int nres = 0; + for (int b = 0; b < nBuckets && nres < k; b++) { + for (int l = 0; l < csi.counters[b] && nres < k; l++) { + labels[i * k + nres] = csi.ids_per_dis[b * k + l]; + distances[i * k + nres] = b; + nres++; + } + } + while (nres < k) { + labels[i * k + nres] = -1; + distances[i * k + nres] = std::numeric_limits::max(); + ++nres; + } + } + + indexIVF_stats.nq += nx; + indexIVF_stats.nlist += nlistv; + indexIVF_stats.ndis += ndis; +} + +/* Manages NQ queries at a time, stores results */ +template +struct BlockSearch { + HammingComputer hcs[NQ]; + // heaps to update for each query + int32_t* distances[NQ]; + idx_t* labels[NQ]; + // curent top of heap + int32_t heap_tops[NQ]; + + BlockSearch( + size_t code_size, + const uint8_t* __restrict x, + const int32_t* __restrict keys, + int32_t* __restrict all_distances, + idx_t* __restrict all_labels) { + for (idx_t q = 0; q < NQ; q++) { + idx_t qno = keys[q]; + hcs[q] = HammingComputer(x + qno * code_size, code_size); + distances[q] = all_distances + qno * K; + labels[q] = all_labels + qno * K; + heap_tops[q] = distances[q][0]; + } + } + + void add_bcode(const uint8_t* bcode, idx_t id) { + using C = CMax; + for (int q = 0; q < NQ; q++) { + int dis = hcs[q].hamming(bcode); + if (dis < heap_tops[q]) { + heap_replace_top(K, distances[q], labels[q], dis, id); + heap_tops[q] = distances[q][0]; + } + } + } +}; + +template +struct BlockSearchVariableK { + int k; + HammingComputer hcs[NQ]; + // heaps to update for each query + int32_t* distances[NQ]; + idx_t* labels[NQ]; + // curent top of heap + int32_t heap_tops[NQ]; + + BlockSearchVariableK( + size_t code_size, + int k_, + const uint8_t* __restrict x, + const int32_t* __restrict keys, + int32_t* __restrict all_distances, + idx_t* __restrict all_labels) + 
: k(k_) { + for (idx_t q = 0; q < NQ; q++) { + idx_t qno = keys[q]; + hcs[q] = HammingComputer(x + qno * code_size, code_size); + distances[q] = all_distances + qno * k; + labels[q] = all_labels + qno * k; + heap_tops[q] = distances[q][0]; + } + } + + void add_bcode(const uint8_t* bcode, idx_t id) { + using C = CMax; + for (int q = 0; q < NQ; q++) { + int dis = hcs[q].hamming(bcode); + if (dis < heap_tops[q]) { + heap_replace_top(k, distances[q], labels[q], dis, id); + heap_tops[q] = distances[q][0]; + } + } + } +}; + +template +void search_knn_hamming_per_invlist( + const IndexBinaryIVF* ivf, + size_t n, + const uint8_t* __restrict x, + idx_t k, + const idx_t* __restrict keys_in, + const int32_t* __restrict /* coarse_dis */, + int32_t* __restrict distances, + idx_t* __restrict labels, + bool store_pairs, + const IVFSearchParameters* params) { + idx_t nprobe = params ? params->nprobe : ivf->nprobe; + nprobe = std::min((idx_t)ivf->nlist, nprobe); + idx_t max_codes = params ? params->max_codes : ivf->max_codes; + FAISS_THROW_IF_NOT(max_codes == 0); + FAISS_THROW_IF_NOT(!store_pairs); + + // reorder buckets + std::vector lims(n + 1); + int32_t* keys = new int32_t[n * nprobe]; + std::unique_ptr delete_keys(keys); + for (size_t i = 0; i < n * static_cast(nprobe); i++) { + keys[i] = keys_in[i]; + } + matrix_bucket_sort_inplace(n, nprobe, keys, ivf->nlist, lims.data(), 0); + + using C = CMax; + heap_heapify(n * k, distances, labels); + const size_t code_size = ivf->code_size; + + for (size_t l = 0; l < ivf->nlist; l++) { + idx_t l0 = lims[l], nq = lims[l + 1] - l0; + + InvertedLists::ScopedCodes scodes(ivf->invlists, l); + InvertedLists::ScopedIds sidx(ivf->invlists, l); + idx_t nb = ivf->invlists->list_size(l); + const uint8_t* bcodes = scodes.get(); + const idx_t* ids = sidx.get(); + + idx_t i = 0; + + // process as much as possible by blocks + constexpr int BS = 4; + + if (k == 1) { + for (; i + BS <= nq; i += BS) { + BlockSearch bc( + code_size, x, keys + l0 + i, 
distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } else if (k == 2) { + for (; i + BS <= nq; i += BS) { + BlockSearch bc( + code_size, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } else if (k == 4) { + for (; i + BS <= nq; i += BS) { + BlockSearch bc( + code_size, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } else { + for (; i + BS <= nq; i += BS) { + BlockSearchVariableK bc( + code_size, k, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } + + // leftovers + for (; i < nq; i++) { + idx_t qno = keys[l0 + i]; + HammingComputer hc(x + qno * code_size, code_size); + idx_t* __restrict idxi = labels + qno * k; + int32_t* __restrict simi = distances + qno * k; + int32_t simi0 = simi[0]; + for (idx_t j = 0; j < nb; j++) { + int dis = hc.hamming(bcodes + j * code_size); + + if (dis < simi0) { + idx_t id = store_pairs ? lo_build(l, j) : ids[j]; + heap_replace_top(k, simi, idxi, dis, id); + simi0 = simi[0]; + } + } + } + } + for (size_t i = 0; i < n; i++) { + heap_reorder(k, distances + i * k, labels + i * k); + } +} + +struct Run_search_knn_hamming_per_invlist { + using T = void; + + template + void f(Types... args) { + search_knn_hamming_per_invlist(args...); + } +}; + +template +struct Run_search_knn_hamming_count { + using T = void; + + template + void f(Types... 
args) { + search_knn_hamming_count(args...); + } +}; + +struct BuildScannerBinaryIVF { + using T = BinaryInvertedListScanner*; + + template + T f(size_t code_size, bool store_pairs) { + return new IVFBinaryScannerL2(code_size, store_pairs); + } +}; + +} // anonymous namespace + +// --- Entry points --- + +template <> +BinaryInvertedListScanner* make_binary_ivf_scanner_dispatch( + size_t code_size, + bool store_pairs) { + BuildScannerBinaryIVF bs; + return dispatch_HammingComputer(code_size, bs, code_size, store_pairs); +} + +template <> +void search_knn_hamming_per_invlist_dispatch( + int code_size, + const IndexBinaryIVF* ivf, + size_t n, + const uint8_t* x, + idx_t k, + const idx_t* keys_in, + const int32_t* coarse_dis, + int32_t* distances, + idx_t* labels, + bool store_pairs, + const IVFSearchParameters* params) { + Run_search_knn_hamming_per_invlist r; + dispatch_HammingComputer( + code_size, + r, + ivf, + n, + x, + k, + keys_in, + coarse_dis, + distances, + labels, + store_pairs, + params); +} + +template <> +void search_knn_hamming_count_dispatch( + int code_size, + bool store_pairs, + const IndexBinaryIVF* ivf, + size_t nx, + const uint8_t* x, + const idx_t* keys, + int k, + int32_t* distances, + idx_t* labels, + const IVFSearchParameters* params) { + if (store_pairs) { + Run_search_knn_hamming_count r; + dispatch_HammingComputer( + code_size, r, ivf, nx, x, keys, k, distances, labels, params); + } else { + Run_search_knn_hamming_count r; + dispatch_HammingComputer( + code_size, r, ivf, nx, x, keys, k, distances, labels, params); + } +} + +} // namespace faiss diff --git a/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h b/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h new file mode 100644 index 0000000000..b7c8abd26b --- /dev/null +++ b/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Per-ISA implementation of Hamming distance computation for + * IndexIVFSpectralHash. Included once per SIMD TU with THE_SIMD_LEVEL + * set to the desired SIMDLevel. + */ + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "THE_SIMD_LEVEL must be defined before including this file" +#endif + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace faiss { + +namespace { + +void binarize_with_freq_impl( + size_t nbit, + float freq, + const float* x, + const float* c, + uint8_t* codes) { + memset(codes, 0, (nbit + 7) / 8); + for (size_t i = 0; i < nbit; i++) { + float xf = (x[i] - c[i]); + int64_t xi = int64_t(floor(xf * freq)); + int64_t bit = xi & 1; + codes[i >> 3] |= bit << (i & 7); + } +} + +template +struct IVFScanner : InvertedListScanner { + using InvertedListScanner::scan_codes; + const IndexIVFSpectralHash* index; + size_t nbit; + + float period, freq; + std::vector q; + std::vector zero; + std::vector qcode; + HammingComputer hc; + + IVFScanner(const IndexIVFSpectralHash* index_in, bool store_pairs_in) + : index(index_in), + nbit(index_in->nbit), + period(index_in->period), + freq(2.0 / index_in->period), + q(nbit), + zero(nbit), + qcode(index_in->code_size), + hc(qcode.data(), index_in->code_size) { + this->store_pairs = store_pairs_in; + this->code_size = index->code_size; + this->keep_max = is_similarity_metric(index->metric_type); + } + + void set_query(const float* query) override { + FAISS_THROW_IF_NOT(query); + FAISS_THROW_IF_NOT(q.size() == static_cast(nbit)); + index->vt->apply_noalloc(1, query, q.data()); + + if (index->threshold_type == IndexIVFSpectralHash::Thresh_global) { + binarize_with_freq_impl( + nbit, freq, q.data(), zero.data(), qcode.data()); + hc.set(qcode.data(), code_size); + } + } + + void set_list(idx_t list_no_in, float /*coarse_dis*/) 
override { + this->list_no = list_no_in; + if (index->threshold_type != IndexIVFSpectralHash::Thresh_global) { + const float* c = index->trained.data() + list_no_in * nbit; + binarize_with_freq_impl(nbit, freq, q.data(), c, qcode.data()); + hc.set(qcode.data(), code_size); + } + } + + float distance_to_code(const uint8_t* code) const final { + return hc.hamming(code); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k) const override { + size_t nup = 0; + for (size_t j = 0; j < list_size; j++) { + float dis = hc.hamming(codes); + + if (dis < simi[0]) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + maxheap_replace_top(k, simi, idxi, dis, id); + nup++; + } + codes += code_size; + } + return nup; + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const idx_t* ids, + float radius, + RangeQueryResult& result) const override { + for (size_t j = 0; j < list_size; j++) { + float dis = hc.hamming(codes); + if (dis < radius) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + result.add(dis, id); + } + codes += code_size; + } + } +}; + +struct BuildScannerSpectralHash { + using T = InvertedListScanner*; + + template + static T f(const IndexIVFSpectralHash* index, bool store_pairs) { + return new IVFScanner(index, store_pairs); + } +}; + +} // anonymous namespace + +template <> +InvertedListScanner* make_spectral_hash_scanner_dispatch( + int code_size, + const IndexIVFSpectralHash* index, + bool store_pairs) { + BuildScannerSpectralHash bs; + return dispatch_HammingComputer(code_size, bs, index, store_pairs); +} + +} // namespace faiss diff --git a/faiss/impl/binary_hamming/IndexPQ_impl.h b/faiss/impl/binary_hamming/IndexPQ_impl.h new file mode 100644 index 0000000000..09cbf9a1eb --- /dev/null +++ b/faiss/impl/binary_hamming/IndexPQ_impl.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Per-ISA implementation of polysemous Hamming inner loop for IndexPQ. + * Included once per SIMD TU with THE_SIMD_LEVEL set to the desired + * SIMDLevel. + */ + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "THE_SIMD_LEVEL must be defined before including this file" +#endif + +#include + +#include +#include +#include +#include + +namespace faiss { + +namespace { + +template +size_t polysemous_inner_loop( + const IndexPQ* index, + const float* dis_table_qi, + const uint8_t* q_code, + size_t k, + float* heap_dis, + int64_t* heap_ids, + int ht) { + size_t M = index->pq.M; + size_t code_size = index->pq.code_size; + size_t ksub = index->pq.ksub; + size_t ntotal = index->ntotal; + + const uint8_t* b_code = index->codes.data(); + + size_t n_pass_i = 0; + + HammingComputer hc(q_code, code_size); + + for (int64_t bi = 0; bi < static_cast(ntotal); bi++) { + int hd = hc.hamming(b_code); + + if (hd < ht) { + n_pass_i++; + + float dis = 0; + const float* dis_table = dis_table_qi; + for (size_t m = 0; m < M; m++) { + dis += dis_table[b_code[m]]; + dis_table += ksub; + } + + if (dis < heap_dis[0]) { + maxheap_replace_top(k, heap_dis, heap_ids, dis, bi); + } + } + b_code += code_size; + } + return n_pass_i; +} + +struct Run_polysemous_inner_loop { + using T = size_t; + template + size_t f(Types... 
args) { + return polysemous_inner_loop(args...); + } +}; + +} // anonymous namespace + +template <> +size_t polysemous_inner_loop_dispatch( + int code_size, + const IndexPQ* index, + const float* dis_table_qi, + const uint8_t* q_code, + size_t k, + float* heap_dis, + int64_t* heap_ids, + int ht) { + Run_polysemous_inner_loop r; + return dispatch_HammingComputer( + code_size, + r, + index, + dis_table_qi, + q_code, + k, + heap_dis, + heap_ids, + ht); +} + +} // namespace faiss diff --git a/faiss/impl/binary_hamming/avx2.cpp b/faiss/impl/binary_hamming/avx2.cpp new file mode 100644 index 0000000000..af39b0f7bc --- /dev/null +++ b/faiss/impl/binary_hamming/avx2.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifdef COMPILE_SIMD_AVX2 + +#define THE_SIMD_LEVEL SIMDLevel::AVX2 + +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include + +#endif // COMPILE_SIMD_AVX2 diff --git a/faiss/impl/binary_hamming/dispatch.h b/faiss/impl/binary_hamming/dispatch.h new file mode 100644 index 0000000000..331739e808 --- /dev/null +++ b/faiss/impl/binary_hamming/dispatch.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include +#include + +namespace faiss { + +// Forward declarations +struct DistanceComputer; +struct InvertedListScanner; +struct BinaryInvertedListScanner; +struct IndexBinaryFlat; +struct IndexBinaryHash; +struct IndexBinaryMultiHash; +struct IndexBinaryIVF; +struct IndexIVFSpectralHash; +struct IndexPQ; +struct SearchParametersIVF; +using IVFSearchParameters = SearchParametersIVF; +struct RangeQueryResult; +using idx_t = int64_t; + +/** @name IndexBinaryHNSW dispatch + * @{ */ +template +DistanceComputer* make_binary_hnsw_distance_computer_dispatch( + int code_size, + IndexBinaryFlat* flat_storage); +/** @} */ + +/** @name IndexBinaryIVF dispatch + * @{ */ +template +BinaryInvertedListScanner* make_binary_ivf_scanner_dispatch( + size_t code_size, + bool store_pairs); + +template +void search_knn_hamming_per_invlist_dispatch( + int code_size, + const IndexBinaryIVF* ivf, + size_t n, + const uint8_t* x, + idx_t k, + const idx_t* keys_in, + const int32_t* coarse_dis, + int32_t* distances, + idx_t* labels, + bool store_pairs, + const IVFSearchParameters* params); + +template +void search_knn_hamming_count_dispatch( + int code_size, + bool store_pairs, + const IndexBinaryIVF* ivf, + size_t nx, + const uint8_t* x, + const idx_t* keys, + int k, + int32_t* distances, + idx_t* labels, + const IVFSearchParameters* params); +/** @} */ + +/** @name IndexBinaryHash dispatch + * @{ */ +template +void binary_hash_knn_search_dispatch( + const IndexBinaryHash& index, + const uint8_t* q, + idx_t k, + int32_t* heap_sim, + idx_t* heap_ids, + size_t& n0, + size_t& nlist, + size_t& ndis); + +template +void binary_hash_range_search_dispatch( + const IndexBinaryHash& index, + const uint8_t* q, + int radius, + RangeQueryResult& qres, + size_t& n0, + size_t& nlist, + size_t& ndis); + +template +void binary_multihash_knn_search_dispatch( + const IndexBinaryMultiHash& index, + const uint8_t* q, + idx_t k, + int32_t* heap_sim, + idx_t* heap_ids, + size_t& 
n0, + size_t& nlist, + size_t& ndis); + +template +void binary_multihash_range_search_dispatch( + const IndexBinaryMultiHash& index, + const uint8_t* q, + int radius, + RangeQueryResult& qres, + size_t& n0, + size_t& nlist, + size_t& ndis); +/** @} */ + +/** @name IndexIVFSpectralHash dispatch + * @{ */ +template +InvertedListScanner* make_spectral_hash_scanner_dispatch( + int code_size, + const IndexIVFSpectralHash* index, + bool store_pairs); +/** @} */ + +/** @name IndexPQ polysemous dispatch + * @{ */ +template +size_t polysemous_inner_loop_dispatch( + int code_size, + const IndexPQ* index, + const float* dis_table_qi, + const uint8_t* q_code, + size_t k, + float* heap_dis, + int64_t* heap_ids, + int ht); +/** @} */ + +} // namespace faiss diff --git a/faiss/impl/binary_hamming/neon.cpp b/faiss/impl/binary_hamming/neon.cpp new file mode 100644 index 0000000000..0ed1d2db88 --- /dev/null +++ b/faiss/impl/binary_hamming/neon.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifdef COMPILE_SIMD_ARM_NEON + +#define THE_SIMD_LEVEL SIMDLevel::ARM_NEON + +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include + +#endif // COMPILE_SIMD_ARM_NEON diff --git a/faiss/impl/platform_macros.h b/faiss/impl/platform_macros.h index b7aa4111e3..8f18d98e54 100644 --- a/faiss/impl/platform_macros.h +++ b/faiss/impl/platform_macros.h @@ -126,9 +126,11 @@ inline int __builtin_clzll(uint64_t x) { #ifdef SWIG #define ALIGNED(x) #define FAISS_PACKED +#define FAISS_MAYBE_UNUSED #else #define ALIGNED(x) __attribute__((aligned(x))) #define FAISS_PACKED __attribute__((packed)) +#define FAISS_MAYBE_UNUSED [[maybe_unused]] #endif // On non-Windows, FAISS_PACKED handles packing, so these are no-ops diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index 3c7931aa23..d02cd6c962 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -23,348 +23,28 @@ #include -#include #include -#include -#include +#include -#include #include -#include -#include -#include +#include #include +// Scalar (NONE) fallback — compiled without SIMD flags, hamdis-inl.h +// selects generic-inl.h via the preprocessor ISA ladder. 
+#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { size_t hamming_batch_size = 65536; -template -void hammings( - const uint64_t* __restrict bs1, - const uint64_t* __restrict bs2, - size_t n1, - size_t n2, - hamdis_t* __restrict dis) - -{ - size_t i, j; - const size_t nwords = nbits / 64; - for (i = 0; i < n1; i++) { - const uint64_t* __restrict bs1_ = bs1 + i * nwords; - hamdis_t* __restrict dis_ = dis + i * n2; - for (j = 0; j < n2; j++) { - dis_[j] = hamming(bs1_, bs2 + j * nwords); - } - } -} - -void hammings( - const uint64_t* __restrict bs1, - const uint64_t* __restrict bs2, - size_t n1, - size_t n2, - size_t nbits, - hamdis_t* __restrict dis) { - size_t i, j; - const size_t nwords = nbits / 64; - for (i = 0; i < n1; i++) { - const uint64_t* __restrict bs1_ = bs1 + i * nwords; - hamdis_t* __restrict dis_ = dis + i * n2; - for (j = 0; j < n2; j++) { - dis_[j] = hamming(bs1_, bs2 + j * nwords, nwords); - } - } -} - -/* Count number of matches given a max threshold */ -template -void hamming_count_thres( - const uint64_t* __restrict bs1, - const uint64_t* __restrict bs2, - size_t n1, - size_t n2, - hamdis_t ht, - size_t* __restrict nptr) { - const size_t nwords = nbits / 64; - size_t i, j, posm = 0; - const uint64_t* bs2_ = bs2; - - for (i = 0; i < n1; i++) { - bs2 = bs2_; - for (j = 0; j < n2; j++) { - /* collect the match only if this satisfies the threshold */ - if (hamming(bs1, bs2) <= ht) { - posm++; - } - bs2 += nwords; - } - bs1 += nwords; /* next signature */ - } - *nptr = posm; -} - -template -void crosshamming_count_thres( - const uint64_t* __restrict dbs, - size_t n, - int ht, - size_t* __restrict nptr) { - const size_t nwords = nbits / 64; - size_t i, j, posm = 0; - const uint64_t* bs1 = dbs; - for (i = 0; i < n; i++) { - const uint64_t* bs2 = bs1 + 2; - for (j = i + 1; j < n; j++) { - /* collect the match only if this satisfies the threshold */ - if (hamming(bs1, 
bs2) <= ht) { - posm++; - } - bs2 += nwords; - } - bs1 += nwords; - } - *nptr = posm; -} - -template -size_t match_hamming_thres( - const uint64_t* __restrict bs1, - const uint64_t* __restrict bs2, - size_t n1, - size_t n2, - int ht, - int64_t* __restrict idx, - hamdis_t* __restrict hams) { - const size_t nwords = nbits / 64; - size_t i, j, posm = 0; - hamdis_t h; - const uint64_t* bs2_ = bs2; - for (i = 0; i < n1; i++) { - bs2 = bs2_; - for (j = 0; j < n2; j++) { - /* Here perform the real work of computing the distance */ - h = hamming(bs1, bs2); - - /* collect the match only if this satisfies the threshold */ - if (h <= ht) { - /* Enough space to store another match ? */ - *idx = i; - idx++; - *idx = j; - idx++; - *hams = h; - hams++; - posm++; - } - bs2 += nwords; /* next signature */ - } - bs1 += nwords; - } - return posm; -} - -namespace { - -/* Return closest neighbors w.r.t Hamming distance, using a heap. */ -template -void hammings_knn_hc( - int bytes_per_code, - int_maxheap_array_t* __restrict ha, - const uint8_t* __restrict bs1, - const uint8_t* __restrict bs2, - size_t n2, - bool order = true, - bool init_heap = true, - ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK, - const faiss::IDSelector* sel = nullptr) { - size_t k = ha->k; - if (init_heap) { - ha->heapify(); - } +/****************************************************************** + * Scalar utility functions (no SIMD, no dispatch needed) + ******************************************************************/ - const size_t block_size = hamming_batch_size; - for (size_t j0 = 0; j0 < n2; j0 += block_size) { - const size_t j1 = std::min(j0 + block_size, n2); -#pragma omp parallel for - for (int64_t i = 0; i < static_cast(ha->nh); i++) { - HammingComputer hc(bs1 + i * bytes_per_code, bytes_per_code); - - const uint8_t* __restrict bs2_ = bs2 + j0 * bytes_per_code; - hamdis_t dis; - hamdis_t* __restrict bh_val_ = ha->val + i * k; - int64_t* __restrict bh_ids_ = ha->ids + i * k; - - // 
if larger number of k is required, then ::bs_addn() needs to be - // used instead of ::addn() -#define HANDLE_APPROX(NB, BD) \ - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ - FAISS_THROW_IF_NOT_FMT( \ - k <= NB * BD, \ - "The chosen mode (%d) of approximate top-k supports " \ - "up to %d values, but %zd is requested.", \ - (int)(ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD), \ - NB * BD, \ - k); \ - HeapWithBucketsForHamming32< \ - CMax, \ - NB, \ - BD, \ - HammingComputer>:: \ - addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_, sel); \ - break; - - switch (approx_topk_mode) { - HANDLE_APPROX(8, 3) - HANDLE_APPROX(8, 2) - HANDLE_APPROX(16, 2) - HANDLE_APPROX(32, 2) - default: { - for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) { - if (sel && !sel->is_member(j)) { - continue; - } - dis = hc.hamming(bs2_); - if (dis < bh_val_[0]) { - faiss::maxheap_replace_top( - k, bh_val_, bh_ids_, dis, j); - } - } - } break; - } - } - } - if (order) { - ha->reorder(); - } -} - -/* Return closest neighbors w.r.t Hamming distance, using max count. 
*/ -template -void hammings_knn_mc( - int bytes_per_code, - const uint8_t* __restrict a, - const uint8_t* __restrict b, - size_t na, - size_t nb, - size_t k, - int32_t* __restrict distances, - int64_t* __restrict labels, - const faiss::IDSelector* sel) { - const int nBuckets = bytes_per_code * 8 + 1; - std::vector all_counters(na * nBuckets, 0); - std::unique_ptr all_ids_per_dis(new int64_t[na * nBuckets * k]); - - std::vector> cs; - for (size_t i = 0; i < na; ++i) { - cs.push_back( - HCounterState( - all_counters.data() + i * nBuckets, - all_ids_per_dis.get() + i * nBuckets * k, - a + i * bytes_per_code, - 8 * bytes_per_code, - k)); - } - - const size_t block_size = hamming_batch_size; - for (size_t j0 = 0; j0 < nb; j0 += block_size) { - const size_t j1 = std::min(j0 + block_size, nb); -#pragma omp parallel for - for (int64_t i = 0; i < static_cast(na); ++i) { - for (size_t j = j0; j < j1; ++j) { - if (!sel || sel->is_member(j)) { - cs[i].update_counter(b + j * bytes_per_code, j); - } - } - } - } - - for (size_t i = 0; i < na; ++i) { - HCounterState& csi = cs[i]; - - size_t nres = 0; - for (int b_2 = 0; b_2 < nBuckets && nres < k; b_2++) { - for (int l = 0; l < csi.counters[b_2] && nres < k; l++) { - labels[i * k + nres] = csi.ids_per_dis[b_2 * k + l]; - distances[i * k + nres] = b_2; - nres++; - } - } - while (nres < k) { - labels[i * k + nres] = -1; - distances[i * k + nres] = std::numeric_limits::max(); - ++nres; - } - } -} - -template -void hamming_range_search( - const uint8_t* a, - const uint8_t* b, - size_t na, - size_t nb, - int radius, - size_t code_size, - RangeSearchResult* res, - const faiss::IDSelector* sel) { -#pragma omp parallel - { - RangeSearchPartialResult pres(res); - -#pragma omp for - for (int64_t i = 0; i < static_cast(na); i++) { - HammingComputer hc(a + i * code_size, code_size); - const uint8_t* yi = b; - RangeQueryResult& qres = pres.new_result(i); - - for (size_t j = 0; j < nb; j++) { - if (!sel || sel->is_member(j)) { - int dis = 
hc.hamming(yi); - if (dis < radius) { - qres.add(dis, j); - } - } - yi += code_size; - } - } - pres.finalize(); - } -} - -struct Run_hammings_knn_hc { - using T = void; - template - void f(Types... args) { - hammings_knn_hc(args...); - } -}; - -struct Run_hammings_knn_mc { - using T = void; - template - void f(Types... args) { - hammings_knn_mc(args...); - } -}; - -struct Run_hamming_range_search { - using T = void; - template - void f(Types... args) { - hamming_range_search(args...); - } -}; - -} // namespace - -/* Functions to maps vectors to bits. Assume proper allocation done beforehand, - meaning that b should be be able to receive as many bits as x may produce. */ - -/* - * dimension 0 corresponds to the least significant bit of b[0], or - * equivalently to the lsb of the first byte that is stored. - */ void fvec2bitvec(const float* __restrict x, uint8_t* __restrict b, size_t d) { for (size_t i = 0; i < d; i += 8) { uint8_t w = 0; @@ -381,8 +61,6 @@ void fvec2bitvec(const float* __restrict x, uint8_t* __restrict b, size_t d) { } } -/* Same but for n vectors. - Ensure that the output b is byte-aligned (pad with 0s). 
*/ void fvecs2bitvecs( const float* __restrict x, uint8_t* __restrict b, @@ -407,7 +85,6 @@ void bitvecs2fvecs( } } -/* Reverse bit (NOT a optimized function, only used for print purpose) */ static uint64_t uint64_reverse_bits(uint64_t b) { int i; uint64_t revb = 0; @@ -419,7 +96,6 @@ static uint64_t uint64_reverse_bits(uint64_t b) { return revb; } -/* print the bit vector */ void bitvec_print(const uint8_t* b, size_t d) { size_t i, j; for (i = 0; i < d;) { @@ -459,12 +135,10 @@ void bitvec_shuffle( } } -/*----------------------------------------*/ -/* Hamming distance computation and k-nn */ +/****************************************************************** + * Dispatched Hamming distance public API + ******************************************************************/ -#define C64(x) ((uint64_t*)x) - -/* Compute a set of Hamming distances */ void hammings( const uint8_t* __restrict a, const uint8_t* __restrict b, @@ -472,24 +146,9 @@ void hammings( size_t nb, size_t ncodes, hamdis_t* __restrict dis) { - FAISS_THROW_IF_NOT(ncodes % 8 == 0); - switch (ncodes) { - case 8: - faiss::hammings<64>(C64(a), C64(b), na, nb, dis); - return; - case 16: - faiss::hammings<128>(C64(a), C64(b), na, nb, dis); - return; - case 32: - faiss::hammings<256>(C64(a), C64(b), na, nb, dis); - return; - case 64: - faiss::hammings<512>(C64(a), C64(b), na, nb, dis); - return; - default: - faiss::hammings(C64(a), C64(b), na, nb, ncodes * 8, dis); - return; - } + with_simd_level_256bit([&]() { + hammings_dispatch(a, b, na, nb, ncodes, dis); + }); } void hammings_knn( @@ -511,19 +170,10 @@ void hammings_knn_hc( int order, ApproxTopK_mode_t approx_topk_mode, const faiss::IDSelector* sel) { - Run_hammings_knn_hc r; - dispatch_HammingComputer( - ncodes, - r, - ncodes, - ha, - a, - b, - nb, - order, - true, - approx_topk_mode, - sel); + with_simd_level_256bit([&]() { + hammings_knn_hc_dispatch( + ha, a, b, nb, ncodes, order, approx_topk_mode, sel); + }); } void hammings_knn_mc( @@ -536,9 +186,10 
@@ void hammings_knn_mc( int32_t* __restrict distances, int64_t* __restrict labels, const faiss::IDSelector* sel) { - Run_hammings_knn_mc r; - dispatch_HammingComputer( - ncodes, r, ncodes, a, b, na, nb, k, distances, labels, sel); + with_simd_level_256bit([&]() { + hammings_knn_mc_dispatch( + a, b, na, nb, k, ncodes, distances, labels, sel); + }); } void hamming_range_search( @@ -550,12 +201,12 @@ void hamming_range_search( size_t code_size, RangeSearchResult* result, const faiss::IDSelector* sel) { - Run_hamming_range_search r; - dispatch_HammingComputer( - code_size, r, a, b, na, nb, radius, code_size, result, sel); + with_simd_level_256bit([&]() { + hamming_range_search_dispatch( + a, b, na, nb, radius, code_size, result, sel); + }); } -/* Count number of matches given a max threshold */ void hamming_count_thres( const uint8_t* bs1, const uint8_t* bs2, @@ -564,54 +215,22 @@ void hamming_count_thres( hamdis_t ht, size_t ncodes, size_t* nptr) { - switch (ncodes) { - case 8: - faiss::hamming_count_thres<64>( - C64(bs1), C64(bs2), n1, n2, ht, nptr); - return; - case 16: - faiss::hamming_count_thres<128>( - C64(bs1), C64(bs2), n1, n2, ht, nptr); - return; - case 32: - faiss::hamming_count_thres<256>( - C64(bs1), C64(bs2), n1, n2, ht, nptr); - return; - case 64: - faiss::hamming_count_thres<512>( - C64(bs1), C64(bs2), n1, n2, ht, nptr); - return; - default: - FAISS_THROW_FMT("not implemented for %zu bits", ncodes); - } + with_simd_level_256bit([&]() { + hamming_count_thres_dispatch(bs1, bs2, n1, n2, ht, ncodes, nptr); + }); } -/* Count number of cross-matches given a threshold */ void crosshamming_count_thres( const uint8_t* dbs, size_t n, hamdis_t ht, size_t ncodes, size_t* nptr) { - switch (ncodes) { - case 8: - faiss::crosshamming_count_thres<64>(C64(dbs), n, ht, nptr); - return; - case 16: - faiss::crosshamming_count_thres<128>(C64(dbs), n, ht, nptr); - return; - case 32: - faiss::crosshamming_count_thres<256>(C64(dbs), n, ht, nptr); - return; - case 64: - 
faiss::crosshamming_count_thres<512>(C64(dbs), n, ht, nptr); - return; - default: - FAISS_THROW_FMT("not implemented for %zu bits", ncodes); - } + with_simd_level_256bit([&]() { + crosshamming_count_thres_dispatch(dbs, n, ht, ncodes, nptr); + }); } -/* Returns all matches given a threshold */ size_t match_hamming_thres( const uint8_t* bs1, const uint8_t* bs2, @@ -621,49 +240,10 @@ size_t match_hamming_thres( size_t ncodes, int64_t* idx, hamdis_t* dis) { - switch (ncodes) { - case 8: - return faiss::match_hamming_thres<64>( - C64(bs1), C64(bs2), n1, n2, ht, idx, dis); - case 16: - return faiss::match_hamming_thres<128>( - C64(bs1), C64(bs2), n1, n2, ht, idx, dis); - case 32: - return faiss::match_hamming_thres<256>( - C64(bs1), C64(bs2), n1, n2, ht, idx, dis); - case 64: - return faiss::match_hamming_thres<512>( - C64(bs1), C64(bs2), n1, n2, ht, idx, dis); - default: - FAISS_THROW_FMT("not implemented for %zu bits", ncodes); - return 0; - } -} - -#undef C64 - -/************************************* - * generalized Hamming distances - ************************************/ - -template -static void hamming_dis_inner_loop( - const uint8_t* __restrict ca, - const uint8_t* __restrict cb, - size_t nb, - size_t code_size, - int k, - hamdis_t* __restrict bh_val_, - int64_t* __restrict bh_ids_) { - HammingComputer hc(ca, code_size); - - for (size_t j = 0; j < nb; j++) { - int ndiff = hc.hamming(cb); - cb += code_size; - if (ndiff < bh_val_[0]) { - maxheap_replace_top(k, bh_val_, bh_ids_, ndiff, j); - } - } + return with_simd_level_256bit([&]() -> size_t { + return match_hamming_thres_dispatch( + bs1, bs2, n1, n2, ht, ncodes, idx, dis); + }); } void generalized_hammings_knn_hc( @@ -673,46 +253,16 @@ void generalized_hammings_knn_hc( size_t nb, size_t code_size, int ordered) { - int na = ha->nh; - int k = ha->k; - - if (ordered) { - ha->heapify(); - } - -#pragma omp parallel for - for (int i = 0; i < na; i++) { - const uint8_t* __restrict ca = a + i * code_size; - const 
uint8_t* __restrict cb = b; - - hamdis_t* __restrict bh_val_ = ha->val + i * k; - int64_t* __restrict bh_ids_ = ha->ids + i * k; - - switch (code_size) { - case 8: - hamming_dis_inner_loop( - ca, cb, nb, 8, k, bh_val_, bh_ids_); - break; - case 16: - hamming_dis_inner_loop( - ca, cb, nb, 16, k, bh_val_, bh_ids_); - break; - case 32: - hamming_dis_inner_loop( - ca, cb, nb, 32, k, bh_val_, bh_ids_); - break; - default: - hamming_dis_inner_loop( - ca, cb, nb, code_size, k, bh_val_, bh_ids_); - break; - } - } - - if (ordered) { - ha->reorder(); - } + with_simd_level_256bit([&]() { + generalized_hammings_knn_hc_dispatch( + ha, a, b, nb, code_size, ordered); + }); } +/****************************************************************** + * Bitstring pack/unpack (scalar, no dispatch needed) + ******************************************************************/ + void pack_bitstrings( size_t n, size_t M, diff --git a/faiss/utils/hamming.h b/faiss/utils/hamming.h index 64dfbf7467..7011572856 100644 --- a/faiss/utils/hamming.h +++ b/faiss/utils/hamming.h @@ -30,6 +30,7 @@ #include #include #include +#include // Low-level Hamming distance computations and hamdis_t. #include @@ -284,6 +285,90 @@ void unpack_bitstrings( size_t code_size, int32_t* unpacked); +/** SIMD-dispatched Hamming function implementations. + * Specializations live in per-ISA TUs (hamming_avx2.cpp, etc.). 
*/ + +template +void hammings_knn_hc_dispatch( + int_maxheap_array_t* ha, + const uint8_t* a, + const uint8_t* b, + size_t nb, + size_t ncodes, + int ordered, + ApproxTopK_mode_t approx_topk_mode, + const IDSelector* sel); + +template +void hammings_knn_mc_dispatch( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + size_t k, + size_t ncodes, + int32_t* distances, + int64_t* labels, + const IDSelector* sel); + +template +void hamming_range_search_dispatch( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + int radius, + size_t code_size, + RangeSearchResult* result, + const IDSelector* sel); + +template +void hammings_dispatch( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + size_t ncodes, + hamdis_t* dis); + +template +void generalized_hammings_knn_hc_dispatch( + int_maxheap_array_t* ha, + const uint8_t* a, + const uint8_t* b, + size_t nb, + size_t code_size, + int ordered); + +template +void hamming_count_thres_dispatch( + const uint8_t* bs1, + const uint8_t* bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t ncodes, + size_t* nptr); + +template +void crosshamming_count_thres_dispatch( + const uint8_t* dbs, + size_t n, + hamdis_t ht, + size_t ncodes, + size_t* nptr); + +template +size_t match_hamming_thres_dispatch( + const uint8_t* bs1, + const uint8_t* bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t ncodes, + int64_t* idx, + hamdis_t* dis); + } // namespace faiss #include diff --git a/faiss/utils/hamming_distance/avx2-inl.h b/faiss/utils/hamming_distance/avx2-inl.h index b7d5f32e66..1ca322c7b2 100644 --- a/faiss/utils/hamming_distance/avx2-inl.h +++ b/faiss/utils/hamming_distance/avx2-inl.h @@ -8,10 +8,11 @@ #ifndef HAMMING_AVX2_INL_H #define HAMMING_AVX2_INL_H -// AVX2 version +// AVX2 version. +// Most code is now in common.h. This file only contains the +// GenHammingComputer classes that leverage SSE/AVX2 intrinsics. 
#include -#include #include #include @@ -20,376 +21,13 @@ namespace faiss { -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - -/* Hamming distances for multiples of 64 bits */ -template -inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { - const size_t nwords = nbits / 64; - size_t i; - hamdis_t h = 0; - for (i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/* specialized (optimized) functions */ -template <> -inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]); -} - -template <> -inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); -} - -template <> -inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + - popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); -} - -/* Hamming distances for multiple of 64 bits */ -inline hamdis_t hamming( - const uint64_t* bs1, - const uint64_t* bs2, - size_t nwords) { - hamdis_t h = 0; - for (size_t i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/****************************************************************** - * The HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. 
- ******************************************************************/ - -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 4); - (void)code_size; - a0 = *(uint32_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint32_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 4; - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 8); - (void)code_size; - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - -struct HammingComputer16 { - uint64_t a0, a1; - - HammingComputer16() {} - - HammingComputer16(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 16); - (void)code_size; - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); - } - - inline static constexpr int get_code_size() { - return 16; - } -}; - -// when applied to an array, 1/2 of the 64-bit accesses are unaligned. -// This incurs a penalty of ~10% wrt. fully aligned accesses. 
-struct HammingComputer20 { - uint64_t a0, a1; - uint32_t a2; - - HammingComputer20() {} - - HammingComputer20(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 20); - (void)code_size; - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(*(uint32_t*)(b + 2) ^ a2); - } - - inline static constexpr int get_code_size() { - return 20; - } -}; - -struct HammingComputer32 { - uint64_t a0, a1, a2, a3; - - HammingComputer32() {} - - HammingComputer32(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 32); - (void)code_size; - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); - } - - inline static constexpr int get_code_size() { - return 32; - } -}; - -struct HammingComputer64 { - uint64_t a0, a1, a2, a3, a4, a5, a6, a7; - - HammingComputer64() {} - - HammingComputer64(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 64); - (void)code_size; - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + - popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + - popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); - } - - inline static constexpr int get_code_size() 
{ - return 64; - } -}; - -struct HammingComputerDefault { - const uint8_t* a8; - int quotient8; - int remainder8; - - HammingComputerDefault() {} - - HammingComputerDefault(const uint8_t* a8_in, int code_size) { - set(a8_in, code_size); - } - - void set(const uint8_t* a8_2, int code_size) { - this->a8 = a8_2; - quotient8 = code_size / 8; - remainder8 = code_size % 8; - } - - int hamming(const uint8_t* b8) const { - int accu = 0; - - const uint64_t* a64 = reinterpret_cast(a8); - const uint64_t* b64 = reinterpret_cast(b8); - int i = 0, len = quotient8; - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - [[fallthrough]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - [[fallthrough]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - [[fallthrough]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - [[fallthrough]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - [[fallthrough]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; - [[fallthrough]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - [[fallthrough]]; - default: - break; - } - } - - return accu; - } - - inline int get_code_size() const { - return quotient8 * 8 + remainder8; - } -}; - 
-/*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. - ***************************************************************************/ - -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - -struct GenHammingComputer8 { - uint64_t a0; - - GenHammingComputer8(const uint8_t* a, int code_size) { - assert(code_size == 8); - (void)code_size; - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return generalized_hamming_64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - // I'm not sure whether this version is faster of slower, tbh // todo: test on different CPUs struct GenHammingComputer16 { __m128i a; - GenHammingComputer16(const uint8_t* a8, int code_size) { + GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); - (void)code_size; a = _mm_loadu_si128((const __m128i_u*)a8); } @@ -408,9 +46,8 @@ struct GenHammingComputer16 { struct GenHammingComputer32 { __m256i a; - GenHammingComputer32(const uint8_t* a8, int code_size) { + GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 32); - (void)code_size; a = _mm256_loadu_si256((const __m256i_u*)a8); } diff --git a/faiss/utils/hamming_distance/avx512-inl.h b/faiss/utils/hamming_distance/avx512-inl.h index dfd8f55bfe..af04ef175d 100644 --- a/faiss/utils/hamming_distance/avx512-inl.h +++ b/faiss/utils/hamming_distance/avx512-inl.h @@ -8,15 +8,13 @@ #ifndef HAMMING_AVX512_INL_H #define HAMMING_AVX512_INL_H -// AVX512 version -// The _mm512_popcnt_epi64 intrinsic is used to accelerate Hamming distance -// calculations in HammingComputerDefault and HammingComputer64. 
This intrinsic -// is not available in the default Faiss avx512 build mode but is only -// available in the avx512_spr build mode, which targets Intel(R) Sapphire -// Rapids. +// AVX512 version. +// Most code is now in common.h. This file provides AVX512-specific +// overrides for HammingComputer64 and HammingComputerDefault +// (using _mm512_popcnt_epi64 when __AVX512VPOPCNTDQ__ is available), +// and the GenHammingComputer classes that leverage SSE/AVX2 intrinsics. #include -#include #include #include @@ -25,199 +23,6 @@ namespace faiss { -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - -/* Hamming distances for multiples of 64 bits */ -template -inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { - const size_t nwords = nbits / 64; - size_t i; - hamdis_t h = 0; - for (i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/* specialized (optimized) functions */ -template <> -inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]); -} - -template <> -inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); -} - -template <> -inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + - popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); -} - -/* Hamming distances for multiple of 64 bits */ -inline hamdis_t hamming( - const uint64_t* bs1, - const uint64_t* bs2, - size_t nwords) { - hamdis_t h = 0; - for (size_t i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/****************************************************************** - * The 
HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. - ******************************************************************/ - -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 4); - a0 = *(uint32_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint32_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 4; - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - -struct HammingComputer16 { - uint64_t a0, a1; - - HammingComputer16() {} - - HammingComputer16(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 16); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); - } - - inline static constexpr int get_code_size() { - return 16; - } -}; - -// when applied to an array, 1/2 of the 64-bit accesses are unaligned. -// This incurs a penalty of ~10% wrt. fully aligned accesses. 
-struct HammingComputer20 { - uint64_t a0, a1; - uint32_t a2; - - HammingComputer20() {} - - HammingComputer20(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 20); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(*(uint32_t*)(b + 2) ^ a2); - } - - inline static constexpr int get_code_size() { - return 20; - } -}; - -struct HammingComputer32 { - uint64_t a0, a1, a2, a3; - - HammingComputer32() {} - - HammingComputer32(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 32); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); - } - - inline static constexpr int get_code_size() { - return 32; - } -}; - struct HammingComputer64 { uint64_t a0, a1, a2, a3, a4, a5, a6, a7; const uint64_t* a; @@ -228,9 +33,9 @@ struct HammingComputer64 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 64); - a = (uint64_t*)a8; + a = reinterpret_cast(a8); a0 = a[0]; a1 = a[1]; a2 = a[2]; @@ -242,7 +47,7 @@ struct HammingComputer64 { } inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; + const uint64_t* b = reinterpret_cast(b8); #ifdef __AVX512VPOPCNTDQ__ __m512i vxor = _mm512_xor_si512(_mm512_loadu_si512(a), _mm512_loadu_si512(b)); @@ -269,8 +74,8 @@ struct HammingComputerDefault { HammingComputerDefault() {} - HammingComputerDefault(const uint8_t* a8, int code_size) { - 
set(a8, code_size); + HammingComputerDefault(const uint8_t* a8_in, int code_size) { + set(a8_in, code_size); } void set(const uint8_t* a8_2, int code_size) { @@ -298,73 +103,8 @@ struct HammingComputerDefault { } i *= 8; #endif - int len = quotient8 - i; - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - [[fallthrough]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - [[fallthrough]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - [[fallthrough]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - [[fallthrough]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - [[fallthrough]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; - [[fallthrough]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - [[fallthrough]]; - default: - break; - } - } - + accu += hamming_popcount_tail( + a64, b64, i, quotient8, a8, b8, remainder8); return accu; } @@ -373,42 +113,12 @@ struct HammingComputerDefault { } }; -/*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. 
- ***************************************************************************/ - -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - -struct GenHammingComputer8 { - uint64_t a0; - - GenHammingComputer8(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return generalized_hamming_64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - // I'm not sure whether this version is faster of slower, tbh // todo: test on different CPUs struct GenHammingComputer16 { __m128i a; - GenHammingComputer16(const uint8_t* a8, int code_size) { + GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); a = _mm_loadu_si128((const __m128i_u*)a8); } @@ -428,7 +138,7 @@ struct GenHammingComputer16 { struct GenHammingComputer32 { __m256i a; - GenHammingComputer32(const uint8_t* a8, int code_size) { + GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 32); a = _mm256_loadu_si256((const __m256i_u*)a8); } diff --git a/faiss/utils/hamming_distance/common.h b/faiss/utils/hamming_distance/common.h index 154e14e2cc..b50feca2e0 100644 --- a/faiss/utils/hamming_distance/common.h +++ b/faiss/utils/hamming_distance/common.h @@ -8,6 +8,8 @@ #ifndef FAISS_hamming_common_h #define FAISS_hamming_common_h +#include +#include #include #include @@ -43,6 +45,407 @@ inline constexpr uint8_t hamdis_tab_ham_bytes[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; +/*************************************************************************** + * Universal code: identical across all architectures + ***************************************************************************/ + +/* Elementary Hamming distance computation: 
unoptimized */ +template +inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { + const size_t nbytes = nbits / 8; + size_t i; + T h = 0; + for (i = 0; i < nbytes; i++) { + h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; + } + return h; +} + +/*************************************************************************** + * generalized Hamming = number of bytes that are different between + * two codes. + ***************************************************************************/ + +inline int generalized_hamming_64(uint64_t a) { + a |= a >> 1; + a |= a >> 2; + a |= a >> 4; + a &= 0x0101010101010101UL; + return popcount64(a); +} + +/****************************************************************** + * The HammingComputer series of classes compares a single code of + * size 4 to 32 to incoming codes. They are intended for use as a + * template class where it would be inefficient to switch on the code + * size in the inner loop. Hopefully the compiler will inline the + * hamming() functions and put the a0, a1, ... in registers. 
+ ******************************************************************/ + +struct HammingComputer4 { + uint32_t a0; + + HammingComputer4() {} + + HammingComputer4(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 4); + const uint32_t* a32 = reinterpret_cast(a); + a0 = *a32; + } + + inline int hamming(const uint8_t* b) const { + const uint32_t* b32 = reinterpret_cast(b); + return popcount64(*b32 ^ a0); + } + + inline static constexpr int get_code_size() { + return 4; + } +}; + +struct HammingComputer8 { + uint64_t a0; + + HammingComputer8() {} + + HammingComputer8(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 8); + const uint64_t* a64 = reinterpret_cast(a); + a0 = *a64; + } + + inline int hamming(const uint8_t* b) const { + const uint64_t* b64 = reinterpret_cast(b); + return popcount64(*b64 ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +/* Duff's device + byte remainder tail for HammingComputerDefault. + * Processes uint64 words starting at index i_start using popcount, + * then handles any remaining bytes via lookup table. 
*/ +inline int hamming_popcount_tail( + const uint64_t* a64, + const uint64_t* b64, + int i_start, + int quotient8, + const uint8_t* a8, + const uint8_t* b8, + int remainder8) { + int accu = 0; + int i = i_start; + int len = quotient8 - i_start; + switch (len & 7) { + default: + while (len > 7) { + len -= 8; + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 7: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 6: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 5: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 4: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 3: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 2: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 1: + accu += popcount64(a64[i] ^ b64[i]); + i++; + } + } + if (remainder8) { + const uint8_t* a = a8 + 8 * quotient8; + const uint8_t* b = b8 + 8 * quotient8; + switch (remainder8) { + case 7: + accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; + [[fallthrough]]; + case 6: + accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; + [[fallthrough]]; + case 5: + accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; + [[fallthrough]]; + case 4: + accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; + [[fallthrough]]; + case 3: + accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; + [[fallthrough]]; + case 2: + accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; + [[fallthrough]]; + case 1: + accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; + break; + default: + break; + } + } + return accu; +} + +/*************************************************************************** + * Scalar code shared by generic, AVX2, and AVX512 backends. + * NEON provides its own optimized versions of these. 
+ ***************************************************************************/ + +#ifndef __aarch64__ + +/* Hamming distances for multiples of 64 bits */ +template +inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { + const size_t nwords = nbits / 64; + size_t i; + hamdis_t h = 0; + for (i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +/* specialized (optimized) functions */ +template <> +inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]); +} + +template <> +inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); +} + +template <> +inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + + popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); +} + +/* Hamming distances for multiple of 64 bits */ +inline hamdis_t hamming( + const uint64_t* bs1, + const uint64_t* bs2, + size_t nwords) { + hamdis_t h = 0; + for (size_t i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +struct HammingComputer16 { + uint64_t a0, a1; + + HammingComputer16() {} + + HammingComputer16(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 16); + const uint64_t* a = reinterpret_cast(a8); + a0 = a[0]; + a1 = a[1]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = reinterpret_cast(b8); + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +// when applied to an array, 1/2 of the 64-bit accesses are unaligned. +// This incurs a penalty of ~10% wrt. fully aligned accesses. 
+struct HammingComputer20 { + uint64_t a0, a1; + uint32_t a2; + + HammingComputer20() {} + + HammingComputer20(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 20); + const uint64_t* a = reinterpret_cast(a8); + const uint32_t* a32 = reinterpret_cast(a8); + a0 = a[0]; + a1 = a[1]; + // can't read a[2] since it is uint64_t, not uint32_t + // results in AddressSanitizer failure reading past end of array + a2 = a32[4]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = reinterpret_cast(b8); + const uint32_t* b32_tail = reinterpret_cast(b + 2); + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(*b32_tail ^ a2); + } + + inline static constexpr int get_code_size() { + return 20; + } +}; + +struct HammingComputer32 { + uint64_t a0, a1, a2, a3; + + HammingComputer32() {} + + HammingComputer32(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 32); + const uint64_t* a = reinterpret_cast(a8); + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = reinterpret_cast(b8); + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct GenHammingComputer8 { + uint64_t a0; + + GenHammingComputer8(const uint8_t* a, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 8); + const uint64_t* a64 = reinterpret_cast(a); + a0 = *a64; + } + + inline int hamming(const uint8_t* b) const { + const uint64_t* b64 = reinterpret_cast(b); + return generalized_hamming_64(*b64 ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +#endif // !__aarch64__ + 
+/*************************************************************************** + * Scalar code shared by generic and AVX2 backends only. + * AVX512 and NEON provide their own optimized versions. + ***************************************************************************/ + +#if !defined(__aarch64__) && !defined(__AVX512F__) + +struct HammingComputer64 { + uint64_t a0, a1, a2, a3, a4, a5, a6, a7; + + HammingComputer64() {} + + HammingComputer64(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 64); + const uint64_t* a = reinterpret_cast(a8); + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = reinterpret_cast(b8); + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + + popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + + popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); + } + + inline static constexpr int get_code_size() { + return 64; + } +}; + +struct HammingComputerDefault { + const uint8_t* a8; + int quotient8; + int remainder8; + + HammingComputerDefault() {} + + HammingComputerDefault(const uint8_t* a8_in, int code_size) { + set(a8_in, code_size); + } + + void set(const uint8_t* a8_in, int code_size) { + this->a8 = a8_in; + quotient8 = code_size / 8; + remainder8 = code_size % 8; + } + + int hamming(const uint8_t* b8) const { + const uint64_t* a64 = reinterpret_cast(a8); + const uint64_t* b64 = reinterpret_cast(b8); + return hamming_popcount_tail( + a64, b64, 0, quotient8, a8, b8, remainder8); + } + + inline int get_code_size() const { + return quotient8 * 8 + remainder8; + } +}; + +#endif // !__aarch64__ && !__AVX512F__ + } // namespace faiss #endif diff --git a/faiss/utils/hamming_distance/generic-inl.h b/faiss/utils/hamming_distance/generic-inl.h index 7f229f4f11..59d942dea6 100644 
--- a/faiss/utils/hamming_distance/generic-inl.h +++ b/faiss/utils/hamming_distance/generic-inl.h @@ -9,383 +9,19 @@ #define HAMMING_GENERIC_INL_H // A general-purpose version of hamming distance computation. +// Most code is now in common.h. This file only contains the +// GenHammingComputer classes that use scalar generalized_hamming_64. #include -#include #include #include namespace faiss { -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - -/* Hamming distances for multiples of 64 bits */ -template -inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { - const size_t nwords = nbits / 64; - size_t i; - hamdis_t h = 0; - for (i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/* specialized (optimized) functions */ -template <> -inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]); -} - -template <> -inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); -} - -template <> -inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + - popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); -} - -/* Hamming distances for multiple of 64 bits */ -inline hamdis_t hamming( - const uint64_t* bs1, - const uint64_t* bs2, - size_t nwords) { - hamdis_t h = 0; - for (size_t i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/****************************************************************** - * The HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. 
They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. - ******************************************************************/ - -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, [[maybe_unused]] int code_size) { - assert(code_size == 4); - const uint32_t* a32 = reinterpret_cast(a); - a0 = *a32; - } - - inline int hamming(const uint8_t* b) const { - const uint32_t* b32 = reinterpret_cast(b); - return popcount64(*b32 ^ a0); - } - - inline static constexpr int get_code_size() { - return 4; - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, [[maybe_unused]] int code_size) { - assert(code_size == 8); - const uint64_t* a64 = reinterpret_cast(a); - a0 = *a64; - } - - inline int hamming(const uint8_t* b) const { - const uint64_t* b64 = reinterpret_cast(b); - return popcount64(*b64 ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - -struct HammingComputer16 { - uint64_t a0, a1; - - HammingComputer16() {} - - HammingComputer16(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, [[maybe_unused]] int code_size) { - assert(code_size == 16); - const uint64_t* a = reinterpret_cast(a8); - a0 = a[0]; - a1 = a[1]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = reinterpret_cast(b8); - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); - } - - inline static constexpr int get_code_size() { - return 16; - } -}; - -// when applied to an array, 1/2 of the 64-bit accesses are unaligned. -// This incurs a penalty of ~10% wrt. fully aligned accesses. 
-struct HammingComputer20 { - uint64_t a0, a1; - uint32_t a2; - - HammingComputer20() {} - - HammingComputer20(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, [[maybe_unused]] int code_size) { - assert(code_size == 20); - const uint64_t* a = reinterpret_cast(a8); - const uint32_t* a32 = reinterpret_cast(a8); - a0 = a[0]; - a1 = a[1]; - // can't read a[2] since it is uint64_t, not uint32_t - // results in AddressSanitizer failure reading past end of array - a2 = a32[4]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = reinterpret_cast(b8); - const uint32_t* b32_tail = reinterpret_cast(b + 2); - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(*b32_tail ^ a2); - } - - inline static constexpr int get_code_size() { - return 20; - } -}; - -struct HammingComputer32 { - uint64_t a0, a1, a2, a3; - - HammingComputer32() {} - - HammingComputer32(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, [[maybe_unused]] int code_size) { - assert(code_size == 32); - const uint64_t* a = reinterpret_cast(a8); - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = reinterpret_cast(b8); - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); - } - - inline static constexpr int get_code_size() { - return 32; - } -}; - -struct HammingComputer64 { - uint64_t a0, a1, a2, a3, a4, a5, a6, a7; - - HammingComputer64() {} - - HammingComputer64(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, [[maybe_unused]] int code_size) { - assert(code_size == 64); - const uint64_t* a = reinterpret_cast(a8); - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = reinterpret_cast(b8); - 
return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + - popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + - popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); - } - - inline static constexpr int get_code_size() { - return 64; - } -}; - -struct HammingComputerDefault { - const uint8_t* a8; - int quotient8; - int remainder8; - - HammingComputerDefault() {} - - HammingComputerDefault(const uint8_t* a8_in, int code_size) { - set(a8_in, code_size); - } - - void set(const uint8_t* a8_in, int code_size) { - this->a8 = a8_in; - quotient8 = code_size / 8; - remainder8 = code_size % 8; - } - - int hamming(const uint8_t* b8) const { - int accu = 0; - - const uint64_t* a64 = reinterpret_cast(a8); - const uint64_t* b64 = reinterpret_cast(b8); - int i = 0, len = quotient8; - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - [[fallthrough]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - [[fallthrough]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - [[fallthrough]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - [[fallthrough]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - [[fallthrough]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] 
^ b[1]]; - [[fallthrough]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - break; - default: - break; - } - } - - return accu; - } - - inline int get_code_size() const { - return quotient8 * 8 + remainder8; - } -}; - -/*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. - ***************************************************************************/ - -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - -struct GenHammingComputer8 { - uint64_t a0; - - GenHammingComputer8(const uint8_t* a, [[maybe_unused]] int code_size) { - assert(code_size == 8); - const uint64_t* a64 = reinterpret_cast(a); - a0 = *a64; - } - - inline int hamming(const uint8_t* b) const { - const uint64_t* b64 = reinterpret_cast(b); - return generalized_hamming_64(*b64 ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - struct GenHammingComputer16 { uint64_t a0, a1; - GenHammingComputer16(const uint8_t* a8, [[maybe_unused]] int code_size) { + GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); const uint64_t* a = reinterpret_cast(a8); a0 = a[0]; @@ -406,7 +42,7 @@ struct GenHammingComputer16 { struct GenHammingComputer32 { uint64_t a0, a1, a2, a3; - GenHammingComputer32(const uint8_t* a8, [[maybe_unused]] int code_size) { + GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 32); const uint64_t* a = reinterpret_cast(a8); a0 = a[0]; diff --git a/faiss/utils/hamming_distance/neon-inl.h b/faiss/utils/hamming_distance/neon-inl.h index d00a402857..d29f40e0b5 100644 --- a/faiss/utils/hamming_distance/neon-inl.h +++ b/faiss/utils/hamming_distance/neon-inl.h @@ -8,8 +8,11 @@ #ifndef HAMMING_NEON_INL_H #define HAMMING_NEON_INL_H -// a specialized version of hamming is 
needed here, because both +// A specialized version of hamming is needed here, because both // gcc, clang and msvc seem to generate suboptimal code sometimes. +// +// Universal code (HammingComputer4, HammingComputer8, generalized_hamming_64, +// byte-level hamming template, hamming_popcount_tail) comes from common.h. #ifdef __aarch64__ @@ -25,18 +28,6 @@ namespace faiss { -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - /* Hamming distances for multiples of 64 bits */ template inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb) { @@ -125,59 +116,10 @@ inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb, size_t nwords) { } /****************************************************************** - * The HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. + * NEON-optimized HammingComputer classes for sizes 16-64. + * Sizes 4 and 8 use the scalar versions from common.h. 
******************************************************************/ -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 4); - a0 = *(uint32_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint32_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 4; - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - struct HammingComputer16 { uint8x16_t a0; @@ -187,7 +129,7 @@ struct HammingComputer16 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); a0 = vld1q_u8(a8); } @@ -218,7 +160,7 @@ struct HammingComputer20 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 20); a0 = vld1q_u8(a8); @@ -253,7 +195,7 @@ struct HammingComputer32 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 32); a0 = vld1q_u8(a8); a1 = vld1q_u8(a8 + 16); @@ -286,7 +228,7 @@ struct HammingComputer64 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 64); hc0.set(a8, 32); hc1.set(a8 + 32, 32); @@ -308,12 +250,12 @@ struct HammingComputerDefault { HammingComputerDefault() {} - 
HammingComputerDefault(const uint8_t* a8, int code_size) { - set(a8, code_size); + HammingComputerDefault(const uint8_t* a8_in, int code_size) { + set(a8_in, code_size); } - void set(const uint8_t* a8, int code_size) { - this->a8 = a8; + void set(const uint8_t* a8_in, int code_size) { + this->a8 = a8_in; quotient8 = code_size / 8; remainder8 = code_size % 8; } @@ -323,80 +265,15 @@ struct HammingComputerDefault { const uint64_t* a64 = reinterpret_cast(a8); const uint64_t* b64 = reinterpret_cast(b8); - int i = 0, len = quotient8; + int i = 0; int len256 = (quotient8 / 4) * 4; for (; i < len256; i += 4) { accu += ::faiss::hamming<256>(a64 + i, b64 + i); - len -= 4; - } - - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - [[fallthrough]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - [[fallthrough]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - [[fallthrough]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - [[fallthrough]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - [[fallthrough]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; - [[fallthrough]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - [[fallthrough]]; - default: - break; - } } + accu 
+= hamming_popcount_tail( + a64, b64, i, quotient8, a8, b8, remainder8); return accu; } @@ -406,22 +283,13 @@ struct HammingComputerDefault { }; /*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. + * NEON-optimized generalized Hamming computers. ***************************************************************************/ -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - struct GenHammingComputer8 { uint8x8_t a0; - GenHammingComputer8(const uint8_t* a8, int code_size) { + GenHammingComputer8(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 8); a0 = vld1_u8(a8); } @@ -441,7 +309,7 @@ struct GenHammingComputer8 { struct GenHammingComputer16 { uint8x16_t a0; - GenHammingComputer16(const uint8_t* a8, int code_size) { + GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); a0 = vld1q_u8(a8); } @@ -461,7 +329,7 @@ struct GenHammingComputer16 { struct GenHammingComputer32 { GenHammingComputer16 a0, a1; - GenHammingComputer32(const uint8_t* a8, int code_size) + GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) : a0(a8, 16), a1(a8 + 16, 16) { assert(code_size == 32); } diff --git a/faiss/utils/simd_impl/hamming_avx2.cpp b/faiss/utils/simd_impl/hamming_avx2.cpp new file mode 100644 index 0000000000..ac0787a094 --- /dev/null +++ b/faiss/utils/simd_impl/hamming_avx2.cpp @@ -0,0 +1,14 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifdef COMPILE_SIMD_AVX2 + +#define THE_SIMD_LEVEL SIMDLevel::AVX2 +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include + +#endif // COMPILE_SIMD_AVX2 diff --git a/faiss/utils/simd_impl/hamming_impl.h b/faiss/utils/simd_impl/hamming_impl.h new file mode 100644 index 0000000000..35e63d575f --- /dev/null +++ b/faiss/utils/simd_impl/hamming_impl.h @@ -0,0 +1,629 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Shared implementation header for Hamming distance dynamic dispatch. +// Included by per-ISA TUs (hamming_avx2.cpp, hamming_neon.cpp) and by +// hamming.cpp (for the NONE fallback). +// +// THE_SIMD_LEVEL must be defined before including this header. +// +// INCLUDE ORDERING IS LOAD-BEARING: hamdis-inl.h must be included before +// hamming.h so that the ISA-specific struct definitions (selected by +// __AVX2__ / __aarch64__ / etc.) are established before hamming.h's own +// #include of hamdis-inl.h becomes a no-op via include guards. + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "Define THE_SIMD_LEVEL before including hamming_impl.h" +#endif + +// ISA-specific struct definitions — MUST come first (see comment above). +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace faiss { + +// Alias for readability in the template specializations below. +constexpr SIMDLevel hamming_impl_SL = THE_SIMD_LEVEL; + +// All implementation templates live in an anonymous namespace for ODR safety. +// Different TUs compile this header with different HammingComputer struct +// layouts (generic vs AVX2 vs NEON). The anonymous namespace ensures each +// TU gets its own copy with internal linkage, preventing the linker from +// merging incompatible instantiations. 
+namespace { + +/****************************************************************** + * Bit-level Hamming distance templates + ******************************************************************/ + +template +void hammings_impl( + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, + size_t n1, + size_t n2, + hamdis_t* __restrict dis) { + size_t i, j; + const size_t nwords = nbits / 64; + for (i = 0; i < n1; i++) { + const uint64_t* __restrict bs1_ = bs1 + i * nwords; + hamdis_t* __restrict dis_ = dis + i * n2; + for (j = 0; j < n2; j++) { + dis_[j] = hamming(bs1_, bs2 + j * nwords); + } + } +} + +void hammings_impl_runtime( + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, + size_t n1, + size_t n2, + size_t nbits, + hamdis_t* __restrict dis) { + size_t i, j; + const size_t nwords = nbits / 64; + for (i = 0; i < n1; i++) { + const uint64_t* __restrict bs1_ = bs1 + i * nwords; + hamdis_t* __restrict dis_ = dis + i * n2; + for (j = 0; j < n2; j++) { + dis_[j] = hamming(bs1_, bs2 + j * nwords, nwords); + } + } +} + +template +void hamming_count_thres_impl( + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t* __restrict nptr) { + const size_t nwords = nbits / 64; + size_t i, j, posm = 0; + const uint64_t* bs2_ = bs2; + + for (i = 0; i < n1; i++) { + bs2 = bs2_; + for (j = 0; j < n2; j++) { + if (hamming(bs1, bs2) <= ht) { + posm++; + } + bs2 += nwords; + } + bs1 += nwords; + } + *nptr = posm; +} + +template +void crosshamming_count_thres_impl( + const uint64_t* __restrict dbs, + size_t n, + int ht, + size_t* __restrict nptr) { + const size_t nwords = nbits / 64; + size_t i, j, posm = 0; + const uint64_t* bs1 = dbs; + for (i = 0; i < n; i++) { + const uint64_t* bs2 = bs1 + 2; + for (j = i + 1; j < n; j++) { + if (hamming(bs1, bs2) <= ht) { + posm++; + } + bs2 += nwords; + } + bs1 += nwords; + } + *nptr = posm; +} + +template +size_t match_hamming_thres_impl( + const 
uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, + size_t n1, + size_t n2, + int ht, + int64_t* __restrict idx, + hamdis_t* __restrict hams) { + const size_t nwords = nbits / 64; + size_t i, j, posm = 0; + hamdis_t h; + const uint64_t* bs2_ = bs2; + for (i = 0; i < n1; i++) { + bs2 = bs2_; + for (j = 0; j < n2; j++) { + h = hamming(bs1, bs2); + if (h <= ht) { + *idx = i; + idx++; + *idx = j; + idx++; + *hams = h; + hams++; + posm++; + } + bs2 += nwords; + } + bs1 += nwords; + } + return posm; +} + +/****************************************************************** + * HammingComputer-based search templates + ******************************************************************/ + +template +void hammings_knn_hc_impl( + int bytes_per_code, + int_maxheap_array_t* __restrict ha, + const uint8_t* __restrict bs1, + const uint8_t* __restrict bs2, + size_t n2, + bool order = true, + bool init_heap = true, + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK, + const faiss::IDSelector* sel = nullptr) { + size_t k = ha->k; + if (init_heap) { + ha->heapify(); + } + + const size_t block_size = hamming_batch_size; + for (size_t j0 = 0; j0 < n2; j0 += block_size) { + const size_t j1 = std::min(j0 + block_size, n2); +#pragma omp parallel for + for (int64_t i = 0; i < static_cast(ha->nh); i++) { + HammingComputer hc(bs1 + i * bytes_per_code, bytes_per_code); + + const uint8_t* __restrict bs2_ = bs2 + j0 * bytes_per_code; + hamdis_t dis; + hamdis_t* __restrict bh_val_ = ha->val + i * k; + int64_t* __restrict bh_ids_ = ha->ids + i * k; + +#define HANDLE_APPROX(NB, BD) \ + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ + FAISS_THROW_IF_NOT_FMT( \ + k <= NB * BD, \ + "The chosen mode (%d) of approximate top-k supports " \ + "up to %d values, but %zd is requested.", \ + (int)(ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD), \ + NB * BD, \ + k); \ + HeapWithBucketsForHamming32< \ + CMax, \ + NB, \ + BD, \ + HammingComputer>:: \ + addn(j1 - 
j0, hc, bs2_, k, bh_val_, bh_ids_, sel); \ + break; + + switch (approx_topk_mode) { + HANDLE_APPROX(8, 3) + HANDLE_APPROX(8, 2) + HANDLE_APPROX(16, 2) + HANDLE_APPROX(32, 2) + default: { + for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) { + if (sel && !sel->is_member(j)) { + continue; + } + dis = hc.hamming(bs2_); + if (dis < bh_val_[0]) { + faiss::maxheap_replace_top( + k, bh_val_, bh_ids_, dis, j); + } + } + } break; + } + } + } + if (order) { + ha->reorder(); + } +} + +#undef HANDLE_APPROX + +template +void hammings_knn_mc_impl( + int bytes_per_code, + const uint8_t* __restrict a, + const uint8_t* __restrict b, + size_t na, + size_t nb, + size_t k, + int32_t* __restrict distances, + int64_t* __restrict labels, + const faiss::IDSelector* sel) { + const int nBuckets = bytes_per_code * 8 + 1; + std::vector all_counters(na * nBuckets, 0); + std::unique_ptr all_ids_per_dis(new int64_t[na * nBuckets * k]); + + std::vector> cs; + for (size_t i = 0; i < na; ++i) { + cs.push_back( + HCounterState( + all_counters.data() + i * nBuckets, + all_ids_per_dis.get() + i * nBuckets * k, + a + i * bytes_per_code, + 8 * bytes_per_code, + k)); + } + + const size_t block_size = hamming_batch_size; + for (size_t j0 = 0; j0 < nb; j0 += block_size) { + const size_t j1 = std::min(j0 + block_size, nb); +#pragma omp parallel for + for (int64_t i = 0; i < static_cast(na); ++i) { + for (size_t j = j0; j < j1; ++j) { + if (!sel || sel->is_member(j)) { + cs[i].update_counter(b + j * bytes_per_code, j); + } + } + } + } + + for (size_t i = 0; i < na; ++i) { + HCounterState& csi = cs[i]; + + size_t nres = 0; + for (int b_2 = 0; b_2 < nBuckets && nres < k; b_2++) { + for (int l = 0; l < csi.counters[b_2] && nres < k; l++) { + labels[i * k + nres] = csi.ids_per_dis[b_2 * k + l]; + distances[i * k + nres] = b_2; + nres++; + } + } + while (nres < k) { + labels[i * k + nres] = -1; + distances[i * k + nres] = std::numeric_limits::max(); + ++nres; + } + } +} + +template +void 
hamming_range_search_impl( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + int radius, + size_t code_size, + RangeSearchResult* res, + const faiss::IDSelector* sel) { +#pragma omp parallel + { + RangeSearchPartialResult pres(res); + +#pragma omp for + for (int64_t i = 0; i < static_cast(na); i++) { + HammingComputer hc(a + i * code_size, code_size); + const uint8_t* yi = b; + RangeQueryResult& qres = pres.new_result(i); + + for (size_t j = 0; j < nb; j++) { + if (!sel || sel->is_member(j)) { + int dis = hc.hamming(yi); + if (dis < radius) { + qres.add(dis, j); + } + } + yi += code_size; + } + } + pres.finalize(); + } +} + +// Consumer structs for dispatch_HammingComputer. +// These MUST be in the anonymous namespace so that each TU's +// dispatch_HammingComputer instantiation uses unique consumer types, +// preventing ODR violations across TUs with different struct layouts. + +struct Run_hammings_knn_hc { + using T = void; + template + void f(Types... args) { + hammings_knn_hc_impl(args...); + } +}; + +struct Run_hammings_knn_mc { + using T = void; + template + void f(Types... args) { + hammings_knn_mc_impl(args...); + } +}; + +struct Run_hamming_range_search { + using T = void; + template + void f(Types... 
args) {
+        hamming_range_search_impl<HammingComputer>(args...);
+    }
+};
+
+/******************************************************************
+ * Generalized Hamming distances
+ ******************************************************************/
+
+template <class HammingComputer>
+void hamming_dis_inner_loop( // heap-based top-k scan of nb codes against one query
+        const uint8_t* __restrict ca,
+        const uint8_t* __restrict cb,
+        size_t nb,
+        size_t code_size,
+        int k,
+        hamdis_t* __restrict bh_val_,
+        int64_t* __restrict bh_ids_) {
+    HammingComputer hc(ca, code_size);
+
+    for (size_t j = 0; j < nb; j++) {
+        int ndiff = hc.hamming(cb);
+        cb += code_size;
+        if (ndiff < bh_val_[0]) { // bh_val_[0] holds the current worst of the k best
+            maxheap_replace_top(k, bh_val_, bh_ids_, ndiff, j);
+        }
+    }
+}
+
+void generalized_hammings_knn_hc_impl(
+        int_maxheap_array_t* __restrict ha,
+        const uint8_t* __restrict a,
+        const uint8_t* __restrict b,
+        size_t nb,
+        size_t code_size,
+        int ordered) {
+    int na = ha->nh; // number of queries / heaps
+    int k = ha->k;
+
+    if (ordered) {
+        ha->heapify();
+    }
+
+#pragma omp parallel for
+    for (int i = 0; i < na; i++) { // parallel over queries: each thread owns one heap
+        const uint8_t* __restrict ca = a + i * code_size;
+        const uint8_t* __restrict cb = b;
+
+        hamdis_t* __restrict bh_val_ = ha->val + i * k;
+        int64_t* __restrict bh_ids_ = ha->ids + i * k;
+
+        switch (code_size) { // specialized computers for common code sizes
+            case 8:
+                hamming_dis_inner_loop<GenHammingComputer8>(
+                        ca, cb, nb, 8, k, bh_val_, bh_ids_);
+                break;
+            case 16:
+                hamming_dis_inner_loop<GenHammingComputer16>(
+                        ca, cb, nb, 16, k, bh_val_, bh_ids_);
+                break;
+            case 32:
+                hamming_dis_inner_loop<GenHammingComputer32>(
+                        ca, cb, nb, 32, k, bh_val_, bh_ids_);
+                break;
+            default:
+                hamming_dis_inner_loop<GenHammingComputerM8>(
+                        ca, cb, nb, code_size, k, bh_val_, bh_ids_);
+                break;
+        }
+    }
+
+    if (ordered) {
+        ha->reorder();
+    }
+}
+
+} // anonymous namespace
+
+/******************************************************************
+ * Entry point template specializations at THE_SIMD_LEVEL
+ ******************************************************************/
+
+#define C64(x) ((uint64_t*)x)
+
+template <>
+void hammings_knn_hc_dispatch<THE_SIMD_LEVEL>(
+        int_maxheap_array_t* ha,
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t nb,
+        size_t ncodes,
+        int ordered,
+        ApproxTopK_mode_t approx_topk_mode,
+        const IDSelector* sel) {
+    Run_hammings_knn_hc r;
+    // NOTE(review): assumes dispatch_HammingComputer deduces everything from
+    // (ncodes, consumer) — confirm no explicit SIMD-level template arg is needed.
+    dispatch_HammingComputer(
+            ncodes,
+            r,
+            ncodes,
+            ha,
+            a,
+            b,
+            nb,
+            ordered,
+            true,
+            approx_topk_mode,
+            sel);
+}
+
+template <>
+void hammings_knn_mc_dispatch<THE_SIMD_LEVEL>(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        size_t k,
+        size_t ncodes,
+        int32_t* distances,
+        int64_t* labels,
+        const IDSelector* sel) {
+    Run_hammings_knn_mc r;
+    dispatch_HammingComputer(
+            ncodes, r, ncodes, a, b, na, nb, k, distances, labels, sel);
+}
+
+template <>
+void hamming_range_search_dispatch<THE_SIMD_LEVEL>(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        int radius,
+        size_t code_size,
+        RangeSearchResult* result,
+        const IDSelector* sel) {
+    Run_hamming_range_search r;
+    dispatch_HammingComputer(
+            code_size, r, a, b, na, nb, radius, code_size, result, sel);
+}
+
+template <>
+void hammings_dispatch<THE_SIMD_LEVEL>(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        size_t ncodes,
+        hamdis_t* dis) {
+    FAISS_THROW_IF_NOT(ncodes % 8 == 0); // codes are handled as whole 64-bit words
+    switch (ncodes) {
+        case 8:
+            hammings_impl<64>(C64(a), C64(b), na, nb, dis);
+            return;
+        case 16:
+            hammings_impl<128>(C64(a), C64(b), na, nb, dis);
+            return;
+        case 32:
+            hammings_impl<256>(C64(a), C64(b), na, nb, dis);
+            return;
+        case 64:
+            hammings_impl<512>(C64(a), C64(b), na, nb, dis);
+            return;
+        default:
+            hammings_impl_runtime(C64(a), C64(b), na, nb, ncodes * 8, dis);
+            return;
+    }
+}
+
+template <>
+void generalized_hammings_knn_hc_dispatch<THE_SIMD_LEVEL>(
+        int_maxheap_array_t* ha,
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t nb,
+        size_t code_size,
+        int ordered) {
+    generalized_hammings_knn_hc_impl(ha, a, b, nb, code_size, ordered);
+}
+
+template <>
+void hamming_count_thres_dispatch<THE_SIMD_LEVEL>(
+        const uint8_t* bs1,
+        const uint8_t* bs2,
+        size_t n1,
+        size_t n2,
+        hamdis_t ht,
+        size_t ncodes,
+        size_t* nptr) {
+    switch (ncodes) {
+        case 8:
+            hamming_count_thres_impl<64>(C64(bs1), C64(bs2), n1, n2, ht, nptr);
+            return;
+        case 16:
+            hamming_count_thres_impl<128>(C64(bs1), C64(bs2), n1, n2, ht, nptr);
+            return;
+        case 32:
+            hamming_count_thres_impl<256>(C64(bs1), C64(bs2), n1, n2, ht, nptr);
+            return;
+        case 64:
+            hamming_count_thres_impl<512>(C64(bs1), C64(bs2), n1, n2, ht, nptr);
+            return;
+        default:
+            FAISS_THROW_FMT("not implemented for %zu bits", ncodes);
+    }
+}
+
+template <>
+void crosshamming_count_thres_dispatch<THE_SIMD_LEVEL>(
+        const uint8_t* dbs,
+        size_t n,
+        hamdis_t ht,
+        size_t ncodes,
+        size_t* nptr) {
+    switch (ncodes) { // only fixed power-of-two code sizes are supported here
+        case 8:
+            crosshamming_count_thres_impl<64>(C64(dbs), n, ht, nptr);
+            return;
+        case 16:
+            crosshamming_count_thres_impl<128>(C64(dbs), n, ht, nptr);
+            return;
+        case 32:
+            crosshamming_count_thres_impl<256>(C64(dbs), n, ht, nptr);
+            return;
+        case 64:
+            crosshamming_count_thres_impl<512>(C64(dbs), n, ht, nptr);
+            return;
+        default:
+            FAISS_THROW_FMT("not implemented for %zu bits", ncodes);
+    }
+}
+
+template <>
+size_t match_hamming_thres_dispatch<THE_SIMD_LEVEL>(
+        const uint8_t* bs1,
+        const uint8_t* bs2,
+        size_t n1,
+        size_t n2,
+        hamdis_t ht,
+        size_t ncodes,
+        int64_t* idx,
+        hamdis_t* dis) {
+    switch (ncodes) {
+        case 8:
+            return match_hamming_thres_impl<64>(
+                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
+        case 16:
+            return match_hamming_thres_impl<128>(
+                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
+        case 32:
+            return match_hamming_thres_impl<256>(
+                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
+        case 64:
+            return match_hamming_thres_impl<512>(
+                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
+        default:
+            FAISS_THROW_FMT("not implemented for %zu bits", ncodes);
+            return 0; // unreachable: FAISS_THROW_FMT throws
+    }
+}
+
+#undef C64
+
+} // namespace faiss
diff --git a/faiss/utils/simd_impl/hamming_neon.cpp b/faiss/utils/simd_impl/hamming_neon.cpp
new file mode 100644
index 0000000000..6c72618028
--- /dev/null
+++ b/faiss/utils/simd_impl/hamming_neon.cpp
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifdef COMPILE_SIMD_ARM_NEON
+
+#define THE_SIMD_LEVEL SIMDLevel::ARM_NEON
+// NOLINTNEXTLINE(facebook-hte-InlineHeader)
+#include <faiss/utils/simd_impl/hamming_impl.h>
+
+#endif // COMPILE_SIMD_ARM_NEON