diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index cf870cc159..08082e02f7 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -15,6 +15,7 @@ set(FAISS_SIMD_AVX2_SRC impl/scalar_quantizer/sq-avx2.cpp impl/approx_topk/avx2.cpp utils/simd_impl/distances_avx2.cpp + utils/simd_impl/hamming_avx2.cpp utils/simd_impl/partitioning_avx2.cpp utils/distances_fused/simdlib_based.cpp utils/simd_impl/rabitq_avx2.cpp @@ -33,6 +34,7 @@ set(FAISS_SIMD_NEON_SRC impl/scalar_quantizer/sq-neon.cpp impl/approx_topk/neon.cpp utils/simd_impl/distances_aarch64.cpp + utils/simd_impl/hamming_neon.cpp utils/simd_impl/partitioning_neon.cpp utils/distances_fused/simdlib_based_neon.cpp utils/simd_impl/rabitq_neon.cpp @@ -342,6 +344,7 @@ set(FAISS_HEADERS utils/hamming_distance/avx512-inl.h utils/simd_impl/distances_autovec-inl.h utils/simd_impl/distances_simdlib256.h + utils/simd_impl/hamming_impl.h utils/simd_impl/exhaustive_L2sqr_blas_cmax.h utils/simd_impl/IVFFlatScanner-inl.h utils/simd_impl/distances_sse-inl.h diff --git a/faiss/impl/platform_macros.h b/faiss/impl/platform_macros.h index b7aa4111e3..8f18d98e54 100644 --- a/faiss/impl/platform_macros.h +++ b/faiss/impl/platform_macros.h @@ -126,9 +126,11 @@ inline int __builtin_clzll(uint64_t x) { #ifdef SWIG #define ALIGNED(x) #define FAISS_PACKED +#define FAISS_MAYBE_UNUSED #else #define ALIGNED(x) __attribute__((aligned(x))) #define FAISS_PACKED __attribute__((packed)) +#define FAISS_MAYBE_UNUSED [[maybe_unused]] #endif // On non-Windows, FAISS_PACKED handles packing, so these are no-ops diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index 3c7931aa23..d02cd6c962 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -23,348 +23,28 @@ #include -#include #include -#include -#include +#include -#include #include -#include -#include -#include +#include #include +// Scalar (NONE) fallback — compiled without SIMD flags, hamdis-inl.h +// selects generic-inl.h via the preprocessor ISA ladder. +#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { size_t hamming_batch_size = 65536; -template -void hammings( - const uint64_t* __restrict bs1, - const uint64_t* __restrict bs2, - size_t n1, - size_t n2, - hamdis_t* __restrict dis) - -{ - size_t i, j; - const size_t nwords = nbits / 64; - for (i = 0; i < n1; i++) { - const uint64_t* __restrict bs1_ = bs1 + i * nwords; - hamdis_t* __restrict dis_ = dis + i * n2; - for (j = 0; j < n2; j++) { - dis_[j] = hamming(bs1_, bs2 + j * nwords); - } - } -} - -void hammings( - const uint64_t* __restrict bs1, - const uint64_t* __restrict bs2, - size_t n1, - size_t n2, - size_t nbits, - hamdis_t* __restrict dis) { - size_t i, j; - const size_t nwords = nbits / 64; - for (i = 0; i < n1; i++) { - const uint64_t* __restrict bs1_ = bs1 + i * nwords; - hamdis_t* __restrict dis_ = dis + i * n2; - for (j = 0; j < n2; j++) { - dis_[j] = hamming(bs1_, bs2 + j * nwords, nwords); - } - } -} - -/* Count number of matches given a max threshold */ -template -void hamming_count_thres( - const uint64_t* __restrict bs1, - const uint64_t* __restrict bs2, - size_t n1, - size_t n2, - hamdis_t ht, - size_t* __restrict nptr) { - const size_t nwords = nbits / 64; - size_t i, j, posm = 0; - const uint64_t* bs2_ = bs2; - - for (i = 0; i < n1; i++) { - bs2 = bs2_; - for (j = 0; j < n2; j++) { - /* collect the match only if this satisfies the threshold */ - if (hamming(bs1, bs2) <= ht) { - posm++; - } - bs2 += nwords; - } - bs1 += nwords; /* next signature */ - } - *nptr = posm; -} - -template -void crosshamming_count_thres( - const uint64_t* __restrict dbs, - size_t n, - int ht, - size_t* __restrict nptr) { - const size_t nwords = nbits / 64; - size_t i, j, posm = 0; - const uint64_t* bs1 = dbs; - for (i = 0; i < n; i++) { - const uint64_t* bs2 = bs1 + 2; - for (j = i + 1; j < n; j++) { - /* collect the match only if this satisfies the threshold */ - if (hamming(bs1, bs2) <= ht) { - posm++; - } - bs2 += nwords; - } - bs1 += nwords; - } - *nptr = posm; -} - -template -size_t match_hamming_thres( - const uint64_t* __restrict bs1, - const uint64_t* __restrict bs2, - size_t n1, - size_t n2, - int ht, - int64_t* __restrict idx, - hamdis_t* __restrict hams) { - const size_t nwords = nbits / 64; - size_t i, j, posm = 0; - hamdis_t h; - const uint64_t* bs2_ = bs2; - for (i = 0; i < n1; i++) { - bs2 = bs2_; - for (j = 0; j < n2; j++) { - /* Here perform the real work of computing the distance */ - h = hamming(bs1, bs2); - - /* collect the match only if this satisfies the threshold */ - if (h <= ht) { - /* Enough space to store another match ? */ - *idx = i; - idx++; - *idx = j; - idx++; - *hams = h; - hams++; - posm++; - } - bs2 += nwords; /* next signature */ - } - bs1 += nwords; - } - return posm; -} - -namespace { - -/* Return closest neighbors w.r.t Hamming distance, using a heap. */ -template -void hammings_knn_hc( - int bytes_per_code, - int_maxheap_array_t* __restrict ha, - const uint8_t* __restrict bs1, - const uint8_t* __restrict bs2, - size_t n2, - bool order = true, - bool init_heap = true, - ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK, - const faiss::IDSelector* sel = nullptr) { - size_t k = ha->k; - if (init_heap) { - ha->heapify(); - } +/****************************************************************** + * Scalar utility functions (no SIMD, no dispatch needed) + ******************************************************************/ - const size_t block_size = hamming_batch_size; - for (size_t j0 = 0; j0 < n2; j0 += block_size) { - const size_t j1 = std::min(j0 + block_size, n2); -#pragma omp parallel for - for (int64_t i = 0; i < static_cast(ha->nh); i++) { - HammingComputer hc(bs1 + i * bytes_per_code, bytes_per_code); - - const uint8_t* __restrict bs2_ = bs2 + j0 * bytes_per_code; - hamdis_t dis; - hamdis_t* __restrict bh_val_ = ha->val + i * k; - int64_t* __restrict bh_ids_ = ha->ids + i * k; - - // if larger number of k is required, then ::bs_addn() needs to be - // used instead of ::addn() -#define HANDLE_APPROX(NB, BD) \ - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ - FAISS_THROW_IF_NOT_FMT( \ - k <= NB * BD, \ - "The chosen mode (%d) of approximate top-k supports " \ - "up to %d values, but %zd is requested.", \ - (int)(ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD), \ - NB * BD, \ - k); \ - HeapWithBucketsForHamming32< \ - CMax, \ - NB, \ - BD, \ - HammingComputer>:: \ - addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_, sel); \ - break; - - switch (approx_topk_mode) { - HANDLE_APPROX(8, 3) - HANDLE_APPROX(8, 2) - HANDLE_APPROX(16, 2) - HANDLE_APPROX(32, 2) - default: { - for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) { - if (sel && !sel->is_member(j)) { - continue; - } - dis = hc.hamming(bs2_); - if (dis < bh_val_[0]) { - faiss::maxheap_replace_top( - k, bh_val_, bh_ids_, dis, j); - } - } - } break; - } - } - } - if (order) { - ha->reorder(); - } -} - -/* Return closest neighbors w.r.t Hamming distance, using max count. */ -template -void hammings_knn_mc( - int bytes_per_code, - const uint8_t* __restrict a, - const uint8_t* __restrict b, - size_t na, - size_t nb, - size_t k, - int32_t* __restrict distances, - int64_t* __restrict labels, - const faiss::IDSelector* sel) { - const int nBuckets = bytes_per_code * 8 + 1; - std::vector all_counters(na * nBuckets, 0); - std::unique_ptr all_ids_per_dis(new int64_t[na * nBuckets * k]); - - std::vector> cs; - for (size_t i = 0; i < na; ++i) { - cs.push_back( - HCounterState( - all_counters.data() + i * nBuckets, - all_ids_per_dis.get() + i * nBuckets * k, - a + i * bytes_per_code, - 8 * bytes_per_code, - k)); - } - - const size_t block_size = hamming_batch_size; - for (size_t j0 = 0; j0 < nb; j0 += block_size) { - const size_t j1 = std::min(j0 + block_size, nb); -#pragma omp parallel for - for (int64_t i = 0; i < static_cast(na); ++i) { - for (size_t j = j0; j < j1; ++j) { - if (!sel || sel->is_member(j)) { - cs[i].update_counter(b + j * bytes_per_code, j); - } - } - } - } - - for (size_t i = 0; i < na; ++i) { - HCounterState& csi = cs[i]; - - size_t nres = 0; - for (int b_2 = 0; b_2 < nBuckets && nres < k; b_2++) { - for (int l = 0; l < csi.counters[b_2] && nres < k; l++) { - labels[i * k + nres] = csi.ids_per_dis[b_2 * k + l]; - distances[i * k + nres] = b_2; - nres++; - } - } - while (nres < k) { - labels[i * k + nres] = -1; - distances[i * k + nres] = std::numeric_limits::max(); - ++nres; - } - } -} - -template -void hamming_range_search( - const uint8_t* a, - const uint8_t* b, - size_t na, - size_t nb, - int radius, - size_t code_size, - RangeSearchResult* res, - const faiss::IDSelector* sel) { -#pragma omp parallel - { - RangeSearchPartialResult pres(res); - -#pragma omp for - for (int64_t i = 0; i < static_cast(na); i++) { - HammingComputer hc(a + i * code_size, code_size); - const uint8_t* yi = b; - RangeQueryResult& qres = pres.new_result(i); - - for (size_t j = 0; j < nb; j++) { - if (!sel || sel->is_member(j)) { - int dis = hc.hamming(yi); - if (dis < radius) { - qres.add(dis, j); - } - } - yi += code_size; - } - } - pres.finalize(); - } -} - -struct Run_hammings_knn_hc { - using T = void; - template - void f(Types... args) { - hammings_knn_hc(args...); - } -}; - -struct Run_hammings_knn_mc { - using T = void; - template - void f(Types... args) { - hammings_knn_mc(args...); - } -}; - -struct Run_hamming_range_search { - using T = void; - template - void f(Types... args) { - hamming_range_search(args...); - } -}; - -} // namespace - -/* Functions to maps vectors to bits. Assume proper allocation done beforehand, - meaning that b should be be able to receive as many bits as x may produce. */ - -/* - * dimension 0 corresponds to the least significant bit of b[0], or - * equivalently to the lsb of the first byte that is stored. - */ void fvec2bitvec(const float* __restrict x, uint8_t* __restrict b, size_t d) { for (size_t i = 0; i < d; i += 8) { uint8_t w = 0; @@ -381,8 +61,6 @@ void fvec2bitvec(const float* __restrict x, uint8_t* __restrict b, size_t d) { } } -/* Same but for n vectors. - Ensure that the output b is byte-aligned (pad with 0s). */ void fvecs2bitvecs( const float* __restrict x, uint8_t* __restrict b, @@ -407,7 +85,6 @@ void bitvecs2fvecs( } } -/* Reverse bit (NOT a optimized function, only used for print purpose) */ static uint64_t uint64_reverse_bits(uint64_t b) { int i; uint64_t revb = 0; @@ -419,7 +96,6 @@ static uint64_t uint64_reverse_bits(uint64_t b) { return revb; } -/* print the bit vector */ void bitvec_print(const uint8_t* b, size_t d) { size_t i, j; for (i = 0; i < d;) { @@ -459,12 +135,10 @@ void bitvec_shuffle( } } -/*----------------------------------------*/ -/* Hamming distance computation and k-nn */ +/****************************************************************** + * Dispatched Hamming distance public API + ******************************************************************/ -#define C64(x) ((uint64_t*)x) - -/* Compute a set of Hamming distances */ void hammings( const uint8_t* __restrict a, const uint8_t* __restrict b, @@ -472,24 +146,9 @@ void hammings( size_t nb, size_t ncodes, hamdis_t* __restrict dis) { - FAISS_THROW_IF_NOT(ncodes % 8 == 0); - switch (ncodes) { - case 8: - faiss::hammings<64>(C64(a), C64(b), na, nb, dis); - return; - case 16: - faiss::hammings<128>(C64(a), C64(b), na, nb, dis); - return; - case 32: - faiss::hammings<256>(C64(a), C64(b), na, nb, dis); - return; - case 64: - faiss::hammings<512>(C64(a), C64(b), na, nb, dis); - return; - default: - faiss::hammings(C64(a), C64(b), na, nb, ncodes * 8, dis); - return; - } + with_simd_level_256bit([&]() { + hammings_dispatch(a, b, na, nb, ncodes, dis); + }); } void hammings_knn( @@ -511,19 +170,10 @@ void hammings_knn_hc( int order, ApproxTopK_mode_t approx_topk_mode, const faiss::IDSelector* sel) { - Run_hammings_knn_hc r; - dispatch_HammingComputer( - ncodes, - r, - ncodes, - ha, - a, - b, - nb, - order, - true, - approx_topk_mode, - sel); + with_simd_level_256bit([&]() { + hammings_knn_hc_dispatch( + ha, a, b, nb, ncodes, order, approx_topk_mode, sel); + }); } void hammings_knn_mc( @@ -536,9 +186,10 @@ void hammings_knn_mc( int32_t* __restrict distances, int64_t* __restrict labels, const faiss::IDSelector* sel) { - Run_hammings_knn_mc r; - dispatch_HammingComputer( - ncodes, r, ncodes, a, b, na, nb, k, distances, labels, sel); + with_simd_level_256bit([&]() { + hammings_knn_mc_dispatch( + a, b, na, nb, k, ncodes, distances, labels, sel); + }); } void hamming_range_search( @@ -550,12 +201,12 @@ void hamming_range_search( size_t code_size, RangeSearchResult* result, const faiss::IDSelector* sel) { - Run_hamming_range_search r; - dispatch_HammingComputer( - code_size, r, a, b, na, nb, radius, code_size, result, sel); + with_simd_level_256bit([&]() { + hamming_range_search_dispatch( + a, b, na, nb, radius, code_size, result, sel); + }); } -/* Count number of matches given a max threshold */ void hamming_count_thres( const uint8_t* bs1, const uint8_t* bs2, @@ -564,54 +215,22 @@ void hamming_count_thres( hamdis_t ht, size_t ncodes, size_t* nptr) { - switch (ncodes) { - case 8: - faiss::hamming_count_thres<64>( - C64(bs1), C64(bs2), n1, n2, ht, nptr); - return; - case 16: - faiss::hamming_count_thres<128>( - C64(bs1), C64(bs2), n1, n2, ht, nptr); - return; - case 32: - faiss::hamming_count_thres<256>( - C64(bs1), C64(bs2), n1, n2, ht, nptr); - return; - case 64: - faiss::hamming_count_thres<512>( - C64(bs1), C64(bs2), n1, n2, ht, nptr); - return; - default: - FAISS_THROW_FMT("not implemented for %zu bits", ncodes); - } + with_simd_level_256bit([&]() { + hamming_count_thres_dispatch(bs1, bs2, n1, n2, ht, ncodes, nptr); + }); } -/* Count number of cross-matches given a threshold */ void crosshamming_count_thres( const uint8_t* dbs, size_t n, hamdis_t ht, size_t ncodes, size_t* nptr) { - switch (ncodes) { - case 8: - faiss::crosshamming_count_thres<64>(C64(dbs), n, ht, nptr); - return; - case 16: - faiss::crosshamming_count_thres<128>(C64(dbs), n, ht, nptr); - return; - case 32: - faiss::crosshamming_count_thres<256>(C64(dbs), n, ht, nptr); - return; - case 64: - faiss::crosshamming_count_thres<512>(C64(dbs), n, ht, nptr); - return; - default: - FAISS_THROW_FMT("not implemented for %zu bits", ncodes); - } + with_simd_level_256bit([&]() { + crosshamming_count_thres_dispatch(dbs, n, ht, ncodes, nptr); + }); } -/* Returns all matches given a threshold */ size_t match_hamming_thres( const uint8_t* bs1, const uint8_t* bs2, @@ -621,49 +240,10 @@ size_t match_hamming_thres( size_t ncodes, int64_t* idx, hamdis_t* dis) { - switch (ncodes) { - case 8: - return faiss::match_hamming_thres<64>( - C64(bs1), C64(bs2), n1, n2, ht, idx, dis); - case 16: - return faiss::match_hamming_thres<128>( - C64(bs1), C64(bs2), n1, n2, ht, idx, dis); - case 32: - return faiss::match_hamming_thres<256>( - C64(bs1), C64(bs2), n1, n2, ht, idx, dis); - case 64: - return faiss::match_hamming_thres<512>( - C64(bs1), C64(bs2), n1, n2, ht, idx, dis); - default: - FAISS_THROW_FMT("not implemented for %zu bits", ncodes); - return 0; - } -} - -#undef C64 - -/************************************* - * generalized Hamming distances - ************************************/ - -template -static void hamming_dis_inner_loop( - const uint8_t* __restrict ca, - const uint8_t* __restrict cb, - size_t nb, - size_t code_size, - int k, - hamdis_t* __restrict bh_val_, - int64_t* __restrict bh_ids_) { - HammingComputer hc(ca, code_size); - - for (size_t j = 0; j < nb; j++) { - int ndiff = hc.hamming(cb); - cb += code_size; - if (ndiff < bh_val_[0]) { - maxheap_replace_top(k, bh_val_, bh_ids_, ndiff, j); - } - } + return with_simd_level_256bit([&]() -> size_t { + return match_hamming_thres_dispatch( + bs1, bs2, n1, n2, ht, ncodes, idx, dis); + }); } void generalized_hammings_knn_hc( @@ -673,46 +253,16 @@ void generalized_hammings_knn_hc( size_t nb, size_t code_size, int ordered) { - int na = ha->nh; - int k = ha->k; - - if (ordered) { - ha->heapify(); - } - -#pragma omp parallel for - for (int i = 0; i < na; i++) { - const uint8_t* __restrict ca = a + i * code_size; - const uint8_t* __restrict cb = b; - - hamdis_t* __restrict bh_val_ = ha->val + i * k; - int64_t* __restrict bh_ids_ = ha->ids + i * k; - - switch (code_size) { - case 8: - hamming_dis_inner_loop( - ca, cb, nb, 8, k, bh_val_, bh_ids_); - break; - case 16: - hamming_dis_inner_loop( - ca, cb, nb, 16, k, bh_val_, bh_ids_); - break; - case 32: - hamming_dis_inner_loop( - ca, cb, nb, 32, k, bh_val_, bh_ids_); - break; - default: - hamming_dis_inner_loop( - ca, cb, nb, code_size, k, bh_val_, bh_ids_); - break; - } - } - - if (ordered) { - ha->reorder(); - } + with_simd_level_256bit([&]() { + generalized_hammings_knn_hc_dispatch( + ha, a, b, nb, code_size, ordered); + }); } +/****************************************************************** + * Bitstring pack/unpack (scalar, no dispatch needed) + ******************************************************************/ + void pack_bitstrings( size_t n, size_t M, diff --git a/faiss/utils/hamming.h b/faiss/utils/hamming.h index 64dfbf7467..7011572856 100644 --- a/faiss/utils/hamming.h +++ b/faiss/utils/hamming.h @@ -30,6 +30,7 @@ #include #include #include +#include // Low-level Hamming distance computations and hamdis_t. #include @@ -284,6 +285,90 @@ void unpack_bitstrings( size_t code_size, int32_t* unpacked); +/** SIMD-dispatched Hamming function implementations. + * Specializations live in per-ISA TUs (hamming_avx2.cpp, etc.). */ + +template +void hammings_knn_hc_dispatch( + int_maxheap_array_t* ha, + const uint8_t* a, + const uint8_t* b, + size_t nb, + size_t ncodes, + int ordered, + ApproxTopK_mode_t approx_topk_mode, + const IDSelector* sel); + +template +void hammings_knn_mc_dispatch( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + size_t k, + size_t ncodes, + int32_t* distances, + int64_t* labels, + const IDSelector* sel); + +template +void hamming_range_search_dispatch( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + int radius, + size_t code_size, + RangeSearchResult* result, + const IDSelector* sel); + +template +void hammings_dispatch( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + size_t ncodes, + hamdis_t* dis); + +template +void generalized_hammings_knn_hc_dispatch( + int_maxheap_array_t* ha, + const uint8_t* a, + const uint8_t* b, + size_t nb, + size_t code_size, + int ordered); + +template +void hamming_count_thres_dispatch( + const uint8_t* bs1, + const uint8_t* bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t ncodes, + size_t* nptr); + +template +void crosshamming_count_thres_dispatch( + const uint8_t* dbs, + size_t n, + hamdis_t ht, + size_t ncodes, + size_t* nptr); + +template +size_t match_hamming_thres_dispatch( + const uint8_t* bs1, + const uint8_t* bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t ncodes, + int64_t* idx, + hamdis_t* dis); + } // namespace faiss #include diff --git a/faiss/utils/hamming_distance/avx2-inl.h b/faiss/utils/hamming_distance/avx2-inl.h index b7d5f32e66..1ca322c7b2 100644 --- a/faiss/utils/hamming_distance/avx2-inl.h +++ b/faiss/utils/hamming_distance/avx2-inl.h @@ -8,10 +8,11 @@ #ifndef HAMMING_AVX2_INL_H #define HAMMING_AVX2_INL_H -// AVX2 version +// AVX2 version. +// Most code is now in common.h. This file only contains the +// GenHammingComputer classes that leverage SSE/AVX2 intrinsics. #include -#include #include #include @@ -20,376 +21,13 @@ namespace faiss { -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - -/* Hamming distances for multiples of 64 bits */ -template -inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { - const size_t nwords = nbits / 64; - size_t i; - hamdis_t h = 0; - for (i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/* specialized (optimized) functions */ -template <> -inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]); -} - -template <> -inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); -} - -template <> -inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + - popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); -} - -/* Hamming distances for multiple of 64 bits */ -inline hamdis_t hamming( - const uint64_t* bs1, - const uint64_t* bs2, - size_t nwords) { - hamdis_t h = 0; - for (size_t i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/****************************************************************** - * The HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. - ******************************************************************/ - -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 4); - (void)code_size; - a0 = *(uint32_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint32_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 4; - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 8); - (void)code_size; - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - -struct HammingComputer16 { - uint64_t a0, a1; - - HammingComputer16() {} - - HammingComputer16(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 16); - (void)code_size; - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); - } - - inline static constexpr int get_code_size() { - return 16; - } -}; - -// when applied to an array, 1/2 of the 64-bit accesses are unaligned. -// This incurs a penalty of ~10% wrt. fully aligned accesses. -struct HammingComputer20 { - uint64_t a0, a1; - uint32_t a2; - - HammingComputer20() {} - - HammingComputer20(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 20); - (void)code_size; - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(*(uint32_t*)(b + 2) ^ a2); - } - - inline static constexpr int get_code_size() { - return 20; - } -}; - -struct HammingComputer32 { - uint64_t a0, a1, a2, a3; - - HammingComputer32() {} - - HammingComputer32(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 32); - (void)code_size; - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); - } - - inline static constexpr int get_code_size() { - return 32; - } -}; - -struct HammingComputer64 { - uint64_t a0, a1, a2, a3, a4, a5, a6, a7; - - HammingComputer64() {} - - HammingComputer64(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 64); - (void)code_size; - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + - popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + - popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); - } - - inline static constexpr int get_code_size() { - return 64; - } -}; - -struct HammingComputerDefault { - const uint8_t* a8; - int quotient8; - int remainder8; - - HammingComputerDefault() {} - - HammingComputerDefault(const uint8_t* a8_in, int code_size) { - set(a8_in, code_size); - } - - void set(const uint8_t* a8_2, int code_size) { - this->a8 = a8_2; - quotient8 = code_size / 8; - remainder8 = code_size % 8; - } - - int hamming(const uint8_t* b8) const { - int accu = 0; - - const uint64_t* a64 = reinterpret_cast(a8); - const uint64_t* b64 = reinterpret_cast(b8); - int i = 0, len = quotient8; - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - [[fallthrough]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - [[fallthrough]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - [[fallthrough]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - [[fallthrough]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - [[fallthrough]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; - [[fallthrough]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - [[fallthrough]]; - default: - break; - } - } - - return accu; - } - - inline int get_code_size() const { - return quotient8 * 8 + remainder8; - } -}; - -/*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. - ***************************************************************************/ - -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - -struct GenHammingComputer8 { - uint64_t a0; - - GenHammingComputer8(const uint8_t* a, int code_size) { - assert(code_size == 8); - (void)code_size; - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return generalized_hamming_64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - // I'm not sure whether this version is faster of slower, tbh // todo: test on different CPUs struct GenHammingComputer16 { __m128i a; - GenHammingComputer16(const uint8_t* a8, int code_size) { + GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); - (void)code_size; a = _mm_loadu_si128((const __m128i_u*)a8); } @@ -408,9 +46,8 @@ struct GenHammingComputer16 { struct GenHammingComputer32 { __m256i a; - GenHammingComputer32(const uint8_t* a8, int code_size) { + GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 32); - (void)code_size; a = _mm256_loadu_si256((const __m256i_u*)a8); } diff --git a/faiss/utils/hamming_distance/avx512-inl.h b/faiss/utils/hamming_distance/avx512-inl.h index dfd8f55bfe..af04ef175d 100644 --- a/faiss/utils/hamming_distance/avx512-inl.h +++ b/faiss/utils/hamming_distance/avx512-inl.h @@ -8,15 +8,13 @@ #ifndef HAMMING_AVX512_INL_H #define HAMMING_AVX512_INL_H -// AVX512 version -// The _mm512_popcnt_epi64 intrinsic is used to accelerate Hamming distance -// calculations in HammingComputerDefault and HammingComputer64. This intrinsic -// is not available in the default Faiss avx512 build mode but is only -// available in the avx512_spr build mode, which targets Intel(R) Sapphire -// Rapids. +// AVX512 version. +// Most code is now in common.h. This file provides AVX512-specific +// overrides for HammingComputer64 and HammingComputerDefault +// (using _mm512_popcnt_epi64 when __AVX512VPOPCNTDQ__ is available), +// and the GenHammingComputer classes that leverage SSE/AVX2 intrinsics. #include -#include #include #include @@ -25,199 +23,6 @@ namespace faiss { -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - -/* Hamming distances for multiples of 64 bits */ -template -inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { - const size_t nwords = nbits / 64; - size_t i; - hamdis_t h = 0; - for (i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/* specialized (optimized) functions */ -template <> -inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]); -} - -template <> -inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); -} - -template <> -inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + - popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); -} - -/* Hamming distances for multiple of 64 bits */ -inline hamdis_t hamming( - const uint64_t* bs1, - const uint64_t* bs2, - size_t nwords) { - hamdis_t h = 0; - for (size_t i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/****************************************************************** - * The HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. - ******************************************************************/ - -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 4); - a0 = *(uint32_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint32_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 4; - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - -struct HammingComputer16 { - uint64_t a0, a1; - - HammingComputer16() {} - - HammingComputer16(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 16); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); - } - - inline static constexpr int get_code_size() { - return 16; - } -}; - -// when applied to an array, 1/2 of the 64-bit accesses are unaligned. -// This incurs a penalty of ~10% wrt. fully aligned accesses. -struct HammingComputer20 { - uint64_t a0, a1; - uint32_t a2; - - HammingComputer20() {} - - HammingComputer20(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 20); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(*(uint32_t*)(b + 2) ^ a2); - } - - inline static constexpr int get_code_size() { - return 20; - } -}; - -struct HammingComputer32 { - uint64_t a0, a1, a2, a3; - - HammingComputer32() {} - - HammingComputer32(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 32); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); - } - - inline static constexpr int get_code_size() { - return 32; - } -}; - struct HammingComputer64 { uint64_t a0, a1, a2, a3, a4, a5, a6, a7; const uint64_t* a; @@ -228,9 +33,9 @@ struct HammingComputer64 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 64); - a = (uint64_t*)a8; + a = reinterpret_cast(a8); a0 = a[0]; a1 = a[1]; a2 = a[2]; @@ -242,7 +47,7 @@ struct HammingComputer64 { } inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; + const uint64_t* b = reinterpret_cast(b8); #ifdef __AVX512VPOPCNTDQ__ __m512i vxor = _mm512_xor_si512(_mm512_loadu_si512(a), _mm512_loadu_si512(b)); @@ -269,8 +74,8 @@ struct HammingComputerDefault { HammingComputerDefault() {} - HammingComputerDefault(const uint8_t* a8, int code_size) { - set(a8, code_size); + HammingComputerDefault(const uint8_t* a8_in, int code_size) { + set(a8_in, code_size); } void set(const uint8_t* a8_2, int code_size) { @@ -298,73 +103,8 @@ struct HammingComputerDefault { } i *= 8; #endif - int len = quotient8 - i; - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - [[fallthrough]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - [[fallthrough]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - [[fallthrough]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - [[fallthrough]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - [[fallthrough]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; - [[fallthrough]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - [[fallthrough]]; - default: - break; - } - } - + accu += hamming_popcount_tail( + a64, b64, i, quotient8, a8, b8, remainder8); return accu; } @@ -373,42 +113,12 @@ struct HammingComputerDefault { } }; -/*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. - ***************************************************************************/ - -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - -struct GenHammingComputer8 { - uint64_t a0; - - GenHammingComputer8(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return generalized_hamming_64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - // I'm not sure whether this version is faster of slower, tbh // todo: test on different CPUs struct GenHammingComputer16 { __m128i a; - GenHammingComputer16(const uint8_t* a8, int code_size) { + GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); a = _mm_loadu_si128((const __m128i_u*)a8); } @@ -428,7 +138,7 @@ struct GenHammingComputer16 { struct GenHammingComputer32 { __m256i a; - GenHammingComputer32(const uint8_t* a8, int code_size) { + GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 32); a = _mm256_loadu_si256((const __m256i_u*)a8); } diff --git a/faiss/utils/hamming_distance/common.h b/faiss/utils/hamming_distance/common.h index 154e14e2cc..b50feca2e0 100644 --- a/faiss/utils/hamming_distance/common.h +++ b/faiss/utils/hamming_distance/common.h @@ -8,6 +8,8 @@ #ifndef FAISS_hamming_common_h #define FAISS_hamming_common_h +#include +#include #include #include @@ -43,6 +45,407 @@ inline constexpr uint8_t hamdis_tab_ham_bytes[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; +/*************************************************************************** + * Universal code: identical across all architectures + ***************************************************************************/ + +/* Elementary Hamming distance computation: unoptimized */ +template +inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { + const size_t nbytes = nbits / 8; + size_t i; + T h = 0; + for (i = 0; i < nbytes; i++) { + h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; + } + return h; +} + +/*************************************************************************** + * generalized Hamming = number of bytes that are different between + * two codes. + ***************************************************************************/ + +inline int generalized_hamming_64(uint64_t a) { + a |= a >> 1; + a |= a >> 2; + a |= a >> 4; + a &= 0x0101010101010101UL; + return popcount64(a); +} + +/****************************************************************** + * The HammingComputer series of classes compares a single code of + * size 4 to 32 to incoming codes. They are intended for use as a + * template class where it would be inefficient to switch on the code + * size in the inner loop. Hopefully the compiler will inline the + * hamming() functions and put the a0, a1, ... in registers. + ******************************************************************/ + +struct HammingComputer4 { + uint32_t a0; + + HammingComputer4() {} + + HammingComputer4(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 4); + const uint32_t* a32 = reinterpret_cast(a); + a0 = *a32; + } + + inline int hamming(const uint8_t* b) const { + const uint32_t* b32 = reinterpret_cast(b); + return popcount64(*b32 ^ a0); + } + + inline static constexpr int get_code_size() { + return 4; + } +}; + +struct HammingComputer8 { + uint64_t a0; + + HammingComputer8() {} + + HammingComputer8(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 8); + const uint64_t* a64 = reinterpret_cast(a); + a0 = *a64; + } + + inline int hamming(const uint8_t* b) const { + const uint64_t* b64 = reinterpret_cast(b); + return popcount64(*b64 ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +/* Duff's device + byte remainder tail for HammingComputerDefault. + * Processes uint64 words starting at index i_start using popcount, + * then handles any remaining bytes via lookup table. */ +inline int hamming_popcount_tail( + const uint64_t* a64, + const uint64_t* b64, + int i_start, + int quotient8, + const uint8_t* a8, + const uint8_t* b8, + int remainder8) { + int accu = 0; + int i = i_start; + int len = quotient8 - i_start; + switch (len & 7) { + default: + while (len > 7) { + len -= 8; + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 7: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 6: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 5: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 4: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 3: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 2: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 1: + accu += popcount64(a64[i] ^ b64[i]); + i++; + } + } + if (remainder8) { + const uint8_t* a = a8 + 8 * quotient8; + const uint8_t* b = b8 + 8 * quotient8; + switch (remainder8) { + case 7: + accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; + [[fallthrough]]; + case 6: + accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; + [[fallthrough]]; + case 5: + accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; + [[fallthrough]]; + case 4: + accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; + [[fallthrough]]; + case 3: + accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; + [[fallthrough]]; + case 2: + accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; + [[fallthrough]]; + case 1: + accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; + break; + default: + break; + } + } + return accu; +} + +/*************************************************************************** + * Scalar code shared by generic, AVX2, and AVX512 backends. + * NEON provides its own optimized versions of these. + ***************************************************************************/ + +#ifndef __aarch64__ + +/* Hamming distances for multiples of 64 bits */ +template +inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { + const size_t nwords = nbits / 64; + size_t i; + hamdis_t h = 0; + for (i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +/* specialized (optimized) functions */ +template <> +inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]); +} + +template <> +inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); +} + +template <> +inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + + popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); +} + +/* Hamming distances for multiple of 64 bits */ +inline hamdis_t hamming( + const uint64_t* bs1, + const uint64_t* bs2, + size_t nwords) { + hamdis_t h = 0; + for (size_t i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +struct HammingComputer16 { + uint64_t a0, a1; + + HammingComputer16() {} + + HammingComputer16(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 16); + const uint64_t* a = reinterpret_cast(a8); + a0 = a[0]; + a1 = a[1]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = reinterpret_cast(b8); + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +// when applied to an array, 1/2 of the 64-bit accesses are unaligned. +// This incurs a penalty of ~10% wrt. fully aligned accesses. +struct HammingComputer20 { + uint64_t a0, a1; + uint32_t a2; + + HammingComputer20() {} + + HammingComputer20(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 20); + const uint64_t* a = reinterpret_cast(a8); + const uint32_t* a32 = reinterpret_cast(a8); + a0 = a[0]; + a1 = a[1]; + // can't read a[2] since it is uint64_t, not uint32_t + // results in AddressSanitizer failure reading past end of array + a2 = a32[4]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = reinterpret_cast(b8); + const uint32_t* b32_tail = reinterpret_cast(b + 2); + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(*b32_tail ^ a2); + } + + inline static constexpr int get_code_size() { + return 20; + } +}; + +struct HammingComputer32 { + uint64_t a0, a1, a2, a3; + + HammingComputer32() {} + + HammingComputer32(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 32); + const uint64_t* a = reinterpret_cast(a8); + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = reinterpret_cast(b8); + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct GenHammingComputer8 { + uint64_t a0; + + GenHammingComputer8(const uint8_t* a, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 8); + const uint64_t* a64 = reinterpret_cast(a); + a0 = *a64; + } + + inline int hamming(const uint8_t* b) const { + const uint64_t* b64 = reinterpret_cast(b); + return generalized_hamming_64(*b64 ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +#endif // !__aarch64__ + +/*************************************************************************** + * Scalar code shared by generic and AVX2 backends only. + * AVX512 and NEON provide their own optimized versions. + ***************************************************************************/ + +#if !defined(__aarch64__) && !defined(__AVX512F__) + +struct HammingComputer64 { + uint64_t a0, a1, a2, a3, a4, a5, a6, a7; + + HammingComputer64() {} + + HammingComputer64(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 64); + const uint64_t* a = reinterpret_cast(a8); + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = reinterpret_cast(b8); + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + + popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + + popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); + } + + inline static constexpr int get_code_size() { + return 64; + } +}; + +struct HammingComputerDefault { + const uint8_t* a8; + int quotient8; + int remainder8; + + HammingComputerDefault() {} + + HammingComputerDefault(const uint8_t* a8_in, int code_size) { + set(a8_in, code_size); + } + + void set(const uint8_t* a8_in, int code_size) { + this->a8 = a8_in; + quotient8 = code_size / 8; + remainder8 = code_size % 8; + } + + int hamming(const uint8_t* b8) const { + const uint64_t* a64 = reinterpret_cast(a8); + const uint64_t* b64 = reinterpret_cast(b8); + return hamming_popcount_tail( + a64, b64, 0, quotient8, a8, b8, remainder8); + } + + inline int get_code_size() const { + return quotient8 * 8 + remainder8; + } +}; + +#endif // !__aarch64__ && !__AVX512F__ + } // namespace faiss #endif diff --git a/faiss/utils/hamming_distance/generic-inl.h b/faiss/utils/hamming_distance/generic-inl.h index 7f229f4f11..59d942dea6 100644 --- a/faiss/utils/hamming_distance/generic-inl.h +++ b/faiss/utils/hamming_distance/generic-inl.h @@ -9,383 +9,19 @@ #define HAMMING_GENERIC_INL_H // A general-purpose version of hamming distance computation. +// Most code is now in common.h. This file only contains the +// GenHammingComputer classes that use scalar generalized_hamming_64. #include -#include #include #include namespace faiss { -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - -/* Hamming distances for multiples of 64 bits */ -template -inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { - const size_t nwords = nbits / 64; - size_t i; - hamdis_t h = 0; - for (i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/* specialized (optimized) functions */ -template <> -inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]); -} - -template <> -inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); -} - -template <> -inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + - popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); -} - -/* Hamming distances for multiple of 64 bits */ -inline hamdis_t hamming( - const uint64_t* bs1, - const uint64_t* bs2, - size_t nwords) { - hamdis_t h = 0; - for (size_t i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/****************************************************************** - * The HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. - ******************************************************************/ - -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, [[maybe_unused]] int code_size) { - assert(code_size == 4); - const uint32_t* a32 = reinterpret_cast(a); - a0 = *a32; - } - - inline int hamming(const uint8_t* b) const { - const uint32_t* b32 = reinterpret_cast(b); - return popcount64(*b32 ^ a0); - } - - inline static constexpr int get_code_size() { - return 4; - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, [[maybe_unused]] int code_size) { - assert(code_size == 8); - const uint64_t* a64 = reinterpret_cast(a); - a0 = *a64; - } - - inline int hamming(const uint8_t* b) const { - const uint64_t* b64 = reinterpret_cast(b); - return popcount64(*b64 ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - -struct HammingComputer16 { - uint64_t a0, a1; - - HammingComputer16() {} - - HammingComputer16(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, [[maybe_unused]] int code_size) { - assert(code_size == 16); - const uint64_t* a = reinterpret_cast(a8); - a0 = a[0]; - a1 = a[1]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = reinterpret_cast(b8); - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); - } - - inline static constexpr int get_code_size() { - return 16; - } -}; - -// when applied to an array, 1/2 of the 64-bit accesses are unaligned. -// This incurs a penalty of ~10% wrt. fully aligned accesses. -struct HammingComputer20 { - uint64_t a0, a1; - uint32_t a2; - - HammingComputer20() {} - - HammingComputer20(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, [[maybe_unused]] int code_size) { - assert(code_size == 20); - const uint64_t* a = reinterpret_cast(a8); - const uint32_t* a32 = reinterpret_cast(a8); - a0 = a[0]; - a1 = a[1]; - // can't read a[2] since it is uint64_t, not uint32_t - // results in AddressSanitizer failure reading past end of array - a2 = a32[4]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = reinterpret_cast(b8); - const uint32_t* b32_tail = reinterpret_cast(b + 2); - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(*b32_tail ^ a2); - } - - inline static constexpr int get_code_size() { - return 20; - } -}; - -struct HammingComputer32 { - uint64_t a0, a1, a2, a3; - - HammingComputer32() {} - - HammingComputer32(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, [[maybe_unused]] int code_size) { - assert(code_size == 32); - const uint64_t* a = reinterpret_cast(a8); - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = reinterpret_cast(b8); - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); - } - - inline static constexpr int get_code_size() { - return 32; - } -}; - -struct HammingComputer64 { - uint64_t a0, a1, a2, a3, a4, a5, a6, a7; - - HammingComputer64() {} - - HammingComputer64(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, [[maybe_unused]] int code_size) { - assert(code_size == 64); - const uint64_t* a = reinterpret_cast(a8); - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = reinterpret_cast(b8); - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + - popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + - popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); - } - - inline static constexpr int get_code_size() { - return 64; - } -}; - -struct HammingComputerDefault { - const uint8_t* a8; - int quotient8; - int remainder8; - - HammingComputerDefault() {} - - HammingComputerDefault(const uint8_t* a8_in, int code_size) { - set(a8_in, code_size); - } - - void set(const uint8_t* a8_in, int code_size) { - this->a8 = a8_in; - quotient8 = code_size / 8; - remainder8 = code_size % 8; - } - - int hamming(const uint8_t* b8) const { - int accu = 0; - - const uint64_t* a64 = reinterpret_cast(a8); - const uint64_t* b64 = reinterpret_cast(b8); - int i = 0, len = quotient8; - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - [[fallthrough]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - [[fallthrough]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - [[fallthrough]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - [[fallthrough]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - [[fallthrough]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; - [[fallthrough]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - break; - default: - break; - } - } - - return accu; - } - - inline int get_code_size() const { - return quotient8 * 8 + remainder8; - } -}; - -/*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. - ***************************************************************************/ - -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - -struct GenHammingComputer8 { - uint64_t a0; - - GenHammingComputer8(const uint8_t* a, [[maybe_unused]] int code_size) { - assert(code_size == 8); - const uint64_t* a64 = reinterpret_cast(a); - a0 = *a64; - } - - inline int hamming(const uint8_t* b) const { - const uint64_t* b64 = reinterpret_cast(b); - return generalized_hamming_64(*b64 ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - struct GenHammingComputer16 { uint64_t a0, a1; - GenHammingComputer16(const uint8_t* a8, [[maybe_unused]] int code_size) { + GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); const uint64_t* a = reinterpret_cast(a8); a0 = a[0]; @@ -406,7 +42,7 @@ struct GenHammingComputer16 { struct GenHammingComputer32 { uint64_t a0, a1, a2, a3; - GenHammingComputer32(const uint8_t* a8, [[maybe_unused]] int code_size) { + GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 32); const uint64_t* a = reinterpret_cast(a8); a0 = a[0]; diff --git a/faiss/utils/hamming_distance/neon-inl.h b/faiss/utils/hamming_distance/neon-inl.h index d00a402857..d29f40e0b5 100644 --- a/faiss/utils/hamming_distance/neon-inl.h +++ b/faiss/utils/hamming_distance/neon-inl.h @@ -8,8 +8,11 @@ #ifndef HAMMING_NEON_INL_H #define HAMMING_NEON_INL_H -// a specialized version of hamming is needed here, because both +// A specialized version of hamming is needed here, because both // gcc, clang and msvc seem to generate suboptimal code sometimes. +// +// Universal code (HammingComputer4, HammingComputer8, generalized_hamming_64, +// byte-level hamming template, hamming_popcount_tail) comes from common.h. #ifdef __aarch64__ @@ -25,18 +28,6 @@ namespace faiss { -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - /* Hamming distances for multiples of 64 bits */ template inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb) { @@ -125,59 +116,10 @@ inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb, size_t nwords) { } /****************************************************************** - * The HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. + * NEON-optimized HammingComputer classes for sizes 16-64. + * Sizes 4 and 8 use the scalar versions from common.h. ******************************************************************/ -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 4); - a0 = *(uint32_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint32_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 4; - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - struct HammingComputer16 { uint8x16_t a0; @@ -187,7 +129,7 @@ struct HammingComputer16 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); a0 = vld1q_u8(a8); } @@ -218,7 +160,7 @@ struct HammingComputer20 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 20); a0 = vld1q_u8(a8); @@ -253,7 +195,7 @@ struct HammingComputer32 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 32); a0 = vld1q_u8(a8); a1 = vld1q_u8(a8 + 16); @@ -286,7 +228,7 @@ struct HammingComputer64 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 64); hc0.set(a8, 32); hc1.set(a8 + 32, 32); @@ -308,12 +250,12 @@ struct HammingComputerDefault { HammingComputerDefault() {} - HammingComputerDefault(const uint8_t* a8, int code_size) { - set(a8, code_size); + HammingComputerDefault(const uint8_t* a8_in, int code_size) { + set(a8_in, code_size); } - void set(const uint8_t* a8, int code_size) { - this->a8 = a8; + void set(const uint8_t* a8_in, int code_size) { + this->a8 = a8_in; quotient8 = code_size / 8; remainder8 = code_size % 8; } @@ -323,80 +265,15 @@ struct HammingComputerDefault { const uint64_t* a64 = reinterpret_cast(a8); const uint64_t* b64 = reinterpret_cast(b8); - int i = 0, len = quotient8; + int i = 0; int len256 = (quotient8 / 4) * 4; for (; i < len256; i += 4) { accu += ::faiss::hamming<256>(a64 + i, b64 + i); - len -= 4; - } - - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - [[fallthrough]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - [[fallthrough]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - [[fallthrough]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - [[fallthrough]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - [[fallthrough]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; - [[fallthrough]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - [[fallthrough]]; - default: - break; - } } + accu += hamming_popcount_tail( + a64, b64, i, quotient8, a8, b8, remainder8); return accu; } @@ -406,22 +283,13 @@ struct HammingComputerDefault { }; /*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. + * NEON-optimized generalized Hamming computers. ***************************************************************************/ -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - struct GenHammingComputer8 { uint8x8_t a0; - GenHammingComputer8(const uint8_t* a8, int code_size) { + GenHammingComputer8(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 8); a0 = vld1_u8(a8); } @@ -441,7 +309,7 @@ struct GenHammingComputer8 { struct GenHammingComputer16 { uint8x16_t a0; - GenHammingComputer16(const uint8_t* a8, int code_size) { + GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); a0 = vld1q_u8(a8); } @@ -461,7 +329,7 @@ struct GenHammingComputer16 { struct GenHammingComputer32 { GenHammingComputer16 a0, a1; - GenHammingComputer32(const uint8_t* a8, int code_size) + GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) : a0(a8, 16), a1(a8 + 16, 16) { assert(code_size == 32); } diff --git a/faiss/utils/simd_impl/hamming_avx2.cpp b/faiss/utils/simd_impl/hamming_avx2.cpp new file mode 100644 index 0000000000..ac0787a094 --- /dev/null +++ b/faiss/utils/simd_impl/hamming_avx2.cpp @@ -0,0 +1,14 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifdef COMPILE_SIMD_AVX2 + +#define THE_SIMD_LEVEL SIMDLevel::AVX2 +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include + +#endif // COMPILE_SIMD_AVX2 diff --git a/faiss/utils/simd_impl/hamming_impl.h b/faiss/utils/simd_impl/hamming_impl.h new file mode 100644 index 0000000000..35e63d575f --- /dev/null +++ b/faiss/utils/simd_impl/hamming_impl.h @@ -0,0 +1,629 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Shared implementation header for Hamming distance dynamic dispatch. +// Included by per-ISA TUs (hamming_avx2.cpp, hamming_neon.cpp) and by +// hamming.cpp (for the NONE fallback). +// +// THE_SIMD_LEVEL must be defined before including this header. +// +// INCLUDE ORDERING IS LOAD-BEARING: hamdis-inl.h must be included before +// hamming.h so that the ISA-specific struct definitions (selected by +// __AVX2__ / __aarch64__ / etc.) are established before hamming.h's own +// #include of hamdis-inl.h becomes a no-op via include guards. + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "Define THE_SIMD_LEVEL before including hamming_impl.h" +#endif + +// ISA-specific struct definitions — MUST come first (see comment above). +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace faiss { + +// Alias for readability in the template specializations below. +constexpr SIMDLevel hamming_impl_SL = THE_SIMD_LEVEL; + +// All implementation templates live in an anonymous namespace for ODR safety. +// Different TUs compile this header with different HammingComputer struct +// layouts (generic vs AVX2 vs NEON). The anonymous namespace ensures each +// TU gets its own copy with internal linkage, preventing the linker from +// merging incompatible instantiations. +namespace { + +/****************************************************************** + * Bit-level Hamming distance templates + ******************************************************************/ + +template +void hammings_impl( + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, + size_t n1, + size_t n2, + hamdis_t* __restrict dis) { + size_t i, j; + const size_t nwords = nbits / 64; + for (i = 0; i < n1; i++) { + const uint64_t* __restrict bs1_ = bs1 + i * nwords; + hamdis_t* __restrict dis_ = dis + i * n2; + for (j = 0; j < n2; j++) { + dis_[j] = hamming(bs1_, bs2 + j * nwords); + } + } +} + +void hammings_impl_runtime( + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, + size_t n1, + size_t n2, + size_t nbits, + hamdis_t* __restrict dis) { + size_t i, j; + const size_t nwords = nbits / 64; + for (i = 0; i < n1; i++) { + const uint64_t* __restrict bs1_ = bs1 + i * nwords; + hamdis_t* __restrict dis_ = dis + i * n2; + for (j = 0; j < n2; j++) { + dis_[j] = hamming(bs1_, bs2 + j * nwords, nwords); + } + } +} + +template +void hamming_count_thres_impl( + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t* __restrict nptr) { + const size_t nwords = nbits / 64; + size_t i, j, posm = 0; + const uint64_t* bs2_ = bs2; + + for (i = 0; i < n1; i++) { + bs2 = bs2_; + for (j = 0; j < n2; j++) { + if (hamming(bs1, bs2) <= ht) { + posm++; + } + bs2 += nwords; + } + bs1 += nwords; + } + *nptr = posm; +} + +template +void crosshamming_count_thres_impl( + const uint64_t* __restrict dbs, + size_t n, + int ht, + size_t* __restrict nptr) { + const size_t nwords = nbits / 64; + size_t i, j, posm = 0; + const uint64_t* bs1 = dbs; + for (i = 0; i < n; i++) { + const uint64_t* bs2 = bs1 + 2; + for (j = i + 1; j < n; j++) { + if (hamming(bs1, bs2) <= ht) { + posm++; + } + bs2 += nwords; + } + bs1 += nwords; + } + *nptr = posm; +} + +template +size_t match_hamming_thres_impl( + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, + size_t n1, + size_t n2, + int ht, + int64_t* __restrict idx, + hamdis_t* __restrict hams) { + const size_t nwords = nbits / 64; + size_t i, j, posm = 0; + hamdis_t h; + const uint64_t* bs2_ = bs2; + for (i = 0; i < n1; i++) { + bs2 = bs2_; + for (j = 0; j < n2; j++) { + h = hamming(bs1, bs2); + if (h <= ht) { + *idx = i; + idx++; + *idx = j; + idx++; + *hams = h; + hams++; + posm++; + } + bs2 += nwords; + } + bs1 += nwords; + } + return posm; +} + +/****************************************************************** + * HammingComputer-based search templates + ******************************************************************/ + +template +void hammings_knn_hc_impl( + int bytes_per_code, + int_maxheap_array_t* __restrict ha, + const uint8_t* __restrict bs1, + const uint8_t* __restrict bs2, + size_t n2, + bool order = true, + bool init_heap = true, + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK, + const faiss::IDSelector* sel = nullptr) { + size_t k = ha->k; + if (init_heap) { + ha->heapify(); + } + + const size_t block_size = hamming_batch_size; + for (size_t j0 = 0; j0 < n2; j0 += block_size) { + const size_t j1 = std::min(j0 + block_size, n2); +#pragma omp parallel for + for (int64_t i = 0; i < static_cast(ha->nh); i++) { + HammingComputer hc(bs1 + i * bytes_per_code, bytes_per_code); + + const uint8_t* __restrict bs2_ = bs2 + j0 * bytes_per_code; + hamdis_t dis; + hamdis_t* __restrict bh_val_ = ha->val + i * k; + int64_t* __restrict bh_ids_ = ha->ids + i * k; + +#define HANDLE_APPROX(NB, BD) \ + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ + FAISS_THROW_IF_NOT_FMT( \ + k <= NB * BD, \ + "The chosen mode (%d) of approximate top-k supports " \ + "up to %d values, but %zd is requested.", \ + (int)(ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD), \ + NB * BD, \ + k); \ + HeapWithBucketsForHamming32< \ + CMax, \ + NB, \ + BD, \ + HammingComputer>:: \ + addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_, sel); \ + break; + + switch (approx_topk_mode) { + HANDLE_APPROX(8, 3) + HANDLE_APPROX(8, 2) + HANDLE_APPROX(16, 2) + HANDLE_APPROX(32, 2) + default: { + for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) { + if (sel && !sel->is_member(j)) { + continue; + } + dis = hc.hamming(bs2_); + if (dis < bh_val_[0]) { + faiss::maxheap_replace_top( + k, bh_val_, bh_ids_, dis, j); + } + } + } break; + } + } + } + if (order) { + ha->reorder(); + } +} + +#undef HANDLE_APPROX + +template +void hammings_knn_mc_impl( + int bytes_per_code, + const uint8_t* __restrict a, + const uint8_t* __restrict b, + size_t na, + size_t nb, + size_t k, + int32_t* __restrict distances, + int64_t* __restrict labels, + const faiss::IDSelector* sel) { + const int nBuckets = bytes_per_code * 8 + 1; + std::vector all_counters(na * nBuckets, 0); + std::unique_ptr all_ids_per_dis(new int64_t[na * nBuckets * k]); + + std::vector> cs; + for (size_t i = 0; i < na; ++i) { + cs.push_back( + HCounterState( + all_counters.data() + i * nBuckets, + all_ids_per_dis.get() + i * nBuckets * k, + a + i * bytes_per_code, + 8 * bytes_per_code, + k)); + } + + const size_t block_size = hamming_batch_size; + for (size_t j0 = 0; j0 < nb; j0 += block_size) { + const size_t j1 = std::min(j0 + block_size, nb); +#pragma omp parallel for + for (int64_t i = 0; i < static_cast(na); ++i) { + for (size_t j = j0; j < j1; ++j) { + if (!sel || sel->is_member(j)) { + cs[i].update_counter(b + j * bytes_per_code, j); + } + } + } + } + + for (size_t i = 0; i < na; ++i) { + HCounterState& csi = cs[i]; + + size_t nres = 0; + for (int b_2 = 0; b_2 < nBuckets && nres < k; b_2++) { + for (int l = 0; l < csi.counters[b_2] && nres < k; l++) { + labels[i * k + nres] = csi.ids_per_dis[b_2 * k + l]; + distances[i * k + nres] = b_2; + nres++; + } + } + while (nres < k) { + labels[i * k + nres] = -1; + distances[i * k + nres] = std::numeric_limits::max(); + ++nres; + } + } +} + +template +void hamming_range_search_impl( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + int radius, + size_t code_size, + RangeSearchResult* res, + const faiss::IDSelector* sel) { +#pragma omp parallel + { + RangeSearchPartialResult pres(res); + +#pragma omp for + for (int64_t i = 0; i < static_cast(na); i++) { + HammingComputer hc(a + i * code_size, code_size); + const uint8_t* yi = b; + RangeQueryResult& qres = pres.new_result(i); + + for (size_t j = 0; j < nb; j++) { + if (!sel || sel->is_member(j)) { + int dis = hc.hamming(yi); + if (dis < radius) { + qres.add(dis, j); + } + } + yi += code_size; + } + } + pres.finalize(); + } +} + +// Consumer structs for dispatch_HammingComputer. +// These MUST be in the anonymous namespace so that each TU's +// dispatch_HammingComputer instantiation uses unique consumer types, +// preventing ODR violations across TUs with different struct layouts. + +struct Run_hammings_knn_hc { + using T = void; + template + void f(Types... args) { + hammings_knn_hc_impl(args...); + } +}; + +struct Run_hammings_knn_mc { + using T = void; + template + void f(Types... args) { + hammings_knn_mc_impl(args...); + } +}; + +struct Run_hamming_range_search { + using T = void; + template + void f(Types... args) { + hamming_range_search_impl(args...); + } +}; + +/****************************************************************** + * Generalized Hamming distances + ******************************************************************/ + +template +void hamming_dis_inner_loop( + const uint8_t* __restrict ca, + const uint8_t* __restrict cb, + size_t nb, + size_t code_size, + int k, + hamdis_t* __restrict bh_val_, + int64_t* __restrict bh_ids_) { + HammingComputer hc(ca, code_size); + + for (size_t j = 0; j < nb; j++) { + int ndiff = hc.hamming(cb); + cb += code_size; + if (ndiff < bh_val_[0]) { + maxheap_replace_top(k, bh_val_, bh_ids_, ndiff, j); + } + } +} + +void generalized_hammings_knn_hc_impl( + int_maxheap_array_t* __restrict ha, + const uint8_t* __restrict a, + const uint8_t* __restrict b, + size_t nb, + size_t code_size, + int ordered) { + int na = ha->nh; + int k = ha->k; + + if (ordered) { + ha->heapify(); + } + +#pragma omp parallel for + for (int i = 0; i < na; i++) { + const uint8_t* __restrict ca = a + i * code_size; + const uint8_t* __restrict cb = b; + + hamdis_t* __restrict bh_val_ = ha->val + i * k; + int64_t* __restrict bh_ids_ = ha->ids + i * k; + + switch (code_size) { + case 8: + hamming_dis_inner_loop( + ca, cb, nb, 8, k, bh_val_, bh_ids_); + break; + case 16: + hamming_dis_inner_loop( + ca, cb, nb, 16, k, bh_val_, bh_ids_); + break; + case 32: + hamming_dis_inner_loop( + ca, cb, nb, 32, k, bh_val_, bh_ids_); + break; + default: + hamming_dis_inner_loop( + ca, cb, nb, code_size, k, bh_val_, bh_ids_); + break; + } + } + + if (ordered) { + ha->reorder(); + } +} + +} // anonymous namespace + +/****************************************************************** + * Entry point template specializations at THE_SIMD_LEVEL + ******************************************************************/ + +#define C64(x) ((uint64_t*)x) + +template <> +void hammings_knn_hc_dispatch( + int_maxheap_array_t* ha, + const uint8_t* a, + const uint8_t* b, + size_t nb, + size_t ncodes, + int ordered, + ApproxTopK_mode_t approx_topk_mode, + const IDSelector* sel) { + Run_hammings_knn_hc r; + dispatch_HammingComputer( + ncodes, + r, + ncodes, + ha, + a, + b, + nb, + ordered, + true, + approx_topk_mode, + sel); +} + +template <> +void hammings_knn_mc_dispatch( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + size_t k, + size_t ncodes, + int32_t* distances, + int64_t* labels, + const IDSelector* sel) { + Run_hammings_knn_mc r; + dispatch_HammingComputer( + ncodes, r, ncodes, a, b, na, nb, k, distances, labels, sel); +} + +template <> +void hamming_range_search_dispatch( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + int radius, + size_t code_size, + RangeSearchResult* result, + const IDSelector* sel) { + Run_hamming_range_search r; + dispatch_HammingComputer( + code_size, r, a, b, na, nb, radius, code_size, result, sel); +} + +template <> +void hammings_dispatch( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + size_t ncodes, + hamdis_t* dis) { + FAISS_THROW_IF_NOT(ncodes % 8 == 0); + switch (ncodes) { + case 8: + hammings_impl<64>(C64(a), C64(b), na, nb, dis); + return; + case 16: + hammings_impl<128>(C64(a), C64(b), na, nb, dis); + return; + case 32: + hammings_impl<256>(C64(a), C64(b), na, nb, dis); + return; + case 64: + hammings_impl<512>(C64(a), C64(b), na, nb, dis); + return; + default: + hammings_impl_runtime(C64(a), C64(b), na, nb, ncodes * 8, dis); + return; + } +} + +template <> +void generalized_hammings_knn_hc_dispatch( + int_maxheap_array_t* ha, + const uint8_t* a, + const uint8_t* b, + size_t nb, + size_t code_size, + int ordered) { + generalized_hammings_knn_hc_impl(ha, a, b, nb, code_size, ordered); +} + +template <> +void hamming_count_thres_dispatch( + const uint8_t* bs1, + const uint8_t* bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t ncodes, + size_t* nptr) { + switch (ncodes) { + case 8: + hamming_count_thres_impl<64>(C64(bs1), C64(bs2), n1, n2, ht, nptr); + return; + case 16: + hamming_count_thres_impl<128>(C64(bs1), C64(bs2), n1, n2, ht, nptr); + return; + case 32: + hamming_count_thres_impl<256>(C64(bs1), C64(bs2), n1, n2, ht, nptr); + return; + case 64: + hamming_count_thres_impl<512>(C64(bs1), C64(bs2), n1, n2, ht, nptr); + return; + default: + FAISS_THROW_FMT("not implemented for %zu bits", ncodes); + } +} + +template <> +void crosshamming_count_thres_dispatch( + const uint8_t* dbs, + size_t n, + hamdis_t ht, + size_t ncodes, + size_t* nptr) { + switch (ncodes) { + case 8: + crosshamming_count_thres_impl<64>(C64(dbs), n, ht, nptr); + return; + case 16: + crosshamming_count_thres_impl<128>(C64(dbs), n, ht, nptr); + return; + case 32: + crosshamming_count_thres_impl<256>(C64(dbs), n, ht, nptr); + return; + case 64: + crosshamming_count_thres_impl<512>(C64(dbs), n, ht, nptr); + return; + default: + FAISS_THROW_FMT("not implemented for %zu bits", ncodes); + } +} + +template <> +size_t match_hamming_thres_dispatch( + const uint8_t* bs1, + const uint8_t* bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t ncodes, + int64_t* idx, + hamdis_t* dis) { + switch (ncodes) { + case 8: + return match_hamming_thres_impl<64>( + C64(bs1), C64(bs2), n1, n2, ht, idx, dis); + case 16: + return match_hamming_thres_impl<128>( + C64(bs1), C64(bs2), n1, n2, ht, idx, dis); + case 32: + return match_hamming_thres_impl<256>( + C64(bs1), C64(bs2), n1, n2, ht, idx, dis); + case 64: + return match_hamming_thres_impl<512>( + C64(bs1), C64(bs2), n1, n2, ht, idx, dis); + default: + FAISS_THROW_FMT("not implemented for %zu bits", ncodes); + return 0; + } +} + +#undef C64 + +} // namespace faiss diff --git a/faiss/utils/simd_impl/hamming_neon.cpp b/faiss/utils/simd_impl/hamming_neon.cpp new file mode 100644 index 0000000000..6c72618028 --- /dev/null +++ b/faiss/utils/simd_impl/hamming_neon.cpp @@ -0,0 +1,14 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifdef COMPILE_SIMD_ARM_NEON + +#define THE_SIMD_LEVEL SIMDLevel::ARM_NEON +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include + +#endif // COMPILE_SIMD_ARM_NEON