diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt index cf870cc159..c5c320257e 100644 --- a/faiss/CMakeLists.txt +++ b/faiss/CMakeLists.txt @@ -14,7 +14,9 @@ set(FAISS_SIMD_AVX2_SRC impl/pq_code_distance/pq_code_distance-avx2.cpp impl/scalar_quantizer/sq-avx2.cpp impl/approx_topk/avx2.cpp + impl/binary_hamming/avx2.cpp utils/simd_impl/distances_avx2.cpp + utils/simd_impl/hamming_avx2.cpp utils/simd_impl/partitioning_avx2.cpp utils/distances_fused/simdlib_based.cpp utils/simd_impl/rabitq_avx2.cpp @@ -32,7 +34,9 @@ set(FAISS_SIMD_NEON_SRC impl/fast_scan/impl-neon.cpp impl/scalar_quantizer/sq-neon.cpp impl/approx_topk/neon.cpp + impl/binary_hamming/neon.cpp utils/simd_impl/distances_aarch64.cpp + utils/simd_impl/hamming_neon.cpp utils/simd_impl/partitioning_neon.cpp utils/distances_fused/simdlib_based_neon.cpp utils/simd_impl/rabitq_neon.cpp @@ -342,6 +346,7 @@ set(FAISS_HEADERS utils/hamming_distance/avx512-inl.h utils/simd_impl/distances_autovec-inl.h utils/simd_impl/distances_simdlib256.h + utils/simd_impl/hamming_impl.h utils/simd_impl/exhaustive_L2sqr_blas_cmax.h utils/simd_impl/IVFFlatScanner-inl.h utils/simd_impl/distances_sse-inl.h diff --git a/faiss/IndexBinaryHNSW.cpp b/faiss/IndexBinaryHNSW.cpp index 21f10e53f3..80925e6c75 100644 --- a/faiss/IndexBinaryHNSW.cpp +++ b/faiss/IndexBinaryHNSW.cpp @@ -27,8 +27,16 @@ #include #include +#include + #include +// Scalar (NONE) fallback for dynamic dispatch +#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { /************************************************************** @@ -280,50 +288,15 @@ void IndexBinaryHNSW::reconstruct(idx_t key, uint8_t* recons) const { storage->reconstruct(key, recons); } -namespace { - -template -struct FlatHammingDis : DistanceComputer { - const int code_size; - const uint8_t* b; - HammingComputer hc; - - float operator()(idx_t i) override { - return hc.hamming(b + i * code_size); - } - - float 
symmetric_dis(idx_t i, idx_t j) override { - return HammingComputerDefault(b + j * code_size, code_size) - .hamming(b + i * code_size); - } - - explicit FlatHammingDis(const IndexBinaryFlat& storage) - : code_size(storage.code_size), b(storage.xb.data()), hc() {} - - // NOTE: Pointers are cast from float in order to reuse the floating-point - // DistanceComputer. - void set_query(const float* x) override { - hc.set((uint8_t*)x, code_size); - } -}; - -struct BuildDistanceComputer { - using T = DistanceComputer*; - template - DistanceComputer* f(IndexBinaryFlat* flat_storage) { - return new FlatHammingDis(*flat_storage); - } -}; - -} // namespace - DistanceComputer* IndexBinaryHNSW::get_distance_computer() const { IndexBinaryFlat* flat_storage = dynamic_cast(storage); FAISS_THROW_IF_NOT_MSG( flat_storage != nullptr, "IndexBinaryHNSW requires IndexBinaryFlat storage"); - BuildDistanceComputer bd; - return dispatch_HammingComputer(code_size, bd, flat_storage); + return with_simd_level_256bit([&]() { + return make_binary_hnsw_distance_computer_dispatch( + code_size, flat_storage); + }); } /************************************************************** diff --git a/faiss/IndexBinaryHash.cpp b/faiss/IndexBinaryHash.cpp index b8c0092eae..8a253a3a93 100644 --- a/faiss/IndexBinaryHash.cpp +++ b/faiss/IndexBinaryHash.cpp @@ -20,6 +20,14 @@ #include #include +#include + +// Scalar (NONE) fallback for dynamic dispatch +#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { void IndexBinaryHash::InvertedList::add( @@ -64,139 +72,8 @@ void IndexBinaryHash::add_with_ids( ntotal += n; } -namespace { - -/** Enumerate all bit vectors of size nbit with up to maxflip 1s - * test in P127257851 P127258235 - */ -struct FlipEnumerator { - int nbit, nflip, maxflip; - uint64_t mask, x; - - FlipEnumerator(int nbit_, int maxflip_) : nbit(nbit_), maxflip(maxflip_) { - nflip = 0; - mask = 0; - x = 0; - } - - bool 
next() { - if (x == mask) { - if (nflip == maxflip) { - return false; - } - // increase Hamming radius - nflip++; - mask = (((uint64_t)1 << nflip) - 1); - x = mask << (nbit - nflip); - return true; - } - - int i = __builtin_ctzll(x); - - if (i > 0) { - x ^= (uint64_t)3 << (i - 1); - } else { - // nb of LSB 1s - int n1 = __builtin_ctzll(~x); - // clear them - x &= ((uint64_t)(-1) << n1); - int n2 = __builtin_ctzll(x); - x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1); - } - return true; - } -}; - -struct RangeSearchResults { - int radius; - RangeQueryResult& qres; - - inline void add(float dis, idx_t id) { - if (dis < radius) { - qres.add(dis, id); - } - } -}; - -struct KnnSearchResults { - // heap params - idx_t k; - int32_t* heap_sim; - idx_t* heap_ids; - - using C = CMax; - - inline void add(float dis, idx_t id) { - if (dis < heap_sim[0]) { - heap_replace_top(k, heap_sim, heap_ids, dis, id); - } - } -}; - -template -void search_single_query_template( - const IndexBinaryHash& index, - const uint8_t* q, - SearchResults& res, - size_t& n0, - size_t& nlist, - size_t& ndis) { - size_t code_size = index.code_size; - BitstringReader br(q, code_size); - uint64_t qhash = br.read(index.b); - HammingComputer hc(q, code_size); - FlipEnumerator fe(index.b, index.nflip); - - // loop over neighbors that are at most at nflip bits - do { - uint64_t hash = qhash ^ fe.x; - auto it = index.invlists.find(hash); - - if (it == index.invlists.end()) { - continue; - } - - const IndexBinaryHash::InvertedList& il = it->second; - - size_t nv = il.ids.size(); - - if (nv == 0) { - n0++; - } else { - const uint8_t* codes = il.vecs.data(); - for (size_t i = 0; i < nv; i++) { - int dis = hc.hamming(codes); - res.add(dis, il.ids[i]); - codes += code_size; - } - ndis += nv; - nlist++; - } - } while (fe.next()); -} - -struct Run_search_single_query { - using T = void; - template - T f(Types*... 
args) { - search_single_query_template(*args...); - } -}; - -template -void search_single_query( - const IndexBinaryHash& index, - const uint8_t* q, - SearchResults& res, - size_t& n0, - size_t& nlist, - size_t& ndis) { - Run_search_single_query r; - dispatch_HammingComputer( - index.code_size, r, &index, &q, &res, &n0, &nlist, &ndis); -} - -} // anonymous namespace +// search_single_query_template and helpers are now in +// impl/binary_hamming/IndexBinaryHash_impl.h (compiled per-ISA) void IndexBinaryHash::range_search( idx_t n, @@ -215,10 +92,12 @@ void IndexBinaryHash::range_search( #pragma omp for for (idx_t i = 0; i < n; i++) { // loop queries RangeQueryResult& qres = pres.new_result(i); - RangeSearchResults res = {radius, qres}; const uint8_t* q = x + i * code_size; - search_single_query(*this, q, res, n0, nlist, ndis); + with_simd_level_256bit([&]() { + binary_hash_range_search_dispatch( + *this, q, radius, qres, n0, nlist, ndis); + }); } pres.finalize(); } @@ -248,10 +127,12 @@ void IndexBinaryHash::search( idx_t* idxi = labels + k * i; heap_heapify(k, simi, idxi); - KnnSearchResults res = {k, simi, idxi}; const uint8_t* q = x + i * code_size; - search_single_query(*this, q, res, n0, nlist, ndis); + with_simd_level_256bit([&]() { + binary_hash_knn_search_dispatch( + *this, q, k, simi, idxi, n0, nlist, ndis); + }); heap_reorder(k, simi, idxi); } @@ -324,75 +205,8 @@ void IndexBinaryMultiHash::add(idx_t n, const uint8_t* x) { ntotal += n; } -namespace { - -template -static void verify_shortlist( - const IndexBinaryFlat* index, - const uint8_t* q, - const std::unordered_set& shortlist, - SearchResults& res) { - size_t code_size = index->code_size; - - HammingComputer hc(q, code_size); - const uint8_t* codes = index->xb.data(); - - for (auto i : shortlist) { - int dis = hc.hamming(codes + i * code_size); - res.add(dis, i); - } -} - -struct Run_verify_shortlist { - using T = void; - template - void f(Types... 
args) { - verify_shortlist(args...); - } -}; - -template -void search_1_query_multihash( - const IndexBinaryMultiHash& index, - const uint8_t* xi, - SearchResults& res, - size_t& n0, - size_t& nlist, - size_t& ndis) { - std::unordered_set shortlist; - int b = index.b; - - BitstringReader br(xi, index.code_size); - for (int h = 0; h < index.nhash; h++) { - uint64_t qhash = br.read(b); - const IndexBinaryMultiHash::Map& map = index.maps[h]; - - FlipEnumerator fe(index.b, index.nflip); - // loop over neighbors that are at most at nflip bits - do { - uint64_t hash = qhash ^ fe.x; - auto it = map.find(hash); - - if (it != map.end()) { - const std::vector& v = it->second; - for (auto i : v) { - shortlist.insert(i); - } - nlist++; - } else { - n0++; - } - } while (fe.next()); - } - ndis += shortlist.size(); - - // verify shortlist - Run_verify_shortlist r; - dispatch_HammingComputer( - index.code_size, r, index.storage, xi, shortlist, res); -} - -} // anonymous namespace +// verify_shortlist and search_1_query_multihash are now in +// impl/binary_hamming/IndexBinaryHash_impl.h (compiled per-ISA) void IndexBinaryMultiHash::range_search( idx_t n, @@ -411,10 +225,12 @@ void IndexBinaryMultiHash::range_search( #pragma omp for for (idx_t i = 0; i < n; i++) { // loop queries RangeQueryResult& qres = pres.new_result(i); - RangeSearchResults res = {radius, qres}; const uint8_t* q = x + i * code_size; - search_1_query_multihash(*this, q, res, n0, nlist, ndis); + with_simd_level_256bit([&]() { + binary_multihash_range_search_dispatch( + *this, q, radius, qres, n0, nlist, ndis); + }); } pres.finalize(); } @@ -444,10 +260,12 @@ void IndexBinaryMultiHash::search( idx_t* idxi = labels + k * i; heap_heapify(k, simi, idxi); - KnnSearchResults res = {k, simi, idxi}; const uint8_t* q = x + i * code_size; - search_1_query_multihash(*this, q, res, n0, nlist, ndis); + with_simd_level_256bit([&]() { + binary_multihash_knn_search_dispatch( + *this, q, k, simi, idxi, n0, nlist, ndis); + }); 
heap_reorder(k, simi, idxi); } diff --git a/faiss/IndexBinaryIVF.cpp b/faiss/IndexBinaryIVF.cpp index 90c5ecc301..c53cdb3dc1 100644 --- a/faiss/IndexBinaryIVF.cpp +++ b/faiss/IndexBinaryIVF.cpp @@ -25,6 +25,14 @@ #include #include +#include + +// Scalar (NONE) fallback for dynamic dispatch +#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { IndexBinaryIVF::IndexBinaryIVF( @@ -339,68 +347,11 @@ void IndexBinaryIVF::replace_invlists(InvertedLists* il, bool own) { own_invlists = own; } -namespace { - -template -struct IVFBinaryScannerL2 : BinaryInvertedListScanner { - HammingComputer hc; - size_t code_size; - bool store_pairs; - - IVFBinaryScannerL2(size_t code_size_, bool store_pairs_) - : code_size(code_size_), store_pairs(store_pairs_) {} - - void set_query(const uint8_t* query_vector) override { - hc.set(query_vector, code_size); - } - - idx_t list_no; - void set_list(idx_t list_no_2, uint8_t /* coarse_dis */) override { - this->list_no = list_no_2; - } - - uint32_t distance_to_code(const uint8_t* code) const override { - return hc.hamming(code); - } - - size_t scan_codes( - size_t n, - const uint8_t* __restrict codes, - const idx_t* __restrict ids, - int32_t* __restrict simi, - idx_t* __restrict idxi, - size_t k) const override { - using C = CMax; - - size_t nup = 0; - for (size_t j = 0; j < n; j++) { - uint32_t dis = hc.hamming(codes); - if (dis < static_cast(simi[0])) { - idx_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; - heap_replace_top(k, simi, idxi, dis, id); - nup++; - } - codes += code_size; - } - return nup; - } +// IVFBinaryScannerL2, search_knn_hamming_count, BlockSearch, +// BlockSearchVariableK, search_knn_hamming_per_invlist are now in +// impl/binary_hamming/IndexBinaryIVF_impl.h (compiled per-ISA) - void scan_codes_range( - size_t n, - const uint8_t* __restrict codes, - const idx_t* __restrict ids, - int radius, - RangeQueryResult& result) const override { - for (size_t j = 0; j < n; j++) { - uint32_t dis = hc.hamming(codes); - if (dis < static_cast(radius)) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - result.add(dis, id); - } - codes += code_size; - } - } -}; +namespace { void search_knn_hamming_heap( const IndexBinaryIVF* ivf, @@ -507,317 +458,17 @@ void search_knn_hamming_heap( indexIVF_stats.nheap_updates += nheap; } -template -void search_knn_hamming_count( - const IndexBinaryIVF* ivf, - size_t nx, - const uint8_t* __restrict x, - const idx_t* __restrict keys, - int k, - int32_t* __restrict distances, - idx_t* __restrict labels, - const IVFSearchParameters* params) { - const int nBuckets = ivf->d + 1; - std::vector all_counters(nx * nBuckets, 0); - std::unique_ptr all_ids_per_dis(new idx_t[nx * nBuckets * k]); - - idx_t nprobe = params ? params->nprobe : ivf->nprobe; - nprobe = std::min((idx_t)ivf->nlist, nprobe); - idx_t max_codes = params ? 
params->max_codes : ivf->max_codes; - - std::vector> cs; - for (size_t i = 0; i < nx; ++i) { - cs.push_back( - HCounterState( - all_counters.data() + i * nBuckets, - all_ids_per_dis.get() + i * nBuckets * k, - x + i * ivf->code_size, - ivf->d, - k)); - } - - size_t nlistv = 0, ndis = 0; - -#pragma omp parallel for reduction(+ : nlistv, ndis) - for (int64_t i = 0; i < static_cast(nx); i++) { - const idx_t* keysi = keys + i * nprobe; - HCounterState& csi = cs[i]; - - size_t nscan = 0; - - for (idx_t ik = 0; ik < nprobe; ik++) { - idx_t key = keysi[ik]; /* select the list */ - if (key < 0) { - // not enough centroids for multiprobe - continue; - } - FAISS_THROW_IF_NOT_FMT( - key < (idx_t)ivf->nlist, - "Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n", - key, - static_cast(ik), - ivf->nlist); - - nlistv++; - size_t list_size = ivf->invlists->list_size(key); - size_t list_size_max = static_cast(max_codes) - nscan; - if (list_size > list_size_max) { - list_size = list_size_max; - } - InvertedLists::ScopedCodes scodes(ivf->invlists, key); - const uint8_t* list_vecs = scodes.get(); - const idx_t* ids = - store_pairs ? nullptr : ivf->invlists->get_ids(key); - - for (size_t j = 0; j < list_size; j++) { - const uint8_t* yj = list_vecs + ivf->code_size * j; - - idx_t id = store_pairs ? 
(key << 32 | j) : ids[j]; - csi.update_counter(yj, id); - } - if (ids) { - ivf->invlists->release_ids(key, ids); - } - - nscan += list_size; - if (nscan >= static_cast(max_codes)) { - break; - } - } - ndis += nscan; - - int nres = 0; - for (int b = 0; b < nBuckets && nres < k; b++) { - for (int l = 0; l < csi.counters[b] && nres < k; l++) { - labels[i * k + nres] = csi.ids_per_dis[b * k + l]; - distances[i * k + nres] = b; - nres++; - } - } - while (nres < k) { - labels[i * k + nres] = -1; - distances[i * k + nres] = std::numeric_limits::max(); - ++nres; - } - } - - indexIVF_stats.nq += nx; - indexIVF_stats.nlist += nlistv; - indexIVF_stats.ndis += ndis; -} - -/* Manages NQ queries at a time, stores results */ -template -struct BlockSearch { - HammingComputer hcs[NQ]; - // heaps to update for each query - int32_t* distances[NQ]; - idx_t* labels[NQ]; - // curent top of heap - int32_t heap_tops[NQ]; - - BlockSearch( - size_t code_size, - const uint8_t* __restrict x, - const int32_t* __restrict keys, - int32_t* __restrict all_distances, - idx_t* __restrict all_labels) { - for (idx_t q = 0; q < NQ; q++) { - idx_t qno = keys[q]; - hcs[q] = HammingComputer(x + qno * code_size, code_size); - distances[q] = all_distances + qno * K; - labels[q] = all_labels + qno * K; - heap_tops[q] = distances[q][0]; - } - } - - void add_bcode(const uint8_t* bcode, idx_t id) { - using C = CMax; - for (int q = 0; q < NQ; q++) { - int dis = hcs[q].hamming(bcode); - if (dis < heap_tops[q]) { - heap_replace_top(K, distances[q], labels[q], dis, id); - heap_tops[q] = distances[q][0]; - } - } - } -}; - -template -struct BlockSearchVariableK { - int k; - HammingComputer hcs[NQ]; - // heaps to update for each query - int32_t* distances[NQ]; - idx_t* labels[NQ]; - // curent top of heap - int32_t heap_tops[NQ]; - - BlockSearchVariableK( - size_t code_size, - int k_, - const uint8_t* __restrict x, - const int32_t* __restrict keys, - int32_t* __restrict all_distances, - idx_t* __restrict all_labels) - 
: k(k_) { - for (idx_t q = 0; q < NQ; q++) { - idx_t qno = keys[q]; - hcs[q] = HammingComputer(x + qno * code_size, code_size); - distances[q] = all_distances + qno * k; - labels[q] = all_labels + qno * k; - heap_tops[q] = distances[q][0]; - } - } - - void add_bcode(const uint8_t* bcode, idx_t id) { - using C = CMax; - for (int q = 0; q < NQ; q++) { - int dis = hcs[q].hamming(bcode); - if (dis < heap_tops[q]) { - heap_replace_top(k, distances[q], labels[q], dis, id); - heap_tops[q] = distances[q][0]; - } - } - } -}; - -template -void search_knn_hamming_per_invlist( - const IndexBinaryIVF* ivf, - size_t n, - const uint8_t* __restrict x, - idx_t k, - const idx_t* __restrict keys_in, - const int32_t* __restrict /* coarse_dis */, - int32_t* __restrict distances, - idx_t* __restrict labels, - bool store_pairs, - const IVFSearchParameters* params) { - idx_t nprobe = params ? params->nprobe : ivf->nprobe; - nprobe = std::min((idx_t)ivf->nlist, nprobe); - idx_t max_codes = params ? params->max_codes : ivf->max_codes; - FAISS_THROW_IF_NOT(max_codes == 0); - FAISS_THROW_IF_NOT(!store_pairs); - - // reorder buckets - std::vector lims(n + 1); - int32_t* keys = new int32_t[n * nprobe]; - std::unique_ptr delete_keys(keys); - for (size_t i = 0; i < n * static_cast(nprobe); i++) { - keys[i] = keys_in[i]; - } - matrix_bucket_sort_inplace(n, nprobe, keys, ivf->nlist, lims.data(), 0); - - using C = CMax; - heap_heapify(n * k, distances, labels); - const size_t code_size = ivf->code_size; - - for (size_t l = 0; l < ivf->nlist; l++) { - idx_t l0 = lims[l], nq = lims[l + 1] - l0; - - InvertedLists::ScopedCodes scodes(ivf->invlists, l); - InvertedLists::ScopedIds sidx(ivf->invlists, l); - idx_t nb = ivf->invlists->list_size(l); - const uint8_t* bcodes = scodes.get(); - const idx_t* ids = sidx.get(); - - idx_t i = 0; - - // process as much as possible by blocks - constexpr int BS = 4; - - if (k == 1) { - for (; i + BS <= nq; i += BS) { - BlockSearch bc( - code_size, x, keys + l0 + i, 
distances, labels); - for (idx_t j = 0; j < nb; j++) { - bc.add_bcode(bcodes + j * code_size, ids[j]); - } - } - } else if (k == 2) { - for (; i + BS <= nq; i += BS) { - BlockSearch bc( - code_size, x, keys + l0 + i, distances, labels); - for (idx_t j = 0; j < nb; j++) { - bc.add_bcode(bcodes + j * code_size, ids[j]); - } - } - } else if (k == 4) { - for (; i + BS <= nq; i += BS) { - BlockSearch bc( - code_size, x, keys + l0 + i, distances, labels); - for (idx_t j = 0; j < nb; j++) { - bc.add_bcode(bcodes + j * code_size, ids[j]); - } - } - } else { - for (; i + BS <= nq; i += BS) { - BlockSearchVariableK bc( - code_size, k, x, keys + l0 + i, distances, labels); - for (idx_t j = 0; j < nb; j++) { - bc.add_bcode(bcodes + j * code_size, ids[j]); - } - } - } - - // leftovers - for (; i < nq; i++) { - idx_t qno = keys[l0 + i]; - HammingComputer hc(x + qno * code_size, code_size); - idx_t* __restrict idxi = labels + qno * k; - int32_t* __restrict simi = distances + qno * k; - int32_t simi0 = simi[0]; - for (idx_t j = 0; j < nb; j++) { - int dis = hc.hamming(bcodes + j * code_size); - - if (dis < simi0) { - idx_t id = store_pairs ? lo_build(l, j) : ids[j]; - heap_replace_top(k, simi, idxi, dis, id); - simi0 = simi[0]; - } - } - } - } - for (size_t i = 0; i < n; i++) { - heap_reorder(k, distances + i * k, labels + i * k); - } -} - -struct Run_search_knn_hamming_per_invlist { - using T = void; - - template - void f(Types... args) { - search_knn_hamming_per_invlist(args...); - } -}; - -template -struct Run_search_knn_hamming_count { - using T = void; - - template - void f(Types... args) { - search_knn_hamming_count(args...); - } -}; - -struct BuildScanner { - using T = BinaryInvertedListScanner*; - - template - T f(size_t code_size, bool store_pairs) { - return new IVFBinaryScannerL2(code_size, store_pairs); - } -}; - } // anonymous namespace +// The remaining template code (search_knn_hamming_count, +// search_knn_hamming_per_invlist, etc.) 
has been moved to +// impl/binary_hamming/IndexBinaryIVF_impl.h + BinaryInvertedListScanner* IndexBinaryIVF::get_InvertedListScanner( bool store_pairs) const { - BuildScanner bs; - return dispatch_HammingComputer(code_size, bs, code_size, store_pairs); + return with_simd_level_256bit([&]() { + return make_binary_ivf_scanner_dispatch(code_size, store_pairs); + }); } void IndexBinaryIVF::search_preassigned( @@ -831,23 +482,37 @@ void IndexBinaryIVF::search_preassigned( bool store_pairs, const IVFSearchParameters* params) const { if (per_invlist_search) { - Run_search_knn_hamming_per_invlist r; - // clang-format off - dispatch_HammingComputer( - code_size, r, this, n, x, k, - cidx, cdis, dis, idx, store_pairs, params); - // clang-format on + with_simd_level_256bit([&]() { + search_knn_hamming_per_invlist_dispatch( + code_size, + this, + n, + x, + k, + cidx, + cdis, + dis, + idx, + store_pairs, + params); + }); } else if (use_heap) { search_knn_hamming_heap( this, n, x, k, cidx, cdis, dis, idx, store_pairs, params); - } else if (store_pairs) { // !use_heap && store_pairs - Run_search_knn_hamming_count r; - dispatch_HammingComputer( - code_size, r, this, n, x, cidx, k, dis, idx, params); - } else { // !use_heap && !store_pairs - Run_search_knn_hamming_count r; - dispatch_HammingComputer( - code_size, r, this, n, x, cidx, k, dis, idx, params); + } else { + with_simd_level_256bit([&]() { + search_knn_hamming_count_dispatch( + code_size, + store_pairs, + this, + n, + x, + cidx, + k, + dis, + idx, + params); + }); } } diff --git a/faiss/IndexIVFSpectralHash.cpp b/faiss/IndexIVFSpectralHash.cpp index f0e53d36ea..22f97e1a5f 100644 --- a/faiss/IndexIVFSpectralHash.cpp +++ b/faiss/IndexIVFSpectralHash.cpp @@ -20,6 +20,14 @@ #include #include +#include + +// Scalar (NONE) fallback for dynamic dispatch +#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { 
IndexIVFSpectralHash::IndexIVFSpectralHash( @@ -207,115 +215,15 @@ void IndexIVFSpectralHash::encode_vectors( } } -namespace { - -template -struct IVFScanner : InvertedListScanner { - using InvertedListScanner::scan_codes; - // copied from index structure - const IndexIVFSpectralHash* index; - size_t nbit; - - float period, freq; - std::vector q; - std::vector zero; - std::vector qcode; - HammingComputer hc; - - IVFScanner(const IndexIVFSpectralHash* index_in, bool store_pairs_in) - : index(index_in), - nbit(index_in->nbit), - period(index_in->period), - freq(2.0 / index_in->period), - q(nbit), - zero(nbit), - qcode(index_in->code_size), - hc(qcode.data(), index_in->code_size) { - this->store_pairs = store_pairs_in; - this->code_size = index->code_size; - this->keep_max = is_similarity_metric(index->metric_type); - } - - void set_query(const float* query) override { - FAISS_THROW_IF_NOT(query); - FAISS_THROW_IF_NOT(q.size() == static_cast(nbit)); - index->vt->apply_noalloc(1, query, q.data()); - - if (index->threshold_type == IndexIVFSpectralHash::Thresh_global) { - binarize_with_freq(nbit, freq, q.data(), zero.data(), qcode.data()); - hc.set(qcode.data(), code_size); - } - } - - void set_list(idx_t list_no_in, float /*coarse_dis*/) override { - this->list_no = list_no_in; - if (index->threshold_type != IndexIVFSpectralHash::Thresh_global) { - const float* c = index->trained.data() + list_no_in * nbit; - binarize_with_freq(nbit, freq, q.data(), c, qcode.data()); - hc.set(qcode.data(), code_size); - } - } - - float distance_to_code(const uint8_t* code) const final { - return hc.hamming(code); - } - - size_t scan_codes( - size_t list_size, - const uint8_t* codes, - const idx_t* ids, - float* simi, - idx_t* idxi, - size_t k) const override { - size_t nup = 0; - for (size_t j = 0; j < list_size; j++) { - float dis = hc.hamming(codes); - - if (dis < simi[0]) { - int64_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; - maxheap_replace_top(k, simi, idxi, dis, id); - nup++; - } - codes += code_size; - } - return nup; - } - - void scan_codes_range( - size_t list_size, - const uint8_t* codes, - const idx_t* ids, - float radius, - RangeQueryResult& res) const override { - for (size_t j = 0; j < list_size; j++) { - float dis = hc.hamming(codes); - if (dis < radius) { - int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; - res.add(dis, id); - } - codes += code_size; - } - } -}; - -struct BuildScanner { - using T = InvertedListScanner*; - - template - static T f(const IndexIVFSpectralHash* index, bool store_pairs) { - return new IVFScanner(index, store_pairs); - } -}; - -} // anonymous namespace - InvertedListScanner* IndexIVFSpectralHash::get_InvertedListScanner( bool store_pairs, const IDSelector* sel, const IVFSearchParameters*) const { FAISS_THROW_IF_NOT(!sel); - BuildScanner bs; - return dispatch_HammingComputer(code_size, bs, this, store_pairs); + return with_simd_level_256bit([&]() { + return make_spectral_hash_scanner_dispatch( + code_size, this, store_pairs); + }); } void IndexIVFSpectralHash::replace_vt(VectorTransform* vt_in, bool own) { diff --git a/faiss/IndexPQ.cpp b/faiss/IndexPQ.cpp index a6a5ed4634..b97f924d2c 100644 --- a/faiss/IndexPQ.cpp +++ b/faiss/IndexPQ.cpp @@ -22,6 +22,12 @@ #include #include +// Scalar (NONE) fallback for dynamic dispatch +#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { /********************************************************* @@ -275,59 +281,8 @@ void IndexPQStats::reset() { IndexPQStats indexPQ_stats; -namespace { - -template -size_t polysemous_inner_loop( - const IndexPQ* index, - const float* dis_table_qi, - const uint8_t* q_code, - size_t k, - float* heap_dis, - int64_t* heap_ids, - int ht) { - size_t M = index->pq.M; - size_t code_size = index->pq.code_size; - size_t ksub = index->pq.ksub; - size_t ntotal = 
index->ntotal; - - const uint8_t* b_code = index->codes.data(); - - size_t n_pass_i = 0; - - HammingComputer hc(q_code, code_size); - - for (int64_t bi = 0; bi < static_cast(ntotal); bi++) { - int hd = hc.hamming(b_code); - - if (hd < ht) { - n_pass_i++; - - float dis = 0; - const float* dis_table = dis_table_qi; - for (size_t m = 0; m < M; m++) { - dis += dis_table[b_code[m]]; - dis_table += ksub; - } - - if (dis < heap_dis[0]) { - maxheap_replace_top(k, heap_dis, heap_ids, dis, bi); - } - } - b_code += code_size; - } - return n_pass_i; -} - -struct Run_polysemous_inner_loop { - using T = size_t; - template - size_t f(Types... args) { - return polysemous_inner_loop(args...); - } -}; - -} // anonymous namespace +// polysemous_inner_loop template code is now in +// impl/binary_hamming/IndexPQ_impl.h (compiled per-ISA) void IndexPQ::search_core_polysemous( idx_t n, @@ -377,17 +332,17 @@ void IndexPQ::search_core_polysemous( maxheap_heapify(k, heap_dis, heap_ids); if (!generalized_hamming) { - Run_polysemous_inner_loop r; - n_pass += dispatch_HammingComputer( - pq.code_size, - r, - this, - dis_table_qi, - q_code, - k, - heap_dis, - heap_ids, - param_polysemous_ht); + n_pass += with_simd_level_256bit([&]() { + return polysemous_inner_loop_dispatch( + pq.code_size, + this, + dis_table_qi, + q_code, + k, + heap_dis, + heap_ids, + param_polysemous_ht); + }); } else { // generalized hamming switch (pq.code_size) { diff --git a/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h b/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h new file mode 100644 index 0000000000..be16cd0733 --- /dev/null +++ b/faiss/impl/binary_hamming/IndexBinaryHNSW_impl.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Per-ISA implementation of Hamming distance computation for + * IndexBinaryHNSW. 
Included once per SIMD TU with THE_SIMD_LEVEL + * set to the desired SIMDLevel. + */ + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "THE_SIMD_LEVEL must be defined before including this file" +#endif + +// hamdis-inl.h MUST be included first — the ISA ladder checks __AVX2__ etc. +// which are set by per-ISA TU compilation flags. +#include + +#include +#include +#include +#include + +namespace faiss { + +namespace { + +template +struct FlatHammingDis : DistanceComputer { + const int code_size; + const uint8_t* b; + HammingComputer hc; + + float operator()(idx_t i) override { + return hc.hamming(b + i * code_size); + } + + float symmetric_dis(idx_t i, idx_t j) override { + return HammingComputerDefault(b + j * code_size, code_size) + .hamming(b + i * code_size); + } + + explicit FlatHammingDis(const IndexBinaryFlat& storage) + : code_size(storage.code_size), b(storage.xb.data()), hc() {} + + // NOTE: Pointers are cast from float in order to reuse the floating-point + // DistanceComputer. + void set_query(const float* x) override { + hc.set((uint8_t*)x, code_size); + } +}; + +struct BuildDistanceComputer { + using T = DistanceComputer*; + template + DistanceComputer* f(IndexBinaryFlat* flat_storage) { + return new FlatHammingDis(*flat_storage); + } +}; + +} // anonymous namespace + +template <> +DistanceComputer* make_binary_hnsw_distance_computer_dispatch( + int code_size, + IndexBinaryFlat* flat_storage) { + BuildDistanceComputer bd; + return dispatch_HammingComputer(code_size, bd, flat_storage); +} + +} // namespace faiss diff --git a/faiss/impl/binary_hamming/IndexBinaryHash_impl.h b/faiss/impl/binary_hamming/IndexBinaryHash_impl.h new file mode 100644 index 0000000000..330ed9b104 --- /dev/null +++ b/faiss/impl/binary_hamming/IndexBinaryHash_impl.h @@ -0,0 +1,280 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +/* + * Per-ISA implementation of Hamming distance computation for + * IndexBinaryHash and IndexBinaryMultiHash. Included once per SIMD TU + * with THE_SIMD_LEVEL set to the desired SIMDLevel. + */ + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "THE_SIMD_LEVEL must be defined before including this file" +#endif + +#include + +#include + +#include +#include +#include +#include +#include +#include + +namespace faiss { + +namespace { + +/** Enumerate all bit vectors of size nbit with up to maxflip 1s */ +struct FlipEnumerator { + int nbit, nflip, maxflip; + uint64_t mask, x; + + FlipEnumerator(int nbit_, int maxflip_) : nbit(nbit_), maxflip(maxflip_) { + nflip = 0; + mask = 0; + x = 0; + } + + bool next() { + if (x == mask) { + if (nflip == maxflip) { + return false; + } + // increase Hamming radius + nflip++; + mask = (((uint64_t)1 << nflip) - 1); + x = mask << (nbit - nflip); + return true; + } + + int i = __builtin_ctzll(x); + + if (i > 0) { + x ^= (uint64_t)3 << (i - 1); + } else { + // nb of LSB 1s + int n1 = __builtin_ctzll(~x); + // clear them + x &= ((uint64_t)(-1) << n1); + int n2 = __builtin_ctzll(x); + x ^= (((uint64_t)1 << (n1 + 2)) - 1) << (n2 - n1 - 1); + } + return true; + } +}; + +struct RangeSearchResults { + int radius; + RangeQueryResult& qres; + + inline void add(float dis, idx_t id) { + if (dis < radius) { + qres.add(dis, id); + } + } +}; + +struct KnnSearchResults { + // heap params + idx_t k; + int32_t* heap_sim; + idx_t* heap_ids; + + using C = CMax; + + inline void add(float dis, idx_t id) { + if (dis < heap_sim[0]) { + heap_replace_top(k, heap_sim, heap_ids, dis, id); + } + } +}; + +template +void search_single_query_template( + const IndexBinaryHash& index, + const uint8_t* q, + SearchResults& res, + size_t& n0, + size_t& nlist, + size_t& ndis) { + size_t code_size = index.code_size; + BitstringReader br(q, code_size); + uint64_t qhash = br.read(index.b); + HammingComputer hc(q, code_size); + FlipEnumerator fe(index.b, 
index.nflip); + + // loop over neighbors that are at most at nflip bits + do { + uint64_t hash = qhash ^ fe.x; + auto it = index.invlists.find(hash); + + if (it == index.invlists.end()) { + continue; + } + + const IndexBinaryHash::InvertedList& il = it->second; + + size_t nv = il.ids.size(); + + if (nv == 0) { + n0++; + } else { + const uint8_t* codes = il.vecs.data(); + for (size_t i = 0; i < nv; i++) { + int dis = hc.hamming(codes); + res.add(dis, il.ids[i]); + codes += code_size; + } + ndis += nv; + nlist++; + } + } while (fe.next()); +} + +struct Run_search_single_query { + using T = void; + template + T f(Types*... args) { + search_single_query_template(*args...); + } +}; + +template +static void verify_shortlist( + const IndexBinaryFlat* index, + const uint8_t* q, + const std::unordered_set& shortlist, + SearchResults& res) { + size_t code_size = index->code_size; + + HammingComputer hc(q, code_size); + const uint8_t* codes = index->xb.data(); + + for (auto i : shortlist) { + int dis = hc.hamming(codes + i * code_size); + res.add(dis, i); + } +} + +struct Run_verify_shortlist { + using T = void; + template + void f(Types... 
args) { + verify_shortlist(args...); + } +}; + +template +void search_1_query_multihash( + const IndexBinaryMultiHash& index, + const uint8_t* xi, + SearchResults& res, + size_t& n0, + size_t& nlist, + size_t& ndis) { + std::unordered_set shortlist; + int b = index.b; + + BitstringReader br(xi, index.code_size); + for (int h = 0; h < index.nhash; h++) { + uint64_t qhash = br.read(b); + const IndexBinaryMultiHash::Map& map = index.maps[h]; + + FlipEnumerator fe(index.b, index.nflip); + // loop over neighbors that are at most at nflip bits + do { + uint64_t hash = qhash ^ fe.x; + auto it = map.find(hash); + + if (it != map.end()) { + const std::vector& v = it->second; + for (auto i : v) { + shortlist.insert(i); + } + nlist++; + } else { + n0++; + } + } while (fe.next()); + } + ndis += shortlist.size(); + + // verify shortlist + Run_verify_shortlist r; + dispatch_HammingComputer( + index.code_size, r, index.storage, xi, shortlist, res); +} + +} // anonymous namespace + +// --- IndexBinaryHash entry points --- + +template <> +void binary_hash_knn_search_dispatch( + const IndexBinaryHash& index, + const uint8_t* q, + idx_t k, + int32_t* heap_sim, + idx_t* heap_ids, + size_t& n0, + size_t& nlist, + size_t& ndis) { + KnnSearchResults res = {k, heap_sim, heap_ids}; + Run_search_single_query r; + dispatch_HammingComputer( + index.code_size, r, &index, &q, &res, &n0, &nlist, &ndis); +} + +template <> +void binary_hash_range_search_dispatch( + const IndexBinaryHash& index, + const uint8_t* q, + int radius, + RangeQueryResult& qres, + size_t& n0, + size_t& nlist, + size_t& ndis) { + RangeSearchResults res = {radius, qres}; + Run_search_single_query r; + dispatch_HammingComputer( + index.code_size, r, &index, &q, &res, &n0, &nlist, &ndis); +} + +// --- IndexBinaryMultiHash entry points --- + +template <> +void binary_multihash_knn_search_dispatch( + const IndexBinaryMultiHash& index, + const uint8_t* q, + idx_t k, + int32_t* heap_sim, + idx_t* heap_ids, + size_t& n0, + size_t& 
nlist, + size_t& ndis) { + KnnSearchResults res = {k, heap_sim, heap_ids}; + search_1_query_multihash(index, q, res, n0, nlist, ndis); +} + +template <> +void binary_multihash_range_search_dispatch( + const IndexBinaryMultiHash& index, + const uint8_t* q, + int radius, + RangeQueryResult& qres, + size_t& n0, + size_t& nlist, + size_t& ndis) { + RangeSearchResults res = {radius, qres}; + search_1_query_multihash(index, q, res, n0, nlist, ndis); +} + +} // namespace faiss diff --git a/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h b/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h new file mode 100644 index 0000000000..55c19ad20e --- /dev/null +++ b/faiss/impl/binary_hamming/IndexBinaryIVF_impl.h @@ -0,0 +1,472 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Per-ISA implementation of Hamming distance computation for + * IndexBinaryIVF. Included once per SIMD TU with THE_SIMD_LEVEL + * set to the desired SIMDLevel. + * + * Contains: IVFBinaryScannerL2, search_knn_hamming_count, + * BlockSearch, BlockSearchVariableK, search_knn_hamming_per_invlist. 
+ */ + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "THE_SIMD_LEVEL must be defined before including this file" +#endif + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace faiss { + +namespace { + +template +struct IVFBinaryScannerL2 : BinaryInvertedListScanner { + HammingComputer hc; + size_t code_size; + bool store_pairs; + + IVFBinaryScannerL2(size_t code_size_, bool store_pairs_) + : code_size(code_size_), store_pairs(store_pairs_) {} + + void set_query(const uint8_t* query_vector) override { + hc.set(query_vector, code_size); + } + + idx_t list_no; + void set_list(idx_t list_no_2, uint8_t /* coarse_dis */) override { + this->list_no = list_no_2; + } + + uint32_t distance_to_code(const uint8_t* code) const override { + return hc.hamming(code); + } + + size_t scan_codes( + size_t n, + const uint8_t* __restrict codes, + const idx_t* __restrict ids, + int32_t* __restrict simi, + idx_t* __restrict idxi, + size_t k) const override { + using C = CMax; + + size_t nup = 0; + for (size_t j = 0; j < n; j++) { + uint32_t dis = hc.hamming(codes); + if (dis < static_cast(simi[0])) { + idx_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + heap_replace_top(k, simi, idxi, dis, id); + nup++; + } + codes += code_size; + } + return nup; + } + + void scan_codes_range( + size_t n, + const uint8_t* __restrict codes, + const idx_t* __restrict ids, + int radius, + RangeQueryResult& result) const override { + for (size_t j = 0; j < n; j++) { + uint32_t dis = hc.hamming(codes); + if (dis < static_cast(radius)) { + int64_t id = store_pairs ? 
lo_build(list_no, j) : ids[j]; + result.add(dis, id); + } + codes += code_size; + } + } +}; + +template +void search_knn_hamming_count( + const IndexBinaryIVF* ivf, + size_t nx, + const uint8_t* __restrict x, + const idx_t* __restrict keys, + int k, + int32_t* __restrict distances, + idx_t* __restrict labels, + const IVFSearchParameters* params) { + const int nBuckets = ivf->d + 1; + std::vector all_counters(nx * nBuckets, 0); + std::unique_ptr all_ids_per_dis(new idx_t[nx * nBuckets * k]); + + idx_t nprobe = params ? params->nprobe : ivf->nprobe; + nprobe = std::min((idx_t)ivf->nlist, nprobe); + idx_t max_codes = params ? params->max_codes : ivf->max_codes; + + std::vector> cs; + for (size_t i = 0; i < nx; ++i) { + cs.push_back( + HCounterState( + all_counters.data() + i * nBuckets, + all_ids_per_dis.get() + i * nBuckets * k, + x + i * ivf->code_size, + ivf->d, + k)); + } + + size_t nlistv = 0, ndis = 0; + +#pragma omp parallel for reduction(+ : nlistv, ndis) + for (int64_t i = 0; i < static_cast(nx); i++) { + const idx_t* keysi = keys + i * nprobe; + HCounterState& csi = cs[i]; + + size_t nscan = 0; + + for (idx_t ik = 0; ik < nprobe; ik++) { + idx_t key = keysi[ik]; /* select the list */ + if (key < 0) { + // not enough centroids for multiprobe + continue; + } + FAISS_THROW_IF_NOT_FMT( + key < (idx_t)ivf->nlist, + "Invalid key=%" PRId64 " at ik=%zd nlist=%zd\n", + key, + static_cast(ik), + ivf->nlist); + + nlistv++; + size_t list_size = ivf->invlists->list_size(key); + size_t list_size_max = static_cast(max_codes) - nscan; + if (list_size > list_size_max) { + list_size = list_size_max; + } + InvertedLists::ScopedCodes scodes(ivf->invlists, key); + const uint8_t* list_vecs = scodes.get(); + const idx_t* ids = + store_pairs ? nullptr : ivf->invlists->get_ids(key); + + for (size_t j = 0; j < list_size; j++) { + const uint8_t* yj = list_vecs + ivf->code_size * j; + + idx_t id = store_pairs ? 
(key << 32 | j) : ids[j]; + csi.update_counter(yj, id); + } + if (ids) { + ivf->invlists->release_ids(key, ids); + } + + nscan += list_size; + if (nscan >= static_cast(max_codes)) { + break; + } + } + ndis += nscan; + + int nres = 0; + for (int b = 0; b < nBuckets && nres < k; b++) { + for (int l = 0; l < csi.counters[b] && nres < k; l++) { + labels[i * k + nres] = csi.ids_per_dis[b * k + l]; + distances[i * k + nres] = b; + nres++; + } + } + while (nres < k) { + labels[i * k + nres] = -1; + distances[i * k + nres] = std::numeric_limits::max(); + ++nres; + } + } + + indexIVF_stats.nq += nx; + indexIVF_stats.nlist += nlistv; + indexIVF_stats.ndis += ndis; +} + +/* Manages NQ queries at a time, stores results */ +template +struct BlockSearch { + HammingComputer hcs[NQ]; + // heaps to update for each query + int32_t* distances[NQ]; + idx_t* labels[NQ]; + // curent top of heap + int32_t heap_tops[NQ]; + + BlockSearch( + size_t code_size, + const uint8_t* __restrict x, + const int32_t* __restrict keys, + int32_t* __restrict all_distances, + idx_t* __restrict all_labels) { + for (idx_t q = 0; q < NQ; q++) { + idx_t qno = keys[q]; + hcs[q] = HammingComputer(x + qno * code_size, code_size); + distances[q] = all_distances + qno * K; + labels[q] = all_labels + qno * K; + heap_tops[q] = distances[q][0]; + } + } + + void add_bcode(const uint8_t* bcode, idx_t id) { + using C = CMax; + for (int q = 0; q < NQ; q++) { + int dis = hcs[q].hamming(bcode); + if (dis < heap_tops[q]) { + heap_replace_top(K, distances[q], labels[q], dis, id); + heap_tops[q] = distances[q][0]; + } + } + } +}; + +template +struct BlockSearchVariableK { + int k; + HammingComputer hcs[NQ]; + // heaps to update for each query + int32_t* distances[NQ]; + idx_t* labels[NQ]; + // curent top of heap + int32_t heap_tops[NQ]; + + BlockSearchVariableK( + size_t code_size, + int k_, + const uint8_t* __restrict x, + const int32_t* __restrict keys, + int32_t* __restrict all_distances, + idx_t* __restrict all_labels) + 
: k(k_) { + for (idx_t q = 0; q < NQ; q++) { + idx_t qno = keys[q]; + hcs[q] = HammingComputer(x + qno * code_size, code_size); + distances[q] = all_distances + qno * k; + labels[q] = all_labels + qno * k; + heap_tops[q] = distances[q][0]; + } + } + + void add_bcode(const uint8_t* bcode, idx_t id) { + using C = CMax; + for (int q = 0; q < NQ; q++) { + int dis = hcs[q].hamming(bcode); + if (dis < heap_tops[q]) { + heap_replace_top(k, distances[q], labels[q], dis, id); + heap_tops[q] = distances[q][0]; + } + } + } +}; + +template +void search_knn_hamming_per_invlist( + const IndexBinaryIVF* ivf, + size_t n, + const uint8_t* __restrict x, + idx_t k, + const idx_t* __restrict keys_in, + const int32_t* __restrict /* coarse_dis */, + int32_t* __restrict distances, + idx_t* __restrict labels, + bool store_pairs, + const IVFSearchParameters* params) { + idx_t nprobe = params ? params->nprobe : ivf->nprobe; + nprobe = std::min((idx_t)ivf->nlist, nprobe); + idx_t max_codes = params ? params->max_codes : ivf->max_codes; + FAISS_THROW_IF_NOT(max_codes == 0); + FAISS_THROW_IF_NOT(!store_pairs); + + // reorder buckets + std::vector lims(n + 1); + int32_t* keys = new int32_t[n * nprobe]; + std::unique_ptr delete_keys(keys); + for (size_t i = 0; i < n * static_cast(nprobe); i++) { + keys[i] = keys_in[i]; + } + matrix_bucket_sort_inplace(n, nprobe, keys, ivf->nlist, lims.data(), 0); + + using C = CMax; + heap_heapify(n * k, distances, labels); + const size_t code_size = ivf->code_size; + + for (size_t l = 0; l < ivf->nlist; l++) { + idx_t l0 = lims[l], nq = lims[l + 1] - l0; + + InvertedLists::ScopedCodes scodes(ivf->invlists, l); + InvertedLists::ScopedIds sidx(ivf->invlists, l); + idx_t nb = ivf->invlists->list_size(l); + const uint8_t* bcodes = scodes.get(); + const idx_t* ids = sidx.get(); + + idx_t i = 0; + + // process as much as possible by blocks + constexpr int BS = 4; + + if (k == 1) { + for (; i + BS <= nq; i += BS) { + BlockSearch bc( + code_size, x, keys + l0 + i, 
distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } else if (k == 2) { + for (; i + BS <= nq; i += BS) { + BlockSearch bc( + code_size, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } else if (k == 4) { + for (; i + BS <= nq; i += BS) { + BlockSearch bc( + code_size, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } else { + for (; i + BS <= nq; i += BS) { + BlockSearchVariableK bc( + code_size, k, x, keys + l0 + i, distances, labels); + for (idx_t j = 0; j < nb; j++) { + bc.add_bcode(bcodes + j * code_size, ids[j]); + } + } + } + + // leftovers + for (; i < nq; i++) { + idx_t qno = keys[l0 + i]; + HammingComputer hc(x + qno * code_size, code_size); + idx_t* __restrict idxi = labels + qno * k; + int32_t* __restrict simi = distances + qno * k; + int32_t simi0 = simi[0]; + for (idx_t j = 0; j < nb; j++) { + int dis = hc.hamming(bcodes + j * code_size); + + if (dis < simi0) { + idx_t id = store_pairs ? lo_build(l, j) : ids[j]; + heap_replace_top(k, simi, idxi, dis, id); + simi0 = simi[0]; + } + } + } + } + for (size_t i = 0; i < n; i++) { + heap_reorder(k, distances + i * k, labels + i * k); + } +} + +struct Run_search_knn_hamming_per_invlist { + using T = void; + + template + void f(Types... args) { + search_knn_hamming_per_invlist(args...); + } +}; + +template +struct Run_search_knn_hamming_count { + using T = void; + + template + void f(Types... 
args) { + search_knn_hamming_count(args...); + } +}; + +struct BuildScannerBinaryIVF { + using T = BinaryInvertedListScanner*; + + template + T f(size_t code_size, bool store_pairs) { + return new IVFBinaryScannerL2(code_size, store_pairs); + } +}; + +} // anonymous namespace + +// --- Entry points --- + +template <> +BinaryInvertedListScanner* make_binary_ivf_scanner_dispatch( + size_t code_size, + bool store_pairs) { + BuildScannerBinaryIVF bs; + return dispatch_HammingComputer(code_size, bs, code_size, store_pairs); +} + +template <> +void search_knn_hamming_per_invlist_dispatch( + int code_size, + const IndexBinaryIVF* ivf, + size_t n, + const uint8_t* x, + idx_t k, + const idx_t* keys_in, + const int32_t* coarse_dis, + int32_t* distances, + idx_t* labels, + bool store_pairs, + const IVFSearchParameters* params) { + Run_search_knn_hamming_per_invlist r; + dispatch_HammingComputer( + code_size, + r, + ivf, + n, + x, + k, + keys_in, + coarse_dis, + distances, + labels, + store_pairs, + params); +} + +template <> +void search_knn_hamming_count_dispatch( + int code_size, + bool store_pairs, + const IndexBinaryIVF* ivf, + size_t nx, + const uint8_t* x, + const idx_t* keys, + int k, + int32_t* distances, + idx_t* labels, + const IVFSearchParameters* params) { + if (store_pairs) { + Run_search_knn_hamming_count r; + dispatch_HammingComputer( + code_size, r, ivf, nx, x, keys, k, distances, labels, params); + } else { + Run_search_knn_hamming_count r; + dispatch_HammingComputer( + code_size, r, ivf, nx, x, keys, k, distances, labels, params); + } +} + +} // namespace faiss diff --git a/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h b/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h new file mode 100644 index 0000000000..b7c8abd26b --- /dev/null +++ b/faiss/impl/binary_hamming/IndexIVFSpectralHash_impl.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Per-ISA implementation of Hamming distance computation for + * IndexIVFSpectralHash. Included once per SIMD TU with THE_SIMD_LEVEL + * set to the desired SIMDLevel. + */ + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "THE_SIMD_LEVEL must be defined before including this file" +#endif + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace faiss { + +namespace { + +void binarize_with_freq_impl( + size_t nbit, + float freq, + const float* x, + const float* c, + uint8_t* codes) { + memset(codes, 0, (nbit + 7) / 8); + for (size_t i = 0; i < nbit; i++) { + float xf = (x[i] - c[i]); + int64_t xi = int64_t(floor(xf * freq)); + int64_t bit = xi & 1; + codes[i >> 3] |= bit << (i & 7); + } +} + +template +struct IVFScanner : InvertedListScanner { + using InvertedListScanner::scan_codes; + const IndexIVFSpectralHash* index; + size_t nbit; + + float period, freq; + std::vector q; + std::vector zero; + std::vector qcode; + HammingComputer hc; + + IVFScanner(const IndexIVFSpectralHash* index_in, bool store_pairs_in) + : index(index_in), + nbit(index_in->nbit), + period(index_in->period), + freq(2.0 / index_in->period), + q(nbit), + zero(nbit), + qcode(index_in->code_size), + hc(qcode.data(), index_in->code_size) { + this->store_pairs = store_pairs_in; + this->code_size = index->code_size; + this->keep_max = is_similarity_metric(index->metric_type); + } + + void set_query(const float* query) override { + FAISS_THROW_IF_NOT(query); + FAISS_THROW_IF_NOT(q.size() == static_cast(nbit)); + index->vt->apply_noalloc(1, query, q.data()); + + if (index->threshold_type == IndexIVFSpectralHash::Thresh_global) { + binarize_with_freq_impl( + nbit, freq, q.data(), zero.data(), qcode.data()); + hc.set(qcode.data(), code_size); + } + } + + void set_list(idx_t list_no_in, float /*coarse_dis*/) 
override { + this->list_no = list_no_in; + if (index->threshold_type != IndexIVFSpectralHash::Thresh_global) { + const float* c = index->trained.data() + list_no_in * nbit; + binarize_with_freq_impl(nbit, freq, q.data(), c, qcode.data()); + hc.set(qcode.data(), code_size); + } + } + + float distance_to_code(const uint8_t* code) const final { + return hc.hamming(code); + } + + size_t scan_codes( + size_t list_size, + const uint8_t* codes, + const idx_t* ids, + float* simi, + idx_t* idxi, + size_t k) const override { + size_t nup = 0; + for (size_t j = 0; j < list_size; j++) { + float dis = hc.hamming(codes); + + if (dis < simi[0]) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + maxheap_replace_top(k, simi, idxi, dis, id); + nup++; + } + codes += code_size; + } + return nup; + } + + void scan_codes_range( + size_t list_size, + const uint8_t* codes, + const idx_t* ids, + float radius, + RangeQueryResult& result) const override { + for (size_t j = 0; j < list_size; j++) { + float dis = hc.hamming(codes); + if (dis < radius) { + int64_t id = store_pairs ? lo_build(list_no, j) : ids[j]; + result.add(dis, id); + } + codes += code_size; + } + } +}; + +struct BuildScannerSpectralHash { + using T = InvertedListScanner*; + + template + static T f(const IndexIVFSpectralHash* index, bool store_pairs) { + return new IVFScanner(index, store_pairs); + } +}; + +} // anonymous namespace + +template <> +InvertedListScanner* make_spectral_hash_scanner_dispatch( + int code_size, + const IndexIVFSpectralHash* index, + bool store_pairs) { + BuildScannerSpectralHash bs; + return dispatch_HammingComputer(code_size, bs, index, store_pairs); +} + +} // namespace faiss diff --git a/faiss/impl/binary_hamming/IndexPQ_impl.h b/faiss/impl/binary_hamming/IndexPQ_impl.h new file mode 100644 index 0000000000..09cbf9a1eb --- /dev/null +++ b/faiss/impl/binary_hamming/IndexPQ_impl.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * Per-ISA implementation of polysemous Hamming inner loop for IndexPQ. + * Included once per SIMD TU with THE_SIMD_LEVEL set to the desired + * SIMDLevel. + */ + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "THE_SIMD_LEVEL must be defined before including this file" +#endif + +#include + +#include +#include +#include +#include + +namespace faiss { + +namespace { + +template +size_t polysemous_inner_loop( + const IndexPQ* index, + const float* dis_table_qi, + const uint8_t* q_code, + size_t k, + float* heap_dis, + int64_t* heap_ids, + int ht) { + size_t M = index->pq.M; + size_t code_size = index->pq.code_size; + size_t ksub = index->pq.ksub; + size_t ntotal = index->ntotal; + + const uint8_t* b_code = index->codes.data(); + + size_t n_pass_i = 0; + + HammingComputer hc(q_code, code_size); + + for (int64_t bi = 0; bi < static_cast(ntotal); bi++) { + int hd = hc.hamming(b_code); + + if (hd < ht) { + n_pass_i++; + + float dis = 0; + const float* dis_table = dis_table_qi; + for (size_t m = 0; m < M; m++) { + dis += dis_table[b_code[m]]; + dis_table += ksub; + } + + if (dis < heap_dis[0]) { + maxheap_replace_top(k, heap_dis, heap_ids, dis, bi); + } + } + b_code += code_size; + } + return n_pass_i; +} + +struct Run_polysemous_inner_loop { + using T = size_t; + template + size_t f(Types... 
args) { + return polysemous_inner_loop(args...); + } +}; + +} // anonymous namespace + +template <> +size_t polysemous_inner_loop_dispatch( + int code_size, + const IndexPQ* index, + const float* dis_table_qi, + const uint8_t* q_code, + size_t k, + float* heap_dis, + int64_t* heap_ids, + int ht) { + Run_polysemous_inner_loop r; + return dispatch_HammingComputer( + code_size, + r, + index, + dis_table_qi, + q_code, + k, + heap_dis, + heap_ids, + ht); +} + +} // namespace faiss diff --git a/faiss/impl/binary_hamming/avx2.cpp b/faiss/impl/binary_hamming/avx2.cpp new file mode 100644 index 0000000000..af39b0f7bc --- /dev/null +++ b/faiss/impl/binary_hamming/avx2.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +#ifdef COMPILE_SIMD_AVX2 + +#define THE_SIMD_LEVEL SIMDLevel::AVX2 + +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include + +#endif // COMPILE_SIMD_AVX2 diff --git a/faiss/impl/binary_hamming/dispatch.h b/faiss/impl/binary_hamming/dispatch.h new file mode 100644 index 0000000000..331739e808 --- /dev/null +++ b/faiss/impl/binary_hamming/dispatch.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#pragma once + +#include + +#include +#include + +namespace faiss { + +// Forward declarations +struct DistanceComputer; +struct InvertedListScanner; +struct BinaryInvertedListScanner; +struct IndexBinaryFlat; +struct IndexBinaryHash; +struct IndexBinaryMultiHash; +struct IndexBinaryIVF; +struct IndexIVFSpectralHash; +struct IndexPQ; +struct SearchParametersIVF; +using IVFSearchParameters = SearchParametersIVF; +struct RangeQueryResult; +using idx_t = int64_t; + +/** @name IndexBinaryHNSW dispatch + * @{ */ +template +DistanceComputer* make_binary_hnsw_distance_computer_dispatch( + int code_size, + IndexBinaryFlat* flat_storage); +/** @} */ + +/** @name IndexBinaryIVF dispatch + * @{ */ +template +BinaryInvertedListScanner* make_binary_ivf_scanner_dispatch( + size_t code_size, + bool store_pairs); + +template +void search_knn_hamming_per_invlist_dispatch( + int code_size, + const IndexBinaryIVF* ivf, + size_t n, + const uint8_t* x, + idx_t k, + const idx_t* keys_in, + const int32_t* coarse_dis, + int32_t* distances, + idx_t* labels, + bool store_pairs, + const IVFSearchParameters* params); + +template +void search_knn_hamming_count_dispatch( + int code_size, + bool store_pairs, + const IndexBinaryIVF* ivf, + size_t nx, + const uint8_t* x, + const idx_t* keys, + int k, + int32_t* distances, + idx_t* labels, + const IVFSearchParameters* params); +/** @} */ + +/** @name IndexBinaryHash dispatch + * @{ */ +template +void binary_hash_knn_search_dispatch( + const IndexBinaryHash& index, + const uint8_t* q, + idx_t k, + int32_t* heap_sim, + idx_t* heap_ids, + size_t& n0, + size_t& nlist, + size_t& ndis); + +template +void binary_hash_range_search_dispatch( + const IndexBinaryHash& index, + const uint8_t* q, + int radius, + RangeQueryResult& qres, + size_t& n0, + size_t& nlist, + size_t& ndis); + +template +void binary_multihash_knn_search_dispatch( + const IndexBinaryMultiHash& index, + const uint8_t* q, + idx_t k, + int32_t* heap_sim, + idx_t* heap_ids, + size_t& 
n0, + size_t& nlist, + size_t& ndis); + +template +void binary_multihash_range_search_dispatch( + const IndexBinaryMultiHash& index, + const uint8_t* q, + int radius, + RangeQueryResult& qres, + size_t& n0, + size_t& nlist, + size_t& ndis); +/** @} */ + +/** @name IndexIVFSpectralHash dispatch + * @{ */ +template +InvertedListScanner* make_spectral_hash_scanner_dispatch( + int code_size, + const IndexIVFSpectralHash* index, + bool store_pairs); +/** @} */ + +/** @name IndexPQ polysemous dispatch + * @{ */ +template +size_t polysemous_inner_loop_dispatch( + int code_size, + const IndexPQ* index, + const float* dis_table_qi, + const uint8_t* q_code, + size_t k, + float* heap_dis, + int64_t* heap_ids, + int ht); +/** @} */ + +} // namespace faiss diff --git a/faiss/impl/binary_hamming/neon.cpp b/faiss/impl/binary_hamming/neon.cpp new file mode 100644 index 0000000000..0ed1d2db88 --- /dev/null +++ b/faiss/impl/binary_hamming/neon.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifdef COMPILE_SIMD_ARM_NEON + +#define THE_SIMD_LEVEL SIMDLevel::ARM_NEON + +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include + +#endif // COMPILE_SIMD_ARM_NEON diff --git a/faiss/impl/platform_macros.h b/faiss/impl/platform_macros.h index b7aa4111e3..8f18d98e54 100644 --- a/faiss/impl/platform_macros.h +++ b/faiss/impl/platform_macros.h @@ -126,9 +126,11 @@ inline int __builtin_clzll(uint64_t x) { #ifdef SWIG #define ALIGNED(x) #define FAISS_PACKED +#define FAISS_MAYBE_UNUSED #else #define ALIGNED(x) __attribute__((aligned(x))) #define FAISS_PACKED __attribute__((packed)) +#define FAISS_MAYBE_UNUSED [[maybe_unused]] #endif // On non-Windows, FAISS_PACKED handles packing, so these are no-ops diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp index 3c7931aa23..d02cd6c962 100644 --- a/faiss/utils/hamming.cpp +++ b/faiss/utils/hamming.cpp @@ -23,348 +23,28 @@ #include -#include #include -#include -#include +#include -#include #include -#include -#include -#include +#include #include +// Scalar (NONE) fallback — compiled without SIMD flags, hamdis-inl.h +// selects generic-inl.h via the preprocessor ISA ladder. 
+#define THE_SIMD_LEVEL SIMDLevel::NONE +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include +#undef THE_SIMD_LEVEL + namespace faiss { size_t hamming_batch_size = 65536; -template -void hammings( - const uint64_t* __restrict bs1, - const uint64_t* __restrict bs2, - size_t n1, - size_t n2, - hamdis_t* __restrict dis) - -{ - size_t i, j; - const size_t nwords = nbits / 64; - for (i = 0; i < n1; i++) { - const uint64_t* __restrict bs1_ = bs1 + i * nwords; - hamdis_t* __restrict dis_ = dis + i * n2; - for (j = 0; j < n2; j++) { - dis_[j] = hamming(bs1_, bs2 + j * nwords); - } - } -} - -void hammings( - const uint64_t* __restrict bs1, - const uint64_t* __restrict bs2, - size_t n1, - size_t n2, - size_t nbits, - hamdis_t* __restrict dis) { - size_t i, j; - const size_t nwords = nbits / 64; - for (i = 0; i < n1; i++) { - const uint64_t* __restrict bs1_ = bs1 + i * nwords; - hamdis_t* __restrict dis_ = dis + i * n2; - for (j = 0; j < n2; j++) { - dis_[j] = hamming(bs1_, bs2 + j * nwords, nwords); - } - } -} - -/* Count number of matches given a max threshold */ -template -void hamming_count_thres( - const uint64_t* __restrict bs1, - const uint64_t* __restrict bs2, - size_t n1, - size_t n2, - hamdis_t ht, - size_t* __restrict nptr) { - const size_t nwords = nbits / 64; - size_t i, j, posm = 0; - const uint64_t* bs2_ = bs2; - - for (i = 0; i < n1; i++) { - bs2 = bs2_; - for (j = 0; j < n2; j++) { - /* collect the match only if this satisfies the threshold */ - if (hamming(bs1, bs2) <= ht) { - posm++; - } - bs2 += nwords; - } - bs1 += nwords; /* next signature */ - } - *nptr = posm; -} - -template -void crosshamming_count_thres( - const uint64_t* __restrict dbs, - size_t n, - int ht, - size_t* __restrict nptr) { - const size_t nwords = nbits / 64; - size_t i, j, posm = 0; - const uint64_t* bs1 = dbs; - for (i = 0; i < n; i++) { - const uint64_t* bs2 = bs1 + 2; - for (j = i + 1; j < n; j++) { - /* collect the match only if this satisfies the threshold */ - if (hamming(bs1, 
bs2) <= ht) { - posm++; - } - bs2 += nwords; - } - bs1 += nwords; - } - *nptr = posm; -} - -template -size_t match_hamming_thres( - const uint64_t* __restrict bs1, - const uint64_t* __restrict bs2, - size_t n1, - size_t n2, - int ht, - int64_t* __restrict idx, - hamdis_t* __restrict hams) { - const size_t nwords = nbits / 64; - size_t i, j, posm = 0; - hamdis_t h; - const uint64_t* bs2_ = bs2; - for (i = 0; i < n1; i++) { - bs2 = bs2_; - for (j = 0; j < n2; j++) { - /* Here perform the real work of computing the distance */ - h = hamming(bs1, bs2); - - /* collect the match only if this satisfies the threshold */ - if (h <= ht) { - /* Enough space to store another match ? */ - *idx = i; - idx++; - *idx = j; - idx++; - *hams = h; - hams++; - posm++; - } - bs2 += nwords; /* next signature */ - } - bs1 += nwords; - } - return posm; -} - -namespace { - -/* Return closest neighbors w.r.t Hamming distance, using a heap. */ -template -void hammings_knn_hc( - int bytes_per_code, - int_maxheap_array_t* __restrict ha, - const uint8_t* __restrict bs1, - const uint8_t* __restrict bs2, - size_t n2, - bool order = true, - bool init_heap = true, - ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK, - const faiss::IDSelector* sel = nullptr) { - size_t k = ha->k; - if (init_heap) { - ha->heapify(); - } +/****************************************************************** + * Scalar utility functions (no SIMD, no dispatch needed) + ******************************************************************/ - const size_t block_size = hamming_batch_size; - for (size_t j0 = 0; j0 < n2; j0 += block_size) { - const size_t j1 = std::min(j0 + block_size, n2); -#pragma omp parallel for - for (int64_t i = 0; i < static_cast(ha->nh); i++) { - HammingComputer hc(bs1 + i * bytes_per_code, bytes_per_code); - - const uint8_t* __restrict bs2_ = bs2 + j0 * bytes_per_code; - hamdis_t dis; - hamdis_t* __restrict bh_val_ = ha->val + i * k; - int64_t* __restrict bh_ids_ = ha->ids + i * k; - - // 
if larger number of k is required, then ::bs_addn() needs to be - // used instead of ::addn() -#define HANDLE_APPROX(NB, BD) \ - case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ - FAISS_THROW_IF_NOT_FMT( \ - k <= NB * BD, \ - "The chosen mode (%d) of approximate top-k supports " \ - "up to %d values, but %zd is requested.", \ - (int)(ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD), \ - NB * BD, \ - k); \ - HeapWithBucketsForHamming32< \ - CMax, \ - NB, \ - BD, \ - HammingComputer>:: \ - addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_, sel); \ - break; - - switch (approx_topk_mode) { - HANDLE_APPROX(8, 3) - HANDLE_APPROX(8, 2) - HANDLE_APPROX(16, 2) - HANDLE_APPROX(32, 2) - default: { - for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) { - if (sel && !sel->is_member(j)) { - continue; - } - dis = hc.hamming(bs2_); - if (dis < bh_val_[0]) { - faiss::maxheap_replace_top( - k, bh_val_, bh_ids_, dis, j); - } - } - } break; - } - } - } - if (order) { - ha->reorder(); - } -} - -/* Return closest neighbors w.r.t Hamming distance, using max count. 
*/ -template -void hammings_knn_mc( - int bytes_per_code, - const uint8_t* __restrict a, - const uint8_t* __restrict b, - size_t na, - size_t nb, - size_t k, - int32_t* __restrict distances, - int64_t* __restrict labels, - const faiss::IDSelector* sel) { - const int nBuckets = bytes_per_code * 8 + 1; - std::vector all_counters(na * nBuckets, 0); - std::unique_ptr all_ids_per_dis(new int64_t[na * nBuckets * k]); - - std::vector> cs; - for (size_t i = 0; i < na; ++i) { - cs.push_back( - HCounterState( - all_counters.data() + i * nBuckets, - all_ids_per_dis.get() + i * nBuckets * k, - a + i * bytes_per_code, - 8 * bytes_per_code, - k)); - } - - const size_t block_size = hamming_batch_size; - for (size_t j0 = 0; j0 < nb; j0 += block_size) { - const size_t j1 = std::min(j0 + block_size, nb); -#pragma omp parallel for - for (int64_t i = 0; i < static_cast(na); ++i) { - for (size_t j = j0; j < j1; ++j) { - if (!sel || sel->is_member(j)) { - cs[i].update_counter(b + j * bytes_per_code, j); - } - } - } - } - - for (size_t i = 0; i < na; ++i) { - HCounterState& csi = cs[i]; - - size_t nres = 0; - for (int b_2 = 0; b_2 < nBuckets && nres < k; b_2++) { - for (int l = 0; l < csi.counters[b_2] && nres < k; l++) { - labels[i * k + nres] = csi.ids_per_dis[b_2 * k + l]; - distances[i * k + nres] = b_2; - nres++; - } - } - while (nres < k) { - labels[i * k + nres] = -1; - distances[i * k + nres] = std::numeric_limits::max(); - ++nres; - } - } -} - -template -void hamming_range_search( - const uint8_t* a, - const uint8_t* b, - size_t na, - size_t nb, - int radius, - size_t code_size, - RangeSearchResult* res, - const faiss::IDSelector* sel) { -#pragma omp parallel - { - RangeSearchPartialResult pres(res); - -#pragma omp for - for (int64_t i = 0; i < static_cast(na); i++) { - HammingComputer hc(a + i * code_size, code_size); - const uint8_t* yi = b; - RangeQueryResult& qres = pres.new_result(i); - - for (size_t j = 0; j < nb; j++) { - if (!sel || sel->is_member(j)) { - int dis = 
hc.hamming(yi); - if (dis < radius) { - qres.add(dis, j); - } - } - yi += code_size; - } - } - pres.finalize(); - } -} - -struct Run_hammings_knn_hc { - using T = void; - template - void f(Types... args) { - hammings_knn_hc(args...); - } -}; - -struct Run_hammings_knn_mc { - using T = void; - template - void f(Types... args) { - hammings_knn_mc(args...); - } -}; - -struct Run_hamming_range_search { - using T = void; - template - void f(Types... args) { - hamming_range_search(args...); - } -}; - -} // namespace - -/* Functions to maps vectors to bits. Assume proper allocation done beforehand, - meaning that b should be be able to receive as many bits as x may produce. */ - -/* - * dimension 0 corresponds to the least significant bit of b[0], or - * equivalently to the lsb of the first byte that is stored. - */ void fvec2bitvec(const float* __restrict x, uint8_t* __restrict b, size_t d) { for (size_t i = 0; i < d; i += 8) { uint8_t w = 0; @@ -381,8 +61,6 @@ void fvec2bitvec(const float* __restrict x, uint8_t* __restrict b, size_t d) { } } -/* Same but for n vectors. - Ensure that the output b is byte-aligned (pad with 0s). 
*/ void fvecs2bitvecs( const float* __restrict x, uint8_t* __restrict b, @@ -407,7 +85,6 @@ void bitvecs2fvecs( } } -/* Reverse bit (NOT a optimized function, only used for print purpose) */ static uint64_t uint64_reverse_bits(uint64_t b) { int i; uint64_t revb = 0; @@ -419,7 +96,6 @@ static uint64_t uint64_reverse_bits(uint64_t b) { return revb; } -/* print the bit vector */ void bitvec_print(const uint8_t* b, size_t d) { size_t i, j; for (i = 0; i < d;) { @@ -459,12 +135,10 @@ void bitvec_shuffle( } } -/*----------------------------------------*/ -/* Hamming distance computation and k-nn */ +/****************************************************************** + * Dispatched Hamming distance public API + ******************************************************************/ -#define C64(x) ((uint64_t*)x) - -/* Compute a set of Hamming distances */ void hammings( const uint8_t* __restrict a, const uint8_t* __restrict b, @@ -472,24 +146,9 @@ void hammings( size_t nb, size_t ncodes, hamdis_t* __restrict dis) { - FAISS_THROW_IF_NOT(ncodes % 8 == 0); - switch (ncodes) { - case 8: - faiss::hammings<64>(C64(a), C64(b), na, nb, dis); - return; - case 16: - faiss::hammings<128>(C64(a), C64(b), na, nb, dis); - return; - case 32: - faiss::hammings<256>(C64(a), C64(b), na, nb, dis); - return; - case 64: - faiss::hammings<512>(C64(a), C64(b), na, nb, dis); - return; - default: - faiss::hammings(C64(a), C64(b), na, nb, ncodes * 8, dis); - return; - } + with_simd_level_256bit([&]() { + hammings_dispatch(a, b, na, nb, ncodes, dis); + }); } void hammings_knn( @@ -511,19 +170,10 @@ void hammings_knn_hc( int order, ApproxTopK_mode_t approx_topk_mode, const faiss::IDSelector* sel) { - Run_hammings_knn_hc r; - dispatch_HammingComputer( - ncodes, - r, - ncodes, - ha, - a, - b, - nb, - order, - true, - approx_topk_mode, - sel); + with_simd_level_256bit([&]() { + hammings_knn_hc_dispatch( + ha, a, b, nb, ncodes, order, approx_topk_mode, sel); + }); } void hammings_knn_mc( @@ -536,9 +186,10 
@@ void hammings_knn_mc( int32_t* __restrict distances, int64_t* __restrict labels, const faiss::IDSelector* sel) { - Run_hammings_knn_mc r; - dispatch_HammingComputer( - ncodes, r, ncodes, a, b, na, nb, k, distances, labels, sel); + with_simd_level_256bit([&]() { + hammings_knn_mc_dispatch( + a, b, na, nb, k, ncodes, distances, labels, sel); + }); } void hamming_range_search( @@ -550,12 +201,12 @@ void hamming_range_search( size_t code_size, RangeSearchResult* result, const faiss::IDSelector* sel) { - Run_hamming_range_search r; - dispatch_HammingComputer( - code_size, r, a, b, na, nb, radius, code_size, result, sel); + with_simd_level_256bit([&]() { + hamming_range_search_dispatch( + a, b, na, nb, radius, code_size, result, sel); + }); } -/* Count number of matches given a max threshold */ void hamming_count_thres( const uint8_t* bs1, const uint8_t* bs2, @@ -564,54 +215,22 @@ void hamming_count_thres( hamdis_t ht, size_t ncodes, size_t* nptr) { - switch (ncodes) { - case 8: - faiss::hamming_count_thres<64>( - C64(bs1), C64(bs2), n1, n2, ht, nptr); - return; - case 16: - faiss::hamming_count_thres<128>( - C64(bs1), C64(bs2), n1, n2, ht, nptr); - return; - case 32: - faiss::hamming_count_thres<256>( - C64(bs1), C64(bs2), n1, n2, ht, nptr); - return; - case 64: - faiss::hamming_count_thres<512>( - C64(bs1), C64(bs2), n1, n2, ht, nptr); - return; - default: - FAISS_THROW_FMT("not implemented for %zu bits", ncodes); - } + with_simd_level_256bit([&]() { + hamming_count_thres_dispatch(bs1, bs2, n1, n2, ht, ncodes, nptr); + }); } -/* Count number of cross-matches given a threshold */ void crosshamming_count_thres( const uint8_t* dbs, size_t n, hamdis_t ht, size_t ncodes, size_t* nptr) { - switch (ncodes) { - case 8: - faiss::crosshamming_count_thres<64>(C64(dbs), n, ht, nptr); - return; - case 16: - faiss::crosshamming_count_thres<128>(C64(dbs), n, ht, nptr); - return; - case 32: - faiss::crosshamming_count_thres<256>(C64(dbs), n, ht, nptr); - return; - case 64: - 
faiss::crosshamming_count_thres<512>(C64(dbs), n, ht, nptr); - return; - default: - FAISS_THROW_FMT("not implemented for %zu bits", ncodes); - } + with_simd_level_256bit([&]() { + crosshamming_count_thres_dispatch(dbs, n, ht, ncodes, nptr); + }); } -/* Returns all matches given a threshold */ size_t match_hamming_thres( const uint8_t* bs1, const uint8_t* bs2, @@ -621,49 +240,10 @@ size_t match_hamming_thres( size_t ncodes, int64_t* idx, hamdis_t* dis) { - switch (ncodes) { - case 8: - return faiss::match_hamming_thres<64>( - C64(bs1), C64(bs2), n1, n2, ht, idx, dis); - case 16: - return faiss::match_hamming_thres<128>( - C64(bs1), C64(bs2), n1, n2, ht, idx, dis); - case 32: - return faiss::match_hamming_thres<256>( - C64(bs1), C64(bs2), n1, n2, ht, idx, dis); - case 64: - return faiss::match_hamming_thres<512>( - C64(bs1), C64(bs2), n1, n2, ht, idx, dis); - default: - FAISS_THROW_FMT("not implemented for %zu bits", ncodes); - return 0; - } -} - -#undef C64 - -/************************************* - * generalized Hamming distances - ************************************/ - -template -static void hamming_dis_inner_loop( - const uint8_t* __restrict ca, - const uint8_t* __restrict cb, - size_t nb, - size_t code_size, - int k, - hamdis_t* __restrict bh_val_, - int64_t* __restrict bh_ids_) { - HammingComputer hc(ca, code_size); - - for (size_t j = 0; j < nb; j++) { - int ndiff = hc.hamming(cb); - cb += code_size; - if (ndiff < bh_val_[0]) { - maxheap_replace_top(k, bh_val_, bh_ids_, ndiff, j); - } - } + return with_simd_level_256bit([&]() -> size_t { + return match_hamming_thres_dispatch( + bs1, bs2, n1, n2, ht, ncodes, idx, dis); + }); } void generalized_hammings_knn_hc( @@ -673,46 +253,16 @@ void generalized_hammings_knn_hc( size_t nb, size_t code_size, int ordered) { - int na = ha->nh; - int k = ha->k; - - if (ordered) { - ha->heapify(); - } - -#pragma omp parallel for - for (int i = 0; i < na; i++) { - const uint8_t* __restrict ca = a + i * code_size; - const 
uint8_t* __restrict cb = b; - - hamdis_t* __restrict bh_val_ = ha->val + i * k; - int64_t* __restrict bh_ids_ = ha->ids + i * k; - - switch (code_size) { - case 8: - hamming_dis_inner_loop( - ca, cb, nb, 8, k, bh_val_, bh_ids_); - break; - case 16: - hamming_dis_inner_loop( - ca, cb, nb, 16, k, bh_val_, bh_ids_); - break; - case 32: - hamming_dis_inner_loop( - ca, cb, nb, 32, k, bh_val_, bh_ids_); - break; - default: - hamming_dis_inner_loop( - ca, cb, nb, code_size, k, bh_val_, bh_ids_); - break; - } - } - - if (ordered) { - ha->reorder(); - } + with_simd_level_256bit([&]() { + generalized_hammings_knn_hc_dispatch( + ha, a, b, nb, code_size, ordered); + }); } +/****************************************************************** + * Bitstring pack/unpack (scalar, no dispatch needed) + ******************************************************************/ + void pack_bitstrings( size_t n, size_t M, diff --git a/faiss/utils/hamming.h b/faiss/utils/hamming.h index 64dfbf7467..7011572856 100644 --- a/faiss/utils/hamming.h +++ b/faiss/utils/hamming.h @@ -30,6 +30,7 @@ #include #include #include +#include // Low-level Hamming distance computations and hamdis_t. #include @@ -284,6 +285,90 @@ void unpack_bitstrings( size_t code_size, int32_t* unpacked); +/** SIMD-dispatched Hamming function implementations. + * Specializations live in per-ISA TUs (hamming_avx2.cpp, etc.). 
*/ + +template +void hammings_knn_hc_dispatch( + int_maxheap_array_t* ha, + const uint8_t* a, + const uint8_t* b, + size_t nb, + size_t ncodes, + int ordered, + ApproxTopK_mode_t approx_topk_mode, + const IDSelector* sel); + +template +void hammings_knn_mc_dispatch( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + size_t k, + size_t ncodes, + int32_t* distances, + int64_t* labels, + const IDSelector* sel); + +template +void hamming_range_search_dispatch( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + int radius, + size_t code_size, + RangeSearchResult* result, + const IDSelector* sel); + +template +void hammings_dispatch( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + size_t ncodes, + hamdis_t* dis); + +template +void generalized_hammings_knn_hc_dispatch( + int_maxheap_array_t* ha, + const uint8_t* a, + const uint8_t* b, + size_t nb, + size_t code_size, + int ordered); + +template +void hamming_count_thres_dispatch( + const uint8_t* bs1, + const uint8_t* bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t ncodes, + size_t* nptr); + +template +void crosshamming_count_thres_dispatch( + const uint8_t* dbs, + size_t n, + hamdis_t ht, + size_t ncodes, + size_t* nptr); + +template +size_t match_hamming_thres_dispatch( + const uint8_t* bs1, + const uint8_t* bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t ncodes, + int64_t* idx, + hamdis_t* dis); + } // namespace faiss #include diff --git a/faiss/utils/hamming_distance/avx2-inl.h b/faiss/utils/hamming_distance/avx2-inl.h index b7d5f32e66..1ca322c7b2 100644 --- a/faiss/utils/hamming_distance/avx2-inl.h +++ b/faiss/utils/hamming_distance/avx2-inl.h @@ -8,10 +8,11 @@ #ifndef HAMMING_AVX2_INL_H #define HAMMING_AVX2_INL_H -// AVX2 version +// AVX2 version. +// Most code is now in common.h. This file only contains the +// GenHammingComputer classes that leverage SSE/AVX2 intrinsics. 
#include -#include #include #include @@ -20,376 +21,13 @@ namespace faiss { -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - -/* Hamming distances for multiples of 64 bits */ -template -inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { - const size_t nwords = nbits / 64; - size_t i; - hamdis_t h = 0; - for (i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/* specialized (optimized) functions */ -template <> -inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]); -} - -template <> -inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); -} - -template <> -inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + - popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); -} - -/* Hamming distances for multiple of 64 bits */ -inline hamdis_t hamming( - const uint64_t* bs1, - const uint64_t* bs2, - size_t nwords) { - hamdis_t h = 0; - for (size_t i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/****************************************************************** - * The HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. 
- ******************************************************************/ - -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 4); - (void)code_size; - a0 = *(uint32_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint32_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 4; - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 8); - (void)code_size; - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - -struct HammingComputer16 { - uint64_t a0, a1; - - HammingComputer16() {} - - HammingComputer16(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 16); - (void)code_size; - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); - } - - inline static constexpr int get_code_size() { - return 16; - } -}; - -// when applied to an array, 1/2 of the 64-bit accesses are unaligned. -// This incurs a penalty of ~10% wrt. fully aligned accesses. 
-struct HammingComputer20 { - uint64_t a0, a1; - uint32_t a2; - - HammingComputer20() {} - - HammingComputer20(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 20); - (void)code_size; - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(*(uint32_t*)(b + 2) ^ a2); - } - - inline static constexpr int get_code_size() { - return 20; - } -}; - -struct HammingComputer32 { - uint64_t a0, a1, a2, a3; - - HammingComputer32() {} - - HammingComputer32(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 32); - (void)code_size; - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); - } - - inline static constexpr int get_code_size() { - return 32; - } -}; - -struct HammingComputer64 { - uint64_t a0, a1, a2, a3, a4, a5, a6, a7; - - HammingComputer64() {} - - HammingComputer64(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 64); - (void)code_size; - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + - popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + - popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); - } - - inline static constexpr int get_code_size() 
{ - return 64; - } -}; - -struct HammingComputerDefault { - const uint8_t* a8; - int quotient8; - int remainder8; - - HammingComputerDefault() {} - - HammingComputerDefault(const uint8_t* a8_in, int code_size) { - set(a8_in, code_size); - } - - void set(const uint8_t* a8_2, int code_size) { - this->a8 = a8_2; - quotient8 = code_size / 8; - remainder8 = code_size % 8; - } - - int hamming(const uint8_t* b8) const { - int accu = 0; - - const uint64_t* a64 = reinterpret_cast(a8); - const uint64_t* b64 = reinterpret_cast(b8); - int i = 0, len = quotient8; - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - [[fallthrough]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - [[fallthrough]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - [[fallthrough]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - [[fallthrough]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - [[fallthrough]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; - [[fallthrough]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - [[fallthrough]]; - default: - break; - } - } - - return accu; - } - - inline int get_code_size() const { - return quotient8 * 8 + remainder8; - } -}; - 
-/*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. - ***************************************************************************/ - -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - -struct GenHammingComputer8 { - uint64_t a0; - - GenHammingComputer8(const uint8_t* a, int code_size) { - assert(code_size == 8); - (void)code_size; - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return generalized_hamming_64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - // I'm not sure whether this version is faster of slower, tbh // todo: test on different CPUs struct GenHammingComputer16 { __m128i a; - GenHammingComputer16(const uint8_t* a8, int code_size) { + GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); - (void)code_size; a = _mm_loadu_si128((const __m128i_u*)a8); } @@ -408,9 +46,8 @@ struct GenHammingComputer16 { struct GenHammingComputer32 { __m256i a; - GenHammingComputer32(const uint8_t* a8, int code_size) { + GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 32); - (void)code_size; a = _mm256_loadu_si256((const __m256i_u*)a8); } diff --git a/faiss/utils/hamming_distance/avx512-inl.h b/faiss/utils/hamming_distance/avx512-inl.h index dfd8f55bfe..af04ef175d 100644 --- a/faiss/utils/hamming_distance/avx512-inl.h +++ b/faiss/utils/hamming_distance/avx512-inl.h @@ -8,15 +8,13 @@ #ifndef HAMMING_AVX512_INL_H #define HAMMING_AVX512_INL_H -// AVX512 version -// The _mm512_popcnt_epi64 intrinsic is used to accelerate Hamming distance -// calculations in HammingComputerDefault and HammingComputer64. 
This intrinsic -// is not available in the default Faiss avx512 build mode but is only -// available in the avx512_spr build mode, which targets Intel(R) Sapphire -// Rapids. +// AVX512 version. +// Most code is now in common.h. This file provides AVX512-specific +// overrides for HammingComputer64 and HammingComputerDefault +// (using _mm512_popcnt_epi64 when __AVX512VPOPCNTDQ__ is available), +// and the GenHammingComputer classes that leverage SSE/AVX2 intrinsics. #include -#include #include #include @@ -25,199 +23,6 @@ namespace faiss { -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - -/* Hamming distances for multiples of 64 bits */ -template -inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { - const size_t nwords = nbits / 64; - size_t i; - hamdis_t h = 0; - for (i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/* specialized (optimized) functions */ -template <> -inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]); -} - -template <> -inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); -} - -template <> -inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + - popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); -} - -/* Hamming distances for multiple of 64 bits */ -inline hamdis_t hamming( - const uint64_t* bs1, - const uint64_t* bs2, - size_t nwords) { - hamdis_t h = 0; - for (size_t i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/****************************************************************** - * The 
HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. - ******************************************************************/ - -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 4); - a0 = *(uint32_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint32_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 4; - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - -struct HammingComputer16 { - uint64_t a0, a1; - - HammingComputer16() {} - - HammingComputer16(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 16); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); - } - - inline static constexpr int get_code_size() { - return 16; - } -}; - -// when applied to an array, 1/2 of the 64-bit accesses are unaligned. -// This incurs a penalty of ~10% wrt. fully aligned accesses. 
-struct HammingComputer20 { - uint64_t a0, a1; - uint32_t a2; - - HammingComputer20() {} - - HammingComputer20(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 20); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(*(uint32_t*)(b + 2) ^ a2); - } - - inline static constexpr int get_code_size() { - return 20; - } -}; - -struct HammingComputer32 { - uint64_t a0, a1, a2, a3; - - HammingComputer32() {} - - HammingComputer32(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, int code_size) { - assert(code_size == 32); - const uint64_t* a = (uint64_t*)a8; - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); - } - - inline static constexpr int get_code_size() { - return 32; - } -}; - struct HammingComputer64 { uint64_t a0, a1, a2, a3, a4, a5, a6, a7; const uint64_t* a; @@ -228,9 +33,9 @@ struct HammingComputer64 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 64); - a = (uint64_t*)a8; + a = reinterpret_cast(a8); a0 = a[0]; a1 = a[1]; a2 = a[2]; @@ -242,7 +47,7 @@ struct HammingComputer64 { } inline int hamming(const uint8_t* b8) const { - const uint64_t* b = (uint64_t*)b8; + const uint64_t* b = reinterpret_cast(b8); #ifdef __AVX512VPOPCNTDQ__ __m512i vxor = _mm512_xor_si512(_mm512_loadu_si512(a), _mm512_loadu_si512(b)); @@ -269,8 +74,8 @@ struct HammingComputerDefault { HammingComputerDefault() {} - HammingComputerDefault(const uint8_t* a8, int code_size) { - 
set(a8, code_size); + HammingComputerDefault(const uint8_t* a8_in, int code_size) { + set(a8_in, code_size); } void set(const uint8_t* a8_2, int code_size) { @@ -298,73 +103,8 @@ struct HammingComputerDefault { } i *= 8; #endif - int len = quotient8 - i; - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - [[fallthrough]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - [[fallthrough]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - [[fallthrough]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - [[fallthrough]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - [[fallthrough]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; - [[fallthrough]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - [[fallthrough]]; - default: - break; - } - } - + accu += hamming_popcount_tail( + a64, b64, i, quotient8, a8, b8, remainder8); return accu; } @@ -373,42 +113,12 @@ struct HammingComputerDefault { } }; -/*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. 
- ***************************************************************************/ - -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - -struct GenHammingComputer8 { - uint64_t a0; - - GenHammingComputer8(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return generalized_hamming_64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - // I'm not sure whether this version is faster of slower, tbh // todo: test on different CPUs struct GenHammingComputer16 { __m128i a; - GenHammingComputer16(const uint8_t* a8, int code_size) { + GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); a = _mm_loadu_si128((const __m128i_u*)a8); } @@ -428,7 +138,7 @@ struct GenHammingComputer16 { struct GenHammingComputer32 { __m256i a; - GenHammingComputer32(const uint8_t* a8, int code_size) { + GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 32); a = _mm256_loadu_si256((const __m256i_u*)a8); } diff --git a/faiss/utils/hamming_distance/common.h b/faiss/utils/hamming_distance/common.h index 154e14e2cc..b50feca2e0 100644 --- a/faiss/utils/hamming_distance/common.h +++ b/faiss/utils/hamming_distance/common.h @@ -8,6 +8,8 @@ #ifndef FAISS_hamming_common_h #define FAISS_hamming_common_h +#include +#include #include #include @@ -43,6 +45,407 @@ inline constexpr uint8_t hamdis_tab_ham_bytes[256] = { 4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8}; +/*************************************************************************** + * Universal code: identical across all architectures + ***************************************************************************/ + +/* Elementary Hamming distance computation: 
unoptimized */ +template +inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { + const size_t nbytes = nbits / 8; + size_t i; + T h = 0; + for (i = 0; i < nbytes; i++) { + h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; + } + return h; +} + +/*************************************************************************** + * generalized Hamming = number of bytes that are different between + * two codes. + ***************************************************************************/ + +inline int generalized_hamming_64(uint64_t a) { + a |= a >> 1; + a |= a >> 2; + a |= a >> 4; + a &= 0x0101010101010101UL; + return popcount64(a); +} + +/****************************************************************** + * The HammingComputer series of classes compares a single code of + * size 4 to 32 to incoming codes. They are intended for use as a + * template class where it would be inefficient to switch on the code + * size in the inner loop. Hopefully the compiler will inline the + * hamming() functions and put the a0, a1, ... in registers. 
+ ******************************************************************/ + +struct HammingComputer4 { + uint32_t a0; + + HammingComputer4() {} + + HammingComputer4(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 4); + const uint32_t* a32 = reinterpret_cast(a); + a0 = *a32; + } + + inline int hamming(const uint8_t* b) const { + const uint32_t* b32 = reinterpret_cast(b); + return popcount64(*b32 ^ a0); + } + + inline static constexpr int get_code_size() { + return 4; + } +}; + +struct HammingComputer8 { + uint64_t a0; + + HammingComputer8() {} + + HammingComputer8(const uint8_t* a, int code_size) { + set(a, code_size); + } + + void set(const uint8_t* a, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 8); + const uint64_t* a64 = reinterpret_cast(a); + a0 = *a64; + } + + inline int hamming(const uint8_t* b) const { + const uint64_t* b64 = reinterpret_cast(b); + return popcount64(*b64 ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +/* Duff's device + byte remainder tail for HammingComputerDefault. + * Processes uint64 words starting at index i_start using popcount, + * then handles any remaining bytes via lookup table. 
*/ +inline int hamming_popcount_tail( + const uint64_t* a64, + const uint64_t* b64, + int i_start, + int quotient8, + const uint8_t* a8, + const uint8_t* b8, + int remainder8) { + int accu = 0; + int i = i_start; + int len = quotient8 - i_start; + switch (len & 7) { + default: + while (len > 7) { + len -= 8; + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 7: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 6: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 5: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 4: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 3: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 2: + accu += popcount64(a64[i] ^ b64[i]); + i++; + [[fallthrough]]; + case 1: + accu += popcount64(a64[i] ^ b64[i]); + i++; + } + } + if (remainder8) { + const uint8_t* a = a8 + 8 * quotient8; + const uint8_t* b = b8 + 8 * quotient8; + switch (remainder8) { + case 7: + accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; + [[fallthrough]]; + case 6: + accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; + [[fallthrough]]; + case 5: + accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; + [[fallthrough]]; + case 4: + accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; + [[fallthrough]]; + case 3: + accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; + [[fallthrough]]; + case 2: + accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; + [[fallthrough]]; + case 1: + accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; + break; + default: + break; + } + } + return accu; +} + +/*************************************************************************** + * Scalar code shared by generic, AVX2, and AVX512 backends. + * NEON provides its own optimized versions of these. 
+ ***************************************************************************/ + +#ifndef __aarch64__ + +/* Hamming distances for multiples of 64 bits */ +template +inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { + const size_t nwords = nbits / 64; + size_t i; + hamdis_t h = 0; + for (i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +/* specialized (optimized) functions */ +template <> +inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]); +} + +template <> +inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); +} + +template <> +inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { + return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + + popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); +} + +/* Hamming distances for multiple of 64 bits */ +inline hamdis_t hamming( + const uint64_t* bs1, + const uint64_t* bs2, + size_t nwords) { + hamdis_t h = 0; + for (size_t i = 0; i < nwords; i++) { + h += popcount64(bs1[i] ^ bs2[i]); + } + return h; +} + +struct HammingComputer16 { + uint64_t a0, a1; + + HammingComputer16() {} + + HammingComputer16(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 16); + const uint64_t* a = reinterpret_cast(a8); + a0 = a[0]; + a1 = a[1]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = reinterpret_cast(b8); + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); + } + + inline static constexpr int get_code_size() { + return 16; + } +}; + +// when applied to an array, 1/2 of the 64-bit accesses are unaligned. +// This incurs a penalty of ~10% wrt. fully aligned accesses. 
+struct HammingComputer20 { + uint64_t a0, a1; + uint32_t a2; + + HammingComputer20() {} + + HammingComputer20(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 20); + const uint64_t* a = reinterpret_cast(a8); + const uint32_t* a32 = reinterpret_cast(a8); + a0 = a[0]; + a1 = a[1]; + // can't read a[2] since it is uint64_t, not uint32_t + // results in AddressSanitizer failure reading past end of array + a2 = a32[4]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = reinterpret_cast(b8); + const uint32_t* b32_tail = reinterpret_cast(b + 2); + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(*b32_tail ^ a2); + } + + inline static constexpr int get_code_size() { + return 20; + } +}; + +struct HammingComputer32 { + uint64_t a0, a1, a2, a3; + + HammingComputer32() {} + + HammingComputer32(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 32); + const uint64_t* a = reinterpret_cast(a8); + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = reinterpret_cast(b8); + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); + } + + inline static constexpr int get_code_size() { + return 32; + } +}; + +struct GenHammingComputer8 { + uint64_t a0; + + GenHammingComputer8(const uint8_t* a, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 8); + const uint64_t* a64 = reinterpret_cast(a); + a0 = *a64; + } + + inline int hamming(const uint8_t* b) const { + const uint64_t* b64 = reinterpret_cast(b); + return generalized_hamming_64(*b64 ^ a0); + } + + inline static constexpr int get_code_size() { + return 8; + } +}; + +#endif // !__aarch64__ + 
+/*************************************************************************** + * Scalar code shared by generic and AVX2 backends only. + * AVX512 and NEON provide their own optimized versions. + ***************************************************************************/ + +#if !defined(__aarch64__) && !defined(__AVX512F__) + +struct HammingComputer64 { + uint64_t a0, a1, a2, a3, a4, a5, a6, a7; + + HammingComputer64() {} + + HammingComputer64(const uint8_t* a8, int code_size) { + set(a8, code_size); + } + + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { + assert(code_size == 64); + const uint64_t* a = reinterpret_cast(a8); + a0 = a[0]; + a1 = a[1]; + a2 = a[2]; + a3 = a[3]; + a4 = a[4]; + a5 = a[5]; + a6 = a[6]; + a7 = a[7]; + } + + inline int hamming(const uint8_t* b8) const { + const uint64_t* b = reinterpret_cast(b8); + return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + + popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + + popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + + popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); + } + + inline static constexpr int get_code_size() { + return 64; + } +}; + +struct HammingComputerDefault { + const uint8_t* a8; + int quotient8; + int remainder8; + + HammingComputerDefault() {} + + HammingComputerDefault(const uint8_t* a8_in, int code_size) { + set(a8_in, code_size); + } + + void set(const uint8_t* a8_in, int code_size) { + this->a8 = a8_in; + quotient8 = code_size / 8; + remainder8 = code_size % 8; + } + + int hamming(const uint8_t* b8) const { + const uint64_t* a64 = reinterpret_cast(a8); + const uint64_t* b64 = reinterpret_cast(b8); + return hamming_popcount_tail( + a64, b64, 0, quotient8, a8, b8, remainder8); + } + + inline int get_code_size() const { + return quotient8 * 8 + remainder8; + } +}; + +#endif // !__aarch64__ && !__AVX512F__ + } // namespace faiss #endif diff --git a/faiss/utils/hamming_distance/generic-inl.h b/faiss/utils/hamming_distance/generic-inl.h index 7f229f4f11..59d942dea6 100644 
--- a/faiss/utils/hamming_distance/generic-inl.h +++ b/faiss/utils/hamming_distance/generic-inl.h @@ -9,383 +9,19 @@ #define HAMMING_GENERIC_INL_H // A general-purpose version of hamming distance computation. +// Most code is now in common.h. This file only contains the +// GenHammingComputer classes that use scalar generalized_hamming_64. #include -#include #include #include namespace faiss { -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - -/* Hamming distances for multiples of 64 bits */ -template -inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { - const size_t nwords = nbits / 64; - size_t i; - hamdis_t h = 0; - for (i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/* specialized (optimized) functions */ -template <> -inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]); -} - -template <> -inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); -} - -template <> -inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { - return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + - popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); -} - -/* Hamming distances for multiple of 64 bits */ -inline hamdis_t hamming( - const uint64_t* bs1, - const uint64_t* bs2, - size_t nwords) { - hamdis_t h = 0; - for (size_t i = 0; i < nwords; i++) { - h += popcount64(bs1[i] ^ bs2[i]); - } - return h; -} - -/****************************************************************** - * The HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. 
They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. - ******************************************************************/ - -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, [[maybe_unused]] int code_size) { - assert(code_size == 4); - const uint32_t* a32 = reinterpret_cast(a); - a0 = *a32; - } - - inline int hamming(const uint8_t* b) const { - const uint32_t* b32 = reinterpret_cast(b); - return popcount64(*b32 ^ a0); - } - - inline static constexpr int get_code_size() { - return 4; - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, [[maybe_unused]] int code_size) { - assert(code_size == 8); - const uint64_t* a64 = reinterpret_cast(a); - a0 = *a64; - } - - inline int hamming(const uint8_t* b) const { - const uint64_t* b64 = reinterpret_cast(b); - return popcount64(*b64 ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - -struct HammingComputer16 { - uint64_t a0, a1; - - HammingComputer16() {} - - HammingComputer16(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, [[maybe_unused]] int code_size) { - assert(code_size == 16); - const uint64_t* a = reinterpret_cast(a8); - a0 = a[0]; - a1 = a[1]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = reinterpret_cast(b8); - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); - } - - inline static constexpr int get_code_size() { - return 16; - } -}; - -// when applied to an array, 1/2 of the 64-bit accesses are unaligned. -// This incurs a penalty of ~10% wrt. fully aligned accesses. 
-struct HammingComputer20 { - uint64_t a0, a1; - uint32_t a2; - - HammingComputer20() {} - - HammingComputer20(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, [[maybe_unused]] int code_size) { - assert(code_size == 20); - const uint64_t* a = reinterpret_cast(a8); - const uint32_t* a32 = reinterpret_cast(a8); - a0 = a[0]; - a1 = a[1]; - // can't read a[2] since it is uint64_t, not uint32_t - // results in AddressSanitizer failure reading past end of array - a2 = a32[4]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = reinterpret_cast(b8); - const uint32_t* b32_tail = reinterpret_cast(b + 2); - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(*b32_tail ^ a2); - } - - inline static constexpr int get_code_size() { - return 20; - } -}; - -struct HammingComputer32 { - uint64_t a0, a1, a2, a3; - - HammingComputer32() {} - - HammingComputer32(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, [[maybe_unused]] int code_size) { - assert(code_size == 32); - const uint64_t* a = reinterpret_cast(a8); - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = reinterpret_cast(b8); - return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); - } - - inline static constexpr int get_code_size() { - return 32; - } -}; - -struct HammingComputer64 { - uint64_t a0, a1, a2, a3, a4, a5, a6, a7; - - HammingComputer64() {} - - HammingComputer64(const uint8_t* a8, int code_size) { - set(a8, code_size); - } - - void set(const uint8_t* a8, [[maybe_unused]] int code_size) { - assert(code_size == 64); - const uint64_t* a = reinterpret_cast(a8); - a0 = a[0]; - a1 = a[1]; - a2 = a[2]; - a3 = a[3]; - a4 = a[4]; - a5 = a[5]; - a6 = a[6]; - a7 = a[7]; - } - - inline int hamming(const uint8_t* b8) const { - const uint64_t* b = reinterpret_cast(b8); - 
return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + - popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + - popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + - popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); - } - - inline static constexpr int get_code_size() { - return 64; - } -}; - -struct HammingComputerDefault { - const uint8_t* a8; - int quotient8; - int remainder8; - - HammingComputerDefault() {} - - HammingComputerDefault(const uint8_t* a8_in, int code_size) { - set(a8_in, code_size); - } - - void set(const uint8_t* a8_in, int code_size) { - this->a8 = a8_in; - quotient8 = code_size / 8; - remainder8 = code_size % 8; - } - - int hamming(const uint8_t* b8) const { - int accu = 0; - - const uint64_t* a64 = reinterpret_cast(a8); - const uint64_t* b64 = reinterpret_cast(b8); - int i = 0, len = quotient8; - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - [[fallthrough]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - [[fallthrough]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - [[fallthrough]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - [[fallthrough]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - [[fallthrough]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] 
^ b[1]]; - [[fallthrough]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - break; - default: - break; - } - } - - return accu; - } - - inline int get_code_size() const { - return quotient8 * 8 + remainder8; - } -}; - -/*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. - ***************************************************************************/ - -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - -struct GenHammingComputer8 { - uint64_t a0; - - GenHammingComputer8(const uint8_t* a, [[maybe_unused]] int code_size) { - assert(code_size == 8); - const uint64_t* a64 = reinterpret_cast(a); - a0 = *a64; - } - - inline int hamming(const uint8_t* b) const { - const uint64_t* b64 = reinterpret_cast(b); - return generalized_hamming_64(*b64 ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - struct GenHammingComputer16 { uint64_t a0, a1; - GenHammingComputer16(const uint8_t* a8, [[maybe_unused]] int code_size) { + GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); const uint64_t* a = reinterpret_cast(a8); a0 = a[0]; @@ -406,7 +42,7 @@ struct GenHammingComputer16 { struct GenHammingComputer32 { uint64_t a0, a1, a2, a3; - GenHammingComputer32(const uint8_t* a8, [[maybe_unused]] int code_size) { + GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 32); const uint64_t* a = reinterpret_cast(a8); a0 = a[0]; diff --git a/faiss/utils/hamming_distance/neon-inl.h b/faiss/utils/hamming_distance/neon-inl.h index d00a402857..d29f40e0b5 100644 --- a/faiss/utils/hamming_distance/neon-inl.h +++ b/faiss/utils/hamming_distance/neon-inl.h @@ -8,8 +8,11 @@ #ifndef HAMMING_NEON_INL_H #define HAMMING_NEON_INL_H -// a specialized version of hamming is 
needed here, because both +// A specialized version of hamming is needed here, because both // gcc, clang and msvc seem to generate suboptimal code sometimes. +// +// Universal code (HammingComputer4, HammingComputer8, generalized_hamming_64, +// byte-level hamming template, hamming_popcount_tail) comes from common.h. #ifdef __aarch64__ @@ -25,18 +28,6 @@ namespace faiss { -/* Elementary Hamming distance computation: unoptimized */ -template -inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { - const size_t nbytes = nbits / 8; - size_t i; - T h = 0; - for (i = 0; i < nbytes; i++) { - h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; - } - return h; -} - /* Hamming distances for multiples of 64 bits */ template inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb) { @@ -125,59 +116,10 @@ inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb, size_t nwords) { } /****************************************************************** - * The HammingComputer series of classes compares a single code of - * size 4 to 32 to incoming codes. They are intended for use as a - * template class where it would be inefficient to switch on the code - * size in the inner loop. Hopefully the compiler will inline the - * hamming() functions and put the a0, a1, ... in registers. + * NEON-optimized HammingComputer classes for sizes 16-64. + * Sizes 4 and 8 use the scalar versions from common.h. 
******************************************************************/ -struct HammingComputer4 { - uint32_t a0; - - HammingComputer4() {} - - HammingComputer4(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 4); - a0 = *(uint32_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint32_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 4; - } -}; - -struct HammingComputer8 { - uint64_t a0; - - HammingComputer8() {} - - HammingComputer8(const uint8_t* a, int code_size) { - set(a, code_size); - } - - void set(const uint8_t* a, int code_size) { - assert(code_size == 8); - a0 = *(uint64_t*)a; - } - - inline int hamming(const uint8_t* b) const { - return popcount64(*(uint64_t*)b ^ a0); - } - - inline static constexpr int get_code_size() { - return 8; - } -}; - struct HammingComputer16 { uint8x16_t a0; @@ -187,7 +129,7 @@ struct HammingComputer16 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); a0 = vld1q_u8(a8); } @@ -218,7 +160,7 @@ struct HammingComputer20 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 20); a0 = vld1q_u8(a8); @@ -253,7 +195,7 @@ struct HammingComputer32 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 32); a0 = vld1q_u8(a8); a1 = vld1q_u8(a8 + 16); @@ -286,7 +228,7 @@ struct HammingComputer64 { set(a8, code_size); } - void set(const uint8_t* a8, int code_size) { + void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 64); hc0.set(a8, 32); hc1.set(a8 + 32, 32); @@ -308,12 +250,12 @@ struct HammingComputerDefault { HammingComputerDefault() {} - 
HammingComputerDefault(const uint8_t* a8, int code_size) { - set(a8, code_size); + HammingComputerDefault(const uint8_t* a8_in, int code_size) { + set(a8_in, code_size); } - void set(const uint8_t* a8, int code_size) { - this->a8 = a8; + void set(const uint8_t* a8_in, int code_size) { + this->a8 = a8_in; quotient8 = code_size / 8; remainder8 = code_size % 8; } @@ -323,80 +265,15 @@ struct HammingComputerDefault { const uint64_t* a64 = reinterpret_cast(a8); const uint64_t* b64 = reinterpret_cast(b8); - int i = 0, len = quotient8; + int i = 0; int len256 = (quotient8 / 4) * 4; for (; i < len256; i += 4) { accu += ::faiss::hamming<256>(a64 + i, b64 + i); - len -= 4; - } - - switch (len & 7) { - default: - while (len > 7) { - len -= 8; - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 7: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 6: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 5: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 4: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 3: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 2: - accu += popcount64(a64[i] ^ b64[i]); - i++; - [[fallthrough]]; - case 1: - accu += popcount64(a64[i] ^ b64[i]); - i++; - } - } - if (remainder8) { - const uint8_t* a = a8 + 8 * quotient8; - const uint8_t* b = b8 + 8 * quotient8; - switch (remainder8) { - case 7: - accu += hamdis_tab_ham_bytes[a[6] ^ b[6]]; - [[fallthrough]]; - case 6: - accu += hamdis_tab_ham_bytes[a[5] ^ b[5]]; - [[fallthrough]]; - case 5: - accu += hamdis_tab_ham_bytes[a[4] ^ b[4]]; - [[fallthrough]]; - case 4: - accu += hamdis_tab_ham_bytes[a[3] ^ b[3]]; - [[fallthrough]]; - case 3: - accu += hamdis_tab_ham_bytes[a[2] ^ b[2]]; - [[fallthrough]]; - case 2: - accu += hamdis_tab_ham_bytes[a[1] ^ b[1]]; - [[fallthrough]]; - case 1: - accu += hamdis_tab_ham_bytes[a[0] ^ b[0]]; - [[fallthrough]]; - default: - break; - } } + accu 
+= hamming_popcount_tail( + a64, b64, i, quotient8, a8, b8, remainder8); return accu; } @@ -406,22 +283,13 @@ struct HammingComputerDefault { }; /*************************************************************************** - * generalized Hamming = number of bytes that are different between - * two codes. + * NEON-optimized generalized Hamming computers. ***************************************************************************/ -inline int generalized_hamming_64(uint64_t a) { - a |= a >> 1; - a |= a >> 2; - a |= a >> 4; - a &= 0x0101010101010101UL; - return popcount64(a); -} - struct GenHammingComputer8 { uint8x8_t a0; - GenHammingComputer8(const uint8_t* a8, int code_size) { + GenHammingComputer8(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 8); a0 = vld1_u8(a8); } @@ -441,7 +309,7 @@ struct GenHammingComputer8 { struct GenHammingComputer16 { uint8x16_t a0; - GenHammingComputer16(const uint8_t* a8, int code_size) { + GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) { assert(code_size == 16); a0 = vld1q_u8(a8); } @@ -461,7 +329,7 @@ struct GenHammingComputer16 { struct GenHammingComputer32 { GenHammingComputer16 a0, a1; - GenHammingComputer32(const uint8_t* a8, int code_size) + GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) : a0(a8, 16), a1(a8 + 16, 16) { assert(code_size == 32); } diff --git a/faiss/utils/simd_impl/hamming_avx2.cpp b/faiss/utils/simd_impl/hamming_avx2.cpp new file mode 100644 index 0000000000..ac0787a094 --- /dev/null +++ b/faiss/utils/simd_impl/hamming_avx2.cpp @@ -0,0 +1,14 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#ifdef COMPILE_SIMD_AVX2 + +#define THE_SIMD_LEVEL SIMDLevel::AVX2 +// NOLINTNEXTLINE(facebook-hte-InlineHeader) +#include + +#endif // COMPILE_SIMD_AVX2 diff --git a/faiss/utils/simd_impl/hamming_impl.h b/faiss/utils/simd_impl/hamming_impl.h new file mode 100644 index 0000000000..35e63d575f --- /dev/null +++ b/faiss/utils/simd_impl/hamming_impl.h @@ -0,0 +1,629 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */ + +// Shared implementation header for Hamming distance dynamic dispatch. +// Included by per-ISA TUs (hamming_avx2.cpp, hamming_neon.cpp) and by +// hamming.cpp (for the NONE fallback). +// +// THE_SIMD_LEVEL must be defined before including this header. +// +// INCLUDE ORDERING IS LOAD-BEARING: hamdis-inl.h must be included before +// hamming.h so that the ISA-specific struct definitions (selected by +// __AVX2__ / __aarch64__ / etc.) are established before hamming.h's own +// #include of hamdis-inl.h becomes a no-op via include guards. + +#pragma once + +#ifndef THE_SIMD_LEVEL +#error "Define THE_SIMD_LEVEL before including hamming_impl.h" +#endif + +// ISA-specific struct definitions — MUST come first (see comment above). +#include + +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +namespace faiss { + +// Alias for readability in the template specializations below. +constexpr SIMDLevel hamming_impl_SL = THE_SIMD_LEVEL; + +// All implementation templates live in an anonymous namespace for ODR safety. +// Different TUs compile this header with different HammingComputer struct +// layouts (generic vs AVX2 vs NEON). The anonymous namespace ensures each +// TU gets its own copy with internal linkage, preventing the linker from +// merging incompatible instantiations. 
+namespace { + +/****************************************************************** + * Bit-level Hamming distance templates + ******************************************************************/ + +template +void hammings_impl( + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, + size_t n1, + size_t n2, + hamdis_t* __restrict dis) { + size_t i, j; + const size_t nwords = nbits / 64; + for (i = 0; i < n1; i++) { + const uint64_t* __restrict bs1_ = bs1 + i * nwords; + hamdis_t* __restrict dis_ = dis + i * n2; + for (j = 0; j < n2; j++) { + dis_[j] = hamming(bs1_, bs2 + j * nwords); + } + } +} + +void hammings_impl_runtime( + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, + size_t n1, + size_t n2, + size_t nbits, + hamdis_t* __restrict dis) { + size_t i, j; + const size_t nwords = nbits / 64; + for (i = 0; i < n1; i++) { + const uint64_t* __restrict bs1_ = bs1 + i * nwords; + hamdis_t* __restrict dis_ = dis + i * n2; + for (j = 0; j < n2; j++) { + dis_[j] = hamming(bs1_, bs2 + j * nwords, nwords); + } + } +} + +template +void hamming_count_thres_impl( + const uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, + size_t n1, + size_t n2, + hamdis_t ht, + size_t* __restrict nptr) { + const size_t nwords = nbits / 64; + size_t i, j, posm = 0; + const uint64_t* bs2_ = bs2; + + for (i = 0; i < n1; i++) { + bs2 = bs2_; + for (j = 0; j < n2; j++) { + if (hamming(bs1, bs2) <= ht) { + posm++; + } + bs2 += nwords; + } + bs1 += nwords; + } + *nptr = posm; +} + +template +void crosshamming_count_thres_impl( + const uint64_t* __restrict dbs, + size_t n, + int ht, + size_t* __restrict nptr) { + const size_t nwords = nbits / 64; + size_t i, j, posm = 0; + const uint64_t* bs1 = dbs; + for (i = 0; i < n; i++) { + const uint64_t* bs2 = bs1 + 2; + for (j = i + 1; j < n; j++) { + if (hamming(bs1, bs2) <= ht) { + posm++; + } + bs2 += nwords; + } + bs1 += nwords; + } + *nptr = posm; +} + +template +size_t match_hamming_thres_impl( + const 
uint64_t* __restrict bs1, + const uint64_t* __restrict bs2, + size_t n1, + size_t n2, + int ht, + int64_t* __restrict idx, + hamdis_t* __restrict hams) { + const size_t nwords = nbits / 64; + size_t i, j, posm = 0; + hamdis_t h; + const uint64_t* bs2_ = bs2; + for (i = 0; i < n1; i++) { + bs2 = bs2_; + for (j = 0; j < n2; j++) { + h = hamming(bs1, bs2); + if (h <= ht) { + *idx = i; + idx++; + *idx = j; + idx++; + *hams = h; + hams++; + posm++; + } + bs2 += nwords; + } + bs1 += nwords; + } + return posm; +} + +/****************************************************************** + * HammingComputer-based search templates + ******************************************************************/ + +template +void hammings_knn_hc_impl( + int bytes_per_code, + int_maxheap_array_t* __restrict ha, + const uint8_t* __restrict bs1, + const uint8_t* __restrict bs2, + size_t n2, + bool order = true, + bool init_heap = true, + ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK, + const faiss::IDSelector* sel = nullptr) { + size_t k = ha->k; + if (init_heap) { + ha->heapify(); + } + + const size_t block_size = hamming_batch_size; + for (size_t j0 = 0; j0 < n2; j0 += block_size) { + const size_t j1 = std::min(j0 + block_size, n2); +#pragma omp parallel for + for (int64_t i = 0; i < static_cast(ha->nh); i++) { + HammingComputer hc(bs1 + i * bytes_per_code, bytes_per_code); + + const uint8_t* __restrict bs2_ = bs2 + j0 * bytes_per_code; + hamdis_t dis; + hamdis_t* __restrict bh_val_ = ha->val + i * k; + int64_t* __restrict bh_ids_ = ha->ids + i * k; + +#define HANDLE_APPROX(NB, BD) \ + case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD: \ + FAISS_THROW_IF_NOT_FMT( \ + k <= NB * BD, \ + "The chosen mode (%d) of approximate top-k supports " \ + "up to %d values, but %zd is requested.", \ + (int)(ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD), \ + NB * BD, \ + k); \ + HeapWithBucketsForHamming32< \ + CMax, \ + NB, \ + BD, \ + HammingComputer>:: \ + addn(j1 - 
j0, hc, bs2_, k, bh_val_, bh_ids_, sel); \ + break; + + switch (approx_topk_mode) { + HANDLE_APPROX(8, 3) + HANDLE_APPROX(8, 2) + HANDLE_APPROX(16, 2) + HANDLE_APPROX(32, 2) + default: { + for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) { + if (sel && !sel->is_member(j)) { + continue; + } + dis = hc.hamming(bs2_); + if (dis < bh_val_[0]) { + faiss::maxheap_replace_top( + k, bh_val_, bh_ids_, dis, j); + } + } + } break; + } + } + } + if (order) { + ha->reorder(); + } +} + +#undef HANDLE_APPROX + +template +void hammings_knn_mc_impl( + int bytes_per_code, + const uint8_t* __restrict a, + const uint8_t* __restrict b, + size_t na, + size_t nb, + size_t k, + int32_t* __restrict distances, + int64_t* __restrict labels, + const faiss::IDSelector* sel) { + const int nBuckets = bytes_per_code * 8 + 1; + std::vector all_counters(na * nBuckets, 0); + std::unique_ptr all_ids_per_dis(new int64_t[na * nBuckets * k]); + + std::vector> cs; + for (size_t i = 0; i < na; ++i) { + cs.push_back( + HCounterState( + all_counters.data() + i * nBuckets, + all_ids_per_dis.get() + i * nBuckets * k, + a + i * bytes_per_code, + 8 * bytes_per_code, + k)); + } + + const size_t block_size = hamming_batch_size; + for (size_t j0 = 0; j0 < nb; j0 += block_size) { + const size_t j1 = std::min(j0 + block_size, nb); +#pragma omp parallel for + for (int64_t i = 0; i < static_cast(na); ++i) { + for (size_t j = j0; j < j1; ++j) { + if (!sel || sel->is_member(j)) { + cs[i].update_counter(b + j * bytes_per_code, j); + } + } + } + } + + for (size_t i = 0; i < na; ++i) { + HCounterState& csi = cs[i]; + + size_t nres = 0; + for (int b_2 = 0; b_2 < nBuckets && nres < k; b_2++) { + for (int l = 0; l < csi.counters[b_2] && nres < k; l++) { + labels[i * k + nres] = csi.ids_per_dis[b_2 * k + l]; + distances[i * k + nres] = b_2; + nres++; + } + } + while (nres < k) { + labels[i * k + nres] = -1; + distances[i * k + nres] = std::numeric_limits::max(); + ++nres; + } + } +} + +template +void 
hamming_range_search_impl( + const uint8_t* a, + const uint8_t* b, + size_t na, + size_t nb, + int radius, + size_t code_size, + RangeSearchResult* res, + const faiss::IDSelector* sel) { +#pragma omp parallel + { + RangeSearchPartialResult pres(res); + +#pragma omp for + for (int64_t i = 0; i < static_cast(na); i++) { + HammingComputer hc(a + i * code_size, code_size); + const uint8_t* yi = b; + RangeQueryResult& qres = pres.new_result(i); + + for (size_t j = 0; j < nb; j++) { + if (!sel || sel->is_member(j)) { + int dis = hc.hamming(yi); + if (dis < radius) { + qres.add(dis, j); + } + } + yi += code_size; + } + } + pres.finalize(); + } +} + +// Consumer structs for dispatch_HammingComputer. +// These MUST be in the anonymous namespace so that each TU's +// dispatch_HammingComputer instantiation uses unique consumer types, +// preventing ODR violations across TUs with different struct layouts. + +struct Run_hammings_knn_hc { + using T = void; + template + void f(Types... args) { + hammings_knn_hc_impl(args...); + } +}; + +struct Run_hammings_knn_mc { + using T = void; + template + void f(Types... args) { + hammings_knn_mc_impl(args...); + } +}; + +struct Run_hamming_range_search { + using T = void; + template + void f(Types... 
args) {
+        hamming_range_search_impl<HammingComputer>(args...);
+    }
+};
+
+/******************************************************************
+ * Generalized Hamming distances
+ ******************************************************************/
+
+template <class HammingComputer>
+void hamming_dis_inner_loop( // heap-based top-k scan of nb codes against one query
+        const uint8_t* __restrict ca,
+        const uint8_t* __restrict cb,
+        size_t nb,
+        size_t code_size,
+        int k,
+        hamdis_t* __restrict bh_val_,
+        int64_t* __restrict bh_ids_) {
+    HammingComputer hc(ca, code_size);
+
+    for (size_t j = 0; j < nb; j++) {
+        int ndiff = hc.hamming(cb);
+        cb += code_size;
+        if (ndiff < bh_val_[0]) { // bh_val_[0] holds the current worst of the k best
+            maxheap_replace_top(k, bh_val_, bh_ids_, ndiff, j);
+        }
+    }
+}
+
+void generalized_hammings_knn_hc_impl(
+        int_maxheap_array_t* __restrict ha,
+        const uint8_t* __restrict a,
+        const uint8_t* __restrict b,
+        size_t nb,
+        size_t code_size,
+        int ordered) {
+    int na = ha->nh; // number of queries / heaps
+    int k = ha->k;
+
+    if (ordered) {
+        ha->heapify();
+    }
+
+#pragma omp parallel for
+    for (int i = 0; i < na; i++) { // parallel over queries: each thread owns one heap
+        const uint8_t* __restrict ca = a + i * code_size;
+        const uint8_t* __restrict cb = b;
+
+        hamdis_t* __restrict bh_val_ = ha->val + i * k;
+        int64_t* __restrict bh_ids_ = ha->ids + i * k;
+
+        switch (code_size) { // specialized computers for common code sizes
+            case 8:
+                hamming_dis_inner_loop<GenHammingComputer8>(
+                        ca, cb, nb, 8, k, bh_val_, bh_ids_);
+                break;
+            case 16:
+                hamming_dis_inner_loop<GenHammingComputer16>(
+                        ca, cb, nb, 16, k, bh_val_, bh_ids_);
+                break;
+            case 32:
+                hamming_dis_inner_loop<GenHammingComputer32>(
+                        ca, cb, nb, 32, k, bh_val_, bh_ids_);
+                break;
+            default:
+                hamming_dis_inner_loop<GenHammingComputerM8>(
+                        ca, cb, nb, code_size, k, bh_val_, bh_ids_);
+                break;
+        }
+    }
+
+    if (ordered) {
+        ha->reorder();
+    }
+}
+
+} // anonymous namespace
+
+/******************************************************************
+ * Entry point template specializations at THE_SIMD_LEVEL
+ ******************************************************************/
+
+#define C64(x) ((uint64_t*)x)
+
+template <>
+void hammings_knn_hc_dispatch<THE_SIMD_LEVEL>(
+        int_maxheap_array_t* ha,
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t nb,
+        size_t ncodes,
+        int ordered,
+        ApproxTopK_mode_t approx_topk_mode,
+        const IDSelector* sel) {
+    Run_hammings_knn_hc r;
+    // NOTE(review): assumes dispatch_HammingComputer deduces everything from
+    // (ncodes, consumer) — confirm no explicit SIMD-level template arg is needed.
+    dispatch_HammingComputer(
+            ncodes,
+            r,
+            ncodes,
+            ha,
+            a,
+            b,
+            nb,
+            ordered,
+            true,
+            approx_topk_mode,
+            sel);
+}
+
+template <>
+void hammings_knn_mc_dispatch<THE_SIMD_LEVEL>(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        size_t k,
+        size_t ncodes,
+        int32_t* distances,
+        int64_t* labels,
+        const IDSelector* sel) {
+    Run_hammings_knn_mc r;
+    dispatch_HammingComputer(
+            ncodes, r, ncodes, a, b, na, nb, k, distances, labels, sel);
+}
+
+template <>
+void hamming_range_search_dispatch<THE_SIMD_LEVEL>(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        int radius,
+        size_t code_size,
+        RangeSearchResult* result,
+        const IDSelector* sel) {
+    Run_hamming_range_search r;
+    dispatch_HammingComputer(
+            code_size, r, a, b, na, nb, radius, code_size, result, sel);
+}
+
+template <>
+void hammings_dispatch<THE_SIMD_LEVEL>(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        size_t ncodes,
+        hamdis_t* dis) {
+    FAISS_THROW_IF_NOT(ncodes % 8 == 0); // codes are handled as whole 64-bit words
+    switch (ncodes) {
+        case 8:
+            hammings_impl<64>(C64(a), C64(b), na, nb, dis);
+            return;
+        case 16:
+            hammings_impl<128>(C64(a), C64(b), na, nb, dis);
+            return;
+        case 32:
+            hammings_impl<256>(C64(a), C64(b), na, nb, dis);
+            return;
+        case 64:
+            hammings_impl<512>(C64(a), C64(b), na, nb, dis);
+            return;
+        default:
+            hammings_impl_runtime(C64(a), C64(b), na, nb, ncodes * 8, dis);
+            return;
+    }
+}
+
+template <>
+void generalized_hammings_knn_hc_dispatch<THE_SIMD_LEVEL>(
+        int_maxheap_array_t* ha,
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t nb,
+        size_t code_size,
+        int ordered) {
+    generalized_hammings_knn_hc_impl(ha, a, b, nb, code_size, ordered);
+}
+
+template <>
+void hamming_count_thres_dispatch<THE_SIMD_LEVEL>(
+        const uint8_t* bs1,
+        const uint8_t* bs2,
+        size_t n1,
+        size_t n2,
+        hamdis_t ht,
+        size_t ncodes,
+        size_t* nptr) {
+    switch (ncodes) {
+        case 8:
+            hamming_count_thres_impl<64>(C64(bs1), C64(bs2), n1, n2, ht, nptr);
+            return;
+        case 16:
+            hamming_count_thres_impl<128>(C64(bs1), C64(bs2), n1, n2, ht, nptr);
+            return;
+        case 32:
+            hamming_count_thres_impl<256>(C64(bs1), C64(bs2), n1, n2, ht, nptr);
+            return;
+        case 64:
+            hamming_count_thres_impl<512>(C64(bs1), C64(bs2), n1, n2, ht, nptr);
+            return;
+        default:
+            FAISS_THROW_FMT("not implemented for %zu bits", ncodes);
+    }
+}
+
+template <>
+void crosshamming_count_thres_dispatch<THE_SIMD_LEVEL>(
+        const uint8_t* dbs,
+        size_t n,
+        hamdis_t ht,
+        size_t ncodes,
+        size_t* nptr) {
+    switch (ncodes) { // only fixed power-of-two code sizes are supported here
+        case 8:
+            crosshamming_count_thres_impl<64>(C64(dbs), n, ht, nptr);
+            return;
+        case 16:
+            crosshamming_count_thres_impl<128>(C64(dbs), n, ht, nptr);
+            return;
+        case 32:
+            crosshamming_count_thres_impl<256>(C64(dbs), n, ht, nptr);
+            return;
+        case 64:
+            crosshamming_count_thres_impl<512>(C64(dbs), n, ht, nptr);
+            return;
+        default:
+            FAISS_THROW_FMT("not implemented for %zu bits", ncodes);
+    }
+}
+
+template <>
+size_t match_hamming_thres_dispatch<THE_SIMD_LEVEL>(
+        const uint8_t* bs1,
+        const uint8_t* bs2,
+        size_t n1,
+        size_t n2,
+        hamdis_t ht,
+        size_t ncodes,
+        int64_t* idx,
+        hamdis_t* dis) {
+    switch (ncodes) {
+        case 8:
+            return match_hamming_thres_impl<64>(
+                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
+        case 16:
+            return match_hamming_thres_impl<128>(
+                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
+        case 32:
+            return match_hamming_thres_impl<256>(
+                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
+        case 64:
+            return match_hamming_thres_impl<512>(
+                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
+        default:
+            FAISS_THROW_FMT("not implemented for %zu bits", ncodes);
+            return 0; // unreachable: FAISS_THROW_FMT throws
+    }
+}
+
+#undef C64
+
+} // namespace faiss
diff --git a/faiss/utils/simd_impl/hamming_neon.cpp b/faiss/utils/simd_impl/hamming_neon.cpp
new file mode 100644
index 0000000000..6c72618028
--- /dev/null
+++ b/faiss/utils/simd_impl/hamming_neon.cpp
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifdef COMPILE_SIMD_ARM_NEON
+
+#define THE_SIMD_LEVEL SIMDLevel::ARM_NEON
+// NOLINTNEXTLINE(facebook-hte-InlineHeader)
+#include <faiss/utils/simd_impl/hamming_impl.h>
+
+#endif // COMPILE_SIMD_ARM_NEON