diff --git a/faiss/CMakeLists.txt b/faiss/CMakeLists.txt
index cf870cc159..08082e02f7 100644
--- a/faiss/CMakeLists.txt
+++ b/faiss/CMakeLists.txt
@@ -15,6 +15,7 @@ set(FAISS_SIMD_AVX2_SRC
   impl/scalar_quantizer/sq-avx2.cpp
   impl/approx_topk/avx2.cpp
   utils/simd_impl/distances_avx2.cpp
+  utils/simd_impl/hamming_avx2.cpp
   utils/simd_impl/partitioning_avx2.cpp
   utils/distances_fused/simdlib_based.cpp
   utils/simd_impl/rabitq_avx2.cpp
@@ -33,6 +34,7 @@ set(FAISS_SIMD_NEON_SRC
   impl/scalar_quantizer/sq-neon.cpp
   impl/approx_topk/neon.cpp
   utils/simd_impl/distances_aarch64.cpp
+  utils/simd_impl/hamming_neon.cpp
   utils/simd_impl/partitioning_neon.cpp
   utils/distances_fused/simdlib_based_neon.cpp
   utils/simd_impl/rabitq_neon.cpp
@@ -342,6 +344,7 @@ set(FAISS_HEADERS
   utils/hamming_distance/avx512-inl.h
   utils/simd_impl/distances_autovec-inl.h
   utils/simd_impl/distances_simdlib256.h
+  utils/simd_impl/hamming_impl.h
   utils/simd_impl/exhaustive_L2sqr_blas_cmax.h
   utils/simd_impl/IVFFlatScanner-inl.h
   utils/simd_impl/distances_sse-inl.h
diff --git a/faiss/impl/platform_macros.h b/faiss/impl/platform_macros.h
index b7aa4111e3..8f18d98e54 100644
--- a/faiss/impl/platform_macros.h
+++ b/faiss/impl/platform_macros.h
@@ -126,9 +126,11 @@ inline int __builtin_clzll(uint64_t x) {
 #ifdef SWIG
 #define ALIGNED(x)
 #define FAISS_PACKED
+#define FAISS_MAYBE_UNUSED
 #else
 #define ALIGNED(x) __attribute__((aligned(x)))
 #define FAISS_PACKED __attribute__((packed))
+#define FAISS_MAYBE_UNUSED [[maybe_unused]]
 #endif
 
 // On non-Windows, FAISS_PACKED handles packing, so these are no-ops
diff --git a/faiss/utils/hamming.cpp b/faiss/utils/hamming.cpp
index 3c7931aa23..d02cd6c962 100644
--- a/faiss/utils/hamming.cpp
+++ b/faiss/utils/hamming.cpp
@@ -23,348 +23,28 @@
 
 #include <faiss/utils/hamming.h>
 
-#include <algorithm>
 #include <cstdio>
-#include <memory>
-#include <vector>
+#include <cstring>
 
-#include <faiss/impl/AuxIndexStructures.h>
 #include <faiss/impl/FaissAssert.h>
-#include <faiss/impl/IDSelector.h>
-#include <faiss/utils/Heap.h>
-#include <faiss/utils/approx_topk_hamming/approx_topk_hamming.h>
+#include <faiss/impl/simd_dispatch.h>
 #include <faiss/utils/utils.h>
 
+// Scalar (NONE) fallback — compiled without SIMD flags, hamdis-inl.h
+// selects generic-inl.h via the preprocessor ISA ladder.
+#define THE_SIMD_LEVEL SIMDLevel::NONE
+// NOLINTNEXTLINE(facebook-hte-InlineHeader)
+#include <faiss/utils/simd_impl/hamming_impl.h>
+#undef THE_SIMD_LEVEL
+
 namespace faiss {
 
 size_t hamming_batch_size = 65536;
 
-template <size_t nbits>
-void hammings(
-        const uint64_t* __restrict bs1,
-        const uint64_t* __restrict bs2,
-        size_t n1,
-        size_t n2,
-        hamdis_t* __restrict dis)
-
-{
-    size_t i, j;
-    const size_t nwords = nbits / 64;
-    for (i = 0; i < n1; i++) {
-        const uint64_t* __restrict bs1_ = bs1 + i * nwords;
-        hamdis_t* __restrict dis_ = dis + i * n2;
-        for (j = 0; j < n2; j++) {
-            dis_[j] = hamming<nbits>(bs1_, bs2 + j * nwords);
-        }
-    }
-}
-
-void hammings(
-        const uint64_t* __restrict bs1,
-        const uint64_t* __restrict bs2,
-        size_t n1,
-        size_t n2,
-        size_t nbits,
-        hamdis_t* __restrict dis) {
-    size_t i, j;
-    const size_t nwords = nbits / 64;
-    for (i = 0; i < n1; i++) {
-        const uint64_t* __restrict bs1_ = bs1 + i * nwords;
-        hamdis_t* __restrict dis_ = dis + i * n2;
-        for (j = 0; j < n2; j++) {
-            dis_[j] = hamming(bs1_, bs2 + j * nwords, nwords);
-        }
-    }
-}
-
-/* Count number of matches given a max threshold */
-template <size_t nbits>
-void hamming_count_thres(
-        const uint64_t* __restrict bs1,
-        const uint64_t* __restrict bs2,
-        size_t n1,
-        size_t n2,
-        hamdis_t ht,
-        size_t* __restrict nptr) {
-    const size_t nwords = nbits / 64;
-    size_t i, j, posm = 0;
-    const uint64_t* bs2_ = bs2;
-
-    for (i = 0; i < n1; i++) {
-        bs2 = bs2_;
-        for (j = 0; j < n2; j++) {
-            /* collect the match only if this satisfies the threshold */
-            if (hamming<nbits>(bs1, bs2) <= ht) {
-                posm++;
-            }
-            bs2 += nwords;
-        }
-        bs1 += nwords; /* next signature */
-    }
-    *nptr = posm;
-}
-
-template <size_t nbits>
-void crosshamming_count_thres(
-        const uint64_t* __restrict dbs,
-        size_t n,
-        int ht,
-        size_t* __restrict nptr) {
-    const size_t nwords = nbits / 64;
-    size_t i, j, posm = 0;
-    const uint64_t* bs1 = dbs;
-    for (i = 0; i < n; i++) {
-        const uint64_t* bs2 = bs1 + 2;
-        for (j = i + 1; j < n; j++) {
-            /* collect the match only if this satisfies the threshold */
-            if (hamming<nbits>(bs1, bs2) <= ht) {
-                posm++;
-            }
-            bs2 += nwords;
-        }
-        bs1 += nwords;
-    }
-    *nptr = posm;
-}
-
-template <size_t nbits>
-size_t match_hamming_thres(
-        const uint64_t* __restrict bs1,
-        const uint64_t* __restrict bs2,
-        size_t n1,
-        size_t n2,
-        int ht,
-        int64_t* __restrict idx,
-        hamdis_t* __restrict hams) {
-    const size_t nwords = nbits / 64;
-    size_t i, j, posm = 0;
-    hamdis_t h;
-    const uint64_t* bs2_ = bs2;
-    for (i = 0; i < n1; i++) {
-        bs2 = bs2_;
-        for (j = 0; j < n2; j++) {
-            /* Here perform the real work of computing the distance */
-            h = hamming<nbits>(bs1, bs2);
-
-            /* collect the match only if this satisfies the threshold */
-            if (h <= ht) {
-                /* Enough space to store another match ? */
-                *idx = i;
-                idx++;
-                *idx = j;
-                idx++;
-                *hams = h;
-                hams++;
-                posm++;
-            }
-            bs2 += nwords; /* next signature */
-        }
-        bs1 += nwords;
-    }
-    return posm;
-}
-
-namespace {
-
-/* Return closest neighbors w.r.t Hamming distance, using a heap. */
-template <class HammingComputer>
-void hammings_knn_hc(
-        int bytes_per_code,
-        int_maxheap_array_t* __restrict ha,
-        const uint8_t* __restrict bs1,
-        const uint8_t* __restrict bs2,
-        size_t n2,
-        bool order = true,
-        bool init_heap = true,
-        ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK,
-        const faiss::IDSelector* sel = nullptr) {
-    size_t k = ha->k;
-    if (init_heap) {
-        ha->heapify();
-    }
+/******************************************************************
+ * Scalar utility functions (no SIMD, no dispatch needed)
+ ******************************************************************/
 
-    const size_t block_size = hamming_batch_size;
-    for (size_t j0 = 0; j0 < n2; j0 += block_size) {
-        const size_t j1 = std::min(j0 + block_size, n2);
-#pragma omp parallel for
-        for (int64_t i = 0; i < static_cast<int64_t>(ha->nh); i++) {
-            HammingComputer hc(bs1 + i * bytes_per_code, bytes_per_code);
-
-            const uint8_t* __restrict bs2_ = bs2 + j0 * bytes_per_code;
-            hamdis_t dis;
-            hamdis_t* __restrict bh_val_ = ha->val + i * k;
-            int64_t* __restrict bh_ids_ = ha->ids + i * k;
-
-            // if larger number of k is required, then ::bs_addn() needs to be
-            // used instead of ::addn()
-#define HANDLE_APPROX(NB, BD)                                                \
-    case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD:               \
-        FAISS_THROW_IF_NOT_FMT(                                              \
-                k <= NB * BD,                                                \
-                "The chosen mode (%d) of approximate top-k supports "        \
-                "up to %d values, but %zd is requested.",                    \
-                (int)(ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD), \
-                NB * BD,                                                     \
-                k);                                                          \
-        HeapWithBucketsForHamming32<                                         \
-                CMax<hamdis_t, int64_t>,                                     \
-                NB,                                                          \
-                BD,                                                          \
-                HammingComputer>::                                           \
-                addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_, sel);           \
-        break;
-
-            switch (approx_topk_mode) {
-                HANDLE_APPROX(8, 3)
-                HANDLE_APPROX(8, 2)
-                HANDLE_APPROX(16, 2)
-                HANDLE_APPROX(32, 2)
-                default: {
-                    for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) {
-                        if (sel && !sel->is_member(j)) {
-                            continue;
-                        }
-                        dis = hc.hamming(bs2_);
-                        if (dis < bh_val_[0]) {
-                            faiss::maxheap_replace_top<hamdis_t>(
-                                    k, bh_val_, bh_ids_, dis, j);
-                        }
-                    }
-                } break;
-            }
-        }
-    }
-    if (order) {
-        ha->reorder();
-    }
-}
-
-/* Return closest neighbors w.r.t Hamming distance, using max count. */
-template <class HammingComputer>
-void hammings_knn_mc(
-        int bytes_per_code,
-        const uint8_t* __restrict a,
-        const uint8_t* __restrict b,
-        size_t na,
-        size_t nb,
-        size_t k,
-        int32_t* __restrict distances,
-        int64_t* __restrict labels,
-        const faiss::IDSelector* sel) {
-    const int nBuckets = bytes_per_code * 8 + 1;
-    std::vector<int> all_counters(na * nBuckets, 0);
-    std::unique_ptr<int64_t[]> all_ids_per_dis(new int64_t[na * nBuckets * k]);
-
-    std::vector<HCounterState<HammingComputer>> cs;
-    for (size_t i = 0; i < na; ++i) {
-        cs.push_back(
-                HCounterState<HammingComputer>(
-                        all_counters.data() + i * nBuckets,
-                        all_ids_per_dis.get() + i * nBuckets * k,
-                        a + i * bytes_per_code,
-                        8 * bytes_per_code,
-                        k));
-    }
-
-    const size_t block_size = hamming_batch_size;
-    for (size_t j0 = 0; j0 < nb; j0 += block_size) {
-        const size_t j1 = std::min(j0 + block_size, nb);
-#pragma omp parallel for
-        for (int64_t i = 0; i < static_cast<int64_t>(na); ++i) {
-            for (size_t j = j0; j < j1; ++j) {
-                if (!sel || sel->is_member(j)) {
-                    cs[i].update_counter(b + j * bytes_per_code, j);
-                }
-            }
-        }
-    }
-
-    for (size_t i = 0; i < na; ++i) {
-        HCounterState<HammingComputer>& csi = cs[i];
-
-        size_t nres = 0;
-        for (int b_2 = 0; b_2 < nBuckets && nres < k; b_2++) {
-            for (int l = 0; l < csi.counters[b_2] && nres < k; l++) {
-                labels[i * k + nres] = csi.ids_per_dis[b_2 * k + l];
-                distances[i * k + nres] = b_2;
-                nres++;
-            }
-        }
-        while (nres < k) {
-            labels[i * k + nres] = -1;
-            distances[i * k + nres] = std::numeric_limits<int32_t>::max();
-            ++nres;
-        }
-    }
-}
-
-template <class HammingComputer>
-void hamming_range_search(
-        const uint8_t* a,
-        const uint8_t* b,
-        size_t na,
-        size_t nb,
-        int radius,
-        size_t code_size,
-        RangeSearchResult* res,
-        const faiss::IDSelector* sel) {
-#pragma omp parallel
-    {
-        RangeSearchPartialResult pres(res);
-
-#pragma omp for
-        for (int64_t i = 0; i < static_cast<int64_t>(na); i++) {
-            HammingComputer hc(a + i * code_size, code_size);
-            const uint8_t* yi = b;
-            RangeQueryResult& qres = pres.new_result(i);
-
-            for (size_t j = 0; j < nb; j++) {
-                if (!sel || sel->is_member(j)) {
-                    int dis = hc.hamming(yi);
-                    if (dis < radius) {
-                        qres.add(dis, j);
-                    }
-                }
-                yi += code_size;
-            }
-        }
-        pres.finalize();
-    }
-}
-
-struct Run_hammings_knn_hc {
-    using T = void;
-    template <class HammingComputer, class... Types>
-    void f(Types... args) {
-        hammings_knn_hc<HammingComputer>(args...);
-    }
-};
-
-struct Run_hammings_knn_mc {
-    using T = void;
-    template <class HammingComputer, class... Types>
-    void f(Types... args) {
-        hammings_knn_mc<HammingComputer>(args...);
-    }
-};
-
-struct Run_hamming_range_search {
-    using T = void;
-    template <class HammingComputer, class... Types>
-    void f(Types... args) {
-        hamming_range_search<HammingComputer>(args...);
-    }
-};
-
-} // namespace
-
-/* Functions to maps vectors to bits. Assume proper allocation done beforehand,
-   meaning that b should be be able to receive as many bits as x may produce. */
-
-/*
- * dimension 0 corresponds to the least significant bit of b[0], or
- * equivalently to the lsb of the first byte that is stored.
- */
 void fvec2bitvec(const float* __restrict x, uint8_t* __restrict b, size_t d) {
     for (size_t i = 0; i < d; i += 8) {
         uint8_t w = 0;
@@ -381,8 +61,6 @@ void fvec2bitvec(const float* __restrict x, uint8_t* __restrict b, size_t d) {
     }
 }
 
-/* Same but for n vectors.
-   Ensure that the output b is byte-aligned (pad with 0s). */
 void fvecs2bitvecs(
         const float* __restrict x,
         uint8_t* __restrict b,
@@ -407,7 +85,6 @@ void bitvecs2fvecs(
     }
 }
 
-/* Reverse bit (NOT a optimized function, only used for print purpose) */
 static uint64_t uint64_reverse_bits(uint64_t b) {
     int i;
     uint64_t revb = 0;
@@ -419,7 +96,6 @@ static uint64_t uint64_reverse_bits(uint64_t b) {
     return revb;
 }
 
-/* print the bit vector */
 void bitvec_print(const uint8_t* b, size_t d) {
     size_t i, j;
     for (i = 0; i < d;) {
@@ -459,12 +135,10 @@ void bitvec_shuffle(
     }
 }
 
-/*----------------------------------------*/
-/* Hamming distance computation and k-nn  */
+/******************************************************************
+ * Dispatched Hamming distance public API
+ ******************************************************************/
 
-#define C64(x) ((uint64_t*)x)
-
-/* Compute a set of Hamming distances */
 void hammings(
         const uint8_t* __restrict a,
         const uint8_t* __restrict b,
@@ -472,24 +146,9 @@ void hammings(
         size_t nb,
         size_t ncodes,
         hamdis_t* __restrict dis) {
-    FAISS_THROW_IF_NOT(ncodes % 8 == 0);
-    switch (ncodes) {
-        case 8:
-            faiss::hammings<64>(C64(a), C64(b), na, nb, dis);
-            return;
-        case 16:
-            faiss::hammings<128>(C64(a), C64(b), na, nb, dis);
-            return;
-        case 32:
-            faiss::hammings<256>(C64(a), C64(b), na, nb, dis);
-            return;
-        case 64:
-            faiss::hammings<512>(C64(a), C64(b), na, nb, dis);
-            return;
-        default:
-            faiss::hammings(C64(a), C64(b), na, nb, ncodes * 8, dis);
-            return;
-    }
+    with_simd_level_256bit([&]<SIMDLevel SL>() {
+        hammings_dispatch<SL>(a, b, na, nb, ncodes, dis);
+    });
 }
 
 void hammings_knn(
@@ -511,19 +170,10 @@ void hammings_knn_hc(
         int order,
         ApproxTopK_mode_t approx_topk_mode,
         const faiss::IDSelector* sel) {
-    Run_hammings_knn_hc r;
-    dispatch_HammingComputer(
-            ncodes,
-            r,
-            ncodes,
-            ha,
-            a,
-            b,
-            nb,
-            order,
-            true,
-            approx_topk_mode,
-            sel);
+    with_simd_level_256bit([&]<SIMDLevel SL>() {
+        hammings_knn_hc_dispatch<SL>(
+                ha, a, b, nb, ncodes, order, approx_topk_mode, sel);
+    });
 }
 
 void hammings_knn_mc(
@@ -536,9 +186,10 @@ void hammings_knn_mc(
         int32_t* __restrict distances,
         int64_t* __restrict labels,
         const faiss::IDSelector* sel) {
-    Run_hammings_knn_mc r;
-    dispatch_HammingComputer(
-            ncodes, r, ncodes, a, b, na, nb, k, distances, labels, sel);
+    with_simd_level_256bit([&]<SIMDLevel SL>() {
+        hammings_knn_mc_dispatch<SL>(
+                a, b, na, nb, k, ncodes, distances, labels, sel);
+    });
 }
 
 void hamming_range_search(
@@ -550,12 +201,12 @@ void hamming_range_search(
         size_t code_size,
         RangeSearchResult* result,
         const faiss::IDSelector* sel) {
-    Run_hamming_range_search r;
-    dispatch_HammingComputer(
-            code_size, r, a, b, na, nb, radius, code_size, result, sel);
+    with_simd_level_256bit([&]<SIMDLevel SL>() {
+        hamming_range_search_dispatch<SL>(
+                a, b, na, nb, radius, code_size, result, sel);
+    });
 }
 
-/* Count number of matches given a max threshold            */
 void hamming_count_thres(
         const uint8_t* bs1,
         const uint8_t* bs2,
@@ -564,54 +215,22 @@ void hamming_count_thres(
         hamdis_t ht,
         size_t ncodes,
         size_t* nptr) {
-    switch (ncodes) {
-        case 8:
-            faiss::hamming_count_thres<64>(
-                    C64(bs1), C64(bs2), n1, n2, ht, nptr);
-            return;
-        case 16:
-            faiss::hamming_count_thres<128>(
-                    C64(bs1), C64(bs2), n1, n2, ht, nptr);
-            return;
-        case 32:
-            faiss::hamming_count_thres<256>(
-                    C64(bs1), C64(bs2), n1, n2, ht, nptr);
-            return;
-        case 64:
-            faiss::hamming_count_thres<512>(
-                    C64(bs1), C64(bs2), n1, n2, ht, nptr);
-            return;
-        default:
-            FAISS_THROW_FMT("not implemented for %zu bits", ncodes);
-    }
+    with_simd_level_256bit([&]<SIMDLevel SL>() {
+        hamming_count_thres_dispatch<SL>(bs1, bs2, n1, n2, ht, ncodes, nptr);
+    });
 }
 
-/* Count number of cross-matches given a threshold */
 void crosshamming_count_thres(
         const uint8_t* dbs,
         size_t n,
         hamdis_t ht,
         size_t ncodes,
         size_t* nptr) {
-    switch (ncodes) {
-        case 8:
-            faiss::crosshamming_count_thres<64>(C64(dbs), n, ht, nptr);
-            return;
-        case 16:
-            faiss::crosshamming_count_thres<128>(C64(dbs), n, ht, nptr);
-            return;
-        case 32:
-            faiss::crosshamming_count_thres<256>(C64(dbs), n, ht, nptr);
-            return;
-        case 64:
-            faiss::crosshamming_count_thres<512>(C64(dbs), n, ht, nptr);
-            return;
-        default:
-            FAISS_THROW_FMT("not implemented for %zu bits", ncodes);
-    }
+    with_simd_level_256bit([&]<SIMDLevel SL>() {
+        crosshamming_count_thres_dispatch<SL>(dbs, n, ht, ncodes, nptr);
+    });
 }
 
-/* Returns all matches given a threshold */
 size_t match_hamming_thres(
         const uint8_t* bs1,
         const uint8_t* bs2,
@@ -621,49 +240,10 @@ size_t match_hamming_thres(
         size_t ncodes,
         int64_t* idx,
         hamdis_t* dis) {
-    switch (ncodes) {
-        case 8:
-            return faiss::match_hamming_thres<64>(
-                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
-        case 16:
-            return faiss::match_hamming_thres<128>(
-                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
-        case 32:
-            return faiss::match_hamming_thres<256>(
-                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
-        case 64:
-            return faiss::match_hamming_thres<512>(
-                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
-        default:
-            FAISS_THROW_FMT("not implemented for %zu bits", ncodes);
-            return 0;
-    }
-}
-
-#undef C64
-
-/*************************************
- * generalized Hamming distances
- ************************************/
-
-template <class HammingComputer>
-static void hamming_dis_inner_loop(
-        const uint8_t* __restrict ca,
-        const uint8_t* __restrict cb,
-        size_t nb,
-        size_t code_size,
-        int k,
-        hamdis_t* __restrict bh_val_,
-        int64_t* __restrict bh_ids_) {
-    HammingComputer hc(ca, code_size);
-
-    for (size_t j = 0; j < nb; j++) {
-        int ndiff = hc.hamming(cb);
-        cb += code_size;
-        if (ndiff < bh_val_[0]) {
-            maxheap_replace_top<hamdis_t>(k, bh_val_, bh_ids_, ndiff, j);
-        }
-    }
+    return with_simd_level_256bit([&]<SIMDLevel SL>() -> size_t {
+        return match_hamming_thres_dispatch<SL>(
+                bs1, bs2, n1, n2, ht, ncodes, idx, dis);
+    });
 }
 
 void generalized_hammings_knn_hc(
@@ -673,46 +253,16 @@ void generalized_hammings_knn_hc(
         size_t nb,
         size_t code_size,
         int ordered) {
-    int na = ha->nh;
-    int k = ha->k;
-
-    if (ordered) {
-        ha->heapify();
-    }
-
-#pragma omp parallel for
-    for (int i = 0; i < na; i++) {
-        const uint8_t* __restrict ca = a + i * code_size;
-        const uint8_t* __restrict cb = b;
-
-        hamdis_t* __restrict bh_val_ = ha->val + i * k;
-        int64_t* __restrict bh_ids_ = ha->ids + i * k;
-
-        switch (code_size) {
-            case 8:
-                hamming_dis_inner_loop<GenHammingComputer8>(
-                        ca, cb, nb, 8, k, bh_val_, bh_ids_);
-                break;
-            case 16:
-                hamming_dis_inner_loop<GenHammingComputer16>(
-                        ca, cb, nb, 16, k, bh_val_, bh_ids_);
-                break;
-            case 32:
-                hamming_dis_inner_loop<GenHammingComputer32>(
-                        ca, cb, nb, 32, k, bh_val_, bh_ids_);
-                break;
-            default:
-                hamming_dis_inner_loop<GenHammingComputerM8>(
-                        ca, cb, nb, code_size, k, bh_val_, bh_ids_);
-                break;
-        }
-    }
-
-    if (ordered) {
-        ha->reorder();
-    }
+    with_simd_level_256bit([&]<SIMDLevel SL>() {
+        generalized_hammings_knn_hc_dispatch<SL>(
+                ha, a, b, nb, code_size, ordered);
+    });
 }
 
+/******************************************************************
+ * Bitstring pack/unpack (scalar, no dispatch needed)
+ ******************************************************************/
+
 void pack_bitstrings(
         size_t n,
         size_t M,
diff --git a/faiss/utils/hamming.h b/faiss/utils/hamming.h
index 64dfbf7467..7011572856 100644
--- a/faiss/utils/hamming.h
+++ b/faiss/utils/hamming.h
@@ -30,6 +30,7 @@
 #include <faiss/impl/IDSelector.h>
 #include <faiss/impl/platform_macros.h>
 #include <faiss/utils/Heap.h>
+#include <faiss/utils/simd_levels.h>
 
 // Low-level Hamming distance computations and hamdis_t.
 #include <faiss/utils/hamming_distance/hamdis-inl.h>
@@ -284,6 +285,90 @@ void unpack_bitstrings(
         size_t code_size,
         int32_t* unpacked);
 
+/** SIMD-dispatched Hamming function implementations.
+ *  Specializations live in per-ISA TUs (hamming_avx2.cpp, etc.). */
+
+template <SIMDLevel SL>
+void hammings_knn_hc_dispatch(
+        int_maxheap_array_t* ha,
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t nb,
+        size_t ncodes,
+        int ordered,
+        ApproxTopK_mode_t approx_topk_mode,
+        const IDSelector* sel);
+
+template <SIMDLevel SL>
+void hammings_knn_mc_dispatch(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        size_t k,
+        size_t ncodes,
+        int32_t* distances,
+        int64_t* labels,
+        const IDSelector* sel);
+
+template <SIMDLevel SL>
+void hamming_range_search_dispatch(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        int radius,
+        size_t code_size,
+        RangeSearchResult* result,
+        const IDSelector* sel);
+
+template <SIMDLevel SL>
+void hammings_dispatch(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        size_t ncodes,
+        hamdis_t* dis);
+
+template <SIMDLevel SL>
+void generalized_hammings_knn_hc_dispatch(
+        int_maxheap_array_t* ha,
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t nb,
+        size_t code_size,
+        int ordered);
+
+template <SIMDLevel SL>
+void hamming_count_thres_dispatch(
+        const uint8_t* bs1,
+        const uint8_t* bs2,
+        size_t n1,
+        size_t n2,
+        hamdis_t ht,
+        size_t ncodes,
+        size_t* nptr);
+
+template <SIMDLevel SL>
+void crosshamming_count_thres_dispatch(
+        const uint8_t* dbs,
+        size_t n,
+        hamdis_t ht,
+        size_t ncodes,
+        size_t* nptr);
+
+template <SIMDLevel SL>
+size_t match_hamming_thres_dispatch(
+        const uint8_t* bs1,
+        const uint8_t* bs2,
+        size_t n1,
+        size_t n2,
+        hamdis_t ht,
+        size_t ncodes,
+        int64_t* idx,
+        hamdis_t* dis);
+
 } // namespace faiss
 
 #include <faiss/utils/hamming-inl.h>
diff --git a/faiss/utils/hamming_distance/avx2-inl.h b/faiss/utils/hamming_distance/avx2-inl.h
index b7d5f32e66..1ca322c7b2 100644
--- a/faiss/utils/hamming_distance/avx2-inl.h
+++ b/faiss/utils/hamming_distance/avx2-inl.h
@@ -8,10 +8,11 @@
 #ifndef HAMMING_AVX2_INL_H
 #define HAMMING_AVX2_INL_H
 
-// AVX2 version
+// AVX2 version.
+// Most code is now in common.h. This file only contains the
+// GenHammingComputer classes that leverage SSE/AVX2 intrinsics.
 
 #include <cassert>
-#include <cstddef>
 #include <cstdint>
 
 #include <faiss/impl/platform_macros.h>
@@ -20,376 +21,13 @@
 
 namespace faiss {
 
-/* Elementary Hamming distance computation: unoptimized  */
-template <size_t nbits, typename T>
-inline T hamming(const uint8_t* bs1, const uint8_t* bs2) {
-    const size_t nbytes = nbits / 8;
-    size_t i;
-    T h = 0;
-    for (i = 0; i < nbytes; i++) {
-        h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]];
-    }
-    return h;
-}
-
-/* Hamming distances for multiples of 64 bits */
-template <size_t nbits>
-inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) {
-    const size_t nwords = nbits / 64;
-    size_t i;
-    hamdis_t h = 0;
-    for (i = 0; i < nwords; i++) {
-        h += popcount64(bs1[i] ^ bs2[i]);
-    }
-    return h;
-}
-
-/* specialized (optimized) functions */
-template <>
-inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) {
-    return popcount64(pa[0] ^ pb[0]);
-}
-
-template <>
-inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) {
-    return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]);
-}
-
-template <>
-inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) {
-    return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) +
-            popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]);
-}
-
-/* Hamming distances for multiple of 64 bits */
-inline hamdis_t hamming(
-        const uint64_t* bs1,
-        const uint64_t* bs2,
-        size_t nwords) {
-    hamdis_t h = 0;
-    for (size_t i = 0; i < nwords; i++) {
-        h += popcount64(bs1[i] ^ bs2[i]);
-    }
-    return h;
-}
-
-/******************************************************************
- * The HammingComputer series of classes compares a single code of
- * size 4 to 32 to incoming codes. They are intended for use as a
- * template class where it would be inefficient to switch on the code
- * size in the inner loop. Hopefully the compiler will inline the
- * hamming() functions and put the a0, a1, ... in registers.
- ******************************************************************/
-
-struct HammingComputer4 {
-    uint32_t a0;
-
-    HammingComputer4() {}
-
-    HammingComputer4(const uint8_t* a, int code_size) {
-        set(a, code_size);
-    }
-
-    void set(const uint8_t* a, int code_size) {
-        assert(code_size == 4);
-        (void)code_size;
-        a0 = *(uint32_t*)a;
-    }
-
-    inline int hamming(const uint8_t* b) const {
-        return popcount64(*(uint32_t*)b ^ a0);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 4;
-    }
-};
-
-struct HammingComputer8 {
-    uint64_t a0;
-
-    HammingComputer8() {}
-
-    HammingComputer8(const uint8_t* a, int code_size) {
-        set(a, code_size);
-    }
-
-    void set(const uint8_t* a, int code_size) {
-        assert(code_size == 8);
-        (void)code_size;
-        a0 = *(uint64_t*)a;
-    }
-
-    inline int hamming(const uint8_t* b) const {
-        return popcount64(*(uint64_t*)b ^ a0);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 8;
-    }
-};
-
-struct HammingComputer16 {
-    uint64_t a0, a1;
-
-    HammingComputer16() {}
-
-    HammingComputer16(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
-    }
-
-    void set(const uint8_t* a8, int code_size) {
-        assert(code_size == 16);
-        (void)code_size;
-        const uint64_t* a = (uint64_t*)a8;
-        a0 = a[0];
-        a1 = a[1];
-    }
-
-    inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = (uint64_t*)b8;
-        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 16;
-    }
-};
-
-// when applied to an array, 1/2 of the 64-bit accesses are unaligned.
-// This incurs a penalty of ~10% wrt. fully aligned accesses.
-struct HammingComputer20 {
-    uint64_t a0, a1;
-    uint32_t a2;
-
-    HammingComputer20() {}
-
-    HammingComputer20(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
-    }
-
-    void set(const uint8_t* a8, int code_size) {
-        assert(code_size == 20);
-        (void)code_size;
-        const uint64_t* a = (uint64_t*)a8;
-        a0 = a[0];
-        a1 = a[1];
-        a2 = a[2];
-    }
-
-    inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = (uint64_t*)b8;
-        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
-                popcount64(*(uint32_t*)(b + 2) ^ a2);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 20;
-    }
-};
-
-struct HammingComputer32 {
-    uint64_t a0, a1, a2, a3;
-
-    HammingComputer32() {}
-
-    HammingComputer32(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
-    }
-
-    void set(const uint8_t* a8, int code_size) {
-        assert(code_size == 32);
-        (void)code_size;
-        const uint64_t* a = (uint64_t*)a8;
-        a0 = a[0];
-        a1 = a[1];
-        a2 = a[2];
-        a3 = a[3];
-    }
-
-    inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = (uint64_t*)b8;
-        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
-                popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 32;
-    }
-};
-
-struct HammingComputer64 {
-    uint64_t a0, a1, a2, a3, a4, a5, a6, a7;
-
-    HammingComputer64() {}
-
-    HammingComputer64(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
-    }
-
-    void set(const uint8_t* a8, int code_size) {
-        assert(code_size == 64);
-        (void)code_size;
-        const uint64_t* a = (uint64_t*)a8;
-        a0 = a[0];
-        a1 = a[1];
-        a2 = a[2];
-        a3 = a[3];
-        a4 = a[4];
-        a5 = a[5];
-        a6 = a[6];
-        a7 = a[7];
-    }
-
-    inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = (uint64_t*)b8;
-        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
-                popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) +
-                popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) +
-                popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 64;
-    }
-};
-
-struct HammingComputerDefault {
-    const uint8_t* a8;
-    int quotient8;
-    int remainder8;
-
-    HammingComputerDefault() {}
-
-    HammingComputerDefault(const uint8_t* a8_in, int code_size) {
-        set(a8_in, code_size);
-    }
-
-    void set(const uint8_t* a8_2, int code_size) {
-        this->a8 = a8_2;
-        quotient8 = code_size / 8;
-        remainder8 = code_size % 8;
-    }
-
-    int hamming(const uint8_t* b8) const {
-        int accu = 0;
-
-        const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
-        const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
-        int i = 0, len = quotient8;
-        switch (len & 7) {
-            default:
-                while (len > 7) {
-                    len -= 8;
-                    accu += popcount64(a64[i] ^ b64[i]);
-                    i++;
-                    [[fallthrough]];
-                    case 7:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 6:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 5:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 4:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 3:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 2:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 1:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                }
-        }
-        if (remainder8) {
-            const uint8_t* a = a8 + 8 * quotient8;
-            const uint8_t* b = b8 + 8 * quotient8;
-            switch (remainder8) {
-                case 7:
-                    accu += hamdis_tab_ham_bytes[a[6] ^ b[6]];
-                    [[fallthrough]];
-                case 6:
-                    accu += hamdis_tab_ham_bytes[a[5] ^ b[5]];
-                    [[fallthrough]];
-                case 5:
-                    accu += hamdis_tab_ham_bytes[a[4] ^ b[4]];
-                    [[fallthrough]];
-                case 4:
-                    accu += hamdis_tab_ham_bytes[a[3] ^ b[3]];
-                    [[fallthrough]];
-                case 3:
-                    accu += hamdis_tab_ham_bytes[a[2] ^ b[2]];
-                    [[fallthrough]];
-                case 2:
-                    accu += hamdis_tab_ham_bytes[a[1] ^ b[1]];
-                    [[fallthrough]];
-                case 1:
-                    accu += hamdis_tab_ham_bytes[a[0] ^ b[0]];
-                    [[fallthrough]];
-                default:
-                    break;
-            }
-        }
-
-        return accu;
-    }
-
-    inline int get_code_size() const {
-        return quotient8 * 8 + remainder8;
-    }
-};
-
-/***************************************************************************
- * generalized Hamming = number of bytes that are different between
- * two codes.
- ***************************************************************************/
-
-inline int generalized_hamming_64(uint64_t a) {
-    a |= a >> 1;
-    a |= a >> 2;
-    a |= a >> 4;
-    a &= 0x0101010101010101UL;
-    return popcount64(a);
-}
-
-struct GenHammingComputer8 {
-    uint64_t a0;
-
-    GenHammingComputer8(const uint8_t* a, int code_size) {
-        assert(code_size == 8);
-        (void)code_size;
-        a0 = *(uint64_t*)a;
-    }
-
-    inline int hamming(const uint8_t* b) const {
-        return generalized_hamming_64(*(uint64_t*)b ^ a0);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 8;
-    }
-};
-
 // I'm not sure whether this version is faster of slower, tbh
 // todo: test on different CPUs
 struct GenHammingComputer16 {
     __m128i a;
 
-    GenHammingComputer16(const uint8_t* a8, int code_size) {
+    GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 16);
-        (void)code_size;
         a = _mm_loadu_si128((const __m128i_u*)a8);
     }
 
@@ -408,9 +46,8 @@ struct GenHammingComputer16 {
 struct GenHammingComputer32 {
     __m256i a;
 
-    GenHammingComputer32(const uint8_t* a8, int code_size) {
+    GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 32);
-        (void)code_size;
         a = _mm256_loadu_si256((const __m256i_u*)a8);
     }
 
diff --git a/faiss/utils/hamming_distance/avx512-inl.h b/faiss/utils/hamming_distance/avx512-inl.h
index dfd8f55bfe..af04ef175d 100644
--- a/faiss/utils/hamming_distance/avx512-inl.h
+++ b/faiss/utils/hamming_distance/avx512-inl.h
@@ -8,15 +8,13 @@
 #ifndef HAMMING_AVX512_INL_H
 #define HAMMING_AVX512_INL_H
 
-// AVX512 version
-// The _mm512_popcnt_epi64 intrinsic is used to accelerate Hamming distance
-// calculations in HammingComputerDefault and HammingComputer64. This intrinsic
-// is not available in the default Faiss avx512 build mode but is only
-// available in the avx512_spr build mode, which targets Intel(R) Sapphire
-// Rapids.
+// AVX512 version.
+// Most code is now in common.h. This file provides AVX512-specific
+// overrides for HammingComputer64 and HammingComputerDefault
+// (using _mm512_popcnt_epi64 when __AVX512VPOPCNTDQ__ is available),
+// and the GenHammingComputer classes that leverage SSE/AVX2 intrinsics.
 
 #include <cassert>
-#include <cstddef>
 #include <cstdint>
 
 #include <faiss/impl/platform_macros.h>
@@ -25,199 +23,6 @@
 
 namespace faiss {
 
-/* Elementary Hamming distance computation: unoptimized  */
-template <size_t nbits, typename T>
-inline T hamming(const uint8_t* bs1, const uint8_t* bs2) {
-    const size_t nbytes = nbits / 8;
-    size_t i;
-    T h = 0;
-    for (i = 0; i < nbytes; i++) {
-        h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]];
-    }
-    return h;
-}
-
-/* Hamming distances for multiples of 64 bits */
-template <size_t nbits>
-inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) {
-    const size_t nwords = nbits / 64;
-    size_t i;
-    hamdis_t h = 0;
-    for (i = 0; i < nwords; i++) {
-        h += popcount64(bs1[i] ^ bs2[i]);
-    }
-    return h;
-}
-
-/* specialized (optimized) functions */
-template <>
-inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) {
-    return popcount64(pa[0] ^ pb[0]);
-}
-
-template <>
-inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) {
-    return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]);
-}
-
-template <>
-inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) {
-    return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) +
-            popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]);
-}
-
-/* Hamming distances for multiple of 64 bits */
-inline hamdis_t hamming(
-        const uint64_t* bs1,
-        const uint64_t* bs2,
-        size_t nwords) {
-    hamdis_t h = 0;
-    for (size_t i = 0; i < nwords; i++) {
-        h += popcount64(bs1[i] ^ bs2[i]);
-    }
-    return h;
-}
-
-/******************************************************************
- * The HammingComputer series of classes compares a single code of
- * size 4 to 32 to incoming codes. They are intended for use as a
- * template class where it would be inefficient to switch on the code
- * size in the inner loop. Hopefully the compiler will inline the
- * hamming() functions and put the a0, a1, ... in registers.
- ******************************************************************/
-
-struct HammingComputer4 {
-    uint32_t a0;
-
-    HammingComputer4() {}
-
-    HammingComputer4(const uint8_t* a, int code_size) {
-        set(a, code_size);
-    }
-
-    void set(const uint8_t* a, int code_size) {
-        assert(code_size == 4);
-        a0 = *(uint32_t*)a;
-    }
-
-    inline int hamming(const uint8_t* b) const {
-        return popcount64(*(uint32_t*)b ^ a0);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 4;
-    }
-};
-
-struct HammingComputer8 {
-    uint64_t a0;
-
-    HammingComputer8() {}
-
-    HammingComputer8(const uint8_t* a, int code_size) {
-        set(a, code_size);
-    }
-
-    void set(const uint8_t* a, int code_size) {
-        assert(code_size == 8);
-        a0 = *(uint64_t*)a;
-    }
-
-    inline int hamming(const uint8_t* b) const {
-        return popcount64(*(uint64_t*)b ^ a0);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 8;
-    }
-};
-
-struct HammingComputer16 {
-    uint64_t a0, a1;
-
-    HammingComputer16() {}
-
-    HammingComputer16(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
-    }
-
-    void set(const uint8_t* a8, int code_size) {
-        assert(code_size == 16);
-        const uint64_t* a = (uint64_t*)a8;
-        a0 = a[0];
-        a1 = a[1];
-    }
-
-    inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = (uint64_t*)b8;
-        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 16;
-    }
-};
-
-// when applied to an array, 1/2 of the 64-bit accesses are unaligned.
-// This incurs a penalty of ~10% wrt. fully aligned accesses.
-struct HammingComputer20 {
-    uint64_t a0, a1;
-    uint32_t a2;
-
-    HammingComputer20() {}
-
-    HammingComputer20(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
-    }
-
-    void set(const uint8_t* a8, int code_size) {
-        assert(code_size == 20);
-        const uint64_t* a = (uint64_t*)a8;
-        a0 = a[0];
-        a1 = a[1];
-        a2 = a[2];
-    }
-
-    inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = (uint64_t*)b8;
-        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
-                popcount64(*(uint32_t*)(b + 2) ^ a2);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 20;
-    }
-};
-
-struct HammingComputer32 {
-    uint64_t a0, a1, a2, a3;
-
-    HammingComputer32() {}
-
-    HammingComputer32(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
-    }
-
-    void set(const uint8_t* a8, int code_size) {
-        assert(code_size == 32);
-        const uint64_t* a = (uint64_t*)a8;
-        a0 = a[0];
-        a1 = a[1];
-        a2 = a[2];
-        a3 = a[3];
-    }
-
-    inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = (uint64_t*)b8;
-        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
-                popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 32;
-    }
-};
-
 struct HammingComputer64 {
     uint64_t a0, a1, a2, a3, a4, a5, a6, a7;
     const uint64_t* a;
@@ -228,9 +33,9 @@ struct HammingComputer64 {
         set(a8, code_size);
     }
 
-    void set(const uint8_t* a8, int code_size) {
+    void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 64);
-        a = (uint64_t*)a8;
+        a = reinterpret_cast<const uint64_t*>(a8);
         a0 = a[0];
         a1 = a[1];
         a2 = a[2];
@@ -242,7 +47,7 @@ struct HammingComputer64 {
     }
 
     inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = (uint64_t*)b8;
+        const uint64_t* b = reinterpret_cast<const uint64_t*>(b8);
 #ifdef __AVX512VPOPCNTDQ__
         __m512i vxor =
                 _mm512_xor_si512(_mm512_loadu_si512(a), _mm512_loadu_si512(b));
@@ -269,8 +74,8 @@ struct HammingComputerDefault {
 
     HammingComputerDefault() {}
 
-    HammingComputerDefault(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
+    HammingComputerDefault(const uint8_t* a8_in, int code_size) {
+        set(a8_in, code_size);
     }
 
     void set(const uint8_t* a8_2, int code_size) {
@@ -298,73 +103,8 @@ struct HammingComputerDefault {
         }
         i *= 8;
 #endif
-        int len = quotient8 - i;
-        switch (len & 7) {
-            default:
-                while (len > 7) {
-                    len -= 8;
-                    accu += popcount64(a64[i] ^ b64[i]);
-                    i++;
-                    [[fallthrough]];
-                    case 7:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 6:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 5:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 4:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 3:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 2:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 1:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                }
-        }
-        if (remainder8) {
-            const uint8_t* a = a8 + 8 * quotient8;
-            const uint8_t* b = b8 + 8 * quotient8;
-            switch (remainder8) {
-                case 7:
-                    accu += hamdis_tab_ham_bytes[a[6] ^ b[6]];
-                    [[fallthrough]];
-                case 6:
-                    accu += hamdis_tab_ham_bytes[a[5] ^ b[5]];
-                    [[fallthrough]];
-                case 5:
-                    accu += hamdis_tab_ham_bytes[a[4] ^ b[4]];
-                    [[fallthrough]];
-                case 4:
-                    accu += hamdis_tab_ham_bytes[a[3] ^ b[3]];
-                    [[fallthrough]];
-                case 3:
-                    accu += hamdis_tab_ham_bytes[a[2] ^ b[2]];
-                    [[fallthrough]];
-                case 2:
-                    accu += hamdis_tab_ham_bytes[a[1] ^ b[1]];
-                    [[fallthrough]];
-                case 1:
-                    accu += hamdis_tab_ham_bytes[a[0] ^ b[0]];
-                    [[fallthrough]];
-                default:
-                    break;
-            }
-        }
-
+        accu += hamming_popcount_tail(
+                a64, b64, i, quotient8, a8, b8, remainder8);
         return accu;
     }
 
@@ -373,42 +113,12 @@ struct HammingComputerDefault {
     }
 };
 
-/***************************************************************************
- * generalized Hamming = number of bytes that are different between
- * two codes.
- ***************************************************************************/
-
-inline int generalized_hamming_64(uint64_t a) {
-    a |= a >> 1;
-    a |= a >> 2;
-    a |= a >> 4;
-    a &= 0x0101010101010101UL;
-    return popcount64(a);
-}
-
-struct GenHammingComputer8 {
-    uint64_t a0;
-
-    GenHammingComputer8(const uint8_t* a, int code_size) {
-        assert(code_size == 8);
-        a0 = *(uint64_t*)a;
-    }
-
-    inline int hamming(const uint8_t* b) const {
-        return generalized_hamming_64(*(uint64_t*)b ^ a0);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 8;
-    }
-};
-
 // I'm not sure whether this version is faster of slower, tbh
 // todo: test on different CPUs
 struct GenHammingComputer16 {
     __m128i a;
 
-    GenHammingComputer16(const uint8_t* a8, int code_size) {
+    GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 16);
         a = _mm_loadu_si128((const __m128i_u*)a8);
     }
@@ -428,7 +138,7 @@ struct GenHammingComputer16 {
 struct GenHammingComputer32 {
     __m256i a;
 
-    GenHammingComputer32(const uint8_t* a8, int code_size) {
+    GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 32);
         a = _mm256_loadu_si256((const __m256i_u*)a8);
     }
diff --git a/faiss/utils/hamming_distance/common.h b/faiss/utils/hamming_distance/common.h
index 154e14e2cc..b50feca2e0 100644
--- a/faiss/utils/hamming_distance/common.h
+++ b/faiss/utils/hamming_distance/common.h
@@ -8,6 +8,8 @@
 #ifndef FAISS_hamming_common_h
 #define FAISS_hamming_common_h
 
+#include <cassert>
+#include <cstddef>
 #include <cstdint>
 
 #include <faiss/impl/platform_macros.h>
@@ -43,6 +45,407 @@ inline constexpr uint8_t hamdis_tab_ham_bytes[256] = {
         4, 5, 5, 6, 5, 6, 6, 7, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
         4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
 
+/***************************************************************************
+ * Universal code: identical across all architectures
+ ***************************************************************************/
+
+/* Elementary Hamming distance computation: unoptimized  */
+template <size_t nbits, typename T>
+inline T hamming(const uint8_t* bs1, const uint8_t* bs2) {
+    const size_t nbytes = nbits / 8;
+    size_t i;
+    T h = 0;
+    for (i = 0; i < nbytes; i++) {
+        h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]];
+    }
+    return h;
+}
+
+/***************************************************************************
+ * generalized Hamming = number of bytes that are different between
+ * two codes.
+ ***************************************************************************/
+
+inline int generalized_hamming_64(uint64_t a) {
+    a |= a >> 1;
+    a |= a >> 2;
+    a |= a >> 4;
+    a &= 0x0101010101010101UL;
+    return popcount64(a);
+}
+
+/******************************************************************
+ * The HammingComputer series of classes compares a single code of
+ * size 4 to 32 to incoming codes. They are intended for use as a
+ * template class where it would be inefficient to switch on the code
+ * size in the inner loop. Hopefully the compiler will inline the
+ * hamming() functions and put the a0, a1, ... in registers.
+ ******************************************************************/
+
+struct HammingComputer4 {
+    uint32_t a0;
+
+    HammingComputer4() {}
+
+    HammingComputer4(const uint8_t* a, int code_size) {
+        set(a, code_size);
+    }
+
+    void set(const uint8_t* a, FAISS_MAYBE_UNUSED int code_size) {
+        assert(code_size == 4);
+        const uint32_t* a32 = reinterpret_cast<const uint32_t*>(a);
+        a0 = *a32;
+    }
+
+    inline int hamming(const uint8_t* b) const {
+        const uint32_t* b32 = reinterpret_cast<const uint32_t*>(b);
+        return popcount64(*b32 ^ a0);
+    }
+
+    inline static constexpr int get_code_size() {
+        return 4;
+    }
+};
+
+struct HammingComputer8 {
+    uint64_t a0;
+
+    HammingComputer8() {}
+
+    HammingComputer8(const uint8_t* a, int code_size) {
+        set(a, code_size);
+    }
+
+    void set(const uint8_t* a, FAISS_MAYBE_UNUSED int code_size) {
+        assert(code_size == 8);
+        const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a);
+        a0 = *a64;
+    }
+
+    inline int hamming(const uint8_t* b) const {
+        const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b);
+        return popcount64(*b64 ^ a0);
+    }
+
+    inline static constexpr int get_code_size() {
+        return 8;
+    }
+};
+
+/* Duff's device + byte remainder tail for HammingComputerDefault.
+ * Processes uint64 words starting at index i_start using popcount,
+ * then handles any remaining bytes via lookup table. */
+inline int hamming_popcount_tail(
+        const uint64_t* a64,
+        const uint64_t* b64,
+        int i_start,
+        int quotient8,
+        const uint8_t* a8,
+        const uint8_t* b8,
+        int remainder8) {
+    int accu = 0;
+    int i = i_start;
+    int len = quotient8 - i_start;
+    switch (len & 7) {
+        default:
+            while (len > 7) {
+                len -= 8;
+                accu += popcount64(a64[i] ^ b64[i]);
+                i++;
+                [[fallthrough]];
+                case 7:
+                    accu += popcount64(a64[i] ^ b64[i]);
+                    i++;
+                    [[fallthrough]];
+                case 6:
+                    accu += popcount64(a64[i] ^ b64[i]);
+                    i++;
+                    [[fallthrough]];
+                case 5:
+                    accu += popcount64(a64[i] ^ b64[i]);
+                    i++;
+                    [[fallthrough]];
+                case 4:
+                    accu += popcount64(a64[i] ^ b64[i]);
+                    i++;
+                    [[fallthrough]];
+                case 3:
+                    accu += popcount64(a64[i] ^ b64[i]);
+                    i++;
+                    [[fallthrough]];
+                case 2:
+                    accu += popcount64(a64[i] ^ b64[i]);
+                    i++;
+                    [[fallthrough]];
+                case 1:
+                    accu += popcount64(a64[i] ^ b64[i]);
+                    i++;
+            }
+    }
+    if (remainder8) {
+        const uint8_t* a = a8 + 8 * quotient8;
+        const uint8_t* b = b8 + 8 * quotient8;
+        switch (remainder8) {
+            case 7:
+                accu += hamdis_tab_ham_bytes[a[6] ^ b[6]];
+                [[fallthrough]];
+            case 6:
+                accu += hamdis_tab_ham_bytes[a[5] ^ b[5]];
+                [[fallthrough]];
+            case 5:
+                accu += hamdis_tab_ham_bytes[a[4] ^ b[4]];
+                [[fallthrough]];
+            case 4:
+                accu += hamdis_tab_ham_bytes[a[3] ^ b[3]];
+                [[fallthrough]];
+            case 3:
+                accu += hamdis_tab_ham_bytes[a[2] ^ b[2]];
+                [[fallthrough]];
+            case 2:
+                accu += hamdis_tab_ham_bytes[a[1] ^ b[1]];
+                [[fallthrough]];
+            case 1:
+                accu += hamdis_tab_ham_bytes[a[0] ^ b[0]];
+                break;
+            default:
+                break;
+        }
+    }
+    return accu;
+}
+
+/***************************************************************************
+ * Scalar code shared by generic, AVX2, and AVX512 backends.
+ * NEON provides its own optimized versions of these.
+ ***************************************************************************/
+
+#ifndef __aarch64__
+
+/* Hamming distances for multiples of 64 bits */
+template <size_t nbits>
+inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) {
+    const size_t nwords = nbits / 64;
+    size_t i;
+    hamdis_t h = 0;
+    for (i = 0; i < nwords; i++) {
+        h += popcount64(bs1[i] ^ bs2[i]);
+    }
+    return h;
+}
+
+/* specialized (optimized) functions */
+template <>
+inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) {
+    return popcount64(pa[0] ^ pb[0]);
+}
+
+template <>
+inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) {
+    return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]);
+}
+
+template <>
+inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) {
+    return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) +
+            popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]);
+}
+
+/* Hamming distances for multiple of 64 bits */
+inline hamdis_t hamming(
+        const uint64_t* bs1,
+        const uint64_t* bs2,
+        size_t nwords) {
+    hamdis_t h = 0;
+    for (size_t i = 0; i < nwords; i++) {
+        h += popcount64(bs1[i] ^ bs2[i]);
+    }
+    return h;
+}
+
+struct HammingComputer16 {
+    uint64_t a0, a1;
+
+    HammingComputer16() {}
+
+    HammingComputer16(const uint8_t* a8, int code_size) {
+        set(a8, code_size);
+    }
+
+    void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
+        assert(code_size == 16);
+        const uint64_t* a = reinterpret_cast<const uint64_t*>(a8);
+        a0 = a[0];
+        a1 = a[1];
+    }
+
+    inline int hamming(const uint8_t* b8) const {
+        const uint64_t* b = reinterpret_cast<const uint64_t*>(b8);
+        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1);
+    }
+
+    inline static constexpr int get_code_size() {
+        return 16;
+    }
+};
+
+// when applied to an array, 1/2 of the 64-bit accesses are unaligned.
+// This incurs a penalty of ~10% wrt. fully aligned accesses.
+struct HammingComputer20 {
+    uint64_t a0, a1;
+    uint32_t a2;
+
+    HammingComputer20() {}
+
+    HammingComputer20(const uint8_t* a8, int code_size) {
+        set(a8, code_size);
+    }
+
+    void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
+        assert(code_size == 20);
+        const uint64_t* a = reinterpret_cast<const uint64_t*>(a8);
+        const uint32_t* a32 = reinterpret_cast<const uint32_t*>(a8);
+        a0 = a[0];
+        a1 = a[1];
+        // can't read a[2] since it is uint64_t, not uint32_t
+        // results in AddressSanitizer failure reading past end of array
+        a2 = a32[4];
+    }
+
+    inline int hamming(const uint8_t* b8) const {
+        const uint64_t* b = reinterpret_cast<const uint64_t*>(b8);
+        const uint32_t* b32_tail = reinterpret_cast<const uint32_t*>(b + 2);
+        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
+                popcount64(*b32_tail ^ a2);
+    }
+
+    inline static constexpr int get_code_size() {
+        return 20;
+    }
+};
+
+struct HammingComputer32 {
+    uint64_t a0, a1, a2, a3;
+
+    HammingComputer32() {}
+
+    HammingComputer32(const uint8_t* a8, int code_size) {
+        set(a8, code_size);
+    }
+
+    void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
+        assert(code_size == 32);
+        const uint64_t* a = reinterpret_cast<const uint64_t*>(a8);
+        a0 = a[0];
+        a1 = a[1];
+        a2 = a[2];
+        a3 = a[3];
+    }
+
+    inline int hamming(const uint8_t* b8) const {
+        const uint64_t* b = reinterpret_cast<const uint64_t*>(b8);
+        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
+                popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3);
+    }
+
+    inline static constexpr int get_code_size() {
+        return 32;
+    }
+};
+
+struct GenHammingComputer8 {
+    uint64_t a0;
+
+    GenHammingComputer8(const uint8_t* a, FAISS_MAYBE_UNUSED int code_size) {
+        assert(code_size == 8);
+        const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a);
+        a0 = *a64;
+    }
+
+    inline int hamming(const uint8_t* b) const {
+        const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b);
+        return generalized_hamming_64(*b64 ^ a0);
+    }
+
+    inline static constexpr int get_code_size() {
+        return 8;
+    }
+};
+
+#endif // !__aarch64__
+
+/***************************************************************************
+ * Scalar code shared by generic and AVX2 backends only.
+ * AVX512 and NEON provide their own optimized versions.
+ ***************************************************************************/
+
+#if !defined(__aarch64__) && !defined(__AVX512F__)
+
+struct HammingComputer64 {
+    uint64_t a0, a1, a2, a3, a4, a5, a6, a7;
+
+    HammingComputer64() {}
+
+    HammingComputer64(const uint8_t* a8, int code_size) {
+        set(a8, code_size);
+    }
+
+    void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
+        assert(code_size == 64);
+        const uint64_t* a = reinterpret_cast<const uint64_t*>(a8);
+        a0 = a[0];
+        a1 = a[1];
+        a2 = a[2];
+        a3 = a[3];
+        a4 = a[4];
+        a5 = a[5];
+        a6 = a[6];
+        a7 = a[7];
+    }
+
+    inline int hamming(const uint8_t* b8) const {
+        const uint64_t* b = reinterpret_cast<const uint64_t*>(b8);
+        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
+                popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) +
+                popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) +
+                popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7);
+    }
+
+    inline static constexpr int get_code_size() {
+        return 64;
+    }
+};
+
+struct HammingComputerDefault {
+    const uint8_t* a8;
+    int quotient8;
+    int remainder8;
+
+    HammingComputerDefault() {}
+
+    HammingComputerDefault(const uint8_t* a8_in, int code_size) {
+        set(a8_in, code_size);
+    }
+
+    void set(const uint8_t* a8_in, int code_size) {
+        this->a8 = a8_in;
+        quotient8 = code_size / 8;
+        remainder8 = code_size % 8;
+    }
+
+    int hamming(const uint8_t* b8) const {
+        const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
+        const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
+        return hamming_popcount_tail(
+                a64, b64, 0, quotient8, a8, b8, remainder8);
+    }
+
+    inline int get_code_size() const {
+        return quotient8 * 8 + remainder8;
+    }
+};
+
+#endif // !__aarch64__ && !__AVX512F__
+
 } // namespace faiss
 
 #endif
diff --git a/faiss/utils/hamming_distance/generic-inl.h b/faiss/utils/hamming_distance/generic-inl.h
index 7f229f4f11..59d942dea6 100644
--- a/faiss/utils/hamming_distance/generic-inl.h
+++ b/faiss/utils/hamming_distance/generic-inl.h
@@ -9,383 +9,19 @@
 #define HAMMING_GENERIC_INL_H
 
 // A general-purpose version of hamming distance computation.
+// Most code is now in common.h. This file only contains the
+// GenHammingComputer classes that use scalar generalized_hamming_64.
 
 #include <cassert>
-#include <cstddef>
 #include <cstdint>
 
 #include <faiss/impl/platform_macros.h>
 
 namespace faiss {
 
-/* Elementary Hamming distance computation: unoptimized  */
-template <size_t nbits, typename T>
-inline T hamming(const uint8_t* bs1, const uint8_t* bs2) {
-    const size_t nbytes = nbits / 8;
-    size_t i;
-    T h = 0;
-    for (i = 0; i < nbytes; i++) {
-        h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]];
-    }
-    return h;
-}
-
-/* Hamming distances for multiples of 64 bits */
-template <size_t nbits>
-inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) {
-    const size_t nwords = nbits / 64;
-    size_t i;
-    hamdis_t h = 0;
-    for (i = 0; i < nwords; i++) {
-        h += popcount64(bs1[i] ^ bs2[i]);
-    }
-    return h;
-}
-
-/* specialized (optimized) functions */
-template <>
-inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) {
-    return popcount64(pa[0] ^ pb[0]);
-}
-
-template <>
-inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) {
-    return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]);
-}
-
-template <>
-inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) {
-    return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) +
-            popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]);
-}
-
-/* Hamming distances for multiple of 64 bits */
-inline hamdis_t hamming(
-        const uint64_t* bs1,
-        const uint64_t* bs2,
-        size_t nwords) {
-    hamdis_t h = 0;
-    for (size_t i = 0; i < nwords; i++) {
-        h += popcount64(bs1[i] ^ bs2[i]);
-    }
-    return h;
-}
-
-/******************************************************************
- * The HammingComputer series of classes compares a single code of
- * size 4 to 32 to incoming codes. They are intended for use as a
- * template class where it would be inefficient to switch on the code
- * size in the inner loop. Hopefully the compiler will inline the
- * hamming() functions and put the a0, a1, ... in registers.
- ******************************************************************/
-
-struct HammingComputer4 {
-    uint32_t a0;
-
-    HammingComputer4() {}
-
-    HammingComputer4(const uint8_t* a, int code_size) {
-        set(a, code_size);
-    }
-
-    void set(const uint8_t* a, [[maybe_unused]] int code_size) {
-        assert(code_size == 4);
-        const uint32_t* a32 = reinterpret_cast<const uint32_t*>(a);
-        a0 = *a32;
-    }
-
-    inline int hamming(const uint8_t* b) const {
-        const uint32_t* b32 = reinterpret_cast<const uint32_t*>(b);
-        return popcount64(*b32 ^ a0);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 4;
-    }
-};
-
-struct HammingComputer8 {
-    uint64_t a0;
-
-    HammingComputer8() {}
-
-    HammingComputer8(const uint8_t* a, int code_size) {
-        set(a, code_size);
-    }
-
-    void set(const uint8_t* a, [[maybe_unused]] int code_size) {
-        assert(code_size == 8);
-        const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a);
-        a0 = *a64;
-    }
-
-    inline int hamming(const uint8_t* b) const {
-        const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b);
-        return popcount64(*b64 ^ a0);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 8;
-    }
-};
-
-struct HammingComputer16 {
-    uint64_t a0, a1;
-
-    HammingComputer16() {}
-
-    HammingComputer16(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
-    }
-
-    void set(const uint8_t* a8, [[maybe_unused]] int code_size) {
-        assert(code_size == 16);
-        const uint64_t* a = reinterpret_cast<const uint64_t*>(a8);
-        a0 = a[0];
-        a1 = a[1];
-    }
-
-    inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = reinterpret_cast<const uint64_t*>(b8);
-        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 16;
-    }
-};
-
-// when applied to an array, 1/2 of the 64-bit accesses are unaligned.
-// This incurs a penalty of ~10% wrt. fully aligned accesses.
-struct HammingComputer20 {
-    uint64_t a0, a1;
-    uint32_t a2;
-
-    HammingComputer20() {}
-
-    HammingComputer20(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
-    }
-
-    void set(const uint8_t* a8, [[maybe_unused]] int code_size) {
-        assert(code_size == 20);
-        const uint64_t* a = reinterpret_cast<const uint64_t*>(a8);
-        const uint32_t* a32 = reinterpret_cast<const uint32_t*>(a8);
-        a0 = a[0];
-        a1 = a[1];
-        // can't read a[2] since it is uint64_t, not uint32_t
-        // results in AddressSanitizer failure reading past end of array
-        a2 = a32[4];
-    }
-
-    inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = reinterpret_cast<const uint64_t*>(b8);
-        const uint32_t* b32_tail = reinterpret_cast<const uint32_t*>(b + 2);
-        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
-                popcount64(*b32_tail ^ a2);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 20;
-    }
-};
-
-struct HammingComputer32 {
-    uint64_t a0, a1, a2, a3;
-
-    HammingComputer32() {}
-
-    HammingComputer32(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
-    }
-
-    void set(const uint8_t* a8, [[maybe_unused]] int code_size) {
-        assert(code_size == 32);
-        const uint64_t* a = reinterpret_cast<const uint64_t*>(a8);
-        a0 = a[0];
-        a1 = a[1];
-        a2 = a[2];
-        a3 = a[3];
-    }
-
-    inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = reinterpret_cast<const uint64_t*>(b8);
-        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
-                popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 32;
-    }
-};
-
-struct HammingComputer64 {
-    uint64_t a0, a1, a2, a3, a4, a5, a6, a7;
-
-    HammingComputer64() {}
-
-    HammingComputer64(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
-    }
-
-    void set(const uint8_t* a8, [[maybe_unused]] int code_size) {
-        assert(code_size == 64);
-        const uint64_t* a = reinterpret_cast<const uint64_t*>(a8);
-        a0 = a[0];
-        a1 = a[1];
-        a2 = a[2];
-        a3 = a[3];
-        a4 = a[4];
-        a5 = a[5];
-        a6 = a[6];
-        a7 = a[7];
-    }
-
-    inline int hamming(const uint8_t* b8) const {
-        const uint64_t* b = reinterpret_cast<const uint64_t*>(b8);
-        return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) +
-                popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) +
-                popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) +
-                popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 64;
-    }
-};
-
-struct HammingComputerDefault {
-    const uint8_t* a8;
-    int quotient8;
-    int remainder8;
-
-    HammingComputerDefault() {}
-
-    HammingComputerDefault(const uint8_t* a8_in, int code_size) {
-        set(a8_in, code_size);
-    }
-
-    void set(const uint8_t* a8_in, int code_size) {
-        this->a8 = a8_in;
-        quotient8 = code_size / 8;
-        remainder8 = code_size % 8;
-    }
-
-    int hamming(const uint8_t* b8) const {
-        int accu = 0;
-
-        const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
-        const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
-        int i = 0, len = quotient8;
-        switch (len & 7) {
-            default:
-                while (len > 7) {
-                    len -= 8;
-                    accu += popcount64(a64[i] ^ b64[i]);
-                    i++;
-                    [[fallthrough]];
-                    case 7:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 6:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 5:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 4:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 3:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 2:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 1:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                }
-        }
-        if (remainder8) {
-            const uint8_t* a = a8 + 8 * quotient8;
-            const uint8_t* b = b8 + 8 * quotient8;
-            switch (remainder8) {
-                case 7:
-                    accu += hamdis_tab_ham_bytes[a[6] ^ b[6]];
-                    [[fallthrough]];
-                case 6:
-                    accu += hamdis_tab_ham_bytes[a[5] ^ b[5]];
-                    [[fallthrough]];
-                case 5:
-                    accu += hamdis_tab_ham_bytes[a[4] ^ b[4]];
-                    [[fallthrough]];
-                case 4:
-                    accu += hamdis_tab_ham_bytes[a[3] ^ b[3]];
-                    [[fallthrough]];
-                case 3:
-                    accu += hamdis_tab_ham_bytes[a[2] ^ b[2]];
-                    [[fallthrough]];
-                case 2:
-                    accu += hamdis_tab_ham_bytes[a[1] ^ b[1]];
-                    [[fallthrough]];
-                case 1:
-                    accu += hamdis_tab_ham_bytes[a[0] ^ b[0]];
-                    break;
-                default:
-                    break;
-            }
-        }
-
-        return accu;
-    }
-
-    inline int get_code_size() const {
-        return quotient8 * 8 + remainder8;
-    }
-};
-
-/***************************************************************************
- * generalized Hamming = number of bytes that are different between
- * two codes.
- ***************************************************************************/
-
-inline int generalized_hamming_64(uint64_t a) {
-    a |= a >> 1;
-    a |= a >> 2;
-    a |= a >> 4;
-    a &= 0x0101010101010101UL;
-    return popcount64(a);
-}
-
-struct GenHammingComputer8 {
-    uint64_t a0;
-
-    GenHammingComputer8(const uint8_t* a, [[maybe_unused]] int code_size) {
-        assert(code_size == 8);
-        const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a);
-        a0 = *a64;
-    }
-
-    inline int hamming(const uint8_t* b) const {
-        const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b);
-        return generalized_hamming_64(*b64 ^ a0);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 8;
-    }
-};
-
 struct GenHammingComputer16 {
     uint64_t a0, a1;
-    GenHammingComputer16(const uint8_t* a8, [[maybe_unused]] int code_size) {
+    GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 16);
         const uint64_t* a = reinterpret_cast<const uint64_t*>(a8);
         a0 = a[0];
@@ -406,7 +42,7 @@ struct GenHammingComputer16 {
 struct GenHammingComputer32 {
     uint64_t a0, a1, a2, a3;
 
-    GenHammingComputer32(const uint8_t* a8, [[maybe_unused]] int code_size) {
+    GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 32);
         const uint64_t* a = reinterpret_cast<const uint64_t*>(a8);
         a0 = a[0];
diff --git a/faiss/utils/hamming_distance/neon-inl.h b/faiss/utils/hamming_distance/neon-inl.h
index d00a402857..d29f40e0b5 100644
--- a/faiss/utils/hamming_distance/neon-inl.h
+++ b/faiss/utils/hamming_distance/neon-inl.h
@@ -8,8 +8,11 @@
 #ifndef HAMMING_NEON_INL_H
 #define HAMMING_NEON_INL_H
 
-// a specialized version of hamming is needed here, because both
+// A specialized version of hamming is needed here, because both
 // gcc, clang and msvc seem to generate suboptimal code sometimes.
+//
+// Universal code (HammingComputer4, HammingComputer8, generalized_hamming_64,
+// byte-level hamming template, hamming_popcount_tail) comes from common.h.
 
 #ifdef __aarch64__
 
@@ -25,18 +28,6 @@
 
 namespace faiss {
 
-/* Elementary Hamming distance computation: unoptimized  */
-template <size_t nbits, typename T>
-inline T hamming(const uint8_t* bs1, const uint8_t* bs2) {
-    const size_t nbytes = nbits / 8;
-    size_t i;
-    T h = 0;
-    for (i = 0; i < nbytes; i++) {
-        h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]];
-    }
-    return h;
-}
-
 /* Hamming distances for multiples of 64 bits */
 template <size_t nbits>
 inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb) {
@@ -125,59 +116,10 @@ inline hamdis_t hamming(const uint64_t* pa, const uint64_t* pb, size_t nwords) {
 }
 
 /******************************************************************
- * The HammingComputer series of classes compares a single code of
- * size 4 to 32 to incoming codes. They are intended for use as a
- * template class where it would be inefficient to switch on the code
- * size in the inner loop. Hopefully the compiler will inline the
- * hamming() functions and put the a0, a1, ... in registers.
+ * NEON-optimized HammingComputer classes for sizes 16-64.
+ * Sizes 4 and 8 use the scalar versions from common.h.
  ******************************************************************/
 
-struct HammingComputer4 {
-    uint32_t a0;
-
-    HammingComputer4() {}
-
-    HammingComputer4(const uint8_t* a, int code_size) {
-        set(a, code_size);
-    }
-
-    void set(const uint8_t* a, int code_size) {
-        assert(code_size == 4);
-        a0 = *(uint32_t*)a;
-    }
-
-    inline int hamming(const uint8_t* b) const {
-        return popcount64(*(uint32_t*)b ^ a0);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 4;
-    }
-};
-
-struct HammingComputer8 {
-    uint64_t a0;
-
-    HammingComputer8() {}
-
-    HammingComputer8(const uint8_t* a, int code_size) {
-        set(a, code_size);
-    }
-
-    void set(const uint8_t* a, int code_size) {
-        assert(code_size == 8);
-        a0 = *(uint64_t*)a;
-    }
-
-    inline int hamming(const uint8_t* b) const {
-        return popcount64(*(uint64_t*)b ^ a0);
-    }
-
-    inline static constexpr int get_code_size() {
-        return 8;
-    }
-};
-
 struct HammingComputer16 {
     uint8x16_t a0;
 
@@ -187,7 +129,7 @@ struct HammingComputer16 {
         set(a8, code_size);
     }
 
-    void set(const uint8_t* a8, int code_size) {
+    void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 16);
         a0 = vld1q_u8(a8);
     }
@@ -218,7 +160,7 @@ struct HammingComputer20 {
         set(a8, code_size);
     }
 
-    void set(const uint8_t* a8, int code_size) {
+    void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 20);
 
         a0 = vld1q_u8(a8);
@@ -253,7 +195,7 @@ struct HammingComputer32 {
         set(a8, code_size);
     }
 
-    void set(const uint8_t* a8, int code_size) {
+    void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 32);
         a0 = vld1q_u8(a8);
         a1 = vld1q_u8(a8 + 16);
@@ -286,7 +228,7 @@ struct HammingComputer64 {
         set(a8, code_size);
     }
 
-    void set(const uint8_t* a8, int code_size) {
+    void set(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 64);
         hc0.set(a8, 32);
         hc1.set(a8 + 32, 32);
@@ -308,12 +250,12 @@ struct HammingComputerDefault {
 
     HammingComputerDefault() {}
 
-    HammingComputerDefault(const uint8_t* a8, int code_size) {
-        set(a8, code_size);
+    HammingComputerDefault(const uint8_t* a8_in, int code_size) {
+        set(a8_in, code_size);
     }
 
-    void set(const uint8_t* a8, int code_size) {
-        this->a8 = a8;
+    void set(const uint8_t* a8_in, int code_size) {
+        this->a8 = a8_in;
         quotient8 = code_size / 8;
         remainder8 = code_size % 8;
     }
@@ -323,80 +265,15 @@ struct HammingComputerDefault {
 
         const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
         const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
-        int i = 0, len = quotient8;
+        int i = 0;
 
         int len256 = (quotient8 / 4) * 4;
         for (; i < len256; i += 4) {
             accu += ::faiss::hamming<256>(a64 + i, b64 + i);
-            len -= 4;
-        }
-
-        switch (len & 7) {
-            default:
-                while (len > 7) {
-                    len -= 8;
-                    accu += popcount64(a64[i] ^ b64[i]);
-                    i++;
-                    [[fallthrough]];
-                    case 7:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 6:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 5:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 4:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 3:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 2:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                        [[fallthrough]];
-                    case 1:
-                        accu += popcount64(a64[i] ^ b64[i]);
-                        i++;
-                }
-        }
-        if (remainder8) {
-            const uint8_t* a = a8 + 8 * quotient8;
-            const uint8_t* b = b8 + 8 * quotient8;
-            switch (remainder8) {
-                case 7:
-                    accu += hamdis_tab_ham_bytes[a[6] ^ b[6]];
-                    [[fallthrough]];
-                case 6:
-                    accu += hamdis_tab_ham_bytes[a[5] ^ b[5]];
-                    [[fallthrough]];
-                case 5:
-                    accu += hamdis_tab_ham_bytes[a[4] ^ b[4]];
-                    [[fallthrough]];
-                case 4:
-                    accu += hamdis_tab_ham_bytes[a[3] ^ b[3]];
-                    [[fallthrough]];
-                case 3:
-                    accu += hamdis_tab_ham_bytes[a[2] ^ b[2]];
-                    [[fallthrough]];
-                case 2:
-                    accu += hamdis_tab_ham_bytes[a[1] ^ b[1]];
-                    [[fallthrough]];
-                case 1:
-                    accu += hamdis_tab_ham_bytes[a[0] ^ b[0]];
-                    [[fallthrough]];
-                default:
-                    break;
-            }
         }
 
+        accu += hamming_popcount_tail(
+                a64, b64, i, quotient8, a8, b8, remainder8);
         return accu;
     }
 
@@ -406,22 +283,13 @@ struct HammingComputerDefault {
 };
 
 /***************************************************************************
- * generalized Hamming = number of bytes that are different between
- * two codes.
+ * NEON-optimized generalized Hamming computers.
  ***************************************************************************/
 
-inline int generalized_hamming_64(uint64_t a) {
-    a |= a >> 1;
-    a |= a >> 2;
-    a |= a >> 4;
-    a &= 0x0101010101010101UL;
-    return popcount64(a);
-}
-
 struct GenHammingComputer8 {
     uint8x8_t a0;
 
-    GenHammingComputer8(const uint8_t* a8, int code_size) {
+    GenHammingComputer8(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 8);
         a0 = vld1_u8(a8);
     }
@@ -441,7 +309,7 @@ struct GenHammingComputer8 {
 struct GenHammingComputer16 {
     uint8x16_t a0;
 
-    GenHammingComputer16(const uint8_t* a8, int code_size) {
+    GenHammingComputer16(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size) {
         assert(code_size == 16);
         a0 = vld1q_u8(a8);
     }
@@ -461,7 +329,7 @@ struct GenHammingComputer16 {
 struct GenHammingComputer32 {
     GenHammingComputer16 a0, a1;
 
-    GenHammingComputer32(const uint8_t* a8, int code_size)
+    GenHammingComputer32(const uint8_t* a8, FAISS_MAYBE_UNUSED int code_size)
             : a0(a8, 16), a1(a8 + 16, 16) {
         assert(code_size == 32);
     }
diff --git a/faiss/utils/simd_impl/hamming_avx2.cpp b/faiss/utils/simd_impl/hamming_avx2.cpp
new file mode 100644
index 0000000000..ac0787a094
--- /dev/null
+++ b/faiss/utils/simd_impl/hamming_avx2.cpp
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifdef COMPILE_SIMD_AVX2
+
+#define THE_SIMD_LEVEL SIMDLevel::AVX2
+// NOLINTNEXTLINE(facebook-hte-InlineHeader)
+#include <faiss/utils/simd_impl/hamming_impl.h>
+
+#endif // COMPILE_SIMD_AVX2
diff --git a/faiss/utils/simd_impl/hamming_impl.h b/faiss/utils/simd_impl/hamming_impl.h
new file mode 100644
index 0000000000..35e63d575f
--- /dev/null
+++ b/faiss/utils/simd_impl/hamming_impl.h
@@ -0,0 +1,629 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+// Shared implementation header for Hamming distance dynamic dispatch.
+// Included by per-ISA TUs (hamming_avx2.cpp, hamming_neon.cpp) and by
+// hamming.cpp (for the NONE fallback).
+//
+// THE_SIMD_LEVEL must be defined before including this header.
+//
+// INCLUDE ORDERING IS LOAD-BEARING: hamdis-inl.h must be included before
+// hamming.h so that the ISA-specific struct definitions (selected by
+// __AVX2__ / __aarch64__ / etc.) are established before hamming.h's own
+// #include of hamdis-inl.h becomes a no-op via include guards.
+
+#pragma once
+
+#ifndef THE_SIMD_LEVEL
+#error "Define THE_SIMD_LEVEL before including hamming_impl.h"
+#endif
+
+// ISA-specific struct definitions — MUST come first (see comment above).
+#include <faiss/utils/hamming_distance/hamdis-inl.h>
+
+#include <faiss/utils/hamming.h>
+
+#include <algorithm>
+#include <cstdio>
+#include <limits>
+#include <memory>
+#include <vector>
+
+#include <faiss/impl/AuxIndexStructures.h>
+#include <faiss/impl/FaissAssert.h>
+#include <faiss/impl/IDSelector.h>
+#include <faiss/utils/Heap.h>
+#include <faiss/utils/approx_topk_hamming/approx_topk_hamming.h>
+#include <faiss/utils/utils.h>
+
+namespace faiss {
+
+// Alias for readability in the template specializations below.
+constexpr SIMDLevel hamming_impl_SL = THE_SIMD_LEVEL;
+
+// All implementation templates live in an anonymous namespace for ODR safety.
+// Different TUs compile this header with different HammingComputer struct
+// layouts (generic vs AVX2 vs NEON). The anonymous namespace ensures each
+// TU gets its own copy with internal linkage, preventing the linker from
+// merging incompatible instantiations.
+namespace {
+
+/******************************************************************
+ * Bit-level Hamming distance templates
+ ******************************************************************/
+
+template <size_t nbits>
+void hammings_impl(
+        const uint64_t* __restrict bs1,
+        const uint64_t* __restrict bs2,
+        size_t n1,
+        size_t n2,
+        hamdis_t* __restrict dis) {
+    size_t i, j;
+    const size_t nwords = nbits / 64;
+    for (i = 0; i < n1; i++) {
+        const uint64_t* __restrict bs1_ = bs1 + i * nwords;
+        hamdis_t* __restrict dis_ = dis + i * n2;
+        for (j = 0; j < n2; j++) {
+            dis_[j] = hamming<nbits>(bs1_, bs2 + j * nwords);
+        }
+    }
+}
+
+void hammings_impl_runtime(
+        const uint64_t* __restrict bs1,
+        const uint64_t* __restrict bs2,
+        size_t n1,
+        size_t n2,
+        size_t nbits,
+        hamdis_t* __restrict dis) {
+    size_t i, j;
+    const size_t nwords = nbits / 64;
+    for (i = 0; i < n1; i++) {
+        const uint64_t* __restrict bs1_ = bs1 + i * nwords;
+        hamdis_t* __restrict dis_ = dis + i * n2;
+        for (j = 0; j < n2; j++) {
+            dis_[j] = hamming(bs1_, bs2 + j * nwords, nwords);
+        }
+    }
+}
+
+template <size_t nbits>
+void hamming_count_thres_impl(
+        const uint64_t* __restrict bs1,
+        const uint64_t* __restrict bs2,
+        size_t n1,
+        size_t n2,
+        hamdis_t ht,
+        size_t* __restrict nptr) {
+    const size_t nwords = nbits / 64;
+    size_t i, j, posm = 0;
+    const uint64_t* bs2_ = bs2;
+
+    for (i = 0; i < n1; i++) {
+        bs2 = bs2_;
+        for (j = 0; j < n2; j++) {
+            if (hamming<nbits>(bs1, bs2) <= ht) {
+                posm++;
+            }
+            bs2 += nwords;
+        }
+        bs1 += nwords;
+    }
+    *nptr = posm;
+}
+
+template <size_t nbits>
+void crosshamming_count_thres_impl(
+        const uint64_t* __restrict dbs,
+        size_t n,
+        int ht,
+        size_t* __restrict nptr) {
+    const size_t nwords = nbits / 64;
+    size_t i, j, posm = 0;
+    const uint64_t* bs1 = dbs;
+    for (i = 0; i < n; i++) {
+        const uint64_t* bs2 = bs1 + 2;
+        for (j = i + 1; j < n; j++) {
+            if (hamming<nbits>(bs1, bs2) <= ht) {
+                posm++;
+            }
+            bs2 += nwords;
+        }
+        bs1 += nwords;
+    }
+    *nptr = posm;
+}
+
+template <size_t nbits>
+size_t match_hamming_thres_impl(
+        const uint64_t* __restrict bs1,
+        const uint64_t* __restrict bs2,
+        size_t n1,
+        size_t n2,
+        int ht,
+        int64_t* __restrict idx,
+        hamdis_t* __restrict hams) {
+    const size_t nwords = nbits / 64;
+    size_t i, j, posm = 0;
+    hamdis_t h;
+    const uint64_t* bs2_ = bs2;
+    for (i = 0; i < n1; i++) {
+        bs2 = bs2_;
+        for (j = 0; j < n2; j++) {
+            h = hamming<nbits>(bs1, bs2);
+            if (h <= ht) {
+                *idx = i;
+                idx++;
+                *idx = j;
+                idx++;
+                *hams = h;
+                hams++;
+                posm++;
+            }
+            bs2 += nwords;
+        }
+        bs1 += nwords;
+    }
+    return posm;
+}
+
+/******************************************************************
+ * HammingComputer-based search templates
+ ******************************************************************/
+
+template <class HammingComputer>
+void hammings_knn_hc_impl(
+        int bytes_per_code,
+        int_maxheap_array_t* __restrict ha,
+        const uint8_t* __restrict bs1,
+        const uint8_t* __restrict bs2,
+        size_t n2,
+        bool order = true,
+        bool init_heap = true,
+        ApproxTopK_mode_t approx_topk_mode = ApproxTopK_mode_t::EXACT_TOPK,
+        const faiss::IDSelector* sel = nullptr) {
+    size_t k = ha->k;
+    if (init_heap) {
+        ha->heapify();
+    }
+
+    const size_t block_size = hamming_batch_size;
+    for (size_t j0 = 0; j0 < n2; j0 += block_size) {
+        const size_t j1 = std::min(j0 + block_size, n2);
+#pragma omp parallel for
+        for (int64_t i = 0; i < static_cast<int64_t>(ha->nh); i++) {
+            HammingComputer hc(bs1 + i * bytes_per_code, bytes_per_code);
+
+            const uint8_t* __restrict bs2_ = bs2 + j0 * bytes_per_code;
+            hamdis_t dis;
+            hamdis_t* __restrict bh_val_ = ha->val + i * k;
+            int64_t* __restrict bh_ids_ = ha->ids + i * k;
+
+#define HANDLE_APPROX(NB, BD)                                                \
+    case ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD:               \
+        FAISS_THROW_IF_NOT_FMT(                                              \
+                k <= NB * BD,                                                \
+                "The chosen mode (%d) of approximate top-k supports "        \
+                "up to %d values, but %zd is requested.",                    \
+                (int)(ApproxTopK_mode_t::APPROX_TOPK_BUCKETS_B##NB##_D##BD), \
+                NB * BD,                                                     \
+                k);                                                          \
+        HeapWithBucketsForHamming32<                                         \
+                CMax<hamdis_t, int64_t>,                                     \
+                NB,                                                          \
+                BD,                                                          \
+                HammingComputer>::                                           \
+                addn(j1 - j0, hc, bs2_, k, bh_val_, bh_ids_, sel);           \
+        break;
+
+            switch (approx_topk_mode) {
+                HANDLE_APPROX(8, 3)
+                HANDLE_APPROX(8, 2)
+                HANDLE_APPROX(16, 2)
+                HANDLE_APPROX(32, 2)
+                default: {
+                    for (size_t j = j0; j < j1; j++, bs2_ += bytes_per_code) {
+                        if (sel && !sel->is_member(j)) {
+                            continue;
+                        }
+                        dis = hc.hamming(bs2_);
+                        if (dis < bh_val_[0]) {
+                            faiss::maxheap_replace_top<hamdis_t>(
+                                    k, bh_val_, bh_ids_, dis, j);
+                        }
+                    }
+                } break;
+            }
+        }
+    }
+    if (order) {
+        ha->reorder();
+    }
+}
+
+#undef HANDLE_APPROX
+
+template <class HammingComputer>
+void hammings_knn_mc_impl(
+        int bytes_per_code,
+        const uint8_t* __restrict a,
+        const uint8_t* __restrict b,
+        size_t na,
+        size_t nb,
+        size_t k,
+        int32_t* __restrict distances,
+        int64_t* __restrict labels,
+        const faiss::IDSelector* sel) {
+    const int nBuckets = bytes_per_code * 8 + 1;
+    std::vector<int> all_counters(na * nBuckets, 0);
+    std::unique_ptr<int64_t[]> all_ids_per_dis(new int64_t[na * nBuckets * k]);
+
+    std::vector<HCounterState<HammingComputer>> cs;
+    for (size_t i = 0; i < na; ++i) {
+        cs.push_back(
+                HCounterState<HammingComputer>(
+                        all_counters.data() + i * nBuckets,
+                        all_ids_per_dis.get() + i * nBuckets * k,
+                        a + i * bytes_per_code,
+                        8 * bytes_per_code,
+                        k));
+    }
+
+    const size_t block_size = hamming_batch_size;
+    for (size_t j0 = 0; j0 < nb; j0 += block_size) {
+        const size_t j1 = std::min(j0 + block_size, nb);
+#pragma omp parallel for
+        for (int64_t i = 0; i < static_cast<int64_t>(na); ++i) {
+            for (size_t j = j0; j < j1; ++j) {
+                if (!sel || sel->is_member(j)) {
+                    cs[i].update_counter(b + j * bytes_per_code, j);
+                }
+            }
+        }
+    }
+
+    for (size_t i = 0; i < na; ++i) {
+        HCounterState<HammingComputer>& csi = cs[i];
+
+        size_t nres = 0;
+        for (int b_2 = 0; b_2 < nBuckets && nres < k; b_2++) {
+            for (int l = 0; l < csi.counters[b_2] && nres < k; l++) {
+                labels[i * k + nres] = csi.ids_per_dis[b_2 * k + l];
+                distances[i * k + nres] = b_2;
+                nres++;
+            }
+        }
+        while (nres < k) {
+            labels[i * k + nres] = -1;
+            distances[i * k + nres] = std::numeric_limits<int32_t>::max();
+            ++nres;
+        }
+    }
+}
+
+template <class HammingComputer>
+void hamming_range_search_impl(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        int radius,
+        size_t code_size,
+        RangeSearchResult* res,
+        const faiss::IDSelector* sel) {
+#pragma omp parallel
+    {
+        RangeSearchPartialResult pres(res);
+
+#pragma omp for
+        for (int64_t i = 0; i < static_cast<int64_t>(na); i++) {
+            HammingComputer hc(a + i * code_size, code_size);
+            const uint8_t* yi = b;
+            RangeQueryResult& qres = pres.new_result(i);
+
+            for (size_t j = 0; j < nb; j++) {
+                if (!sel || sel->is_member(j)) {
+                    int dis = hc.hamming(yi);
+                    if (dis < radius) {
+                        qres.add(dis, j);
+                    }
+                }
+                yi += code_size;
+            }
+        }
+        pres.finalize();
+    }
+}
+
+// Consumer structs for dispatch_HammingComputer.
+// These MUST be in the anonymous namespace so that each TU's
+// dispatch_HammingComputer instantiation uses unique consumer types,
+// preventing ODR violations across TUs with different struct layouts.
+
+struct Run_hammings_knn_hc {
+    using T = void;
+    template <class HammingComputer, class... Types>
+    void f(Types... args) {
+        hammings_knn_hc_impl<HammingComputer>(args...);
+    }
+};
+
+struct Run_hammings_knn_mc {
+    using T = void;
+    template <class HammingComputer, class... Types>
+    void f(Types... args) {
+        hammings_knn_mc_impl<HammingComputer>(args...);
+    }
+};
+
+struct Run_hamming_range_search {
+    using T = void;
+    template <class HammingComputer, class... Types>
+    void f(Types... args) {
+        hamming_range_search_impl<HammingComputer>(args...);
+    }
+};
+
+/******************************************************************
+ * Generalized Hamming distances
+ ******************************************************************/
+
+template <class HammingComputer>
+void hamming_dis_inner_loop(
+        const uint8_t* __restrict ca,
+        const uint8_t* __restrict cb,
+        size_t nb,
+        size_t code_size,
+        int k,
+        hamdis_t* __restrict bh_val_,
+        int64_t* __restrict bh_ids_) {
+    HammingComputer hc(ca, code_size);
+
+    for (size_t j = 0; j < nb; j++) {
+        int ndiff = hc.hamming(cb);
+        cb += code_size;
+        if (ndiff < bh_val_[0]) {
+            maxheap_replace_top<hamdis_t>(k, bh_val_, bh_ids_, ndiff, j);
+        }
+    }
+}
+
+void generalized_hammings_knn_hc_impl(
+        int_maxheap_array_t* __restrict ha,
+        const uint8_t* __restrict a,
+        const uint8_t* __restrict b,
+        size_t nb,
+        size_t code_size,
+        int ordered) {
+    int na = ha->nh;
+    int k = ha->k;
+
+    if (ordered) {
+        ha->heapify();
+    }
+
+#pragma omp parallel for
+    for (int i = 0; i < na; i++) {
+        const uint8_t* __restrict ca = a + i * code_size;
+        const uint8_t* __restrict cb = b;
+
+        hamdis_t* __restrict bh_val_ = ha->val + i * k;
+        int64_t* __restrict bh_ids_ = ha->ids + i * k;
+
+        switch (code_size) {
+            case 8:
+                hamming_dis_inner_loop<GenHammingComputer8>(
+                        ca, cb, nb, 8, k, bh_val_, bh_ids_);
+                break;
+            case 16:
+                hamming_dis_inner_loop<GenHammingComputer16>(
+                        ca, cb, nb, 16, k, bh_val_, bh_ids_);
+                break;
+            case 32:
+                hamming_dis_inner_loop<GenHammingComputer32>(
+                        ca, cb, nb, 32, k, bh_val_, bh_ids_);
+                break;
+            default:
+                hamming_dis_inner_loop<GenHammingComputerM8>(
+                        ca, cb, nb, code_size, k, bh_val_, bh_ids_);
+                break;
+        }
+    }
+
+    if (ordered) {
+        ha->reorder();
+    }
+}
+
+} // anonymous namespace
+
+/******************************************************************
+ * Entry point template specializations at THE_SIMD_LEVEL
+ ******************************************************************/
+
+#define C64(x) ((uint64_t*)x)
+
+template <>
+void hammings_knn_hc_dispatch<hamming_impl_SL>(
+        int_maxheap_array_t* ha,
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t nb,
+        size_t ncodes,
+        int ordered,
+        ApproxTopK_mode_t approx_topk_mode,
+        const IDSelector* sel) {
+    Run_hammings_knn_hc r;
+    dispatch_HammingComputer(
+            ncodes,
+            r,
+            ncodes,
+            ha,
+            a,
+            b,
+            nb,
+            ordered,
+            true,
+            approx_topk_mode,
+            sel);
+}
+
+template <>
+void hammings_knn_mc_dispatch<hamming_impl_SL>(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        size_t k,
+        size_t ncodes,
+        int32_t* distances,
+        int64_t* labels,
+        const IDSelector* sel) {
+    Run_hammings_knn_mc r;
+    dispatch_HammingComputer(
+            ncodes, r, ncodes, a, b, na, nb, k, distances, labels, sel);
+}
+
+template <>
+void hamming_range_search_dispatch<hamming_impl_SL>(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        int radius,
+        size_t code_size,
+        RangeSearchResult* result,
+        const IDSelector* sel) {
+    Run_hamming_range_search r;
+    dispatch_HammingComputer(
+            code_size, r, a, b, na, nb, radius, code_size, result, sel);
+}
+
+template <>
+void hammings_dispatch<hamming_impl_SL>(
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t na,
+        size_t nb,
+        size_t ncodes,
+        hamdis_t* dis) {
+    FAISS_THROW_IF_NOT(ncodes % 8 == 0);
+    switch (ncodes) {
+        case 8:
+            hammings_impl<64>(C64(a), C64(b), na, nb, dis);
+            return;
+        case 16:
+            hammings_impl<128>(C64(a), C64(b), na, nb, dis);
+            return;
+        case 32:
+            hammings_impl<256>(C64(a), C64(b), na, nb, dis);
+            return;
+        case 64:
+            hammings_impl<512>(C64(a), C64(b), na, nb, dis);
+            return;
+        default:
+            hammings_impl_runtime(C64(a), C64(b), na, nb, ncodes * 8, dis);
+            return;
+    }
+}
+
+template <>
+void generalized_hammings_knn_hc_dispatch<hamming_impl_SL>(
+        int_maxheap_array_t* ha,
+        const uint8_t* a,
+        const uint8_t* b,
+        size_t nb,
+        size_t code_size,
+        int ordered) {
+    generalized_hammings_knn_hc_impl(ha, a, b, nb, code_size, ordered);
+}
+
+template <>
+void hamming_count_thres_dispatch<hamming_impl_SL>(
+        const uint8_t* bs1,
+        const uint8_t* bs2,
+        size_t n1,
+        size_t n2,
+        hamdis_t ht,
+        size_t ncodes,
+        size_t* nptr) {
+    switch (ncodes) {
+        case 8:
+            hamming_count_thres_impl<64>(C64(bs1), C64(bs2), n1, n2, ht, nptr);
+            return;
+        case 16:
+            hamming_count_thres_impl<128>(C64(bs1), C64(bs2), n1, n2, ht, nptr);
+            return;
+        case 32:
+            hamming_count_thres_impl<256>(C64(bs1), C64(bs2), n1, n2, ht, nptr);
+            return;
+        case 64:
+            hamming_count_thres_impl<512>(C64(bs1), C64(bs2), n1, n2, ht, nptr);
+            return;
+        default:
+            FAISS_THROW_FMT("not implemented for %zu bits", ncodes);
+    }
+}
+
+template <>
+void crosshamming_count_thres_dispatch<hamming_impl_SL>(
+        const uint8_t* dbs,
+        size_t n,
+        hamdis_t ht,
+        size_t ncodes,
+        size_t* nptr) {
+    switch (ncodes) {
+        case 8:
+            crosshamming_count_thres_impl<64>(C64(dbs), n, ht, nptr);
+            return;
+        case 16:
+            crosshamming_count_thres_impl<128>(C64(dbs), n, ht, nptr);
+            return;
+        case 32:
+            crosshamming_count_thres_impl<256>(C64(dbs), n, ht, nptr);
+            return;
+        case 64:
+            crosshamming_count_thres_impl<512>(C64(dbs), n, ht, nptr);
+            return;
+        default:
+            FAISS_THROW_FMT("not implemented for %zu bits", ncodes);
+    }
+}
+
+template <>
+size_t match_hamming_thres_dispatch<hamming_impl_SL>(
+        const uint8_t* bs1,
+        const uint8_t* bs2,
+        size_t n1,
+        size_t n2,
+        hamdis_t ht,
+        size_t ncodes,
+        int64_t* idx,
+        hamdis_t* dis) {
+    switch (ncodes) {
+        case 8:
+            return match_hamming_thres_impl<64>(
+                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
+        case 16:
+            return match_hamming_thres_impl<128>(
+                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
+        case 32:
+            return match_hamming_thres_impl<256>(
+                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
+        case 64:
+            return match_hamming_thres_impl<512>(
+                    C64(bs1), C64(bs2), n1, n2, ht, idx, dis);
+        default:
+            FAISS_THROW_FMT("not implemented for %zu bits", ncodes);
+            return 0;
+    }
+}
+
+#undef C64
+
+} // namespace faiss
diff --git a/faiss/utils/simd_impl/hamming_neon.cpp b/faiss/utils/simd_impl/hamming_neon.cpp
new file mode 100644
index 0000000000..6c72618028
--- /dev/null
+++ b/faiss/utils/simd_impl/hamming_neon.cpp
@@ -0,0 +1,14 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#ifdef COMPILE_SIMD_ARM_NEON
+
+#define THE_SIMD_LEVEL SIMDLevel::ARM_NEON
+// NOLINTNEXTLINE(facebook-hte-InlineHeader)
+#include <faiss/utils/simd_impl/hamming_impl.h>
+
+#endif // COMPILE_SIMD_ARM_NEON