Fix race condition, memory management, debug output, and hashtable lookup in sorting.cpp (facebookresearch#5078)

alibeklfc · meta-codesync[bot] · commit aa3ce376ced6 · 2026-04-10T17:59:30.000-07:00
Summary: Pull Request resolved: facebookresearch#5078

Four fixes in `faiss/utils/sorting.cpp`:

**1. OpenMP directive fix in `fvec_argsort_parallel`**
The initialization loop used `#pragma omp parallel` without the `for` directive. This caused every thread to execute the entire loop independently rather than distributing iterations. With `nt` threads, each `permA[i]` was written by all `nt` threads concurrently — a data race under the C++ memory model (multiple unsynchronized writes to the same non-atomic location), and O(n * nt) wasted work instead of O(n). Fixed by changing to `#pragma omp parallel for`. The loop index is also changed from `size_t` to a signed `int64_t`. In practice, all threads wrote the same value (`permA[i] = i`), so the output was always correct despite the UB. The fix eliminates the undefined behavior and the redundant work.

**2. RAII memory management in `fvec_argsort_parallel`**
Replaced `new size_t[n]` / `delete[] perm2` with `std::vector<size_t>`. The old code had no realistic exception path between allocation and deallocation (all intermediate code is either C functions or non-throwing OpenMP regions), but the manual `new`/`delete` pattern is fragile against future edits that might introduce a throwing path. The `std::vector` provides RAII lifetime management with no behavioral change.

**3. Removed debug `printf` in `fvec_argsort_parallel`**
A leftover `printf("merge %d %d, %d threads\n", ...)` in the parallel merge loop wrote to stdout during normal operation. Removed.

**4. Missing early termination in `hashtable_int64_to_int64_lookup`**
The linear probing loop did not check for empty slots (`tab[slot * 2] == -1`). In an open-addressing hash table with no deletion support, an empty slot is definitive proof that the key was not inserted — the insert function would have placed it there or earlier. Without this check, lookups for absent keys probed every slot in the bucket before the wrap-around termination at `slot == hk_i`. The fix adds the standard empty-slot check, matching the structure of the insert function (`hashtable_int64_to_int64_add`). This is a performance optimization — the old code always returned the correct result (`-1` after a full bucket scan), just slower.

Reviewed By: mnorris11
Differential Revision: D100317917
fbshipit-source-id: aadfe33b1d76c34e04db7fe0c9b7ca53b4a30c71
diff --git a/faiss/utils/sorting.cpp b/faiss/utils/sorting.cpp
@@ -134,9 +134,9 @@ void fvec_argsort(size_t n, const float* vals, size_t* perm) {
 }
 
 void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) {
-    size_t* perm2 = new size_t[n];
+    std::vector<size_t> perm2(n);
     // 2 result tables, during merging, flip between them
-    size_t *permB = perm2, *permA = perm;
+    size_t *permB = perm2.data(), *permA = perm;
 
     int nt = omp_get_max_threads();
     { // prepare correct permutation so that the result ends in perm
@@ -148,8 +148,8 @@ void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) {
         }
     }
 
-#pragma omp parallel
-    for (size_t i = 0; i < n; i++) {
+#pragma omp parallel for
+    for (int64_t i = 0; i < static_cast<int64_t>(n); i++) {
         permA[i] = i;
     }
 
@@ -184,7 +184,6 @@ void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) {
             } else {
                 int t0 = s * sub_nt / sub_nseg1;
                 int t1 = (s + 1) * sub_nt / sub_nseg1;
-                printf("merge %d %d, %d threads\n", s, s + 1, t1 - t0);
                 parallel_merge(
                         permA, permB, segs[s], segs[s + 1], t1 - t0, comp);
             }
@@ -197,7 +196,6 @@ void fvec_argsort_parallel(size_t n, const float* vals, size_t* perm) {
     }
     assert(permA == perm);
     omp_set_nested(prev_nested);
-    delete[] perm2;
 }
 
 /*****************************************************************************
@@ -816,6 +814,10 @@ void hashtable_int64_to_int64_lookup(
             size_t k0 = bucket << (log2_capacity - log2_nbucket);
             size_t k1 = (bucket + 1) << (log2_capacity - log2_nbucket);
             for (;;) {
+                if (tab[slot * 2] == -1) { // empty slot, key not in table
+                    vals[i] = -1;
+                    break;
+                }
                 if (tab[slot * 2] == k) { // found!
                     vals[i] = tab[2 * slot + 1];
                     break;
diff --git a/tests/test_sorting.cpp b/tests/test_sorting.cpp
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ *
+ * This source code is licensed under the MIT license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <gtest/gtest.h>
+
+#include <random>
+#include <vector>
+
+#include <faiss/utils/sorting.h>
+
+TEST(TestSorting, argsort_parallel_matches_serial) {
+    // n > 1M is intended to reach the parallel merge path; input is seeded
+    size_t n = 2000000;
+
+    std::vector<float> vals(n);
+    std::mt19937 rng(42);
+    std::uniform_real_distribution<float> dist(-1000.0f, 1000.0f);
+    for (size_t i = 0; i < n; i++) {
+        vals[i] = dist(rng);
+    }
+
+    std::vector<size_t> perm_serial(n);
+    faiss::fvec_argsort(n, vals.data(), perm_serial.data());
+
+    std::vector<size_t> perm_parallel(n);
+    faiss::fvec_argsort_parallel(n, vals.data(), perm_parallel.data());
+
+    // Compare values rather than indices: ties may be permuted differently
+    for (size_t i = 0; i < n; i++) {
+        ASSERT_FLOAT_EQ(vals[perm_serial[i]], vals[perm_parallel[i]])
+                << "mismatch at position " << i;
+    }
+}
+
+TEST(TestSorting, hashtable_lookup) {
+    int log2_capacity = 12;
+    size_t capacity = (size_t)1 << log2_capacity;
+
+    std::vector<int64_t> tab(capacity * 2);
+    faiss::hashtable_int64_to_int64_init(log2_capacity, tab.data());
+
+    size_t n = 200;
+    std::vector<int64_t> keys(n), vals(n);
+    for (size_t i = 0; i < n; i++) {
+        keys[i] = static_cast<int64_t>(i * 3);
+        vals[i] = static_cast<int64_t>(i + 1);
+    }
+    faiss::hashtable_int64_to_int64_add(
+            log2_capacity, tab.data(), n, keys.data(), vals.data());
+
+    // Queries: inserted keys at even indices, absent keys at odd indices
+    size_t n_query = n * 2;
+    std::vector<int64_t> query_keys(n_query);
+    std::vector<int64_t> expected(n_query);
+    for (size_t i = 0; i < n; i++) {
+        query_keys[2 * i] = keys[i];
+        expected[2 * i] = vals[i];
+        query_keys[2 * i + 1] =
+                keys[i] + 1; // 3*i + 1: not a multiple of 3, never inserted
+        expected[2 * i + 1] = -1;
+    }
+
+    std::vector<int64_t> result(n_query);
+    faiss::hashtable_int64_to_int64_lookup(
+            log2_capacity,
+            tab.data(),
+            n_query,
+            query_keys.data(),
+            result.data());
+
+    for (size_t i = 0; i < n_query; i++) {
+        ASSERT_EQ(result[i], expected[i])
+                << "query key " << query_keys[i] << " at index " << i;
+    }
+}