Skip to content

Commit 2322afd

Browse files
alibeklfc authored and meta-codesync[bot] committed
faiss: parallelize post-BLAS reduction loop and end_multiple() in result handlers (#5185)
Summary:
Pull Request resolved: #5185

Three sequential post-BLAS / end_multiple loops in faiss were leaving OMP threads idle while a single thread did all the work. Each is parallelized with `#pragma omp parallel for schedule(static)`, gated by an `if (...)` clause to avoid spawn-cost regressions on small workloads.

**Changes**

1. `exhaustive_L2sqr_blas_cmax` (AVX2 + ARM SVE): after `sgemm_` completes, the per-query result accumulation loop ran single-threaded while all OMP threads were idle. Each query `i` reads a distinct row of `ip_block` and writes to `dis_tab[i]/ids_tab[i]` — no cross-query dependencies. Added `#pragma omp parallel for schedule(static) if ((i1 - i0) >= 16)` to both ISA specializations.

2. `HeapBlockResultHandler::end_multiple`: `heap_reorder` is O(k log k) per query and was sequential. The original author left a `// maybe parallel for` comment. `add_results` in the same class already has `#pragma omp parallel for`; `end_multiple` was the only remaining sequential step. Gate: `if ((i1 - i0) * k >= 1024)`.

3. `ReservoirBlockResultHandler::end_multiple`: same pattern — reservoir `to_result` (partial sort, O(capacity)) was sequential despite `add_results` being parallelized. `// maybe parallel for` comment removed and replaced with the actual pragma. Gate: `if ((i1 - i0) * this->k >= 1024)`.

The `if (...)` thresholds were chosen from microbenchmark data: below the threshold, OMP fanout cost exceeds the work, producing 3-6× regressions on small batches. Above the threshold, parallelization yields 9-14× speedups at 16 threads.

Data independence verified for all three: each loop iteration operates on a disjoint slice of `dis_tab`/`ids_tab` indexed by query `i`.

**Benchmark results**

A local microbench (not landed) was used for A/B measurement. Host: Intel Sapphire Rapids, 28 physical cores, AVX-512. Pinned with `taskset -c 0-15` (OMP=16) and `taskset -c 0` (OMP=1). Median of 5 reps. Synthetic uniform-random distance distributions.
`HeapBlockResultHandler::end_multiple` (µs, lower is better):

| nq | k | parent t=1 | this t=1 | parent t=16 | this t=16 | speedup t=16 |
|------:|-----:|-----------:|---------:|------------:|----------:|--------------:|
| 64 | 10 | 9.2 | 7.2 | 8.1 | 8.3 | 0.98× (gated) |
| 64 | 100 | 340 | 345 | 318 | 67 | 4.79× |
| 64 | 1000 | 5,796 | 5,700 | 5,886 | 501 | 11.76× |
| 512 | 100 | 2,811 | 2,769 | 2,677 | 312 | 8.59× |
| 512 | 1000 | 46,109 | 46,070 | 45,758 | 3,778 | 12.11× |
| 4096 | 100 | 22,041 | 21,588 | 21,672 | 1,869 | 11.60× |
| 4096 | 1000 | 369,069 | 376,541 | 372,481 | 25,442 | 14.64× |

`ReservoirBlockResultHandler::end_multiple` (µs):

| nq | k | parent t=16 | this t=16 | speedup |
|------:|-----:|------------:|----------:|--------------:|
| 64 | 10 | 18.0 | 18.1 | 0.99× (gated) |
| 64 | 100 | 659 | 96 | 6.86× |
| 64 | 1000 | 7,592 | 553 | 13.73× |
| 512 | 100 | 5,498 | 490 | 11.21× |
| 512 | 1000 | 59,548 | 4,677 | 12.73× |
| 4096 | 100 | 44,064 | 3,230 | 13.64× |
| 4096 | 1000 | 476,388 | 32,237 | 14.78× |

`IndexFlatL2::search` end-to-end — drives `exhaustive_L2sqr_blas_cmax` (ms):

| nb | nq | k | parent t=16 | this t=16 | speedup |
|------:|------:|----:|------------:|----------:|--------:|
| 1024 | 1024 | 10 | 1.71 | 1.45 | 1.18× |
| 1024 | 4096 | 100 | 58.5 | 15.5 | 3.78× |
| 4096 | 4096 | 100 | 76.9 | 39.4 | 1.95× |

Single-threaded paths (OMP=1) are within ±5% of parent across all configurations — the `if (...)` clause makes the pragma a no-op below the threshold, eliminating overhead for serial callers.

Caveats: the `IndexFlatL2::search` numbers measure the full search path, so the speedup attributed to change #1 also includes contributions from change #2 (heap handler, also called by this path). The `end_multiple` numbers isolate the changed function via `PauseTiming`/`ResumeTiming` around setup. ARM SVE not measured directly (no Graviton host); the AVX2 numbers are the strongest available proxy.
Reviewed By: mnorris11
Differential Revision: D103830810
fbshipit-source-id: 8434fa6f16b78c5ff7b2244ac5d5fe9cc8c012a5
1 parent 6cef1bb commit 2322afd

3 files changed

Lines changed: 15 additions & 7 deletions

File tree

faiss/impl/ResultHandler.h

Lines changed: 7 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -377,8 +377,9 @@ struct HeapBlockResultHandler : TopkBlockResultHandler<C, use_sel> {
377377

378378
/// series of results for queries i0..i1 is done
379379
void end_multiple() final {
380-
// maybe parallel for
381-
for (size_t i = i0; i < i1; i++) {
380+
#pragma omp parallel for schedule(static) if ((i1 - i0) * k >= 1024)
381+
for (int64_t i = static_cast<int64_t>(i0); i < static_cast<int64_t>(i1);
382+
i++) {
382383
heap_reorder<C>(k, this->dis_tab + i * k, this->ids_tab + i * k);
383384
}
384385
}
@@ -568,9 +569,10 @@ struct ReservoirBlockResultHandler : TopkBlockResultHandler<C, use_sel> {
568569

569570
/// series of results for queries i0..i1 is done
570571
void end_multiple() final {
571-
// maybe parallel for
572-
for (size_t i = i0; i < i1; i++) {
573-
reservoirs[i - i0].to_result(
572+
#pragma omp parallel for schedule(static) if ((i1 - i0) * this->k >= 1024)
573+
for (int64_t i = static_cast<int64_t>(i0); i < static_cast<int64_t>(i1);
574+
i++) {
575+
reservoirs[i - static_cast<int64_t>(i0)].to_result(
574576
this->dis_tab + i * this->k, this->ids_tab + i * this->k);
575577
}
576578
}

faiss/utils/simd_impl/distances_arm_sve.cpp

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -655,7 +655,10 @@ void exhaustive_L2sqr_blas_cmax<SIMDLevel::ARM_SVE>(
655655
ip_block.get(),
656656
&nyi);
657657
}
658-
for (int64_t i = i0; i < i1; i++) {
658+
#pragma omp parallel for schedule(static) if ((i1 - i0) >= 16)
659+
for (int64_t i = static_cast<int64_t>(i0);
660+
i < static_cast<int64_t>(i1);
661+
i++) {
659662
const size_t count = j1 - j0;
660663
float* ip_line = ip_block.get() + (i - i0) * count;
661664

faiss/utils/simd_impl/distances_avx2.cpp

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1276,7 +1276,10 @@ void exhaustive_L2sqr_blas_cmax<SIMDLevel::AVX2>(
12761276
ip_block.get(),
12771277
&nyi);
12781278
}
1279-
for (int64_t i = i0; i < i1; i++) {
1279+
#pragma omp parallel for schedule(static) if ((i1 - i0) >= 16)
1280+
for (int64_t i = static_cast<int64_t>(i0);
1281+
i < static_cast<int64_t>(i1);
1282+
i++) {
12801283
float* ip_line = ip_block.get() + (i - i0) * (j1 - j0);
12811284

12821285
_mm_prefetch((const char*)ip_line, _MM_HINT_NTA);

0 commit comments

Comments
 (0)