@@ -162,40 +162,6 @@ TEST_F(SIMDPerfTest, AVX2FasterThanNONE) {
162162 << " NONE=" << none_time << " ms, AVX2=" << avx2_time << " ms" ;
163163}
164164
165- TEST_F (SIMDPerfTest, AVX512FasterThanAVX2IfAvailable) {
166- if (!faiss::SIMDConfig::is_simd_level_available (faiss::SIMDLevel::AVX512)) {
167- GTEST_SKIP () << " AVX512 not available on this machine" ;
168- }
169-
170- verify_dispatch (faiss::SIMDLevel::AVX2);
171- verify_dispatch (faiss::SIMDLevel::AVX512);
172-
173- auto bench_fn = [this ](faiss::SIMDLevel level) {
174- return benchmark_fvec_inner_products_ny (level);
175- };
176- auto [avx2_time, avx512_time] = benchmark_interleaved_best (
177- bench_fn, faiss::SIMDLevel::AVX2, faiss::SIMDLevel::AVX512);
178-
179- printf (" fvec_inner_products_ny AVX2: %.2f ms (best of %d runs)\n " ,
180- avx2_time,
181- kBenchmarkReps );
182- printf (" fvec_inner_products_ny AVX512: %.2f ms (best of %d runs)\n " ,
183- avx512_time,
184- kBenchmarkReps );
185-
186- double speedup = avx2_time / avx512_time;
187- printf (" Speedup (AVX512 vs AVX2): %.2fx\n " , speedup);
188-
189- // AVX512 fvec_inner_products_ny (d=8) uses 16x8 register transpose
190- // (16 vectors/iteration) vs AVX2's 8x8 transpose (8 vectors/iteration).
191- // Expected speedup is ~1.5x on bare metal. We use 1.1x threshold to
192- // allow for AVX-512 frequency throttling on Intel CPUs.
193- EXPECT_GT (speedup, 1.1 )
194- << " AVX512 should be at least 1.1x faster than AVX2 for "
195- << " fvec_inner_products_ny. "
196- << " AVX2=" << avx2_time << " ms, AVX512=" << avx512_time << " ms" ;
197- }
198-
199165// Additional test: Verify fvec_L2sqr dispatch is at least not slower.
200166// fvec_L2sqr uses auto-vectorization, so AVX2 may only be slightly faster.
201167TEST_F (SIMDPerfTest, L2sqrAutoVecDispatchWorks) {
0 commit comments