Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions MicroBenchmarks/LoopVectorization/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ endif()
llvm_test_run()

llvm_test_executable(LoopVectorizationBenchmarks
ControlFlowVectorization.cpp
ConditionalScalarAssignment.cpp
main.cpp
MathFunctions.cpp
Expand Down
188 changes: 188 additions & 0 deletions MicroBenchmarks/LoopVectorization/ControlFlowVectorization.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
#include <algorithm>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <random>

#include "benchmark/benchmark.h"

// Number of elements in every benchmark array; also passed to the kernels as
// their trip count.
#define ITERATIONS 100000

// Signature shared by all loop kernels: a pointer to the array that is updated
// in place and the number of elements to process.
template <typename T> using CFVFunc = void (*)(T *, unsigned);

// Define conditional increment loop with given stride.
#define DEF_COND_INC_LOOP(name, stride) \
template <typename T> \
__attribute__((noinline)) static void run_##name##_autovec(T *A, \

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What is the purpose of this benchmark? Is it just to track the current state of control-flow vectorization (novec vs. autovec), or is it meant to help identify a better LMUL for vectorizing the loop? If the latter, it would make sense to add similar functions with forced vectorization for the default LMUL and for specified LMULs.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

IIUC, this benchmark serves as a test suite for other targets to measure the performance impact of enabling control-flow vectorization.

I've updated the PR description to make this clearer.

unsigned N) { \
for (unsigned i = 0; i < N; i++) { \
if (i % stride == 0) { \
A[i] = A[i] + 1; \
} \
} \
} \
template <typename T> \
__attribute__((noinline)) static void run_##name##_novec(T *A, unsigned N) { \
_Pragma("clang loop vectorize(disable) interleave(disable)") \
for (unsigned i = 0; i < N; i++) { \
if (i % stride == 0) { \
A[i] = A[i] + 1; \
} \
} \
}

// Define conditional increment by value loop.
//
// Expands to two function templates, run_<name>_autovec and run_<name>_novec,
// that walk A[0..N) and add one to every element whose current value equals
// `marker`. Unlike the stride-based loops, the condition depends on the loaded
// data rather than on the loop index.
//
// Both kernels are marked noinline and have identical loop bodies. The _novec
// variant additionally carries a clang loop pragma that disables vectorization
// and interleaving for its loop; the _autovec variant has no pragma.
#define DEF_COND_INC_VALUE_LOOP(name, marker) \
template <typename T> \
__attribute__((noinline)) static void run_##name##_autovec(T *A, \
unsigned N) { \
for (unsigned i = 0; i < N; i++) { \
if (A[i] == marker) { \
A[i] = A[i] + 1; \
} \
} \
} \
template <typename T> \
__attribute__((noinline)) static void run_##name##_novec(T *A, unsigned N) { \
_Pragma("clang loop vectorize(disable) interleave(disable)") \
for (unsigned i = 0; i < N; i++) { \
if (A[i] == marker) { \
A[i] = A[i] + 1; \
} \
} \
}

// Unconditional increment kernel: adds one to every element of A[0..N).
// No loop pragma is applied to this variant, and the function is kept out of
// line (noinline).
template <typename T>
__attribute__((noinline)) static void run_uncond_inc_autovec(T *A, unsigned N) {
  for (unsigned Idx = 0; Idx < N; ++Idx)
    A[Idx] += 1;
}

// Unconditional increment kernel with vectorization and interleaving disabled
// through a clang loop pragma: adds one to every element of A[0..N).
// The function is kept out of line (noinline).
template <typename T>
__attribute__((noinline)) static void run_uncond_inc_novec(T *A, unsigned N) {
  _Pragma("clang loop vectorize(disable) interleave(disable)")
  for (unsigned Idx = 0; Idx < N; ++Idx)
    A[Idx] += 1;
}

// Define loops with different strides.
// Element i is updated only when i % stride == 0, so the fraction of active
// lanes is 1/stride:
// stride=2: 50% active lanes
// stride=4: 25% active lanes
// stride=8: 12.5% active lanes
// stride=16: 6.25% active lanes
// stride=32: 3.125% active lanes
// stride=64: 1.5625% active lanes
// stride=128: 0.78% active lanes
DEF_COND_INC_LOOP(cond_inc_stride_2, 2)
DEF_COND_INC_LOOP(cond_inc_stride_4, 4)
DEF_COND_INC_LOOP(cond_inc_stride_8, 8)
DEF_COND_INC_LOOP(cond_inc_stride_16, 16)
DEF_COND_INC_LOOP(cond_inc_stride_32, 32)
DEF_COND_INC_LOOP(cond_inc_stride_64, 64)
DEF_COND_INC_LOOP(cond_inc_stride_128, 128)

// Conditional increment by value (sparse condition).
// Only elements equal to 42 are updated. init_data draws the input uniformly
// from [0, 100], so roughly 1 in 101 (about 1%) of the lanes is active, at
// data-dependent positions.
DEF_COND_INC_VALUE_LOOP(cond_inc_by_value, 42)

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What percentage of lanes is going to be active here? Is this case really worth tracking?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Removed the small-stride cases to focus on larger strides.
This allows for entirely inactive lanes (in most cases), which helps test conditional vector block optimizations (control-flow vectorization) across different targets.


// Fill A[0..ITERATIONS) with pseudo-random integers drawn uniformly from
// [0, 100]. The generator is seeded with a fixed constant, so every call
// produces the same sequence.
template <typename T> static void init_data(T *A) {
  std::uniform_int_distribution<T> Values(0, 100);
  std::mt19937 Engine(12345);
  for (T *Elt = A, *End = A + ITERATIONS; Elt != End; ++Elt)
    *Elt = Values(Engine);
}

// Benchmark vectorized version.
//
// state   - Google Benchmark state that drives the timed loop.
// VecFn   - kernel being measured.
// NoVecFn - reference kernel; it is only called when BENCH_AND_VERIFY is
//           defined, to check that VecFn produces the same output.
//
// The kernels update their array in place, so every timed iteration first
// restores the working buffer from the pristine input. That copy is therefore
// part of the measured time.
template <typename T>
static void __attribute__((always_inline))
benchmark_cfv_autovec(benchmark::State &state, CFVFunc<T> VecFn,
                      CFVFunc<T> NoVecFn) {
  std::unique_ptr<T[]> A(new T[ITERATIONS]);
  std::unique_ptr<T[]> A_vec(new T[ITERATIONS]);
  init_data(&A[0]);

#ifdef BENCH_AND_VERIFY
  // Verify the vectorized and scalar versions produce the same results.
  {
    // The reference buffer is only needed here, so it is not allocated in
    // builds that skip verification.
    std::unique_ptr<T[]> A_novec(new T[ITERATIONS]);
    std::copy(&A[0], &A[0] + ITERATIONS, &A_vec[0]);
    std::copy(&A[0], &A[0] + ITERATIONS, &A_novec[0]);
    VecFn(&A_vec[0], ITERATIONS);
    NoVecFn(&A_novec[0], ITERATIONS);
    for (unsigned i = 0; i < ITERATIONS; i++) {
      if (A_vec[i] != A_novec[i]) {
        std::cerr << "ERROR: vectorization result different at index " << i
                  << "; " << A_vec[i] << " != " << A_novec[i] << "\n";
        std::exit(1);
      }
    }
  }
#else
  // The reference kernel is only needed for verification.
  (void)NoVecFn;
#endif

  for (auto _ : state) {
    std::copy(&A[0], &A[0] + ITERATIONS, &A_vec[0]);
    VecFn(&A_vec[0], ITERATIONS);
    // Keep the result observable so the kernel call is not optimized away.
    benchmark::DoNotOptimize(A_vec);
    benchmark::ClobberMemory();
  }
}

// Benchmark a kernel built with vectorization disabled.
//
// state   - Google Benchmark state that drives the timed loop.
// NoVecFn - kernel being measured.
//
// The kernel updates its array in place, so each timed iteration starts by
// restoring the scratch buffer from the pristine input.
template <typename T>
static void __attribute__((always_inline))
benchmark_cfv_novec(benchmark::State &state, CFVFunc<T> NoVecFn) {
  std::unique_ptr<T[]> Input(new T[ITERATIONS]);
  std::unique_ptr<T[]> Scratch(new T[ITERATIONS]);
  T *const Src = Input.get();
  T *const Dst = Scratch.get();
  init_data(Src);

  for (auto _ : state) {
    std::copy(Src, Src + ITERATIONS, Dst);
    NoVecFn(Dst, ITERATIONS);
    benchmark::DoNotOptimize(Scratch);
    benchmark::ClobberMemory();
  }
}

// Define and register two Google Benchmark functions for the kernel pair
// run_<name>_autovec / run_<name>_novec, instantiated for element type `ty`:
//   BENCHMARK_<name>_autovec_<ty>_ measures the _autovec kernel and hands the
//     _novec kernel to benchmark_cfv_autovec as the reference implementation.
//   BENCHMARK_<name>_novec_<ty>_ measures the _novec kernel.
// Both are registered with nanoseconds as the reported time unit.
#define BENCHMARK_CFV_CASE(name, ty) \
void BENCHMARK_##name##_autovec_##ty##_(benchmark::State &state) { \
benchmark_cfv_autovec<ty>(state, run_##name##_autovec, run_##name##_novec);\
} \
BENCHMARK(BENCHMARK_##name##_autovec_##ty##_)->Unit(benchmark::kNanosecond); \
\
void BENCHMARK_##name##_novec_##ty##_(benchmark::State &state) { \
benchmark_cfv_novec<ty>(state, run_##name##_novec); \
} \
BENCHMARK(BENCHMARK_##name##_novec_##ty##_)->Unit(benchmark::kNanosecond);

// Unconditional increment benchmark.
// Defines and registers BENCHMARK_uncond_inc_autovec_<ty>_ and
// BENCHMARK_uncond_inc_novec_<ty>_ for element type `ty`, using the
// run_uncond_inc_autovec / run_uncond_inc_novec kernels. Both are registered
// with nanoseconds as the reported time unit.
#define BENCHMARK_UNCOND_CASE(ty) \
void BENCHMARK_uncond_inc_autovec_##ty##_(benchmark::State &state) { \
benchmark_cfv_autovec<ty>(state, run_uncond_inc_autovec, \
run_uncond_inc_novec); \
} \
BENCHMARK(BENCHMARK_uncond_inc_autovec_##ty##_) \
->Unit(benchmark::kNanosecond); \
\
void BENCHMARK_uncond_inc_novec_##ty##_(benchmark::State &state) { \
benchmark_cfv_novec<ty>(state, run_uncond_inc_novec); \
} \
BENCHMARK(BENCHMARK_uncond_inc_novec_##ty##_)->Unit(benchmark::kNanosecond);

// Add benchmarks for all variants.
// For one element type `ty`, registers the unconditional loop, every
// stride-based conditional loop (strides 2 through 128) and the
// increment-by-value loop, each as an autovec/novec pair.
#define ADD_CFV_BENCHMARKS(ty) \
BENCHMARK_UNCOND_CASE(ty) \
BENCHMARK_CFV_CASE(cond_inc_stride_2, ty) \
BENCHMARK_CFV_CASE(cond_inc_stride_4, ty) \
BENCHMARK_CFV_CASE(cond_inc_stride_8, ty) \
BENCHMARK_CFV_CASE(cond_inc_stride_16, ty) \
BENCHMARK_CFV_CASE(cond_inc_stride_32, ty) \
BENCHMARK_CFV_CASE(cond_inc_stride_64, ty) \
BENCHMARK_CFV_CASE(cond_inc_stride_128, ty) \
BENCHMARK_CFV_CASE(cond_inc_by_value, ty)

// Instantiate the full set for 64-, 32- and 16-bit signed integer elements.
ADD_CFV_BENCHMARKS(int64_t)
ADD_CFV_BENCHMARKS(int32_t)
ADD_CFV_BENCHMARKS(int16_t)