-
Notifications
You must be signed in to change notification settings - Fork 395
Expand file tree
/
Copy pathcommon.h
More file actions
121 lines (109 loc) · 6.66 KB
/
common.h
File metadata and controls
121 lines (109 loc) · 6.66 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <limits>
#include <memory>
#include <random>
#include <type_traits>
// Helper macros for stringification in _Pragma.
// XSTR forwards to STR, so its argument is macro-expanded before being
// stringified. It is not referenced by the other macros visible in this header.
#define XSTR(s) STR(s)
// STR turns its argument tokens into a string literal exactly as written.
#define STR(s) #s
// PRAGMA_VF(VF) emits `#pragma clang loop vectorize_width(VF)`. VF is a macro
// parameter here, so it is substituted before STR stringifies the directive.
#define PRAGMA_VF(VF) _Pragma(STR(clang loop vectorize_width(VF)))
// PRAGMA_IC(IC) emits `#pragma clang loop interleave_count(IC)`, built the
// same way as PRAGMA_VF.
#define PRAGMA_IC(IC) _Pragma(STR(clang loop interleave_count(IC)))
// Declares two generic lambdas in the enclosing scope, ScalarFn and VectorFn,
// each with the signature (auto *A, Type TC) -> Type. Both bodies are `Init`
// followed by `Loop`; they differ only in the loop pragma emitted in between:
//   - ScalarFn: vectorization disabled, interleave count forced to 1.
//   - VectorFn: vectorization enabled.
// `Loop` must begin with the loop statement the pragma attaches to, and
// `Init`/`Loop` together must provide the lambda's return statement.
#define DEFINE_SCALAR_AND_VECTOR_FN1_TYPE(Init, Loop, Type) \
  auto ScalarFn = [](auto *A, Type TC) -> Type { \
    Init \
    _Pragma("clang loop vectorize(disable) interleave_count(1)") \
    Loop \
  }; \
  auto VectorFn = [](auto *A, Type TC) -> Type { \
    Init \
    _Pragma("clang loop vectorize(enable)") \
    Loop \
  };
// Declares two generic lambdas in the enclosing scope, ScalarFn and VectorFn,
// each taking (auto *A, auto *B, unsigned TC). Both bodies are `Init` followed
// by `Loop`; they differ only in the loop pragma emitted in between:
//   - ScalarFn: vectorization disabled, interleave count forced to 1.
//   - VectorFn: vectorization enabled.
// `Loop` must begin with the loop statement the pragma attaches to.
#define DEFINE_SCALAR_AND_VECTOR_FN2(Init, Loop) \
  auto ScalarFn = [](auto *A, auto *B, unsigned TC) { \
    Init \
    _Pragma("clang loop vectorize(disable) interleave_count(1)") \
    Loop \
  }; \
  auto VectorFn = [](auto *A, auto *B, unsigned TC) { \
    Init \
    _Pragma("clang loop vectorize(enable)") \
    Loop \
  };
// Variant of the two-pointer scalar/vector pair with an explicit vectorization
// factor and interleave count. Declares ScalarFn and VectorFn, each taking
// (auto *A, auto *B, unsigned TC), with bodies `Init` followed by `Loop`:
//   - ScalarFn: vectorization disabled, interleave count forced to 1.
//   - VectorFn: loop annotated via PRAGMA_VF(VF) and PRAGMA_IC(IC).
// `Loop` must begin with the loop statement the pragmas attach to.
#define DEFINE_SCALAR_AND_VECTOR_FN2_VF_INTERLEAVE(Init, Loop, VF, IC) \
  auto ScalarFn = [](auto *A, auto *B, unsigned TC) { \
    Init \
    _Pragma("clang loop vectorize(disable) interleave_count(1)") \
    Loop \
  }; \
  auto VectorFn = [](auto *A, auto *B, unsigned TC) { \
    Init \
    PRAGMA_VF(VF) \
    PRAGMA_IC(IC) \
    Loop \
  };
// Declares two generic lambdas in the enclosing scope, ScalarFn and VectorFn,
// each with the signature (auto *A, auto *B, Type TC) -> Type. Both bodies are
// `Init` followed by `Loop`; they differ only in the loop pragma in between:
//   - ScalarFn: vectorization disabled, interleave count forced to 1.
//   - VectorFn: vectorization enabled.
// `Loop` must begin with the loop statement the pragma attaches to, and
// `Init`/`Loop` together must provide the lambda's return statement.
#define DEFINE_SCALAR_AND_VECTOR_FN2_TYPE(Init, Loop, Type) \
  auto ScalarFn = [](auto *A, auto *B, Type TC) -> Type { \
    Init \
    _Pragma("clang loop vectorize(disable) interleave_count(1)") \
    Loop \
  }; \
  auto VectorFn = [](auto *A, auto *B, Type TC) -> Type { \
    Init \
    _Pragma("clang loop vectorize(enable)") \
    Loop \
  };
// Declares two generic lambdas in the enclosing scope, ScalarFn and VectorFn,
// each taking (auto *A, auto *B, auto *C, unsigned TC). The body of each is
// just `Loop`, preceded by a loop pragma:
//   - ScalarFn: vectorization disabled, interleave count forced to 1.
//   - VectorFn: vectorization enabled.
// `Loop` must begin with the loop statement the pragma attaches to.
#define DEFINE_SCALAR_AND_VECTOR_FN3(Loop) \
  auto ScalarFn = [](auto *A, auto *B, auto *C, unsigned TC) { \
    _Pragma("clang loop vectorize(disable) interleave_count(1)") \
    Loop \
  }; \
  auto VectorFn = [](auto *A, auto *B, auto *C, unsigned TC) { \
    _Pragma("clang loop vectorize(enable)") \
    Loop \
  };
// Declares two generic lambdas in the enclosing scope, ScalarFn and VectorFn,
// each taking (auto *A, auto *B, unsigned OuterTC, unsigned InnerTC). Each
// runs a two-level loop nest: the outer loop counts `i` from 0 up to OuterTC,
// the inner loop counts `j` from 0 up to InnerTC, and `InnerLoopCode` is the
// inner loop body (it may refer to A, B, i and j). The loop pragma is applied
// to the inner loop only:
//   - ScalarFn: vectorization disabled, interleave count forced to 1.
//   - VectorFn: vectorization enabled.
#define DEFINE_NESTED_SCALAR_AND_VECTOR_FN4(InnerLoopCode) \
  auto ScalarFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) { \
    for (unsigned long i = 0; i < OuterTC; ++i) { \
      _Pragma("clang loop vectorize(disable) interleave_count(1)") \
      for (unsigned long j = 0; j < InnerTC; ++j) { \
        InnerLoopCode \
      } \
    } \
  }; \
  auto VectorFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) { \
    for (unsigned long i = 0; i < OuterTC; ++i) { \
      _Pragma("clang loop vectorize(enable)") \
      for (unsigned long j = 0; j < InnerTC; ++j) { \
        InnerLoopCode \
      } \
    } \
  };
// Declares two generic lambdas in the enclosing scope, ScalarFn and VectorFn,
// each taking (auto *A, auto *B, unsigned OuterTC, unsigned InnerTC). Each
// runs a two-level loop nest: the outer loop counts `i` DOWN from OuterTC - 1
// to 0, the inner loop counts `j` up from 0 to InnerTC, and `InnerLoopCode`
// is the inner loop body (it may refer to A, B, i and j). The loop pragma is
// applied to the inner loop only:
//   - ScalarFn: vectorization disabled, interleave count forced to 1.
//   - VectorFn: vectorization enabled.
//
// The outer induction variable starts at -1 when OuterTC is 0 so the nest is
// skipped. Evaluating `OuterTC - 1` directly would wrap to UINT_MAX in
// unsigned arithmetic and, where long is wider than unsigned, start the loop
// at that huge positive index. For OuterTC >= 1 the start value is unchanged.
#define DEFINE_NESTED_SCALAR_AND_VECTOR_FN5(InnerLoopCode) \
  auto ScalarFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) { \
    for (long i = OuterTC ? static_cast<long>(OuterTC - 1) : -1L; i >= 0; \
         i--) { \
      _Pragma("clang loop vectorize(disable) interleave_count(1)") \
      for (unsigned long j = 0; j < InnerTC; j++) { \
        InnerLoopCode \
      } \
    } \
  }; \
  auto VectorFn = [](auto *A, auto *B, unsigned OuterTC, unsigned InnerTC) { \
    for (long i = OuterTC ? static_cast<long>(OuterTC - 1) : -1L; i >= 0; \
         i--) { \
      _Pragma("clang loop vectorize(enable)") \
      for (unsigned long j = 0; j < InnerTC; j++) { \
        InnerLoopCode \
      } \
    } \
  };
// Pointer-range flavour of the nested scalar/vector pair. Declares ScalarFn
// and VectorFn, each taking (auto *FirstA, auto *LastA, auto *FirstB,
// auto *LastB). The body is `OuterLoop`, then a loop pragma, then `InnerLoop`,
// then `Ret`, so:
//   - `OuterLoop` is the outer loop header whose body is the annotated
//     `InnerLoop` statement;
//   - `InnerLoop` must begin with the loop statement the pragma attaches to;
//   - `Ret` is emitted after the loop nest (e.g. the final return statement).
// ScalarFn disables vectorization of the inner loop and forces the interleave
// count to 1; VectorFn enables vectorization of the inner loop.
#define DEFINE_NESTED_SCALAR_AND_VECTOR_FN4_PTR(OuterLoop, InnerLoop, Ret) \
  auto ScalarFn = [](auto *FirstA, auto *LastA, auto *FirstB, auto *LastB) { \
    OuterLoop \
    _Pragma("clang loop vectorize(disable) interleave_count(1)") \
    InnerLoop \
    Ret \
  }; \
  auto VectorFn = [](auto *FirstA, auto *LastA, auto *FirstB, auto *LastB) { \
    OuterLoop \
    _Pragma("clang loop vectorize(enable)") \
    InnerLoop \
    Ret \
  };
// Pseudo-random engine shared by the helpers in this header. It is
// default-constructed, i.e. never explicitly seeded here.
static std::mt19937 rng;

// Fill the first N elements of array A with random numbers drawn from `rng`.
//
// Floating-point element types are sampled uniformly between
// numeric_limits<Ty>::min() (the smallest positive normal value) and
// numeric_limits<Ty>::max(). All other element types are sampled uniformly
// over their whole value range, numeric_limits<Ty>::min() to max().
//
// \param A  array to fill; must hold at least N elements.
// \param N  number of leading elements to overwrite (0 is a no-op).
template <typename Ty>
static void init_data(const std::unique_ptr<Ty[]> &A, unsigned N) {
  if constexpr (std::is_floating_point_v<Ty>) {
    std::uniform_real_distribution<Ty> distrib(
        std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
    for (unsigned i = 0; i < N; i++)
      A[i] = distrib(rng);
  } else {
    // std::uniform_int_distribution is only specified for short, int, long,
    // long long and their unsigned counterparts. For one-byte element types
    // (char, int8_t, uint8_t, bool) draw from a short/unsigned short
    // distribution limited to Ty's range and narrow the result back to Ty.
    using DistTy = std::conditional_t<
        (sizeof(Ty) < sizeof(short)),
        std::conditional_t<std::is_signed_v<Ty>, short, unsigned short>, Ty>;
    std::uniform_int_distribution<DistTy> distrib(
        std::numeric_limits<Ty>::min(), std::numeric_limits<Ty>::max());
    for (unsigned i = 0; i < N; i++)
      A[i] = static_cast<Ty>(distrib(rng));
  }
}
// Compare the first NumElements elements of Reference and Tmp with
// operator==. If any pair differs, print "Miscompare" to stderr and terminate
// the process with exit status 1; otherwise return normally.
//
// The ranges are formed from get() rather than &array[0], so empty (null)
// arrays with NumElements == 0 are accepted without applying operator[] to
// an empty unique_ptr.
//
// \param Reference    expected values; must hold at least NumElements items.
// \param Tmp          values under test; must hold at least NumElements items.
// \param NumElements  number of leading elements to compare.
template <typename Ty>
static void check(const std::unique_ptr<Ty[]> &Reference,
                  const std::unique_ptr<Ty[]> &Tmp, unsigned NumElements) {
  const Ty *RefBegin = Reference.get();
  const Ty *TmpBegin = Tmp.get();
  if (!std::equal(RefBegin, RefBegin + NumElements, TmpBegin)) {
    std::cerr << "Miscompare\n";
    std::exit(1);
  }
}