Skip to content

Commit 3e20cc6

Browse files
committed
constexpr simd vec: perf tuning for packed vec3
1 parent 7f7eb3c commit 3e20cc6

File tree

3 files changed

+61
-38
lines changed

3 files changed

+61
-38
lines changed

glm/detail/qualifier.hpp

Lines changed: 10 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -109,15 +109,20 @@ namespace detail
109109

110110
# if GLM_ARCH & GLM_ARCH_SSE2_BIT
111111
#if defined(__clang__) || defined(__GNUC__)
112+
#if __x86_64__
113+
#define ATTR(size) __attribute__((packed,aligned(size)))
114+
#else
115+
#define ATTR(size)
116+
#endif
112117
template<typename T>
113-
struct storage<2, T, false>
118+
struct ATTR(sizeof(T)/2) storage<2, T, false>
114119
{
115-
typedef T type __attribute__((aligned(sizeof(T)),vector_size(2*sizeof(T))));
120+
typedef T type __attribute__((aligned(sizeof(T)/2),vector_size(2*sizeof(T))));
116121
};
117122
template<typename T>
118-
struct storage<1, T, false>
123+
struct ATTR(1) storage<1, T, false>
119124
{
120-
typedef T type __attribute__((aligned(1),vector_size(sizeof(T))));
125+
typedef T type;
121126
};
122127
template<typename T>
123128
struct storage<2, T, true>
@@ -129,6 +134,7 @@ namespace detail
129134
{
130135
typedef T type __attribute__((aligned(sizeof(T)),vector_size(sizeof(T))));
131136
};
137+
#undef ATTR
132138
#endif
133139
template<>
134140
struct storage<4, float, true>

glm/detail/simd_constexpr/simd_helpers.inl

Lines changed: 33 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -9,47 +9,53 @@ namespace glm::detail
99
using FirstTx = Tx0;
1010
};
1111
template <length_t Lx, typename Tx, qualifier Qx>
12-
using PaddedVec = PaddedGccVec<Lx, Tx, Qx, detail::BVecNeedsPadding<Lx, Tx, Qx>()>;
13-
using gcc_vec_t = PaddedVec<L, T, Q>::GccV;
12+
using GccVec = typename detail::GccVExt<Lx, Tx, Qx>::GccV;
13+
using gcc_vec_t = GccVec<L, T, Q>;
1414
using data_t = typename detail::storage<L, T, detail::is_aligned<Q>::value>::type;
1515

16-
static inline auto __attribute__((always_inline)) gcc_vec_to_data(PaddedVec<L, T, Q> v) {
17-
if constexpr (L == 3 && !BIsAlignedQ<Q>()) {
16+
static inline auto __attribute__((always_inline)) gcc_vec_to_data(auto v) {
17+
static constexpr auto size = std::min(sizeof(v), sizeof(data_t));
18+
static constexpr auto biggerSize = std::max(sizeof(v), sizeof(data_t));
19+
if constexpr (size == biggerSize) {
20+
return std::bit_cast<data_t>(v);
21+
} else {
1822
data_t d;
19-
std::memcpy(&d, &v, sizeof(d));
23+
std::memcpy(&d, &v, size);
2024
return d;
21-
} else {
22-
return std::bit_cast<data_t>(v);
2325
}
2426
}
2527

2628
static inline auto __attribute__((always_inline)) simd_ctor_scalar(arithmetic auto scalar) {
27-
PaddedVec<L, T, Q> v = {};
28-
v.gcc_vec = v.gcc_vec + ( (T)scalar );
29+
gcc_vec_t v = gcc_vec_t{} + ( (T)scalar );
30+
using Tx = decltype(scalar);
31+
scalar.Tx::~Tx();
2932
return gcc_vec_to_data(v);
3033
}
3134

3235
template <length_t Lx, typename Tx, qualifier Qx> requires (Lx == L)
3336
static inline auto __attribute__((always_inline)) simd_ctor(::glm::vec<Lx, Tx, Qx> v)
3437
{
35-
using OtherPaddedVec = PaddedVec<Lx, Tx, Qx>;
36-
OtherPaddedVec o = std::bit_cast<OtherPaddedVec>(v.data);
37-
PaddedVec<L, T, Q> converted = {.gcc_vec=__builtin_convertvector(o.gcc_vec, gcc_vec_t)};
38+
using OtherVec = GccVec<Lx, Tx, Qx>;
39+
OtherVec o;
40+
static constexpr auto size = std::min(sizeof(v.data), sizeof(o));
41+
std::memcpy(&o, &(v.data), size);
42+
using o_vec_t = decltype(v);
43+
v.o_vec_t::~o_vec_t();
44+
gcc_vec_t converted = __builtin_convertvector(o, gcc_vec_t);
3845
return gcc_vec_to_data(converted);
3946
}
4047

4148
template <length_t Lx, typename Tx, qualifier Qx> requires (Lx != L && Lx < L)
4249
static inline auto __attribute__((always_inline)) simd_ctor(::glm::vec<Lx, Tx, Qx> v)
4350
{
44-
using OtherPaddedVec = PaddedVec<Lx, Tx, Qx>;
45-
using OurSizeTheirType = PaddedVec<L, Tx, Qx>;
46-
OtherPaddedVec o = std::bit_cast<OtherPaddedVec>(v.data);
47-
OurSizeTheirType oExpanded = {};
48-
for (length_t i = 0; i < Lx; i++) {
49-
oExpanded.gcc_vec[i] = o.gcc_vec[i];
50-
}
51+
using OurSizeTheirType = GccVec<L, Tx, Qx>;
52+
static constexpr auto size = std::min(sizeof(OurSizeTheirType), sizeof(v.data));
53+
OurSizeTheirType oExpanded;
54+
std::memcpy(&oExpanded, &(v.data), size);
55+
using o_vec_t = decltype(v);
56+
v.o_vec_t::~o_vec_t();
5157

52-
PaddedVec<L, T, Q> converted = {.gcc_vec=__builtin_convertvector(oExpanded.gcc_vec, gcc_vec_t)};
58+
gcc_vec_t converted = __builtin_convertvector(oExpanded, gcc_vec_t);
5359
return gcc_vec_to_data(converted);
5460
}
5561

@@ -62,11 +68,13 @@ namespace glm::detail
6268
static inline auto __attribute__((always_inline)) simd_ctor_multi_scalars(A... scalars) requires ( isLengthOfVector<A...>() && SameArithmeticTypes<A...>())
6369
{
6470
//assuming that number of scalars is always the same as the length of the to-be-constructed vector
65-
using Tx = typename GetFirstType<A...>::FirstTx;
66-
using OtherPaddedVec = PaddedVec<L, Tx, Q>;
67-
typename OtherPaddedVec::GccV o = {Tx(scalars)...};
68-
PaddedVec<L, T, Q> converted = {.gcc_vec=__builtin_convertvector(o, gcc_vec_t)};
69-
return gcc_vec_to_data(converted);
71+
gcc_vec_t v;
72+
std::array<T, sizeof...(scalars)> pack{scalars...};
73+
for (int i = 0; i != sizeof...(scalars); i++ ) {
74+
v[i] = pack[i];
75+
pack[i].T::~T();
76+
}
77+
return gcc_vec_to_data(v);
7078
}
7179
};
7280
}

glm/detail/simd_constexpr/vec.hpp

Lines changed: 18 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -94,9 +94,10 @@ namespace glm
9494
};
9595
}
9696

97+
/*template <length_t L, typename T, qualifier Q>
98+
using PaddedGccVec = detail::PaddedGccVec<L, T, Q, detail::BVecNeedsPadding<L, T, Q>()>;*/
9799
template <length_t L, typename T, qualifier Q>
98-
using PaddedGccVec = detail::PaddedGccVec<L, T, Q, detail::BVecNeedsPadding<L, T, Q>()>;
99-
100+
using GccVec = typename detail::GccVExt<L, T, Q>::GccV;
100101
template <length_t L, typename T, qualifier Q>
101102
using VecDataArray = detail::VecDataArray<L, T, Q, detail::BDataNeedsPadding<L, T, Q>()>;
102103

@@ -161,6 +162,7 @@ namespace glm
161162
switch (i)
162163
{
163164
default:
165+
__builtin_unreachable();
164166
case 0:
165167
return x;
166168
case 1: {
@@ -192,6 +194,7 @@ namespace glm
192194
switch (i)
193195
{
194196
default:
197+
__builtin_unreachable();
195198
case 0:
196199
return x;
197200
case 1: {
@@ -296,7 +299,7 @@ namespace glm
296299
{
297300
using VTX = decltype(vs0);
298301
if constexpr ( std::is_integral_v<VTX> || std::is_floating_point_v<VTX> ) {
299-
return RetArr<1>{vs0};
302+
return RetArr<1>{(T)vs0};
300303
} else if constexpr ( ( requires { VTX::k_len; }) ) {
301304
using Tx = VTX::value_type;
302305
using ArrX = VecDataArray<VTX::k_len, Tx, VTX::k_qual>;
@@ -323,7 +326,7 @@ namespace glm
323326
: EC<L, T, Q>
324327
{.data= [scalar...]() -> data_t
325328
{
326-
if (std::is_constant_evaluated()) {
329+
if (std::is_constant_evaluated() || (L == 3 && !BIsAlignedQ<Q>())) {
327330
DataArray a = {.p={ T(scalar)... }};
328331
return std::bit_cast<data_t>(a);
329332
} else {
@@ -454,7 +457,7 @@ namespace glm
454457
}
455458

456459
template<typename Tx>
457-
inline GLM_CONSTEXPR vec<L, T, Q> & operator*=(vec<L, Tx, Q> v)
460+
inline GLM_CONSTEXPR vec<L, T, Q> & operator*=(vec<L, Tx, Q> const& __restrict__ v) __restrict__
458461
{
459462
if constexpr (L < 3) {
460463
this->data *= v.data;
@@ -788,9 +791,12 @@ namespace glm
788791
}
789792

790793

791-
friend inline GLM_CONSTEXPR vec<L, T, Q> operator*(vec<L, T, Q> v1, vec<L, T, Q> v2)
794+
friend inline GLM_CONSTEXPR vec<L, T, Q> __attribute__((const, always_inline, nothrow, no_stack_protector)) operator*(vec<L, T, Q> v1, vec<L, T, Q> const& __restrict__ v2)
792795
{
793-
return vec<L, T, Q>(v1) *= v2;
796+
if constexpr (L == 3 && !BIsAlignedQ<Q>())
797+
return *(new (&v1) vec<L, T, Q>(v1.x*v2.x, v1.y*v2.y, v1.z*v2.z));
798+
else
799+
return v1 *= v2;
794800
}
795801

796802

@@ -813,9 +819,12 @@ namespace glm
813819
}
814820

815821
template <length_t Lx>
816-
friend inline GLM_CONSTEXPR vec<L, T, Q> operator/(vec<Lx, T, Q> v1, vec<L, T, Q> v2) requires (!NotVec1<Lx> && NotVec1<L>)
822+
friend inline GLM_CONSTEXPR vec<L, T, Q> operator/(vec<Lx, T, Q> v1, vec<L, T, Q> && __restrict__ v2) requires (!NotVec1<Lx> && NotVec1<L>)
817823
{
818-
return vec<L, T, Q>(v1.x) /= v2;
824+
if constexpr (L == 3 && !BIsAlignedQ<Q>())
825+
return *(new (&v2) vec<L, T, Q>( v1.data / v2.x, v1.data/v2.y, v1.data/v2.z ));
826+
else
827+
return vec<L, T, Q>(v1.x) /= v2;
819828
}
820829

821830

0 commit comments

Comments
 (0)