constexpr simd vec: perf tuning for packed vec3

sharkautarch · sharkautarch · commit 3e20cc66544d · 2024-09-15T11:50:07.000-04:00
diff --git a/glm/detail/qualifier.hpp b/glm/detail/qualifier.hpp
@@ -109,15 +109,20 @@ namespace detail
 
 #	if GLM_ARCH & GLM_ARCH_SSE2_BIT
 #if defined(__clang__) || defined(__GNUC__)
+#if __x86_64__
+#define ATTR(size)  __attribute__((packed,aligned(size)))
+#else
+#define ATTR(size)
+#endif
 	template<typename T>
-	struct storage<2, T, false>
+	struct ATTR(sizeof(T)/2) storage<2, T, false>
 	{
-		typedef T type __attribute__((aligned(sizeof(T)),vector_size(2*sizeof(T))));
+		typedef T type __attribute__((aligned(sizeof(T)/2),vector_size(2*sizeof(T))));
 	};
 	template<typename T>
-	struct storage<1, T, false>
+	struct ATTR(1) storage<1, T, false>
 	{
-		typedef T type __attribute__((aligned(1),vector_size(sizeof(T))));
+		typedef T type;
 	};
 	template<typename T>
 	struct storage<2, T, true>
@@ -129,6 +134,7 @@ namespace detail
 	{
 		typedef T type __attribute__((aligned(sizeof(T)),vector_size(sizeof(T))));
 	};
+#undef ATTR
 #endif
 	template<>
 	struct storage<4, float, true>
diff --git a/glm/detail/simd_constexpr/simd_helpers.inl b/glm/detail/simd_constexpr/simd_helpers.inl
@@ -9,47 +9,53 @@ namespace glm::detail
 			using FirstTx = Tx0;
 		};
 		template <length_t Lx, typename Tx, qualifier Qx>
-		using PaddedVec = PaddedGccVec<Lx, Tx, Qx, detail::BVecNeedsPadding<Lx, Tx, Qx>()>;
-		using gcc_vec_t = PaddedVec<L, T, Q>::GccV;
+		using GccVec = typename detail::GccVExt<Lx, Tx, Qx>::GccV;
+		using gcc_vec_t = GccVec<L, T, Q>;
 		using data_t = typename detail::storage<L, T, detail::is_aligned<Q>::value>::type;
 		
-		static inline auto __attribute__((always_inline)) gcc_vec_to_data(PaddedVec<L, T, Q> v) {
-			if constexpr (L == 3 && !BIsAlignedQ<Q>()) {
+		static inline auto __attribute__((always_inline)) gcc_vec_to_data(auto v) {
+			static constexpr auto size = std::min(sizeof(v), sizeof(data_t));
+			static constexpr auto biggerSize = std::max(sizeof(v), sizeof(data_t));
+			if constexpr (size == biggerSize) {
+				return std::bit_cast<data_t>(v);
+			} else {
 				data_t d;
-				std::memcpy(&d, &v, sizeof(d));
+				std::memcpy(&d, &v, size);
 				return d;
-			} else {
-				return std::bit_cast<data_t>(v);
 			}
 		}
 		
 		static inline auto __attribute__((always_inline)) simd_ctor_scalar(arithmetic auto scalar) {
-			PaddedVec<L, T, Q> v = {};
-			v.gcc_vec = v.gcc_vec + ( (T)scalar );
+			gcc_vec_t v = gcc_vec_t{} + ( (T)scalar );
+			using Tx = decltype(scalar);
+			scalar.Tx::~Tx();
 			return gcc_vec_to_data(v);
 		}
 		
 		template <length_t Lx, typename Tx, qualifier Qx> requires (Lx == L)
 		static inline auto __attribute__((always_inline)) simd_ctor(::glm::vec<Lx, Tx, Qx> v)
 		{
-			using OtherPaddedVec = PaddedVec<Lx, Tx, Qx>;
-			OtherPaddedVec o = std::bit_cast<OtherPaddedVec>(v.data);
-			PaddedVec<L, T, Q> converted = {.gcc_vec=__builtin_convertvector(o.gcc_vec, gcc_vec_t)};
+			using OtherVec = GccVec<Lx, Tx, Qx>;
+			OtherVec o;
+			static constexpr auto size = std::min(sizeof(v.data), sizeof(o));
+			std::memcpy(&o, &(v.data), size);
+			using o_vec_t = decltype(v);
+			v.o_vec_t::~o_vec_t();
+			gcc_vec_t converted = __builtin_convertvector(o, gcc_vec_t);
 			return gcc_vec_to_data(converted);
 		}
 		
 		template <length_t Lx, typename Tx, qualifier Qx> requires (Lx != L && Lx < L)
 		static inline auto __attribute__((always_inline)) simd_ctor(::glm::vec<Lx, Tx, Qx> v)
 		{
-			using OtherPaddedVec = PaddedVec<Lx, Tx, Qx>;
-			using OurSizeTheirType = PaddedVec<L, Tx, Qx>;
-			OtherPaddedVec o = std::bit_cast<OtherPaddedVec>(v.data);
-			OurSizeTheirType oExpanded = {};
-			for (length_t i = 0; i < Lx; i++) {
-				oExpanded.gcc_vec[i] = o.gcc_vec[i];
-			}
+			using OurSizeTheirType = GccVec<L, Tx, Qx>;
+			static constexpr auto size = std::min(sizeof(OurSizeTheirType), sizeof(v.data));
+			OurSizeTheirType oExpanded;
+			std::memcpy(&oExpanded, &(v.data), size);
+			using o_vec_t = decltype(v);
+			v.o_vec_t::~o_vec_t();
 			
-			PaddedVec<L, T, Q> converted = {.gcc_vec=__builtin_convertvector(oExpanded.gcc_vec, gcc_vec_t)};
+			gcc_vec_t converted = __builtin_convertvector(oExpanded, gcc_vec_t);
 			return gcc_vec_to_data(converted);
 		}
 		
@@ -62,11 +68,13 @@ namespace glm::detail
 		static inline auto __attribute__((always_inline)) simd_ctor_multi_scalars(A... scalars) requires ( isLengthOfVector<A...>() && SameArithmeticTypes<A...>())
 		{
 			//assuming that number of scalars is always the same as the length of the to-be-constructed vector
-			using Tx = typename GetFirstType<A...>::FirstTx;
-			using OtherPaddedVec = PaddedVec<L, Tx, Q>;
-			typename OtherPaddedVec::GccV o = {Tx(scalars)...};
-			PaddedVec<L, T, Q> converted = {.gcc_vec=__builtin_convertvector(o, gcc_vec_t)};
-			return gcc_vec_to_data(converted);
+			gcc_vec_t v;
+			std::array<T, sizeof...(scalars)> pack{scalars...};
+			for (int i = 0; i != sizeof...(scalars); i++ ) {
+				v[i] = pack[i];
+				pack[i].T::~T();
+			}
+			return gcc_vec_to_data(v);
 		}
 	};
 }
diff --git a/glm/detail/simd_constexpr/vec.hpp b/glm/detail/simd_constexpr/vec.hpp
@@ -94,9 +94,10 @@ namespace glm
 		};
 	}
 	
+	/*template <length_t L, typename T, qualifier Q>
+	using PaddedGccVec = detail::PaddedGccVec<L, T, Q, detail::BVecNeedsPadding<L, T, Q>()>;*/
 	template <length_t L, typename T, qualifier Q>
-	using PaddedGccVec = detail::PaddedGccVec<L, T, Q, detail::BVecNeedsPadding<L, T, Q>()>;
-	
+	using GccVec = typename detail::GccVExt<L, T, Q>::GccV;
 	template <length_t L, typename T, qualifier Q>
 	using VecDataArray = detail::VecDataArray<L, T, Q, detail::BDataNeedsPadding<L, T, Q>()>;
 	
@@ -161,6 +162,7 @@ namespace glm
 			switch (i)
 			{
 				default:
+					__builtin_unreachable();
 				case 0:
 					return x;
 				case 1: {
@@ -192,6 +194,7 @@ namespace glm
 			switch (i)
 			{
 				default:
+					__builtin_unreachable();
 				case 0:
 					return x;
 				case 1: {
@@ -296,7 +299,7 @@ namespace glm
 		{
 			using VTX = decltype(vs0);
 			if constexpr ( std::is_integral_v<VTX> || std::is_floating_point_v<VTX> ) {
-				return RetArr<1>{vs0};
+				return RetArr<1>{(T)vs0};
 			} else if constexpr ( ( requires { VTX::k_len; }) ) {
 				using Tx = VTX::value_type;
 				using ArrX = VecDataArray<VTX::k_len, Tx, VTX::k_qual>;
@@ -323,7 +326,7 @@ namespace glm
 		: EC<L, T, Q>
 			{.data= [scalar...]() -> data_t
 				{
-					if (std::is_constant_evaluated()) {
+					if (std::is_constant_evaluated() || (L == 3 && !BIsAlignedQ<Q>())) {
 						DataArray a = {.p={ T(scalar)... }};
 						return std::bit_cast<data_t>(a);
 					} else {
@@ -454,7 +457,7 @@ namespace glm
 		}
 
 		template<typename Tx>
-		inline GLM_CONSTEXPR vec<L, T, Q> & operator*=(vec<L, Tx, Q> v)
+		inline GLM_CONSTEXPR vec<L, T, Q> & operator*=(vec<L, Tx, Q> const& __restrict__ v) __restrict__
 		{
 			if constexpr (L < 3) {
 				this->data *= v.data;
@@ -788,9 +791,12 @@ namespace glm
 		}
 
 		
-		friend inline GLM_CONSTEXPR vec<L, T, Q> operator*(vec<L, T, Q> v1, vec<L, T, Q>  v2)
+		friend inline GLM_CONSTEXPR vec<L, T, Q> __attribute__((const, always_inline, nothrow, no_stack_protector)) operator*(vec<L, T, Q> v1, vec<L, T, Q> const& __restrict__ v2)
 		{
-			return vec<L, T, Q>(v1) *= v2;
+			if constexpr (L == 3 && !BIsAlignedQ<Q>())
+				return *(new (&v1) vec<L, T, Q>(v1.x*v2.x, v1.y*v2.y, v1.z*v2.z));
+			else
+				return v1 *= v2;
 		}
 
 		
@@ -813,9 +819,12 @@ namespace glm
 		}
 
 		template <length_t Lx>
-		friend inline GLM_CONSTEXPR vec<L, T, Q> operator/(vec<Lx, T, Q>  v1, vec<L, T, Q>  v2) requires (!NotVec1<Lx> && NotVec1<L>)
+		friend inline GLM_CONSTEXPR vec<L, T, Q> operator/(vec<Lx, T, Q>  v1, vec<L, T, Q> && __restrict__ v2) requires (!NotVec1<Lx> && NotVec1<L>)
 		{
-			return vec<L, T, Q>(v1.x) /= v2;
+			if constexpr (L == 3 && !BIsAlignedQ<Q>())
+				return *(new (&v2) vec<L, T, Q>( v1.data / v2.x, v1.data/v2.y, v1.data/v2.z ));
+			else
+				return vec<L, T, Q>(v1.x) /= v2;
 		}
 
 		

Original file line number	Diff line number	Diff line change
`@@ -94,9 +94,10 @@ namespace glm`
`94`	`94`	`};`
`95`	`95`	`}`
`96`	`96`
	`97`	`+ /*template <length_t L, typename T, qualifier Q>`
	`98`	`+ using PaddedGccVec = detail::PaddedGccVec<L, T, Q, detail::BVecNeedsPadding<L, T, Q>()>;*/`
`97`	`99`	`template <length_t L, typename T, qualifier Q>`
`98`		`- using PaddedGccVec = detail::PaddedGccVec<L, T, Q, detail::BVecNeedsPadding<L, T, Q>()>;`
`99`		`-`
	`100`	`+ using GccVec = typename detail::GccVExt<L, T, Q>::GccV;`
`100`	`101`	`template <length_t L, typename T, qualifier Q>`
`101`	`102`	`using VecDataArray = detail::VecDataArray<L, T, Q, detail::BDataNeedsPadding<L, T, Q>()>;`
`102`	`103`
`@@ -161,6 +162,7 @@ namespace glm`
`161`	`162`	`switch (i)`
`162`	`163`	`{`
`163`	`164`	`default:`
	`165`	`+ __builtin_unreachable();`
`164`	`166`	`case 0:`
`165`	`167`	`return x;`
`166`	`168`	`case 1: {`
`@@ -192,6 +194,7 @@ namespace glm`
`192`	`194`	`switch (i)`
`193`	`195`	`{`
`194`	`196`	`default:`
	`197`	`+ __builtin_unreachable();`
`195`	`198`	`case 0:`
`196`	`199`	`return x;`
`197`	`200`	`case 1: {`
`@@ -296,7 +299,7 @@ namespace glm`
`296`	`299`	`{`
`297`	`300`	`using VTX = decltype(vs0);`
`298`	`301`	`if constexpr ( std::is_integral_v<VTX> \|\| std::is_floating_point_v<VTX> ) {`
`299`		`- return RetArr<1>{vs0};`
	`302`	`+ return RetArr<1>{(T)vs0};`
`300`	`303`	`} else if constexpr ( ( requires { VTX::k_len; }) ) {`
`301`	`304`	`using Tx = VTX::value_type;`
`302`	`305`	`using ArrX = VecDataArray<VTX::k_len, Tx, VTX::k_qual>;`
`@@ -323,7 +326,7 @@ namespace glm`
`323`	`326`	`: EC<L, T, Q>`
`324`	`327`	`{.data= [scalar...]() -> data_t`
`325`	`328`	`{`
`326`		`- if (std::is_constant_evaluated()) {`
	`329`	`+ if (std::is_constant_evaluated() \|\| (L == 3 && !BIsAlignedQ<Q>())) {`
`327`	`330`	`DataArray a = {.p={ T(scalar)... }};`
`328`	`331`	`return std::bit_cast<data_t>(a);`
`329`	`332`	`} else {`
`@@ -454,7 +457,7 @@ namespace glm`
`454`	`457`	`}`
`455`	`458`
`456`	`459`	`template<typename Tx>`
`457`		`- inline GLM_CONSTEXPR vec<L, T, Q> & operator*=(vec<L, Tx, Q> v)`
	`460`	`+ inline GLM_CONSTEXPR vec<L, T, Q> & operator*=(vec<L, Tx, Q> const& __restrict__ v) __restrict__`
`458`	`461`	`{`
`459`	`462`	`if constexpr (L < 3) {`
`460`	`463`	`this->data *= v.data;`
`@@ -788,9 +791,12 @@ namespace glm`
`788`	`791`	`}`
`789`	`792`
`790`	`793`
`791`		`- friend inline GLM_CONSTEXPR vec<L, T, Q> operator*(vec<L, T, Q> v1, vec<L, T, Q> v2)`
	`794`	`+ friend inline GLM_CONSTEXPR vec<L, T, Q> __attribute__((const, always_inline, nothrow, no_stack_protector)) operator*(vec<L, T, Q> v1, vec<L, T, Q> const& __restrict__ v2)`
`792`	`795`	`{`
`793`		`- return vec<L, T, Q>(v1) *= v2;`
	`796`	`+ if constexpr (L == 3 && !BIsAlignedQ<Q>())`
	`797`	`+ return *(new (&v1) vec<L, T, Q>(v1.x*v2.x, v1.y*v2.y, v1.z*v2.z));`
	`798`	`+ else`
	`799`	`+ return v1 *= v2;`
`794`	`800`	`}`
`795`	`801`
`796`	`802`
`@@ -813,9 +819,12 @@ namespace glm`
`813`	`819`	`}`
`814`	`820`
`815`	`821`	`template <length_t Lx>`
`816`		`- friend inline GLM_CONSTEXPR vec<L, T, Q> operator/(vec<Lx, T, Q> v1, vec<L, T, Q> v2) requires (!NotVec1<Lx> && NotVec1<L>)`
	`822`	`+ friend inline GLM_CONSTEXPR vec<L, T, Q> operator/(vec<Lx, T, Q> v1, vec<L, T, Q> && __restrict__ v2) requires (!NotVec1<Lx> && NotVec1<L>)`
`817`	`823`	`{`
`818`		`- return vec<L, T, Q>(v1.x) /= v2;`
	`824`	`+ if constexpr (L == 3 && !BIsAlignedQ<Q>())`
	`825`	`+ return *(new (&v2) vec<L, T, Q>( v1.data / v2.x, v1.data/v2.y, v1.data/v2.z ));`
	`826`	`+ else`
	`827`	`+ return vec<L, T, Q>(v1.x) /= v2;`
`819`	`828`	`}`
`820`	`829`
`821`	`830`