From 361ca749a7dbcfd2abeb94b7c657fca7a8c5f4b4 Mon Sep 17 00:00:00 2001 From: Kimball Thurston Date: Thu, 14 Nov 2024 23:51:43 +1300 Subject: [PATCH 1/2] remove use of subscript operator Signed-off-by: Kimball Thurston --- src/Imath/ImathMatrixAlgo.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/Imath/ImathMatrixAlgo.h b/src/Imath/ImathMatrixAlgo.h index 291aabac..fd074a15 100644 --- a/src/Imath/ImathMatrixAlgo.h +++ b/src/Imath/ImathMatrixAlgo.h @@ -1385,10 +1385,17 @@ template bool checkForZeroScaleInRow (const T& scl, const Vec2& row, bool exc /* = true */) { - for (int i = 0; i < 2; i++) + if (abs (scl) < T(1)) { - if ((abs (scl) < 1 && - abs (row[i]) >= std::numeric_limits::max () * abs (scl))) + if (abs (row.x) >= std::numeric_limits::max () * abs (scl)) + { + if (exc) + throw std::domain_error ( + "Cannot remove zero scaling from matrix."); + else + return false; + } + if (abs (row.y) >= std::numeric_limits::max () * abs (scl)) { if (exc) throw std::domain_error ( From 5758c953b19c431ea9cf257395fc296c8fa81797 Mon Sep 17 00:00:00 2001 From: Kimball Thurston Date: Thu, 14 Nov 2024 23:57:15 +1300 Subject: [PATCH 2/2] Add branchless access in an indexed way When direct .x/.y/.z access is not possible, and subscript operators are needed, this should enable the auto-vectorize optimization passes to have a chance at making more efficient pipelines. This access method change will also enable temporary stack vectors to be elided and live entirely in registers and not need to be stored to the stack. 
Signed-off-by: Kimball Thurston --- src/Imath/ImathVec.h | 479 +++++++++++++++++++++++++++++++++++++------ 1 file changed, 415 insertions(+), 64 deletions(-) diff --git a/src/Imath/ImathVec.h b/src/Imath/ImathVec.h index d286aba6..76557052 100644 --- a/src/Imath/ImathVec.h +++ b/src/Imath/ImathVec.h @@ -26,7 +26,9 @@ #include #include #include +#include #include +#include #if (defined _WIN32 || defined _WIN64) && defined _MSC_VER // suppress exception specification warnings @@ -46,6 +48,234 @@ enum IMATH_EXPORT_ENUM InfException INF_EXCEPTION }; +namespace detail { + +// we take by value such that things are pre-evaluated prior to +// ternary op which should give the optimizer a chance to +// emit a masked move or similar on appropriate architectures +template +constexpr T get_nth (const int i, const T x, const T y) IMATH_NOEXCEPT +{ + return i == 0 ? x : y; +} + +template +constexpr T get_nth (const int i, const T x, const T y, const T z) IMATH_NOEXCEPT +{ + return i == 0 ? x : ((i == 1) ? y : z); +} + +template +constexpr T get_nth (const int i, const T x, const T y, const T z, const T w) IMATH_NOEXCEPT +{ + return i == 0 ? x : ((i == 1) ? y : ((i == 2) ? 
z : w)); +} + +template +constexpr +std::enable_if_t>, std::decay_t> +branchless_get_nth (const int i, T x, T y) IMATH_NOEXCEPT +{ + // this pattern seems to optimize to branchless far more + // consistently across compilers than other strategies + // including x * (i == 0) + y * (i != 0) especially + // for other vector sizes + return x * (i == 0) + y * ((i - 1) == 0); +} + +template +constexpr +std::enable_if_t>, std::decay_t> +branchless_get_nth (const int i, T x, T y, T z) IMATH_NOEXCEPT +{ + return x * (i == 0) + y * ((i - 1) == 0) + z * ((i - 2) == 0); +} + +template +constexpr +std::enable_if_t>, std::decay_t> +branchless_get_nth (const int i, T x, T y, T z, T w) IMATH_NOEXCEPT +{ + return x * (i == 0) + y * ((i - 1) == 0) + z * ((i - 2) == 0) + w * ((i - 3) == 0); +} + +template +using FloatMaskType = std::conditional_t< + sizeof(std::decay_t) == 2, uint16_t, + std::conditional_t< + sizeof(std::decay_t) == 4, uint32_t, + std::conditional_t) == 8, uint64_t, void> + > + >; + +template +struct FloatBitMonger +{ + using vtype = std::decay_t; + using mtype = FloatMaskType; + + static constexpr mtype to_bits(vtype v) + { + mtype vi; + std::memcpy(&vi, &v, sizeof(mtype)); + return vi; + } + + template + static constexpr mtype to_bits_and_mask(const int idx, vtype v) + { + mtype vi; + std::memcpy(&vi, &v, sizeof(mtype)); + return vi * ((idx - N) == 0); + } + + static constexpr vtype from_bits_to_fp(mtype b) + { + vtype r; + std::memcpy(&r, &b, sizeof(vtype)); + return r; + } +}; + +template +constexpr +std::enable_if_t>, std::decay_t> +branchless_get_nth (const int i, T x, T y) IMATH_NOEXCEPT +{ + using BitAccess = FloatBitMonger; + return BitAccess::from_bits_to_fp( + BitAccess::template to_bits_and_mask<0>(i, x) + + BitAccess::template to_bits_and_mask<1>(i, y) ); +} + +template +constexpr +std::enable_if_t>, std::decay_t> +branchless_get_nth (const int i, T x, T y, T z) IMATH_NOEXCEPT +{ + using BitAccess = FloatBitMonger; + return BitAccess::from_bits_to_fp( + 
BitAccess::template to_bits_and_mask<0>(i, x) + + BitAccess::template to_bits_and_mask<1>(i, y) + + BitAccess::template to_bits_and_mask<2>(i, z) ); +} + +template +constexpr +std::enable_if_t>, std::decay_t> +branchless_get_nth (const int i, T x, T y, T z, T w) IMATH_NOEXCEPT +{ + using BitAccess = FloatBitMonger; + return BitAccess::from_bits_to_fp( + BitAccess::template to_bits_and_mask<0>(i, x) + + BitAccess::template to_bits_and_mask<1>(i, y) + + BitAccess::template to_bits_and_mask<2>(i, z) + + BitAccess::template to_bits_and_mask<3>(i, w) ); +} + +// fall back for other types where we can't really do branchless +// as we make no assumptions about T other than copy constructible +template +constexpr +std::enable_if_t> && + !std::is_floating_point_v>, std::decay_t> +branchless_get_nth (const int i, T x, T y) IMATH_NOEXCEPT +{ + return (i == 0) ? x : y; +} + +template +constexpr +std::enable_if_t> && + !std::is_floating_point_v>, std::decay_t> +branchless_get_nth (const int i, T x, T y, T z) IMATH_NOEXCEPT +{ + return (i == 0) ? x : branchless_get_nth (i - 1, y, z); +} + +template +constexpr +std::enable_if_t> && + !std::is_floating_point_v>, std::decay_t> +branchless_get_nth (const int i, T x, T y, T z, T w) IMATH_NOEXCEPT +{ + return (i == 0) ? 
x : branchless_get_nth (i - 1, y, z, w); +} + +// take advantage of native masking with avx 512f +#ifndef IMATH_IS_CONSTANT_ARG +# ifdef __AVX512F__ +# define IMATH_IS_CONSTANT_ARG(idxvar) (true) +# endif +#endif + +#ifndef IMATH_IS_CONSTANT_ARG +# if defined(__has_builtin) +# if __has_builtin(__builtin_constant_p) +# define IMATH_IS_CONSTANT_ARG(idxvar) (__builtin_constant_p(idxvar)) +# endif +# endif +#endif + +#ifndef IMATH_IS_CONSTANT_ARG +# if defined(__cpp_if_consteval) && (__cpp_if_consteval >= 202106L) +# define IMATH_IS_CONSTANT_ARG(idxvar) consteval +# elif defined(__cpp_lib_is_constant_evaluated) +# define IMATH_IS_CONSTANT_ARG(idxvar) (std::is_constant_evaluated ()) +# elif defined(__has_builtin) +# if __has_builtin(__builtin_is_constant_evaluated) +# define IMATH_IS_CONSTANT_ARG(idxvar) (__builtin_is_constant_evaluated()) +# endif +# endif +# ifndef IMATH_IS_CONSTANT_ARG +# define IMATH_IS_CONSTANT_ARG(idxvar) (false) +# endif +#endif + +template +constexpr +std::enable_if_t>, std::decay_t> +select_when (const int i, T a, T b) IMATH_NOEXCEPT +{ + if IMATH_IS_CONSTANT_ARG(i) + { + return (i == N) ? a : b; + } + else + { + return a * (i == N) + b * (i != N); + } +} + +template +constexpr +std::enable_if_t>, std::decay_t> +select_when (const int i, T a, T b) IMATH_NOEXCEPT +{ + if IMATH_IS_CONSTANT_ARG(i) + { + return (i == N) ? a : b; + } + else + { + using BitAccess = FloatBitMonger; + return BitAccess::from_bits_to_fp( + BitAccess::to_bits(a) * (i == N) + + BitAccess::to_bits(b) * (i != N)); + } +} + +template +constexpr +std::enable_if_t> && + !std::is_floating_point_v>, std::decay_t> +select_when (const int i, T a, T b) IMATH_NOEXCEPT +{ + return (i == N) ? a : b; +} + +} // namespace detail + /// /// 2-element vector /// @@ -67,16 +297,22 @@ template class IMATH_EXPORT_TEMPLATE_TYPE Vec2 /// stored to the stack and other missed vectorization /// opportunities. Use of direct access to x, y when /// possible should be preferred. 
- IMATH_HOSTDEVICE IMATH_CONSTEXPR14 T& operator[] (int i) IMATH_NOEXCEPT; + IMATH_HOSTDEVICE IMATH_CONSTEXPR14 T& operator[] (const int i) IMATH_NOEXCEPT; /// Element access by index. /// - /// NB: This method of access may use dynamic array accesses which - /// can prevent compiler optimizations and force temporaries to be - /// stored to the stack and other missed vectorization - /// opportunities. Use of direct access to x, y when - /// possible should be preferred. - IMATH_HOSTDEVICE constexpr const T& operator[] (int i) const IMATH_NOEXCEPT; + /// This attempts to use a branchless scheme to return values + /// which can often optimize better in the presence of loop + /// unrolling. However, it is recommended to use direct access to + /// x, y when possible. This is only active for scalar types. + /// + /// Legacy dynamic array behavior can be enabled by adding a + /// preprocessor define IMATH_USE_LEGACY_DYNAMIC_INDEX prior to + /// including this header file. + /// + /// Note: this is following recommended practice for const + /// operator[] and return by value, not by reference. 
+ IMATH_HOSTDEVICE constexpr T operator[] (const int i) const IMATH_NOEXCEPT; /// @{ /// @name Constructors and Assignment @@ -179,6 +415,22 @@ template class IMATH_EXPORT_TEMPLATE_TYPE Vec2 /// @} + /// @{ + /// @name Auto-vectorization accessors + + /// Enables branchless query of value which may be able to vectorize better + /// + /// Prefer to use the direct .x, .y access if possible + IMATH_HOSTDEVICE IMATH_CONSTEXPR14 T getValueBranchless (const int idx) const IMATH_NOEXCEPT; + + /// Enables branchless store of a particular index which may be able to vectorize better + /// + /// Prefer to use the direct .x, .y access if possible + template + IMATH_HOSTDEVICE IMATH_CONSTEXPR14 void setValueBranchless (const int idx, S&& v) IMATH_NOEXCEPT; + + /// @} + /// @{ /// @name Arithmetic and Comparison @@ -376,16 +628,22 @@ template class IMATH_EXPORT_TEMPLATE_TYPE Vec3 /// stored to the stack and other missed vectorization /// opportunities. Use of direct access to x, y, z when /// possible should be preferred. - IMATH_HOSTDEVICE IMATH_CONSTEXPR14 T& operator[] (int i) IMATH_NOEXCEPT; + IMATH_HOSTDEVICE IMATH_CONSTEXPR14 T& operator[] (const int i) IMATH_NOEXCEPT; /// Element access by index. /// - /// NB: This method of access uses dynamic array accesses which - /// can prevent compiler optimizations and force temporaries to be - /// stored to the stack and other missed vectorization - /// opportunities. Use of direct access to x, y, z when - /// possible should be preferred. - IMATH_HOSTDEVICE constexpr const T& operator[] (int i) const IMATH_NOEXCEPT; + /// This attempts to use a branchless scheme to return values + /// which can often optimize better in the presence of loop + /// unrolling. However, it is recommended to use direct access to + /// x, y, z when possible. This is only active for scalar types. 
+ /// + /// Legacy dynamic array behavior can be enabled by adding a + /// preprocessor define IMATH_USE_LEGACY_DYNAMIC_INDEX prior to + /// including this header file. + /// + /// Note: this is following recommended practice for const + /// operator[] and return by value, not by reference. + IMATH_HOSTDEVICE constexpr T operator[] (const int i) const IMATH_NOEXCEPT; /// @{ /// @name Constructors and Assignment @@ -508,6 +766,22 @@ template class IMATH_EXPORT_TEMPLATE_TYPE Vec3 /// @} + /// @{ + /// @name Auto-vectorization accessors + + /// Enables branchless query of value which may be able to vectorize better + /// + /// Prefer to use the direct .x, .y, .z access if possible + IMATH_HOSTDEVICE IMATH_CONSTEXPR14 T getValueBranchless (const int idx) const IMATH_NOEXCEPT; + + /// Enables branchless store of a particular index which may be able to vectorize better + /// + /// Prefer to use the direct .x, .y, .z access if possible + template + IMATH_HOSTDEVICE IMATH_CONSTEXPR14 void setValueBranchless (const int idx, S&& v) IMATH_NOEXCEPT; + + /// @} + /// @{ /// @name Arithmetic and Comparison @@ -709,16 +983,22 @@ template class IMATH_EXPORT_TEMPLATE_TYPE Vec4 /// stored to the stack and other missed vectorization /// opportunities. Use of direct access to x, y, z, w when /// possible should be preferred. - IMATH_HOSTDEVICE IMATH_CONSTEXPR14 T& operator[] (int i) IMATH_NOEXCEPT; + IMATH_HOSTDEVICE IMATH_CONSTEXPR14 T& operator[] (const int i) IMATH_NOEXCEPT; /// Element access by index. /// - /// NB: This method of access uses dynamic array accesses which - /// can prevent compiler optimizations and force temporaries to be - /// stored to the stack and other missed vectorization - /// opportunities. Use of direct access to x, y, z, w when - /// possible should be preferred. 
- IMATH_HOSTDEVICE constexpr const T& operator[] (int i) const IMATH_NOEXCEPT; + /// This attempts to use a branchless scheme to return values + /// which can often optimize better in the presence of loop + /// unrolling. However, it is recommended to use direct access to + /// x, y, z, w when possible. This is only active for scalar types. + /// + /// Legacy dynamic array behavior can be enabled by adding a + /// preprocessor define IMATH_USE_LEGACY_DYNAMIC_INDEX prior to + /// including this header file. + /// + /// Note: this is following recommended practice for const + /// operator[] and return by value, not by reference. + IMATH_HOSTDEVICE constexpr T operator[] (const int i) const IMATH_NOEXCEPT; /// @{ /// @name Constructors and Assignment @@ -833,6 +1113,22 @@ template class IMATH_EXPORT_TEMPLATE_TYPE Vec4 /// @} + /// @{ + /// @name Auto-vectorization accessors + + /// Enables branchless query of value which may be able to vectorize better + /// + /// Prefer to use the direct .x, .y, .z, .w access if possible + IMATH_HOSTDEVICE IMATH_CONSTEXPR14 T getValueBranchless (const int idx) const IMATH_NOEXCEPT; + + /// Enables branchless store of a particular index which may be able to vectorize better + /// + /// Prefer to use the direct .x, .y, .z, .w access if possible + template + IMATH_HOSTDEVICE IMATH_CONSTEXPR14 void setValueBranchless (const int idx, S&& v) IMATH_NOEXCEPT; + + /// @} + /// @{ /// @name Arithmetic and Comparison @@ -1261,27 +1557,21 @@ Vec4::normalizedNonNull () const IMATH_NOEXCEPT = delete; //------------------------ template -IMATH_CONSTEXPR14 IMATH_HOSTDEVICE inline T& -Vec2::operator[] (int i) IMATH_NOEXCEPT +IMATH_CONSTEXPR14 IMATH_HOSTDEVICE inline +T& +Vec2::operator[] (const int i) IMATH_NOEXCEPT { - return reinterpret_cast (this)[i]; + return getValue ()[i]; } template -constexpr IMATH_HOSTDEVICE inline const T& -Vec2::operator[] (int i) const IMATH_NOEXCEPT +constexpr IMATH_HOSTDEVICE inline T +Vec2::operator[] (const int i) const IMATH_NOEXCEPT { 
-#ifdef __cpp_if_consteval - if consteval - { - return (i == 0) ? x : y; - } - else - { - return reinterpret_cast (this)[i]; - } +#ifdef IMATH_USE_LEGACY_DYNAMIC_INDEX + return getValue ()[i]; #else - return reinterpret_cast (this)[i]; + return getValueBranchless (i); #endif } @@ -1373,6 +1663,30 @@ Vec2::getValue () const IMATH_NOEXCEPT return reinterpret_cast (this); } +template +IMATH_HOSTDEVICE IMATH_CONSTEXPR14 inline T +Vec2::getValueBranchless (const int idx) const IMATH_NOEXCEPT +{ + if IMATH_IS_CONSTANT_ARG(idx) + { + return detail::get_nth (idx, x, y); + } + else + { + return detail::branchless_get_nth (idx, x, y); + } +} + +template +template +IMATH_HOSTDEVICE IMATH_CONSTEXPR14 inline void +Vec2::setValueBranchless (const int idx, S&& v) IMATH_NOEXCEPT +{ + T tv = std::forward (v); + x = detail::select_when (idx, tv, x); + y = detail::select_when (idx, tv, y); +} + template template IMATH_HOSTDEVICE constexpr inline bool @@ -1679,26 +1993,19 @@ Vec2::normalizedNonNull () const IMATH_NOEXCEPT template IMATH_HOSTDEVICE IMATH_CONSTEXPR14 inline T& -Vec3::operator[] (int i) IMATH_NOEXCEPT +Vec3::operator[] (const int i) IMATH_NOEXCEPT { - return reinterpret_cast (this)[i]; + return getValue ()[i]; } template -constexpr IMATH_HOSTDEVICE inline const T& -Vec3::operator[] (int i) const IMATH_NOEXCEPT +constexpr IMATH_HOSTDEVICE inline T +Vec3::operator[] (const int i) const IMATH_NOEXCEPT { -#ifdef __cpp_if_consteval - if consteval - { - return (i == 0) ? x : ((i == 1) ? 
y : z); - } - else - { - return reinterpret_cast (this)[i]; - } +#ifdef IMATH_USE_LEGACY_DYNAMIC_INDEX + return getValue ()[i]; #else - return reinterpret_cast (this)[i]; + return getValueBranchless (i); #endif } @@ -1832,6 +2139,31 @@ Vec3::getValue () const IMATH_NOEXCEPT return reinterpret_cast (this); } +template +IMATH_HOSTDEVICE IMATH_CONSTEXPR14 inline T +Vec3::getValueBranchless (const int idx) const IMATH_NOEXCEPT +{ + if IMATH_IS_CONSTANT_ARG(idx) + { + return detail::get_nth (idx, x, y, z); + } + else + { + return detail::branchless_get_nth (idx, x, y, z); + } +} + +template +template +IMATH_HOSTDEVICE IMATH_CONSTEXPR14 inline void +Vec3::setValueBranchless (const int idx, S&& v) IMATH_NOEXCEPT +{ + T tv = std::forward (v); + x = detail::select_when (idx, tv, x); + y = detail::select_when (idx, tv, y); + z = detail::select_when (idx, tv, z); +} + template template IMATH_HOSTDEVICE constexpr inline bool @@ -2163,26 +2495,19 @@ Vec3::normalizedNonNull () const IMATH_NOEXCEPT template IMATH_HOSTDEVICE IMATH_CONSTEXPR14 inline T& -Vec4::operator[] (int i) IMATH_NOEXCEPT +Vec4::operator[] (const int i) IMATH_NOEXCEPT { - return reinterpret_cast (this)[i]; + return getValue ()[i]; } template -IMATH_HOSTDEVICE constexpr inline const T& -Vec4::operator[] (int i) const IMATH_NOEXCEPT +IMATH_HOSTDEVICE constexpr inline T +Vec4::operator[] (const int i) const IMATH_NOEXCEPT { -#ifdef __cpp_if_consteval - if consteval - { - return (i == 0) ? x : ((i == 1) ? y : ((i == 2) ? 
z : w)); - } - else - { - return reinterpret_cast (this)[i]; - } +#ifdef IMATH_USE_LEGACY_DYNAMIC_INDEX + return getValue ()[i]; #else - return reinterpret_cast (this)[i]; + return getValueBranchless (i); #endif } @@ -2301,6 +2626,32 @@ Vec4::getValue () const IMATH_NOEXCEPT return reinterpret_cast (this); } +template +IMATH_HOSTDEVICE IMATH_CONSTEXPR14 inline T +Vec4::getValueBranchless (const int idx) const IMATH_NOEXCEPT +{ + if IMATH_IS_CONSTANT_ARG(idx) + { + return detail::get_nth (idx, x, y, z, w); + } + else + { + return detail::branchless_get_nth (idx, x, y, z, w); + } +} + +template +template +IMATH_HOSTDEVICE IMATH_CONSTEXPR14 inline void +Vec4::setValueBranchless (const int idx, S&& v) IMATH_NOEXCEPT +{ + T tv = std::forward (v); + x = detail::select_when (idx, tv, x); + y = detail::select_when (idx, tv, y); + z = detail::select_when (idx, tv, z); + w = detail::select_when (idx, tv, w); +} + template template IMATH_HOSTDEVICE constexpr inline bool