Skip to content
This repository was archived by the owner on Aug 2, 2024. It is now read-only.

Commit f300b12

Browse files
committed
Restructure headers in preparation for supporting additional instruction
sets
1 parent c8460a6 commit f300b12

14 files changed

Lines changed: 1763 additions & 1741 deletions

public/klein/detail/exp_log.hpp

Lines changed: 1 addition & 191 deletions
Original file line numberDiff line numberDiff line change
@@ -1,193 +1,3 @@
1-
// File: exp_log.hpp
2-
// Purpose: Provide routines for taking bivector/motor exponentials and
3-
// logarithms.
4-
51
#pragma once
62

7-
#include "sse.hpp"
8-
#include <cmath>
9-
10-
namespace kln
11-
{
12-
inline namespace detail
13-
{
14-
// Partition memory layouts
15-
// LSB --> MSB
16-
// p0: (e0, e1, e2, e3)
17-
// p1: (1, e23, e31, e12)
18-
// p2: (e0123, e01, e02, e03)
19-
// p3: (e123, e032, e013, e021)
20-
21-
// a := p1
22-
// b := p2
23-
// a + b is a general bivector but it is most likely *non-simple* meaning
24-
// that it is neither purely real nor purely ideal.
25-
// Exponentiates the bivector and returns the motor defined by partitions 1
26-
// and 2.
27-
KLN_INLINE void KLN_VEC_CALL exp(__m128 const& a,
28-
__m128 const& b,
29-
__m128& p1_out,
30-
__m128& p2_out)
31-
{
32-
// The exponential map produces a continuous group of rotations about an
33-
// axis. We'd *like* to evaluate the exp(a + b) as exp(a)exp(b) but we
34-
// cannot do that in general because a and b do not commute (consider
35-
// the differences between the Taylor expansion of exp(ab) and
36-
// exp(a)exp(b)).
37-
38-
// First, we need to decompose the bivector into the sum of two
39-
// commutative bivectors (the product of these two parts will be a
40-
// scalar multiple of the pseudoscalar; see "Bivector times its ideal
41-
// axis and vice versa in demo.klein"). To do this, we compute the
42-
// squared norm of the bivector:
43-
//
44-
// NOTE: a sign flip is introduced since the square of a Euclidean
45-
// line is negative
46-
//
47-
// (a1^2 + a2^2 + a3^2) - 2(a1 b1 + a2 b2 + a3 b3) e0123
48-
49-
// Broadcast dot(a, a) ignoring the scalar component to all components
50-
// of a2
51-
__m128 a2 = _mm_dp_ps(a, a, 0b11101111);
52-
__m128 ab = _mm_dp_ps(a, b, 0b11101111);
53-
54-
// Next, we need the sqrt of that quantity. Since e0123 squares to 0,
55-
// this has a closed form solution.
56-
//
57-
// sqrt(a1^2 + a2^2 + a3^2)
58-
// - (a1 b1 + a2 b2 + a3 b3) / sqrt(a1^2 + a2^2 + a3^2) e0123
59-
//
60-
// (relabeling) = u + vI
61-
//
62-
// (square the above quantity yourself to quickly verify the claim)
63-
// Maximum relative error < 1.5*2e-12
64-
__m128 a2_sqrt_rcp = _mm_rsqrt_ps(a2);
65-
__m128 u = _mm_rcp_ps(a2_sqrt_rcp);
66-
// Don't forget the minus later!
67-
__m128 minus_v = _mm_mul_ps(ab, a2_sqrt_rcp);
68-
69-
// Last, we need the reciprocal of the norm to compute the normalized
70-
// bivector.
71-
//
72-
// 1 / sqrt(a1^2 + a2^2 + a3^2)
73-
// + (a1 b1 + a2 b2 + a3 b3) / (a1^2 + a2^2 + a3^2)^(3/2) e0123
74-
//
75-
// The original bivector * the inverse norm gives us a normalized
76-
// bivector.
77-
__m128 norm_real = _mm_mul_ps(a, a2_sqrt_rcp);
78-
__m128 norm_ideal = _mm_mul_ps(b, a2_sqrt_rcp);
79-
// The real part of the bivector also interacts with the pseudoscalar to
80-
// produce a portion of the normalized ideal part
81-
// e12 e0123 = -e03, e31 e0123 = -e02, e23 e0123 = -e01
82-
// Notice how the products above actually commute
83-
norm_ideal = _mm_sub_ps(
84-
norm_ideal,
85-
_mm_mul_ps(
86-
a, _mm_mul_ps(ab, _mm_mul_ps(a2_sqrt_rcp, _mm_rcp_ps(a2)))));
87-
88-
// The norm * our normalized bivector is the original bivector (a + b).
89-
// Thus, we have:
90-
//
91-
// (u + vI)n = u n + v n e0123
92-
//
93-
// Note that n and n e0123 are perpendicular (n e0123 lies on the ideal
94-
// plane, and all ideal components of n are extinguished after
95-
// polarization). As a result, we can now decompose the exponential.
96-
//
97-
// e^(u n + v n e0123) = e^(u n) e^(v n e0123) =
98-
// (cosu + sinu n) * (1 + v n e0123) =
99-
// cosu + sinu n + v n cosu e0123 + v sinu n^2 e0123 =
100-
// cosu + sinu n + v n cosu e0123 - v sinu e0123
101-
//
102-
// where we've used the fact that n is normalized and squares to -1.
103-
float uv[2];
104-
_mm_store_ss(uv, u);
105-
// Note the v here corresponds to minus_v
106-
_mm_store_ss(uv + 1, minus_v);
107-
108-
float sincosu[2];
109-
sincosu[0] = std::sin(uv[0]);
110-
sincosu[1] = std::cos(uv[0]);
111-
112-
__m128 sinu = _mm_set1_ps(sincosu[0]);
113-
p1_out = _mm_add_ps(_mm_set_ss(sincosu[1]), _mm_mul_ps(sinu, norm_real));
114-
115-
// The second partition has contributions from both the real and ideal
116-
// parts.
117-
__m128 cosu = _mm_set_ps(sincosu[1], sincosu[1], sincosu[1], 0.f);
118-
__m128 minus_vcosu = _mm_mul_ps(minus_v, cosu);
119-
p2_out = _mm_mul_ps(sinu, norm_ideal);
120-
p2_out = _mm_add_ps(p2_out, _mm_mul_ps(minus_vcosu, norm_real));
121-
float minus_vsinu = uv[1] * sincosu[0];
122-
p2_out = _mm_add_ps(_mm_set_ss(minus_vsinu), p2_out);
123-
}
124-
125-
KLN_INLINE void KLN_VEC_CALL log(__m128 const& p1,
126-
__m128 const& p2,
127-
__m128& p1_out,
128-
__m128& p2_out)
129-
{
130-
// The logarithm follows from the derivation of the exponential. Working
131-
// backwards, we ended up computing the exponential like so:
132-
//
133-
// cosu + sinu n + v n cosu e0123 - v sinu e0123 =
134-
// (cosu - v sinu e0123) + (sinu + v cosu e0123) n
135-
//
136-
// where n is the normalized bivector. If we compute the norm, that will
137-
// allow us to match it to sinu + vcosu e0123, which will then allow us
138-
// to deduce u and v.
139-
140-
// The first thing we need to do is extract only the bivector components
141-
// from the motor.
142-
__m128 bv_mask = _mm_set_ps(1.f, 1.f, 1.f, 0.f);
143-
__m128 a = _mm_mul_ps(bv_mask, p1);
144-
__m128 b = _mm_mul_ps(bv_mask, p2);
145-
146-
// Next, we need to compute the norm as in the exponential.
147-
__m128 a2 = _mm_dp_ps(a, a, 0b11101111);
148-
// TODO: handle case when a2 is 0
149-
__m128 ab = _mm_dp_ps(a, b, 0b11101111);
150-
__m128 s = _mm_sqrt_ps(a2);
151-
__m128 a2_sqrt_rcp = _mm_rcp_ps(s);
152-
__m128 minus_t = _mm_mul_ps(ab, a2_sqrt_rcp);
153-
// s + t e0123 is the norm of our bivector.
154-
155-
// Store the scalar component
156-
float p;
157-
_mm_store_ss(&p, p1);
158-
159-
// Store the pseudoscalar component
160-
float q;
161-
_mm_store_ss(&q, p2);
162-
163-
float s_scalar;
164-
_mm_store_ss(&s_scalar, s);
165-
float t_scalar;
166-
_mm_store_ss(&t_scalar, minus_t);
167-
t_scalar *= -1.f;
168-
// p = cosu
169-
// q = -v sinu
170-
// s_scalar = sinu
171-
// t_scalar = v cosu
172-
173-
bool p_zero = std::abs(p) < 1e-6;
174-
float u = p_zero ? std::atan2(-q, t_scalar) : std::atan2(s_scalar, p);
175-
float v = p_zero ? -q / s_scalar : t_scalar / p;
176-
177-
// Now, (u + v e0123) * n when exponentiated will give us the motor, so
178-
// (u + v e0123) * n is the logarithm. To proceed, we need to compute
179-
// the normalized bivector.
180-
__m128 norm_real = _mm_mul_ps(a, a2_sqrt_rcp);
181-
__m128 norm_ideal = _mm_mul_ps(b, a2_sqrt_rcp);
182-
norm_ideal = _mm_sub_ps(
183-
norm_ideal,
184-
_mm_mul_ps(
185-
a, _mm_mul_ps(ab, _mm_mul_ps(a2_sqrt_rcp, _mm_rcp_ps(a2)))));
186-
187-
__m128 uvec = _mm_set1_ps(u);
188-
p1_out = _mm_mul_ps(uvec, norm_real);
189-
p2_out = _mm_mul_ps(uvec, norm_ideal);
190-
p2_out = _mm_sub_ps(p2_out, _mm_mul_ps(_mm_set1_ps(v), norm_real));
191-
}
192-
} // namespace detail
193-
} // namespace kln
3+
#include "x86/x86_exp_log.hpp"
Lines changed: 1 addition & 135 deletions
Original file line numberDiff line numberDiff line change
@@ -1,137 +1,3 @@
11
#pragma once
22

3-
#include "sse.hpp"
4-
5-
namespace kln
6-
{
7-
inline namespace detail
8-
{
9-
// Partition memory layouts
10-
// LSB --> MSB
11-
// p0: (e0, e1, e2, e3)
12-
// p1: (1, e23, e31, e12)
13-
// p2: (e0123, e01, e02, e03)
14-
// p3: (e123, e032, e013, e021)
15-
16-
KLN_INLINE void KLN_VEC_CALL ext00(__m128 const& a,
17-
__m128 const& b,
18-
__m128& p1_out,
19-
__m128& p2_out) noexcept
20-
{
21-
// (a2 b3 - a3 b2) e23 +
22-
// (a3 b1 - a1 b3) e31 +
23-
// (a1 b2 - a2 b1) e12 +
24-
// (a0 b1 - a1 b0) e01 +
25-
// (a0 b2 - a2 b0) e02 +
26-
// (a0 b3 - a3 b0) e03
27-
28-
p1_out
29-
= _mm_mul_ps(KLN_SWIZZLE(a, 1, 3, 2, 0), KLN_SWIZZLE(b, 2, 1, 3, 0));
30-
p1_out = _mm_sub_ps(
31-
p1_out,
32-
_mm_mul_ps(KLN_SWIZZLE(a, 2, 1, 3, 0), KLN_SWIZZLE(b, 1, 3, 2, 0)));
33-
34-
p2_out = _mm_mul_ps(KLN_SWIZZLE(a, 0, 0, 0, 0), b);
35-
p2_out = _mm_sub_ps(p2_out, _mm_mul_ps(a, KLN_SWIZZLE(b, 0, 0, 0, 0)));
36-
37-
// For both outputs above, we don't zero the lowest component because
38-
// we've arranged a cancelation
39-
}
40-
41-
// NOTE: p1 ^ p0 and p0 ^ p1 produce identical results
42-
// p0: (e0, e1, e2, e3)
43-
// p3: (e123, e021, e013, e032)
44-
KLN_INLINE void KLN_VEC_CALL ext01(__m128 const& a,
45-
__m128 const& b,
46-
__m128& p0_out,
47-
__m128& p3_out) noexcept
48-
{
49-
// (a0 b0) e0 +
50-
// (a1 b0) e1 +
51-
// (a2 b0) e2 +
52-
// (a3 b0) e3 +
53-
// (a1 b1 + a2 b2 + a3 b3) e123 +
54-
// (-a0 b1) e032 +
55-
// (-a0 b2) e013 +
56-
// (-a0 b3) e021
57-
58-
p0_out = _mm_mul_ps(a, KLN_SWIZZLE(b, 0, 0, 0, 0));
59-
60-
p3_out = _mm_mul_ps(_mm_mul_ps(KLN_SWIZZLE(a, 0, 0, 0, 1), b),
61-
_mm_set_ps(-1.f, -1.f, -1.f, 0.f));
62-
63-
p3_out = _mm_add_ss(p3_out, _mm_dp_ps(a, b, 0b11100001));
64-
}
65-
66-
// p0 ^ p2 = p2 ^ p0
67-
KLN_INLINE void KLN_VEC_CALL ext02(__m128 const& a,
68-
__m128 const& b,
69-
__m128& p3_out) noexcept
70-
{
71-
// (a2 b3 - a3 b2) e032 +
72-
// (a3 b1 - a1 b3) e013 +
73-
// (a1 b2 - a2 b1) e021
74-
75-
p3_out
76-
= _mm_mul_ps(KLN_SWIZZLE(a, 1, 3, 2, 0), KLN_SWIZZLE(b, 2, 1, 3, 0));
77-
p3_out = _mm_sub_ps(
78-
p3_out,
79-
_mm_mul_ps(KLN_SWIZZLE(a, 2, 1, 3, 0), KLN_SWIZZLE(b, 1, 3, 2, 0)));
80-
}
81-
82-
// p0 ^ p3 = -p3 ^ p0
83-
template <bool Flip = false>
84-
KLN_INLINE void KLN_VEC_CALL ext03(__m128 const& a,
85-
__m128 const& b,
86-
__m128& p2_out) noexcept
87-
{
88-
// (a0 b0 + a1 b1 + a2 b2 + a3 b3) e0123
89-
p2_out = _mm_dp_ps(a, b, 0b11110001);
90-
if constexpr (Flip)
91-
{
92-
p2_out = _mm_xor_ps(p2_out, _mm_set_ss(-0.f));
93-
}
94-
}
95-
96-
KLN_INLINE void KLN_VEC_CALL ext11(__m128 const& a,
97-
__m128 const& b,
98-
__m128& p1_out) noexcept
99-
{
100-
// a0 b0 +
101-
// (a0 b1 + a1 b0) e23 +
102-
// (a0 b2 + a2 b0) e31 +
103-
// (a0 b3 + a3 b0) e12
104-
105-
p1_out = _mm_mul_ps(KLN_SWIZZLE(a, 0, 0, 0, 0), b);
106-
p1_out = _mm_add_ps(p1_out, _mm_mul_ps(a, KLN_SWIZZLE(b, 0, 0, 0, 0)));
107-
p1_out = _mm_mul_ss(p1_out, _mm_set_ss(0.5f));
108-
}
109-
110-
// p1 ^ p2 = p2 ^ p1
111-
KLN_INLINE void KLN_VEC_CALL ext12(__m128 const& a,
112-
__m128 const& b,
113-
__m128& p2_out) noexcept
114-
{
115-
// (a0 b0 + a1 b1 + a2 b2 + a3 b3) e0123 +
116-
// (a0 b1) e01 +
117-
// (a0 b2) e02 +
118-
// (a0 b3) e03
119-
p2_out = _mm_mul_ps(KLN_SWIZZLE(a, 0, 0, 0, 0), b);
120-
p2_out = _mm_add_ps(p2_out, _mm_dp_ps(a, b, 0b11100001));
121-
}
122-
123-
// p1 ^ p3 = p3 ^ p1
124-
KLN_INLINE void KLN_VEC_CALL ext13(__m128 const& a,
125-
__m128 const& b,
126-
__m128& p3_out) noexcept
127-
{
128-
// a0 b0 e123 +
129-
// a0 b1 e032 +
130-
// a0 b2 e013 +
131-
// a0 b3 e021
132-
p3_out = _mm_mul_ps(KLN_SWIZZLE(a, 0, 0, 0, 0), b);
133-
}
134-
135-
// The exterior products p2 ^ p2, p2 ^ p3, p3 ^ p2, and p3 ^ p3 all vanish
136-
} // namespace detail
137-
} // namespace kln
3+
#include "x86/x86_exterior_product.hpp"

0 commit comments

Comments
 (0)