Skip to content

Commit 0cda315

Browse files
committed
Adds ReexpressSpatialVector
1 parent 47b96b9 commit 0cda315

8 files changed

+154
-14
lines changed

.bazeliskrc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
# specifies which version of Bazel should be used to build and test Drake.
33
# Keep the in sync with doc/_pages/from_source.md (only the major.minor part).
44
USE_BAZEL_VERSION=8.0.0
5+
#USE_BAZEL_VERSION=7.4.1
56

67
# For some reason the google mirrors are very flaky in Drake CI in EC2, so
78
# we'll point to the GitHub mirrors instead.

math/fast_pose_composition_functions.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
#pragma once
22

3+
#include "drake/common/eigen_types.h"
4+
35
namespace drake {
46
namespace math {
57

@@ -66,6 +68,17 @@ void ComposeXinvX(const RigidTransform<double>& X_BA,
6668
const RigidTransform<double>& X_BC,
6769
RigidTransform<double>* X_AC);
6870

71+
// TODO(sherm1) Consider making the signature take SpatialVector instead of
72+
// Vector6.
73+
/* Re-express a spatial vector via `S_A = R_AB * S_B`. A spatial vector is
74+
a Vector6 composed of two 3-element Vector3's that are re-expressed
75+
independently. It is OK if S_A is exactly the same memory as S_B, but not if
76+
they partially overlap or overlap with R_AB.
77+
This requires 30 floating point operations but can be done more efficiently
78+
exploing SIMD instructions when available. */
79+
void ReexpressSpatialVector(const RotationMatrix<double>& R_AB,
80+
const Vector6<double>& V_B, Vector6<double>* V_A);
81+
6982
} // namespace internal
7083
} // namespace math
7184
} // namespace drake

math/fast_pose_composition_functions_avx2_fma.cc

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include <algorithm>
66
#include <cstdint>
7+
#include <iostream>
78

89
// This is the magic juju that compiles our impl functions for multiple CPUs.
910
#undef HWY_TARGET_INCLUDE
@@ -591,6 +592,68 @@ void ComposeXinvXImpl(const double* X_BA, const double* X_BC, double* X_AC) {
591592
hn::StoreN(stu_, tag, X_AC + 9, 3); // 3-wide write to stay in bounds
592593
}
593594

595+
/* Re-express SpatialVector via V_A = R_AB * V_B.
596+
597+
R_AB is 9 consecutive doubles in column-major order, the vectors are 6
598+
consecutive elements comprising two independent 3-vectors. It is allowable for
599+
V_A and V_B to be the same memory.
600+
601+
R_AB = abcdefghi
602+
V_B = xyzrst
603+
V_A = XYZRST
604+
605+
We want to perform two matrix-vector products:
606+
607+
V_A R_AB V_B
608+
609+
X a d g x
610+
Y = b e h @ y 15 flops
611+
Z c f i z
612+
613+
R a d g r
614+
S = b e h @ s 15 flops
615+
T c f i t
616+
617+
We can do this in 6 SIMD instructions. We end up doing 40 flops and throwing
618+
10 of them away.
619+
620+
llvm-mca says 620 cycles / 100 iterations
621+
*/
622+
void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
623+
double* V_A) {
624+
const hn::FixedTag<double, 4> tag;
625+
626+
const auto abc_ = hn::LoadU(tag, R_AB); // (d is loaded but unused)
627+
const auto def_ = hn::LoadU(tag, R_AB + 3); // (g is loaded but unused)
628+
629+
// Llvm-mca rates this two-step implementation as a half-cycle better than
630+
// the equivalent `ghi0 = hn::LoadN(tag, R_AB + 6, 3)` which gives 670/100
631+
// cycles (gcc 11.4 & clang 14.0.0).
632+
const auto fghi = hn::LoadU(tag, R_AB + 5); // (f not wanted)
633+
const auto ghi0 = hn::SlideDownLanes(tag, fghi, 1);
634+
635+
const auto xxx_ = hn::Set(tag, V_B[0]);
636+
const auto yyy_ = hn::Set(tag, V_B[1]);
637+
const auto zzz_ = hn::Set(tag, V_B[2]);
638+
639+
// Vector XYZ: X Y Z _
640+
auto XYZ_ = hn::Mul(abc_, xxx_); // ax bx cx _
641+
XYZ_ = hn::MulAdd(def_, yyy_, XYZ_); // +dy +ey +fy _
642+
XYZ_ = hn::MulAdd(ghi0, zzz_, XYZ_); // +gz +hz +iz _
643+
644+
const auto rrr_ = hn::Set(tag, V_B[3]);
645+
const auto sss_ = hn::Set(tag, V_B[4]);
646+
const auto ttt_ = hn::Set(tag, V_B[5]);
647+
648+
// Vector RST: R S T _
649+
auto RST_ = hn::Mul(abc_, rrr_); // ar br cr _
650+
RST_ = hn::MulAdd(def_, sss_, RST_); // +ds +es +fs _
651+
RST_ = hn::MulAdd(ghi0, ttt_, RST_); // +gt +ht +it _
652+
653+
hn::StoreU(XYZ_, tag, V_A); // 4-wide write temporarily overwrites R
654+
hn::StoreN(RST_, tag, V_A + 3, 3); // 3-wide write to stay in bounds
655+
}
656+
594657
#else // HWY_MAX_BYTES
595658

596659
/* The portable versions are always defined. They should be written to maximize
@@ -698,6 +761,24 @@ void ComposeXinvXImpl(const double* X_BA, const double* X_BC, double* X_AC) {
698761
std::copy(X_AC_temp, X_AC_temp + 12, X_AC);
699762
}
700763

764+
void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
765+
double* V_A) {
766+
DRAKE_ASSERT(V_A != nullptr);
767+
double x, y, z; // Protect from overlap with V_B.
768+
x = row_x_col(&R_AB[0], &V_B[0]);
769+
y = row_x_col(&R_AB[1], &V_B[0]);
770+
z = row_x_col(&R_AB[2], &V_B[0]);
771+
V_A[0] = x;
772+
V_A[1] = y;
773+
V_A[2] = z;
774+
x = row_x_col(&R_AB[0], &V_B[3]);
775+
y = row_x_col(&R_AB[1], &V_B[3]);
776+
z = row_x_col(&R_AB[2], &V_B[3]);
777+
V_A[3] = x;
778+
V_A[4] = y;
779+
V_A[5] = z;
780+
}
781+
701782
#endif // HWY_MAX_BYTES
702783

703784
} // namespace HWY_NAMESPACE
@@ -732,6 +813,10 @@ HWY_EXPORT(ComposeXinvXImpl);
732813
struct ChooseBestComposeXinvX {
733814
auto operator()() { return HWY_DYNAMIC_POINTER(ComposeXinvXImpl); }
734815
};
816+
HWY_EXPORT(ReexpressSpatialVectorImpl);
817+
struct ChooseBestReexpressSpatialVector {
818+
auto operator()() { return HWY_DYNAMIC_POINTER(ReexpressSpatialVectorImpl); }
819+
};
735820

736821
// These sugar functions convert C++ types into bare arrays.
737822
const double* GetRawData(const RotationMatrix<double>& R) {
@@ -750,6 +835,14 @@ double* GetRawData(RigidTransform<double>* X) {
750835
// the rotation matrix first followed by the translation.
751836
return const_cast<double*>(X->rotation().matrix().data());
752837
}
838+
template <int N>
839+
const double* GetRawData(const Vector<double, N>& v) {
840+
return v.data();
841+
}
842+
template <int N>
843+
double* GetRawData(Vector<double, N>* v) {
844+
return v->data();
845+
}
753846

754847
} // namespace
755848

@@ -781,6 +874,12 @@ void ComposeXinvX(const RigidTransform<double>& X_BA,
781874
GetRawData(X_BA), GetRawData(X_BC), GetRawData(X_AC));
782875
}
783876

877+
void ReexpressSpatialVector(const RotationMatrix<double>& R_AB,
878+
const Vector6<double>& V_B, Vector6<double>* V_A) {
879+
LateBoundFunction<ChooseBestReexpressSpatialVector>::Call(
880+
GetRawData(R_AB), GetRawData(V_B), GetRawData(V_A));
881+
}
882+
784883
} // namespace internal
785884
} // namespace math
786885
} // namespace drake

math/test/fast_pose_composition_functions_test.cc

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,23 @@ TEST_P(FastPoseCompositionFunctions, ComposeXX) {
171171
EXPECT_TRUE(CompareMatrices(arg3->GetAsMatrix4(), expected));
172172
}
173173

174+
GTEST_TEST(FastReexpressSpatialVector, ReexpressSpatialVector) {
175+
constexpr double kTolerance = 8 * std::numeric_limits<double>::epsilon();
176+
const RotationMatrixd R_AB(RollPitchYawd(1.0, 1.25, 1.5));
177+
Vector6d V_B;
178+
V_B << 1.0, 2.0, 3.0, 4.0, 5.0, 6.0;
179+
Vector6d V_A_expected;
180+
V_A_expected.head<3>() = R_AB * V_B.head<3>();
181+
V_A_expected.tail<3>() = R_AB * V_B.tail<3>();
182+
Vector6d V_A;
183+
ReexpressSpatialVector(R_AB, V_B, &V_A);
184+
EXPECT_TRUE(CompareMatrices(V_A, V_A_expected, kTolerance));
185+
186+
// Ensure that it works when V_A and V_B are the same vector.
187+
ReexpressSpatialVector(R_AB, V_B, &V_B);
188+
EXPECT_TRUE(CompareMatrices(V_B, V_A_expected, kTolerance));
189+
}
190+
174191
} // namespace
175192
} // namespace internal
176193
} // namespace math

multibody/math/spatial_acceleration.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ class SpatialAcceleration : public SpatialVector<SpatialAcceleration, T> {
134134
const Vector3<T>& alpha_MB_E = this->rotational();
135135
// Calculate point Co's translational acceleration measured in M.
136136
Vector3<T>& a_MCo_E = this->translational();
137-
a_MCo_E += (alpha_MB_E.cross(p_BoCo_E)
137+
a_MCo_E += (alpha_MB_E.cross(p_BoCo_E) // 33 flops total
138138
+ w_MB_E.cross(w_MB_E.cross(p_BoCo_E)));
139139
}
140140

multibody/math/spatial_vector.h

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -278,7 +278,15 @@ class SpatialVector {
278278
/// </pre>
279279
friend SpatialQuantity operator*(
280280
const math::RotationMatrix<T>& R_FE, const SpatialQuantity& V_E) {
281-
return SpatialQuantity(R_FE * V_E.rotational(), R_FE * V_E.translational());
281+
if constexpr (std::is_same_v<T, double>) {
282+
SpatialQuantity V_F;
283+
math::internal::ReexpressSpatialVector(R_FE, V_E.get_coeffs(),
284+
&V_F.get_coeffs());
285+
return V_F;
286+
} else {
287+
return SpatialQuantity(R_FE * V_E.rotational(),
288+
R_FE * V_E.translational());
289+
}
282290
}
283291

284292
/// Factory to create a _zero_ spatial vector, i.e., a %SpatialVector whose

multibody/tree/body_node_impl.cc

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -279,7 +279,8 @@ void BodyNodeImpl<T, ConcreteMobilizer>::CalcVelocityKinematicsCache2_BaseToTip(
279279
const SpatialVelocity<T> V_WMc_Mp = V_WMp_Mp.Shift(p_MpM_Mp); // 15 flops
280280

281281
SpatialVelocity<T>& V_WM_M = vc2->get_mutable_V_WM_M(index);
282-
V_WM_M = R_MMp * V_WMc_Mp + V_FM_M; // 36 flops
282+
V_WM_M = R_MMp * V_WMc_Mp // 6 SIMD flops
283+
+ V_FM_M; // 6 flops
283284
}
284285

285286
// As a guideline for developers, a summary of the computations performed in
@@ -439,7 +440,7 @@ void BodyNodeImpl<T, ConcreteMobilizer>::CalcSpatialAcceleration2_BaseToTip(
439440
const math::RotationMatrix<T> R_MMp = X_MpM.rotation().inverse();
440441
const Vector3<T>& w_WP_Mp = V_WMp_Mp.rotational(); // all frames on P same ω
441442

442-
// Start with parent contribution. 42 flops for shift + 30 for re-express
443+
// Start with parent contribution. shift: 33 flops, reexpress: 6 SIMD flops
443444
A_WM_M = R_MMp * A_WMp_Mp.Shift(p_MpM_Mp, w_WP_Mp); // parent contribution
444445

445446
// Next add in the velocity-dependent bias acceleration:
@@ -672,17 +673,18 @@ void BodyNodeImpl<T, ConcreteMobilizer>::CalcInverseDynamics2_TipToBase(
672673
// Unfortunately the applied force is given at Bo and expressed in W. We
673674
// need it applied at Mo and expressed in M.
674675
const SpatialForce<T>& Fapp_BBo_W = Fapplied_Bo_W_array[index];
675-
const SpatialForce<T> Fapp_BBo_M = R_MW * Fapp_BBo_W; // 30 flops
676+
const SpatialForce<T> Fapp_BBo_M = R_MW * Fapp_BBo_W; // 6 SIMD flops
676677
const SpatialForce<T> Fapp_BMo_M = Fapp_BBo_M.Shift(-p_MoBo_M); // 15 flops
677678

678-
// Calculate the velocity-dependent bias force on B (57 flops).
679+
// Calculate the velocity-dependent bias force on B (48 flops).
679680
const SpatialForce<T> Fbias_BMo_M(
680-
mass * w_WM_M.cross(G_BMo_M * w_WM_M), // bias moment, 30 flops
681-
mass * w_WM_M.cross(w_WM_M.cross(p_MoBcm_M))); // bias force, 27 flops
681+
mass * w_WM_M.cross(G_BMo_M * w_WM_M), // bias moment, 27 flops
682+
mass * w_WM_M.cross(w_WM_M.cross(p_MoBcm_M))); // bias force, 21 flops
682683

683684
// F_BMo_M is an output and will be the total force on B.
684685
SpatialForce<T>& F_BMo_M = (*F_BMo_M_array)[index];
685-
F_BMo_M = M_BMo_M * A_WM_M + Fbias_BMo_M - Fapp_BMo_M; // 78 flops
686+
F_BMo_M = M_BMo_M * A_WM_M // 45 flops
687+
+ Fbias_BMo_M - Fapp_BMo_M; // 12 flops
686688

687689
// Next, account for forces applied to B through its outboard joints, treated
688690
// as though they were locked. Since we're going tip to base we already have
@@ -696,7 +698,7 @@ void BodyNodeImpl<T, ConcreteMobilizer>::CalcInverseDynamics2_TipToBase(
696698
const Vector3<T>& p_MoMc_M = X_MMc.translation();
697699

698700
const SpatialForce<T>& F_CMc_Mc = (*F_BMo_M_array)[outboard_index];
699-
const SpatialForce<T> F_CMc_M = R_MMc * F_CMc_Mc; // 30 flops
701+
const SpatialForce<T> F_CMc_M = R_MMc * F_CMc_Mc; // 6 SIMD flops
700702
const SpatialForce<T> F_CMo_M = F_CMc_M.Shift(-p_MoMc_M); // 15 flops
701703
F_BMo_M += F_CMo_M; // 6 flops
702704
}

multibody/tree/spatial_inertia.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -448,10 +448,10 @@ SpatialForce<T> SpatialInertia<T>::operator*(
448448
return SpatialForce<T>(
449449
// Note: p_PScm_E here is p_BoBcm in the above notation.
450450
// Rotational
451-
mass_ * (G_SP_E_ * alpha_WB_E + p_PScm_E_.cross(a_WBo_E)),
451+
mass_ * (G_SP_E_ * alpha_WB_E + p_PScm_E_.cross(a_WBo_E)), // 30 flops
452452
// Translational: notice the order of the cross product is the reversed
453453
// of the documentation above and thus no minus sign is needed.
454-
mass_ * (alpha_WB_E.cross(p_PScm_E_) + a_WBo_E));
454+
mass_ * (alpha_WB_E.cross(p_PScm_E_) + a_WBo_E)); // 15 flops
455455
}
456456

457457
template <typename T>
@@ -466,10 +466,10 @@ SpatialMomentum<T> SpatialInertia<T>::operator*(
466466
return SpatialMomentum<T>(
467467
// Note: p_PScm_E here is p_BoBcm in the above notation.
468468
// Rotational
469-
mass_ * (G_SP_E_ * w_WB_E + p_PScm_E_.cross(v_WP_E)),
469+
mass_ * (G_SP_E_ * w_WB_E + p_PScm_E_.cross(v_WP_E)), // 30 flops
470470
// Translational: notice the order of the cross product is the reversed
471471
// of the documentation above and thus no minus sign is needed.
472-
mass_ * (w_WB_E.cross(p_PScm_E_) + v_WP_E));
472+
mass_ * (w_WB_E.cross(p_PScm_E_) + v_WP_E)); // 15 flops
473473
}
474474

475475
template <typename T>

0 commit comments

Comments
 (0)