Adds ReexpressSpatialVector

sherm1 · sherm1 · commit 0cda3158e64e · 2025-01-13T14:26:22.000-08:00
diff --git a/.bazeliskrc b/.bazeliskrc
@@ -2,6 +2,7 @@
 # specifies which version of Bazel should be used to build and test Drake.
 # Keep the in sync with doc/_pages/from_source.md (only the major.minor part).
 USE_BAZEL_VERSION=8.0.0
+#USE_BAZEL_VERSION=7.4.1
 
 # For some reason the google mirrors are very flaky in Drake CI in EC2, so
 # we'll point to the GitHub mirrors instead.
diff --git a/math/fast_pose_composition_functions.h b/math/fast_pose_composition_functions.h
@@ -1,5 +1,7 @@
 #pragma once
 
+#include "drake/common/eigen_types.h"
+
 namespace drake {
 namespace math {
 
@@ -66,6 +68,17 @@ void ComposeXinvX(const RigidTransform<double>& X_BA,
                   const RigidTransform<double>& X_BC,
                   RigidTransform<double>* X_AC);
 
+// TODO(sherm1) Consider making the signature take SpatialVector instead of
+//  Vector6.
+/* Re-express a spatial vector via `S_A = R_AB * S_B`. A spatial vector is
+a Vector6 composed of two 3-element Vector3's that are re-expressed
+independently. It is OK if S_A is exactly the same memory as S_B, but not if
+they partially overlap or overlap with R_AB.
+This requires 30 floating point operations but can be done more efficiently
+exploing SIMD instructions when available. */
+void ReexpressSpatialVector(const RotationMatrix<double>& R_AB,
+                            const Vector6<double>& V_B, Vector6<double>* V_A);
+
 }  // namespace internal
 }  // namespace math
 }  // namespace drake
diff --git a/math/fast_pose_composition_functions_avx2_fma.cc b/math/fast_pose_composition_functions_avx2_fma.cc
@@ -4,6 +4,7 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <iostream>
 
 // This is the magic juju that compiles our impl functions for multiple CPUs.
 #undef HWY_TARGET_INCLUDE
@@ -591,6 +592,68 @@ void ComposeXinvXImpl(const double* X_BA, const double* X_BC, double* X_AC) {
   hn::StoreN(stu_, tag, X_AC + 9, 3);  // 3-wide write to stay in bounds
 }
 
+/* Re-express SpatialVector via V_A = R_AB * V_B.
+
+R_AB is 9 consecutive doubles in column-major order, the vectors are 6
+consecutive elements comprising two independent 3-vectors. It is allowable for
+V_A and V_B to be the same memory.
+
+  R_AB = abcdefghi
+  V_B = xyzrst
+  V_A = XYZRST
+
+We want to perform two matrix-vector products:
+
+   V_A     R_AB    V_B
+
+    X     a d g     x
+    Y  =  b e h  @  y      15 flops
+    Z     c f i     z
+
+    R     a d g     r
+    S  =  b e h  @  s      15 flops
+    T     c f i     t
+
+We can do this in 6 SIMD instructions. We end up doing 40 flops and throwing
+10 of them away.
+
+llvm-mca says 620 cycles / 100 iterations
+*/
+void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
+                                double* V_A) {
+  const hn::FixedTag<double, 4> tag;
+
+  const auto abc_ = hn::LoadU(tag, R_AB);      // (d is loaded but unused)
+  const auto def_ = hn::LoadU(tag, R_AB + 3);  // (g is loaded but unused)
+
+  // Llvm-mca rates this two-step implementation as a half-cycle better than
+  // the equivalent `ghi0 = hn::LoadN(tag, R_AB + 6, 3)` which gives 670/100
+  // cycles (gcc 11.4 & clang 14.0.0).
+  const auto fghi = hn::LoadU(tag, R_AB + 5);  // (f not wanted)
+  const auto ghi0 = hn::SlideDownLanes(tag, fghi, 1);
+
+  const auto xxx_ = hn::Set(tag, V_B[0]);
+  const auto yyy_ = hn::Set(tag, V_B[1]);
+  const auto zzz_ = hn::Set(tag, V_B[2]);
+
+  // Vector XYZ:                            X   Y   Z   _
+  auto XYZ_ = hn::Mul(abc_, xxx_);      //  ax  bx  cx  _
+  XYZ_ = hn::MulAdd(def_, yyy_, XYZ_);  // +dy +ey +fy  _
+  XYZ_ = hn::MulAdd(ghi0, zzz_, XYZ_);  // +gz +hz +iz  _
+
+  const auto rrr_ = hn::Set(tag, V_B[3]);
+  const auto sss_ = hn::Set(tag, V_B[4]);
+  const auto ttt_ = hn::Set(tag, V_B[5]);
+
+  // Vector RST:                            R   S   T   _
+  auto RST_ = hn::Mul(abc_, rrr_);      //  ar  br  cr  _
+  RST_ = hn::MulAdd(def_, sss_, RST_);  // +ds +es +fs  _
+  RST_ = hn::MulAdd(ghi0, ttt_, RST_);  // +gt +ht +it  _
+
+  hn::StoreU(XYZ_, tag, V_A);         // 4-wide write temporarily overwrites R
+  hn::StoreN(RST_, tag, V_A + 3, 3);  // 3-wide write to stay in bounds
+}
+
 #else  // HWY_MAX_BYTES
 
 /* The portable versions are always defined. They should be written to maximize
@@ -698,6 +761,24 @@ void ComposeXinvXImpl(const double* X_BA, const double* X_BC, double* X_AC) {
   std::copy(X_AC_temp, X_AC_temp + 12, X_AC);
 }
 
+void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
+                                double* V_A) {
+  DRAKE_ASSERT(V_A != nullptr);
+  double x, y, z;  // Protect from overlap with V_B.
+  x = row_x_col(&R_AB[0], &V_B[0]);
+  y = row_x_col(&R_AB[1], &V_B[0]);
+  z = row_x_col(&R_AB[2], &V_B[0]);
+  V_A[0] = x;
+  V_A[1] = y;
+  V_A[2] = z;
+  x = row_x_col(&R_AB[0], &V_B[3]);
+  y = row_x_col(&R_AB[1], &V_B[3]);
+  z = row_x_col(&R_AB[2], &V_B[3]);
+  V_A[3] = x;
+  V_A[4] = y;
+  V_A[5] = z;
+}
+
 #endif  // HWY_MAX_BYTES
 
 }  // namespace HWY_NAMESPACE
@@ -732,6 +813,10 @@ HWY_EXPORT(ComposeXinvXImpl);
 struct ChooseBestComposeXinvX {
   auto operator()() { return HWY_DYNAMIC_POINTER(ComposeXinvXImpl); }
 };
+HWY_EXPORT(ReexpressSpatialVectorImpl);
+struct ChooseBestReexpressSpatialVector {
+  auto operator()() { return HWY_DYNAMIC_POINTER(ReexpressSpatialVectorImpl); }
+};
 
 // These sugar functions convert C++ types into bare arrays.
 const double* GetRawData(const RotationMatrix<double>& R) {
@@ -750,6 +835,14 @@ double* GetRawData(RigidTransform<double>* X) {
   // the rotation matrix first followed by the translation.
   return const_cast<double*>(X->rotation().matrix().data());
 }
+template <int N>
+const double* GetRawData(const Vector<double, N>& v) {
+  return v.data();
+}
+template <int N>
+double* GetRawData(Vector<double, N>* v) {
+  return v->data();
+}
 
 }  // namespace
 
@@ -781,6 +874,12 @@ void ComposeXinvX(const RigidTransform<double>& X_BA,
       GetRawData(X_BA), GetRawData(X_BC), GetRawData(X_AC));
 }
 
+void ReexpressSpatialVector(const RotationMatrix<double>& R_AB,
+                            const Vector6<double>& V_B, Vector6<double>* V_A) {
+  LateBoundFunction<ChooseBestReexpressSpatialVector>::Call(
+      GetRawData(R_AB), GetRawData(V_B), GetRawData(V_A));
+}
+
 }  // namespace internal
 }  // namespace math
 }  // namespace drake
diff --git a/math/test/fast_pose_composition_functions_test.cc b/math/test/fast_pose_composition_functions_test.cc
@@ -171,6 +171,23 @@ TEST_P(FastPoseCompositionFunctions, ComposeXX) {
   EXPECT_TRUE(CompareMatrices(arg3->GetAsMatrix4(), expected));
 }
 
+GTEST_TEST(FastReexpressSpatialVector, ReexpressSpatialVector) {
+  constexpr double kTolerance = 8 * std::numeric_limits<double>::epsilon();
+  const RotationMatrixd R_AB(RollPitchYawd(1.0, 1.25, 1.5));
+  Vector6d V_B;
+  V_B << 1.0, 2.0, 3.0, 4.0, 5.0, 6.0;
+  Vector6d V_A_expected;
+  V_A_expected.head<3>() = R_AB * V_B.head<3>();
+  V_A_expected.tail<3>() = R_AB * V_B.tail<3>();
+  Vector6d V_A;
+  ReexpressSpatialVector(R_AB, V_B, &V_A);
+  EXPECT_TRUE(CompareMatrices(V_A, V_A_expected, kTolerance));
+
+  // Ensure that it works when V_A and V_B are the same vector.
+  ReexpressSpatialVector(R_AB, V_B, &V_B);
+  EXPECT_TRUE(CompareMatrices(V_B, V_A_expected, kTolerance));
+}
+
 }  // namespace
 }  // namespace internal
 }  // namespace math
diff --git a/multibody/math/spatial_acceleration.h b/multibody/math/spatial_acceleration.h
@@ -134,7 +134,7 @@ class SpatialAcceleration : public SpatialVector<SpatialAcceleration, T> {
     const Vector3<T>& alpha_MB_E = this->rotational();
     // Calculate point Co's translational acceleration measured in M.
     Vector3<T>& a_MCo_E = this->translational();
-    a_MCo_E += (alpha_MB_E.cross(p_BoCo_E)
+    a_MCo_E += (alpha_MB_E.cross(p_BoCo_E)  // 33 flops total
             +   w_MB_E.cross(w_MB_E.cross(p_BoCo_E)));
   }
 
diff --git a/multibody/math/spatial_vector.h b/multibody/math/spatial_vector.h
@@ -278,7 +278,15 @@ class SpatialVector {
   /// </pre>
   friend SpatialQuantity operator*(
       const math::RotationMatrix<T>& R_FE, const SpatialQuantity& V_E) {
-    return SpatialQuantity(R_FE * V_E.rotational(), R_FE * V_E.translational());
+    if constexpr (std::is_same_v<T, double>) {
+      SpatialQuantity V_F;
+      math::internal::ReexpressSpatialVector(R_FE, V_E.get_coeffs(),
+                                             &V_F.get_coeffs());
+      return V_F;
+    } else {
+      return SpatialQuantity(R_FE * V_E.rotational(),
+                             R_FE * V_E.translational());
+    }
   }
 
   /// Factory to create a _zero_ spatial vector, i.e., a %SpatialVector whose
diff --git a/multibody/tree/body_node_impl.cc b/multibody/tree/body_node_impl.cc
@@ -279,7 +279,8 @@ void BodyNodeImpl<T, ConcreteMobilizer>::CalcVelocityKinematicsCache2_BaseToTip(
   const SpatialVelocity<T> V_WMc_Mp = V_WMp_Mp.Shift(p_MpM_Mp);  // 15 flops
 
   SpatialVelocity<T>& V_WM_M = vc2->get_mutable_V_WM_M(index);
-  V_WM_M = R_MMp * V_WMc_Mp + V_FM_M;  // 36 flops
+  V_WM_M = R_MMp * V_WMc_Mp  // 6 SIMD flops
+           + V_FM_M;         // 6 flops
 }
 
 // As a guideline for developers, a summary of the computations performed in
@@ -439,7 +440,7 @@ void BodyNodeImpl<T, ConcreteMobilizer>::CalcSpatialAcceleration2_BaseToTip(
   const math::RotationMatrix<T> R_MMp = X_MpM.rotation().inverse();
   const Vector3<T>& w_WP_Mp = V_WMp_Mp.rotational();  // all frames on P same ω
 
-  // Start with parent contribution. 42 flops for shift + 30 for re-express
+  // Start with parent contribution. shift: 33 flops, reexpress: 6 SIMD flops
   A_WM_M = R_MMp * A_WMp_Mp.Shift(p_MpM_Mp, w_WP_Mp);  // parent contribution
 
   // Next add in the velocity-dependent bias acceleration:
@@ -672,17 +673,18 @@ void BodyNodeImpl<T, ConcreteMobilizer>::CalcInverseDynamics2_TipToBase(
   // Unfortunately the applied force is given at Bo and expressed in W. We
   // need it applied at Mo and expressed in M.
   const SpatialForce<T>& Fapp_BBo_W = Fapplied_Bo_W_array[index];
-  const SpatialForce<T> Fapp_BBo_M = R_MW * Fapp_BBo_W;            // 30 flops
+  const SpatialForce<T> Fapp_BBo_M = R_MW * Fapp_BBo_W;  // 6 SIMD flops
   const SpatialForce<T> Fapp_BMo_M = Fapp_BBo_M.Shift(-p_MoBo_M);  // 15 flops
 
-  // Calculate the velocity-dependent bias force on B (57 flops).
+  // Calculate the velocity-dependent bias force on B (48 flops).
   const SpatialForce<T> Fbias_BMo_M(
-      mass * w_WM_M.cross(G_BMo_M * w_WM_M),          // bias moment, 30 flops
-      mass * w_WM_M.cross(w_WM_M.cross(p_MoBcm_M)));  // bias force, 27 flops
+      mass * w_WM_M.cross(G_BMo_M * w_WM_M),          // bias moment, 27 flops
+      mass * w_WM_M.cross(w_WM_M.cross(p_MoBcm_M)));  // bias force, 21 flops
 
   // F_BMo_M is an output and will be the total force on B.
   SpatialForce<T>& F_BMo_M = (*F_BMo_M_array)[index];
-  F_BMo_M = M_BMo_M * A_WM_M + Fbias_BMo_M - Fapp_BMo_M;  // 78 flops
+  F_BMo_M = M_BMo_M * A_WM_M             // 45 flops
+            + Fbias_BMo_M - Fapp_BMo_M;  // 12 flops
 
   // Next, account for forces applied to B through its outboard joints, treated
   // as though they were locked. Since we're going tip to base we already have
@@ -696,7 +698,7 @@ void BodyNodeImpl<T, ConcreteMobilizer>::CalcInverseDynamics2_TipToBase(
     const Vector3<T>& p_MoMc_M = X_MMc.translation();
 
     const SpatialForce<T>& F_CMc_Mc = (*F_BMo_M_array)[outboard_index];
-    const SpatialForce<T> F_CMc_M = R_MMc * F_CMc_Mc;          // 30 flops
+    const SpatialForce<T> F_CMc_M = R_MMc * F_CMc_Mc;          // 6 SIMD flops
     const SpatialForce<T> F_CMo_M = F_CMc_M.Shift(-p_MoMc_M);  // 15 flops
     F_BMo_M += F_CMo_M;                                        // 6 flops
   }
diff --git a/multibody/tree/spatial_inertia.cc b/multibody/tree/spatial_inertia.cc
@@ -448,10 +448,10 @@ SpatialForce<T> SpatialInertia<T>::operator*(
   return SpatialForce<T>(
       // Note: p_PScm_E here is p_BoBcm in the above notation.
       // Rotational
-      mass_ * (G_SP_E_ * alpha_WB_E + p_PScm_E_.cross(a_WBo_E)),
+      mass_ * (G_SP_E_ * alpha_WB_E + p_PScm_E_.cross(a_WBo_E)),  // 30 flops
       // Translational: notice the order of the cross product is the reversed
       // of the documentation above and thus no minus sign is needed.
-      mass_ * (alpha_WB_E.cross(p_PScm_E_) + a_WBo_E));
+      mass_ * (alpha_WB_E.cross(p_PScm_E_) + a_WBo_E));  // 15 flops
 }
 
 template <typename T>
@@ -466,10 +466,10 @@ SpatialMomentum<T> SpatialInertia<T>::operator*(
   return SpatialMomentum<T>(
       // Note: p_PScm_E here is p_BoBcm in the above notation.
       // Rotational
-      mass_ * (G_SP_E_ * w_WB_E + p_PScm_E_.cross(v_WP_E)),
+      mass_ * (G_SP_E_ * w_WB_E + p_PScm_E_.cross(v_WP_E)),  // 30 flops
       // Translational: notice the order of the cross product is the reversed
       // of the documentation above and thus no minus sign is needed.
-      mass_ * (w_WB_E.cross(p_PScm_E_) + v_WP_E));
+      mass_ * (w_WB_E.cross(p_PScm_E_) + v_WP_E));  // 15 flops
 }
 
 template <typename T>

Original file line number	Diff line number	Diff line change
`@@ -134,7 +134,7 @@ class SpatialAcceleration : public SpatialVector<SpatialAcceleration, T> {`
`134`	`134`	`const Vector3<T>& alpha_MB_E = this->rotational();`
`135`	`135`	`// Calculate point Co's translational acceleration measured in M.`
`136`	`136`	`Vector3<T>& a_MCo_E = this->translational();`
`137`		`- a_MCo_E += (alpha_MB_E.cross(p_BoCo_E)`
	`137`	`+ a_MCo_E += (alpha_MB_E.cross(p_BoCo_E) // 33 flops total`
`138`	`138`	`+ w_MB_E.cross(w_MB_E.cross(p_BoCo_E)));`
`139`	`139`	`}`
`140`	`140`