Playing with Highway code

sherm1 · sherm1 · commit 8df852227669 · 2025-01-09T18:00:53.000-08:00
diff --git a/.bazeliskrc b/.bazeliskrc
@@ -1,8 +1,8 @@
 # When bazelisk in use (as is typical, per Drake install_prereqs), this dotfile
 # specifies which version of Bazel should be used to build and test Drake.
 # Keep the in sync with doc/_pages/from_source.md (only the major.minor part).
-#USE_BAZEL_VERSION=8.0.0
-USE_BAZEL_VERSION=7.4.1
+USE_BAZEL_VERSION=8.0.0
+#USE_BAZEL_VERSION=7.4.1
 
 # For some reason the google mirrors are very flaky in Drake CI in EC2, so
 # we'll point to the GitHub mirrors instead.
diff --git a/math/fast_pose_composition_functions_avx2_fma.cc b/math/fast_pose_composition_functions_avx2_fma.cc
@@ -4,6 +4,7 @@
 
 #include <algorithm>
 #include <cstdint>
+#include <iostream>
 
 // This is the magic juju that compiles our impl functions for multiple CPUs.
 #undef HWY_TARGET_INCLUDE
@@ -615,14 +616,21 @@ We want to perform two matrix-vector products:
 
 We can do this in 6 SIMD instructions. We end up doing 40 flops and throwing
 10 of them away.
+
+llvm-mca says 620 cycles / 100 iterations
 */
 void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
                                 double* V_A) {
   const hn::FixedTag<double, 4> tag;
 
   const auto abc_ = hn::LoadU(tag, R_AB);      // (d is loaded but unused)
   const auto def_ = hn::LoadU(tag, R_AB + 3);  // (g is loaded but unused)
-  const auto ghi_ = hn::LoadN(tag, R_AB + 6, 3);
+
+  // llvm-mca rates this two-step load as half a cycle per iteration faster
+  // than the equivalent `ghi0 = hn::LoadN(tag, R_AB + 6, 3)`, which takes 670
+  // cycles per 100 iterations rather than 620 (gcc 11.4 & clang 14.0.0).
+  const auto fghi = hn::LoadU(tag, R_AB + 5);  // (f not wanted)
+  const auto ghi0 = hn::SlideDownLanes(tag, fghi, 1);
 
   const auto xxx_ = hn::Set(tag, V_B[0]);
   const auto yyy_ = hn::Set(tag, V_B[1]);
@@ -631,7 +639,7 @@ void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
   // Vector XYZ:                            X   Y   Z   _
   auto XYZ_ = hn::Mul(abc_, xxx_);      //  ax  bx  cx  _
   XYZ_ = hn::MulAdd(def_, yyy_, XYZ_);  // +dy +ey +fy  _
-  XYZ_ = hn::MulAdd(ghi_, zzz_, XYZ_);  // +gz +hz +iz  _
+  XYZ_ = hn::MulAdd(ghi0, zzz_, XYZ_);  // +gz +hz +iz  _
 
   const auto rrr_ = hn::Set(tag, V_B[3]);
   const auto sss_ = hn::Set(tag, V_B[4]);
@@ -640,7 +648,7 @@ void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
   // Vector RST:                            R   S   T   _
   auto RST_ = hn::Mul(abc_, rrr_);      //  ar  br  cr  _
   RST_ = hn::MulAdd(def_, sss_, RST_);  // +ds +es +fs  _
-  RST_ = hn::MulAdd(ghi_, ttt_, RST_);  // +gt +ht +it  _
+  RST_ = hn::MulAdd(ghi0, ttt_, RST_);  // +gt +ht +it  _
 
   hn::StoreU(XYZ_, tag, V_A);         // 4-wide write temporarily overwrites R
   hn::StoreN(RST_, tag, V_A + 3, 3);  // 3-wide write to stay in bounds
@@ -667,7 +675,7 @@ void CrossProductImpl(const double* w, const double* r, double* wXr) {
   hn::StoreN(wXr_, tag, wXr, 3);
 }
 
-
+/*
 // w x w x r
 void CrossCrossProductImpl(const double* w, const double* r, double* wXwXr) {
   const hn::FixedTag<double, 4> tag;
@@ -691,32 +699,38 @@ void CrossCrossProductImpl(const double* w, const double* r, double* wXwXr) {
 
   hn::StoreN(wXwXr_, tag, wXwXr, 3);
 }
+*/
 
+/*
 // TODO(sherm1) Untested -- does this even work?
 // G is a - - but symmetric, so we need columns abc, bde, cef
 //      b d -
 //      c e f
-/* This is 522 cycles according to llvm-mca */
+// Caution: the '-' (mirrored) elements might be NaN; don't compute with them.
+// This is 619 cycles according to llvm-mca
 void SymTimesVectorImpl(const double* G, const double* w, double* Gw) {
   const hn::FixedTag<double, 4> tag;
-  const auto abc_ = hn::LoadU(tag, G);
-  const auto uuu_ = hn::Set(tag, w[0]);
-  auto Gw_ = hn::Mul(abc_, uuu_);  // au bu cu _
-
+  const auto abc0 = hn::LoadN(tag, G, 3);  // Avoid the NaN
   const auto c_de = hn::LoadU(tag, G + 2);
-  const auto abde = hn::ConcatUpperLower(tag, c_de, abc_);
-  const auto bde0 = hn::ShiftLeftLanes<1>(tag, abde);
-  const auto vvv_ = hn::Set(tag, w[1]);
-  Gw_ = hn::MulAdd(bde0, vvv_, Gw_);  // +bv +dv +ev
+  const double f = G[8];
+  const auto uuuu = hn::Set(tag, w[0]);
+  const auto vvvv = hn::Set(tag, w[1]);
+  const auto wwww = hn::Set(tag, w[2]);
+
+  const auto abde = hn::ConcatUpperLower(tag, c_de, abc0);
+  const auto bde0 = hn::SlideUpLanes(tag, abde, 1);
 
   const auto ced_ = hn::Per4LaneBlockShuffle<2, 1, 3, 0>(c_de);
-  const double f = G[8];
   const auto cef_ = hn::InsertLane(ced_, 1, f);
-  const auto www_ = hn::Set(tag, w[2]);
-  Gw_ = hn::MulAdd(cef_, www_, Gw_);  // +cw +ew +fw
+  const auto cef0 = hn::InsertLane(cef_, 0, 0.0);
 
-  hn::StoreN(Gw_, tag, Gw, 3);
+  auto Gw0 = hn::Mul(abc0, uuuu);     //  au  bu  cu 0
+  Gw0 = hn::MulAdd(bde0, vvvv, Gw0);  // +bv +dv +ev 0
+  Gw0 = hn::MulAdd(cef0, wwww, Gw0);  // +cw +ew +fw 0
+
+  hn::StoreN(Gw0, tag, Gw, 3);
 }
+*/
 
 /* This is considerably slower (617 cycles)
 void SymTimesVectorImpl2(const double* G, const double* w, double* Gw) {