Skip to content

Commit 8df8522

Browse files
committed
Playing with Highway code
1 parent 50f906f commit 8df8522

File tree

2 files changed

+33
-19
lines changed

2 files changed

+33
-19
lines changed

.bazeliskrc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# When bazelisk in use (as is typical, per Drake install_prereqs), this dotfile
22
# specifies which version of Bazel should be used to build and test Drake.
33
# Keep this in sync with doc/_pages/from_source.md (only the major.minor part).
4-
#USE_BAZEL_VERSION=8.0.0
5-
USE_BAZEL_VERSION=7.4.1
4+
USE_BAZEL_VERSION=8.0.0
5+
#USE_BAZEL_VERSION=7.4.1
66

77
# For some reason the google mirrors are very flaky in Drake CI in EC2, so
88
# we'll point to the GitHub mirrors instead.

math/fast_pose_composition_functions_avx2_fma.cc

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
#include <algorithm>
66
#include <cstdint>
7+
#include <iostream>
78

89
// This is the magic juju that compiles our impl functions for multiple CPUs.
910
#undef HWY_TARGET_INCLUDE
@@ -615,14 +616,21 @@ We want to perform two matrix-vector products:
615616
616617
We can do this in 6 SIMD instructions. We end up doing 40 flops and throwing
617618
10 of them away.
619+
620+
llvm-mca says 620 cycles / 100 iterations
618621
*/
619622
void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
620623
double* V_A) {
621624
const hn::FixedTag<double, 4> tag;
622625

623626
const auto abc_ = hn::LoadU(tag, R_AB); // (d is loaded but unused)
624627
const auto def_ = hn::LoadU(tag, R_AB + 3); // (g is loaded but unused)
625-
const auto ghi_ = hn::LoadN(tag, R_AB + 6, 3);
628+
629+
// llvm-mca rates this two-step implementation as half a cycle per iteration
630+
// better than the equivalent `ghi0 = hn::LoadN(tag, R_AB + 6, 3)`, which takes
631+
// 670 cycles / 100 iterations (gcc 11.4 & clang 14.0.0).
632+
const auto fghi = hn::LoadU(tag, R_AB + 5); // (f not wanted)
633+
const auto ghi0 = hn::SlideDownLanes(tag, fghi, 1);
626634

627635
const auto xxx_ = hn::Set(tag, V_B[0]);
628636
const auto yyy_ = hn::Set(tag, V_B[1]);
@@ -631,7 +639,7 @@ void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
631639
// Vector XYZ: X Y Z _
632640
auto XYZ_ = hn::Mul(abc_, xxx_); // ax bx cx _
633641
XYZ_ = hn::MulAdd(def_, yyy_, XYZ_); // +dy +ey +fy _
634-
XYZ_ = hn::MulAdd(ghi_, zzz_, XYZ_); // +gz +hz +iz _
642+
XYZ_ = hn::MulAdd(ghi0, zzz_, XYZ_); // +gz +hz +iz _
635643

636644
const auto rrr_ = hn::Set(tag, V_B[3]);
637645
const auto sss_ = hn::Set(tag, V_B[4]);
@@ -640,7 +648,7 @@ void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
640648
// Vector RST: R S T _
641649
auto RST_ = hn::Mul(abc_, rrr_); // ar br cr _
642650
RST_ = hn::MulAdd(def_, sss_, RST_); // +ds +es +fs _
643-
RST_ = hn::MulAdd(ghi_, ttt_, RST_); // +gt +ht +it _
651+
RST_ = hn::MulAdd(ghi0, ttt_, RST_); // +gt +ht +it _
644652

645653
hn::StoreU(XYZ_, tag, V_A); // 4-wide write temporarily overwrites R
646654
hn::StoreN(RST_, tag, V_A + 3, 3); // 3-wide write to stay in bounds
@@ -667,7 +675,7 @@ void CrossProductImpl(const double* w, const double* r, double* wXr) {
667675
hn::StoreN(wXr_, tag, wXr, 3);
668676
}
669677

670-
678+
/*
671679
// w x w x r
672680
void CrossCrossProductImpl(const double* w, const double* r, double* wXwXr) {
673681
const hn::FixedTag<double, 4> tag;
@@ -691,32 +699,38 @@ void CrossCrossProductImpl(const double* w, const double* r, double* wXwXr) {
691699
692700
hn::StoreN(wXwXr_, tag, wXwXr, 3);
693701
}
702+
*/
694703

704+
/*
695705
// TODO(sherm1) Untested -- does this even work?
696706
// G is a - - but symmetric, so we need columns abc, bde, cef
697707
// b d -
698708
// c e f
699-
/* This is 522 cycles according to llvm-mca */
709+
// Caution: the elements shown as '-' above might be NaN; don't compute with them.
710+
// This is 619 cycles according to llvm-mca
700711
void SymTimesVectorImpl(const double* G, const double* w, double* Gw) {
701712
const hn::FixedTag<double, 4> tag;
702-
const auto abc_ = hn::LoadU(tag, G);
703-
const auto uuu_ = hn::Set(tag, w[0]);
704-
auto Gw_ = hn::Mul(abc_, uuu_); // au bu cu _
705-
713+
const auto abc0 = hn::LoadN(tag, G, 3); // Avoid the NaN
706714
const auto c_de = hn::LoadU(tag, G + 2);
707-
const auto abde = hn::ConcatUpperLower(tag, c_de, abc_);
708-
const auto bde0 = hn::ShiftLeftLanes<1>(tag, abde);
709-
const auto vvv_ = hn::Set(tag, w[1]);
710-
Gw_ = hn::MulAdd(bde0, vvv_, Gw_); // +bv +dv +ev
715+
const double f = G[8];
716+
const auto uuuu = hn::Set(tag, w[0]);
717+
const auto vvvv = hn::Set(tag, w[1]);
718+
const auto wwww = hn::Set(tag, w[2]);
719+
720+
const auto abde = hn::ConcatUpperLower(tag, c_de, abc0);
721+
const auto bde0 = hn::SlideUpLanes(tag, abde, 1);
711722
712723
const auto ced_ = hn::Per4LaneBlockShuffle<2, 1, 3, 0>(c_de);
713-
const double f = G[8];
714724
const auto cef_ = hn::InsertLane(ced_, 1, f);
715-
const auto www_ = hn::Set(tag, w[2]);
716-
Gw_ = hn::MulAdd(cef_, www_, Gw_); // +cw +ew +fw
725+
const auto cef0 = hn::InsertLane(cef_, 0, 0.0);
717726
718-
hn::StoreN(Gw_, tag, Gw, 3);
727+
auto Gw0 = hn::Mul(abc0, uuuu); // au bu cu 0
728+
Gw0 = hn::MulAdd(bde0, vvvv, Gw0); // +bv +dv +ev 0
729+
Gw0 = hn::MulAdd(cef0, wwww, Gw0); // +cw +ew +fw 0
730+
731+
hn::StoreN(Gw0, tag, Gw, 3);
719732
}
733+
*/
720734

721735
/* This is considerably slower (617 cycles)
722736
void SymTimesVectorImpl2(const double* G, const double* w, double* Gw) {

0 commit comments

Comments
 (0)