4
4
5
5
#include < algorithm>
6
6
#include < cstdint>
7
+ #include < iostream>
7
8
8
9
// This is the magic juju that compiles our impl functions for multiple CPUs.
9
10
#undef HWY_TARGET_INCLUDE
@@ -615,14 +616,21 @@ We want to perform two matrix-vector products:
615
616
616
617
We can do this in 6 SIMD instructions. We end up doing 40 flops and throwing
617
618
10 of them away.
619
+
620
+ llvm-mca says 620 cycles / 100 iterations
618
621
*/
619
622
void ReexpressSpatialVectorImpl (const double * R_AB, const double * V_B,
620
623
double * V_A) {
621
624
const hn::FixedTag<double , 4 > tag;
622
625
623
626
const auto abc_ = hn::LoadU (tag, R_AB); // (d is loaded but unused)
624
627
const auto def_ = hn::LoadU (tag, R_AB + 3 ); // (g is loaded but unused)
625
- const auto ghi_ = hn::LoadN (tag, R_AB + 6 , 3 );
628
+
629
+ // Llvm-mca rates this two-step implementation as a half-cycle better than
630
+ // the equivalent `ghi0 = hn::LoadN(tag, R_AB + 6, 3)` which gives 670/100
631
+ // cycles (gcc 11.4 & clang 14.0.0).
632
+ const auto fghi = hn::LoadU (tag, R_AB + 5 ); // (f not wanted)
633
+ const auto ghi0 = hn::SlideDownLanes (tag, fghi, 1 );
626
634
627
635
const auto xxx_ = hn::Set (tag, V_B[0 ]);
628
636
const auto yyy_ = hn::Set (tag, V_B[1 ]);
@@ -631,7 +639,7 @@ void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
631
639
// Vector XYZ: X Y Z _
632
640
auto XYZ_ = hn::Mul (abc_, xxx_); // ax bx cx _
633
641
XYZ_ = hn::MulAdd (def_, yyy_, XYZ_); // +dy +ey +fy _
634
- XYZ_ = hn::MulAdd (ghi_ , zzz_, XYZ_); // +gz +hz +iz _
642
+ XYZ_ = hn::MulAdd (ghi0 , zzz_, XYZ_); // +gz +hz +iz _
635
643
636
644
const auto rrr_ = hn::Set (tag, V_B[3 ]);
637
645
const auto sss_ = hn::Set (tag, V_B[4 ]);
@@ -640,7 +648,7 @@ void ReexpressSpatialVectorImpl(const double* R_AB, const double* V_B,
640
648
// Vector RST: R S T _
641
649
auto RST_ = hn::Mul (abc_, rrr_); // ar br cr _
642
650
RST_ = hn::MulAdd (def_, sss_, RST_); // +ds +es +fs _
643
- RST_ = hn::MulAdd (ghi_ , ttt_, RST_); // +gt +ht +it _
651
+ RST_ = hn::MulAdd (ghi0 , ttt_, RST_); // +gt +ht +it _
644
652
645
653
hn::StoreU (XYZ_, tag, V_A); // 4-wide write temporarily overwrites R
646
654
hn::StoreN (RST_, tag, V_A + 3 , 3 ); // 3-wide write to stay in bounds
@@ -667,7 +675,7 @@ void CrossProductImpl(const double* w, const double* r, double* wXr) {
667
675
hn::StoreN (wXr_, tag, wXr, 3 );
668
676
}
669
677
670
-
678
+ /*
671
679
// w x w x r
672
680
void CrossCrossProductImpl(const double* w, const double* r, double* wXwXr) {
673
681
const hn::FixedTag<double, 4> tag;
@@ -691,32 +699,38 @@ void CrossCrossProductImpl(const double* w, const double* r, double* wXwXr) {
691
699
692
700
hn::StoreN(wXwXr_, tag, wXwXr, 3);
693
701
}
702
+ */
694
703
704
+ /*
695
705
// TODO(sherm1) Untested -- does this even work?
696
706
// G is a - - but symmetric, so we need columns abc, bde, cef
697
707
// b d -
698
708
// c e f
699
- /* This is 522 cycles according to llvm-mca */
709
+ // Caution: symmetric elements might be NaN; don't compute with them.
710
+ // This is 619 cycles according to llvm-mca
700
711
void SymTimesVectorImpl(const double* G, const double* w, double* Gw) {
701
712
const hn::FixedTag<double, 4> tag;
702
- const auto abc_ = hn::LoadU (tag, G);
703
- const auto uuu_ = hn::Set (tag, w[0 ]);
704
- auto Gw_ = hn::Mul (abc_, uuu_); // au bu cu _
705
-
713
+ const auto abc0 = hn::LoadN(tag, G, 3); // Avoid the NaN
706
714
const auto c_de = hn::LoadU(tag, G + 2);
707
- const auto abde = hn::ConcatUpperLower (tag, c_de, abc_);
708
- const auto bde0 = hn::ShiftLeftLanes<1 >(tag, abde);
709
- const auto vvv_ = hn::Set (tag, w[1 ]);
710
- Gw_ = hn::MulAdd (bde0, vvv_, Gw_); // +bv +dv +ev
715
+ const double f = G[8];
716
+ const auto uuuu = hn::Set(tag, w[0]);
717
+ const auto vvvv = hn::Set(tag, w[1]);
718
+ const auto wwww = hn::Set(tag, w[2]);
719
+
720
+ const auto abde = hn::ConcatUpperLower(tag, c_de, abc0);
721
+ const auto bde0 = hn::SlideUpLanes(tag, abde, 1);
711
722
712
723
const auto ced_ = hn::Per4LaneBlockShuffle<2, 1, 3, 0>(c_de);
713
- const double f = G[8 ];
714
724
const auto cef_ = hn::InsertLane(ced_, 1, f);
715
- const auto www_ = hn::Set (tag, w[2 ]);
716
- Gw_ = hn::MulAdd (cef_, www_, Gw_); // +cw +ew +fw
725
+ const auto cef0 = hn::InsertLane(cef_, 0, 0.0);
717
726
718
- hn::StoreN (Gw_, tag, Gw, 3 );
727
+ auto Gw0 = hn::Mul(abc0, uuuu); // au bu cu 0
728
+ Gw0 = hn::MulAdd(bde0, vvvv, Gw0); // +bv +dv +ev 0
729
+ Gw0 = hn::MulAdd(cef0, wwww, Gw0); // +cw +ew +fw 0
730
+
731
+ hn::StoreN(Gw0, tag, Gw, 3);
719
732
}
733
+ */
720
734
721
735
/* This is considerably slower (617 cycles)
722
736
void SymTimesVectorImpl2(const double* G, const double* w, double* Gw) {
0 commit comments