
Commit 94cc351

optimize BinSumsBoosting for AVX2 with hessian by loading gradients and hessians together in 64-bit chunks
1 parent 6bc6693 commit 94cc351
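
The idea behind the change, stated briefly: the gradients and hessians are stored interleaved in memory (g0, h0, g1, h1, ...), so each gradient already sits in the same 64-bit chunk as its hessian, and a single 64-bit load can fetch both halves of a bin update at once. A conceptual sketch in plain C++ (values and array size are made up for illustration; the real code goes through the TFloat SIMD wrappers):

// one 64-bit load grabs a 32-bit gradient and its 32-bit hessian together
#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
   float gradHess[4] = {0.25f, 1.5f, -0.75f, 2.0f}; // g0, h0, g1, h1

   uint64_t chunk; // gradient and hessian of sample 0 in one 64-bit chunk
   std::memcpy(&chunk, &gradHess[0], sizeof(chunk));

   float g, h; // split the chunk back apart
   std::memcpy(&g, &chunk, sizeof(g));
   std::memcpy(&h, reinterpret_cast<const char*>(&chunk) + sizeof(g), sizeof(h));
   printf("g=%g h=%g\n", g, h); // prints g=0.25 h=1.5
   return 0;
}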

1 file changed (+31 −49)

shared/libebm/compute/BinSumsBoosting.hpp

@@ -140,7 +140,8 @@ template<typename TFloat,
       bool bWeight,
       size_t cCompilerScores,
       int cCompilerPack,
-      typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores, int>::type = 0>
+      typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores && (!bHessian || 8 != TFloat::TInt::k_cSIMDPack),
+            int>::type = 0>
 GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridge* const pParams) {
 
    static_assert(1 == cCompilerScores, "This specialization of BinSumsBoostingInternal cannot handle multiclass.");
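
This widened enable_if condition is one half of a two-way dispatch: the generic specialization now bows out exactly when a hessian is present and the SIMD pack width is 8 (the AVX2 32-bit case), which the revived specialization below claims with the mirrored condition. A minimal sketch of the pattern with simplified stand-in names (Avx2Float, Sse2Float, and BinSums are hypothetical, not the library's types):

#include <cstdio>
#include <type_traits>

struct Avx2Float { struct TInt { static constexpr int k_cSIMDPack = 8; }; };
struct Sse2Float { struct TInt { static constexpr int k_cSIMDPack = 4; }; };

// generic path: chosen when there is no hessian OR the pack width is not 8
template<typename TFloat, bool bHessian,
      typename std::enable_if<!bHessian || 8 != TFloat::TInt::k_cSIMDPack, int>::type = 0>
void BinSums() { printf("generic path\n"); }

// specialized path: only hessian + 8-wide packs, mirroring the guard above
template<typename TFloat, bool bHessian,
      typename std::enable_if<bHessian && 8 == TFloat::TInt::k_cSIMDPack, int>::type = 0>
void BinSums() { printf("interleaved AVX2 path\n"); }

int main() {
   BinSums<Avx2Float, true>();  // interleaved AVX2 path
   BinSums<Avx2Float, false>(); // generic path
   BinSums<Sse2Float, true>();  // generic path
   return 0;
}

Because the two conditions are mutually exclusive and exhaustive, exactly one overload survives substitution for every instantiation, so no call is ever ambiguous.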
@@ -362,42 +363,14 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
 }
 
 
-/*
-This speculative version is a specialization where bHessian is true. When there is
-a hessian instead of loading the gradients from the bins first then hessians after that we
-can use a gathering load to get the gradient and hessian for the first 1/2 of the
-samples, then get the next 1/2 of the samples. The potential benefit is that
-the CPU might benefit from getting gradients and hessians that are located next
-to eachother. Unfortunately this seems to introduce a data dependency issue since
-the first 1/2 of the gradients/hessians need to be loaded/modified/stored before the
-next 1/2 can be worked on. This seems slower but more investigation is required
-
-
-inline void PermuteForInterleaf(Avx2_32_Int& v0, Avx2_32_Int& v1) const noexcept {
-   // this function permutes the values into positions that the Interleaf function expects
-   // but for any SIMD implementation the positions can be variable as long as they work together
-   v0 = Avx2_32_Int(_mm256_permutevar8x32_epi32(m_data, _mm256_setr_epi32(0, 0, 1, 1, 4, 4, 5, 5)));
-   v1 = Avx2_32_Int(_mm256_permutevar8x32_epi32(m_data, _mm256_setr_epi32(2, 2, 3, 3, 6, 6, 7, 7)));
-}
-inline static Avx2_32_Int MakeAlternating() noexcept { return Avx2_32_Int(_mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0)); }
-inline static Avx2_32_Int MakeHalfIndexes() noexcept { return Avx2_32_Int(_mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); }
-
-inline static void Interleaf(Avx2_32_Float& val0, Avx2_32_Float& val1) noexcept {
-   // this function permutes the values into positions that the PermuteForInterleaf function expects
-   // but for any SIMD implementation, the positions can be variable as long as they work together
-   __m256 temp = _mm256_unpacklo_ps(val0.m_data, val1.m_data);
-   val1 = Avx2_32_Float(_mm256_unpackhi_ps(val0.m_data, val1.m_data));
-   val0 = Avx2_32_Float(temp);
-}
-
 template<typename TFloat,
       bool bParallel,
       bool bCollapsed,
       bool bHessian,
       bool bWeight,
       size_t cCompilerScores,
       int cCompilerPack,
-      typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores && bHessian, int>::type = 0>
+      typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores && bHessian && 8 == TFloat::TInt::k_cSIMDPack, int>::type = 0>
 GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridge* const pParams) {
 
    static_assert(1 == cCompilerScores, "This specialization of BinSumsBoostingInternal cannot handle multiclass.");
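
The deleted comment block documents the shuffle helpers that the now-active specialization depends on; since this commit touches only this file, those helpers (Interleaf, PermuteForInterleaf, MakeIndexes, DoubleLoad, DoubleStore) presumably live in the AVX2 TFloat implementation. For reference, a standalone demo of what the unpacklo/unpackhi pair in Interleaf does to separately loaded gradients and hessians (made-up values):

#include <immintrin.h>
#include <cstdio>

int main() {
   // eight gradients and eight hessians, as two separate AVX2 registers
   __m256 grads = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
   __m256 hess = _mm256_setr_ps(10, 11, 12, 13, 14, 15, 16, 17);

   // per 128-bit lane, the unpack pair interleaves the two registers:
   // lo = g0 h0 g1 h1 | g4 h4 g5 h5
   // hi = g2 h2 g3 h3 | g6 h6 g7 h7
   __m256 lo = _mm256_unpacklo_ps(grads, hess);
   __m256 hi = _mm256_unpackhi_ps(grads, hess);

   alignas(32) float a[8], b[8];
   _mm256_store_ps(a, lo);
   _mm256_store_ps(b, hi);
   for(int i = 0; i < 8; i += 2) printf("lo: g=%g h=%g\n", a[i], a[i + 1]);
   for(int i = 0; i < 8; i += 2) printf("hi: g=%g h=%g\n", b[i], b[i + 1]);
   return 0;
}

The lane-crossing permutation in PermuteForInterleaf exists to make the bin indexes line up with this within-lane interleaving, which is why the two functions are documented as a matched pair.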
@@ -419,8 +392,7 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
 
    const typename TFloat::T* pGradientAndHessian =
          reinterpret_cast<const typename TFloat::T*>(pParams->m_aGradientsAndHessians);
-   const typename TFloat::T* const pGradientsAndHessiansEnd =
-         pGradientAndHessian + size_t{2} * cSamples;
+   const typename TFloat::T* const pGradientsAndHessiansEnd = pGradientAndHessian + size_t{2} * cSamples;
 
    static constexpr typename TFloat::TInt::T cBytesPerBin = static_cast<typename TFloat::TInt::T>(
         GetBinSize<typename TFloat::T, typename TFloat::TInt::T>(false, false, bHessian, size_t{1}));
@@ -462,8 +434,25 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
 
    EBM_ASSERT(0 == pParams->m_cBytesFastBins % static_cast<size_t>(cBytesPerBin));
 
+   // The compiler is normally pretty good about optimizing multiplications into shifts when possible
+   // BUT, when compiling for SIMD, it seems to use a SIMD multiplication instruction instead of shifts
+   // even when the multiplication has a fixed compile time constant value that is a power of 2, so
+   // we manually convert the multiplications into shifts.
+   //
+   // We also have tried the Multiply templated function that is designed to convert multiplications
+   // into shifts, but using that templated function breaks the compiler optimization that unrolls
+   // the bitpacking loop.
+   //
+   constexpr static bool bSmall = 4 == cBytesPerBin;
+   constexpr static bool bMed = 8 == cBytesPerBin;
+   constexpr static bool bLarge = 16 == cBytesPerBin;
+   static_assert(bSmall || bMed || bLarge, "cBytesPerBin size must be small, medium, or large");
+   constexpr static int cFixedShift = bSmall ? 2 : bMed ? 3 : 4;
+   static_assert(1 << cFixedShift == cBytesPerBin, "cFixedShift must match the BinSize");
+   EBM_ASSERT(0 == pParams->m_cBytesFastBins % static_cast<size_t>(cBytesPerBin));
+
    const typename TFloat::TInt offsets =
-         TFloat::TInt::MakeHalfIndexes() * static_cast<typename TFloat::TInt::T>(pParams->m_cBytesFastBins);
+         TFloat::TInt::MakeIndexes() * static_cast<typename TFloat::TInt::T>(pParams->m_cBytesFastBins >> cFixedShift);
 
    do {
       // TODO: maybe, it might be useful to preload the iTensorBinCombined, weight, gradient, hessian for the next loop
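
Note what the new offsets line buys: MakeIndexes() is scaled by m_cBytesFastBins pre-shifted right by cFixedShift, so the whole combined index stays in bin units and a single left shift by cFixedShift (presumably applied inside DoubleLoad/DoubleStore) converts it to bytes. The comment's claim about SIMD multiplies is easy to check; a small illustration, assuming AVX2, of the two ways to compute index * 8:

#include <immintrin.h>
#include <cstdio>

int main() {
   __m256i idx = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);

   // a vector multiply (vpmulld): multi-cycle latency on most cores
   __m256i byMul = _mm256_mullo_epi32(idx, _mm256_set1_epi32(8));

   // a vector shift (vpslld): single-cycle, same result since 8 == 1 << 3
   __m256i byShift = _mm256_slli_epi32(idx, 3);

   alignas(32) int a[8], b[8];
   _mm256_store_si256(reinterpret_cast<__m256i*>(a), byMul);
   _mm256_store_si256(reinterpret_cast<__m256i*>(b), byShift);
   for(int i = 0; i < 8; ++i) printf("%d %d\n", a[i], b[i]); // identical columns
   return 0;
}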
@@ -506,36 +495,30 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
          TFloat gradhess1 = TFloat::Load(&pGradientAndHessian[TFloat::k_cSIMDPack]);
          pGradientAndHessian += size_t{2} * TFloat::k_cSIMDPack;
 
-         TFloat::Interleaf(gradhess0, gradhess1);
-
          if(bWeight) {
            gradhess0 *= weight;
            gradhess1 *= weight;
         }
 
-         typename TFloat::TInt iTensorBin =
-               (((iTensorBinCombined >> cShift) & maskBits) << (TFloat::k_cTypeShift + 1)) + offsets;
+         TFloat::Interleaf(gradhess0, gradhess1);
+
+         typename TFloat::TInt iTensorBin = ((iTensorBinCombined >> cShift) & maskBits) + offsets;
+
+         iTensorBin = PermuteForInterleaf(iTensorBin);
 
          // TODO: instead of loading the gradient and hessian as separate loads, it might be better to
          // load the gradients and hessians as part as the same gather load because the CPU might
          // be better at loading items from the same cache line, and also because it would reduce our
         // memory by a factor of 2x since we could then handle two items in this loop sequentially
         // The drawback is that we need to shuffle our indexes and gradient/hessians values that we get back
 
-         typename TFloat::TInt iTensorBin0;
-         typename TFloat::TInt iTensorBin1;
-         iTensorBin.PermuteForInterleaf(iTensorBin0, iTensorBin1);
+         TFloat bin0;
+         TFloat bin1;
 
-         iTensorBin0 = iTensorBin0 + TFloat::TInt::MakeAlternating() * sizeof(TFloat::TInt::T);
-         iTensorBin1 = iTensorBin1 + TFloat::TInt::MakeAlternating() * sizeof(TFloat::TInt::T);
-
-         TFloat bin0 = TFloat::template Load<0>(aBins, iTensorBin0);
+         TFloat::template DoubleLoad<cFixedShift>(aBins, iTensorBin, bin0, bin1);
          bin0 += gradhess0;
-         bin0.template Store<0>(aBins, iTensorBin0);
-
-         TFloat bin1 = TFloat::template Load<0>(aBins, iTensorBin1);
          bin1 += gradhess1;
-         bin1.template Store<0>(aBins, iTensorBin1);
+         TFloat::template DoubleStore<cFixedShift>(aBins, iTensorBin, bin0, bin1);
 
          cShift -= cBitsPerItemMax;
       } while(0 <= cShift);
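
DoubleLoad/DoubleStore are where the commit's 64-bit-chunk idea pays off: with 8-byte bins (one float gradient plus one float hessian, so cFixedShift == 3) each bin is a single 64-bit record, and one gather per register fetches gradient and hessian together. The helpers themselves are defined elsewhere; the sketch below is a hypothetical reconstruction of the load side only, not the library's code:

#include <immintrin.h>
#include <cstdio>

// Hypothetical DoubleLoad-style gather: treats each (grad, hess) float pair
// as one 64-bit element, so 4 bins per gather, 8 bins across two registers.
static void DoubleLoadSketch(const float* aBins, __m256i iTensorBin, __m256d& bin0, __m256d& bin1) {
   const double* base = reinterpret_cast<const double*>(aBins);
   bin0 = _mm256_i32gather_pd(base, _mm256_castsi256_si128(iTensorBin), 8);
   bin1 = _mm256_i32gather_pd(base, _mm256_extracti128_si256(iTensorBin, 1), 8);
}

int main() {
   alignas(32) float bins[16]; // 8 bins stored as g0, h0, g1, h1, ...
   for(int i = 0; i < 16; ++i) bins[i] = static_cast<float>(i);

   __m256i idx = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
   __m256d bin0, bin1;
   DoubleLoadSketch(bins, idx, bin0, bin1);

   alignas(32) float out[8];
   _mm256_store_pd(reinterpret_cast<double*>(out), bin0);
   printf("bin 1: g=%g h=%g\n", out[2], out[3]); // prints g=2 h=3
   return 0;
}

AVX2 has no scatter instruction, so the matching DoubleStore presumably writes the updated pairs back lane by lane; that is also consistent with the deleted comment's observation that each batch of bins must be loaded, modified, and stored before the next batch can begin.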
@@ -544,7 +527,6 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
       }
    } while(pGradientsAndHessiansEnd != pGradientAndHessian);
 }
-*/
 
 template<typename TFloat,
       bool bParallel,
