@@ -140,7 +140,8 @@ template<typename TFloat,
140140 bool bWeight,
141141 size_t cCompilerScores,
142142 int cCompilerPack,
143- typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores, int >::type = 0 >
143+ typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores && (!bHessian || 8 != TFloat::TInt::k_cSIMDPack),
144+ int >::type = 0 >
144145GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal (BinSumsBoostingBridge* const pParams) {
145146
146147 static_assert (1 == cCompilerScores, " This specialization of BinSumsBoostingInternal cannot handle multiclass." );
@@ -362,42 +363,14 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
362363}
363364
364365
365- /*
366- This speculative version is a specialization where bHessian is true. When there is
367- a hessian instead of loading the gradients from the bins first then hessians after that we
368- can use a gathering load to get the gradient and hessian for the first 1/2 of the
369- samples, then get the next 1/2 of the samples. The potential benefit is that
370- the CPU might benefit from getting gradients and hessians that are located next
370- to each other. Unfortunately this seems to introduce a data dependency issue since
372- the first 1/2 of the gradients/hessians need to be loaded/modified/stored before the
373- next 1/2 can be worked on. This seems slower but more investigation is required
374-
375-
376- inline void PermuteForInterleaf(Avx2_32_Int& v0, Avx2_32_Int& v1) const noexcept {
377- // this function permutes the values into positions that the Interleaf function expects
378- // but for any SIMD implementation the positions can be variable as long as they work together
379- v0 = Avx2_32_Int(_mm256_permutevar8x32_epi32(m_data, _mm256_setr_epi32(0, 0, 1, 1, 4, 4, 5, 5)));
380- v1 = Avx2_32_Int(_mm256_permutevar8x32_epi32(m_data, _mm256_setr_epi32(2, 2, 3, 3, 6, 6, 7, 7)));
381- }
382- inline static Avx2_32_Int MakeAlternating() noexcept { return Avx2_32_Int(_mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0)); }
383- inline static Avx2_32_Int MakeHalfIndexes() noexcept { return Avx2_32_Int(_mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); }
384-
385- inline static void Interleaf(Avx2_32_Float& val0, Avx2_32_Float& val1) noexcept {
386- // this function permutes the values into positions that the PermuteForInterleaf function expects
387- // but for any SIMD implementation, the positions can be variable as long as they work together
388- __m256 temp = _mm256_unpacklo_ps(val0.m_data, val1.m_data);
389- val1 = Avx2_32_Float(_mm256_unpackhi_ps(val0.m_data, val1.m_data));
390- val0 = Avx2_32_Float(temp);
391- }
392-
393366template <typename TFloat,
394367 bool bParallel,
395368 bool bCollapsed,
396369 bool bHessian,
397370 bool bWeight,
398371 size_t cCompilerScores,
399372 int cCompilerPack,
400- typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores && bHessian, int>::type = 0>
373+ typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores && bHessian && 8 == TFloat::TInt::k_cSIMDPack , int >::type = 0 >
401374GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal (BinSumsBoostingBridge* const pParams) {
402375
403376 static_assert (1 == cCompilerScores, " This specialization of BinSumsBoostingInternal cannot handle multiclass." );
@@ -419,8 +392,7 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
419392
420393 const typename TFloat::T* pGradientAndHessian =
421394 reinterpret_cast <const typename TFloat::T*>(pParams->m_aGradientsAndHessians );
422- const typename TFloat::T* const pGradientsAndHessiansEnd =
423- pGradientAndHessian + size_t{2} * cSamples;
395+ const typename TFloat::T* const pGradientsAndHessiansEnd = pGradientAndHessian + size_t {2 } * cSamples;
424396
425397 static constexpr typename TFloat::TInt::T cBytesPerBin = static_cast <typename TFloat::TInt::T>(
426398 GetBinSize<typename TFloat::T, typename TFloat::TInt::T>(false , false , bHessian, size_t {1 }));
@@ -462,8 +434,25 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
462434
463435 EBM_ASSERT (0 == pParams->m_cBytesFastBins % static_cast <size_t >(cBytesPerBin));
464436
437+ // The compiler is normally pretty good about optimizing multiplications into shifts when possible
438+ // BUT, when compiling for SIMD, it seems to use a SIMD multiplication instruction instead of shifts
439+ // even when the multiplication has a fixed compile time constant value that is a power of 2, so
440+ // we manually convert the multiplications into shifts.
441+ //
442+ // We have also tried the Multiply templated function that is designed to convert multiplications
443+ // into shifts, but using that templated function breaks the compiler optimization that unrolls
444+ // the bitpacking loop.
445+ //
446+ constexpr static bool bSmall = 4 == cBytesPerBin;
447+ constexpr static bool bMed = 8 == cBytesPerBin;
448+ constexpr static bool bLarge = 16 == cBytesPerBin;
449+ static_assert (bSmall || bMed || bLarge, " cBytesPerBin size must be small, medium, or large" );
450+ constexpr static int cFixedShift = bSmall ? 2 : bMed ? 3 : 4 ;
451+ static_assert (1 << cFixedShift == cBytesPerBin, " cFixedShift must match the BinSize" );
452+ EBM_ASSERT (0 == pParams->m_cBytesFastBins % static_cast <size_t >(cBytesPerBin));
453+
465454 const typename TFloat::TInt offsets =
466- TFloat::TInt::MakeHalfIndexes () * static_cast<typename TFloat::TInt::T>(pParams->m_cBytesFastBins);
455+ TFloat::TInt::MakeIndexes () * static_cast <typename TFloat::TInt::T>(pParams->m_cBytesFastBins >> cFixedShift );
467456
468457 do {
469458 // TODO: maybe, it might be useful to preload the iTensorBinCombined, weight, gradient, hessian for the next loop
@@ -506,36 +495,30 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
506495 TFloat gradhess1 = TFloat::Load (&pGradientAndHessian[TFloat::k_cSIMDPack]);
507496 pGradientAndHessian += size_t {2 } * TFloat::k_cSIMDPack;
508497
509- TFloat::Interleaf(gradhess0, gradhess1);
510-
511498 if (bWeight) {
512499 gradhess0 *= weight;
513500 gradhess1 *= weight;
514501 }
515502
516- typename TFloat::TInt iTensorBin =
517- (((iTensorBinCombined >> cShift) & maskBits) << (TFloat::k_cTypeShift + 1)) + offsets;
503+ TFloat::Interleaf (gradhess0, gradhess1);
504+
505+ typename TFloat::TInt iTensorBin = ((iTensorBinCombined >> cShift) & maskBits) + offsets;
506+
507+ iTensorBin = PermuteForInterleaf (iTensorBin);
518508
519509 // TODO: instead of loading the gradient and hessian as separate loads, it might be better to
520510 // load the gradients and hessians as part as the same gather load because the CPU might
521511 // be better at loading items from the same cache line, and also because it would reduce our
522512 // memory by a factor of 2x since we could then handle two items in this loop sequentially
523513 // The drawback is that we need to shuffle our indexes and gradient/hessians values that we get back
524514
525- typename TFloat::TInt iTensorBin0;
526- typename TFloat::TInt iTensorBin1;
527- iTensorBin.PermuteForInterleaf(iTensorBin0, iTensorBin1);
515+ TFloat bin0;
516+ TFloat bin1;
528517
529- iTensorBin0 = iTensorBin0 + TFloat::TInt::MakeAlternating() * sizeof(TFloat::TInt::T);
530- iTensorBin1 = iTensorBin1 + TFloat::TInt::MakeAlternating() * sizeof(TFloat::TInt::T);
531-
532- TFloat bin0 = TFloat::template Load<0>(aBins, iTensorBin0);
518+ TFloat::template DoubleLoad<cFixedShift>(aBins, iTensorBin, bin0, bin1);
533519 bin0 += gradhess0;
534- bin0.template Store<0>(aBins, iTensorBin0);
535-
536- TFloat bin1 = TFloat::template Load<0>(aBins, iTensorBin1);
537520 bin1 += gradhess1;
538- bin1. template Store<0 >(aBins, iTensorBin1 );
521+ TFloat:: template DoubleStore<cFixedShift >(aBins, iTensorBin, bin0, bin1 );
539522
540523 cShift -= cBitsPerItemMax;
541524 } while (0 <= cShift);
@@ -544,7 +527,6 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
544527 }
545528 } while (pGradientsAndHessiansEnd != pGradientAndHessian);
546529}
547- */
548530
549531template <typename TFloat,
550532 bool bParallel,
0 commit comments