@@ -140,7 +140,8 @@ template<typename TFloat,
      bool bWeight,
      size_t cCompilerScores,
      int cCompilerPack,
-      typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores, int>::type = 0>
+      typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores && (!bHessian || 8 != TFloat::TInt::k_cSIMDPack),
+            int>::type = 0>
GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridge* const pParams) {

   static_assert(1 == cCompilerScores, "This specialization of BinSumsBoostingInternal cannot handle multiclass.");
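The `enable_if` change above carves the `bHessian && 8 == TFloat::TInt::k_cSIMDPack` case out of this generic specialization; the new specialization further down takes the exact complement, so every template instantiation selects exactly one overload. A minimal standalone sketch of that SFINAE partitioning pattern (the names here are invented for illustration, not the repo's API):

```cpp
#include <cstdio>
#include <type_traits>

// Generic path: viable whenever there is no hessian, or the SIMD width is not 8.
template<bool bHessian, int cSIMDPack,
      typename std::enable_if<!bHessian || 8 != cSIMDPack, int>::type = 0>
void BinSums() {
   std::printf("generic path\n");
}

// Specialized path: viable only for the 8-wide hessian case.
template<bool bHessian, int cSIMDPack,
      typename std::enable_if<bHessian && 8 == cSIMDPack, int>::type = 0>
void BinSums() {
   std::printf("interleaved 8-wide hessian path\n");
}

int main() {
   BinSums<false, 8>(); // generic path
   BinSums<true, 4>();  // generic path
   BinSums<true, 8>();  // interleaved 8-wide hessian path
   return 0;
}
```

Because the two conditions are logical complements, no `<bHessian, cSIMDPack>` combination is ever ambiguous or unmatched.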
@@ -362,42 +363,14 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
}

- /*
- This speculative version is a specialization where bHessian is true. When there is
- a hessian, instead of loading the gradients from the bins first and the hessians after that, we
- can use a gathering load to get the gradient and hessian for the first 1/2 of the
- samples, then get the next 1/2 of the samples. The potential benefit is that
- the CPU might benefit from getting gradients and hessians that are located next
- to each other. Unfortunately, this seems to introduce a data dependency issue since
- the first 1/2 of the gradients/hessians need to be loaded/modified/stored before the
- next 1/2 can be worked on. This seems slower, but more investigation is required.
-
-
- inline void PermuteForInterleaf(Avx2_32_Int& v0, Avx2_32_Int& v1) const noexcept {
-    // this function permutes the values into positions that the Interleaf function expects
-    // but for any SIMD implementation the positions can be variable as long as they work together
-    v0 = Avx2_32_Int(_mm256_permutevar8x32_epi32(m_data, _mm256_setr_epi32(0, 0, 1, 1, 4, 4, 5, 5)));
-    v1 = Avx2_32_Int(_mm256_permutevar8x32_epi32(m_data, _mm256_setr_epi32(2, 2, 3, 3, 6, 6, 7, 7)));
- }
- inline static Avx2_32_Int MakeAlternating() noexcept { return Avx2_32_Int(_mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0)); }
- inline static Avx2_32_Int MakeHalfIndexes() noexcept { return Avx2_32_Int(_mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0)); }
-
- inline static void Interleaf(Avx2_32_Float& val0, Avx2_32_Float& val1) noexcept {
-    // this function permutes the values into positions that the PermuteForInterleaf function expects
-    // but for any SIMD implementation, the positions can be variable as long as they work together
-    __m256 temp = _mm256_unpacklo_ps(val0.m_data, val1.m_data);
-    val1 = Avx2_32_Float(_mm256_unpackhi_ps(val0.m_data, val1.m_data));
-    val0 = Avx2_32_Float(temp);
- }
-
template<typename TFloat,
      bool bParallel,
      bool bCollapsed,
      bool bHessian,
      bool bWeight,
      size_t cCompilerScores,
      int cCompilerPack,
-      typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores && bHessian, int>::type = 0>
+      typename std::enable_if<bParallel && !bCollapsed && 1 == cCompilerScores && bHessian && 8 == TFloat::TInt::k_cSIMDPack, int>::type = 0>
GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridge* const pParams) {

   static_assert(1 == cCompilerScores, "This specialization of BinSumsBoostingInternal cannot handle multiclass.");
@@ -419,8 +392,7 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
   const typename TFloat::T* pGradientAndHessian =
         reinterpret_cast<const typename TFloat::T*>(pParams->m_aGradientsAndHessians);
-   const typename TFloat::T* const pGradientsAndHessiansEnd =
-         pGradientAndHessian + size_t{2} * cSamples;
+   const typename TFloat::T* const pGradientsAndHessiansEnd = pGradientAndHessian + size_t{2} * cSamples;

   static constexpr typename TFloat::TInt::T cBytesPerBin = static_cast<typename TFloat::TInt::T>(
         GetBinSize<typename TFloat::T, typename TFloat::TInt::T>(false, false, bHessian, size_t{1}));
@@ -462,8 +434,25 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
   EBM_ASSERT(0 == pParams->m_cBytesFastBins % static_cast<size_t>(cBytesPerBin));

+   // The compiler is normally pretty good about optimizing multiplications into shifts when possible
+   // BUT, when compiling for SIMD, it seems to use a SIMD multiplication instruction instead of shifts
+   // even when the multiplication has a fixed compile-time constant value that is a power of 2, so
+   // we manually convert the multiplications into shifts.
+   //
+   // We have also tried the Multiply templated function that is designed to convert multiplications
+   // into shifts, but using that templated function breaks the compiler optimization that unrolls
+   // the bitpacking loop.
+   //
+   constexpr static bool bSmall = 4 == cBytesPerBin;
+   constexpr static bool bMed = 8 == cBytesPerBin;
+   constexpr static bool bLarge = 16 == cBytesPerBin;
+   static_assert(bSmall || bMed || bLarge, "cBytesPerBin size must be small, medium, or large");
+   constexpr static int cFixedShift = bSmall ? 2 : bMed ? 3 : 4;
+   static_assert(1 << cFixedShift == cBytesPerBin, "cFixedShift must match the BinSize");
+   EBM_ASSERT(0 == pParams->m_cBytesFastBins % static_cast<size_t>(cBytesPerBin));
+
   const typename TFloat::TInt offsets =
-         TFloat::TInt::MakeHalfIndexes() * static_cast<typename TFloat::TInt::T>(pParams->m_cBytesFastBins);
+         TFloat::TInt::MakeIndexes() * static_cast<typename TFloat::TInt::T>(pParams->m_cBytesFastBins >> cFixedShift);

   do {
      // TODO: it might be useful to preload the iTensorBinCombined, weight, gradient, and hessian for the next loop
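To see the strength reduction the added comment describes, here is a small standalone sketch (the constants are fixed to the 8-byte bin case, and `OffsetMul`/`OffsetShift` are names invented for this example). When the bin size is a compile-time power of 2, the byte offset `i * cBytesPerBin` is identical to `i << cFixedShift`, which is also why the code above can store `m_cBytesFastBins >> cFixedShift` in bin units and defer a single shared `<< cFixedShift` to the gather/scatter step:

```cpp
#include <cassert>
#include <cstddef>

constexpr size_t cBytesPerBin = 8;  // a power of 2, as the static_asserts above enforce
constexpr int cFixedShift = 3;      // log2(cBytesPerBin)
static_assert((size_t{1} << cFixedShift) == cBytesPerBin, "cFixedShift must match the bin size");

// Byte offset of bin i written as a multiply, which SIMD codegen tends to keep
// as a (slower) vector multiply instruction...
constexpr size_t OffsetMul(size_t i) { return i * cBytesPerBin; }
// ...and the same offset manually strength-reduced to a shift.
constexpr size_t OffsetShift(size_t i) { return i << cFixedShift; }

int main() {
   for (size_t i = 0; i < 1000; ++i) {
      assert(OffsetMul(i) == OffsetShift(i)); // identical for every index
   }
   return 0;
}
```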
@@ -506,36 +495,30 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
      TFloat gradhess1 = TFloat::Load(&pGradientAndHessian[TFloat::k_cSIMDPack]);
      pGradientAndHessian += size_t{2} * TFloat::k_cSIMDPack;

-      TFloat::Interleaf(gradhess0, gradhess1);
-
      if (bWeight) {
         gradhess0 *= weight;
         gradhess1 *= weight;
      }

-      typename TFloat::TInt iTensorBin =
-            (((iTensorBinCombined >> cShift) & maskBits) << (TFloat::k_cTypeShift + 1)) + offsets;
+      TFloat::Interleaf(gradhess0, gradhess1);
+
+      typename TFloat::TInt iTensorBin = ((iTensorBinCombined >> cShift) & maskBits) + offsets;
+
+      iTensorBin = PermuteForInterleaf(iTensorBin);

      // TODO: instead of loading the gradient and hessian as separate loads, it might be better to
      // load the gradients and hessians as part of the same gather load because the CPU might
      // be better at loading items from the same cache line, and also because it would reduce our
      // memory loads by a factor of 2x since we could then handle two items in this loop sequentially
      // The drawback is that we need to shuffle our indexes and gradient/hessian values that we get back

-      typename TFloat::TInt iTensorBin0;
-      typename TFloat::TInt iTensorBin1;
-      iTensorBin.PermuteForInterleaf(iTensorBin0, iTensorBin1);
+      TFloat bin0;
+      TFloat bin1;

-      iTensorBin0 = iTensorBin0 + TFloat::TInt::MakeAlternating() * sizeof(TFloat::TInt::T);
-      iTensorBin1 = iTensorBin1 + TFloat::TInt::MakeAlternating() * sizeof(TFloat::TInt::T);
-
-      TFloat bin0 = TFloat::template Load<0>(aBins, iTensorBin0);
+      TFloat::template DoubleLoad<cFixedShift>(aBins, iTensorBin, bin0, bin1);
      bin0 += gradhess0;
-      bin0.template Store<0>(aBins, iTensorBin0);
-
-      TFloat bin1 = TFloat::template Load<0>(aBins, iTensorBin1);
      bin1 += gradhess1;
-      bin1.template Store<0>(aBins, iTensorBin1);
+      TFloat::template DoubleStore<cFixedShift>(aBins, iTensorBin, bin0, bin1);

      cShift -= cBitsPerItemMax;
   } while (0 <= cShift);
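The Interleaf/PermuteForInterleaf pairing is easiest to see with concrete lanes. Below is a self-contained AVX2 sketch of the value interleave for the 8-wide float case, using the same `unpacklo`/`unpackhi` intrinsics as the `Interleaf` helper shown in the comment block this commit deletes (the helper here is local to the example, not the repo's API; compile with `-mavx2`):

```cpp
#include <immintrin.h>
#include <cstdio>

// Mirrors the Interleaf() helper from the removed comment block:
// unpacklo/unpackhi interleave two registers within each 128-bit half.
static void Interleaf(__m256& val0, __m256& val1) {
   __m256 temp = _mm256_unpacklo_ps(val0, val1);
   val1 = _mm256_unpackhi_ps(val0, val1);
   val0 = temp;
}

int main() {
   // 8 gradients and 8 hessians for samples 0..7
   const float grad[8] = {0, 1, 2, 3, 4, 5, 6, 7};
   const float hess[8] = {10, 11, 12, 13, 14, 15, 16, 17};
   __m256 g = _mm256_loadu_ps(grad);
   __m256 h = _mm256_loadu_ps(hess);

   Interleaf(g, h);

   float out0[8], out1[8];
   _mm256_storeu_ps(out0, g); // pairs for samples 0,1,4,5: 0,10, 1,11, 4,14, 5,15
   _mm256_storeu_ps(out1, h); // pairs for samples 2,3,6,7: 2,12, 3,13, 6,16, 7,17
   for (int i = 0; i < 8; ++i) {
      std::printf("%g %g\n", out0[i], out1[i]);
   }
   return 0;
}
```

Each (gradient, hessian) pair lands in adjacent lanes, which is why the deleted `PermuteForInterleaf` duplicated each bin index (0,0,1,1,4,4,5,5 and 2,2,3,3,6,6,7,7): both lanes of a pair must address the same bin when loading and storing.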
@@ -544,7 +527,6 @@ GPU_DEVICE NEVER_INLINE static void BinSumsBoostingInternal(BinSumsBoostingBridg
      }
   } while (pGradientsAndHessiansEnd != pGradientAndHessian);
}
- */

template<typename TFloat,
      bool bParallel,