Skip to content

Commit ac6452e

Browse files
committed
add randomized tests
1 parent eb0aae1 commit ac6452e

File tree

7 files changed

+108
-70
lines changed

7 files changed

+108
-70
lines changed

shared/libebm/PartitionMultiDimensionalStraight.cpp

Lines changed: 0 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -106,10 +106,6 @@ template<bool bHessian, size_t cCompilerScores> class PartitionMultiDimensionalS
106106

107107
EBM_ASSERT(std::numeric_limits<FloatCalc>::min() <= hessianMin);
108108

109-
#ifndef NDEBUG
110-
bool bAnySplits = false;
111-
#endif // NDEBUG
112-
113109
const bool bUseLogitBoost = bHessian && !(CalcInteractionFlags_DisableNewton & flags);
114110

115111
// if a negative value were to occur, then it would be due to numeric instability, so clip it to zero here
@@ -205,10 +201,6 @@ template<bool bHessian, size_t cCompilerScores> class PartitionMultiDimensionalS
205201
}
206202

207203
{
208-
#ifndef NDEBUG
209-
bAnySplits = true;
210-
#endif // NDEBUG
211-
212204
const FloatCalc w00 = static_cast<FloatCalc>(bin00.GetWeight());
213205
const FloatCalc w01 = static_cast<FloatCalc>(bin01.GetWeight());
214206
const FloatCalc w10 = static_cast<FloatCalc>(bin10.GetWeight());
@@ -427,22 +419,6 @@ template<bool bHessian, size_t cCompilerScores> class PartitionMultiDimensionalS
427419
regLambda,
428420
deltaStepMax);
429421
}
430-
431-
// bestGain should be positive, or NaN, BUT it can be slightly negative due to floating point noise
432-
// it could also be -inf if the parent/total bin overflows, but the children parts did not.
433-
// bestGain can also be substantially negative if we didn't find any legal cuts and
434-
// then we subtracted the base partial gain here from zero
435-
436-
// if no legal splits were found, then bestGain will be zero. In theory we should
437-
// therefore not subtract the parent partial gain, but doing so does no harm since we later set any
438-
// negative interaction score to zero in the caller of this function. Due to that we don't
439-
// need to check here, since any value we subtract from zero will lead to a negative number and
440-
// then will be zeroed by our caller
441-
// BUT, for debugging purposes, check here for that condition so that we can check for illegal negative
442-
// gain.
443-
444-
EBM_ASSERT(std::isnan(bestGain) || -std::numeric_limits<FloatCalc>::infinity() == bestGain ||
445-
k_epsilonNegativeGainAllowed <= bestGain || !bAnySplits);
446422
}
447423
}
448424

shared/libebm/PartitionMultiDimensionalTree.cpp

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -864,13 +864,10 @@ template<bool bHessian, size_t cCompilerScores> class PartitionMultiDimensionalT
864864
}
865865

866866
EBM_ASSERT(std::numeric_limits<FloatCalc>::infinity() != bestGain);
867-
EBM_ASSERT(std::isnan(bestGain) || -std::numeric_limits<FloatCalc>::infinity() == bestGain ||
868-
k_epsilonNegativeGainAllowed <= bestGain);
869867

870868
if(LIKELY(/* NaN */ std::numeric_limits<FloatCalc>::lowest() <= bestGain)) {
871869
EBM_ASSERT(!std::isnan(bestGain));
872870
EBM_ASSERT(!std::isinf(bestGain));
873-
EBM_ASSERT(k_epsilonNegativeGainAllowed <= bestGain);
874871

875872
*pTotalGain = 0;
876873
if(LIKELY(k_gainMin <= bestGain)) {

shared/libebm/PartitionOneDimensionalBoosting.cpp

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -616,12 +616,7 @@ static int FindBestSplitGain(RandomDeterministic* const pRng,
616616
++iScoreParent;
617617
} while(cScores != iScoreParent);
618618

619-
// bestGain could be -inf if the partial gain on the children reached a number close to +inf and then
620-
// the children were -inf due to floating point noise.
621-
EBM_ASSERT(std::isnan(bestGain) || -std::numeric_limits<FloatCalc>::infinity() == bestGain ||
622-
k_epsilonNegativeGainAllowed <= bestGain);
623619
EBM_ASSERT(std::numeric_limits<FloatCalc>::infinity() != bestGain);
624-
625620
EBM_ASSERT(std::numeric_limits<FloatCalc>::min() <= k_gainMin);
626621
if(UNLIKELY(/* NaN */ !LIKELY(k_gainMin <= bestGain))) {
627622
// do not allow splits on gains that are too small

shared/libebm/ebm_internal.hpp

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,6 @@ typedef double FloatPrecomp;
4444

4545
static constexpr double k_illegalGainDouble = std::numeric_limits<double>::lowest();
4646

47-
#ifndef NDEBUG
48-
static constexpr FloatCalc k_epsilonNegativeGainAllowed = FloatCalc{-1e-7};
49-
#endif // NDEBUG
50-
5147
extern double FloatTickIncrementInternal(double deprecisioned[1]) noexcept;
5248
extern double FloatTickDecrementInternal(double deprecisioned[1]) noexcept;
5349

shared/libebm/tests/boosting_unusual_inputs.cpp

Lines changed: 75 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -2061,8 +2061,7 @@ TEST_CASE("lossguide, boosting, regression") {
20612061
CHECK_APPROX(termScore, 0.40592050000000002);
20622062
}
20632063

2064-
TEST_CASE("stress test, boosting, regression") {
2065-
const IntEbm cClasses = Task_Regression;
2064+
TEST_CASE("stress test, boosting") {
20662065
auto rng = MakeRng(0);
20672066
const IntEbm cTrainSamples = 200;
20682067
const IntEbm cValidationSamples = 100;
@@ -2088,31 +2087,81 @@ TEST_CASE("stress test, boosting, regression") {
20882087
if(4 <= features.size()) {
20892088
// terms.push_back({0, 1, 2, 3}); // TODO: enable when fast enough
20902089
}
2091-
const auto train = MakeRandomDataset(rng, cClasses, cTrainSamples, features);
2092-
const auto validation = MakeRandomDataset(rng, cClasses, cValidationSamples, features);
2093-
const size_t cRounds = 1000;
2094-
const IntEbm boostFlags = TermBoostFlags_Default;
2095-
2096-
TestBoost test = TestBoost(cClasses,
2097-
features,
2098-
terms,
2099-
train,
2100-
validation,
2101-
k_countInnerBagsDefault,
2102-
k_testCreateBoosterFlags_Default,
2103-
AccelerationFlags_NONE);
2104-
2105-
double validationMetric = 0;
2106-
for(size_t iRound = 0; iRound < cRounds; ++iRound) {
2107-
for(IntEbm iTerm = 0; iTerm < static_cast<IntEbm>(terms.size()); ++iTerm) {
2108-
// we allow 1 cut more than the number of bins to test excessive leaves.
2109-
const IntEbm cLeaves = 1 + TestRand(rng, features[terms[iTerm][0]].CountRealBins() + 1);
2110-
const std::vector<IntEbm> leaves(terms[iTerm].size(), cLeaves);
2111-
validationMetric =
2112-
test.Boost(iTerm, boostFlags, k_learningRateDefault, 1, 0.0, 0.0, 0.0, 0.0, 10.0, 6, 0.75, leaves)
2113-
.validationMetric;
2090+
const size_t cRounds = 200;
2091+
std::vector<IntEbm> boostFlagsAny{TermBoostFlags_PurifyGain,
2092+
TermBoostFlags_DisableNewtonGain,
2093+
TermBoostFlags_DisableCategorical,
2094+
TermBoostFlags_DisableNewtonUpdate,
2095+
TermBoostFlags_RandomSplits};
2096+
std::vector<IntEbm> boostFlagsChoose{TermBoostFlags_Default,
2097+
TermBoostFlags_MissingLow,
2098+
TermBoostFlags_MissingHigh,
2099+
TermBoostFlags_MissingSeparate,
2100+
TermBoostFlags_MissingDrop};
2101+
2102+
double validationMetric = 0.0;
2103+
2104+
for(IntEbm classesCount = Task_Regression; classesCount < 5; ++classesCount) {
2105+
if(classesCount != Task_Regression && classesCount < 2) {
2106+
continue;
2107+
}
2108+
const auto train = MakeRandomDataset(rng, classesCount, cTrainSamples, features);
2109+
const auto validation = MakeRandomDataset(rng, classesCount, cValidationSamples, features);
2110+
for(IntEbm innerBagCount = 0; innerBagCount < 3; ++innerBagCount) {
2111+
TestBoost test = TestBoost(classesCount,
2112+
features,
2113+
terms,
2114+
train,
2115+
validation,
2116+
innerBagCount,
2117+
k_testCreateBoosterFlags_Default,
2118+
AccelerationFlags_NONE);
2119+
2120+
double validationMetricIteration = 0.0;
2121+
for(size_t iRound = 0; iRound < cRounds; ++iRound) {
2122+
for(IntEbm iTerm = 0; iTerm < static_cast<IntEbm>(terms.size()); ++iTerm) {
2123+
const IntEbm cRealBins = features[terms[iTerm][0]].CountRealBins();
2124+
const IntEbm cDimensions = terms[iTerm].size();
2125+
2126+
const TermBoostFlags boostFlags =
2127+
static_cast<TermBoostFlags>(ChooseAny(rng, boostFlagsAny) | ChooseFrom(rng, boostFlagsChoose));
2128+
2129+
const double learningRate = 0.015625;
2130+
const IntEbm minSamplesLeaf = TestRand(rng, 5) + 1;
2131+
const double minHessian = 0 == TestRand(rng, 5) ? 0.015625 : 0.0;
2132+
const double regAlpha = 0 == TestRand(rng, 5) ? 0.015625 : 0.0;
2133+
const double regLambda = 0 == TestRand(rng, 5) ? 0.015625 : 0.0;
2134+
const double maxDeltaStep = 0 == TestRand(rng, 5) ? 1.0 : 0.0;
2135+
const double categoricalSmoothing = 10.0;
2136+
const IntEbm maxCategoricalThreshold = 1 + TestRand(rng, cRealBins + 1);
2137+
const double categoricalInclusionPercent = 0 == TestRand(rng, 2) ? 0.75 : 1.0;
2138+
2139+
// we allow 1 cut more than the number of bins to test excessive leaves.
2140+
const IntEbm cLeaves = 1 + TestRand(rng, cRealBins + 1);
2141+
const std::vector<IntEbm> leaves(cDimensions, cLeaves);
2142+
const MonotoneDirection direction =
2143+
0 == TestRand(rng, 5) ? static_cast<MonotoneDirection>(TestRand(rng, 2) * 2 - 1) : 0;
2144+
const std::vector<MonotoneDirection> monotonicity(cDimensions, direction);
2145+
2146+
validationMetricIteration = test.Boost(iTerm,
2147+
boostFlags,
2148+
learningRate,
2149+
minSamplesLeaf,
2150+
minHessian,
2151+
regAlpha,
2152+
regLambda,
2153+
maxDeltaStep,
2154+
categoricalSmoothing,
2155+
maxCategoricalThreshold,
2156+
categoricalInclusionPercent,
2157+
leaves,
2158+
monotonicity)
2159+
.validationMetric;
2160+
}
2161+
}
2162+
validationMetric += validationMetricIteration;
21142163
}
21152164
}
21162165

2117-
CHECK(validationMetric == 15092.795219174546);
2166+
CHECK(validationMetric == 42031.143270308334);
21182167
}

shared/libebm/tests/libebm_test.cpp

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -607,18 +607,22 @@ BoostRet TestBoost::Boost(const IntEbm indexTerm,
607607

608608
std::vector<double> scoreTensor(cUpdateScores);
609609

610-
memset(&scoreTensor[0], 0xFF, sizeof(double) * cUpdateScores);
611-
error = GetTermUpdate(m_boosterHandle, &scoreTensor[0]);
610+
if(0 != cUpdateScores) {
611+
memset(scoreTensor.data(), 0xFF, sizeof(double) * cUpdateScores);
612+
}
613+
error = GetTermUpdate(m_boosterHandle, scoreTensor.data());
612614
if(Error_None != error) {
613615
throw TestException(error, "SetTermUpdate");
614616
}
615617

616618
if(0 != (TermBoostFlags_GradientSums & flags)) {
617619
// if sums are on, then we MUST change the term update
618-
memset(&scoreTensor[0], 0, sizeof(double) * cUpdateScores);
620+
if(0 != cUpdateScores) {
621+
memset(scoreTensor.data(), 0, sizeof(double) * cUpdateScores);
622+
}
619623
}
620624

621-
error = SetTermUpdate(m_boosterHandle, indexTerm, &scoreTensor[0]);
625+
error = SetTermUpdate(m_boosterHandle, indexTerm, scoreTensor.data());
622626
if(Error_None != error) {
623627
throw TestException(error, "SetTermUpdate");
624628
}
@@ -629,14 +633,18 @@ BoostRet TestBoost::Boost(const IntEbm indexTerm,
629633
}
630634

631635
if(0 <= indexTerm) {
632-
memset(&scoreTensor[0], 0xFF, sizeof(double) * cUpdateScores);
633-
error = GetBestTermScores(m_boosterHandle, indexTerm, &scoreTensor[0]);
636+
if(0 != cUpdateScores) {
637+
memset(scoreTensor.data(), 0xFF, sizeof(double) * cUpdateScores);
638+
}
639+
error = GetBestTermScores(m_boosterHandle, indexTerm, scoreTensor.data());
634640
if(Error_None != error) {
635641
throw TestException(error, "ApplyTermUpdate");
636642
}
637643

638-
memset(&scoreTensor[0], 0xFF, sizeof(double) * cUpdateScores);
639-
error = GetCurrentTermScores(m_boosterHandle, indexTerm, &scoreTensor[0]);
644+
if(0 != cUpdateScores) {
645+
memset(scoreTensor.data(), 0xFF, sizeof(double) * cUpdateScores);
646+
}
647+
error = GetCurrentTermScores(m_boosterHandle, indexTerm, scoreTensor.data());
640648
if(Error_None != error) {
641649
throw TestException(error, "ApplyTermUpdate");
642650
}
@@ -1004,6 +1012,20 @@ extern void DisplayCuts(IntEbm countSamples,
10041012
std::cout << std::endl << std::endl;
10051013
}
10061014

1015+
// Builds a random bitwise-OR combination of the entries in `options`.
//
// The result starts at 0. For each entry, in order, TestRand(rng, 3) is called once and the entry is
// OR-ed into the result only when that call returns 0. If no entry is selected (or `options` is empty)
// the function returns 0.
//
// rng:     random generator state handed to TestRand on every call.
// options: candidate values to combine; the stress test in this commit passes individual TermBoostFlags bits.
// returns: the OR of all entries that were selected.
//
// NOTE(review): presumably TestRand(rng, 3) is uniform over {0, 1, 2}, which would give each entry
// roughly a 1-in-3 chance of being included -- confirm against TestRand.
extern IntEbm ChooseAny(std::vector<unsigned char>& rng, const std::vector<IntEbm>& options) {
1016+
IntEbm ret = 0;
1017+
for(const IntEbm option : options) {
1018+
if(0 == TestRand(rng, 3)) {
1019+
ret |= option;
1020+
}
1021+
}
1022+
return ret;
1023+
}
1024+
1025+
// Returns exactly one entry of `options`, selected by the index TestRand(rng, options.size()).
//
// rng:     random generator state handed to TestRand.
// options: candidate values; exactly one of them is returned.
//
// NOTE(review): presumably TestRand returns a value in [0, options.size()) -- confirm against TestRand.
// NOTE(review): there is no guard for an empty `options`; no index would be valid for the subscript in
// that case. The list passed by the stress test in this commit has five entries.
extern IntEbm ChooseFrom(std::vector<unsigned char>& rng, const std::vector<IntEbm>& options) {
1026+
return options[TestRand(rng, options.size())];
1027+
}
1028+
10071029
extern std::vector<TestSample> MakeRandomDataset(std::vector<unsigned char>& rng,
10081030
const IntEbm cClasses,
10091031
const size_t cSamples,

shared/libebm/tests/libebm_test.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -545,6 +545,9 @@ std::vector<TestSample> MakeRandomDataset(std::vector<unsigned char>& rng,
545545

546546
std::vector<std::vector<IntEbm>> MakeMains(const std::vector<FeatureTest>& features);
547547

548+
IntEbm ChooseAny(std::vector<unsigned char>& rng, const std::vector<IntEbm>& options);
549+
IntEbm ChooseFrom(std::vector<unsigned char>& rng, const std::vector<IntEbm>& options);
550+
548551
inline static std::vector<unsigned char> MakeRng(const SeedEbm seed) {
549552
std::vector<unsigned char> rng(static_cast<size_t>(MeasureRNG()));
550553
InitRNG(seed, &rng[0]);

0 commit comments

Comments
 (0)