Commit 0a7d04d

add new testing utilities for random dataset generation and add randomized stress test
1 parent 57af6a8 · commit 0a7d04d

7 files changed: +209 -44 lines

shared/libebm/PartitionMultiDimensionalStraight.cpp (-24)

@@ -106,10 +106,6 @@ template<bool bHessian, size_t cCompilerScores> class PartitionMultiDimensionalS
 
    EBM_ASSERT(std::numeric_limits<FloatCalc>::min() <= hessianMin);
 
-#ifndef NDEBUG
-   bool bAnySplits = false;
-#endif // NDEBUG
-
    const bool bUseLogitBoost = bHessian && !(CalcInteractionFlags_DisableNewton & flags);
 
    // if a negative value were to occur, then it would be due to numeric instability, so clip it to zero here
@@ -205,10 +201,6 @@ template<bool bHessian, size_t cCompilerScores> class PartitionMultiDimensionalS
          }
 
          {
-#ifndef NDEBUG
-            bAnySplits = true;
-#endif // NDEBUG
-
            const FloatCalc w00 = static_cast<FloatCalc>(bin00.GetWeight());
            const FloatCalc w01 = static_cast<FloatCalc>(bin01.GetWeight());
            const FloatCalc w10 = static_cast<FloatCalc>(bin10.GetWeight());
@@ -427,22 +419,6 @@ template<bool bHessian, size_t cCompilerScores> class PartitionMultiDimensionalS
                  regLambda,
                  deltaStepMax);
            }
-
-            // bestGain should be positive, or NaN, BUT it can be slightly negative due to floating point noise
-            // it could also be -inf if the parent/total bin overflows, but the children parts did not.
-            // bestGain can also be substantially negative if we didn't find any legal cuts and
-            // then we subtracted the base partial gain here from zero
-
-            // if no legal splits were found, then bestGain will be zero. In theory we should
-            // therefore not subtract the parent partial gain, but doing so does no harm since we later set any
-            // negative interaction score to zero in the caller of this function. Due to that we don't
-            // need to check here, since any value we subtract from zero will lead to a negative number and
-            // then will be zeroed by our caller
-            // BUT, for debugging purposes, check here for that condition so that we can check for illegal negative
-            // gain.
-
-            EBM_ASSERT(std::isnan(bestGain) || -std::numeric_limits<FloatCalc>::infinity() == bestGain ||
-                  k_epsilonNegativeGainAllowed <= bestGain || !bAnySplits);
         }
      }
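The deleted comment block spells out why this assert was redundant: a bestGain that is negative from floating-point noise, -inf from overflow, or substantially negative from the no-legal-split case all get clipped to zero by the caller of this function. A minimal sketch of that clamp-to-zero contract (the helper name is hypothetical, not libebm's actual caller):

#include <algorithm>
#include <cmath>

// NaN and negative gains all collapse to zero downstream, so an
// epsilon-tolerant debug assert upstream adds no extra safety.
static double ClampGain(const double gain) {
   return std::isnan(gain) ? 0.0 : std::max(0.0, gain);
}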

shared/libebm/PartitionMultiDimensionalTree.cpp (-3)

@@ -864,13 +864,10 @@ template<bool bHessian, size_t cCompilerScores> class PartitionMultiDimensionalT
       }
 
       EBM_ASSERT(std::numeric_limits<FloatCalc>::infinity() != bestGain);
-      EBM_ASSERT(std::isnan(bestGain) || -std::numeric_limits<FloatCalc>::infinity() == bestGain ||
-            k_epsilonNegativeGainAllowed <= bestGain);
 
       if(LIKELY(/* NaN */ std::numeric_limits<FloatCalc>::lowest() <= bestGain)) {
          EBM_ASSERT(!std::isnan(bestGain));
          EBM_ASSERT(!std::isinf(bestGain));
-         EBM_ASSERT(k_epsilonNegativeGainAllowed <= bestGain);
 
          *pTotalGain = 0;
          if(LIKELY(k_gainMin <= bestGain)) {
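The surviving LIKELY check doubles as the NaN filter, as the /* NaN */ comment hints: lowest() <= bestGain is false when bestGain is NaN or -inf, which is why the branch body can still assert !isnan and !isinf after the epsilon asserts are gone. A small self-contained illustration of that comparison behavior:

#include <cassert>
#include <limits>

int main() {
   const double nan = std::numeric_limits<double>::quiet_NaN();
   const double ninf = -std::numeric_limits<double>::infinity();
   assert(!(std::numeric_limits<double>::lowest() <= nan));  // NaN compares false
   assert(!(std::numeric_limits<double>::lowest() <= ninf)); // -inf is below lowest()
   return 0;
}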

shared/libebm/PartitionOneDimensionalBoosting.cpp (-5)

@@ -616,12 +616,7 @@ static int FindBestSplitGain(RandomDeterministic* const pRng,
       ++iScoreParent;
    } while(cScores != iScoreParent);
 
-   // bestGain could be -inf if the partial gain on the children reached a number close to +inf and then
-   // the children were -inf due to floating point noise.
-   EBM_ASSERT(std::isnan(bestGain) || -std::numeric_limits<FloatCalc>::infinity() == bestGain ||
-         k_epsilonNegativeGainAllowed <= bestGain);
    EBM_ASSERT(std::numeric_limits<FloatCalc>::infinity() != bestGain);
-
    EBM_ASSERT(std::numeric_limits<FloatCalc>::min() <= k_gainMin);
    if(UNLIKELY(/* NaN */ !LIKELY(k_gainMin <= bestGain))) {
       // do not allow splits on gains that are too small

shared/libebm/ebm_internal.hpp (-4)

@@ -44,10 +44,6 @@ typedef double FloatPrecomp;
 
 static constexpr double k_illegalGainDouble = std::numeric_limits<double>::lowest();
 
-#ifndef NDEBUG
-static constexpr FloatCalc k_epsilonNegativeGainAllowed = FloatCalc{-1e-7};
-#endif // NDEBUG
-
 extern double FloatTickIncrementInternal(double deprecisioned[1]) noexcept;
 extern double FloatTickDecrementInternal(double deprecisioned[1]) noexcept;

shared/libebm/tests/boosting_unusual_inputs.cpp (+111)

@@ -2060,3 +2060,114 @@ TEST_CASE("lossguide, boosting, regression") {
    termScore = test.GetCurrentTermScore(0, {0}, 0);
    CHECK_APPROX(termScore, 0.40592050000000002);
 }
+
+TEST_CASE("stress test, boosting") {
+   auto rng = MakeRng(0);
+   const IntEbm cTrainSamples = 200;
+   const IntEbm cValidationSamples = 100;
+   const std::vector<FeatureTest> features = {
+         FeatureTest(10, false, false, false),
+         FeatureTest(10, false, false, true),
+         FeatureTest(10, false, true, false),
+         FeatureTest(10, false, true, true),
+         FeatureTest(10, true, false, false),
+         FeatureTest(10, true, false, true),
+         FeatureTest(10, true, true, false),
+         FeatureTest(10, true, true, true),
+   };
+   auto terms = MakeMains(features);
+   terms.push_back({0, 0});
+   if(2 <= features.size()) {
+      terms.push_back({0, 1});
+      terms.push_back({1, 0});
+   }
+   if(3 <= features.size()) {
+      // terms.push_back({0, 1, 2}); // TODO: enable when fast enough
+   }
+   if(4 <= features.size()) {
+      // terms.push_back({0, 1, 2, 3}); // TODO: enable when fast enough
+   }
+   const size_t cRounds = 200;
+   std::vector<IntEbm> boostFlagsAny{// TermBoostFlags_PurifyGain,
+         TermBoostFlags_DisableNewtonGain,
+         TermBoostFlags_DisableCategorical,
+         // TermBoostFlags_PurifyUpdate,
+         // TermBoostFlags_GradientSums, // does not return a metric
+         TermBoostFlags_DisableNewtonUpdate,
+         TermBoostFlags_RandomSplits};
+   std::vector<IntEbm> boostFlagsChoose{TermBoostFlags_Default,
+         TermBoostFlags_MissingLow,
+         TermBoostFlags_MissingHigh,
+         TermBoostFlags_MissingSeparate,
+         TermBoostFlags_MissingDrop};
+
+   double validationMetric = 1.0;
+
+   for(IntEbm classesCount = Task_Regression; classesCount < 5; ++classesCount) {
+      if(classesCount != Task_Regression && classesCount < 1) {
+         continue;
+      }
+      const auto train = MakeRandomDataset(rng, classesCount, cTrainSamples, features);
+      const auto validation = MakeRandomDataset(rng, classesCount, cValidationSamples, features);
+      for(IntEbm innerBagCount = 0; innerBagCount < 3; ++innerBagCount) {
+         TestBoost test = TestBoost(classesCount,
+               features,
+               terms,
+               train,
+               validation,
+               innerBagCount,
+               k_testCreateBoosterFlags_Default,
+               AccelerationFlags_NONE);
+
+         double validationMetricIteration = 0.0;
+         for(size_t iRound = 0; iRound < cRounds; ++iRound) {
+            for(IntEbm iTerm = 0; iTerm < static_cast<IntEbm>(terms.size()); ++iTerm) {
+               const IntEbm cRealBins = features[terms[iTerm][0]].CountRealBins();
+               const IntEbm cDimensions = terms[iTerm].size();
+
+               const TermBoostFlags boostFlags =
+                     static_cast<TermBoostFlags>(ChooseAny(rng, boostFlagsAny) | ChooseFrom(rng, boostFlagsChoose));
+
+               const double learningRate = 0.015625;
+               const IntEbm minSamplesLeaf = TestRand(rng, 5) + 1;
+               const double minHessian = 0 == TestRand(rng, 5) ? 0.015625 : 0.0;
+               const double regAlpha = 0 == TestRand(rng, 5) ? 0.015625 : 0.0;
+               const double regLambda = 0 == TestRand(rng, 5) ? 0.015625 : 0.0;
+               const double maxDeltaStep = 0 == TestRand(rng, 5) ? 1.0 : 0.0;
+               const double categoricalSmoothing = 10.0;
+               const IntEbm maxCategoricalThreshold = 1 + TestRand(rng, cRealBins + 1);
+               const double categoricalInclusionPercent = 0 == TestRand(rng, 2) ? 0.75 : 1.0;
+
+               // we allow 1 cut more than the number of bins to test excessive leaves.
+               const IntEbm cLeaves = 1 + TestRand(rng, cRealBins + 1);
+               const std::vector<IntEbm> leaves(cDimensions, cLeaves);
+               const MonotoneDirection direction =
+                     0 == TestRand(rng, 5) ? static_cast<MonotoneDirection>(TestRand(rng, 2) * 2 - 1) : 0;
+               const std::vector<MonotoneDirection> monotonicity(cDimensions, direction);
+
+               validationMetricIteration = test.Boost(iTerm,
+                                                 boostFlags,
+                                                 learningRate,
+                                                 minSamplesLeaf,
+                                                 minHessian,
+                                                 regAlpha,
+                                                 regLambda,
+                                                 maxDeltaStep,
+                                                 categoricalSmoothing,
+                                                 maxCategoricalThreshold,
+                                                 categoricalInclusionPercent,
+                                                 leaves,
+                                                 monotonicity)
+                                                 .validationMetric;
+            }
+         }
+         if(classesCount == 1) {
+            CHECK(std::numeric_limits<double>::infinity() == validationMetricIteration);
+         } else {
+            validationMetric *= validationMetricIteration;
+         }
+      }
+   }
+
+   CHECK(validationMetric == 62013566170252.117);
+}
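The final CHECK pins the accumulated validation metric to a single golden constant, which only works because the test RNG is fully deterministic: the same seed must reproduce the same draw sequence on every run and platform configuration the test targets. A fragment one could drop into a test body to sketch that contract, using the helpers added in this commit:

#include <cassert>

// Two RNGs built from the same seed must agree on every draw; without
// this property the golden-value CHECK above could never be stable.
auto rngA = MakeRng(0);
auto rngB = MakeRng(0);
for(int i = 0; i < 100; ++i) {
   assert(TestRand(rngA, 1000) == TestRand(rngB, 1000));
}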

shared/libebm/tests/libebm_test.cpp (+67 -8)

@@ -607,18 +607,22 @@ BoostRet TestBoost::Boost(const IntEbm indexTerm,
 
    std::vector<double> scoreTensor(cUpdateScores);
 
-   memset(&scoreTensor[0], 0xFF, sizeof(double) * cUpdateScores);
-   error = GetTermUpdate(m_boosterHandle, &scoreTensor[0]);
+   if(0 != cUpdateScores) {
+      memset(scoreTensor.data(), 0xFF, sizeof(double) * cUpdateScores);
+   }
+   error = GetTermUpdate(m_boosterHandle, scoreTensor.data());
    if(Error_None != error) {
       throw TestException(error, "SetTermUpdate");
    }
 
    if(0 != (TermBoostFlags_GradientSums & flags)) {
       // if sums are on, then we MUST change the term update
-      memset(&scoreTensor[0], 0, sizeof(double) * cUpdateScores);
+      if(0 != cUpdateScores) {
+         memset(scoreTensor.data(), 0, sizeof(double) * cUpdateScores);
+      }
    }
 
-   error = SetTermUpdate(m_boosterHandle, indexTerm, &scoreTensor[0]);
+   error = SetTermUpdate(m_boosterHandle, indexTerm, scoreTensor.data());
    if(Error_None != error) {
       throw TestException(error, "SetTermUpdate");
    }
@@ -629,14 +633,18 @@ BoostRet TestBoost::Boost(const IntEbm indexTerm,
    }
 
    if(0 <= indexTerm) {
-      memset(&scoreTensor[0], 0xFF, sizeof(double) * cUpdateScores);
-      error = GetBestTermScores(m_boosterHandle, indexTerm, &scoreTensor[0]);
+      if(0 != cUpdateScores) {
+         memset(scoreTensor.data(), 0xFF, sizeof(double) * cUpdateScores);
+      }
+      error = GetBestTermScores(m_boosterHandle, indexTerm, scoreTensor.data());
       if(Error_None != error) {
          throw TestException(error, "ApplyTermUpdate");
       }
 
-      memset(&scoreTensor[0], 0xFF, sizeof(double) * cUpdateScores);
-      error = GetCurrentTermScores(m_boosterHandle, indexTerm, &scoreTensor[0]);
+      if(0 != cUpdateScores) {
+         memset(scoreTensor.data(), 0xFF, sizeof(double) * cUpdateScores);
+      }
+      error = GetCurrentTermScores(m_boosterHandle, indexTerm, scoreTensor.data());
       if(Error_None != error) {
          throw TestException(error, "ApplyTermUpdate");
       }
@@ -1004,6 +1012,57 @@ extern void DisplayCuts(IntEbm countSamples,
    std::cout << std::endl << std::endl;
 }
 
+extern IntEbm ChooseAny(std::vector<unsigned char>& rng, const std::vector<IntEbm>& options) {
+   IntEbm ret = 0;
+   for(const IntEbm option : options) {
+      if(0 == TestRand(rng, 3)) {
+         ret |= option;
+      }
+   }
+   return ret;
+}
+
+extern IntEbm ChooseFrom(std::vector<unsigned char>& rng, const std::vector<IntEbm>& options) {
+   return options[TestRand(rng, options.size())];
+}
+
+extern std::vector<TestSample> MakeRandomDataset(std::vector<unsigned char>& rng,
+      const IntEbm cClasses,
+      const size_t cSamples,
+      const std::vector<FeatureTest>& features) {
+   std::vector<TestSample> samples;
+
+   for(size_t iSample = 0; iSample < cSamples; ++iSample) {
+      std::vector<IntEbm> sampleBinIndexes;
+      for(const FeatureTest& feature : features) {
+         IntEbm iBin = TestRand(rng, feature.CountRealBins());
+         if(!feature.m_bMissing) {
+            ++iBin;
+         }
+         sampleBinIndexes.push_back(iBin);
+      }
+
+      double target;
+      if(Task_GeneralClassification <= cClasses) {
+         target = static_cast<double>(TestRand(rng, cClasses));
+      } else {
+         target = TestRand(rng);
+      }
+
+      samples.push_back(TestSample(sampleBinIndexes, target));
+   }
+   return samples;
+}
+
+extern std::vector<std::vector<IntEbm>> MakeMains(const std::vector<FeatureTest>& features) {
+   const IntEbm cFeatures = static_cast<IntEbm>(features.size());
+   std::vector<std::vector<IntEbm>> termFeatures;
+   for(IntEbm iFeature = 0; iFeature < cFeatures; ++iFeature) {
+      termFeatures.push_back({iFeature});
+   }
+   return termFeatures;
+}
+
 int main() {
    SetLogCallback(&LogCallback);
    SetTraceLevel(Trace_Verbose);
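The new 0 != cUpdateScores guards matter because &scoreTensor[0] on an empty vector is undefined behavior, and even memset(nullptr, ..., 0) is formally undefined in C and C++. By contrast, data() is always safe to call (it may return nullptr for an empty vector) as long as nothing is written through it. A condensed sketch of the pattern the diff adopts:

#include <cstring>
#include <vector>

void FillSentinel(std::vector<double>& v) {
   // v.data() is well-defined even when v is empty; the size guard keeps
   // memset from ever receiving a null pointer.
   if(!v.empty()) {
      memset(v.data(), 0xFF, sizeof(double) * v.size());
   }
}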

shared/libebm/tests/libebm_test.hpp (+31)

@@ -179,6 +179,8 @@ class FeatureTest final {
    const bool m_bUnseen;
    const bool m_bNominal;
 
+   inline IntEbm CountRealBins() const { return m_countBins - (m_bMissing ? 0 : 1) - (m_bUnseen ? 0 : 1); }
+
    inline FeatureTest(
         const IntEbm countBins, const bool bMissing = true, const bool bUnseen = true, const bool bNominal = false) :
         m_countBins(countBins), m_bMissing(bMissing), m_bUnseen(bUnseen), m_bNominal(bNominal) {}
@@ -536,4 +538,33 @@ void DisplayCuts(IntEbm countSamples,
       double minFeatureVal,
       double maxFeatureVal);
 
+std::vector<TestSample> MakeRandomDataset(std::vector<unsigned char>& rng,
+      const IntEbm cClasses,
+      const size_t cSamples,
+      const std::vector<FeatureTest>& features);
+
+std::vector<std::vector<IntEbm>> MakeMains(const std::vector<FeatureTest>& features);
+
+IntEbm ChooseAny(std::vector<unsigned char>& rng, const std::vector<IntEbm>& options);
+IntEbm ChooseFrom(std::vector<unsigned char>& rng, const std::vector<IntEbm>& options);
+
+inline static std::vector<unsigned char> MakeRng(const SeedEbm seed) {
+   std::vector<unsigned char> rng(static_cast<size_t>(MeasureRNG()));
+   InitRNG(seed, &rng[0]);
+   return rng;
+}
+
+inline IntEbm TestRand(std::vector<unsigned char>& rng, const IntEbm count) {
+   // this isn't balanced, but good enough for tests
+   SeedEbm randomNum;
+   GenerateSeed(&rng[0], &randomNum);
+   return static_cast<IntEbm>(static_cast<USeedEbm>(randomNum) % static_cast<USeedEbm>(count));
+}
+
+inline double TestRand(std::vector<unsigned char>& rng) {
+   double ret;
+   GenerateGaussianRandom(&rng[0], 100.0, 1, &ret);
+   return ret;
+}
+
 #endif // LIBEBM_TEST_HPP
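TestRand's own comment flags the modulo bias ("this isn't balanced"): taking randomNum % count makes low residues slightly more likely when count does not divide the generator's range. If an unbiased draw were ever needed, the standard fix is rejection sampling. A sketch, not part of this commit, assuming USeedEbm is an unsigned 32-bit type as the casts above imply (TestRandUnbiased is a hypothetical name, and <limits> would be needed):

inline IntEbm TestRandUnbiased(std::vector<unsigned char>& rng, const IntEbm count) {
   const USeedEbm n = static_cast<USeedEbm>(count);
   // accept only draws below the largest multiple of n, so that each of
   // the n residues is drawn from an equally sized slice of the range
   const USeedEbm limit = std::numeric_limits<USeedEbm>::max() / n * n;
   SeedEbm randomNum;
   do {
      GenerateSeed(&rng[0], &randomNum);
   } while(limit <= static_cast<USeedEbm>(randomNum));
   return static_cast<IntEbm>(static_cast<USeedEbm>(randomNum) % n);
}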
