@@ -108,6 +108,7 @@ WARNING_DISABLE_UNINITIALIZED_LOCAL_POINTER
108
108
// do not inline this. Not inlining it makes fewer versions that can be called from the more templated functions
109
109
template <bool bHessian>
110
110
static ErrorEbm Flatten (BoosterShell* const pBoosterShell,
111
+ bool bExtraMissingCut,
111
112
const bool bNominal,
112
113
const TermBoostFlags flags,
113
114
const FloatCalc regAlpha,
@@ -131,6 +132,7 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
131
132
EBM_ASSERT (2 <= cBins);
132
133
EBM_ASSERT (cSlices <= cBins);
133
134
EBM_ASSERT (!bNominal || cSlices == cBins);
135
+ EBM_ASSERT (!bExtraMissingCut || !bNominal); // for Nominal we cut everywhere
134
136
135
137
ErrorEbm error;
136
138
@@ -176,7 +178,7 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
176
178
} else {
177
179
pUpdateScore = aUpdateScore;
178
180
179
- if (nullptr != pMissingValueTreeNode) {
181
+ if (nullptr != pMissingValueTreeNode || bExtraMissingCut ) {
180
182
// always put a split on the missing bin
181
183
*pSplit = 1 ;
182
184
++pSplit;
@@ -237,6 +239,18 @@ static ErrorEbm Flatten(BoosterShell* const pBoosterShell,
237
239
}
238
240
}
239
241
242
+ if (bExtraMissingCut) {
243
+ EBM_ASSERT (!bNominal); // for Nominal we cut everywhere
244
+ if (TermBoostFlags_MissingLow & flags) {
245
+ if (nullptr == pMissingBin) {
246
+ pMissingBin = pTreeNode->GetBin ();
247
+ }
248
+ } else {
249
+ EBM_ASSERT (TermBoostFlags_MissingHigh & flags);
250
+ pMissingBin = pTreeNode->GetBin ();
251
+ }
252
+ }
253
+
240
254
EBM_ASSERT (apBins <= ppBinLast);
241
255
EBM_ASSERT (ppBinLast < apBins + (cBins - (nullptr != pMissingValueTreeNode ? size_t {1 } : size_t {0 })));
242
256
@@ -365,6 +379,8 @@ static int FindBestSplitGain(RandomDeterministic* const pRng,
365
379
const FloatCalc regLambda,
366
380
const FloatCalc deltaStepMax,
367
381
const MonotoneDirection monotoneDirection,
382
+ const Bin<FloatMain, UIntMain, true, true, bHessian, GetArrayScores(cCompilerScores)>* const pMissingBin,
383
+ bool* pbMissingIsolated,
368
384
const TreeNode<bHessian, GetArrayScores(cCompilerScores)>** const ppMissingValueTreeNode) {
369
385
370
386
LOG_N (Trace_Verbose,
@@ -401,6 +417,9 @@ static int FindBestSplitGain(RandomDeterministic* const pRng,
401
417
if (ppBinCur == ppBinLast) {
402
418
// There is just one bin and therefore no splits
403
419
pTreeNode->AFTER_RejectSplit ();
420
+ if (pMissingBin == *ppBinCur) {
421
+ *pbMissingIsolated = true ;
422
+ }
404
423
return 1 ;
405
424
}
406
425
@@ -822,10 +841,16 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
822
841
const Bin<FloatMain, UIntMain, true , true , bHessian, GetArrayScores (cCompilerScores)>** ppBin = apBins;
823
842
const Bin<FloatMain, UIntMain, true , true , bHessian, GetArrayScores (cCompilerScores)>* pBin = aBins;
824
843
844
+ const Bin<FloatMain, UIntMain, true , true , bHessian, GetArrayScores (cCompilerScores)>* pMissingBin = nullptr ;
845
+ bool bMissingIsolated = false ;
846
+
825
847
size_t cBinsAdjusted = cBins;
826
848
const TreeNode<bHessian, GetArrayScores (cCompilerScores)>* pMissingValueTreeNode = nullptr ;
827
849
if (TermBoostFlags_MissingLow & flags) {
828
850
if (bMissing) {
851
+ if (!bNominal) {
852
+ pMissingBin = pBin;
853
+ }
829
854
*ppBin = pBin;
830
855
pBin = IndexBin (pBin, cBytesPerBin);
831
856
++ppBin;
@@ -879,6 +904,8 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
879
904
regLambda,
880
905
deltaStepMax,
881
906
monotoneDirection,
907
+ pMissingBin,
908
+ &bMissingIsolated,
882
909
&pMissingValueTreeNode);
883
910
size_t cSplitsRemaining = cSplitsMax;
884
911
FloatCalc totalGain = 0 ;
@@ -952,6 +979,8 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
952
979
regLambda,
953
980
deltaStepMax,
954
981
monotoneDirection,
982
+ pMissingBin,
983
+ &bMissingIsolated,
955
984
&pMissingValueTreeNode);
956
985
// if FindBestSplitGain returned -1 to indicate an
957
986
// overflow ignore it here. We successfully made a root node split, so we might as well continue
@@ -976,6 +1005,8 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
976
1005
regLambda,
977
1006
deltaStepMax,
978
1007
monotoneDirection,
1008
+ pMissingBin,
1009
+ &bMissingIsolated,
979
1010
&pMissingValueTreeNode);
980
1011
// if FindBestSplitGain returned -1 to indicate an
981
1012
// overflow ignore it here. We successfully made a root node split, so we might as well continue
@@ -1007,9 +1038,23 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
1007
1038
}
1008
1039
}
1009
1040
*pTotalGain = static_cast <double >(totalGain);
1010
- size_t cSlices =
1011
- bNominal ? cBins : cSplitsMax - cSplitsRemaining + 1 + (nullptr != pMissingValueTreeNode ? 1 : 0 );
1012
- return Flatten<bHessian>(pBoosterShell,
1041
+
1042
+ size_t cSlices = cSplitsMax - cSplitsRemaining + 1 ;
1043
+ bool bExtraMissingCut = false ;
1044
+ if (nullptr != pMissingValueTreeNode) {
1045
+ EBM_ASSERT (nullptr == pMissingBin);
1046
+ ++cSlices;
1047
+ } else {
1048
+ if (nullptr != pMissingBin && !bMissingIsolated) {
1049
+ bExtraMissingCut = true ;
1050
+ ++cSlices;
1051
+ }
1052
+ }
1053
+ if (bNominal) {
1054
+ cSlices = cBins;
1055
+ }
1056
+ const ErrorEbm error = Flatten<bHessian>(pBoosterShell,
1057
+ bExtraMissingCut,
1013
1058
bNominal,
1014
1059
flags,
1015
1060
regAlpha,
@@ -1024,6 +1069,11 @@ template<bool bHessian, size_t cCompilerScores> class PartitionOneDimensionalBoo
1024
1069
cBins
1025
1070
#endif // NDEBUG
1026
1071
);
1072
+
1073
+ EBM_ASSERT (!bMissing || 2 <= pBoosterShell->GetInnerTermUpdate ()->GetCountSlices (iDimension));
1074
+ EBM_ASSERT (!bMissing || *pBoosterShell->GetInnerTermUpdate ()->GetSplitPointer (iDimension) == 1 );
1075
+
1076
+ return error;
1027
1077
}
1028
1078
};
1029
1079
0 commit comments