Skip to content

Commit d3ec116

Browse files
authored
Revert ntree limit fix (#6616) (#6622)
The old (pre-fix) best_ntree_limit ignored the num_class parameter, which is incorrect. Previously we worked around this in the C++ layer to avoid possibly breaking other language bindings, but the Python interpretation remained incorrect. The earlier PR fixed the Python side to take num_class into account, but did not remove the old C++ workaround, so the tree calculation in the predictor became incorrect — see PredictBatch in CPUPredictor.
1 parent a018028 commit d3ec116

File tree

4 files changed

+26
-14
lines changed

4 files changed

+26
-14
lines changed

python-package/xgboost/training.py

+5-6
Original file line numberDiff line numberDiff line change
@@ -142,9 +142,7 @@ def _train_internal(params, dtrain,
142142
)
143143
else:
144144
raise ValueError(f'Unknown booster: {booster}')
145-
num_groups = int(config['learner']['learner_model_param']['num_class'])
146-
num_groups = 1 if num_groups == 0 else num_groups
147-
bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree * num_groups
145+
bst.best_ntree_limit = (bst.best_iteration + 1) * num_parallel_tree
148146

149147
# Copy to serialise and unserialise booster to reset state and free
150148
# training memory
@@ -184,9 +182,10 @@ def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
184182
If there's more than one metric in the **eval_metric** parameter given in
185183
**params**, the last metric will be used for early stopping.
186184
If early stopping occurs, the model will have three additional fields:
187-
``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``.
188-
(Use ``bst.best_ntree_limit`` to get the correct value if
189-
``num_parallel_tree`` and/or ``num_class`` appears in the parameters)
185+
``bst.best_score``, ``bst.best_iteration`` and ``bst.best_ntree_limit``. Use
186+
``bst.best_ntree_limit`` to get the correct value if ``num_parallel_tree`` and/or
187+
``num_class`` appears in the parameters. ``best_ntree_limit`` is the result of
188+
``num_parallel_tree * best_iteration``.
190189
evals_result: dict
191190
This dictionary stores the evaluation results of all the items in watchlist.
192191

tests/python/test_predict.py

+18-5
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,15 @@ def run_predict_leaf(predictor):
3333
y = rng.randint(low=0, high=classes, size=rows)
3434
m = xgb.DMatrix(X, y)
3535
booster = xgb.train(
36-
{'num_parallel_tree': num_parallel_tree, 'num_class': classes,
37-
'predictor': predictor, 'tree_method': 'hist'}, m,
38-
num_boost_round=num_boost_round)
36+
{
37+
"num_parallel_tree": num_parallel_tree,
38+
"num_class": classes,
39+
"predictor": predictor,
40+
"tree_method": "hist",
41+
},
42+
m,
43+
num_boost_round=num_boost_round,
44+
)
3945

4046
empty = xgb.DMatrix(np.ones(shape=(0, cols)))
4147
empty_leaf = booster.predict(empty, pred_leaf=True)
@@ -52,12 +58,19 @@ def run_predict_leaf(predictor):
5258
end = classes * num_parallel_tree * (j + 1)
5359
layer = row[start: end]
5460
for c in range(classes):
55-
tree_group = layer[c * num_parallel_tree:
56-
(c+1) * num_parallel_tree]
61+
tree_group = layer[c * num_parallel_tree: (c + 1) * num_parallel_tree]
5762
assert tree_group.shape[0] == num_parallel_tree
5863
# no subsampling so tree in same forest should output same
5964
# leaf.
6065
assert np.all(tree_group == tree_group[0])
66+
67+
ntree_limit = 2
68+
sliced = booster.predict(
69+
m, pred_leaf=True, ntree_limit=num_parallel_tree * ntree_limit
70+
)
71+
first = sliced[0, ...]
72+
73+
assert first.shape[0] == classes * num_parallel_tree * ntree_limit
6174
return leaf
6275

6376

tests/python/test_training_continuation.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -123,13 +123,13 @@ def run_training_continuation(self, xgb_params_01, xgb_params_02,
123123
gbdt_05 = xgb.train(xgb_params_03, dtrain_5class,
124124
num_boost_round=7)
125125
assert gbdt_05.best_ntree_limit == (
126-
gbdt_05.best_iteration + 1) * self.num_parallel_tree * 5
126+
gbdt_05.best_iteration + 1) * self.num_parallel_tree
127127
gbdt_05 = xgb.train(xgb_params_03,
128128
dtrain_5class,
129129
num_boost_round=3,
130130
xgb_model=gbdt_05)
131131
assert gbdt_05.best_ntree_limit == (
132-
gbdt_05.best_iteration + 1) * self.num_parallel_tree * 5
132+
gbdt_05.best_iteration + 1) * self.num_parallel_tree
133133

134134
res1 = gbdt_05.predict(dtrain_5class)
135135
res2 = gbdt_05.predict(dtrain_5class,

tests/python/test_with_sklearn.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ def train(booster, forest):
9292
)
9393

9494
if forest:
95-
assert cls.best_ntree_limit == rounds * forest * cls.n_classes_
95+
assert cls.best_ntree_limit == rounds * forest
9696
else:
9797
assert cls.best_ntree_limit == 0
9898

0 commit comments

Comments
 (0)