Skip to content

Commit 3de30bd

Browse files
committed
[mt] Implement one-hot categorical feature for CPU hist.
Apply pre-commit fixes. Fix typo. Update tests.
1 parent e7358f9 commit 3de30bd

File tree

12 files changed

+460
-268
lines changed

12 files changed

+460
-268
lines changed

doc/changes/v3.2.0.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@ Currently missing features for the ``hist`` tree method with vector leaf:
7777
Features
7878
********
7979

80-
- As part of the vector leaf work, CPU ```hist`` now supports gradient-based sampling.
80+
- As part of the vector leaf work, CPU ``hist`` now supports gradient-based sampling.
8181
- The deprecated CLI (command line interface) has been removed. It was deprecated in
8282
2.1. (:pr:`11720`)
8383
- Expose the categories container to the C API, allowing C users to access category

include/xgboost/tree_model.h

Lines changed: 25 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ namespace xgboost {
2929
namespace tree {
3030
struct ScalarTreeView;
3131
struct MultiTargetTreeView;
32-
}
32+
} // namespace tree
3333

3434
class Json;
3535

@@ -88,7 +88,7 @@ class RegTree : public Model {
8888
/** @brief tree node */
8989
class Node {
9090
public:
91-
XGBOOST_DEVICE Node() {
91+
XGBOOST_DEVICE Node() {
9292
// assert compact alignment
9393
static_assert(sizeof(Node) == 4 * sizeof(int) + sizeof(Info), "Node: 64 bit align");
9494
}
@@ -132,16 +132,12 @@ class RegTree : public Model {
132132
* \brief set the left child
133133
* \param nid node id to left child
134134
*/
135-
XGBOOST_DEVICE void SetLeftChild(int nid) {
136-
this->cleft_ = nid;
137-
}
135+
XGBOOST_DEVICE void SetLeftChild(int nid) { this->cleft_ = nid; }
138136
/*!
139137
* \brief set the right child
140138
* \param nid node id to right child
141139
*/
142-
XGBOOST_DEVICE void SetRightChild(int nid) {
143-
this->cright_ = nid;
144-
}
140+
XGBOOST_DEVICE void SetRightChild(int nid) { this->cright_ = nid; }
145141
/*!
146142
* \brief set split condition of current node
147143
* \param split_index feature index to split
@@ -166,30 +162,25 @@ class RegTree : public Model {
166162
this->cright_ = right;
167163
}
168164
/*! \brief mark that this node is deleted */
169-
XGBOOST_DEVICE void MarkDelete() {
170-
this->sindex_ = kDeletedNodeMarker;
171-
}
165+
XGBOOST_DEVICE void MarkDelete() { this->sindex_ = kDeletedNodeMarker; }
172166
/*! \brief Reuse this deleted node. */
173-
XGBOOST_DEVICE void Reuse() {
174-
this->sindex_ = 0;
175-
}
167+
XGBOOST_DEVICE void Reuse() { this->sindex_ = 0; }
176168
// set parent
177169
XGBOOST_DEVICE void SetParent(int pidx, bool is_left_child = true) {
178170
if (is_left_child) pidx |= (1U << 31);
179171
this->parent_ = pidx;
180172
}
181173
bool operator==(const Node& b) const {
182-
return parent_ == b.parent_ && cleft_ == b.cleft_ &&
183-
cright_ == b.cright_ && sindex_ == b.sindex_ &&
184-
info_.leaf_value == b.info_.leaf_value;
174+
return parent_ == b.parent_ && cleft_ == b.cleft_ && cright_ == b.cright_ &&
175+
sindex_ == b.sindex_ && info_.leaf_value == b.info_.leaf_value;
185176
}
186177

187178
private:
188179
/*!
189180
* \brief in leaf node, we have weights, in non-leaf nodes,
190181
* we have split condition
191182
*/
192-
union Info{
183+
union Info {
193184
bst_float leaf_value;
194185
SplitCondT split_cond;
195186
};
@@ -277,9 +268,7 @@ class RegTree : public Model {
277268
}
278269

279270
/*! \brief get node statistics given nid */
280-
RTreeNodeStat& Stat(int nid) {
281-
return stats_.HostVector()[nid];
282-
}
271+
RTreeNodeStat& Stat(int nid) { return stats_.HostVector()[nid]; }
283272

284273
void LoadModel(Json const& in) override;
285274
void SaveModel(Json* out) const override;
@@ -314,11 +303,9 @@ class RegTree : public Model {
314303
* \param leaf_right_child The right child index of leaf, by default kInvalidNodeId,
315304
* some updaters use the right child index of leaf as a marker
316305
*/
317-
void ExpandNode(bst_node_t nid, unsigned split_index, bst_float split_value,
318-
bool default_left, bst_float base_weight,
319-
bst_float left_leaf_weight, bst_float right_leaf_weight,
320-
bst_float loss_change, float sum_hess, float left_sum,
321-
float right_sum,
306+
void ExpandNode(bst_node_t nid, unsigned split_index, bst_float split_value, bool default_left,
307+
bst_float base_weight, bst_float left_leaf_weight, bst_float right_leaf_weight,
308+
bst_float loss_change, float sum_hess, float left_sum, float right_sum,
322309
bst_node_t leaf_right_child = kInvalidNodeId);
323310
/**
324311
* @brief Expands a leaf node into two additional leaf nodes for a multi-target tree.
@@ -365,6 +352,15 @@ class RegTree : public Model {
365352
bst_float base_weight, bst_float left_leaf_weight,
366353
bst_float right_leaf_weight, bst_float loss_change, float sum_hess,
367354
float left_sum, float right_sum);
355+
/**
356+
* @brief Expands a leaf node into two additional leaf nodes using a categorical split, for a multi-target tree.
357+
*/
358+
void ExpandCategorical(bst_node_t nidx, bst_feature_t split_index,
359+
common::Span<const uint32_t> split_cat, bool default_left,
360+
linalg::VectorView<float const> base_weight,
361+
linalg::VectorView<float const> left_weight,
362+
linalg::VectorView<float const> right_weight, float loss_chg,
363+
float sum_hess, float left_sum, float right_sum);
368364
/**
369365
* @brief Whether this tree has categorical split.
370366
*/
@@ -567,7 +563,7 @@ class RegTree : public Model {
567563
// vector of nodes
568564
HostDeviceVector<Node> nodes_;
569565
// free node space, used during training process
570-
std::vector<int> deleted_nodes_;
566+
std::vector<int> deleted_nodes_;
571567
// stats of nodes
572568
HostDeviceVector<RTreeNodeStat> stats_;
573569
HostDeviceVector<FeatureType> split_types_;
@@ -632,13 +628,9 @@ inline void RegTree::FVec::Fill(SparsePage::Inst const& inst) {
632628

633629
inline void RegTree::FVec::Drop() { this->Init(this->Size()); }
634630

635-
inline size_t RegTree::FVec::Size() const {
636-
return data_.size();
637-
}
631+
inline size_t RegTree::FVec::Size() const { return data_.size(); }
638632

639-
inline float RegTree::FVec::GetFvalue(size_t i) const {
640-
return data_[i];
641-
}
633+
inline float RegTree::FVec::GetFvalue(size_t i) const { return data_[i]; }
642634

643635
inline bool RegTree::FVec::IsMissing(size_t i) const { return std::isnan(data_[i]); }
644636

python-package/xgboost/testing/dask.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -276,23 +276,33 @@ def make_categorical( # pylint: disable=too-many-locals, too-many-arguments
276276
n_categories: int,
277277
*,
278278
onehot: bool = False,
279+
n_targets: int = 1,
279280
cat_dtype: np.typing.DTypeLike = np.int64,
280281
) -> Tuple[dd.DataFrame, dd.Series]:
281282
"""Synthesize categorical data with dask."""
282283
workers = get_client_workers(client)
283284
n_workers = len(workers)
284285
dfs = []
285286

287+
label_cols = (
288+
[f"label_{i}" for i in range(n_targets)] if n_targets > 1 else ["label"]
289+
)
290+
286291
def pack(**kwargs: Any) -> dd.DataFrame:
287292
X, y = make_cat_local(**kwargs)
288-
X["label"] = y
293+
if y.ndim == 2:
294+
for i in range(y.shape[1]):
295+
X[f"label_{i}"] = y[:, i]
296+
else:
297+
X["label"] = y
289298
return X
290299

291300
meta = pack(
292301
n_samples=1,
293302
n_features=n_features,
294303
n_categories=n_categories,
295304
onehot=False,
305+
n_targets=n_targets,
296306
cat_dtype=cat_dtype,
297307
)
298308

@@ -308,15 +318,18 @@ def pack(**kwargs: Any) -> dd.DataFrame:
308318
n_samples=l_n_samples,
309319
n_features=n_features,
310320
n_categories=n_categories,
321+
n_targets=n_targets,
311322
cat_dtype=cat_dtype,
312323
onehot=False,
313324
workers=[worker],
314325
)
315326
dfs.append(future)
316327

317328
df: dd.DataFrame = cast(dd.DataFrame, dd.from_delayed(dfs, meta=meta))
318-
y = df["label"]
319-
X = df[df.columns.difference(["label"])]
329+
y = df[label_cols]
330+
if n_targets == 1:
331+
y = y[label_cols[0]]
332+
X = df[df.columns.difference(label_cols)]
320333

321334
if onehot:
322335
return dd.get_dummies(X), y

python-package/xgboost/testing/data.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1014,6 +1014,7 @@ def make_categorical(
10141014
n_categories: int,
10151015
*,
10161016
onehot: bool,
1017+
n_targets: int = 1,
10171018
sparsity: float = 0.0,
10181019
cat_ratio: float = 1.0,
10191020
shuffle: bool = False,
@@ -1029,6 +1030,9 @@ def make_categorical(
10291030
Number of categories for categorical features.
10301031
onehot:
10311032
Should we apply one-hot encoding to the data?
1033+
n_targets:
1034+
Number of targets. When greater than 1, the label is a 2D array with shape
1035+
``(n_samples, n_targets)``.
10321036
sparsity:
10331037
The ratio of the amount of missing values over the number of all entries.
10341038
cat_ratio:
@@ -1068,13 +1072,18 @@ def make_categorical(
10681072
num = row_rng.randint(low=0, high=n_categories, size=n_samples)
10691073
df[str(i)] = pd.Series(num, dtype=num.dtype)
10701074

1071-
label = np.zeros(shape=(n_samples,))
1075+
target_rng = np.random.RandomState(random_state + 2)
1076+
label: np.ndarray = np.ones((n_samples, n_targets))
10721077
for col in df.columns:
10731078
if isinstance(df[col].dtype, pd.CategoricalDtype):
1074-
label += df[col].cat.codes
1079+
codes = df[col].cat.codes.values
1080+
effects = target_rng.normal(size=(len(df[col].cat.categories), n_targets))
1081+
label += effects[codes]
10751082
else:
1076-
label += df[col]
1077-
label += 1
1083+
w = target_rng.uniform(low=0.5, high=1.5, size=n_targets)
1084+
label += np.outer(df[col].values, w)
1085+
if n_targets == 1:
1086+
label = label.squeeze(axis=1)
10781087

10791088
if sparsity > 0.0:
10801089
for i in range(n_features):

python-package/xgboost/testing/data_iter.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,7 @@ def __init__( # pylint: disable=too-many-arguments,too-many-locals
119119
onehot: bool,
120120
device: str,
121121
cache: Optional[str],
122+
n_targets: int = 1,
122123
) -> None:
123124
super().__init__(cache_prefix=cache)
124125
self.n_batches = n_batches
@@ -132,6 +133,7 @@ def __init__( # pylint: disable=too-many-arguments,too-many-locals
132133
onehot=onehot,
133134
cat_ratio=cat_ratio,
134135
sparsity=sparsity,
136+
n_targets=n_targets,
135137
)
136138
xs, ys = [], []
137139

python-package/xgboost/testing/ranking.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,8 @@ def run_ranking_categorical(device: str) -> None:
9797
X, y = tm.make_categorical(
9898
n_samples=512, n_features=10, n_categories=3, onehot=False
9999
)
100+
# NDCG requires non-negative integer relevance labels.
101+
y = np.clip(np.round(y - y.min()).astype(int), 0, None)
100102
rng = np.random.default_rng(1994)
101103
qid = rng.choice(3, size=y.shape[0])
102104
qid = np.sort(qid)

0 commit comments

Comments
 (0)