dmlc
diff --git a/doc/changes/v3.2.0.rst b/doc/changes/v3.2.0.rst
Lines changed: 1 addition & 1 deletion
diff --git a/include/xgboost/tree_model.h b/include/xgboost/tree_model.h
Lines changed: 25 additions & 33 deletions
diff --git a/python-package/xgboost/testing/dask.py b/python-package/xgboost/testing/dask.py
Lines changed: 16 additions & 3 deletions
diff --git a/python-package/xgboost/testing/data.py b/python-package/xgboost/testing/data.py
Lines changed: 13 additions & 4 deletions
diff --git a/python-package/xgboost/testing/data_iter.py b/python-package/xgboost/testing/data_iter.py
Lines changed: 2 additions & 0 deletions
diff --git a/python-package/xgboost/testing/ranking.py b/python-package/xgboost/testing/ranking.py
Lines changed: 2 additions & 0 deletions
@@ -77,7 +77,7 @@ Currently missing features for the ``hist`` tree method with vector leaf:
 Features
 ********
 
-- As part of the vector leaf work, CPU ```hist`` now supports gradient-based sampling.
+- As part of the vector leaf work, CPU ``hist`` now supports gradient-based sampling.
 - The deprecated CLI (command line interface) has been removed. It was deprecated in
   2.1. (:pr:`11720`)
 - Expose the categories container to the C API, allowing C users to access category
 
@@ -29,7 +29,7 @@ namespace xgboost {
 namespace tree {
 struct ScalarTreeView;
 struct MultiTargetTreeView;
-}
+}  // namespace tree
 
 class Json;
 
@@ -88,7 +88,7 @@ class RegTree : public Model {
   /** @brief tree node */
   class Node {
    public:
-    XGBOOST_DEVICE Node()  {
+    XGBOOST_DEVICE Node() {
       // assert compact alignment
       static_assert(sizeof(Node) == 4 * sizeof(int) + sizeof(Info), "Node: 64 bit align");
     }
@@ -132,16 +132,12 @@ class RegTree : public Model {
      * \brief set the left child
      * \param nid node id to left child
      */
-    XGBOOST_DEVICE void SetLeftChild(int nid) {
-      this->cleft_ = nid;
-    }
+    XGBOOST_DEVICE void SetLeftChild(int nid) { this->cleft_ = nid; }
     /*!
      * \brief set the right child
      * \param nid node id to right child
      */
-    XGBOOST_DEVICE void SetRightChild(int nid) {
-      this->cright_ = nid;
-    }
+    XGBOOST_DEVICE void SetRightChild(int nid) { this->cright_ = nid; }
     /*!
      * \brief set split condition of current node
      * \param split_index feature index to split
@@ -166,30 +162,25 @@ class RegTree : public Model {
       this->cright_ = right;
     }
     /*! \brief mark that this node is deleted */
-    XGBOOST_DEVICE void MarkDelete() {
-      this->sindex_ = kDeletedNodeMarker;
-    }
+    XGBOOST_DEVICE void MarkDelete() { this->sindex_ = kDeletedNodeMarker; }
     /*! \brief Reuse this deleted node. */
-    XGBOOST_DEVICE void Reuse() {
-      this->sindex_ = 0;
-    }
+    XGBOOST_DEVICE void Reuse() { this->sindex_ = 0; }
     // set parent
     XGBOOST_DEVICE void SetParent(int pidx, bool is_left_child = true) {
       if (is_left_child) pidx |= (1U << 31);
       this->parent_ = pidx;
     }
     bool operator==(const Node& b) const {
-      return parent_ == b.parent_ && cleft_ == b.cleft_ &&
-             cright_ == b.cright_ && sindex_ == b.sindex_ &&
-             info_.leaf_value == b.info_.leaf_value;
+      return parent_ == b.parent_ && cleft_ == b.cleft_ && cright_ == b.cright_ &&
+             sindex_ == b.sindex_ && info_.leaf_value == b.info_.leaf_value;
     }
 
    private:
     /*!
      * \brief in leaf node, we have weights, in non-leaf nodes,
      *        we have split condition
      */
-    union Info{
+    union Info {
       bst_float leaf_value;
       SplitCondT split_cond;
     };
@@ -277,9 +268,7 @@ class RegTree : public Model {
   }
 
   /*! \brief get node statistics given nid */
-  RTreeNodeStat& Stat(int nid) {
-    return stats_.HostVector()[nid];
-  }
+  RTreeNodeStat& Stat(int nid) { return stats_.HostVector()[nid]; }
 
   void LoadModel(Json const& in) override;
   void SaveModel(Json* out) const override;
@@ -314,11 +303,9 @@ class RegTree : public Model {
    * \param leaf_right_child  The right child index of leaf, by default kInvalidNodeId,
    *                          some updaters use the right child index of leaf as a marker
    */
-  void ExpandNode(bst_node_t nid, unsigned split_index, bst_float split_value,
-                  bool default_left, bst_float base_weight,
-                  bst_float left_leaf_weight, bst_float right_leaf_weight,
-                  bst_float loss_change, float sum_hess, float left_sum,
-                  float right_sum,
+  void ExpandNode(bst_node_t nid, unsigned split_index, bst_float split_value, bool default_left,
+                  bst_float base_weight, bst_float left_leaf_weight, bst_float right_leaf_weight,
+                  bst_float loss_change, float sum_hess, float left_sum, float right_sum,
                   bst_node_t leaf_right_child = kInvalidNodeId);
   /**
    * @brief Expands a leaf node into two additional leaf nodes for a multi-target tree.
@@ -365,6 +352,15 @@ class RegTree : public Model {
                          bst_float base_weight, bst_float left_leaf_weight,
                          bst_float right_leaf_weight, bst_float loss_change, float sum_hess,
                          float left_sum, float right_sum);
+  /**
+   * @brief Expands a leaf node with categories for a multi-target tree.
+   */
+  void ExpandCategorical(bst_node_t nidx, bst_feature_t split_index,
+                         common::Span<const uint32_t> split_cat, bool default_left,
+                         linalg::VectorView<float const> base_weight,
+                         linalg::VectorView<float const> left_weight,
+                         linalg::VectorView<float const> right_weight, float loss_chg,
+                         float sum_hess, float left_sum, float right_sum);
   /**
    * @brief Whether this tree has categorical split.
    */
@@ -567,7 +563,7 @@ class RegTree : public Model {
   // vector of nodes
   HostDeviceVector<Node> nodes_;
   // free node space, used during training process
-  std::vector<int>  deleted_nodes_;
+  std::vector<int> deleted_nodes_;
   // stats of nodes
   HostDeviceVector<RTreeNodeStat> stats_;
   HostDeviceVector<FeatureType> split_types_;
@@ -632,13 +628,9 @@ inline void RegTree::FVec::Fill(SparsePage::Inst const& inst) {
 
 inline void RegTree::FVec::Drop() { this->Init(this->Size()); }
 
-inline size_t RegTree::FVec::Size() const {
-  return data_.size();
-}
+inline size_t RegTree::FVec::Size() const { return data_.size(); }
 
-inline float RegTree::FVec::GetFvalue(size_t i) const {
-  return data_[i];
-}
+inline float RegTree::FVec::GetFvalue(size_t i) const { return data_[i]; }
 
 inline bool RegTree::FVec::IsMissing(size_t i) const { return std::isnan(data_[i]); }
 
 
@@ -276,23 +276,33 @@ def make_categorical(  # pylint: disable=too-many-locals, too-many-arguments
     n_categories: int,
     *,
     onehot: bool = False,
+    n_targets: int = 1,
     cat_dtype: np.typing.DTypeLike = np.int64,
 ) -> Tuple[dd.DataFrame, dd.Series]:
     """Synthesize categorical data with dask."""
     workers = get_client_workers(client)
     n_workers = len(workers)
     dfs = []
 
+    label_cols = (
+        [f"label_{i}" for i in range(n_targets)] if n_targets > 1 else ["label"]
+    )
+
     def pack(**kwargs: Any) -> dd.DataFrame:
         X, y = make_cat_local(**kwargs)
-        X["label"] = y
+        if y.ndim == 2:
+            for i in range(y.shape[1]):
+                X[f"label_{i}"] = y[:, i]
+        else:
+            X["label"] = y
         return X
 
     meta = pack(
         n_samples=1,
         n_features=n_features,
         n_categories=n_categories,
         onehot=False,
+        n_targets=n_targets,
         cat_dtype=cat_dtype,
     )
 
@@ -308,15 +318,18 @@ def pack(**kwargs: Any) -> dd.DataFrame:
             n_samples=l_n_samples,
             n_features=n_features,
             n_categories=n_categories,
+            n_targets=n_targets,
             cat_dtype=cat_dtype,
             onehot=False,
             workers=[worker],
         )
         dfs.append(future)
 
     df: dd.DataFrame = cast(dd.DataFrame, dd.from_delayed(dfs, meta=meta))
-    y = df["label"]
-    X = df[df.columns.difference(["label"])]
+    y = df[label_cols]
+    if n_targets == 1:
+        y = y[label_cols[0]]
+    X = df[df.columns.difference(label_cols)]
 
     if onehot:
         return dd.get_dummies(X), y
 
@@ -1014,6 +1014,7 @@ def make_categorical(
     n_categories: int,
     *,
     onehot: bool,
+    n_targets: int = 1,
     sparsity: float = 0.0,
     cat_ratio: float = 1.0,
     shuffle: bool = False,
@@ -1029,6 +1030,9 @@ def make_categorical(
         Number of categories for categorical features.
     onehot:
         Should we apply one-hot encoding to the data?
+    n_targets:
+        Number of targets. When greater than 1, the label is a 2D array with shape
+        ``(n_samples, n_targets)``.
     sparsity:
         The ratio of the amount of missing values over the number of all entries.
     cat_ratio:
@@ -1068,13 +1072,18 @@ def make_categorical(
             num = row_rng.randint(low=0, high=n_categories, size=n_samples)
             df[str(i)] = pd.Series(num, dtype=num.dtype)
 
-    label = np.zeros(shape=(n_samples,))
+    target_rng = np.random.RandomState(random_state + 2)
+    label: np.ndarray = np.ones((n_samples, n_targets))
     for col in df.columns:
         if isinstance(df[col].dtype, pd.CategoricalDtype):
-            label += df[col].cat.codes
+            codes = df[col].cat.codes.values
+            effects = target_rng.normal(size=(len(df[col].cat.categories), n_targets))
+            label += effects[codes]
         else:
-            label += df[col]
-    label += 1
+            w = target_rng.uniform(low=0.5, high=1.5, size=n_targets)
+            label += np.outer(df[col].values, w)
+    if n_targets == 1:
+        label = label.squeeze(axis=1)
 
     if sparsity > 0.0:
         for i in range(n_features):
 
@@ -119,6 +119,7 @@ def __init__(  # pylint: disable=too-many-arguments,too-many-locals
         onehot: bool,
         device: str,
         cache: Optional[str],
+        n_targets: int = 1,
     ) -> None:
         super().__init__(cache_prefix=cache)
         self.n_batches = n_batches
@@ -132,6 +133,7 @@ def __init__(  # pylint: disable=too-many-arguments,too-many-locals
             onehot=onehot,
             cat_ratio=cat_ratio,
             sparsity=sparsity,
+            n_targets=n_targets,
         )
         xs, ys = [], []
 
 
@@ -97,6 +97,8 @@ def run_ranking_categorical(device: str) -> None:
     X, y = tm.make_categorical(
         n_samples=512, n_features=10, n_categories=3, onehot=False
     )
+    # NDCG requires non-negative integer relevance labels.
+    y = np.clip(np.round(y - y.min()).astype(int), 0, None)
     rng = np.random.default_rng(1994)
     qid = rng.choice(3, size=y.shape[0])
     qid = np.sort(qid)
Original file line number	Diff line number	Diff line change
`@@ -97,6 +97,8 @@ def run_ranking_categorical(device: str) -> None:`
`97`	`97`	`X, y = tm.make_categorical(`
`98`	`98`	`n_samples=512, n_features=10, n_categories=3, onehot=False`
`99`	`99`	`)`
	`100`	`+ # NDCG requires non-negative integer relevance labels.`
	`101`	`+ y = np.clip(np.round(y - y.min()).astype(int), 0, None)`
`100`	`102`	`rng = np.random.default_rng(1994)`
`101`	`103`	`qid = rng.choice(3, size=y.shape[0])`
`102`	`104`	`qid = np.sort(qid)`