Implement ranking support (#189)

Yard1 · web-flow · commit 15396fd96daf · 2022-02-03T16:57:23.000+01:00
Adds support for the qid parameter, allowing ranking to work correctly. The group parameter is not supported - this is also the case for XGBoost's Dask interface.

Support for LightGBM will come in the future. It is not yet clear what that will look like, as LightGBM doesn't support the qid parameter.

Also fixes the ray_dmatrix_params arg being mandatory for RayXGBRanker.
diff --git a/xgboost_ray/main.py b/xgboost_ray/main.py
@@ -302,6 +302,8 @@ def _get_dmatrix(data: RayDMatrix, param: Dict) -> xgb.DMatrix:
                 param["label"] = [param["label"]]
             if not isinstance(param["weight"], list):
                 param["weight"] = [param["weight"]]
+            if not isinstance(param["qid"], list):
+                param["qid"] = [param["qid"]]
             if not isinstance(param["data"], list):
                 param["base_margin"] = [param["base_margin"]]
 
@@ -322,6 +324,7 @@ def _get_dmatrix(data: RayDMatrix, param: Dict) -> xgb.DMatrix:
                 "data": concat_dataframes(param["data"]),
                 "label": concat_dataframes(param["label"]),
                 "weight": concat_dataframes(param["weight"]),
+                "qid": concat_dataframes(param["qid"]),
                 "base_margin": concat_dataframes(param["base_margin"]),
                 "label_lower_bound": concat_dataframes(
                     param["label_lower_bound"]),
@@ -335,6 +338,7 @@ def _get_dmatrix(data: RayDMatrix, param: Dict) -> xgb.DMatrix:
 
         if LEGACY_MATRIX:
             param.pop("base_margin", None)
+            param.pop("qid", None)
 
         matrix = xgb.DMatrix(**param)
 
diff --git a/xgboost_ray/matrix.py b/xgboost_ray/matrix.py
@@ -83,6 +83,7 @@ def __init__(
             label: List[Optional[Data]],
             missing: Optional[float],
             weight: List[Optional[Data]],
+            qid: List[Optional[Data]],
             base_margin: List[Optional[Data]],
             label_lower_bound: List[Optional[Data]],
             label_upper_bound: List[Optional[Data]],
@@ -97,6 +98,7 @@ def __init__(
         self._label = label
         self._missing = missing
         self._weight = weight
+        self._qid = qid
         self._base_margin = base_margin
         self._label_lower_bound = label_lower_bound
         self._label_upper_bound = label_upper_bound
@@ -128,6 +130,7 @@ def next(self, input_data: Callable):
             data=self._prop(self._data),
             label=self._prop(self._label),
             weight=self._prop(self._weight),
+            qid=self._prop(self._qid),
             group=None,
             label_lower_bound=self._prop(self._label_lower_bound),
             label_upper_bound=self._prop(self._label_upper_bound),
@@ -148,6 +151,7 @@ def __init__(self,
                  label_upper_bound: Optional[Data] = None,
                  feature_names: Optional[List[str]] = None,
                  feature_types: Optional[List[np.dtype]] = None,
+                 qid: Optional[Data] = None,
                  filetype: Optional[RayFileType] = None,
                  ignore: Optional[List[str]] = None,
                  **kwargs):
@@ -160,6 +164,7 @@ def __init__(self,
         self.label_upper_bound = label_upper_bound
         self.feature_names = feature_names
         self.feature_types = feature_types
+        self.qid = qid
 
         self.data_source = None
         self.actor_shards = None
@@ -233,6 +238,10 @@ def _split_dataframe(
         if exclude:
             exclude_cols.add(exclude)
 
+        qid, exclude = data_source.get_column(local_data, self.qid)
+        if exclude:
+            exclude_cols.add(exclude)
+
         base_margin, exclude = data_source.get_column(local_data,
                                                       self.base_margin)
         if exclude:
@@ -253,7 +262,7 @@ def _split_dataframe(
             x = x[[col for col in x.columns if col not in exclude_cols]]
 
         return x, label, weight, base_margin, label_lower_bound, \
-            label_upper_bound
+            label_upper_bound, qid
 
     def load_data(self,
                   num_actors: int,
@@ -341,7 +350,7 @@ def load_data(self,
         # yet. Instead, we'll be selecting the rows below.
         local_df = data_source.load_data(
             self.data, ignore=self.ignore, indices=None, **self.kwargs)
-        x, y, w, b, ll, lu = self._split_dataframe(
+        x, y, w, b, ll, lu, qid = self._split_dataframe(
             local_df, data_source=data_source)
 
         if isinstance(x, list):
@@ -362,7 +371,8 @@ def load_data(self,
                 "label_lower_bound": ray.put(ll.iloc[indices]
                                              if ll is not None else None),
                 "label_upper_bound": ray.put(lu.iloc[indices]
-                                             if lu is not None else None)
+                                             if lu is not None else None),
+                "qid": ray.put(qid.iloc[indices] if qid is not None else None),
             }
             refs[i] = actor_refs
 
@@ -505,7 +515,7 @@ def load_data(self,
                 indices=rank_shards,
                 ignore=self.ignore,
                 **self.kwargs)
-            x, y, w, b, ll, lu = self._split_dataframe(
+            x, y, w, b, ll, lu, qid = self._split_dataframe(
                 local_df, data_source=data_source)
 
             if isinstance(x, list):
@@ -517,15 +527,16 @@ def load_data(self,
             indices = _get_sharding_indices(sharding, rank, num_actors, n)
 
             if not indices:
-                x, y, w, b, ll, lu = None, None, None, None, None, None
+                x, y, w, b, ll, lu, qid = (None, None, None, None, None, None,
+                                           None)
                 n = 0
             else:
                 local_df = data_source.load_data(
                     self.data,
                     ignore=self.ignore,
                     indices=indices,
                     **self.kwargs)
-                x, y, w, b, ll, lu = self._split_dataframe(
+                x, y, w, b, ll, lu, qid = self._split_dataframe(
                     local_df, data_source=data_source)
 
                 if isinstance(x, list):
@@ -540,7 +551,8 @@ def load_data(self,
                 "weight": ray.put(w),
                 "base_margin": ray.put(b),
                 "label_lower_bound": ray.put(ll),
-                "label_upper_bound": ray.put(lu)
+                "label_upper_bound": ray.put(lu),
+                "qid": ray.put(qid),
             }
         }
 
@@ -648,6 +660,7 @@ def __init__(self,
                  label_upper_bound: Optional[Data] = None,
                  feature_names: Optional[List[str]] = None,
                  feature_types: Optional[List[np.dtype]] = None,
+                 qid: Optional[Data] = None,
                  num_actors: Optional[int] = None,
                  filetype: Optional[RayFileType] = None,
                  ignore: Optional[List[str]] = None,
@@ -656,10 +669,20 @@ def __init__(self,
                  lazy: bool = False,
                  **kwargs):
 
+        if kwargs.get("group", None) is not None:
+            raise ValueError(
+                "`group` parameter is not supported. "
+                "If you are using XGBoost-Ray, use `qid` parameter instead. "
+                "If you are using LightGBM-Ray, ranking is not yet supported.")
+
+        if qid is not None and weight is not None:
+            raise NotImplementedError("per-group weight is not implemented.")
+
         self._uid = uuid.uuid4().int
 
         self.feature_names = feature_names
         self.feature_types = feature_types
+        self.qid = qid
         self.missing = missing
 
         self.num_actors = num_actors
@@ -691,6 +714,7 @@ def __init__(self,
                 feature_types=feature_types,
                 filetype=filetype,
                 ignore=ignore,
+                qid=qid,
                 **kwargs)
         else:
             self.loader = _CentralRayDMatrixLoader(
@@ -705,6 +729,7 @@ def __init__(self,
                 feature_types=feature_types,
                 filetype=filetype,
                 ignore=ignore,
+                qid=qid,
                 **kwargs)
 
         self.refs: Dict[int, Dict[str, ray.ObjectRef]] = {}
@@ -809,6 +834,7 @@ def __init__(self,
                  label_upper_bound: Optional[Data] = None,
                  feature_names: Optional[List[str]] = None,
                  feature_types: Optional[List[np.dtype]] = None,
+                 qid: Optional[Data] = None,
                  *args,
                  **kwargs):
         if cp is None:
@@ -831,6 +857,7 @@ def __init__(self,
             label_upper_bound=None,
             feature_names=feature_names,
             feature_types=feature_types,
+            qid=qid,
             *args,
             **kwargs)
 
diff --git a/xgboost_ray/sklearn.py b/xgboost_ray/sklearn.py
@@ -239,9 +239,13 @@ def inner_f(*args, **kwargs):
     return inner_f
 
 
-def _check_if_params_are_ray_dmatrix(X, sample_weight, base_margin, eval_set,
+def _check_if_params_are_ray_dmatrix(X,
+                                     sample_weight,
+                                     base_margin,
+                                     eval_set,
                                      sample_weight_eval_set,
-                                     base_margin_eval_set):
+                                     base_margin_eval_set,
+                                     eval_qid=None):
     train_dmatrix = None
     evals = ()
     eval_set = eval_set or ()
@@ -266,6 +270,8 @@ def _check_if_params_are_ray_dmatrix(X, sample_weight, base_margin, eval_set,
             params_to_warn_about.append("sample_weight_eval_set")
         if base_margin_eval_set is not None:
             params_to_warn_about.append("base_margin_eval_set")
+        if eval_qid is not None:
+            params_to_warn_about.append("eval_qid")
         if params_to_warn_about:
             warnings.warn(
                 "`eval_set` is composed of RayDMatrix tuples, "
@@ -951,18 +957,24 @@ def fit(
             ray_dmatrix_params: Optional[Dict] = None,
     ):
 
-        # check if group information is provided
-        if group is None and qid is None:
-            raise ValueError("group or qid is required for ranking task")
+        if not (group is None and eval_group is None):
+            raise ValueError("Use `qid` instead of `group` for RayXGBRanker.")
+        if qid is None:
+            raise ValueError("`qid` is required for ranking.")
 
         if eval_set is not None:
-            if eval_group is None and eval_qid is None:
-                raise ValueError("eval_group or eval_qid is required if"
-                                 " eval_set is not None")
+            if eval_qid is None:
+                raise ValueError("`eval_qid `is required if"
+                                 " `eval_set` is not None")
+
+        evals_result = {}
+        ray_dmatrix_params = ray_dmatrix_params or {}
+
+        params = self.get_xgb_params()
 
         train_dmatrix, evals = _check_if_params_are_ray_dmatrix(
             X, sample_weight, base_margin, eval_set, sample_weight_eval_set,
-            base_margin_eval_set)
+            base_margin_eval_set, eval_qid)
 
         if train_dmatrix is None:
             train_dmatrix, evals = _wrap_evaluation_matrices(
@@ -986,9 +998,6 @@ def fit(
                 }),
                 **self._ray_get_wrap_evaluation_matrices_compat_kwargs())
 
-        evals_result = {}
-        params = self.get_xgb_params()
-
         try:
             model, feval, params = self._configure_fit(xgb_model, eval_metric,
                                                        params)
diff --git a/xgboost_ray/tests/test_end_to_end.py b/xgboost_ray/tests/test_end_to_end.py
@@ -9,6 +9,8 @@
 import ray
 from ray.exceptions import RayActorError, RayTaskError
 
+from scipy.sparse import csr_matrix
+
 from xgboost_ray import RayParams, train, RayDMatrix, predict, RayShardingMode
 from xgboost_ray.main import RayXGBoostTrainingError
 from xgboost_ray.callback import DistributedCallback
@@ -341,6 +343,38 @@ def testKwargsValidation(self):
                 ray_params=RayParams(num_actors=1, max_actor_restarts=0),
                 totally_invalid_kwarg="")
 
+    def testRanking(self):
+        """Train a pairwise ranker with ``qid`` groups on two actors."""
+        Xrow = np.array([1, 2, 6, 8, 11, 14, 16, 17])
+        Xcol = np.array([0, 0, 1, 1, 2, 2, 3, 3])
+        X = csr_matrix(
+            (np.ones(shape=8), (Xrow, Xcol)), shape=(20, 4)).toarray()
+        y = np.array([
+            0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, 0.0,
+            0.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0
+        ])
+
+        qid = np.array([0] * 5 + [1] * 5 + [2] * 5 + [3] * 5)
+        dtrain = RayDMatrix(X, label=y, qid=qid)
+
+        params = {
+            "eta": 1,
+            "objective": "rank:pairwise",
+            "eval_metric": ["auc", "aucpr"],
+            "max_depth": 1
+        }
+        evals_result = {}
+        train(
+            params, dtrain, 10,
+            evals=[(dtrain, "train")],
+            evals_result=evals_result,
+            ray_params=RayParams(num_actors=2, max_actor_restarts=0))
+        # Both metrics must be non-decreasing across boosting rounds. The
+        # check needs all(): a bare generator expression is always truthy.
+        for metric in ("auc", "aucpr"):
+            rec = evals_result["train"][metric]
+            self.assertTrue(all(p <= q for p, q in zip(rec, rec[1:])))
+
 
 if __name__ == "__main__":
     import pytest
diff --git a/xgboost_ray/tests/test_sklearn.py b/xgboost_ray/tests/test_sklearn.py
@@ -42,7 +42,8 @@
                                  RayXGBRFClassifier, RayXGBRFRegressor,
                                  RayXGBRanker)
 
-from xgboost_ray.main import XGBOOST_VERSION_TUPLE
+from xgboost_ray.main import (XGBOOST_VERSION_TUPLE, RayDMatrix, RayParams,
+                              train, predict)
 from xgboost_ray.matrix import RayShardingMode
 
 
@@ -1211,6 +1212,64 @@ def test_estimator_type(self):
             cls = RayXGBClassifier()
             cls.load_model(path)  # no error
 
+    def test_ranking(self):
+        """Compare RayXGBRanker with ``train``/``predict`` on random data."""
+        # Each query id covers 50 consecutive rows.
+        x_train = np.random.rand(1000, 10)
+        y_train = np.random.randint(5, size=1000)
+        train_qid = np.repeat(np.arange(20), 50)
+
+        x_valid = np.random.rand(200, 10)
+        y_valid = np.random.randint(5, size=200)
+        valid_qid = np.repeat(np.arange(4), 50)
+
+        x_test = np.random.rand(100, 10)
+
+        # Scikit-learn style estimator.
+        ranker = RayXGBRanker(
+            objective="rank:pairwise",
+            learning_rate=0.1,
+            gamma=1.0,
+            min_child_weight=0.1,
+            max_depth=6,
+            n_estimators=4,
+            random_state=1,
+            n_jobs=2)
+        ranker.fit(
+            x_train,
+            y_train,
+            qid=train_qid,
+            eval_set=[(x_valid, y_valid)],
+            eval_qid=[valid_qid])
+        assert ranker.evals_result()
+        sklearn_pred = ranker.predict(x_test)
+
+        # Same setup through the functional API; `eta` and
+        # `num_boost_round` replace `learning_rate` and `n_estimators`.
+        actor_kwargs = dict(num_actors=2, max_actor_restarts=0)
+        booster_params = dict(
+            objective="rank:pairwise",
+            eta=0.1,
+            gamma=1.0,
+            min_child_weight=0.1,
+            max_depth=6,
+            random_state=1)
+        train_matrix = RayDMatrix(x_train, y_train, qid=train_qid)
+        valid_matrix = RayDMatrix(x_valid, y_valid, qid=valid_qid)
+        test_matrix = RayDMatrix(x_test)
+        booster = train(
+            booster_params,
+            train_matrix,
+            num_boost_round=4,
+            evals=[(valid_matrix, "validation")],
+            ray_params=RayParams(**actor_kwargs))
+        functional_pred = predict(
+            booster,
+            test_matrix,
+            ray_params=RayParams(**actor_kwargs))
+
+        np.testing.assert_almost_equal(sklearn_pred, functional_pred)
+
 
 if __name__ == "__main__":
     import pytest