Allow train and predict to run in a remote function, enabling Ray client mode (#57)

krfricke · web-flow · commit 1e6494f65d27 · 2021-02-19T16:14:30.000+01:00
* Update README

* Detect ray client session and run in remote call

* Shut down ray

* Ray client compatibility

* Move init check

* move order

* Fix typo
diff --git a/xgboost_ray/main.py b/xgboost_ray/main.py
@@ -37,7 +37,7 @@
     RAY_INSTALLED = False
 
 from xgboost_ray.tune import _try_add_tune_callback, _get_tune_resources, \
-    TUNE_USING_PG
+    TUNE_USING_PG, is_session_enabled
 
 from xgboost_ray.matrix import RayDMatrix, combine_data, \
     RayDeviceQuantileDMatrix, RayDataIter, concat_dataframes
@@ -93,6 +93,13 @@ def _assert_ray_support():
             "Try: `pip install ray`")
 
 
+def _is_client_connected() -> bool:  # True if a Ray client is connected
+    try:
+        return ray.util.client.ray.is_connected()
+    except Exception:  # any failure is treated as "not connected"
+        return False
+
+
 class _RabitTracker(xgb.RabitTracker):
     """
     This method overwrites the xgboost-provided RabitTracker to switch
@@ -639,7 +646,7 @@ def _create_placement_group(cpus_per_actor, gpus_per_actor,
 
 def _create_communication_processes(added_tune_callback: bool = False):
     # Create Queue and Event actors and make sure to colocate with driver node.
-    node_ip = ray.services.get_node_ip_address()
+    node_ip = get_node_ip_address()
     # Have to explicitly set num_cpus to 0.
     placement_option = {"num_cpus": 0}
     if added_tune_callback and TUNE_USING_PG:
@@ -925,6 +932,7 @@ def train(params: Dict,
           evals_result: Optional[Dict] = None,
           additional_results: Optional[Dict] = None,
           ray_params: Union[None, RayParams, Dict] = None,
+          _remote: Optional[bool] = None,
           **kwargs):
     """Distributed XGBoost training via Ray.
 
@@ -967,11 +975,51 @@ def train(params: Dict,
         ray_params (Union[None, RayParams, Dict]): Parameters to configure
             Ray-specific behavior. See :class:`RayParams` for a list of valid
             configuration parameters.
+        _remote (bool): Whether to run the driver process in a remote
+            function. This is enabled by default in Ray client mode.
         **kwargs: Keyword arguments will be passed to the local
             `xgb.train()` calls.
 
     Returns: An ``xgboost.Booster`` object.
     """
+    os.environ.setdefault("RAY_IGNORE_UNHANDLED_ERRORS", "1")
+
+    if _remote is None:
+        _remote = _is_client_connected() and \
+                  not is_session_enabled()
+
+    if not ray.is_initialized():
+        ray.init()
+
+    if _remote:
+        # Run this function as a remote function to support Ray client mode
+        @ray.remote(num_cpus=0)
+        def _wrapped(*args, **kwargs):
+            _evals_result = {}
+            _additional_results = {}
+            bst = train(
+                *args,
+                evals_result=_evals_result,
+                additional_results=_additional_results,
+                **kwargs)
+            return bst, _evals_result, _additional_results
+
+        bst, train_evals_result, train_additional_results = ray.get(
+            _wrapped.remote(
+                params,
+                dtrain,
+                *args,
+                evals=evals,
+                ray_params=ray_params,
+                _remote=False,
+                **kwargs,
+            ))
+        if isinstance(evals_result, dict):
+            evals_result.update(train_evals_result)
+        if isinstance(additional_results, dict):
+            additional_results.update(train_additional_results)
+        return bst
+
     ray_params = _validate_ray_params(ray_params)
 
     max_actor_restarts = ray_params.max_actor_restarts \
@@ -986,9 +1034,6 @@ def train(params: Dict,
             "`dtrain = RayDMatrix(data=data, label=label)`.".format(
                 type(dtrain)))
 
-    if not ray.is_initialized():
-        ray.init()
-
     cpus_per_actor, gpus_per_actor = _autodetect_resources(
         ray_params=ray_params,
         use_tree_method="tree_method" in params
@@ -1211,6 +1256,7 @@ def _predict(model: xgb.Booster, data: RayDMatrix, ray_params: RayParams,
 def predict(model: xgb.Booster,
             data: RayDMatrix,
             ray_params: Union[None, RayParams, Dict] = None,
+            _remote: Optional[bool] = None,
             **kwargs) -> Optional[np.ndarray]:
     """Distributed XGBoost predict via Ray.
 
@@ -1225,12 +1271,28 @@ def predict(model: xgb.Booster,
         ray_params (Union[None, RayParams, Dict]): Parameters to configure
             Ray-specific behavior. See :class:`RayParams` for a list of valid
             configuration parameters.
+        _remote (bool): Whether to run the driver process in a remote
+            function. This is enabled by default in Ray client mode.
         **kwargs: Keyword arguments will be passed to the local
             `xgb.predict()` calls.
 
     Returns: ``np.ndarray`` containing the predicted labels.
 
     """
+    os.environ.setdefault("RAY_IGNORE_UNHANDLED_ERRORS", "1")
+
+    if _remote is None:
+        _remote = _is_client_connected() and \
+                  not is_session_enabled()
+
+    if not ray.is_initialized():
+        ray.init()
+
+    if _remote:
+        return ray.get(
+            ray.remote(num_cpus=0)(predict).remote(
+                model, data, ray_params, _remote=False, **kwargs))
+
     ray_params = _validate_ray_params(ray_params)
 
     max_actor_restarts = ray_params.max_actor_restarts \
diff --git a/xgboost_ray/matrix.py b/xgboost_ray/matrix.py
@@ -652,7 +652,6 @@ def __init__(self,
         self.feature_types = feature_types
         self.missing = missing
 
-        self.memory_node_ip = ray.services.get_node_ip_address()
         self.num_actors = num_actors
         self.sharding = sharding
 
diff --git a/xgboost_ray/tests/test_end_to_end.py b/xgboost_ray/tests/test_end_to_end.py
@@ -4,7 +4,7 @@
 
 import ray
 
-from xgboost_ray import RayParams, train, RayDMatrix
+from xgboost_ray import RayParams, train, RayDMatrix, predict
 
 
 class XGBoostRayEndToEndTest(unittest.TestCase):
@@ -40,6 +40,10 @@ def setUp(self):
             "num_class": 4
         }
 
+    def tearDown(self):  # stop Ray after each test
+        if ray.is_initialized():  # must be called; bare attr is truthy
+            ray.shutdown()
+
     def testSingleTraining(self):
         """Test that XGBoost learns to predict full matrix"""
         dtrain = xgb.DMatrix(self.x, self.y)
@@ -99,6 +103,45 @@ def testJointTraining(self):
         pred_y = bst.predict(x_mat)
         self.assertSequenceEqual(list(self.y), list(pred_y))
 
+    def testTrainPredict(self, init=True, remote=None):
+        """Train with an eval set, then predict; optionally init Ray first"""
+        if init:  # callers that already started Ray pass init=False
+            ray.init(num_cpus=2, num_gpus=0)
+
+        dtrain = RayDMatrix(self.x, self.y)
+
+        evals_result = {}
+        bst = train(
+            self.params,
+            dtrain,
+            ray_params=RayParams(num_actors=2),
+            evals=[(dtrain, "dtrain")],
+            evals_result=evals_result,
+            _remote=remote)  # None lets train() auto-detect client mode
+
+        self.assertTrue("dtrain" in evals_result)  # filled by train()
+
+        x_mat = RayDMatrix(self.x)
+        pred_y = predict(bst, x_mat, _remote=remote)
+        self.assertSequenceEqual(list(self.y), list(pred_y))
+
+    def testTrainPredictRemote(self):
+        """Train with evaluation and predict in a remote call"""
+        self.testTrainPredict(init=True, remote=True)  # force the remote path
+
+    def testTrainPredictClient(self):
+        """Train/predict through a Ray client session (needs Ray > 1.2)"""
+        if tuple(map(int, ray.__version__.split(".")[:2])) <= (1, 2):
+            self.skipTest("Ray client mocks do not work in Ray <= 1.2.0")
+        from ray.util.client.ray_client_helpers import ray_start_client_server
+
+        ray.init(num_cpus=2, num_gpus=0)
+        self.assertFalse(ray.util.client.ray.is_connected())
+        with ray_start_client_server():
+            self.assertTrue(ray.util.client.ray.is_connected())
+
+            self.testTrainPredict(init=False, remote=None)
+
 
 if __name__ == "__main__":
     import pytest
diff --git a/xgboost_ray/tests/test_tune.py b/xgboost_ray/tests/test_tune.py
@@ -65,6 +65,7 @@ def tearDown(self):
 
     # noinspection PyTypeChecker
     def testNumIters(self):
+        """Test that the number of reported tune results is correct"""
         ray_params = RayParams(cpus_per_actor=1, num_actors=2)
         analysis = tune.run(
             self.train_func(ray_params),
@@ -76,6 +77,18 @@ def testNumIters(self):
             list(analysis.results_df["training_iteration"]),
             list(analysis.results_df["config.num_boost_round"]))
 
+    def testNumItersClient(self):
+        """Run testNumIters through a Ray client session (Ray > 1.2)"""
+        if tuple(map(int, ray.__version__.split(".")[:2])) <= (1, 2):
+            self.skipTest("Ray client mocks do not work in Ray <= 1.2.0")
+
+        from ray.util.client.ray_client_helpers import ray_start_client_server
+
+        self.assertFalse(ray.util.client.ray.is_connected())
+        with ray_start_client_server():
+            self.assertTrue(ray.util.client.ray.is_connected())
+            self.testNumIters()
+
     def testElasticFails(self):
         """Test if error is thrown when using Tune with elastic training."""
         ray_params = RayParams(