Fix number of boost rounds after failures (#59)

krfricke · web-flow · commit 93ff047b0ab2 · 2021-02-22T09:44:12.000+01:00
* Update README

* Fix number of boosting rounds after failures

* Fix for client mode
diff --git a/xgboost_ray/main.py b/xgboost_ray/main.py
@@ -819,9 +819,6 @@ def handle_actor_failure(actor_id):
                 f"checkpointed model instead.")
             return kwargs["xgb_model"], {}, _training_state.additional_results
 
-        kwargs["num_boost_round"] = kwargs.get(
-            "num_boost_round", 10) - _training_state.checkpoint.iteration - 1
-
     # The callback_returns dict contains actor-rank indexed lists of
     # results obtained through the `put_queue` function, usually
     # sent via callbacks.
@@ -928,13 +925,14 @@ def handle_actor_failure(actor_id):
 
 def train(params: Dict,
           dtrain: RayDMatrix,
+          num_boost_round: int = 10,
           *args,
           evals=(),
           evals_result: Optional[Dict] = None,
           additional_results: Optional[Dict] = None,
           ray_params: Union[None, RayParams, Dict] = None,
           _remote: Optional[bool] = None,
-          **kwargs):
+          **kwargs) -> xgb.Booster:
     """Distributed XGBoost training via Ray.
 
     This function will connect to a Ray cluster, create ``num_actors``
@@ -1000,6 +998,7 @@ def _wrapped(*args, **kwargs):
             _additional_results = {}
             bst = train(
                 *args,
+                num_boost_round=num_boost_round,
                 evals_result=_evals_result,
                 additional_results=_additional_results,
                 **kwargs)
@@ -1108,7 +1107,19 @@ def _wrapped(*args, **kwargs):
     start_actor_ranks = set(range(ray_params.num_actors))  # Start these
 
     total_training_time = 0.
+    boost_rounds_left = num_boost_round
+    last_checkpoint_value = checkpoint.value
     while tries <= max_actor_restarts:
+        # Only subtract completed iterations when the checkpoint changed.
+        # If it did not change, those iterations were already subtracted.
+        if checkpoint.iteration >= 0 and \
+                checkpoint.value != last_checkpoint_value:
+            boost_rounds_left -= checkpoint.iteration + 1
+
+        last_checkpoint_value = checkpoint.value
+
+        logger.debug(f"Boost rounds left: {boost_rounds_left}")
+
         training_state = _TrainingState(
             actors=actors,
             queue=queue,
@@ -1124,6 +1135,7 @@ def _wrapped(*args, **kwargs):
             bst, train_evals_result, train_additional_results = _train(
                 params,
                 dtrain,
+                boost_rounds_left,
                 *args,
                 evals=evals,
                 ray_params=ray_params,
diff --git a/xgboost_ray/tests/test_end_to_end.py b/xgboost_ray/tests/test_end_to_end.py
@@ -5,6 +5,7 @@
 import ray
 
 from xgboost_ray import RayParams, train, RayDMatrix, predict
+from xgboost_ray.tests.utils import get_num_trees
 
 
 class XGBoostRayEndToEndTest(unittest.TestCase):
@@ -114,11 +115,14 @@ def testTrainPredict(self, init=True, remote=None):
         bst = train(
             self.params,
             dtrain,
+            num_boost_round=38,
             ray_params=RayParams(num_actors=2),
             evals=[(dtrain, "dtrain")],
             evals_result=evals_result,
             _remote=remote)
 
+        self.assertEqual(get_num_trees(bst), 38)
+
         self.assertTrue("dtrain" in evals_result)
 
         x_mat = RayDMatrix(self.x)
diff --git a/xgboost_ray/tests/test_fault_tolerance.py b/xgboost_ray/tests/test_fault_tolerance.py
@@ -14,7 +14,7 @@
 from xgboost_ray import train, RayDMatrix, RayParams
 from xgboost_ray.main import RayXGBoostActorAvailable
 from xgboost_ray.tests.utils import flatten_obj, _checkpoint_callback, \
-    _fail_callback, tree_obj, _kill_callback, _sleep_callback
+    _fail_callback, tree_obj, _kill_callback, _sleep_callback, get_num_trees
 
 
 class _FakeTask(MagicMock):
@@ -90,6 +90,8 @@ def keep(actors, *args, **kwargs):
                 ray_params=RayParams(max_actor_restarts=1, num_actors=2),
                 additional_results=additional_results)
 
+        self.assertEqual(20, get_num_trees(bst))
+
         x_mat = xgb.DMatrix(self.x)
         pred_y = bst.predict(x_mat)
         self.assertSequenceEqual(list(self.y), list(pred_y))
@@ -129,6 +131,8 @@ def keep(actors, *args, **kwargs):
                     max_failed_actors=1),
                 additional_results=additional_results)
 
+        self.assertEqual(20, get_num_trees(bst))
+
         x_mat = xgb.DMatrix(self.x)
         pred_y = bst.predict(x_mat)
         self.assertSequenceEqual(list(self.y), list(pred_y))
@@ -172,6 +176,8 @@ def keep(actors, *args, **kwargs):
                     max_failed_actors=1),
                 additional_results=additional_results)
 
+        self.assertEqual(20, get_num_trees(bst))
+
         x_mat = xgb.DMatrix(self.x)
         pred_y = bst.predict(x_mat)
         self.assertSequenceEqual(list(self.y), list(pred_y))
@@ -186,6 +192,37 @@ def keep(actors, *args, **kwargs):
         # Both workers finished, so n=32
         self.assertEqual(additional_results["total_n"], 32)
 
+    @patch("xgboost_ray.main.ELASTIC_RESTART_DISABLED", True)
+    def testTrainingContinuationElasticMultiKilled(self):
+        """Two actor kills must not change the total of 20 boost rounds."""
+        logging.getLogger().setLevel(logging.DEBUG)
+        additional_results = {}
+        # Schedule kills for rank 0 at iteration 6 and rank 1 at iteration 14.
+        kill_callbacks = [
+            _kill_callback(
+                self.die_lock_file, fail_iteration=6, actor_rank=0),
+            _kill_callback(
+                self.die_lock_file_2, fail_iteration=14, actor_rank=1),
+        ]
+        elastic_params = RayParams(
+            max_actor_restarts=2,
+            num_actors=2,
+            elastic_training=True,
+            max_failed_actors=2)
+
+        bst = train(
+            self.params,
+            RayDMatrix(self.x, self.y),
+            callbacks=kill_callbacks,
+            num_boost_round=20,
+            ray_params=elastic_params,
+            additional_results=additional_results)
+
+        self.assertEqual(20, get_num_trees(bst))
+        predictions = bst.predict(xgb.DMatrix(self.x))
+        self.assertSequenceEqual(list(self.y), list(predictions))
+        print(f"Got correct predictions: {predictions}")
+
     @patch("xgboost_ray.main.ELASTIC_RESTART_DISABLED", True)
     def testTrainingContinuationElasticFailed(self):
         """This should continue after one actor failed training."""
@@ -211,6 +248,8 @@ def keep(actors, *args, **kwargs):
                     max_failed_actors=1),
                 additional_results=additional_results)
 
+        self.assertEqual(20, get_num_trees(bst))
+
         x_mat = xgb.DMatrix(self.x)
         pred_y = bst.predict(x_mat)
         self.assertSequenceEqual(list(self.y), list(pred_y))
diff --git a/xgboost_ray/tests/utils.py b/xgboost_ray/tests/utils.py
@@ -13,6 +13,12 @@
 from xgboost_ray.session import get_actor_rank, put_queue
 
 
+def get_num_trees(bst: xgb.Booster, trees_per_round: int = 4) -> int:
+    """Return the dumped tree count divided by ``trees_per_round``."""
+    # Default 4 is the former hard-coded divisor; entries need no JSON parsing.
+    return len(bst.get_dump(dump_format="json")) // trees_per_round
+
+
 def create_data(num_rows: int, num_cols: int, dtype: np.dtype = np.float32):
 
     return pd.DataFrame(