Lazily read environment variables (#166)

krfricke · web-flow · commit 03e0a3457167 · 2021-10-29T12:15:37.000+02:00
diff --git a/xgboost_ray/elastic.py b/xgboost_ray/elastic.py
@@ -5,8 +5,7 @@
 
 from xgboost_ray.main import RayParams, _TrainingState, \
     logger, ActorHandle, _PrepareActorTask, _create_actor, \
-    RayXGBoostActorAvailable, \
-    ELASTIC_RESTART_RESOURCE_CHECK_S, ELASTIC_RESTART_GRACE_PERIOD_S
+    RayXGBoostActorAvailable, ENV
 
 from xgboost_ray.matrix import RayDMatrix
 
@@ -36,7 +35,7 @@ def _maybe_schedule_new_actors(
 
     # Check periodically every n seconds.
     if now < training_state.last_resource_check_at + \
-            ELASTIC_RESTART_RESOURCE_CHECK_S:
+            ENV.ELASTIC_RESTART_RESOURCE_CHECK_S:
         return False
 
     training_state.last_resource_check_at = now
@@ -108,7 +107,7 @@ def _update_scheduled_actor_states(training_state: _TrainingState):
         # If an actor became ready but other actors are pending, we wait
         # for n seconds before restarting, as chances are that they become
         # ready as well (e.g. if a large node came up).
-        grace_period = ELASTIC_RESTART_GRACE_PERIOD_S
+        grace_period = ENV.ELASTIC_RESTART_GRACE_PERIOD_S
         if training_state.restart_training_at is None:
             logger.debug(
                 f"A RayXGBoostActor became ready for training. Waiting "
diff --git a/xgboost_ray/main.py b/xgboost_ray/main.py
@@ -16,7 +16,14 @@
 import pandas as pd
 
 from xgboost_ray.xgb import xgboost as xgb
-from xgboost.core import XGBoostError, EarlyStopException
+from xgboost.core import XGBoostError
+
+try:
+    from xgboost.core import EarlyStopException
+except ImportError:
+
+    class EarlyStopException(XGBoostError):
+        pass
 
 from xgboost_ray.callback import DistributedCallback, \
     DistributedCallbackContainer
@@ -64,28 +71,56 @@ def inner_f(*args, **kwargs):
 from xgboost_ray.session import init_session, put_queue, \
     set_session_queue
 
-# Whether to use SPREAD placement group strategy for training.
-_USE_SPREAD_STRATEGY = int(os.getenv("RXGB_USE_SPREAD_STRATEGY", 1))
 
-# How long to wait for placement group creation before failing.
-PLACEMENT_GROUP_TIMEOUT_S = int(
-    os.getenv("RXGB_PLACEMENT_GROUP_TIMEOUT_S", 100))
+def _get_environ(item: str, old_val: Any):
+    env_var = f"RXGB_{item}"
+    new_val = old_val
+    if env_var in os.environ:
+        new_val_str = os.environ.get(env_var)
+
+        if isinstance(old_val, bool):
+            new_val = bool(int(new_val_str))
+        elif isinstance(old_val, int):
+            new_val = int(new_val_str)
+        elif isinstance(old_val, float):
+            new_val = float(new_val_str)
+        else:
+            new_val = new_val_str
+
+    return new_val
+
+
+@dataclass
+class _XGBoostEnv:
+    # Whether to use SPREAD placement group strategy for training.
+    USE_SPREAD_STRATEGY: bool = True
+
+    # How long to wait for placement group creation before failing.
+    PLACEMENT_GROUP_TIMEOUT_S: int = 100
+
+    # Status report frequency when waiting for initial actors
+    # and during training
+    STATUS_FREQUENCY_S: int = 30
+
+    # Whether restarting failed actors is disabled
+    ELASTIC_RESTART_DISABLED: bool = False
+
+    # How often to check for new available resources
+    ELASTIC_RESTART_RESOURCE_CHECK_S: int = 30
 
-# Status report frequency when waiting for initial actors and during training
-STATUS_FREQUENCY_S = int(os.getenv("RXGB_STATUS_FREQUENCY_S", 30))
+    # How long to wait before triggering a new start of the training loop
+    # when new actors become available
+    ELASTIC_RESTART_GRACE_PERIOD_S: int = 10
 
-# If restarting failed actors is disabled
-ELASTIC_RESTART_DISABLED = bool(
-    int(os.getenv("RXGB_ELASTIC_RESTART_DISABLED", 0)))
+    def __getattribute__(self, item):
+        old_val = super(_XGBoostEnv, self).__getattribute__(item)
+        new_val = _get_environ(item, old_val)
+        if new_val != old_val:
+            setattr(self, item, new_val)
+        return super(_XGBoostEnv, self).__getattribute__(item)
 
-# How often to check for new available resources
-ELASTIC_RESTART_RESOURCE_CHECK_S = int(
-    os.getenv("RXGB_ELASTIC_RESTART_RESOURCE_CHECK_S", 30))
 
-# How long to wait before triggering a new start of the training loop
-# when new actors become available
-ELASTIC_RESTART_GRACE_PERIOD_S = int(
-    os.getenv("RXGB_ELASTIC_RESTART_GRACE_PERIOD_S", 10))
+ENV = _XGBoostEnv()
 
 xgboost_version = xgb.__version__ if xgb else "0.0.0"
 
@@ -138,22 +173,32 @@ def _is_client_connected() -> bool:
         return False
 
 
-class _RabitTracker(RabitTracker):
+class _RabitTrackerCompatMixin:
+    """Map new worker-named tracker calls to the legacy slave-named ones."""
+
+    def accept_workers(self, n_workers: int):
+        return self.accept_slaves(n_workers)
+
+    def worker_envs(self):
+        return self.slave_envs()
+
+
+class _RabitTracker(RabitTracker, _RabitTrackerCompatMixin):
     """
     This method overwrites the xgboost-provided RabitTracker to switch
     from a daemon thread to a multiprocessing Process. This is so that
     we are able to terminate/kill the tracking process at will.
     """
 
-    def start(self, nslave):
+    def start(self, nworker):
         # TODO: refactor RabitTracker to support spawn process creation.
         # In python 3.8, spawn is used as default process creation on macOS.
         # But spawn doesn't work because `run` is not pickleable.
         # For now we force the start method to use fork.
         multiprocessing.set_start_method("fork", force=True)
 
         def run():
-            self.accept_slaves(nslave)
+            self.accept_workers(nworker)
 
         self.thread = multiprocessing.Process(target=run, args=())
         self.thread.start()
@@ -178,10 +223,10 @@ def _start_rabit_tracker(num_workers: int):
 
     env = {"DMLC_NUM_WORKER": num_workers}
 
-    rabit_tracker = _RabitTracker(hostIP=host, nslave=num_workers)
+    rabit_tracker = _RabitTracker(host, num_workers)
 
     # Get tracker Host + IP
-    env.update(rabit_tracker.slave_envs())
+    env.update(rabit_tracker.worker_envs())
     rabit_tracker.start(num_workers)
 
     logger.debug(
@@ -704,7 +749,7 @@ def _create_actor(
 
 def _trigger_data_load(actor, dtrain, evals):
     wait_load = [actor.load_data.remote(dtrain)]
-    for deval, name in evals:
+    for deval, _name in evals:
         wait_load.append(actor.load_data.remote(deval))
     return wait_load
 
@@ -778,7 +823,7 @@ def _create_placement_group(cpus_per_actor, gpus_per_actor,
     pg = placement_group(bundles, strategy=strategy)
     # Wait for placement group to get created.
     logger.debug("Waiting for placement group to start.")
-    ready, _ = ray.wait([pg.ready()], timeout=PLACEMENT_GROUP_TIMEOUT_S)
+    ready, _ = ray.wait([pg.ready()], timeout=ENV.PLACEMENT_GROUP_TIMEOUT_S)
     if ready:
         logger.debug("Placement group has started.")
     else:
@@ -955,7 +1000,7 @@ def handle_actor_failure(actor_id):
         # Construct list before calling any() to force evaluation
         ready_states = [task.is_ready() for task in prepare_actor_tasks]
         while not all(ready_states):
-            if time.time() >= last_status + STATUS_FREQUENCY_S:
+            if time.time() >= last_status + ENV.STATUS_FREQUENCY_S:
                 wait_time = time.time() - start_wait
                 logger.info(f"Waiting until actors are ready "
                             f"({wait_time:.0f} seconds passed).")
@@ -1029,7 +1074,7 @@ def handle_actor_failure(actor_id):
                     callback_returns=callback_returns)
 
             if ray_params.elastic_training \
-                    and not ELASTIC_RESTART_DISABLED:
+                    and not ENV.ELASTIC_RESTART_DISABLED:
                 _maybe_schedule_new_actors(
                     training_state=_training_state,
                     num_cpus_per_actor=cpus_per_actor,
@@ -1041,7 +1086,7 @@ def handle_actor_failure(actor_id):
                 # This may raise RayXGBoostActorAvailable
                 _update_scheduled_actor_states(_training_state)
 
-            if time.time() >= last_status + STATUS_FREQUENCY_S:
+            if time.time() >= last_status + ENV.STATUS_FREQUENCY_S:
                 wait_time = time.time() - start_wait
                 logger.info(f"Training in progress "
                             f"({wait_time:.0f} seconds since last restart).")
@@ -1290,7 +1335,7 @@ def _wrapped(*args, **kwargs):
     if not dtrain.loaded and not dtrain.distributed:
         dtrain.load_data(ray_params.num_actors)
 
-    for (deval, name) in evals:
+    for (deval, _name) in evals:
         if not deval.has_label:
             raise ValueError(
                 "Evaluation data has no label set. Please make sure to set "
@@ -1321,7 +1366,7 @@ def _wrapped(*args, **kwargs):
                 placement_strategy = None
             else:
                 placement_strategy = "PACK"
-        elif bool(_USE_SPREAD_STRATEGY):
+        elif bool(ENV.USE_SPREAD_STRATEGY):
             placement_strategy = "SPREAD"
 
     if placement_strategy is not None:
diff --git a/xgboost_ray/tests/test_colocation.py b/xgboost_ray/tests/test_colocation.py
@@ -176,9 +176,10 @@ def inner_func(config):
                 num_samples=1,
             )
 
-    @patch("xgboost_ray.main.PLACEMENT_GROUP_TIMEOUT_S", 5)
     def test_timeout(self):
         """Checks that an error occurs when placement group setup times out."""
+        os.environ["RXGB_PLACEMENT_GROUP_TIMEOUT_S"] = "5"
+
         with self.ray_start_cluster() as cluster:
             ray.init(address=cluster.address)
 
diff --git a/xgboost_ray/tests/test_fault_tolerance.py b/xgboost_ray/tests/test_fault_tolerance.py
@@ -33,6 +33,9 @@ class XGBoostRayFaultToleranceTest(unittest.TestCase):
     """
 
     def setUp(self):
+        # Reset to the default ("0") so each test starts with restarts enabled
+        os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "0"
+
         repeat = 8  # Repeat data a couple of times for stability
         self.x = np.array([
             [1, 0, 0, 0],  # Feature 0 -> Label 0
@@ -107,9 +110,10 @@ def keep(actors, *args, **kwargs):
         # Two workers finished, so N=32
         self.assertEqual(additional_results["total_n"], 32)
 
-    @patch("xgboost_ray.main.ELASTIC_RESTART_DISABLED", True)
     def testTrainingContinuationElasticKilled(self):
         """This should continue after one actor died."""
+        os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "1"
+
         logging.getLogger().setLevel(10)
 
         additional_results = {}
@@ -148,7 +152,6 @@ def keep(actors, *args, **kwargs):
         # Only one worker finished, so n=16
         self.assertEqual(additional_results["total_n"], 16)
 
-    @patch("xgboost_ray.main.ELASTIC_RESTART_DISABLED", False)
     def testTrainingContinuationElasticKilledRestarted(self):
         """This should continue after one actor died and restart it."""
         logging.getLogger().setLevel(10)
@@ -201,9 +204,10 @@ def keep(actors, *args, **kwargs):
         # Both workers finished, so n=32
         self.assertEqual(additional_results["total_n"], 32)
 
-    @patch("xgboost_ray.main.ELASTIC_RESTART_DISABLED", True)
     def testTrainingContinuationElasticMultiKilled(self):
         """This should still show 20 boost rounds after two failures."""
+        os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "1"
+
         logging.getLogger().setLevel(10)
 
         additional_results = {}
@@ -232,9 +236,9 @@ def testTrainingContinuationElasticMultiKilled(self):
         self.assertSequenceEqual(list(self.y), list(pred_y))
         print(f"Got correct predictions: {pred_y}")
 
-    @patch("xgboost_ray.main.ELASTIC_RESTART_DISABLED", True)
     def testTrainingContinuationElasticFailed(self):
         """This should continue after one actor failed training."""
+        os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "1"
 
         additional_results = {}
         keep_actors = {}
@@ -285,6 +289,8 @@ def testTrainingStop(self):
 
     def testTrainingStopElastic(self):
         """This should now stop training after one actor died."""
+        os.environ["RXGB_ELASTIC_RESTART_DISABLED"] = "0"
+
         # The `train()` function raises a RuntimeError
         ft_manager = FaultToleranceManager.remote()
 
@@ -419,8 +425,6 @@ def testSameResultWithAndWithoutError(self):
     @patch("xgboost_ray.main._PrepareActorTask", _FakeTask)
     @patch("xgboost_ray.elastic._PrepareActorTask", _FakeTask)
     @patch("xgboost_ray.main._RemoteRayXGBoostActor", MagicMock)
-    @patch("xgboost_ray.main.ELASTIC_RESTART_GRACE_PERIOD_S", 30)
-    @patch("xgboost_ray.elastic.ELASTIC_RESTART_GRACE_PERIOD_S", 30)
     def testMaybeScheduleNewActors(self):
         """Test scheduling of new actors if resources become available.
 
@@ -436,6 +440,8 @@ def testMaybeScheduleNewActors(self):
         from xgboost_ray.elastic import _update_scheduled_actor_states
         from xgboost_ray.elastic import _maybe_schedule_new_actors
 
+        os.environ["RXGB_ELASTIC_RESTART_GRACE_PERIOD_S"] = "30"
+
         # Three actors are dead
         actors = [
             MagicMock(), None,
@@ -520,7 +526,7 @@ def fake_create_actor(rank, *args, **kwargs):
             # actor.
             _update_scheduled_actor_states(training_state=state)
 
-            # Grace period is set through ELASTIC_RESTART_GRACE_PERIOD_S
+            # Grace period is set through ENV.ELASTIC_RESTART_GRACE_PERIOD_S
             # Allow for some slack in test execution
             self.assertGreaterEqual(state.restart_training_at,
                                     time.time() + 22)
diff --git a/xgboost_ray/tests/test_sklearn_matrix.py b/xgboost_ray/tests/test_sklearn_matrix.py
@@ -43,22 +43,21 @@ def testClassifier(self, n_class=2):
         train_matrix = RayDMatrix(X_train, y_train)
         test_matrix = RayDMatrix(X_test, y_test)
 
-        with self.assertRaisesRegex(ValueError, "use_label_encoder"):
+        with self.assertRaisesRegex(Exception, "use_label_encoder"):
             RayXGBClassifier(
                 use_label_encoder=True, **self.params).fit(train_matrix, None)
 
-        with self.assertRaisesRegex(ValueError, "num_class"):
+        with self.assertRaisesRegex(Exception, "num_class"):
             RayXGBClassifier(
                 use_label_encoder=False, **self.params).fit(
                     train_matrix, None)
 
-        with self.assertRaisesRegex(ValueError,
-                                    r"must be \(RayDMatrix, str\)"):
+        with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"):
             RayXGBClassifier(
                 use_label_encoder=False, **self.params).fit(
                     train_matrix, None, eval_set=[(X_test, y_test)])
 
-        with self.assertRaisesRegex(ValueError,
+        with self.assertRaisesRegex(Exception,
                                     r"must be \(array_like, array_like\)"):
             RayXGBClassifier(
                 use_label_encoder=False, **self.params).fit(
@@ -97,15 +96,14 @@ def testClassifierLegacy(self, n_class=2):
         train_matrix = RayDMatrix(X_train, y_train)
         test_matrix = RayDMatrix(X_test, y_test)
 
-        with self.assertRaisesRegex(ValueError, "num_class"):
+        with self.assertRaisesRegex(Exception, "num_class"):
             RayXGBClassifier(**self.params).fit(train_matrix, None)
 
-        with self.assertRaisesRegex(ValueError,
-                                    r"must be \(RayDMatrix, str\)"):
+        with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"):
             RayXGBClassifier(**self.params).fit(
                 train_matrix, None, eval_set=[(X_test, y_test)])
 
-        with self.assertRaisesRegex(ValueError,
+        with self.assertRaisesRegex(Exception,
                                     r"must be \(array_like, array_like\)"):
             RayXGBClassifier(**self.params).fit(
                 X_train, y_train, eval_set=[(test_matrix, "eval")])
@@ -140,12 +138,11 @@ def testRegressor(self):
         train_matrix = RayDMatrix(X_train, y_train)
         test_matrix = RayDMatrix(X_test, y_test)
 
-        with self.assertRaisesRegex(ValueError,
-                                    r"must be \(RayDMatrix, str\)"):
+        with self.assertRaisesRegex(Exception, r"must be \(RayDMatrix, str\)"):
             RayXGBRegressor(**self.params).fit(
                 train_matrix, None, eval_set=[(X_test, y_test)])
 
-        with self.assertRaisesRegex(ValueError,
+        with self.assertRaisesRegex(Exception,
                                     r"must be \(array_like, array_like\)"):
             RayXGBRegressor(**self.params).fit(
                 X_train, y_train, eval_set=[(test_matrix, "eval")])