
Commit cad8c3e

richardliaw and Kai Fricke authored
Reduce verbosity for ray events (#226)
Adds a verbose flag to RayParams to toggle events. Defaults to False, but open to discuss. The main thing is that when using AIR/Tune, this logging becomes quite verbose.

Signed-off-by: Richard Liaw <rliaw@berkeley.edu>
Co-authored-by: Kai Fricke <kai@anyscale.com>
1 parent 08f3bc1 commit cad8c3e
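
For context, a minimal usage sketch of the new flag, adapted from the project's basic training example; the dataset and XGBoost parameters here are illustrative and not part of this change:

```python
from sklearn.datasets import load_breast_cancer

from xgboost_ray import RayDMatrix, RayParams, train

# Illustrative data; only the `verbose` flag below comes from this commit.
train_x, train_y = load_breast_cancer(return_X_y=True)
train_set = RayDMatrix(train_x, train_y)

bst = train(
    {"objective": "binary:logistic", "eval_metric": "logloss"},
    train_set,
    num_boost_round=10,
    ray_params=RayParams(
        num_actors=2,
        cpus_per_actor=1,
        # False silences the "[RayXGBoost] ..." info messages; left as None,
        # it is derived automatically (quiet inside Tune sessions).
        verbose=False,
    ),
)
```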

1 file changed: +38 −10

xgboost_ray/main.py

Lines changed: 38 additions & 10 deletions
@@ -370,6 +370,8 @@ class RayParams:
             Defaults to 0 (no retries). Set to -1 for unlimited retries.
         checkpoint_frequency (int): How often to save checkpoints. Defaults
             to ``5`` (every 5th iteration).
+        verbose (bool): Whether to output Ray-specific info messages
+            during training/prediction.
     """
     # Actor scheduling
     num_actors: int = 0
@@ -386,6 +388,8 @@ class RayParams:
     # Distributed callbacks
     distributed_callbacks: Optional[List[DistributedCallback]] = None
 
+    verbose: Optional[bool] = None
+
     def get_tune_resources(self):
         """Return the resources to use for xgboost_ray training with Tune."""
         if self.cpus_per_actor <= 0 or self.num_actors <= 0:
@@ -426,6 +430,9 @@ def _validate_ray_params(ray_params: Union[None, RayParams, dict]) \
         warnings.warn(
             f"`num_actors` in `ray_params` is smaller than 2 "
             f"({ray_params.num_actors}). XGBoost will NOT be distributed!")
+    if ray_params.verbose is None:
+        # In Tune sessions, reduce verbosity
+        ray_params.verbose = not is_session_enabled()
     return ray_params
 
 
@@ -930,6 +937,9 @@ def _train(params: Dict,
     from xgboost_ray.elastic import _maybe_schedule_new_actors, \
         _update_scheduled_actor_states, _get_actor_alive_status
 
+    # Do not modify original parameters
+    params = params.copy()
+
     # Un-schedule possible scheduled restarts
     _training_state.restart_training_at = None
 
@@ -944,6 +954,13 @@ def _train(params: Dict,
         params["nthread"] = cpus_per_actor
         params["n_jobs"] = cpus_per_actor
 
+    if ray_params.verbose:
+        maybe_log = logger.info
+        params.setdefault("verbosity", 1)
+    else:
+        maybe_log = logger.debug
+        params.setdefault("verbosity", 0)
+
     # This is a callback that handles actor failures.
     # We identify the rank of the failed actor, add this to a set of
     # failed actors (which we might want to restart later), and set its
@@ -979,9 +996,10 @@ def handle_actor_failure(actor_id):
                 newly_created += 1
 
     alive_actors = sum(1 for a in _training_state.actors if a is not None)
-    logger.info(f"[RayXGBoost] Created {newly_created} new actors "
-                f"({alive_actors} total actors). Waiting until actors "
-                f"are ready for training.")
+
+    maybe_log(f"[RayXGBoost] Created {newly_created} new actors "
+              f"({alive_actors} total actors). Waiting until actors "
+              f"are ready for training.")
 
     # For distributed datasets (e.g. Modin), this will initialize
     # (and fix) the assignment of data shards to actor ranks
@@ -1024,7 +1042,7 @@ def handle_actor_failure(actor_id):
         _get_actor_alive_status(_training_state.actors, handle_actor_failure)
         raise RayActorError from exc
 
-    logger.info("[RayXGBoost] Starting XGBoost training.")
+    maybe_log("[RayXGBoost] Starting XGBoost training.")
 
     # Start Rabit tracker for gradient sharing
     rabit_process, env = _start_rabit_tracker(alive_actors)
@@ -1515,10 +1533,15 @@ def _wrapped(*args, **kwargs):
     train_additional_results["training_time_s"] = total_training_time
     train_additional_results["total_time_s"] = total_time
 
-    logger.info("[RayXGBoost] Finished XGBoost training on training data "
-                "with total N={total_n:,} in {total_time_s:.2f} seconds "
-                "({training_time_s:.2f} pure XGBoost training time).".format(
-                    **train_additional_results))
+    if ray_params.verbose:
+        maybe_log = logger.info
+    else:
+        maybe_log = logger.debug
+
+    maybe_log("[RayXGBoost] Finished XGBoost training on training data "
+              "with total N={total_n:,} in {total_time_s:.2f} seconds "
+              "({training_time_s:.2f} pure XGBoost training time).".format(
+                  **train_additional_results))
 
     _shutdown(
         actors=actors,
@@ -1540,6 +1563,11 @@ def _predict(model: xgb.Booster, data: RayDMatrix, ray_params: RayParams,
              **kwargs):
     _assert_ray_support()
 
+    if ray_params.verbose:
+        maybe_log = logger.info
+    else:
+        maybe_log = logger.debug
+
     if not ray.is_initialized():
         ray.init()
 
@@ -1555,7 +1583,7 @@ def _predict(model: xgb.Booster, data: RayDMatrix, ray_params: RayParams,
             distributed_callbacks=ray_params.distributed_callbacks)
         for i in range(ray_params.num_actors)
     ]
-    logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")
+    maybe_log(f"[RayXGBoost] Created {len(actors)} remote actors.")
 
     # Split data across workers
     wait_load = []
@@ -1572,7 +1600,7 @@ def _predict(model: xgb.Booster, data: RayDMatrix, ray_params: RayParams,
     # Put model into object store
     model_ref = ray.put(model)
 
-    logger.info("[RayXGBoost] Starting XGBoost prediction.")
+    maybe_log("[RayXGBoost] Starting XGBoost prediction.")
 
     # Train
     fut = [actor.predict.remote(model_ref, data, **kwargs) for actor in actors]
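
The same flag governs the prediction path; a short sketch under the same assumptions, reusing the booster `bst` and the data from the training sketch above:

```python
from xgboost_ray import RayDMatrix, RayParams, predict

# Keep the info-level "[RayXGBoost] ..." messages during prediction.
preds = predict(
    bst,
    RayDMatrix(train_x),
    ray_params=RayParams(num_actors=2, verbose=True),
)
```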
