Reduce verbosity for ray events #226

Merged
merged 4 commits on Aug 15, 2022
Changes from 3 commits
30 changes: 20 additions & 10 deletions xgboost_ray/main.py
@@ -366,6 +366,8 @@ class RayParams:
Defaults to 0 (no retries). Set to -1 for unlimited retries.
checkpoint_frequency (int): How often to save checkpoints. Defaults
to ``5`` (every 5th iteration).
+verbose (bool): Whether to output Ray-specific info messages
+    during training/prediction.
"""
# Actor scheduling
num_actors: int = 0
@@ -382,6 +384,8 @@ class RayParams:
# Distributed callbacks
distributed_callbacks: Optional[List[DistributedCallback]] = None

+verbose: bool = False

def get_tune_resources(self):
"""Return the resources to use for xgboost_ray training with Tune."""
if self.cpus_per_actor <= 0 or self.num_actors <= 0:
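For context, the new flag is opt-in. A minimal usage sketch, assuming this branch of xgboost_ray; the dataset, actor count, and XGBoost parameters below are illustrative only:

# Sketch: enable the Ray-specific info messages that are now silenced by default.
from sklearn.datasets import load_breast_cancer
from xgboost_ray import RayDMatrix, RayParams, train

X, y = load_breast_cancer(return_X_y=True)
dtrain = RayDMatrix(X, y)

# verbose=True restores the "[RayXGBoost] ..." log lines gated in the hunks below.
ray_params = RayParams(num_actors=2, cpus_per_actor=1, verbose=True)

booster = train(
    {"objective": "binary:logistic", "eval_metric": "logloss"},
    dtrain,
    ray_params=ray_params,
)

With verbose left at its new default of False, the same call runs without the actor-creation and timing messages shown below.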
@@ -977,9 +981,10 @@ def handle_actor_failure(actor_id):
newly_created += 1

alive_actors = sum(1 for a in _training_state.actors if a is not None)
logger.info(f"[RayXGBoost] Created {newly_created} new actors "
f"({alive_actors} total actors). Waiting until actors "
f"are ready for training.")
if ray_params.verbose:
logger.info(f"[RayXGBoost] Created {newly_created} new actors "
f"({alive_actors} total actors). Waiting until actors "
f"are ready for training.")

# For distributed datasets (e.g. Modin), this will initialize
# (and fix) the assignment of data shards to actor ranks
@@ -1022,7 +1027,8 @@ def handle_actor_failure(actor_id):
_get_actor_alive_status(_training_state.actors, handle_actor_failure)
raise RayActorError from exc

logger.info("[RayXGBoost] Starting XGBoost training.")
if ray_params.verbose:
logger.info("[RayXGBoost] Starting XGBoost training.")

# Start Rabit tracker for gradient sharing
rabit_process, env = _start_rabit_tracker(alive_actors)
@@ -1513,10 +1519,12 @@ def _wrapped(*args, **kwargs):
train_additional_results["training_time_s"] = total_training_time
train_additional_results["total_time_s"] = total_time

logger.info("[RayXGBoost] Finished XGBoost training on training data "
"with total N={total_n:,} in {total_time_s:.2f} seconds "
"({training_time_s:.2f} pure XGBoost training time).".format(
**train_additional_results))
if ray_params.verbose:
logger.info(
"[RayXGBoost] Finished XGBoost training on training data "
"with total N={total_n:,} in {total_time_s:.2f} seconds "
"({training_time_s:.2f} pure XGBoost training time).".format(
**train_additional_results))

_shutdown(
actors=actors,
@@ -1553,7 +1561,8 @@ def _predict(model: xgb.Booster, data: RayDMatrix, ray_params: RayParams,
distributed_callbacks=ray_params.distributed_callbacks)
for i in range(ray_params.num_actors)
]
logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")
if ray_params.verbose:
logger.info(f"[RayXGBoost] Created {len(actors)} remote actors.")

# Split data across workers
wait_load = []
@@ -1570,7 +1579,8 @@ def _predict(model: xgb.Booster, data: RayDMatrix, ray_params: RayParams,
# Put model into object store
model_ref = ray.put(model)

logger.info("[RayXGBoost] Starting XGBoost prediction.")
if ray_params.verbose:
logger.info("[RayXGBoost] Starting XGBoost prediction.")

# Predict
fut = [actor.predict.remote(model_ref, data, **kwargs) for actor in actors]
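The prediction path is gated the same way. A short sketch, again assuming this branch and reusing the booster and feature matrix X from the training sketch above:

from xgboost_ray import RayDMatrix, RayParams, predict

# With verbose=False (the new default), the "[RayXGBoost] Created ... remote actors."
# and "[RayXGBoost] Starting XGBoost prediction." messages are not logged.
preds = predict(
    booster,
    RayDMatrix(X),
    ray_params=RayParams(num_actors=2, verbose=False),
)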