From 9f24fd9f689a5aeadd99b356e6322bf77d43e70f Mon Sep 17 00:00:00 2001 From: Sofian Hnaide <103539032+sofianhnaide@users.noreply.github.com> Date: Tue, 17 Sep 2024 18:14:36 -0700 Subject: [PATCH] [Data] Re-phrase the streaming executor current usage string (#47515) ## Why are these changes needed? The progress bar for ray data could still end up showing higher utilization of what the cluster currently have. https://github.com/ray-project/ray/pull/46729 was the first attempt to fix it which addressed the issue in static clusters, but we still have that issue for clusters that autoscales. This change simply rephrase the string so it is less confusing. Before image After image This comes from the fact that operators don't track the task state (and currently ray core does not even provide that api). Which means Ray data operators does not know if the task is assigned to a node or not, so once the task is submitted to ray it is marked active even if it is pending a node assignment. The dashboard does better here since it does have extra information from the task. image In the future we can visit adding the core api for remote state reporting and allowing operators to provide more detailed state (active, pending_scheduled, pending_node_assignment). ## Related issue number ## Checks - [ ] I've signed off every commit(by using the -s flag, i.e., `git commit -s`) in this PR. - [ ] I've run `scripts/format.sh` to lint the changes in this PR. - [ ] I've included any doc changes needed for https://docs.ray.io/en/master/. - [ ] I've added any new APIs to the API Reference. For example, if I added a method in Tune, I've added it in `doc/source/tune/api/` under the corresponding `.rst` file. - [ ] I've made sure the tests are passing. Note that there might be a few flaky tests, see the recent failures at https://flakey-tests.ray.io/ - Testing Strategy - [ ] Unit tests - [ ] Release tests - [ ] This PR is not tested :( --------- Signed-off-by: Sofian Hnaide <103539032+sofianhnaide@users.noreply.github.com> Co-authored-by: scottjlee Co-authored-by: matthewdeng Signed-off-by: ujjawal-khare --- .../data/_internal/execution/streaming_executor.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/python/ray/data/_internal/execution/streaming_executor.py b/python/ray/data/_internal/execution/streaming_executor.py index 238f6f9421cc..2bf7d185961a 100644 --- a/python/ray/data/_internal/execution/streaming_executor.py +++ b/python/ray/data/_internal/execution/streaming_executor.py @@ -363,9 +363,14 @@ def _report_current_usage(self) -> None: pending_usage = self._resource_manager.get_global_pending_usage() limits = self._resource_manager.get_global_limits() resources_status = ( - # TODO(scottjlee): Add dataset name/ID to progress bar output. - "Running Dataset. Active & requested resources: " - f"{running_usage.cpu:.4g}/{limits.cpu:.4g} CPU, " + "Active & requested resources: " + f"{running_usage.cpu:.4g} of {limits.cpu:.4g} available CPU, " + f"{running_usage.gpu:.4g} of {limits.gpu:.4g} available GPU, " + f"{running_usage.object_store_memory_str()} of " + f"{limits.object_store_memory_str()} available object_store_memory " + "(pending: " + f"{pending_usage.cpu:.4g} CPU, " + f"{pending_usage.gpu:.4g} GPU)" ) if running_usage.gpu > 0: resources_status += f"{running_usage.gpu:.4g}/{limits.gpu:.4g} GPU, "