[tune] Better error message for Tune nested tasks / actors #25241
@@ -3,9 +3,17 @@
import os
import logging
import traceback
from typing import Dict, Optional, Set

import ray
from ray.util.debug import log_once
from ray.util.annotations import PublicAPI, DeveloperAPI
from ray.util.placement_group import _valid_resource_shape
from ray.util.scheduling_strategies import (
    SchedulingStrategyT,
    PlacementGroupSchedulingStrategy,
)
from ray.tune.error import TuneError

logger = logging.getLogger(__name__)


@@ -67,9 +75,84 @@ def init(reporter, ignore_reinit_error=True):
            "Most session commands will have no effect."
        )

    # Setup hooks for generating placement group resource deadlock warnings.
    from ray import actor, remote_function

    if "TUNE_DISABLE_RESOURCE_CHECKS" not in os.environ:
Review thread on this check:

Reviewer: Do we want to add to …
Reviewer: Would be great if you could add this to the docs. Also
  Suggested change
  to follow convention with other tune env variables
Author: I'd like to keep this a hidden / internal flag only, that's not documented. It should only be used for internal debugging.
Reviewer: Ok with me
        actor._actor_launch_hook = tune_task_and_actor_launch_hook
        remote_function._task_launch_hook = tune_task_and_actor_launch_hook

    _session = reporter


# Cache of resource dicts that have been checked by the launch hook already.
_checked_resources: Set[frozenset] = set()


def tune_task_and_actor_launch_hook(
    fn, resources: Dict[str, float], strategy: Optional[SchedulingStrategyT]
):
    """Launch hook to catch nested tasks that can't fit in the placement group.

    This gives users a nice warning in case they launch a nested task in a Tune trial
    without reserving resources in the trial placement group to fit it.
    """

    # Already checked, skip for performance reasons.
    key = frozenset({(k, v) for k, v in resources.items() if v > 0})
    if not key or key in _checked_resources:
        return

    # No need to check if placement group is None.
    if (
        not isinstance(strategy, PlacementGroupSchedulingStrategy)
        or strategy.placement_group is None
    ):
        return

    # Check if the resource request is targeting the current placement group.
    cur_pg = ray.util.get_current_placement_group()
    if not cur_pg or strategy.placement_group.id != cur_pg.id:
        return

    _checked_resources.add(key)

    # Check if the request can be fulfilled by the current placement group.
    pgf = get_trial_resources()

    if pgf.head_bundle_is_empty:
        available_bundles = cur_pg.bundle_specs[0:]
    else:
        available_bundles = cur_pg.bundle_specs[1:]

    # Check if the request can be fulfilled by the current placement group.
    if _valid_resource_shape(resources, available_bundles):
        return

    if fn.class_name:
        submitted = "actor"
        name = fn.module_name + "." + fn.class_name + "." + fn.function_name
    else:
        submitted = "task"
        name = fn.module_name + "." + fn.function_name

    # Normalize the resource spec so it looks the same as the placement group bundle.
    main_resources = cur_pg.bundle_specs[0]
    resources = {k: float(v) for k, v in resources.items() if v > 0}

    raise TuneError(
        f"No trial resources are available for launching the {submitted} `{name}`. "
        "To resolve this, specify the Tune option:\n\n"
        "> resources_per_trial=tune.PlacementGroupFactory(\n"
        f"> [{main_resources}] + [{resources}] * N\n"
        "> )\n\n"
        f"Where `N` is the number of slots to reserve for trial {submitted}s. "
        "If you are using a Ray training library, there might be a utility function "
        "to set this automatically for you. For more information, refer to "
        "https://docs.ray.io/en/latest/tune/tutorials/tune-resources.html"
    )


def shutdown():
    """Cleans up the trial and removes it from the global context."""
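A quick usage sketch (not part of the diff; it mirrors the new tests below) of how this check surfaces and how the suggested fix resolves it. A trainable that launches a nested 1-CPU task only has room for it if the trial's placement group reserves an extra bundle:

```python
import ray
from ray import tune


def train_fn(config):
    # The trainable itself runs in the trial's placement group.
    @ray.remote(num_cpus=1)
    def nested_step():
        return 1

    # Without a spare bundle in the trial placement group, the new launch hook
    # raises TuneError here instead of letting the task hang unschedulable.
    ray.get(nested_step.remote())


# One bundle for the trainable itself plus one slot for the nested task.
tune.run(
    train_fn,
    resources_per_trial=tune.PlacementGroupFactory([{"CPU": 1}] * 2),
)
```

The `TUNE_DISABLE_RESOURCE_CHECKS` environment variable checked in `init()` acts as an escape hatch: if it is set to any value, the launch hooks are not installed. Per the review thread above, it is intentionally kept as an internal, undocumented debugging flag.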
@@ -0,0 +1,143 @@
import pytest

import ray
from ray import tune
from ray.data.context import DatasetContext
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from ray.tune.error import TuneError


def test_nowarn_zero_cpu():
    def f(*a):
        @ray.remote(num_cpus=0)
        def f():
            pass

        @ray.remote(num_cpus=0)
        class Actor:
            def f(self):
                pass

        ray.get(f.remote())
        a = Actor.remote()
        ray.get(a.f.remote())

    tune.run(f, verbose=0)


def test_warn_cpu():
    def f(*a):
        @ray.remote(num_cpus=1)
        def f():
            pass

        ray.get(f.remote())

    with pytest.raises(TuneError):
        tune.run(f, verbose=0)

    with pytest.raises(TuneError):
        tune.run(
            f, resources_per_trial=tune.PlacementGroupFactory([{"CPU": 1}]), verbose=0
        )

    def g(*a):
        @ray.remote(num_cpus=1)
        class Actor:
            def f(self):
                pass

        a = Actor.remote()
        ray.get(a.f.remote())

    with pytest.raises(TuneError):
        tune.run(g, verbose=0)

    with pytest.raises(TuneError):
        tune.run(
            g, resources_per_trial=tune.PlacementGroupFactory([{"CPU": 1}]), verbose=0
        )


def test_pg_slots_ok():
    def f(*a):
        @ray.remote(num_cpus=1)
        def f():
            pass

        @ray.remote(num_cpus=1)
        class Actor:
            def f(self):
                pass

        ray.get(f.remote())
        a = Actor.remote()
        ray.get(a.f.remote())

    tune.run(
        f, resources_per_trial=tune.PlacementGroupFactory([{"CPU": 1}] * 2), verbose=0
    )


def test_bad_pg_slots():
    def f(*a):
        @ray.remote(num_cpus=2)
        def f():
            pass

        ray.get(f.remote())

    with pytest.raises(TuneError):
        tune.run(
            f,
            resources_per_trial=tune.PlacementGroupFactory([{"CPU": 1}] * 2),
            verbose=0,
        )


def test_dataset_ok():
    def f(*a):
        ray.data.range(10).show()

    tune.run(f, verbose=0)

    def g(*a):
        ctx = DatasetContext.get_current()
        ctx.scheduling_strategy = PlacementGroupSchedulingStrategy(
            ray.util.get_current_placement_group()
        )
        ray.data.range(10).show()

    with pytest.raises(TuneError):
        tune.run(g, verbose=0)

    tune.run(
        g, resources_per_trial=tune.PlacementGroupFactory([{"CPU": 1}] * 2), verbose=0
    )


def test_scheduling_strategy_override():
Reviewer: Hmmm I got lost here. Can you add a few comments about what this is trying to capture?
Author: Done.
||
def f(*a): | ||
@ray.remote(num_cpus=1, scheduling_strategy="SPREAD") | ||
def f(): | ||
pass | ||
|
||
@ray.remote(num_cpus=1, scheduling_strategy="SPREAD") | ||
class Actor: | ||
def f(self): | ||
pass | ||
|
||
# SPREAD tasks are not captured by placement groups, so don't warn. | ||
ray.get(f.remote()) | ||
|
||
# SPREAD actors are not captured by placement groups, so don't warn. | ||
a = Actor.remote() | ||
ray.get(a.f.remote()) | ||
|
||
tune.run(f, verbose=0) | ||
|
||
|
||
if __name__ == "__main__": | ||
import sys | ||
|
||
sys.exit(pytest.main(["-v", __file__])) |
Review comment: If CPU == 0 / empty, fall back to 1 instead of parallelism (too large).
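This comment refers to logic outside the hunks shown in this diff. As a purely hypothetical sketch of the intent (the helper name and surrounding code are assumptions, not taken from the PR):

```python
def _pick_parallelism(bundles, default_parallelism):
    # Hypothetical helper, not from the PR: illustrates only the fallback
    # described in the review comment above.
    cpus = sum(b.get("CPU", 0) for b in bundles)
    if not cpus:
        # CPU == 0 or no bundles: fall back to 1 instead of the
        # (too large) default parallelism.
        return 1
    return int(cpus)
```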