Skip to content

Commit

Permalink
[serve] Use controller namespace when getting actors (#23896)
Browse files Browse the repository at this point in the history
Serve gets actors using the current Ray namespace. However, the Ray namespace and the controller namespace may not match when using the `_override_controller_namespace` argument in `serve.start()`. This change ensures that the `get_actor()` calls in `ActorReplicaWrapper` use the controller namespace. This also allows `num_replicas` to be scaled up and down properly when using `_override_controller_namespace`.
  • Loading branch information
shrekris-anyscale authored Apr 13, 2022
1 parent e3bd598 commit f400c20
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 3 deletions.
12 changes: 9 additions & 3 deletions python/ray/serve/deployment_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,9 @@ def graceful_stop(self) -> Duration:
Returns the timeout after which to kill the actor.
"""
try:
handle = ray.get_actor(self._actor_name)
handle = ray.get_actor(
self._actor_name, namespace=self._controller_namespace
)
self._graceful_shutdown_ref = handle.prepare_for_shutdown.remote()
except ValueError:
pass
Expand All @@ -441,7 +443,9 @@ def graceful_stop(self) -> Duration:
def check_stopped(self) -> bool:
"""Check if the actor has exited."""
try:
handle = ray.get_actor(self._actor_name)
handle = ray.get_actor(
self._actor_name, namespace=self._controller_namespace
)
stopped = self._check_obj_ref_ready(self._graceful_shutdown_ref)
if stopped:
ray.kill(handle, no_restart=True)
Expand Down Expand Up @@ -573,7 +577,9 @@ def check_health(self) -> bool:
def force_stop(self):
    """Force the actor to exit without shutting down gracefully.

    The actor handle is looked up by name in the controller namespace
    (``self._controller_namespace``) rather than the caller's current Ray
    namespace. The two can differ when Serve was started with
    ``_override_controller_namespace``.
    """
    try:
        # Only the namespace-aware lookup is kept: a lookup in the current
        # Ray namespace would miss the actor under an overridden namespace
        # and skip the kill entirely.
        ray.kill(
            ray.get_actor(self._actor_name, namespace=self._controller_namespace)
        )
    except ValueError:
        # Best effort: ray.get_actor raises ValueError when no actor with
        # this name exists, in which case there is nothing left to kill.
        pass

Expand Down
33 changes: 33 additions & 0 deletions python/ray/serve/tests/test_standalone2.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,39 @@ def f(*args):
serve.shutdown()


@pytest.mark.parametrize("detached", [True, False])
def test_update_num_replicas_with_overriden_namespace(shutdown_ray, detached):
    """Scale a deployment up and down under an overridden controller namespace.

    Ray is initialised in one namespace while Serve is started with a
    different ``_override_controller_namespace``. The deployment is first
    created with 2 replicas, then redeployed with 4 and finally with 1; after
    each redeploy the total number of named actors (across all namespaces)
    must change by exactly the difference in replica count.
    """

    def count_named_actors():
        # Count across every namespace so that actors living in the
        # controller namespace are included.
        return len(ray.util.list_named_actors(all_namespaces=True))

    ray.init(namespace="ray_namespace")
    serve.start(
        detached=detached,
        _override_controller_namespace="controller_namespace",
    )

    @serve.deployment(num_replicas=2)
    def f(*args):
        return "got f"

    f.deploy()

    # Baseline taken while the deployment has 2 replicas.
    baseline = count_named_actors()

    # 2 -> 4 replicas: exactly two additional actors are expected.
    f.options(num_replicas=4).deploy()
    assert count_named_actors() == baseline + 2

    # 4 -> 1 replica: one actor fewer than the 2-replica baseline.
    f.options(num_replicas=1).deploy()
    assert count_named_actors() == baseline - 1

    serve.shutdown()


@pytest.mark.parametrize("detached", [True, False])
def test_refresh_controller_after_death(shutdown_ray, detached):
"""Check if serve.start() refreshes the controller handle if it's dead."""
Expand Down

0 comments on commit f400c20

Please sign in to comment.