Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[serve] Set status message if deployment pending for too long #25861

Merged
merged 22 commits into from
Jun 28, 2022
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
ff4e88f
Set message field of status if allocations/initializations have been …
zcin Jun 16, 2022
9eb7c87
format changes
zcin Jun 16, 2022
8d43a1b
Add unit test
zcin Jun 17, 2022
b9bdb4c
change comments
zcin Jun 17, 2022
a5e5b88
minor fixes
zcin Jun 17, 2022
885b722
Add unit test for checking that an unhealthy status should not be ove…
zcin Jun 21, 2022
4e0d28b
format changes
zcin Jun 21, 2022
44dff49
move unit tests from test_standalone2 to test_standalone
zcin Jun 21, 2022
a8fccba
Merge branch 'master' of https://github.com/ray-project/ray into upda…
zcin Jun 21, 2022
d5d85ab
fix bug
zcin Jun 21, 2022
b8fb9fe
increase timeout
zcin Jun 21, 2022
077e42d
change timeout
zcin Jun 21, 2022
f9c7a26
Merge branch 'master' of https://github.com/ray-project/ray into upda…
zcin Jun 22, 2022
b4aa1ab
Merge branch 'master' of https://github.com/ray-project/ray into upda…
zcin Jun 23, 2022
7096ce2
Merge branch 'master' of https://github.com/ray-project/ray into upda…
zcin Jun 24, 2022
de6cc2c
modify global variables through environment variables
zcin Jun 24, 2022
6dc874c
format
zcin Jun 24, 2022
e855501
Update python/ray/serve/tests/test_standalone.py
zcin Jun 27, 2022
6e0e76d
Update python/ray/serve/tests/test_standalone.py
zcin Jun 27, 2022
f900d89
Merge branch 'update_status_message' of https://github.com/zcin/ray i…
zcin Jun 27, 2022
d0bd51c
Merge branch 'master' of https://github.com/ray-project/ray into upda…
zcin Jun 27, 2022
6bcb8c6
change method of getting environment variables
zcin Jun 27, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 26 additions & 4 deletions python/ray/serve/deployment_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,10 @@ class ReplicaHealthCheckResponse(Enum):


CHECKPOINT_KEY = "serve-deployment-state-checkpoint"
SLOW_STARTUP_WARNING_S = 30
SLOW_STARTUP_WARNING_PERIOD_S = 30
SLOW_STARTUP_WARNING_S = int(os.getenv("SERVE_SLOW_STARTUP_WARNING_S", 30))
SLOW_STARTUP_WARNING_PERIOD_S = int(
os.getenv("SERVE_SLOW_STARTUP_WARNING_PERIOD_S", 30)
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
SLOW_STARTUP_WARNING_S = int(os.getenv("SERVE_SLOW_STARTUP_WARNING_S", 30))
SLOW_STARTUP_WARNING_PERIOD_S = int(
os.getenv("SERVE_SLOW_STARTUP_WARNING_PERIOD_S", 30)
)
SLOW_STARTUP_WARNING_S = int(os.environ.get("SERVE_SLOW_STARTUP_WARNING_S", 30))
SLOW_STARTUP_WARNING_PERIOD_S = int(
os.environ.get("SERVE_SLOW_STARTUP_WARNING_PERIOD_S", 30)
)

In the Serve codebase we use os.environ; it would be great to keep this consistent, since it will make our lives easier over the next few months during refactoring.

Copy link
Contributor Author

@zcin zcin Jun 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Got it, thanks!


ALL_REPLICA_STATES = list(ReplicaState)
USE_PLACEMENT_GROUP = os.environ.get("SERVE_USE_PLACEMENT_GROUP", "1") != "0"
Expand Down Expand Up @@ -1496,7 +1498,7 @@ def _check_and_update_replicas(self) -> bool:

if len(pending_allocation) > 0:
required, available = slow_start_replicas[0][0].resource_requirements()
logger.warning(
message = (
f"Deployment '{self._name}' has "
f"{len(pending_allocation)} replicas that have taken "
f"more than {SLOW_STARTUP_WARNING_S}s to be scheduled. "
Expand All @@ -1506,16 +1508,36 @@ def _check_and_update_replicas(self) -> bool:
f"Resources required for each replica: {required}, "
f"resources available: {available}."
)
logger.warning(message)
if _SCALING_LOG_ENABLED:
print_verbose_scaling_log()
# If status is UNHEALTHY, leave the status and message as is.
# The issue that caused the deployment to be unhealthy should be
# prioritized over this resource availability issue.
if self._curr_status_info.status != DeploymentStatus.UNHEALTHY:
self._curr_status_info = DeploymentStatusInfo(
name=self._name,
status=DeploymentStatus.UPDATING,
message=message,
)

if len(pending_initialization) > 0:
logger.warning(
message = (
f"Deployment '{self._name}' has "
f"{len(pending_initialization)} replicas that have taken "
f"more than {SLOW_STARTUP_WARNING_S}s to initialize. This "
f"may be caused by a slow __init__ or reconfigure method."
)
logger.warning(message)
# If status is UNHEALTHY, leave the status and message as is.
# The issue that caused the deployment to be unhealthy should be
# prioritized over this slow initialization issue.
if self._curr_status_info.status != DeploymentStatus.UNHEALTHY:
self._curr_status_info = DeploymentStatusInfo(
name=self._name,
status=DeploymentStatus.UPDATING,
message=message,
)

self._prev_startup_warning = time.time()

Expand Down
82 changes: 82 additions & 0 deletions python/ray/serve/tests/test_standalone.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,32 @@ def ray_cluster():
cluster.shutdown()


@pytest.fixture()
def lower_slow_startup_threshold_and_reset():
    """Start Serve with a 1-second slow-startup warning threshold.

    Sets ``SERVE_SLOW_STARTUP_WARNING_S`` and
    ``SERVE_SLOW_STARTUP_WARNING_PERIOD_S`` to ``"1"``, starts a 2-CPU Ray
    instance with a detached Serve instance, and yields the Serve client.
    On teardown, Serve and Ray are shut down and both environment variables
    are put back exactly as they were, including being removed again if they
    were not set beforehand.

    Yields:
        The client returned by ``serve.start(detached=True)``.
    """
    env_keys = (
        "SERVE_SLOW_STARTUP_WARNING_S",
        "SERVE_SLOW_STARTUP_WARNING_PERIOD_S",
    )
    # os.environ.get (rather than os.getenv) is used for consistency with the
    # rest of the Serve codebase, as requested in review. A value of None
    # means the variable was not set before the fixture ran.
    original_values = {key: os.environ.get(key) for key in env_keys}

    # Lower slow startup warning threshold to 1 second to reduce test duration
    for key in env_keys:
        os.environ[key] = "1"

    try:
        ray.init(num_cpus=2)
        client = serve.start(detached=True)

        yield client

        serve.shutdown()
        ray.shutdown()
    finally:
        # Reset slow startup warning threshold to prevent state sharing across
        # unit tests. Assigning None into os.environ raises TypeError, so a
        # variable that was originally unset has to be removed instead.
        for key, value in original_values.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please move these env meddling into a fixture https://docs.pytest.org/en/6.2.x/fixture.html#yield-fixtures-recommended

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's currently in a fixture, called lower_slow_startup_threshold_and_reset



def test_shutdown(ray_shutdown):
ray.init(num_cpus=16)
serve.start(http_options=dict(port=8003))
Expand Down Expand Up @@ -701,5 +727,61 @@ def f():
ray.shutdown()


def test_updating_status_message(lower_slow_startup_threshold_and_reset):
    """Verify the status message reports replicas that are slow to be scheduled."""

    serve_client = lower_slow_startup_threshold_and_reset

    # Five replicas requesting one CPU each are deployed without blocking.
    @serve.deployment(num_replicas=5, ray_actor_options={"num_cpus": 1})
    def f(*args):
        pass

    f.deploy(_blocking=False)

    expected_fragment = "more than 1s to be scheduled."

    def updating_message():
        # True once the first deployment is UPDATING and carries the
        # slow-scheduling message.
        status_info = serve_client.get_serve_status().deployment_statuses[0]
        if status_info.status != "UPDATING":
            return False
        return expected_fragment in status_info.message

    wait_for_condition(updating_message, timeout=20)


def test_unhealthy_override_updating_status(lower_slow_startup_threshold_and_reset):
    """
    An UNHEALTHY status must not be replaced by the slow-startup UPDATING
    status. Whatever made the deployment unhealthy takes priority over the
    slow-startup warning.
    """

    serve_client = lower_slow_startup_threshold_and_reset

    @serve.deployment
    class f:
        def __init__(self):
            # Deliberately fails so the replica can never initialize.
            self.num = 1 / 0

        def __call__(self, request):
            pass

    f.deploy(_blocking=False)

    def current_status():
        # Status of the first (and only) deployment known to Serve.
        return serve_client.get_serve_status().deployment_statuses[0].status

    wait_for_condition(lambda: current_status() == "UNHEALTHY", timeout=20)

    # The status must stay UNHEALTHY: waiting for UPDATING has to time out.
    with pytest.raises(RuntimeError):
        wait_for_condition(lambda: current_status() == "UPDATING", timeout=10)


if __name__ == "__main__":
sys.exit(pytest.main(["-v", "-s", __file__]))