-
Notifications
You must be signed in to change notification settings - Fork 5.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[serve] Set status message if deployment pending for too long #25861
Changes from 19 commits
ff4e88f
9eb7c87
8d43a1b
b9bdb4c
a5e5b88
885b722
4e0d28b
44dff49
a8fccba
d5d85ab
b8fb9fe
077e42d
f9c7a26
b4aa1ab
7096ce2
de6cc2c
6dc874c
e855501
6e0e76d
f900d89
d0bd51c
6bcb8c6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -52,6 +52,32 @@ def ray_cluster(): | |
cluster.shutdown() | ||
|
||
|
||
@pytest.fixture() | ||
def lower_slow_startup_threshold_and_reset(): | ||
original_slow_startup_warning_s = os.getenv("SERVE_SLOW_STARTUP_WARNING_S") | ||
original_slow_startup_warning_period_s = os.getenv( | ||
"SERVE_SLOW_STARTUP_WARNING_PERIOD_S" | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. same comment for using os.environ |
||
# Lower slow startup warning threshold to 1 second to reduce test duration | ||
os.environ["SERVE_SLOW_STARTUP_WARNING_S"] = "1" | ||
os.environ["SERVE_SLOW_STARTUP_WARNING_PERIOD_S"] = "1" | ||
|
||
ray.init(num_cpus=2) | ||
client = serve.start(detached=True) | ||
|
||
yield client | ||
|
||
serve.shutdown() | ||
ray.shutdown() | ||
|
||
# Reset slow startup warning threshold to prevent state sharing across unit | ||
# tests | ||
os.environ["SERVE_SLOW_STARTUP_WARNING_S"] = original_slow_startup_warning_s | ||
os.environ[ | ||
"SERVE_SLOW_STARTUP_WARNING_PERIOD_S" | ||
] = original_slow_startup_warning_period_s | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. please move these env meddling into a fixture https://docs.pytest.org/en/6.2.x/fixture.html#yield-fixtures-recommended There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's currently in a fixture, called |
||
|
||
|
||
def test_shutdown(ray_shutdown): | ||
ray.init(num_cpus=16) | ||
serve.start(http_options=dict(port=8003)) | ||
|
@@ -701,5 +727,61 @@ def f(): | |
ray.shutdown() | ||
|
||
|
||
def test_updating_status_message(lower_slow_startup_threshold_and_reset): | ||
"""Check if status message says if a serve deployment has taken a long time""" | ||
|
||
client = lower_slow_startup_threshold_and_reset | ||
|
||
@serve.deployment( | ||
num_replicas=5, | ||
ray_actor_options={"num_cpus": 1}, | ||
) | ||
def f(*args): | ||
pass | ||
|
||
f.deploy(_blocking=False) | ||
|
||
def updating_message(): | ||
deployment_status = client.get_serve_status().deployment_statuses[0] | ||
message_substring = "more than 1s to be scheduled." | ||
return (deployment_status.status == "UPDATING") and ( | ||
message_substring in deployment_status.message | ||
) | ||
|
||
wait_for_condition(updating_message, timeout=20) | ||
|
||
|
||
def test_unhealthy_override_updating_status(lower_slow_startup_threshold_and_reset): | ||
""" | ||
Check that if status is UNHEALTHY and there is a resource availability | ||
issue, the status should not change. The issue that caused the deployment to | ||
be unhealthy should be prioritized over this resource availability issue. | ||
""" | ||
|
||
client = lower_slow_startup_threshold_and_reset | ||
|
||
@serve.deployment | ||
class f: | ||
def __init__(self): | ||
self.num = 1 / 0 | ||
|
||
def __call__(self, request): | ||
pass | ||
|
||
f.deploy(_blocking=False) | ||
|
||
wait_for_condition( | ||
lambda: client.get_serve_status().deployment_statuses[0].status == "UNHEALTHY", | ||
timeout=20, | ||
) | ||
|
||
with pytest.raises(RuntimeError): | ||
wait_for_condition( | ||
lambda: client.get_serve_status().deployment_statuses[0].status | ||
== "UPDATING", | ||
timeout=10, | ||
) | ||
|
||
|
||
if __name__ == "__main__": | ||
sys.exit(pytest.main(["-v", "-s", __file__])) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In Serve codebase we uses os.environ, it would be great to keep them consistent so it makes our life easier in next few months during refactoring.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Got it, thanks!