[serve] Add exponential backoff when retrying replicas #31436
Changes from 8 commits
@@ -978,6 +978,10 @@ def __init__(
         # DeploymentInfo and bring current deployment to meet new status.
         self._target_state: DeploymentTargetState = DeploymentTargetState.default()
         self._prev_startup_warning: float = time.time()
+        # Exponential backoff when retrying a consistently failing deployment
+        self._last_retry: float = 0.0
+        self._backoff_time: int = 1
+        self._max_backoff: int = 64
         self._replica_constructor_retry_counter: int = 0
         self._replicas: ReplicaStateContainer = ReplicaStateContainer()
         self._curr_status_info: DeploymentStatusInfo = DeploymentStatusInfo(
@@ -1104,6 +1108,7 @@ def _set_target_state(self, target_info: DeploymentInfo) -> None:
             self._name, DeploymentStatus.UPDATING
         )
         self._replica_constructor_retry_counter = 0
+        self._backoff_time = 1

         logger.debug(f"Deploying new version of {self._name}: {target_state.version}.")

@@ -1308,28 +1313,43 @@ def _scale_deployment_replicas(self) -> bool:
             )
             to_add = max(delta_replicas - stopping_replicas, 0)
             if to_add > 0:
+                # Exponential backoff
+                failed_to_start_threshold = min(
+                    MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT,
+                    self._target_state.num_replicas * 3,
+                )
+                if self._replica_constructor_retry_counter >= failed_to_start_threshold:
+                    # Wait 1, 2, 4, ... seconds before consecutive retries, with random
+                    # offset added to avoid synchronization
+                    if (
+                        time.time() - self._last_retry
+                        < self._backoff_time + random.uniform(0, 3)
+                    ):
+                        return replicas_stopped
+
+                    self._last_retry = time.time()
                 logger.info(
                     f"Adding {to_add} replica{'s' if to_add > 1 else ''} "
                     f"to deployment '{self._name}'."
                 )
-            for _ in range(to_add):
-                replica_name = ReplicaName(self._name, get_random_letters())
-                new_deployment_replica = DeploymentReplica(
-                    self._controller_name,
-                    self._detached,
-                    replica_name.replica_tag,
-                    replica_name.deployment_tag,
-                    self._target_state.version,
-                )
-                new_deployment_replica.start(
-                    self._target_state.info, self._target_state.version
-                )
+                for _ in range(to_add):
+                    replica_name = ReplicaName(self._name, get_random_letters())
+                    new_deployment_replica = DeploymentReplica(
+                        self._controller_name,
+                        self._detached,
+                        replica_name.replica_tag,
+                        replica_name.deployment_tag,
+                        self._target_state.version,
+                    )
+                    new_deployment_replica.start(
+                        self._target_state.info, self._target_state.version
+                    )

-                self._replicas.add(ReplicaState.STARTING, new_deployment_replica)
-                logger.debug(
-                    "Adding STARTING to replica_tag: "
-                    f"{replica_name}, deployment: {self._name}"
-                )
+                    self._replicas.add(ReplicaState.STARTING, new_deployment_replica)
+                    logger.debug(
+                        "Adding STARTING to replica_tag: "
+                        f"{replica_name}, deployment: {self._name}"
+                    )

         elif delta_replicas < 0:
             replicas_stopped = True

Review thread on the line "self._target_state.num_replicas * 3," (zcin marked this conversation as resolved):
- What's the reason for choosing num_replicas * 3?
- Ah, is it to compare against the total number of replica restarts across the deployment? (So on average each replica has failed 3 times?)
- I used the threshold used for setting the deployment unhealthy: code pointer. Basically, perform exponential backoff after a replica fails 3 times and the deployment is determined unhealthy.
- Yup, I believe so.
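Editor's note: the gate added here follows a standard backoff pattern, so a minimal standalone sketch may help. The RetryGate class below is hypothetical (the real code keeps _last_retry and _backoff_time directly on the deployment state object); it only illustrates the "skip this control-loop tick until the backoff window plus jitter has elapsed" logic.

import random
import time


class RetryGate:
    """Hypothetical, trimmed-down version of the gate in _scale_deployment_replicas."""

    def __init__(self) -> None:
        self._last_retry = 0.0
        self._backoff_time_s = 1  # doubled elsewhere while startups keep failing

    def may_add_replicas(self) -> bool:
        # Skip this control-loop tick if the backoff window has not yet elapsed;
        # the random offset keeps deployments from retrying in lockstep.
        if time.time() - self._last_retry < self._backoff_time_s + random.uniform(0, 3):
            return False
        self._last_retry = time.time()
        return True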
@@ -1407,10 +1427,10 @@ def _check_curr_status(self) -> bool:
                 name=self._name,
                 status=DeploymentStatus.UNHEALTHY,
                 message=(
-                    f"The Deployment failed to start {failed_to_start_count} "
-                    "times in a row. This may be due to a problem with the "
-                    "deployment constructor or the initial health check failing. "
-                    "See logs for details."
+                    f"The Deployment failed to start {failed_to_start_count} times "
+                    "in a row. This may be due to a problem with the deployment "
+                    "constructor or the initial health check failing. See logs for "
+                    f"details. Retrying after {self._backoff_time} seconds."
                 ),
             )
             return False
@@ -1453,6 +1473,7 @@ def _check_startup_replicas(
         """
         slow_replicas = []
         transitioned_to_running = False
+        replicas_failed = False
         for replica in self._replicas.pop(states=[original_state]):
             start_status = replica.check_started()
             if start_status == ReplicaStartupStatus.SUCCEEDED:
@@ -1466,6 +1487,7 @@
                     # Increase startup failure counter if we're tracking it
                     self._replica_constructor_retry_counter += 1

+                replicas_failed = True
                 replica.stop(graceful=False)
                 self._replicas.add(ReplicaState.STOPPING, replica)
             elif start_status in [
@@ -1485,6 +1507,18 @@
             else:
                 self._replicas.add(original_state, replica)

+        # If replicas have failed enough times, execute exponential backoff
+        # Wait 1, 2, 4, ... seconds before consecutive retries
+        failed_to_start_threshold = min(
+            MAX_DEPLOYMENT_CONSTRUCTOR_RETRY_COUNT,
+            self._target_state.num_replicas * 3,
+        )
+        if (
+            replicas_failed
+            and self._replica_constructor_retry_counter > failed_to_start_threshold
+        ):
+            self._backoff_time = min(2 * self._backoff_time, self._max_backoff)
+
         return slow_replicas, transitioned_to_running

     def _check_and_update_replicas(self) -> bool:
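Editor's note: combined with the gate in _scale_deployment_replicas and the reset in _set_target_state, this doubling step yields a 1, 2, 4, ... second schedule capped at _max_backoff (64 in this diff). A quick sketch of the resulting schedule, with those constants inlined as assumptions:

# Backoff schedule sketch: start at 1 s, double per failed round, cap at 64 s.
backoff_time_s, max_backoff_s = 1, 64
schedule = []
for _ in range(8):  # eight consecutive rounds of failed replica startups
    schedule.append(backoff_time_s)
    backoff_time_s = min(2 * backoff_time_s, max_backoff_s)
print(schedule)  # [1, 2, 4, 8, 16, 32, 64, 64]

# Deploying a new version resets the schedule, mirroring _set_target_state:
backoff_time_s = 1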
Review comments on the new backoff attributes:
- nit: _max_backoff_time_s, _backoff_time_s
- consider making these parametrizable via env var
- Thanks for the suggestions, applied! I made the backoff factor and the max backoff time env variables.
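Editor's note: the revision that adds those environment variables is not part of this diff, so their actual names and defaults are unknown. A hedged sketch of what env-var-driven configuration of the backoff factor and cap could look like, using purely hypothetical names:

import os

# Hypothetical names; the real env vars added in the final revision of this
# PR are not visible in this diff.
EXPONENTIAL_BACKOFF_FACTOR = float(os.environ.get("SERVE_EXPONENTIAL_BACKOFF_FACTOR", 2.0))
MAX_BACKOFF_TIME_S = int(os.environ.get("SERVE_MAX_BACKOFF_TIME_S", 64))

# The doubling step in _check_startup_replicas would then become:
#     self._backoff_time_s = min(
#         EXPONENTIAL_BACKOFF_FACTOR * self._backoff_time_s, MAX_BACKOFF_TIME_S
#     )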