[core] Refactoring the return value of teardown (#1595)
* Use runtime error instead
* fix
* fix
* fix
* Add comment
* pylint
Michaelvll authored May 26, 2023
1 parent 4849c86 commit f733415
Showing 3 changed files with 29 additions and 14 deletions.
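
The theme of the diff: `teardown_no_lock` and `post_teardown_cleanup` previously reported failure by returning `False`, which callers could silently ignore; they now return `None` and raise `RuntimeError` instead. A minimal sketch of the before/after pattern (all helper names and bodies below are illustrative stand-ins, not SkyPilot's actual implementation):

from typing import Tuple

def run_teardown_command(cluster_name: str) -> Tuple[int, str, str]:
    # Hypothetical helper standing in for the provider CLI invocation;
    # returns (returncode, stdout, stderr).
    return 1, '', 'TPU deletion failed'

# Before: failure is a boolean that every caller must remember to check.
def teardown_no_lock_before(cluster_name: str) -> bool:
    returncode, stdout, stderr = run_teardown_command(cluster_name)
    if returncode != 0:
        print(f'Failed to tear down {cluster_name!r}: {stderr}')
        return False
    return True

# After: failure raises, so it cannot be dropped on the floor, and the
# provider output travels with the exception to the caller.
def teardown_no_lock_after(cluster_name: str) -> None:
    returncode, stdout, stderr = run_teardown_command(cluster_name)
    if returncode != 0:
        raise RuntimeError(
            f'Failed to tear down {cluster_name!r}.\n'
            f'stdout: {stdout}\nstderr: {stderr}')

This also lets intermediate layers such as `_teardown` drop their `success` bookkeeping, as the first hunks below show.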
40 changes: 27 additions & 13 deletions sky/backends/cloud_vm_ray_backend.py
@@ -2843,6 +2843,20 @@ def _teardown(self,
                   handle: CloudVmRayResourceHandle,
                   terminate: bool,
                   purge: bool = False):
+        """Tear down/stop the cluster.
+        Args:
+            handle: The handle to the cluster.
+            terminate: Terminate or stop the cluster.
+            purge: Purge the cluster record from the cluster table, even if
+                the teardown fails.
+
+        Raises:
+            exceptions.ClusterOwnerIdentityMismatchError: If the cluster is
+                owned by another user.
+            exceptions.CloudUserIdentityError: If we fail to get the current
+                user identity.
+            RuntimeError: If the cluster fails to be terminated/stopped.
+        """
         cluster_name = handle.cluster_name
         # Check if the cluster is owned by the current user. Raise
         # exceptions.ClusterOwnerIdentityMismatchError
@@ -2875,7 +2889,7 @@ def _teardown(self,
             with filelock.FileLock(
                     lock_path,
                     backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
-                success = self.teardown_no_lock(
+                self.teardown_no_lock(
                     handle,
                     terminate,
                     purge,
@@ -2885,7 +2899,7 @@ def _teardown(self,
                     # ClusterOwnerIdentityMismatchError. The argument/flag
                     # `purge` should bypass such ID mismatch errors.
                     refresh_cluster_status=not is_identity_mismatch_and_purge)
-            if success and terminate:
+            if terminate:
                 common_utils.remove_file_if_exists(lock_path)
         except filelock.Timeout as e:
             raise RuntimeError(
@@ -3072,14 +3086,17 @@ def teardown_no_lock(self,
                          terminate: bool,
                          purge: bool = False,
                          post_teardown_cleanup: bool = True,
-                         refresh_cluster_status: bool = True) -> bool:
+                         refresh_cluster_status: bool = True) -> None:
         """Teardown the cluster without acquiring the cluster status lock.
 
         NOTE: This method should not be called without holding the cluster
         status lock already.
 
         refresh_cluster_status is only used internally in the status refresh
         process, and should not be set to False in other cases.
+
+        Raises:
+            RuntimeError: If the cluster fails to be terminated/stopped.
         """
         if refresh_cluster_status:
             prev_cluster_status, _ = (
@@ -3098,7 +3115,6 @@ def teardown_no_lock(self,
             logger.warning(
                 f'Cluster {handle.cluster_name!r} is already terminated. '
                 'Skipped.')
-            return True
         log_path = os.path.join(os.path.expanduser(self.log_dir),
                                 'teardown.log')
         log_abs_path = os.path.abspath(log_path)
@@ -3246,33 +3262,33 @@ def teardown_no_lock(self,
         elif ('TPU must be specified.' not in stderr and
               'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr and
               '(ResourceGroupNotFound)' not in stderr):
-            logger.error(
+            raise RuntimeError(
                 _TEARDOWN_FAILURE_MESSAGE.format(
                     extra_reason='',
                     cluster_name=handle.cluster_name,
                     stdout=stdout,
                     stderr=stderr))
-            return False
 
         # No need to clean up if the cluster is already terminated
         # (i.e., prev_status is None), as the cleanup has already been done
         # if the cluster is removed from the status table.
         if post_teardown_cleanup:
-            return self.post_teardown_cleanup(handle, terminate, purge)
-        else:
-            return True
+            self.post_teardown_cleanup(handle, terminate, purge)
 
     def post_teardown_cleanup(self,
                               handle: CloudVmRayResourceHandle,
                               terminate: bool,
-                              purge: bool = False) -> bool:
+                              purge: bool = False) -> None:
         """Cleanup local configs/caches and delete TPUs after teardown.
 
         This method will handle the following cleanup steps:
         * Deleting the TPUs;
         * Removing ssh configs for the cluster;
         * Updating the local state of the cluster;
         * Removing the terminated cluster's scripts and ray yaml files.
+
+        Raises:
+            RuntimeError: If it fails to delete the TPU.
         """
         log_path = os.path.join(os.path.expanduser(self.log_dir),
                                 'teardown.log')
@@ -3295,13 +3311,12 @@ def post_teardown_cleanup(self,
                     _TEARDOWN_PURGE_WARNING.format(
                         reason='stopping/terminating TPU'))
             else:
-                logger.error(
+                raise RuntimeError(
                     _TEARDOWN_FAILURE_MESSAGE.format(
                         extra_reason='It is caused by TPU failure.',
                         cluster_name=handle.cluster_name,
                         stdout=tpu_stdout,
                         stderr=tpu_stderr))
-                return False
 
         # The cluster file must exist because the cluster_yaml will only
         # be removed after the cluster entry in the database is removed.
@@ -3325,7 +3340,6 @@ def post_teardown_cleanup(self,
         # No try-except is needed since Ray will fail to teardown the
         # cluster if the cluster_yaml is missing.
         common_utils.remove_file_if_exists(handle.cluster_yaml)
-        return True
 
     def set_autostop(self,
                      handle: CloudVmRayResourceHandle,
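
With the boolean gone, `_teardown`'s lock handling simplifies: the lock file is removed only when `teardown_no_lock` returns without raising. A condensed sketch of that control flow (the lock path, timeout value, and the always-failing stub are assumptions for illustration, not SkyPilot's actual code):

import os

import filelock

LOCK_TIMEOUT_SECONDS = 20  # stand-in for CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS

def teardown_no_lock(cluster_name: str, terminate: bool) -> None:
    # Illustrative stub that always fails, to show error propagation.
    raise RuntimeError(f'Failed to tear down {cluster_name!r}.')

def teardown(cluster_name: str, terminate: bool) -> None:
    lock_path = f'/tmp/.{cluster_name}.lock'
    try:
        with filelock.FileLock(lock_path, LOCK_TIMEOUT_SECONDS):
            teardown_no_lock(cluster_name, terminate)
        # Reached only when teardown_no_lock did not raise, so the lock
        # file of a terminated cluster can be safely deleted.
        if terminate and os.path.exists(lock_path):
            os.remove(lock_path)
    except filelock.Timeout as e:
        raise RuntimeError(
            f'Cluster {cluster_name!r} is locked by another operation. '
            'Please try again later.') from e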
2 changes: 1 addition & 1 deletion sky/cli.py
@@ -2664,7 +2664,7 @@ def _down_or_stop(name: str):
             message = (
                 f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
                 f'{colorama.Style.RESET_ALL}'
-                f'\n\tReason: {common_utils.format_exception(e)}.')
+                f'\nReason: {common_utils.format_exception(e)}.')
         except (exceptions.NotSupportedError,
                 exceptions.ClusterOwnerIdentityMismatchError) as e:
             message = str(e)
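
Besides dropping the tab from the `Reason:` line, this is the layer where the backend's new `RuntimeError` becomes a user-facing message. A rough sketch of the surrounding shape of `_down_or_stop` (heavily simplified; `format_exception` and `down` are stubs standing in for `common_utils.format_exception` and the real teardown call):

def format_exception(e: Exception) -> str:
    # Stand-in for common_utils.format_exception.
    return f'{type(e).__name__}: {e}'

def down(name: str) -> None:
    # Stand-in for the backend teardown call; always fails here.
    raise RuntimeError(f'Failed to tear down {name!r}.')

def down_or_stop(name: str, operation: str = 'Terminating') -> None:
    message = f'{operation} cluster {name}...done.'
    try:
        down(name)
    except RuntimeError as e:
        # The RuntimeError raised by teardown_no_lock /
        # post_teardown_cleanup lands here and is shown to the user.
        message = (f'{operation} cluster {name}...failed. '
                   f'\nReason: {format_exception(e)}.')
    print(message)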
1 change: 1 addition & 0 deletions sky/core.py
@@ -351,6 +351,7 @@ def down(cluster_name: str, purge: bool = False) -> None:
 
     Raises:
         ValueError: the specified cluster does not exist.
+        RuntimeError: failed to tear down the cluster.
         sky.exceptions.NotSupportedError: the specified cluster is the managed
             spot controller.
     """
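
For API users, the newly documented `RuntimeError` makes failed teardowns catchable. A usage sketch, assuming the usual top-level re-export of `core.down` as `sky.down` (the cluster name is illustrative):

import sky

try:
    sky.down('my-cluster')
except ValueError:
    print('Cluster "my-cluster" does not exist.')
except RuntimeError as e:
    # Teardown failed part-way; per the docstring, purge=True drops the
    # cluster record even if cleanup keeps failing.
    print(f'Teardown failed: {e}\nRetrying with purge=True...')
    sky.down('my-cluster', purge=True)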
