[core] Refactoring the return value of teardown (#1595)
* Use runtime error instead
* fix
* fix
* fix
* Add comment
* pylint
Michaelvll authored May 26, 2023
1 parent 4849c86 commit f733415
Showing 3 changed files with 29 additions and 14 deletions.
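
The theme of the diff: `teardown_no_lock` and `post_teardown_cleanup` previously reported failure by returning `False`, which callers could silently ignore; they now return `None` and raise `RuntimeError` instead. A minimal sketch of the before/after pattern (all helper names and bodies below are illustrative stand-ins, not SkyPilot's actual implementation):

from typing import Tuple

def run_teardown_command(cluster_name: str) -> Tuple[int, str, str]:
    # Hypothetical helper standing in for the provider CLI invocation;
    # returns (returncode, stdout, stderr).
    return 1, '', 'TPU deletion failed'

# Before: failure is a boolean that every caller must remember to check.
def teardown_no_lock_before(cluster_name: str) -> bool:
    returncode, stdout, stderr = run_teardown_command(cluster_name)
    if returncode != 0:
        print(f'Failed to tear down {cluster_name!r}: {stderr}')
        return False
    return True

# After: failure raises, so it cannot be dropped on the floor, and the
# provider output travels with the exception to the caller.
def teardown_no_lock_after(cluster_name: str) -> None:
    returncode, stdout, stderr = run_teardown_command(cluster_name)
    if returncode != 0:
        raise RuntimeError(
            f'Failed to tear down {cluster_name!r}.\n'
            f'stdout: {stdout}\nstderr: {stderr}')

This also lets intermediate layers such as `_teardown` drop their `success` bookkeeping, as the first hunks below show.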
40 changes: 27 additions & 13 deletions sky/backends/cloud_vm_ray_backend.py
@@ -2843,6 +2843,20 @@ def _teardown(self,
                   handle: CloudVmRayResourceHandle,
                   terminate: bool,
                   purge: bool = False):
+        """Tear down/stop the cluster.
+        Args:
+            handle: The handle to the cluster.
+            terminate: Terminate or stop the cluster.
+            purge: Purge the cluster record from the cluster table, even if
+                the teardown fails.
+
+        Raises:
+            exceptions.ClusterOwnerIdentityMismatchError: If the cluster is
+                owned by another user.
+            exceptions.CloudUserIdentityError: If we fail to get the current
+                user identity.
+            RuntimeError: If the cluster fails to be terminated/stopped.
+        """
         cluster_name = handle.cluster_name
         # Check if the cluster is owned by the current user. Raise
         # exceptions.ClusterOwnerIdentityMismatchError
@@ -2875,7 +2889,7 @@ def _teardown(self,
             with filelock.FileLock(
                     lock_path,
                     backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
-                success = self.teardown_no_lock(
+                self.teardown_no_lock(
                     handle,
                     terminate,
                     purge,
@@ -2885,7 +2899,7 @@ def _teardown(self,
                     # ClusterOwnerIdentityMismatchError. The argument/flag
                     # `purge` should bypass such ID mismatch errors.
                     refresh_cluster_status=not is_identity_mismatch_and_purge)
-            if success and terminate:
+            if terminate:
                 common_utils.remove_file_if_exists(lock_path)
         except filelock.Timeout as e:
             raise RuntimeError(
@@ -3072,14 +3086,17 @@ def teardown_no_lock(self,
                          terminate: bool,
                          purge: bool = False,
                          post_teardown_cleanup: bool = True,
-                         refresh_cluster_status: bool = True) -> bool:
+                         refresh_cluster_status: bool = True) -> None:
         """Teardown the cluster without acquiring the cluster status lock.
 
         NOTE: This method should not be called without holding the cluster
         status lock already.
 
         refresh_cluster_status is only used internally in the status refresh
         process, and should not be set to False in other cases.
+
+        Raises:
+            RuntimeError: If the cluster fails to be terminated/stopped.
         """
         if refresh_cluster_status:
             prev_cluster_status, _ = (
@@ -3098,7 +3115,6 @@ def teardown_no_lock(self,
             logger.warning(
                 f'Cluster {handle.cluster_name!r} is already terminated. '
                 'Skipped.')
-            return True
         log_path = os.path.join(os.path.expanduser(self.log_dir),
                                 'teardown.log')
         log_abs_path = os.path.abspath(log_path)
@@ -3246,33 +3262,33 @@ def teardown_no_lock(self,
         elif ('TPU must be specified.' not in stderr and
               'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr and
               '(ResourceGroupNotFound)' not in stderr):
-            logger.error(
+            raise RuntimeError(
                 _TEARDOWN_FAILURE_MESSAGE.format(
                     extra_reason='',
                     cluster_name=handle.cluster_name,
                     stdout=stdout,
                     stderr=stderr))
-            return False
 
         # No need to clean up if the cluster is already terminated
         # (i.e., prev_status is None), as the cleanup has already been done
         # if the cluster is removed from the status table.
         if post_teardown_cleanup:
-            return self.post_teardown_cleanup(handle, terminate, purge)
-        else:
-            return True
+            self.post_teardown_cleanup(handle, terminate, purge)
 
     def post_teardown_cleanup(self,
                               handle: CloudVmRayResourceHandle,
                               terminate: bool,
-                              purge: bool = False) -> bool:
+                              purge: bool = False) -> None:
         """Cleanup local configs/caches and delete TPUs after teardown.
 
         This method will handle the following cleanup steps:
         * Deleting the TPUs;
         * Removing ssh configs for the cluster;
         * Updating the local state of the cluster;
         * Removing the terminated cluster's scripts and ray yaml files.
+
+        Raises:
+            RuntimeError: If it fails to delete the TPU.
         """
         log_path = os.path.join(os.path.expanduser(self.log_dir),
                                 'teardown.log')
@@ -3295,13 +3311,12 @@ def post_teardown_cleanup(self,
                     _TEARDOWN_PURGE_WARNING.format(
                         reason='stopping/terminating TPU'))
             else:
-                logger.error(
+                raise RuntimeError(
                     _TEARDOWN_FAILURE_MESSAGE.format(
                         extra_reason='It is caused by TPU failure.',
                         cluster_name=handle.cluster_name,
                         stdout=tpu_stdout,
                         stderr=tpu_stderr))
-                return False
 
         # The cluster file must exist because the cluster_yaml will only
         # be removed after the cluster entry in the database is removed.
@@ -3325,7 +3340,6 @@ def post_teardown_cleanup(self,
         # No try-except is needed since Ray will fail to teardown the
         # cluster if the cluster_yaml is missing.
         common_utils.remove_file_if_exists(handle.cluster_yaml)
-        return True
 
     def set_autostop(self,
                      handle: CloudVmRayResourceHandle,
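
With the boolean gone, `_teardown`'s lock handling simplifies: the lock file is removed only when `teardown_no_lock` returns without raising. A condensed sketch of that control flow (the lock path, timeout value, and the always-failing stub are assumptions for illustration, not SkyPilot's actual code):

import os

import filelock

LOCK_TIMEOUT_SECONDS = 20  # stand-in for CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS

def teardown_no_lock(cluster_name: str, terminate: bool) -> None:
    # Illustrative stub that always fails, to show error propagation.
    raise RuntimeError(f'Failed to tear down {cluster_name!r}.')

def teardown(cluster_name: str, terminate: bool) -> None:
    lock_path = f'/tmp/.{cluster_name}.lock'
    try:
        with filelock.FileLock(lock_path, LOCK_TIMEOUT_SECONDS):
            teardown_no_lock(cluster_name, terminate)
        # Reached only when teardown_no_lock did not raise, so the lock
        # file of a terminated cluster can be safely deleted.
        if terminate and os.path.exists(lock_path):
            os.remove(lock_path)
    except filelock.Timeout as e:
        raise RuntimeError(
            f'Cluster {cluster_name!r} is locked by another operation. '
            'Please try again later.') from e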
2 changes: 1 addition & 1 deletion sky/cli.py
@@ -2664,7 +2664,7 @@ def _down_or_stop(name: str):
             message = (
                 f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
                 f'{colorama.Style.RESET_ALL}'
-                f'\n\tReason: {common_utils.format_exception(e)}.')
+                f'\nReason: {common_utils.format_exception(e)}.')
         except (exceptions.NotSupportedError,
                 exceptions.ClusterOwnerIdentityMismatchError) as e:
             message = str(e)
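
Besides dropping the tab from the `Reason:` line, this is the layer where the backend's new `RuntimeError` becomes a user-facing message. A rough sketch of the surrounding shape of `_down_or_stop` (heavily simplified; `format_exception` and `down` are stubs standing in for `common_utils.format_exception` and the real teardown call):

def format_exception(e: Exception) -> str:
    # Stand-in for common_utils.format_exception.
    return f'{type(e).__name__}: {e}'

def down(name: str) -> None:
    # Stand-in for the backend teardown call; always fails here.
    raise RuntimeError(f'Failed to tear down {name!r}.')

def down_or_stop(name: str, operation: str = 'Terminating') -> None:
    message = f'{operation} cluster {name}...done.'
    try:
        down(name)
    except RuntimeError as e:
        # The RuntimeError raised by teardown_no_lock /
        # post_teardown_cleanup lands here and is shown to the user.
        message = (f'{operation} cluster {name}...failed. '
                   f'\nReason: {format_exception(e)}.')
    print(message)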
1 change: 1 addition & 0 deletions sky/core.py
@@ -351,6 +351,7 @@ def down(cluster_name: str, purge: bool = False) -> None:
 
     Raises:
         ValueError: the specified cluster does not exist.
+        RuntimeError: failed to tear down the cluster.
         sky.exceptions.NotSupportedError: the specified cluster is the managed
             spot controller.
     """
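
For API users, the newly documented `RuntimeError` makes failed teardowns catchable. A usage sketch, assuming the usual top-level re-export of `core.down` as `sky.down` (the cluster name is illustrative):

import sky

try:
    sky.down('my-cluster')
except ValueError:
    print('Cluster "my-cluster" does not exist.')
except RuntimeError as e:
    # Teardown failed part-way; per the docstring, purge=True drops the
    # cluster record even if cleanup keeps failing.
    print(f'Teardown failed: {e}\nRetrying with purge=True...')
    sky.down('my-cluster', purge=True)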
