[air] pyarrow.fs persistence (7/n): ray.train.Checkpoint restore: Auto-recovery fault tolerance #38141

Merged

Commits
114 commits
abb1307
Pipe storage context to Trainable (used now for Trainable syncing)
justinvyu Jul 23, 2023
f6ff90a
Don't use the storage context in the trial/trainable
justinvyu Jul 27, 2023
562369f
Disable all trainable syncing in new codepath
justinvyu Jul 27, 2023
95a3d20
Pipe storage context to Train workers (not actually used yet)
justinvyu Jul 23, 2023
484e67f
Fix race condition for setting checkpoint_uri
justinvyu Jul 24, 2023
2148669
Fix cyclical import
justinvyu Jul 27, 2023
8c856b8
Add simple trainer test
justinvyu Jul 27, 2023
78c525f
Add legacy prefix to train session checkpoint uri
justinvyu Jul 27, 2023
e97f471
Add new checkpoint class
justinvyu Jul 27, 2023
64945be
New train session report implementation using new checkpoint
justinvyu Jul 28, 2023
c6480c9
Simplify checkpoint propagation from user code (in worker) -> trainer…
justinvyu Jul 28, 2023
c681ccb
New tune session.report
justinvyu Jul 28, 2023
795bafe
Save direction works with new checkpoint API
justinvyu Jul 28, 2023
8a084bc
Update test with e2e trainer test
justinvyu Jul 28, 2023
725d802
Make callback supporting new checkpoint a todo for now
justinvyu Jul 28, 2023
877acb9
Remove unnecessary comment
justinvyu Jul 28, 2023
ee4ccbd
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Jul 28, 2023
88042b3
Separate out the new set checkpoint id from the old set checkpoint uri
justinvyu Jul 28, 2023
a5eeab2
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Jul 31, 2023
a6cd9dc
Update id -> index
justinvyu Jul 31, 2023
01f34bb
Address comments on error to raise with old ckpt type
justinvyu Jul 31, 2023
65e7a27
Move checkpoint upload logic to a helper fn of storage ctx
justinvyu Jul 31, 2023
f2a4c36
Drop a checkpoint marker after uploading
justinvyu Jul 31, 2023
49ee126
Add a simplified checkpoint manager
justinvyu Aug 1, 2023
ffa0dd4
Fixes to checkpoint manager
justinvyu Aug 1, 2023
15553f7
Add unit test for simplified checkpoint manager
justinvyu Aug 1, 2023
00cc9d7
Full test coverage
justinvyu Aug 1, 2023
cb5990e
Add a simplified checkpoint manager
justinvyu Aug 1, 2023
2db9aae
Fixes to checkpoint manager
justinvyu Aug 1, 2023
a2067b7
Add unit test for simplified checkpoint manager
justinvyu Aug 1, 2023
f1216f2
Full test coverage
justinvyu Aug 1, 2023
d4243e6
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 1, 2023
6699d81
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 1, 2023
9b9ff34
Simplify even more
justinvyu Aug 1, 2023
83aecd9
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 1, 2023
913af10
Patch fix for circular imports
justinvyu Aug 1, 2023
6b5d34e
Use new checkpoint manager in Tune ckpt book-keeping
justinvyu Aug 2, 2023
24f441a
Update result to return a train.Checkpoint to the user
justinvyu Aug 2, 2023
504ed54
Update e2e test to try multiple ckpt configs for trainer test
justinvyu Aug 2, 2023
1992161
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 2, 2023
b9eb88f
Fix lint for trial.py
justinvyu Aug 2, 2023
a6115b3
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 2, 2023
7cc74d9
Rename _TrackedCheckpoint -> _TrainingResult
justinvyu Aug 2, 2023
6a0e1fb
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 2, 2023
4662789
Merge branch 'air/persistence/simplified_ckpt_manager' into air/persi…
justinvyu Aug 2, 2023
8da0477
Fixes after merging latest ckpt manager changes
justinvyu Aug 2, 2023
255b149
Remove prints / convert to logger.debug
justinvyu Aug 2, 2023
0971aca
Don't set training iteration as the default checkpoint_score_attr
justinvyu Aug 2, 2023
6e7a873
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 2, 2023
6f6a341
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 2, 2023
d9804b0
Fix test to reflect working dir change
justinvyu Aug 2, 2023
318158f
Don't upload a .is_checkpoint marker
justinvyu Aug 2, 2023
a54664c
Add back cwd check
justinvyu Aug 2, 2023
c4263ec
Update the dir trees + better naming for ckpt shards and artifacts
justinvyu Aug 2, 2023
0cd7e47
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 2, 2023
3a6eba6
A different fix for the circular dep
justinvyu Aug 2, 2023
b65e9fe
Update checkpoint -> _checkpoint imports
justinvyu Aug 2, 2023
b89bd1c
fix lint
justinvyu Aug 2, 2023
cada06a
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 2, 2023
3b784d7
Revert all changes to ckpt manager
justinvyu Aug 2, 2023
49c1ead
Don't set checkpoint user metadata
justinvyu Aug 2, 2023
7177940
Remove remaining print
justinvyu Aug 2, 2023
ae8a9ec
Add trial_path property to storage ctx
justinvyu Aug 3, 2023
c1c8441
Use storage context for all experiment/trial path properties
justinvyu Aug 3, 2023
5d2ca07
Don't skip trainer test cases for custom_fs
justinvyu Aug 3, 2023
1fcfb3f
Split some utilities into helper methods + test for ResultGrid paths
justinvyu Aug 3, 2023
5e2a933
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 3, 2023
61fdadf
Prepend legacy to old path attributes in trial
justinvyu Aug 3, 2023
d38cd87
Remove todo
justinvyu Aug 3, 2023
3ba944a
Bump the test size
justinvyu Aug 3, 2023
5f71608
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 3, 2023
52d6c14
Merge branch 'air/persistence/new_checkpoint' into air/persistence/fi…
justinvyu Aug 3, 2023
9c16120
Clean up experiment path handling
justinvyu Aug 3, 2023
76468d9
Fix for base trainer
justinvyu Aug 3, 2023
b17c17e
Fix for base trainer pt 2
justinvyu Aug 3, 2023
30e3328
Add in missing legacy property
justinvyu Aug 3, 2023
f25ad39
Prepend legacy to old path attributes in experiment
justinvyu Aug 3, 2023
e1846ec
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 3, 2023
d11ede8
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 3, 2023
c99d30c
too much space
justinvyu Aug 3, 2023
9d11d2d
remove unused var
justinvyu Aug 3, 2023
950b991
Fix lint
justinvyu Aug 3, 2023
ed86255
restore mostly works
justinvyu Aug 4, 2023
de4b924
hacky way of getting checkpoint folders to increment correctly
justinvyu Aug 4, 2023
e060476
Fix for xgboost trainer
justinvyu Aug 4, 2023
bd5c846
Fix as_directory / download file lock race condition
justinvyu Aug 4, 2023
e51fb17
Update test with auto-recovery fault tolerance
justinvyu Aug 4, 2023
eaa26c5
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 4, 2023
0c3c5c8
compute storage_prefix
justinvyu Aug 4, 2023
217af77
Remove '_path' properties from storage
justinvyu Aug 4, 2023
8e2330c
Move exp dir name helper to storage ctx
justinvyu Aug 4, 2023
7502cca
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 4, 2023
f3f22fd
Fix bugs causing broken CI
justinvyu Aug 4, 2023
c6c3dfe
Fix renamed attribute in mock test class
justinvyu Aug 4, 2023
4e56bd0
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 4, 2023
314e8bd
Merge branch 'air/persistence/fix_custom_fs_path_expansion' into air/…
justinvyu Aug 4, 2023
36464af
fix storage attr setting to only happen if ff enabled
justinvyu Aug 5, 2023
6e73f6e
cleanup on errors in as_directory
justinvyu Aug 5, 2023
bcbcec9
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 5, 2023
d7497e1
fix merge conflict remainder
justinvyu Aug 5, 2023
aeb89ba
Recover trainable metadata from last_result rather than .tune_metadata
justinvyu Aug 7, 2023
9556371
Fix restore info log
justinvyu Aug 7, 2023
e897eaa
Keep current checkpoint index synchronized on the driver
justinvyu Aug 7, 2023
3eef417
Remove checkpoint dirname parsing
justinvyu Aug 7, 2023
0e52384
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 7, 2023
89631ab
Update todo comment
justinvyu Aug 7, 2023
d4e20f2
Fix lint
justinvyu Aug 7, 2023
72aa1fb
Rename to starting_checkpoint
justinvyu Aug 7, 2023
fb056f8
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 7, 2023
ca0df9f
Fix lint
justinvyu Aug 7, 2023
3636a21
fix typo
justinvyu Aug 8, 2023
8743a99
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 8, 2023
a0c5a26
Fix repr
justinvyu Aug 8, 2023
0a40c47
Merge branch 'master' of https://github.com/ray-project/ray into air/…
justinvyu Aug 8, 2023
53 changes: 28 additions & 25 deletions python/ray/train/_checkpoint.py
@@ -211,35 +211,38 @@ def as_directory(self) -> Iterator[str]:
         if isinstance(self.filesystem, pyarrow.fs.LocalFileSystem):
             yield self.path
         else:
-            temp_dir = self.to_directory()
-            del_lock_path = _get_del_lock_path(temp_dir)
+            del_lock_path = _get_del_lock_path(self._get_temporary_checkpoint_dir())
             open(del_lock_path, "a").close()

-            yield temp_dir
-
-            # Cleanup
             try:
-                os.remove(del_lock_path)
-            except Exception:
-                logger.warning(
-                    f"Could not remove {del_lock_path} deletion file lock. "
-                    f"Traceback:\n{traceback.format_exc()}"
-                )
-
-            # In the edge case (process crash before del lock file is removed),
-            # we do not remove the directory at all.
-            # Since it's in /tmp, this is not that big of a deal.
-            # check if any lock files are remaining
-            remaining_locks = _list_existing_del_locks(temp_dir)
-            if not remaining_locks:
+                temp_dir = self.to_directory()
+                yield temp_dir
+            finally:
+                # Always cleanup the del lock after we're done with the directory.
+                # This avoids leaving a lock file behind in the case of an exception
+                # in the user code.
                 try:
-                    # Timeout 0 means there will be only one attempt to acquire
-                    # the file lock. If it cannot be acquired, a TimeoutError
-                    # will be thrown.
-                    with TempFileLock(f"{temp_dir}.lock", timeout=0):
-                        shutil.rmtree(temp_dir, ignore_errors=True)
-                except TimeoutError:
-                    pass
+                    os.remove(del_lock_path)
+                except Exception:
+                    logger.warning(
+                        f"Could not remove {del_lock_path} deletion file lock. "
+                        f"Traceback:\n{traceback.format_exc()}"
+                    )
+
+                # In the edge case (process crash before del lock file is removed),
+                # we do not remove the directory at all.
+                # Since it's in /tmp, this is not that big of a deal.
+                # check if any lock files are remaining
+                remaining_locks = _list_existing_del_locks(temp_dir)
+                if not remaining_locks:
+                    try:
+                        # Timeout 0 means there will be only one attempt to acquire
+                        # the file lock. If it cannot be acquired, a TimeoutError
+                        # will be thrown.
+                        with TempFileLock(temp_dir, timeout=0):
+                            shutil.rmtree(temp_dir, ignore_errors=True)
+                    except TimeoutError:
+                        pass

     def _get_temporary_checkpoint_dir(self) -> str:
         """Return the name for the temporary checkpoint dir that this checkpoint
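For context (not part of the diff): the restructuring above moves the temp-directory download and the `yield` inside a `try`, so the per-process deletion lock file is always removed in the `finally` block, even when user code raises inside `with checkpoint.as_directory():`. A standalone sketch of the underlying lock-file pattern (illustration only; the helper names below are hypothetical, not Ray's internal `_get_del_lock_path` / `_list_existing_del_locks`):

import glob
import os
import shutil
import tempfile


def _del_lock_path(temp_dir: str, pid: int) -> str:
    # One marker file per process, next to the shared temp directory.
    return f"{temp_dir}.del_lock_{pid}"


def use_shared_temp_dir(temp_dir: str) -> None:
    del_lock_path = _del_lock_path(temp_dir, os.getpid())
    open(del_lock_path, "a").close()
    try:
        pass  # ... read files from temp_dir; user code may raise here ...
    finally:
        # Always drop our own lock, even if the body raised.
        os.remove(del_lock_path)
        # Only delete the directory once no process holds a lock anymore.
        if not glob.glob(f"{temp_dir}.del_lock_*"):
            shutil.rmtree(temp_dir, ignore_errors=True)


shared_dir = tempfile.mkdtemp()
use_shared_temp_dir(shared_dir)
assert not os.path.exists(shared_dir)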
22 changes: 11 additions & 11 deletions python/ray/train/_internal/storage.py
@@ -400,7 +400,7 @@ def __init__(
         experiment_dir_name: str,
         storage_filesystem: Optional[pyarrow.fs.FileSystem] = None,
         trial_dir_name: Optional[str] = None,
-        current_checkpoint_index: Optional[int] = None,
+        current_checkpoint_index: int = 0,
     ):
         storage_path_provided = storage_path is not None

@@ -591,18 +591,13 @@ def trial_fs_path(self) -> str:

     @property
     def checkpoint_fs_path(self) -> str:
-        """The trial directory path on the `storage_filesystem`.
+        """The current checkpoint directory path on the `storage_filesystem`.

-        Raises a ValueError if `current_checkpoint_index` is not set beforehand.
+        "Current" refers to the checkpoint that is currently being created/persisted.
+        The user of this class is responsible for setting the `current_checkpoint_index`
+        (e.g., incrementing when needed).
         """
-        from ray.tune.trainable.util import TrainableUtil
-
-        if self.current_checkpoint_index is None:
-            raise RuntimeError(
-                "Should not access `checkpoint_fs_path` without setting "
-                "`current_checkpoint_index`"
-            )
-        checkpoint_dir_name = TrainableUtil._make_checkpoint_dir_name(
+        checkpoint_dir_name = StorageContext._make_checkpoint_dir_name(
             self.current_checkpoint_index
         )
         return os.path.join(self.trial_fs_path, checkpoint_dir_name)
@@ -620,6 +615,11 @@ def get_experiment_dir_name(run_obj: Union[str, Callable, Type]) -> str:
         dir_name = "{}_{}".format(run_identifier, date_str())
         return dir_name

+    @staticmethod
+    def _make_checkpoint_dir_name(index: int):
+        """Get the name of the checkpoint directory, given an index."""
+        return f"checkpoint_{index:06d}"
+

 _storage_context: Optional[StorageContext] = None

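For context (not part of the diff): `checkpoint_fs_path` is now computed purely from `trial_fs_path` plus a zero-padded `current_checkpoint_index` that defaults to 0, and the caller is responsible for incrementing that index. A minimal sketch of the composition (hypothetical paths, not the real `StorageContext` setup):

import os


def _make_checkpoint_dir_name(index: int) -> str:
    # Mirrors the static helper added above: zero-padded so directories sort lexicographically.
    return f"checkpoint_{index:06d}"


trial_fs_path = "bucket/my_experiment/my_trial"  # hypothetical trial path
current_checkpoint_index = 2

checkpoint_fs_path = os.path.join(trial_fs_path, _make_checkpoint_dir_name(current_checkpoint_index))
print(checkpoint_fs_path)  # bucket/my_experiment/my_trial/checkpoint_000002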
37 changes: 21 additions & 16 deletions python/ray/train/base_trainer.py
@@ -22,6 +22,7 @@
 from ray.air.config import RunConfig, ScalingConfig
 from ray.air.result import Result
 from ray.train._internal import session
+from ray.train._internal.storage import _use_storage_context
 from ray.train.constants import TRAIN_DATASET_KEY
 from ray.util import PublicAPI
 from ray.util.annotations import DeveloperAPI
@@ -191,7 +192,7 @@ def __init__(
         self.run_config = run_config if run_config is not None else RunConfig()
         self.datasets = datasets if datasets is not None else {}
         self.preprocessor = preprocessor
-        self.resume_from_checkpoint = resume_from_checkpoint
+        self.starting_checkpoint = resume_from_checkpoint

         # This path should only be set through restore
         self._restore_path = None
@@ -377,7 +378,7 @@ def __repr__(self):
             "run_config": RunConfig(),
             "datasets": {},
             "preprocessor": None,
-            "resume_from_checkpoint": None,
+            "starting_checkpoint": None,
         }

         non_default_arguments = []
@@ -452,13 +453,13 @@ def _validate_attributes(self):
                 f"found {type(self.preprocessor)} with value `{self.preprocessor}`."
             )

-        if self.resume_from_checkpoint is not None and not isinstance(
-            self.resume_from_checkpoint, ray.air.Checkpoint
+        if self.starting_checkpoint is not None and not isinstance(
+            self.starting_checkpoint, ray.air.Checkpoint
         ):
             raise ValueError(
                 f"`resume_from_checkpoint` should be an instance of "
-                f"`ray.train.Checkpoint`, found {type(self.resume_from_checkpoint)} "
-                f"with value `{self.resume_from_checkpoint}`."
+                f"`ray.train.Checkpoint`, found {type(self.starting_checkpoint)} "
+                f"with value `{self.starting_checkpoint}`."
             )

     @classmethod
@@ -700,18 +701,22 @@ def train_func(config):
             # Instantiate new Trainer in Trainable.
             trainer = trainer_cls(**config)

-            # Get the checkpoint from the train context, and use it to initialize
-            # the restored trainer.
-            # This handles both worker-level and cluster-level restoration
-            # of the Train experiment.
+            # Get the checkpoint from Tune and pass it to workers later on.
             checkpoint = session.get_checkpoint()
             if checkpoint:
-                trainer.resume_from_checkpoint = checkpoint
-                # Always load the preprocessor from an available checkpoint
-                # Unless we are restoring the experiment and have explicitly
-                # passed in a new preprocessor
-                if not (restored and trainer.preprocessor):
-                    trainer.preprocessor = checkpoint.get_preprocessor()
+                # Set `starting_checkpoint` for auto-recovery fault-tolerance
+                # as well as manual restoration.
+                trainer.starting_checkpoint = checkpoint
+
+                # TODO(justinvyu): Remove this when Preprocessor is removed from Trainer
+                if not _use_storage_context():
+                    # Always load the preprocessor from an available checkpoint
+                    # Unless we are restoring the experiment and have explicitly
+                    # passed in a new preprocessor
+                    if not (restored and trainer.preprocessor):
+                        trainer.preprocessor = checkpoint.get_preprocessor()
+            # Else: Train will restore from the user-provided
+            # `resume_from_checkpoint` == `starting_checkpoint`.

             trainer.setup()
             trainer.preprocess_datasets()
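For context (not part of the diff): the rename keeps `resume_from_checkpoint` as the public constructor argument while storing it internally as `starting_checkpoint`; on auto-recovery, the checkpoint handed back by Tune overwrites it. A rough user-facing sketch, assuming the `ray.train` exports used in the tests later in this diff (`ScalingConfig`, `RunConfig`), that `ray.train.Checkpoint` is the new checkpoint class this PR series introduces, and a hypothetical checkpoint path:

from ray import train
from ray.train import Checkpoint
from ray.train.data_parallel_trainer import DataParallelTrainer


def train_fn(config):
    # Placeholder training loop; a real one would load `train.get_checkpoint()`
    # and periodically call `train.report(..., checkpoint=...)`.
    train.report({"iter": 0})


trainer = DataParallelTrainer(
    train_fn,
    scaling_config=train.ScalingConfig(num_workers=2),
    run_config=train.RunConfig(name="my_experiment"),
    # The public argument name is unchanged; internally it becomes
    # `trainer.starting_checkpoint`.
    resume_from_checkpoint=Checkpoint.from_directory("/tmp/previous_checkpoint"),
)
# On auto-recovery (or manual restoration), Tune supplies its latest tracked
# checkpoint via `session.get_checkpoint()`, which overwrites
# `starting_checkpoint` before the training loop runs (see the hunk above).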
2 changes: 1 addition & 1 deletion python/ray/train/data_parallel_trainer.py
@@ -527,7 +527,7 @@ def clear_lazy_checkpoint_marker():
             datasets=self.datasets,
             data_config=self._data_config,
             checkpoint_manager=checkpoint_manager,
-            checkpoint=self.resume_from_checkpoint,
+            checkpoint=self.starting_checkpoint,
             checkpoint_strategy=checkpoint_strategy,
             storage_path=self.run_config.storage_path,
         )
4 changes: 2 additions & 2 deletions python/ray/train/gbdt_trainer.py
@@ -280,8 +280,8 @@ def training_loop(self) -> None:
         evals_result = {}

         init_model = None
-        if self.resume_from_checkpoint:
-            init_model, _ = self._load_checkpoint(self.resume_from_checkpoint)
+        if self.starting_checkpoint:
+            init_model, _ = self._load_checkpoint(self.starting_checkpoint)

         config.setdefault("verbose_eval", False)
         config.setdefault("callbacks", [])
2 changes: 1 addition & 1 deletion python/ray/train/tests/test_base_trainer.py
@@ -398,7 +398,7 @@ def test_large_params(ray_start_4_cpus):
     array_size = int(1e8)

     def training_loop(self):
-        checkpoint = self.resume_from_checkpoint.to_dict()["ckpt"]
+        checkpoint = self.starting_checkpoint.to_dict()["ckpt"]
         assert len(checkpoint) == array_size

     checkpoint = Checkpoint.from_dict({"ckpt": np.zeros(shape=array_size)})
15 changes: 15 additions & 0 deletions python/ray/train/tests/test_checkpoint.py
@@ -156,6 +156,21 @@ def test_multiprocess_as_directory(checkpoint: Checkpoint, monkeypatch):
     assert not Path(checkpoint_dir_1).exists()


+def test_as_directory_lock_cleanup(checkpoint: Checkpoint):
+    """Errors when accessing a checkpoint with `as_directory`
+    shouldn't leave behind lock files.
+    """
+    with pytest.raises(RuntimeError):
+        with checkpoint.as_directory() as checkpoint_dir:
+            raise RuntimeError
+
+    assert not _list_existing_del_locks(checkpoint_dir)
+
+    is_local_checkpoint = isinstance(checkpoint.filesystem, pyarrow.fs.LocalFileSystem)
+    if not is_local_checkpoint:
+        assert not Path(checkpoint_dir).exists()
+
+
 def test_metadata(checkpoint: Checkpoint):
     assert checkpoint.get_metadata() == {}

26 changes: 19 additions & 7 deletions python/ray/train/tests/test_new_persistence.py
@@ -143,24 +143,28 @@ def train_fn(config):
     for i in range(start, config.get("num_iterations", 5)):
         time.sleep(0.25)

-        checkpoint_file_name = "checkpoint.pkl"
+        temp_dir = tempfile.mkdtemp()
+        with open(os.path.join(temp_dir, "checkpoint.pkl"), "wb") as f:
+            pickle.dump({"iter": i}, f)
+
         artifact_file_name = f"artifact-iter={i}.txt"
         if in_trainer:
             rank = train.get_context().get_world_rank()
-            checkpoint_file_name = f"checkpoint_shard-rank={rank}.pkl"
             artifact_file_name = f"artifact-rank={rank}-iter={i}.txt"

+            checkpoint_file_name = f"checkpoint_shard-rank={rank}.pkl"
+            with open(os.path.join(temp_dir, checkpoint_file_name), "wb") as f:
+                pickle.dump({"iter": i}, f)
+
         with open(artifact_file_name, "w") as f:
             f.write(f"{i}")

-        temp_dir = tempfile.mkdtemp()
-        with open(os.path.join(temp_dir, checkpoint_file_name), "wb") as f:
-            pickle.dump({"iter": i}, f)
-
         train.report(
             {"iter": i, _SCORE_KEY: i},
             checkpoint=NewCheckpoint.from_directory(temp_dir),
         )
+        if i in config.get("fail_iters", []):
+            raise RuntimeError(f"Failing on iter={i}!!")


 @pytest.mark.parametrize("storage_path_type", [None, "nfs", "cloud", "custom_fs"])
@@ -287,6 +291,7 @@ def test_trainer(
     ├── progress.csv
     ├── result.json
     ├── checkpoint_000000
+    │   ├── checkpoint.pkl                <- Shared checkpoint file
     │   ├── checkpoint_shard-rank=0.pkl   <- Worker checkpoint shards
     │   └── checkpoint_shard-rank=1.pkl
     ├── ...
@@ -309,14 +314,19 @@
     NUM_WORKERS = 2
     trainer = DataParallelTrainer(
         train_fn,
-        train_loop_config={"in_trainer": True, "num_iterations": NUM_ITERATIONS},
+        train_loop_config={
+            "in_trainer": True,
+            "num_iterations": NUM_ITERATIONS,
+            "fail_iters": [2, 4],
+        },
         scaling_config=train.ScalingConfig(num_workers=2),
         run_config=train.RunConfig(
             storage_path=storage_path,
             storage_filesystem=storage_filesystem,
             name=exp_name,
             verbose=0,
             checkpoint_config=checkpoint_config,
+            failure_config=train.FailureConfig(max_failures=2),
         ),
     )
     result = trainer.fit()
@@ -352,6 +362,8 @@ def test_trainer(

     assert len(list(trial_dir.glob("checkpoint_*"))) == expected_num_checkpoints
     for checkpoint_dir in trial_dir.glob("checkpoint_*"):
+        # 1 shared checkpoint.pkl file, written by all workers.
+        assert len(list(checkpoint_dir.glob("checkpoint.pkl"))) == 1
         # 1 checkpoint shard per worker.
         assert (
             len(list(checkpoint_dir.glob("checkpoint_shard-*.pkl"))) == NUM_WORKERS
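For context (not part of the diff): with `fail_iters` injecting worker failures and `FailureConfig(max_failures=2)` enabled, the run is expected to auto-recover and resume from the last reported checkpoint. The restore half of the training function (where `start` comes from) is not shown in this hunk; a minimal sketch of that pattern, assuming a `train.get_checkpoint()`-style accessor:

import os
import pickle

from ray import train


def _restored_start_iter() -> int:
    # Fresh run: no checkpoint. Auto-recovery: Train hands back the last
    # checkpoint reported via `train.report(..., checkpoint=...)`.
    checkpoint = train.get_checkpoint()
    if checkpoint is None:
        return 0
    with checkpoint.as_directory() as checkpoint_dir:
        with open(os.path.join(checkpoint_dir, "checkpoint.pkl"), "rb") as f:
            return pickle.load(f)["iter"] + 1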
23 changes: 9 additions & 14 deletions python/ray/train/trainer.py
@@ -71,9 +71,9 @@ def __init__(
         # TrainingResult event. There's no need to do these one at a time.
         self._checkpoint_to_report = None

-        # TODO(justinvyu): Is this the best way to do this? Need to save this
-        # as part of checkpoint metadata and load it back on restore.
-        self._latest_checkpoint_index = 0
+        self._storage = None
+        if _use_storage_context():
+            self._storage = get_storage_context()

         self._start_training(
             train_func=train_func,
@@ -103,7 +103,10 @@ def _start_training(
             run_dir=run_dir,
             latest_checkpoint_id=latest_checkpoint_id,
         )
-        checkpoint = self._checkpoint_manager._load_checkpoint(checkpoint)
+
+        if not _use_storage_context():
+            checkpoint = self._checkpoint_manager._load_checkpoint(checkpoint)
+
         self._run_with_error_handling(
             lambda: self._backend_executor.start_training(
                 train_func=train_func,
@@ -119,18 +122,10 @@ def _send_next_checkpoint_path_to_workers(self):
         # NOTE: Always upload to storage from workers in the new persistence path
         # (no need to check for the `checkpoint_upload_from_workers` flag)
         if _use_storage_context():
-            storage = get_storage_context()
-
-            # NOTE: Idea: this checkpoint dir name should be customizable
-            # and created on the fly when the checkpoint is reported with metrics.
-            # Ex: lambda metrics: f"checkpoint_iter={metrics['training_iteration']}"
-            storage.current_checkpoint_index = self._latest_checkpoint_index
-
             self._backend_executor._set_checkpoint_index(
-                storage.current_checkpoint_index
+                self._storage.current_checkpoint_index
             )
-
-            self._latest_checkpoint_index += 1
+            self._storage.current_checkpoint_index += 1

         elif self._checkpoint_strategy._checkpoint_upload_from_workers:
             self._backend_executor._set_legacy_checkpoint_uri(
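For context (not part of the diff): the driver now owns the checkpoint index through the shared storage context instead of a private `_latest_checkpoint_index` counter; it tells workers which index to upload into, then increments. A standalone sketch of that bookkeeping (illustration only; `_FakeStorage` stands in for the real `StorageContext` and backend executor):

class _FakeStorage:
    def __init__(self, current_checkpoint_index: int = 0):
        self.current_checkpoint_index = current_checkpoint_index


def send_next_checkpoint_dir(storage: _FakeStorage, sent_dirs: list) -> None:
    # Tell the workers which checkpoint_XXXXXX directory to upload into ...
    sent_dirs.append(f"checkpoint_{storage.current_checkpoint_index:06d}")
    # ... then bump the counter so the next report lands in a fresh directory.
    storage.current_checkpoint_index += 1


storage = _FakeStorage()
sent = []
for _ in range(3):
    send_next_checkpoint_dir(storage, sent)
assert sent == ["checkpoint_000000", "checkpoint_000001", "checkpoint_000002"]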
26 changes: 22 additions & 4 deletions python/ray/tune/execution/tune_controller.py
@@ -1956,11 +1956,29 @@ def _checkpoint_trial_if_needed(self, trial, force=False):
     ###
     # RESTORE
     def _schedule_trial_restore(self, trial: Trial) -> bool:
-        checkpoint = trial.checkpoint
-
         if _use_storage_context():
-            # TODO(justinvyu): Skipping restoration altogether for now.
-            return False
+            checkpoint_result = trial.checkpoint_manager.latest_checkpoint_result
+
+            if not checkpoint_result:
+                logger.debug(f"Not restoring trial {trial}: No checkpoint found.")
+                return False
+
+            # TODO(justinvyu): Is this really needed?
+            trial.restoring_from = checkpoint_result
+
+            method_name = "restore"
+            args = (checkpoint_result,)
+            self._schedule_trial_task(
+                trial=trial,
+                method_name=method_name,
+                args=args,
+                kwargs={},
+                on_result=self._on_restoring_result,
+                on_error=self._trial_task_failure,
+            )
+            return True
+
+        checkpoint = trial.checkpoint

         if checkpoint.dir_or_data is None:
             logger.debug(f"Not restoring trial {trial}: No checkpoint found.")