diff --git a/cli/determined_cli/experiment.py b/cli/determined_cli/experiment.py index a95a59500d0f..b907821ec8fa 100644 --- a/cli/determined_cli/experiment.py +++ b/cli/determined_cli/experiment.py @@ -173,7 +173,7 @@ def local_experiment(args: Namespace) -> None: determined_common.set_logger(bool(experiment_config.get("debug", False))) - with det._local_execution_manager(args.model_def.resolve()): + with det._execution_manager(args.model_def.resolve()): trial_class = load.load_trial_implementation(experiment_config["entrypoint"]) experimental.test_one_batch(trial_class=trial_class, config=experiment_config) diff --git a/docs/release-notes/1116-sys-exit.txt b/docs/release-notes/1116-sys-exit.txt new file mode 100644 index 000000000000..d759199d810f --- /dev/null +++ b/docs/release-notes/1116-sys-exit.txt @@ -0,0 +1,5 @@ +:orphan: + +**Improvements** + +- Remind users to remove ``sys.exit()`` if ``SystemExit`` exception is caught. diff --git a/harness/determined/__init__.py b/harness/determined/__init__.py index 3d0c86551bde..8d6c5bf40a21 100644 --- a/harness/determined/__init__.py +++ b/harness/determined/__init__.py @@ -10,9 +10,9 @@ LoopTrialController, TrialController, ) -from determined._local_execution import ( +from determined._execution import ( _make_local_execution_env, - _local_execution_manager, + _execution_manager, ) from determined import errors from determined import util diff --git a/harness/determined/_local_execution.py b/harness/determined/_execution.py similarity index 84% rename from harness/determined/_local_execution.py rename to harness/determined/_execution.py index feb22ffd9759..2d47a67f2059 100644 --- a/harness/determined/_local_execution.py +++ b/harness/determined/_execution.py @@ -16,6 +16,18 @@ def _get_gpus() -> Tuple[bool, List[str], List[int]]: return use_gpu, gpu_uuids, gpu_ids +@contextlib.contextmanager +def _catch_sys_exit() -> Any: + try: + yield + except SystemExit as e: + raise det.errors.InvalidExperimentException( + "User code raised a SystemExit exception. " + "This might be raised by directly calling or using a library calling sys.exit(). " + "Please remove any calls to sys.exit() from your model code." + ) from e + + def _make_local_execution_exp_config(input_config: Optional[Dict[str, Any]]) -> Dict[str, Any]: """ Create a local experiment configuration based on an input configuration and @@ -92,16 +104,18 @@ def _make_local_execution_env( @contextlib.contextmanager -def _local_execution_manager(context_dir: pathlib.Path) -> Iterator: +def _execution_manager(context_dir: Optional[pathlib.Path] = None) -> Iterator: """ - A context manager used for local execution to mimic the environment of trial - container execution. + A context manager used for local and cluster execution. It does the following things: 1. Set the current working directory to be the context directory. 2. Add the current working directory to importing paths. + 3. Catch system exit. """ - current_directory = os.getcwd() + if context_dir is None: + context_dir = pathlib.Path.cwd() + current_cwd = os.getcwd() current_path = sys.path[0] try: @@ -113,7 +127,9 @@ def _local_execution_manager(context_dir: pathlib.Path) -> Iterator: # the current directory first. # Reference: https://docs.python.org/3/library/sys.html#sys.path sys.path[0] = "" - yield + + with _catch_sys_exit(): + yield finally: - os.chdir(current_directory) + os.chdir(current_cwd) sys.path[0] = current_path diff --git a/harness/determined/exec/harness.py b/harness/determined/exec/harness.py index a4eff475e091..02369dd669bb 100644 --- a/harness/determined/exec/harness.py +++ b/harness/determined/exec/harness.py @@ -136,14 +136,15 @@ def build_and_run_training_pipeline(env: det.EnvContext) -> None: if env.experiment_config.debug_enabled(): faulthandler.dump_traceback_later(30, repeat=True) - controller = load.prepare_controller( - env, - iter(workload_mgr), - load_path, - socket_mgr.get_rendezvous_info(), - hvd_config, - ) - controller.run() + with det._execution_manager(): + controller = load.prepare_controller( + env, + iter(workload_mgr), + load_path, + socket_mgr.get_rendezvous_info(), + hvd_config, + ) + controller.run() def main() -> None: diff --git a/harness/determined/exec/worker_process.py b/harness/determined/exec/worker_process.py index 5506216fbe4d..c67d1b6b17c0 100644 --- a/harness/determined/exec/worker_process.py +++ b/harness/determined/exec/worker_process.py @@ -3,6 +3,7 @@ import pathlib import sys +import determined as det from determined import ipc, layers, load @@ -37,20 +38,21 @@ def main() -> None: # Wrap the communication layer in a workload.Stream. subrec = layers.SubprocessReceiver(broadcast_client) - controller = load.prepare_controller( - worker_process_env.env, - iter(subrec), - worker_process_env.load_path, - worker_process_env.rendezvous_info, - worker_process_env.hvd_config, - ) - - try: - controller.run() - - except Exception as e: - broadcast_client.send_exception_message() - raise e + with det._execution_manager(): + controller = load.prepare_controller( + worker_process_env.env, + iter(subrec), + worker_process_env.load_path, + worker_process_env.rendezvous_info, + worker_process_env.hvd_config, + ) + + try: + controller.run() + + except Exception as e: + broadcast_client.send_exception_message() + raise e if __name__ == "__main__": diff --git a/harness/determined/experimental/_native.py b/harness/determined/experimental/_native.py index 99ef1edbeec8..1142793d9587 100644 --- a/harness/determined/experimental/_native.py +++ b/harness/determined/experimental/_native.py @@ -174,7 +174,7 @@ def _init_cluster_mode( def _load_trial_on_local( context_dir: pathlib.Path, training: bool, config: Dict[str, Any], hparams: Dict[str, Any] ) -> Tuple[Type[det.Trial], det.TrialContext]: - with det._local_execution_manager(context_dir): + with det._execution_manager(context_dir): trial_class = load.load_trial_implementation(config["entrypoint"]) env, rendezvous_info, hvd_config = det._make_local_execution_env(training, config, hparams) trial_context = trial_class.trial_context_class(env, hvd_config) @@ -263,9 +263,9 @@ def init_native( if local: if not test: - logging.warn("local training is not supported, testing instead") + logging.warning("local training is not supported, testing instead") - with det._local_execution_manager(pathlib.Path(context_dir).resolve()): + with det._execution_manager(pathlib.Path(context_dir).resolve()): return test_one_batch( controller_cls=controller_cls, native_context_cls=native_context_cls, diff --git a/harness/determined/load/_load_implementation.py b/harness/determined/load/_load_implementation.py index 89a0d5584138..7d3b5d18b113 100644 --- a/harness/determined/load/_load_implementation.py +++ b/harness/determined/load/_load_implementation.py @@ -188,12 +188,6 @@ def load_native_implementation( with overwrite_sys_args(command): try: runpy.run_path(command[0], run_name="__main__") - except SystemExit as e: - logging.warning( - "Model code raised a SystemExit (sys.exit()) before entering " - "the training loop. Please remove this sys.exit() from your script." - ) - raise e except det.errors.StopLoadingImplementation: # If caught this exception, will skip running the rest of the user code. pass diff --git a/harness/tests/experiment/test_local.py b/harness/tests/experiment/test_local.py index cc6fb1cb9be7..ef89d5b51b71 100644 --- a/harness/tests/experiment/test_local.py +++ b/harness/tests/experiment/test_local.py @@ -6,7 +6,7 @@ def test_test_one_batch() -> None: - with det._local_execution_manager(pathlib.Path(pytorch_xor_model.__file__).parent): + with det._execution_manager(pathlib.Path(pytorch_xor_model.__file__).parent): experimental.test_one_batch( trial_class=pytorch_xor_model.XORTrial, config={