Skip to content

Commit

Permalink
fix: warn out if catching SystemExit [DET-2956]
Browse files Browse the repository at this point in the history
  • Loading branch information
shiyuann committed Aug 27, 2020
1 parent 3b10cad commit e862662
Show file tree
Hide file tree
Showing 9 changed files with 59 additions and 41 deletions.
2 changes: 1 addition & 1 deletion cli/determined_cli/experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,7 @@ def local_experiment(args: Namespace) -> None:

determined_common.set_logger(bool(experiment_config.get("debug", False)))

with det._local_execution_manager(args.model_def.resolve()):
with det._execution_manager(args.model_def.resolve()):
trial_class = load.load_trial_implementation(experiment_config["entrypoint"])
experimental.test_one_batch(trial_class=trial_class, config=experiment_config)

Expand Down
5 changes: 5 additions & 0 deletions docs/release-notes/1116-sys-exit.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
:orphan:

**Improvements**

- Remind users to remove ``sys.exit()`` if ``SystemExit`` exception is caught.
4 changes: 2 additions & 2 deletions harness/determined/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
LoopTrialController,
TrialController,
)
from determined._local_execution import (
from determined._execution import (
_make_local_execution_env,
_local_execution_manager,
_execution_manager,
)
from determined import errors
from determined import util
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,18 @@ def _get_gpus() -> Tuple[bool, List[str], List[int]]:
return use_gpu, gpu_uuids, gpu_ids


@contextlib.contextmanager
def _catch_sys_exit() -> Any:
try:
yield
except SystemExit as e:
raise det.errors.InvalidExperimentException(
"User code raised a SystemExit exception. "
"This might be raised by directly calling or using a library calling sys.exit(). "
"Please remove any calls to sys.exit() from your model code."
) from e


def _make_local_execution_exp_config(input_config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
"""
Create a local experiment configuration based on an input configuration and
Expand Down Expand Up @@ -92,16 +104,18 @@ def _make_local_execution_env(


@contextlib.contextmanager
def _local_execution_manager(context_dir: pathlib.Path) -> Iterator:
def _execution_manager(context_dir: Optional[pathlib.Path] = None) -> Iterator:
"""
A context manager used for local execution to mimic the environment of trial
container execution.
A context manager used for local and cluster execution.
It does the following things:
1. Set the current working directory to be the context directory.
2. Add the current working directory to importing paths.
3. Catch system exit.
"""
current_directory = os.getcwd()
if context_dir is None:
context_dir = pathlib.Path.cwd()
current_cwd = os.getcwd()
current_path = sys.path[0]

try:
Expand All @@ -113,7 +127,9 @@ def _local_execution_manager(context_dir: pathlib.Path) -> Iterator:
# the current directory first.
# Reference: https://docs.python.org/3/library/sys.html#sys.path
sys.path[0] = ""
yield

with _catch_sys_exit():
yield
finally:
os.chdir(current_directory)
os.chdir(current_cwd)
sys.path[0] = current_path
17 changes: 9 additions & 8 deletions harness/determined/exec/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,15 @@ def build_and_run_training_pipeline(env: det.EnvContext) -> None:
if env.experiment_config.debug_enabled():
faulthandler.dump_traceback_later(30, repeat=True)

controller = load.prepare_controller(
env,
iter(workload_mgr),
load_path,
socket_mgr.get_rendezvous_info(),
hvd_config,
)
controller.run()
with det._execution_manager():
controller = load.prepare_controller(
env,
iter(workload_mgr),
load_path,
socket_mgr.get_rendezvous_info(),
hvd_config,
)
controller.run()


def main() -> None:
Expand Down
30 changes: 16 additions & 14 deletions harness/determined/exec/worker_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pathlib
import sys

import determined as det
from determined import ipc, layers, load


Expand Down Expand Up @@ -37,20 +38,21 @@ def main() -> None:
# Wrap the communication layer in a workload.Stream.
subrec = layers.SubprocessReceiver(broadcast_client)

controller = load.prepare_controller(
worker_process_env.env,
iter(subrec),
worker_process_env.load_path,
worker_process_env.rendezvous_info,
worker_process_env.hvd_config,
)

try:
controller.run()

except Exception as e:
broadcast_client.send_exception_message()
raise e
with det._execution_manager():
controller = load.prepare_controller(
worker_process_env.env,
iter(subrec),
worker_process_env.load_path,
worker_process_env.rendezvous_info,
worker_process_env.hvd_config,
)

try:
controller.run()

except Exception as e:
broadcast_client.send_exception_message()
raise e


if __name__ == "__main__":
Expand Down
6 changes: 3 additions & 3 deletions harness/determined/experimental/_native.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,7 +174,7 @@ def _init_cluster_mode(
def _load_trial_on_local(
context_dir: pathlib.Path, training: bool, config: Dict[str, Any], hparams: Dict[str, Any]
) -> Tuple[Type[det.Trial], det.TrialContext]:
with det._local_execution_manager(context_dir):
with det._execution_manager(context_dir):
trial_class = load.load_trial_implementation(config["entrypoint"])
env, rendezvous_info, hvd_config = det._make_local_execution_env(training, config, hparams)
trial_context = trial_class.trial_context_class(env, hvd_config)
Expand Down Expand Up @@ -263,9 +263,9 @@ def init_native(

if local:
if not test:
logging.warn("local training is not supported, testing instead")
logging.warning("local training is not supported, testing instead")

with det._local_execution_manager(pathlib.Path(context_dir).resolve()):
with det._execution_manager(pathlib.Path(context_dir).resolve()):
return test_one_batch(
controller_cls=controller_cls,
native_context_cls=native_context_cls,
Expand Down
6 changes: 0 additions & 6 deletions harness/determined/load/_load_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,12 +188,6 @@ def load_native_implementation(
with overwrite_sys_args(command):
try:
runpy.run_path(command[0], run_name="__main__")
except SystemExit as e:
logging.warning(
"Model code raised a SystemExit (sys.exit()) before entering "
"the training loop. Please remove this sys.exit() from your script."
)
raise e
except det.errors.StopLoadingImplementation:
# If caught this exception, will skip running the rest of the user code.
pass
Expand Down
2 changes: 1 addition & 1 deletion harness/tests/experiment/test_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@


def test_test_one_batch() -> None:
with det._local_execution_manager(pathlib.Path(pytorch_xor_model.__file__).parent):
with det._execution_manager(pathlib.Path(pytorch_xor_model.__file__).parent):
experimental.test_one_batch(
trial_class=pytorch_xor_model.XORTrial,
config={
Expand Down

0 comments on commit e862662

Please sign in to comment.