Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: catch SystemExit and error out [DET-2956] #1116

Merged
merged 1 commit into from
Sep 1, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/release-notes/1116-sys-exit.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
:orphan:

**Improvements**

- Remind users to remove ``sys.exit()`` if ``SystemExit`` exception is caught.
3 changes: 2 additions & 1 deletion harness/determined/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@
LoopTrialController,
TrialController,
)
from determined._local_execution import (
from determined._execution import (
_catch_sys_exit,
_make_local_execution_env,
_local_execution_manager,
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,18 @@ def _get_gpus() -> Tuple[bool, List[str], List[int]]:
return use_gpu, gpu_uuids, gpu_ids


@contextlib.contextmanager
def _catch_sys_exit() -> Any:
try:
yield
except SystemExit as e:
raise det.errors.InvalidExperimentException(
"Caught a SystemExit exception. "
"This might be raised by directly calling or using a library calling sys.exit(). "
"Please remove any calls to sys.exit() from your model code."
) from e


def _make_local_execution_exp_config(input_config: Optional[Dict[str, Any]]) -> Dict[str, Any]:
"""
Create a local experiment configuration based on an input configuration and
Expand Down Expand Up @@ -101,6 +113,7 @@ def _local_execution_manager(context_dir: pathlib.Path) -> Iterator:
It does the following things:
1. Set the current working directory to be the context directory.
2. Add the current working directory to importing paths.
3. Catch SystemExit.
"""
current_directory = os.getcwd()
current_path = sys.path[0]
Expand All @@ -114,7 +127,8 @@ def _local_execution_manager(context_dir: pathlib.Path) -> Iterator:
# the current directory first.
# Reference: https://docs.python.org/3/library/sys.html#sys.path
sys.path[0] = ""
yield
with det._catch_sys_exit():
yield
finally:
os.chdir(current_directory)
sys.path[0] = current_path
1 change: 0 additions & 1 deletion harness/determined/estimator/_estimator_trial.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,6 @@ def control_loop(self) -> None:
raise det.errors.WorkerFinishedGracefully("Exiting normally.")
else:
raise AssertionError(f"Unknown wkld kind {wkld.kind}.")
exit(1)


class EstimatorTrialController(det.LoopTrialController):
Expand Down
17 changes: 9 additions & 8 deletions harness/determined/exec/harness.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,14 +136,15 @@ def build_and_run_training_pipeline(env: det.EnvContext) -> None:
if env.experiment_config.debug_enabled():
faulthandler.dump_traceback_later(30, repeat=True)

controller = load.prepare_controller(
env,
iter(workload_mgr),
load_path,
socket_mgr.get_rendezvous_info(),
hvd_config,
)
controller.run()
with det._catch_sys_exit():
controller = load.prepare_controller(
env,
iter(workload_mgr),
load_path,
socket_mgr.get_rendezvous_info(),
hvd_config,
)
controller.run()


def main() -> None:
Expand Down
30 changes: 16 additions & 14 deletions harness/determined/exec/worker_process.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pathlib
import sys

import determined as det
from determined import ipc, layers, load


Expand Down Expand Up @@ -37,20 +38,21 @@ def main() -> None:
# Wrap the communication layer in a workload.Stream.
subrec = layers.SubprocessReceiver(broadcast_client)

controller = load.prepare_controller(
worker_process_env.env,
iter(subrec),
worker_process_env.load_path,
worker_process_env.rendezvous_info,
worker_process_env.hvd_config,
)

try:
controller.run()

except Exception as e:
broadcast_client.send_exception_message()
raise e
with det._catch_sys_exit():
controller = load.prepare_controller(
worker_process_env.env,
iter(subrec),
worker_process_env.load_path,
worker_process_env.rendezvous_info,
worker_process_env.hvd_config,
)

try:
controller.run()

except Exception as e:
broadcast_client.send_exception_message()
raise e


if __name__ == "__main__":
Expand Down
2 changes: 1 addition & 1 deletion harness/determined/experimental/_native.py
Original file line number Diff line number Diff line change
Expand Up @@ -263,7 +263,7 @@ def init_native(

if local:
if not test:
logging.warn("local training is not supported, testing instead")
logging.warning("local training is not supported, testing instead")

with det._local_execution_manager(pathlib.Path(context_dir).resolve()):
return test_one_batch(
Expand Down
4 changes: 2 additions & 2 deletions harness/determined/gpu.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import csv
import logging
import subprocess
import sys
from typing import List, NamedTuple, Optional, Tuple

import determined as det
from determined_common import check

gpu_fields = [
Expand Down Expand Up @@ -70,7 +70,7 @@ def get_gpu_uuids_and_validate(use_gpu: bool, slot_ids: Optional[List[str]] = No
# no GPUs are available, this indicates a misconfiguration.
_, gpu_uuids = get_gpu_ids_and_uuids()
if not gpu_uuids:
sys.exit("Failed to find GPUs for GPU-only trial")
raise det.errors.InternalException("Failed to find GPUs for GPU-only trial")

if slot_ids is not None:
check.equal_lengths(slot_ids, gpu_uuids, "Mismatched slot_ids and container_gpus.")
Expand Down
14 changes: 7 additions & 7 deletions harness/determined/ipc.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import logging
import time
from typing import Any, Callable, Dict, List, Optional, Tuple, cast

import zmq
from zmq.error import ZMQBindError, ZMQError

import determined as det
from determined_common import check


Expand Down Expand Up @@ -280,9 +280,8 @@ def _bind_to_specified_ports(self, ports: List[int]) -> None:
socket = self.context.socket(zmq.REP)
try:
socket.bind(f"tcp://*:{port}")
except ZMQError:
logging.warning(f"Failed to bind to port {port}.")
exit(1)
except ZMQError as e:
raise det.errors.InternalException(f"Failed to bind to port {port}.") from e
self.sockets.append(socket)
self.ports.append(port)

Expand All @@ -295,9 +294,10 @@ def _bind_to_random_ports(self, port_range: Tuple[int, int], num_connections: in
addr="tcp://*", min_port=port_range[0], max_port=port_range[1]
)
self.ports.append(selected_port)
except ZMQBindError:
logging.warning(f"Failed to bind to port range {port_range}.")
exit(1)
except ZMQBindError as e:
raise det.errors.InternalException(
f"Failed to bind to port range {port_range}."
) from e
self.sockets.append(socket)

def get_ports(self) -> List[int]:
Expand Down
6 changes: 0 additions & 6 deletions harness/determined/load/_load_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -188,12 +188,6 @@ def load_native_implementation(
with overwrite_sys_args(command):
try:
runpy.run_path(command[0], run_name="__main__")
except SystemExit as e:
logging.warning(
"Model code raised a SystemExit (sys.exit()) before entering "
"the training loop. Please remove this sys.exit() from your script."
)
raise e
except det.errors.StopLoadingImplementation:
# If caught this exception, will skip running the rest of the user code.
pass
Expand Down