Skip to content

Commit

Permalink
feat: increase GLOO timeout [DET-3309] (#729)
Browse files Browse the repository at this point in the history
When running on a large number of nodes (40+), the default
timeout of 30 seconds is not enough for all nodes to connect
with the chief process.
  • Loading branch information
aaron276h authored Jun 17, 2020
1 parent 0acab3e commit e59a5a1
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 0 deletions.
5 changes: 5 additions & 0 deletions harness/determined/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,8 @@
# Path for file that stores output of horovod auto-tuning. Only created when
# horovod auto-tuning is enabled.
HOROVOD_AUTOTUNE_LOG_FILEPATH = "/tmp/autotune_log.csv"

# How many seconds Gloo waits for all tasks to connect before failing.
# The default of 30 seconds is too short when running on a large number
# of machines (40+), so a longer timeout is used here.
HOROVOD_GLOO_TIMEOUT_SECONDS = 240
2 changes: 2 additions & 0 deletions harness/determined/horovod.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,8 @@ def create_run_command(
create_hostlist_arg(num_gpus_per_machine, ip_addresses),
"--start-timeout",
str(constants.HOROVOD_STARTUP_TIMEOUT_SECONDS),
"--gloo-timeout-seconds",
str(constants.HOROVOD_GLOO_TIMEOUT_SECONDS),
]
horovod_process_cmd.extend(create_network_interface_arg_if_specified(env, num_machines))
horovod_process_cmd.extend(create_performance_args(env))
Expand Down
2 changes: 2 additions & 0 deletions harness/tests/test_horovod.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ def test_create_run_command(
"localhost:8,128.140.2.4:8",
"--start-timeout",
str(constants.HOROVOD_STARTUP_TIMEOUT_SECONDS),
"--gloo-timeout-seconds",
str(constants.HOROVOD_GLOO_TIMEOUT_SECONDS),
]
if auto_tune:
expected_horovod_run_cmd.extend(
Expand Down

0 comments on commit e59a5a1

Please sign in to comment.