From 88d02c8ce4160d3fb822f508288ca6431db9307b Mon Sep 17 00:00:00 2001 From: amogkam Date: Fri, 20 Jan 2023 16:22:03 -0800 Subject: [PATCH 1/6] change Signed-off-by: amogkam --- python/ray/train/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index 5eb91f6abb49..744ce9bb7832 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -72,7 +72,7 @@ # Use ethernet when possible. # NCCL_SOCKET_IFNAME does a prefix match so "ens3" or "ens5" will match with # "en". -DEFAULT_NCCL_SOCKET_IFNAME = "en,eth,bond" +DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker,vethc" # Key for AIR Checkpoint metadata in TrainingResult metadata CHECKPOINT_METADATA_KEY = "checkpoint_metadata" From 49b40937c96d6be8d55fad24d6befb54e5048815 Mon Sep 17 00:00:00 2001 From: amogkam Date: Fri, 20 Jan 2023 16:29:48 -0800 Subject: [PATCH 2/6] add Signed-off-by: amogkam --- python/ray/train/torch/config.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/ray/train/torch/config.py b/python/ray/train/torch/config.py index 9d2ba2d0a60a..0af8ca0b8308 100644 --- a/python/ray/train/torch/config.py +++ b/python/ray/train/torch/config.py @@ -55,6 +55,8 @@ def backend_cls(self): def _set_nccl_network_interface() -> str: """Set the appropriate NCCL network interface to use.""" + os.environ["NCCL_DEBUG"] = "INFO" + if "NCCL_SOCKET_IFNAME" not in os.environ: logger.debug( f"Setting NCCL_SOCKET_IFNAME to {DEFAULT_NCCL_SOCKET_IFNAME} " From ba276ee3ee48b1d779a02d09f03eaeba1b12bacd Mon Sep 17 00:00:00 2001 From: amogkam Date: Fri, 20 Jan 2023 17:53:50 -0800 Subject: [PATCH 3/6] remove Signed-off-by: amogkam --- python/ray/train/torch/config.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/ray/train/torch/config.py b/python/ray/train/torch/config.py index 0af8ca0b8308..9d2ba2d0a60a 100644 --- a/python/ray/train/torch/config.py +++ b/python/ray/train/torch/config.py @@ -55,8 +55,6 @@ def backend_cls(self): def _set_nccl_network_interface() -> str: """Set the appropriate NCCL network interface to use.""" - os.environ["NCCL_DEBUG"] = "INFO" - if "NCCL_SOCKET_IFNAME" not in os.environ: logger.debug( f"Setting NCCL_SOCKET_IFNAME to {DEFAULT_NCCL_SOCKET_IFNAME} " From 25210ff474f10dba3e0333356682b62326a9ab38 Mon Sep 17 00:00:00 2001 From: amogkam Date: Fri, 20 Jan 2023 17:58:12 -0800 Subject: [PATCH 4/6] comment Signed-off-by: amogkam --- python/ray/train/constants.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index 744ce9bb7832..e1ead458caee 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -68,10 +68,7 @@ # as Trainable) DISABLE_LAZY_CHECKPOINTING_ENV = "TRAIN_DISABLE_LAZY_CHECKPOINTING" -# Default NCCL_SOCKET_IFNAME. -# Use ethernet when possible. -# NCCL_SOCKET_IFNAME does a prefix match so "ens3" or "ens5" will match with -# "en". +# Blacklist virtualized networking. DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker,vethc" # Key for AIR Checkpoint metadata in TrainingResult metadata From 2c7f7211cb7c32a8bb090e45a67e519035031351 Mon Sep 17 00:00:00 2001 From: amogkam Date: Fri, 20 Jan 2023 20:24:26 -0800 Subject: [PATCH 5/6] remove vethc Signed-off-by: amogkam --- python/ray/train/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index e1ead458caee..6daa5e22a28a 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -69,7 +69,7 @@ DISABLE_LAZY_CHECKPOINTING_ENV = "TRAIN_DISABLE_LAZY_CHECKPOINTING" # Blacklist virtualized networking. -DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker,vethc" +DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker" # Key for AIR Checkpoint metadata in TrainingResult metadata CHECKPOINT_METADATA_KEY = "checkpoint_metadata" From 3b23dcb52279ddcb329023f3807a0498cacd00da Mon Sep 17 00:00:00 2001 From: amogkam Date: Mon, 23 Jan 2023 12:47:19 -0800 Subject: [PATCH 6/6] no veth Signed-off-by: amogkam --- python/ray/train/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/ray/train/constants.py b/python/ray/train/constants.py index 6daa5e22a28a..abb9e3cffb26 100644 --- a/python/ray/train/constants.py +++ b/python/ray/train/constants.py @@ -69,7 +69,7 @@ DISABLE_LAZY_CHECKPOINTING_ENV = "TRAIN_DISABLE_LAZY_CHECKPOINTING" # Blacklist virtualized networking. -DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker" +DEFAULT_NCCL_SOCKET_IFNAME = "^lo,docker,veth" # Key for AIR Checkpoint metadata in TrainingResult metadata CHECKPOINT_METADATA_KEY = "checkpoint_metadata"