From 4163f49410b2aa2d1fd41aa8ce4f7c2934077446 Mon Sep 17 00:00:00 2001 From: Yuhong Guo Date: Tue, 18 Sep 2018 17:43:02 +0800 Subject: [PATCH 1/7] Fix node manager failure when ClientTable has a disconnected entry. --- src/ray/raylet/node_manager.cc | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index d10d6d390f18..b42c6ab7f612 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -334,21 +334,29 @@ void NodeManager::ClientAdded(const ClientTableDataT &client_data) { return; } - ResourceSet resources_total(client_data.resources_total_label, - client_data.resources_total_capacity); - this->cluster_resource_map_.emplace(client_id, SchedulingResources(resources_total)); - // Establish a new NodeManager connection to this GCS client. auto client_info = gcs_client_->client_table().GetClient(client_id); - RAY_LOG(DEBUG) << "[ClientAdded] CONNECTING TO: " + RAY_LOG(DEBUG) << "[ClientAdded] TRY TO CONNECTING TO: " << " " << client_info.node_manager_address << " " << client_info.node_manager_port; boost::asio::ip::tcp::socket socket(io_service_); - RAY_CHECK_OK(TcpConnect(socket, client_info.node_manager_address, - client_info.node_manager_port)); + auto status = TcpConnect(socket, client_info.node_manager_address, + client_info.node_manager_port); + // ClientTable is Log, which has multiple entries for + // a disconnected client. The first one has is_insertion=true and the second + // one has is_insertion=false. We must make sure this is not a close client. + if (!status.ok()) { + return; + } + + // The client is connected. auto server_conn = TcpServerConnection(std::move(socket)); remote_server_connections_.emplace(client_id, std::move(server_conn)); + + ResourceSet resources_total(client_data.resources_total_label, + client_data.resources_total_capacity); + this->cluster_resource_map_.emplace(client_id, SchedulingResources(resources_total)); } void NodeManager::ClientRemoved(const ClientTableDataT &client_data) { From 204daccf49daef066d00c77be8273d5d0b2452ff Mon Sep 17 00:00:00 2001 From: Yuhong Guo Date: Tue, 18 Sep 2018 19:42:13 +0800 Subject: [PATCH 2/7] Lint --- src/ray/raylet/node_manager.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index b42c6ab7f612..6d8b94f322ef 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -341,8 +341,8 @@ void NodeManager::ClientAdded(const ClientTableDataT &client_data) { << client_info.node_manager_port; boost::asio::ip::tcp::socket socket(io_service_); - auto status = TcpConnect(socket, client_info.node_manager_address, - client_info.node_manager_port); + auto status = + TcpConnect(socket, client_info.node_manager_address, client_info.node_manager_port); // ClientTable is Log, which has multiple entries for // a disconnected client. The first one has is_insertion=true and the second // one has is_insertion=false. We must make sure this is not a close client. From 8284b848761cffda66f9a8ceb16d9e026eb4a26f Mon Sep 17 00:00:00 2001 From: Yuhong Guo Date: Wed, 19 Sep 2018 12:13:22 +0800 Subject: [PATCH 3/7] Change the comment to clarify --- src/ray/raylet/node_manager.cc | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 6d8b94f322ef..0fc5a140e693 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -343,20 +343,28 @@ void NodeManager::ClientAdded(const ClientTableDataT &client_data) { boost::asio::ip::tcp::socket socket(io_service_); auto status = TcpConnect(socket, client_info.node_manager_address, client_info.node_manager_port); - // ClientTable is Log, which has multiple entries for - // a disconnected client. The first one has is_insertion=true and the second - // one has is_insertion=false. We must make sure this is not a close client. - if (!status.ok()) { + // ClientTable is Log, a disconnected client has 2 entries. + // The first one has is_insertion=true and the second one has is_insertion=false. + // When a new raylet starts, ClientAdded will be called with the disconnected client's + // first entry, which will cause IOError and "Connection refused". + if (status.code() == StatusCode::IOError) { + // Do we need to check this? If the error message changes this will fail. + RAY_CHECK(status.message() == "Connection refused"); + RAY_LOG(WARNING) << "ClientAdded: " << status.message() + << " (conde=" << static_cast(status.code()) << ")" + << " which may be caused by a disconnected client."; + return; } // The client is connected. + RAY_CHECK(status.ok()); auto server_conn = TcpServerConnection(std::move(socket)); remote_server_connections_.emplace(client_id, std::move(server_conn)); ResourceSet resources_total(client_data.resources_total_label, client_data.resources_total_capacity); - this->cluster_resource_map_.emplace(client_id, SchedulingResources(resources_total)); + cluster_resource_map_.emplace(client_id, SchedulingResources(resources_total)); } void NodeManager::ClientRemoved(const ClientTableDataT &client_data) { From 30ccb253fd3c0ddfc50f7bf2b59335670e0cc5f7 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 21 Sep 2018 17:55:38 -0700 Subject: [PATCH 4/7] Small changes. --- src/ray/raylet/node_manager.cc | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 0fc5a140e693..785154ce2bca 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -312,7 +312,7 @@ void NodeManager::Heartbeat() { } void NodeManager::ClientAdded(const ClientTableDataT &client_data) { - ClientID client_id = ClientID::from_binary(client_data.client_id); + const ClientID client_id = ClientID::from_binary(client_data.client_id); RAY_LOG(DEBUG) << "[ClientAdded] received callback from client id " << client_id; if (client_id == gcs_client_->client_table().GetLocalClientId()) { @@ -336,8 +336,8 @@ void NodeManager::ClientAdded(const ClientTableDataT &client_data) { // Establish a new NodeManager connection to this GCS client. auto client_info = gcs_client_->client_table().GetClient(client_id); - RAY_LOG(DEBUG) << "[ClientAdded] TRY TO CONNECTING TO: " - << " " << client_info.node_manager_address << " " + RAY_LOG(DEBUG) << "[ClientAdded] Trying to connect to client " << client_id + << " at " << client_info.node_manager_address << ":" << client_info.node_manager_port; boost::asio::ip::tcp::socket socket(io_service_); @@ -347,18 +347,16 @@ void NodeManager::ClientAdded(const ClientTableDataT &client_data) { // The first one has is_insertion=true and the second one has is_insertion=false. // When a new raylet starts, ClientAdded will be called with the disconnected client's // first entry, which will cause IOError and "Connection refused". - if (status.code() == StatusCode::IOError) { - // Do we need to check this? If the error message changes this will fail. + if (!status.ok()) { RAY_CHECK(status.message() == "Connection refused"); - RAY_LOG(WARNING) << "ClientAdded: " << status.message() - << " (conde=" << static_cast(status.code()) << ")" - << " which may be caused by a disconnected client."; - + RAY_LOG(WARNING) << "Failed to connect to client " << client_id + << " in ClientAdded. TcpConnect returned status: " + << status.ToString() << ". This may be caused by " + << "trying to connect to a node manager that has failed."; return; } // The client is connected. - RAY_CHECK(status.ok()); auto server_conn = TcpServerConnection(std::move(socket)); remote_server_connections_.emplace(client_id, std::move(server_conn)); From b2837decee326fe18992af9f7cdc4e2c810c82e1 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 21 Sep 2018 17:57:38 -0700 Subject: [PATCH 5/7] Slight change to comment. --- src/ray/raylet/node_manager.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 785154ce2bca..a059e03c5921 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -343,10 +343,10 @@ void NodeManager::ClientAdded(const ClientTableDataT &client_data) { boost::asio::ip::tcp::socket socket(io_service_); auto status = TcpConnect(socket, client_info.node_manager_address, client_info.node_manager_port); - // ClientTable is Log, a disconnected client has 2 entries. - // The first one has is_insertion=true and the second one has is_insertion=false. - // When a new raylet starts, ClientAdded will be called with the disconnected client's - // first entry, which will cause IOError and "Connection refused". + // A disconnected client has 2 entries in the client table (one for being + // inserted and one for being removed). When a new raylet starts, ClientAdded + // will be called with the disconnected client's first entry, which will cause + // IOError and "Connection refused". if (!status.ok()) { RAY_CHECK(status.message() == "Connection refused"); RAY_LOG(WARNING) << "Failed to connect to client " << client_id From d99360f3f8838697c446e43cc5b0c2fcb5b3e292 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 21 Sep 2018 18:12:56 -0700 Subject: [PATCH 6/7] Linting --- src/ray/raylet/node_manager.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index a059e03c5921..7495a14123f9 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -336,8 +336,8 @@ void NodeManager::ClientAdded(const ClientTableDataT &client_data) { // Establish a new NodeManager connection to this GCS client. auto client_info = gcs_client_->client_table().GetClient(client_id); - RAY_LOG(DEBUG) << "[ClientAdded] Trying to connect to client " << client_id - << " at " << client_info.node_manager_address << ":" + RAY_LOG(DEBUG) << "[ClientAdded] Trying to connect to client " << client_id << " at " + << client_info.node_manager_address << ":" << client_info.node_manager_port; boost::asio::ip::tcp::socket socket(io_service_); From 88724809f2cbb2db9792b8f8b8515f2f5eb98c47 Mon Sep 17 00:00:00 2001 From: Robert Nishihara Date: Fri, 21 Sep 2018 21:17:24 -0700 Subject: [PATCH 7/7] Update node_manager.cc --- src/ray/raylet/node_manager.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/ray/raylet/node_manager.cc b/src/ray/raylet/node_manager.cc index 7495a14123f9..032d3f66707a 100644 --- a/src/ray/raylet/node_manager.cc +++ b/src/ray/raylet/node_manager.cc @@ -348,7 +348,6 @@ void NodeManager::ClientAdded(const ClientTableDataT &client_data) { // will be called with the disconnected client's first entry, which will cause // IOError and "Connection refused". if (!status.ok()) { - RAY_CHECK(status.message() == "Connection refused"); RAY_LOG(WARNING) << "Failed to connect to client " << client_id << " in ClientAdded. TcpConnect returned status: " << status.ToString() << ". This may be caused by "