-
Notifications
You must be signed in to change notification settings - Fork 5.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix node manager failure when ClientTable has a disconnected entry. #2905
Changes from 3 commits
4163f49
204dacc
8284b84
30ccb25
b2837de
d99360f
8872480
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -334,21 +334,37 @@ void NodeManager::ClientAdded(const ClientTableDataT &client_data) { | |
return; | ||
} | ||
|
||
ResourceSet resources_total(client_data.resources_total_label, | ||
client_data.resources_total_capacity); | ||
this->cluster_resource_map_.emplace(client_id, SchedulingResources(resources_total)); | ||
|
||
// Establish a new NodeManager connection to this GCS client. | ||
auto client_info = gcs_client_->client_table().GetClient(client_id); | ||
RAY_LOG(DEBUG) << "[ClientAdded] CONNECTING TO: " | ||
RAY_LOG(DEBUG) << "[ClientAdded] TRY TO CONNECTING TO: " | ||
<< " " << client_info.node_manager_address << " " | ||
<< client_info.node_manager_port; | ||
|
||
boost::asio::ip::tcp::socket socket(io_service_); | ||
RAY_CHECK_OK(TcpConnect(socket, client_info.node_manager_address, | ||
client_info.node_manager_port)); | ||
auto status = | ||
TcpConnect(socket, client_info.node_manager_address, client_info.node_manager_port); | ||
// ClientTable is Log<ActorID, ActorTableData>, a disconnected client has 2 entries. | ||
// The first one has is_insertion=true and the second one has is_insertion=false. | ||
// When a new raylet starts, ClientAdded will be called with the disconnected client's | ||
// first entry, which will cause IOError and "Connection refused". | ||
if (status.code() == StatusCode::IOError) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's probably preferable to just check |
||
// Do we need to check this? If the error message changes this will fail. | ||
RAY_CHECK(status.message() == "Connection refused"); | ||
RAY_LOG(WARNING) << "ClientAdded: " << status.message() | ||
<< " (conde=" << static_cast<int>(status.code()) << ")" | ||
<< " which may be caused by a disconnected client."; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. RAY_LOG(WARNING) << "Failed to connect to client " << client_id
<< " in ClientAdded. TcpConnect returned status: "
<< status.ToString() << ". This may be caused by "
<< "trying to connect to a node manager that has failed."; |
||
|
||
return; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Under what circumstances would There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If we return here and don't connect, when will the connection be established? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
} | ||
|
||
// The client is connected. | ||
RAY_CHECK(status.ok()); | ||
auto server_conn = TcpServerConnection(std::move(socket)); | ||
remote_server_connections_.emplace(client_id, std::move(server_conn)); | ||
|
||
ResourceSet resources_total(client_data.resources_total_label, | ||
client_data.resources_total_capacity); | ||
cluster_resource_map_.emplace(client_id, SchedulingResources(resources_total)); | ||
} | ||
|
||
void NodeManager::ClientRemoved(const ClientTableDataT &client_data) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.