-
Notifications
You must be signed in to change notification settings - Fork 5.8k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[core][pg] Fix pg bundles can't reschedule bug when the two nodes both dead (#24875)
When a bundle is being rescheduled due to node death and other bundles of the same PG also trigger rescheduling due to node death, there is a bug that prevents those bundles from being scheduled. Reason: Step 1: Node A goes down, and bundle 1 of the PG deployed on this node enters the GcsPlacementGroupManager::OnNodeDead process. The PG state becomes RESCHEDULING and the PG goes to scheduling. Step 2: Just while this PG is being scheduled, another node B also goes down. Bundle 2 of this PG also enters the GcsPlacementGroupManager::OnNodeDead process. Step 3: Because the PG state is already RESCHEDULING, bundle 2 can't be added to the pending queue. In the end, bundle 2 cannot be rescheduled.
- Loading branch information
Showing
5 changed files
with
123 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
import pytest | ||
import sys | ||
import ray | ||
import ray.cluster_utils | ||
from ray._private.test_utils import ( | ||
get_other_nodes, | ||
) | ||
|
||
# Number of bytes in one mebibyte; used to size the "memory" bundle resource.
MB = 1024**2
|
||
|
||
@ray.remote(num_cpus=1)
class Actor:
    """Minimal Ray actor that reserves one CPU and exposes a single value."""

    def __init__(self):
        # Initialised to zero; nothing in this class modifies it afterwards.
        self.n = 0

    def value(self):
        """Return the stored counter ``n``."""
        return self.n
|
||
|
||
# Test whether the bundles spread on two nodes can be rescheduled successfully | ||
# when both nodes die at the same time. | ||
def test_placement_group_failover_when_two_nodes_die(monkeypatch, ray_start_cluster):
    """Check that every bundle is rescheduled after several nodes are removed.

    A STRICT_SPREAD placement group with one bundle per node is created on a
    four-node cluster. Replacement nodes are added, the nodes returned by
    ``get_other_nodes(cluster, exclude_head=True)`` are removed back to back,
    and an actor is then started on every bundle index to confirm that all
    bundles are usable again.
    """
    delay_spec = (
        "NodeManagerService.grpc_client.PrepareBundleResources=2000000:2000000"
    )
    with monkeypatch.context() as patched_env:
        # NOTE(review): presumably injects a delay into PrepareBundleResources
        # so that a second node death lands while the placement group is still
        # being rescheduled -- confirm against Ray's testing_asio_delay_us docs.
        patched_env.setenv("RAY_testing_asio_delay_us", delay_spec)

        cluster = ray_start_cluster
        num_nodes = 4
        nodes = [cluster.add_node(num_cpus=1) for _ in range(num_nodes)]
        ray.init(address=cluster.address)

        # One bundle per node, so STRICT_SPREAD needs all four nodes.
        bundles = [{"CPU": 1, "memory": 100 * MB} for _ in range(num_nodes)]
        pg = ray.util.placement_group(
            name="name", strategy="STRICT_SPREAD", bundles=bundles
        )
        assert pg.wait(3000)

        # Add replacement nodes so the bundles have somewhere to be rescheduled.
        doomed_nodes = get_other_nodes(cluster, exclude_head=True)
        for _ in range(len(doomed_nodes)):
            cluster.add_node(num_cpus=1)
        cluster.wait_for_nodes()

        # Remove the original non-head nodes one right after another.
        for doomed in doomed_nodes:
            cluster.remove_node(doomed)

        # Create an actor on each bundle to make sure all bundles are ready.
        for bundle_index in range(num_nodes):
            probe = Actor.options(
                placement_group=pg, placement_group_bundle_index=bundle_index
            ).remote()
            pending_value = probe.value.remote()
            ray.get(pending_value, timeout=5)
|
||
|
||
if __name__ == "__main__":
    # Run this file's tests verbosely and use pytest's result as the exit code.
    exit_status = pytest.main(["-v", __file__])
    sys.exit(exit_status)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters