forked from ray-project/ray
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[Doc] Rewrite the placement group documentation (ray-project#33518)
This PR rewrites the existing placement group documentation that is confusing (sorry I wrote the original version). The new doc will start from the simplest example -> explaining the advanced concepts. Also, all the concepts are more thoroughly explained with examples. Signed-off-by: Jack He <[email protected]>
- Loading branch information
1 parent
ba2c05c
commit 3a21a86
Showing
9 changed files
with
573 additions
and
410 deletions.
There are no files selected for viewing
52 changes: 44 additions & 8 deletions
52
doc/source/ray-core/doc_code/placement_group_capture_child_tasks_example.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,32 +1,68 @@ | ||
# __child_capture_pg_start__ | ||
import ray | ||
from ray.util.placement_group import placement_group | ||
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy | ||
|
||
ray.init(num_cpus=4) | ||
ray.init(num_cpus=2) | ||
|
||
# Create a placement group with the SPREAD strategy. | ||
pg = placement_group([{"CPU": 2}, {"CPU": 2}], strategy="SPREAD") | ||
# Create a placement group. | ||
pg = placement_group([{"CPU": 2}]) | ||
ray.get(pg.ready()) | ||
|
||
|
||
@ray.remote(num_cpus=1) | ||
def child(): | ||
pass | ||
import time | ||
|
||
time.sleep(5) | ||
|
||
|
||
@ray.remote(num_cpus=1) | ||
def parent(): | ||
# The child task is scheduled with the same placement group as its parent | ||
# although child.options( | ||
# scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg) | ||
# ).remote() wasn't called if placement_group_capture_child_tasks is set to True. | ||
# The child task is scheduled to the same placement group as its parent, | ||
# although it didn't specify the PlacementGroupSchedulingStrategy. | ||
ray.get(child.remote()) | ||
|
||
|
||
# Since the child and parent use 1 CPU each, the placement group | ||
# bundle {"CPU": 2} is fully occupied. | ||
ray.get( | ||
parent.options( | ||
scheduling_strategy=PlacementGroupSchedulingStrategy( | ||
placement_group=pg, placement_group_capture_child_tasks=True | ||
) | ||
).remote() | ||
) | ||
# __child_capture_pg_end__ | ||
|
||
|
||
# __child_capture_disable_pg_start__ | ||
@ray.remote | ||
def parent(): | ||
# In this case, the child task isn't | ||
# scheduled with the parent's placement group. | ||
ray.get( | ||
child.options( | ||
scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=None) | ||
).remote() | ||
) | ||
|
||
|
||
# This times out because we cannot schedule the child task. | ||
# The cluster has {"CPU": 2}, and both of them are reserved by | ||
# the placement group with a bundle {"CPU": 2}. Since the child shouldn't | ||
# be scheduled within this placement group, it cannot be scheduled because | ||
# there's no available CPU resources. | ||
try: | ||
ray.get( | ||
parent.options( | ||
scheduling_strategy=PlacementGroupSchedulingStrategy( | ||
placement_group=pg, placement_group_capture_child_tasks=True | ||
) | ||
).remote(), | ||
timeout=5, | ||
) | ||
except Exception as e: | ||
print("Couldn't create a child task!") | ||
print(e) | ||
# __child_capture_disable_pg_end__ |
139 changes: 139 additions & 0 deletions
139
doc/source/ray-core/doc_code/placement_group_example.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
# __create_pg_start__ | ||
from pprint import pprint | ||
import time | ||
|
||
# Import placement group APIs. | ||
from ray.util.placement_group import ( | ||
placement_group, | ||
placement_group_table, | ||
remove_placement_group, | ||
) | ||
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy | ||
|
||
# Initialize Ray. | ||
import ray | ||
|
||
# Create a single node Ray cluster with 2 CPUs and 2 GPUs. | ||
ray.init(num_cpus=2, num_gpus=2) | ||
|
||
# Reserve a placement group of 1 bundle that reserves 1 CPU and 1 GPU. | ||
pg = placement_group([{"CPU": 1, "GPU": 1}]) | ||
# __create_pg_end__ | ||
|
||
# __ready_pg_start__ | ||
# Wait until placement group is created. | ||
ray.get(pg.ready(), timeout=10) | ||
|
||
# You can also use ray.wait. | ||
ready, unready = ray.wait([pg.ready()], timeout=10) | ||
|
||
# You can look at placement group states using this API. | ||
print(placement_group_table(pg)) | ||
# __ready_pg_end__ | ||
|
||
# __create_pg_failed_start__ | ||
# Cannot create this placement group because we | ||
# cannot create a {"GPU": 2} bundle. | ||
pending_pg = placement_group([{"CPU": 1}, {"GPU": 2}]) | ||
# This raises the timeout exception! | ||
try: | ||
ray.get(pending_pg.ready(), timeout=5) | ||
except Exception as e: | ||
print( | ||
"Cannot create a placement group because " | ||
"{'GPU': 2} bundle cannot be created." | ||
) | ||
print(e) | ||
# __create_pg_failed_end__ | ||
|
||
|
||
# __schedule_pg_start__ | ||
@ray.remote(num_cpus=1) | ||
class Actor: | ||
def __init__(self): | ||
pass | ||
|
||
def ready(self): | ||
pass | ||
|
||
|
||
# Create an actor to a placement group. | ||
actor = Actor.options( | ||
scheduling_strategy=PlacementGroupSchedulingStrategy( | ||
placement_group=pg, | ||
) | ||
).remote() | ||
|
||
# Verify the actor is scheduled. | ||
ray.get(actor.ready.remote(), timeout=10) | ||
# __schedule_pg_end__ | ||
|
||
|
||
# __schedule_pg_3_start__ | ||
@ray.remote(num_cpus=0, num_gpus=1) | ||
class Actor: | ||
def __init__(self): | ||
pass | ||
|
||
def ready(self): | ||
pass | ||
|
||
|
||
# Create a GPU actor on the first bundle of index 0. | ||
actor2 = Actor.options( | ||
scheduling_strategy=PlacementGroupSchedulingStrategy( | ||
placement_group=pg, | ||
placement_group_bundle_index=0, | ||
) | ||
).remote() | ||
|
||
# Verify that the GPU actor is scheduled. | ||
ray.get(actor2.ready.remote(), timeout=10) | ||
# __schedule_pg_3_end__ | ||
|
||
# __remove_pg_start__ | ||
# This API is asynchronous. | ||
remove_placement_group(pg) | ||
|
||
# Wait until placement group is killed. | ||
time.sleep(1) | ||
# Check that the placement group has died. | ||
pprint(placement_group_table(pg)) | ||
|
||
""" | ||
{'bundles': {0: {'GPU': 1.0}, 1: {'CPU': 1.0}}, | ||
'name': 'unnamed_group', | ||
'placement_group_id': '40816b6ad474a6942b0edb45809b39c3', | ||
'state': 'REMOVED', | ||
'strategy': 'PACK'} | ||
""" | ||
# __remove_pg_end__ | ||
|
||
# __strategy_pg_start__ | ||
# Reserve a placement group of 2 bundles | ||
# that have to be packed on the same node. | ||
pg = placement_group([{"CPU": 1}, {"GPU": 1}], strategy="PACK") | ||
# __strategy_pg_end__ | ||
|
||
remove_placement_group(pg) | ||
|
||
# __detached_pg_start__ | ||
# driver_1.py | ||
# Create a detached placement group that survives even after | ||
# the job terminates. | ||
pg = placement_group([{"CPU": 1}], lifetime="detached", name="global_name") | ||
ray.get(pg.ready()) | ||
# __detached_pg_end__ | ||
remove_placement_group(pg) | ||
|
||
|
||
# __get_pg_start__ | ||
# first_driver.py | ||
# Create a placement group with a global name. | ||
pg = placement_group([{"CPU": 1}], name="global_name") | ||
ray.get(pg.ready()) | ||
|
||
# second_driver.py | ||
# Retrieve a placement group with a global name. | ||
pg = ray.util.get_placement_group("global_name") | ||
# __get_pg_end__ |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.