Skip to content

Commit

Permalink
[Doc] Rewrite the placement group documentation (ray-project#33518)
Browse files Browse the repository at this point in the history
This PR rewrites the existing placement group documentation that is confusing (sorry I wrote the original version).

The new doc will start from the simplest example -> explaining the advanced concepts. Also, all the concepts are more thoroughly explained with examples.

Signed-off-by: Jack He <[email protected]>
  • Loading branch information
rkooo567 authored and ProjectsByJackHe committed May 4, 2023
1 parent ba2c05c commit 3a21a86
Show file tree
Hide file tree
Showing 9 changed files with 573 additions and 410 deletions.
Original file line number Diff line number Diff line change
@@ -1,32 +1,68 @@
# __child_capture_pg_start__
import ray
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

ray.init(num_cpus=4)
ray.init(num_cpus=2)

# Create a placement group with the SPREAD strategy.
pg = placement_group([{"CPU": 2}, {"CPU": 2}], strategy="SPREAD")
# Create a placement group.
pg = placement_group([{"CPU": 2}])
ray.get(pg.ready())


@ray.remote(num_cpus=1)
def child():
pass
import time

time.sleep(5)


@ray.remote(num_cpus=1)
def parent():
# The child task is scheduled with the same placement group as its parent
# although child.options(
# scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg)
# ).remote() wasn't called if placement_group_capture_child_tasks is set to True.
# The child task is scheduled to the same placement group as its parent,
# although it didn't specify the PlacementGroupSchedulingStrategy.
ray.get(child.remote())


# Since the child and parent use 1 CPU each, the placement group
# bundle {"CPU": 2} is fully occupied.
ray.get(
parent.options(
scheduling_strategy=PlacementGroupSchedulingStrategy(
placement_group=pg, placement_group_capture_child_tasks=True
)
).remote()
)
# __child_capture_pg_end__


# __child_capture_disable_pg_start__
@ray.remote
def parent():
# In this case, the child task isn't
# scheduled with the parent's placement group.
ray.get(
child.options(
scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=None)
).remote()
)


# This times out because we cannot schedule the child task.
# The cluster has {"CPU": 2}, and both of them are reserved by
# the placement group with a bundle {"CPU": 2}. Since the child shouldn't
# be scheduled within this placement group, it cannot be scheduled because
# there's no available CPU resources.
try:
ray.get(
parent.options(
scheduling_strategy=PlacementGroupSchedulingStrategy(
placement_group=pg, placement_group_capture_child_tasks=True
)
).remote(),
timeout=5,
)
except Exception as e:
print("Couldn't create a child task!")
print(e)
# __child_capture_disable_pg_end__
139 changes: 139 additions & 0 deletions doc/source/ray-core/doc_code/placement_group_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
# __create_pg_start__
from pprint import pprint
import time

# Import placement group APIs.
from ray.util.placement_group import (
placement_group,
placement_group_table,
remove_placement_group,
)
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy

# Initialize Ray.
import ray

# Create a single node Ray cluster with 2 CPUs and 2 GPUs.
ray.init(num_cpus=2, num_gpus=2)

# Reserve a placement group of 1 bundle that reserves 1 CPU and 1 GPU.
pg = placement_group([{"CPU": 1, "GPU": 1}])
# __create_pg_end__

# __ready_pg_start__
# Wait until placement group is created.
ray.get(pg.ready(), timeout=10)

# You can also use ray.wait.
ready, unready = ray.wait([pg.ready()], timeout=10)

# You can look at placement group states using this API.
print(placement_group_table(pg))
# __ready_pg_end__

# __create_pg_failed_start__
# Cannot create this placement group because we
# cannot create a {"GPU": 2} bundle.
pending_pg = placement_group([{"CPU": 1}, {"GPU": 2}])
# This raises the timeout exception!
try:
ray.get(pending_pg.ready(), timeout=5)
except Exception as e:
print(
"Cannot create a placement group because "
"{'GPU': 2} bundle cannot be created."
)
print(e)
# __create_pg_failed_end__


# __schedule_pg_start__
@ray.remote(num_cpus=1)
class Actor:
def __init__(self):
pass

def ready(self):
pass


# Create an actor to a placement group.
actor = Actor.options(
scheduling_strategy=PlacementGroupSchedulingStrategy(
placement_group=pg,
)
).remote()

# Verify the actor is scheduled.
ray.get(actor.ready.remote(), timeout=10)
# __schedule_pg_end__


# __schedule_pg_3_start__
@ray.remote(num_cpus=0, num_gpus=1)
class Actor:
def __init__(self):
pass

def ready(self):
pass


# Create a GPU actor on the first bundle of index 0.
actor2 = Actor.options(
scheduling_strategy=PlacementGroupSchedulingStrategy(
placement_group=pg,
placement_group_bundle_index=0,
)
).remote()

# Verify that the GPU actor is scheduled.
ray.get(actor2.ready.remote(), timeout=10)
# __schedule_pg_3_end__

# __remove_pg_start__
# This API is asynchronous.
remove_placement_group(pg)

# Wait until placement group is killed.
time.sleep(1)
# Check that the placement group has died.
pprint(placement_group_table(pg))

"""
{'bundles': {0: {'GPU': 1.0}, 1: {'CPU': 1.0}},
'name': 'unnamed_group',
'placement_group_id': '40816b6ad474a6942b0edb45809b39c3',
'state': 'REMOVED',
'strategy': 'PACK'}
"""
# __remove_pg_end__

# __strategy_pg_start__
# Reserve a placement group of 2 bundles
# that have to be packed on the same node.
pg = placement_group([{"CPU": 1}, {"GPU": 1}], strategy="PACK")
# __strategy_pg_end__

remove_placement_group(pg)

# __detached_pg_start__
# driver_1.py
# Create a detached placement group that survives even after
# the job terminates.
pg = placement_group([{"CPU": 1}], lifetime="detached", name="global_name")
ray.get(pg.ready())
# __detached_pg_end__
remove_placement_group(pg)


# __get_pg_start__
# first_driver.py
# Create a placement group with a global name.
pg = placement_group([{"CPU": 1}], name="global_name")
ray.get(pg.ready())

# second_driver.py
# Retrieve a placement group with a global name.
pg = ray.util.get_placement_group("global_name")
# __get_pg_end__
Binary file added doc/source/ray-core/images/pg_image_1.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/source/ray-core/images/pg_image_2.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/source/ray-core/images/pg_image_3.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/source/ray-core/images/pg_image_4.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/source/ray-core/images/pg_image_5.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added doc/source/ray-core/images/pg_image_6.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit 3a21a86

Please sign in to comment.