[Doc] Rewrite the placement group documentation (ray-project#33518)

This PR rewrites the existing placement group documentation that is confusing (sorry I wrote the original version). The new doc will start from the simplest example -> explaining the advanced concepts. Also, all the concepts are more thoroughly explained with examples. Signed-off-by: Jack He <[email protected]>
ProjectsByJackHe · May 4, 2023 · 3a21a86 · 3a21a86
1 parent ba2c05c
commit 3a21a86
Show file tree

Hide file tree

Showing 9 changed files with 573 additions and 410 deletions.
diff --git a/doc/source/ray-core/doc_code/placement_group_capture_child_tasks_example.py b/doc/source/ray-core/doc_code/placement_group_capture_child_tasks_example.py
@@ -1,32 +1,68 @@
+# __child_capture_pg_start__
 import ray
 from ray.util.placement_group import placement_group
 from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
 
-ray.init(num_cpus=4)
+ray.init(num_cpus=2)
 
-# Create a placement group with the SPREAD strategy.
-pg = placement_group([{"CPU": 2}, {"CPU": 2}], strategy="SPREAD")
+# Create a placement group.
+pg = placement_group([{"CPU": 2}])
 ray.get(pg.ready())
 
 
 @ray.remote(num_cpus=1)
 def child():
-    pass
+    import time
+
+    time.sleep(5)
 
 
 @ray.remote(num_cpus=1)
 def parent():
-    # The child task is scheduled with the same placement group as its parent
-    # although child.options(
-    #     scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg)
-    # ).remote() wasn't called if placement_group_capture_child_tasks is set to True.
+    # The child task is scheduled to the same placement group as its parent,
+    # although it didn't specify the PlacementGroupSchedulingStrategy.
     ray.get(child.remote())
 
 
+# Since the child and parent use 1 CPU each, the placement group
+# bundle {"CPU": 2} is fully occupied.
 ray.get(
     parent.options(
         scheduling_strategy=PlacementGroupSchedulingStrategy(
             placement_group=pg, placement_group_capture_child_tasks=True
         )
     ).remote()
 )
+# __child_capture_pg_end__
+
+
+# __child_capture_disable_pg_start__
+@ray.remote
+def parent():
+    # In this case, the child task isn't
+    # scheduled with the parent's placement group.
+    ray.get(
+        child.options(
+            scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=None)
+        ).remote()
+    )
+
+
+# This times out because we cannot schedule the child task.
+# The cluster has {"CPU": 2}, and both of them are reserved by
+# the placement group with a bundle {"CPU": 2}. Since the child shouldn't
+# be scheduled within this placement group, it cannot be scheduled because
+# there's no available CPU resources.
+try:
+    ray.get(
+        parent.options(
+            scheduling_strategy=PlacementGroupSchedulingStrategy(
+                placement_group=pg, placement_group_capture_child_tasks=True
+            )
+        ).remote(),
+        timeout=5,
+    )
+except Exception as e:
+    print("Couldn't create a child task!")
+    print(e)
+# __child_capture_disable_pg_end__
diff --git a/doc/source/ray-core/doc_code/placement_group_example.py b/doc/source/ray-core/doc_code/placement_group_example.py
@@ -0,0 +1,139 @@
+# __create_pg_start__
+from pprint import pprint
+import time
+
+# Import placement group APIs.
+from ray.util.placement_group import (
+    placement_group,
+    placement_group_table,
+    remove_placement_group,
+)
+from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
+
+# Initialize Ray.
+import ray
+
+# Create a single node Ray cluster with 2 CPUs and 2 GPUs.
+ray.init(num_cpus=2, num_gpus=2)
+
+# Reserve a placement group of 1 bundle that reserves 1 CPU and 1 GPU.
+pg = placement_group([{"CPU": 1, "GPU": 1}])
+# __create_pg_end__
+
+# __ready_pg_start__
+# Wait until placement group is created.
+ray.get(pg.ready(), timeout=10)
+
+# You can also use ray.wait.
+ready, unready = ray.wait([pg.ready()], timeout=10)
+
+# You can look at placement group states using this API.
+print(placement_group_table(pg))
+# __ready_pg_end__
+
+# __create_pg_failed_start__
+# Cannot create this placement group because we
+# cannot create a {"GPU": 2} bundle.
+pending_pg = placement_group([{"CPU": 1}, {"GPU": 2}])
+# This raises the timeout exception!
+try:
+    ray.get(pending_pg.ready(), timeout=5)
+except Exception as e:
+    print(
+        "Cannot create a placement group because "
+        "{'GPU': 2} bundle cannot be created."
+    )
+    print(e)
+# __create_pg_failed_end__
+
+
+# __schedule_pg_start__
+@ray.remote(num_cpus=1)
+class Actor:
+    def __init__(self):
+        pass
+
+    def ready(self):
+        pass
+
+
+# Create an actor to a placement group.
+actor = Actor.options(
+    scheduling_strategy=PlacementGroupSchedulingStrategy(
+        placement_group=pg,
+    )
+).remote()
+
+# Verify the actor is scheduled.
+ray.get(actor.ready.remote(), timeout=10)
+# __schedule_pg_end__
+
+
+# __schedule_pg_3_start__
+@ray.remote(num_cpus=0, num_gpus=1)
+class Actor:
+    def __init__(self):
+        pass
+
+    def ready(self):
+        pass
+
+
+# Create a GPU actor on the first bundle of index 0.
+actor2 = Actor.options(
+    scheduling_strategy=PlacementGroupSchedulingStrategy(
+        placement_group=pg,
+        placement_group_bundle_index=0,
+    )
+).remote()
+
+# Verify that the GPU actor is scheduled.
+ray.get(actor2.ready.remote(), timeout=10)
+# __schedule_pg_3_end__
+
+# __remove_pg_start__
+# This API is asynchronous.
+remove_placement_group(pg)
+
+# Wait until placement group is killed.
+time.sleep(1)
+# Check that the placement group has died.
+pprint(placement_group_table(pg))
+
+"""
+{'bundles': {0: {'GPU': 1.0}, 1: {'CPU': 1.0}},
+'name': 'unnamed_group',
+'placement_group_id': '40816b6ad474a6942b0edb45809b39c3',
+'state': 'REMOVED',
+'strategy': 'PACK'}
+"""
+# __remove_pg_end__
+
+# __strategy_pg_start__
+# Reserve a placement group of 2 bundles
+# that have to be packed on the same node.
+pg = placement_group([{"CPU": 1}, {"GPU": 1}], strategy="PACK")
+# __strategy_pg_end__
+
+remove_placement_group(pg)
+
+# __detached_pg_start__
+# driver_1.py
+# Create a detached placement group that survives even after
+# the job terminates.
+pg = placement_group([{"CPU": 1}], lifetime="detached", name="global_name")
+ray.get(pg.ready())
+# __detached_pg_end__
+remove_placement_group(pg)
+
+
+# __get_pg_start__
+# first_driver.py
+# Create a placement group with a global name.
+pg = placement_group([{"CPU": 1}], name="global_name")
+ray.get(pg.ready())
+
+# second_driver.py
+# Retrieve a placement group with a global name.
+pg = ray.util.get_placement_group("global_name")
+# __get_pg_end__
diff --git a/doc/source/ray-core/images/pg_image_1.png b/doc/source/ray-core/images/pg_image_1.png
diff --git a/doc/source/ray-core/images/pg_image_2.png b/doc/source/ray-core/images/pg_image_2.png
diff --git a/doc/source/ray-core/images/pg_image_3.png b/doc/source/ray-core/images/pg_image_3.png
diff --git a/doc/source/ray-core/images/pg_image_4.png b/doc/source/ray-core/images/pg_image_4.png
diff --git a/doc/source/ray-core/images/pg_image_5.png b/doc/source/ray-core/images/pg_image_5.png
diff --git a/doc/source/ray-core/images/pg_image_6.png b/doc/source/ray-core/images/pg_image_6.png