[autoscaler] docker run options #3921

Merged (23 commits, Feb 13, 2019)

Changes shown below are from 20 of the 23 commits.

Commits
9cf4a7d
Implement docker run_options for autoscaler
hartikainen Jan 31, 2019
8be884e
Add gpu autoscaler examples
hartikainen Jan 31, 2019
f52904e
Add install-nvidia-driver metadata to gcp gpu example configuration
hartikainen Feb 3, 2019
0cde2d1
add_docker_support
richardliaw Feb 3, 2019
2923307
fix
richardliaw Feb 3, 2019
8653d6a
modifications_and_docs
richardliaw Feb 3, 2019
aecbeb5
rename
richardliaw Feb 3, 2019
a4e257e
Update example-gpu-docker configurations
hartikainen Feb 3, 2019
ed6d117
Update gcp gpu example
hartikainen Feb 3, 2019
3cc079a
Fix tensorflow docker image in gcp gpu example
hartikainen Feb 4, 2019
b725bf1
tf works
richardliaw Feb 4, 2019
1212dbf
fix docker exec
richardliaw Feb 4, 2019
fab4192
Remove Docker Installation
richardliaw Feb 4, 2019
7bf9d16
Add startup_commands for autoscaler config
hartikainen Feb 10, 2019
f91f790
Add startup_commands for example-gpu-docker.yaml
hartikainen Feb 10, 2019
c8591e8
Bump up ray version in autoscaler configs
hartikainen Feb 10, 2019
b1ba569
Remove duplicate psmisc installation command from gcp example
hartikainen Feb 10, 2019
cf7ea64
Apply suggestions from code review
richardliaw Feb 12, 2019
c010f00
Rename startup_commands -> initialization_commands
hartikainen Feb 12, 2019
0cc6638
Add initialization_commands to example-full.yaml configs
hartikainen Feb 12, 2019
a2fce3e
fix wheels, tests
richardliaw Feb 12, 2019
e8cd337
Remove duplicate initialization_commands from exec_cluster command
hartikainen Feb 12, 2019
29e36fd
fix_test
richardliaw Feb 13, 2019
python/ray/autoscaler/autoscaler.py (8 changes: 7 additions & 1 deletion)

@@ -90,6 +90,7 @@
{
"image": (str, OPTIONAL), # e.g. tensorflow/tensorflow:1.5.0-py3
"container_name": (str, OPTIONAL), # e.g., ray_docker
"run_options": (list, OPTIONAL),
},
OPTIONAL),

@@ -102,7 +103,12 @@
# Map of remote paths to local paths, e.g. {"/tmp/data": "/my/local/data"}
"file_mounts": (dict, OPTIONAL),

# List of common shell commands to run to initialize nodes.
# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is set up.
"initialization_commands": (list, OPTIONAL),

# List of common shell commands to run to setup nodes.
"setup_commands": (list, OPTIONAL),

# Commands that will be run on the head node after common setup.
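The new `run_options` list is passed straight through to the `docker run` invocation that the autoscaler generates (see the yaml comment below: extra options to pass into "docker run"). A minimal sketch of that splicing, assuming a hypothetical helper named `docker_start_cmd`; the real assembly lives in ray/autoscaler/docker.py and differs in detail:

def docker_start_cmd(image, container_name, run_options):
    # User-supplied flags (e.g. --runtime=nvidia) are inserted verbatim
    # between the fixed flags and the image name.
    return " ".join(
        ["docker", "run", "--rm", "--name", container_name, "-d", "-it"]
        + list(run_options)
        + [image, "bash"])

# With the GPU example below this yields:
# docker run --rm --name ray-nvidia-docker-test -d -it --runtime=nvidia
#     tensorflow/tensorflow:1.12.0-gpu-py3 bash
print(docker_start_cmd(
    "tensorflow/tensorflow:1.12.0-gpu-py3",
    "ray-nvidia-docker-test",
    ["--runtime=nvidia"]))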
python/ray/autoscaler/aws/example-full.yaml (10 changes: 8 additions & 2 deletions)

@@ -20,6 +20,7 @@ initial_workers: 0
docker:
image: "" # e.g., tensorflow/tensorflow:1.5.0-py3
container_name: "" # e.g. ray_docker
run_options: [] # Extra options to pass into "docker run"

# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
@@ -54,7 +55,7 @@ auth:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: m5.large
ImageId: ami-3b6bce43 # Amazon Deep Learning AMI (Ubuntu)
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0

# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
@@ -70,7 +71,7 @@ head_node:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-3b6bce43 # Amazon Deep Learning AMI (Ubuntu)
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0

# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
@@ -88,6 +89,11 @@ file_mounts: {
# "/path2/on/remote/machine": "/path2/on/local/machine",
}

# List of commands that will be run before `setup_commands`. If docker is
# enabled, these commands will run outside the container and before docker
# is set up.
initialization_commands: []

# List of shell commands to run to set up nodes.
setup_commands:
# Note: if you're developing Ray, you probably want to create an AMI that
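How `initialization_commands` composes with the existing lists can be illustrated with a small sketch; this is an assumed composition for illustration, not the exact updater code:

def commands_for_node(config, is_head):
    # Run on the host, before docker is set up (if docker is enabled).
    pre_docker = list(config.get("initialization_commands", []))
    # Run after that, inside the container when docker is enabled.
    in_container = list(config.get("setup_commands", []))
    in_container += (config["head_setup_commands"] if is_head
                     else config["worker_setup_commands"])
    in_container += (config["head_start_ray_commands"] if is_head
                     else config["worker_start_ray_commands"])
    return pre_docker, in_container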
python/ray/autoscaler/aws/example-gpu-docker.yaml (new file: 114 additions)

@@ -0,0 +1,114 @@
# A unique identifier for the head node and workers of this cluster.
cluster_name: gpu-docker

# The minimum number of worker nodes to launch in addition to the head
# node. This number should be >= 0.
min_workers: 0

# The maximum number of worker nodes to launch in addition to the head
# node. This takes precedence over min_workers.
max_workers: 2

# The initial number of worker nodes to launch in addition to the head
# node. When the cluster is first brought up (or when it is refreshed with a
# subsequent `ray up`) this number of nodes will be started.
initial_workers: 0

# This executes all commands on all nodes in the docker container,
# and opens all the necessary ports to support the Ray cluster.
# Empty string means disabled.
docker:
image: "tensorflow/tensorflow:1.12.0-gpu-py3"
container_name: "ray-nvidia-docker-test" # e.g. ray_docker
run_options:
- --runtime=nvidia

# The autoscaler will scale up the cluster to this target fraction of resource
# usage. For example, if a cluster of 10 nodes is 100% busy and
# target_utilization is 0.8, it would resize the cluster to 13. This fraction
# can be decreased to increase the aggressiveness of upscaling.
# This value must be less than 1.0 for scaling to happen.
target_utilization_fraction: 0.8

# If a node is idle for this many minutes, it will be removed.
idle_timeout_minutes: 5

# Cloud-provider specific configuration.
provider:
type: aws
region: us-west-2
# Availability zone(s), comma-separated, that nodes may be launched in.
# Nodes are currently spread between zones by a round-robin approach;
# however, this implementation detail should not be relied upon.
availability_zone: us-west-2a,us-west-2b

# How Ray will authenticate with newly launched nodes.
auth:
ssh_user: ubuntu
# By default Ray creates a new private keypair, but you can also use your own.
# If you do so, make sure to also set "KeyName" in the head and worker node
# configurations below.
# ssh_private_key: /path/to/your/key.pem

# Provider-specific config for the head node, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
head_node:
InstanceType: p2.xlarge
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0

# You can provision additional disk space with a conf as follows
BlockDeviceMappings:
- DeviceName: /dev/sda1
Ebs:
VolumeSize: 100

# Additional options in the boto docs.

# Provider-specific config for worker nodes, e.g. instance type. By default
# Ray will auto-configure unspecified fields such as SubnetId and KeyName.
# For more documentation on available fields, see:
# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances
worker_nodes:
InstanceType: m5.large
ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0

# Run workers on spot by default. Comment this out to use on-demand.
InstanceMarketOptions:
MarketType: spot
# Additional options can be found in the boto docs, e.g.
# SpotOptions:
# MaxPrice: MAX_HOURLY_PRICE

# Additional options in the boto docs.

# Files or directories to copy to the head and worker nodes. The format is a
# dictionary from REMOTE_PATH: LOCAL_PATH, e.g.
file_mounts: {
# "/path1/on/remote/machine": "/path1/on/local/machine",
# "/path2/on/remote/machine": "/path2/on/local/machine",
}

# List of shell commands to run to set up nodes.
setup_commands:
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp27-cp27mu-manylinux1_x86_64.whl
- pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp35-cp35m-manylinux1_x86_64.whl
# - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp36-cp36m-manylinux1_x86_64.whl

# Custom commands that will be run on the head node after common setup.
head_setup_commands:
- pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions

# Custom commands that will be run on worker nodes after common setup.
worker_setup_commands: []

# Command to start ray on the head node. You don't need to change this.
head_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml

# Command to start ray on worker nodes. You don't need to change this.
worker_start_ray_commands:
- ray stop
- ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076
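With this config in hand, the new `docker` argument to `exec_cluster` (added in commands.py below) lets a one-off command run inside the container. A hedged usage sketch; the positional order follows the signature in this diff, and the TensorFlow check is just an example payload:

from ray.autoscaler.commands import exec_cluster

exec_cluster(
    "example-gpu-docker.yaml",  # config_file
    "python -c 'import tensorflow as tf; print(tf.test.is_gpu_available())'",
    True,   # docker: wrap the command with `docker exec`
    False,  # screen
    False,  # tmux
    False,  # stop
    True,   # start: create the head node if needed
    None,   # override_cluster_name
    None)   # port_forward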
python/ray/autoscaler/commands.py (106 changes: 64 additions & 42 deletions)

@@ -26,6 +26,7 @@
TAG_RAY_NODE_NAME
from ray.autoscaler.updater import NodeUpdaterThread
from ray.autoscaler.log_timer import LogTimer
from ray.autoscaler.docker import with_docker_exec

logger = logging.getLogger(__name__)
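`with_docker_exec`, imported above, is what `exec_cluster` later uses to wrap commands for in-container execution. A rough stand-in for what it is assumed to do (illustrative only; the real helper lives in ray/autoscaler/docker.py):

from shlex import quote

def with_docker_exec(cmds, container_name):
    # Wrap each shell command so it runs inside the named container.
    return ["docker exec {} /bin/sh -c {}".format(container_name, quote(cmd))
            for cmd in cmds]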

@@ -130,9 +131,16 @@ def kill_node(config_file, yes, override_cluster_name):
node = random.choice(nodes)
logger.info("kill_node: Terminating worker {}".format(node))

updater = NodeUpdaterThread(node, config["provider"], provider,
config["auth"], config["cluster_name"],
config["file_mounts"], [], "")
updater = NodeUpdaterThread(
node_id=node,
provider_config=config["provider"],
provider=provider,
auth_config=config["auth"],
cluster_name=config["cluster_name"],
file_mounts=config["file_mounts"],
initialization_commands=[],
setup_commands=[],
runtime_hash="")

_exec(updater, "ray stop", False, False)

@@ -215,21 +223,24 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
init_commands = config["head_start_ray_commands"]
elif no_restart:
init_commands = (
config["setup_commands"] + config["head_setup_commands"])
config["setup_commands"] +
config["head_setup_commands"])
else:
init_commands = (
config["setup_commands"] + config["head_setup_commands"] +
config["setup_commands"] +
config["head_setup_commands"] +
config["head_start_ray_commands"])

updater = NodeUpdaterThread(
head_node,
config["provider"],
provider,
config["auth"],
config["cluster_name"],
config["file_mounts"],
init_commands,
runtime_hash,
node_id=head_node,
provider_config=config["provider"],
provider=provider,
auth_config=config["auth"],
cluster_name=config["cluster_name"],
file_mounts=config["file_mounts"],
initialization_commands=config["initialization_commands"],
setup_commands=init_commands,
runtime_hash=runtime_hash,
)
updater.start()
updater.join()
@@ -247,19 +258,16 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes,
provider.external_ip(head_node)))

monitor_str = "tail -n 100 -f /tmp/ray/session_*/logs/monitor*"
for s in init_commands:
if ("ray start" in s and "docker exec" in s
and "--autoscaling-config" in s):
monitor_str = "docker exec {} /bin/sh -c {}".format(
config["docker"]["container_name"], quote(monitor_str))
use_docker = bool(config["docker"]["container_name"])
if override_cluster_name:
modifiers = " --cluster-name={}".format(
quote(override_cluster_name))
else:
modifiers = ""
print("To monitor auto-scaling activity, you can run:\n\n"
" ray exec {} {}{}\n".format(config_file, quote(monitor_str),
modifiers))
" ray exec {} {}{}{}\n".format(config_file, "--docker "
if use_docker else " ",
quote(monitor_str), modifiers))
print("To open a console on the cluster:\n\n"
" ray attach {}{}\n".format(config_file, modifiers))
print("To ssh manually to the cluster, run:\n\n"
@@ -292,17 +300,18 @@ def attach_cluster(config_file, start, use_tmux, override_cluster_name, new):
else:
cmd = "screen -L -xRR"

exec_cluster(config_file, cmd, False, False, False, start,
exec_cluster(config_file, cmd, False, False, False, False, start,
override_cluster_name, None)


def exec_cluster(config_file, cmd, screen, tmux, stop, start,
def exec_cluster(config_file, cmd, docker, screen, tmux, stop, start,
override_cluster_name, port_forward):
"""Runs a command on the specified cluster.

Arguments:
config_file: path to the cluster yaml
cmd: command to run
docker: whether to run the command inside the docker container
    specified in the config
screen: whether to run in a screen
tmux: whether to run in a tmux session
stop: whether to stop the cluster after command run
@@ -316,25 +325,38 @@ def exec_cluster(config_file, cmd, screen, tmux, stop, start,
if override_cluster_name is not None:
config["cluster_name"] = override_cluster_name
config = _bootstrap_config(config)

head_node = _get_head_node(
config, config_file, override_cluster_name, create_if_needed=start)

provider = get_node_provider(config["provider"], config["cluster_name"])
try:
updater = NodeUpdaterThread(
head_node,
config["provider"],
provider,
config["auth"],
config["cluster_name"],
config["file_mounts"],
[],
"",
node_id=head_node,
provider_config=config["provider"],
provider=provider,
auth_config=config["auth"],
cluster_name=config["cluster_name"],
file_mounts=config["file_mounts"],
initialization_commands=config["initialization_commands"],
Review thread on this line:

Contributor: hm, do you need to run these here?

Contributor: I think these are already run in _get_head_node

Contributor (author): I see, you're right. Good catch.

setup_commands=[],
runtime_hash="",
)

def wrap_docker(command):
container_name = config["docker"]["container_name"]
if not container_name:
raise ValueError("Docker container not specified in config.")
return with_docker_exec([command], container_name=container_name)[0]

cmd = wrap_docker(cmd) if docker else cmd

if stop:
cmd += (
"; ray stop; ray teardown ~/ray_bootstrap_config.yaml --yes "
"--workers-only; sudo shutdown -h now")
shutdown_cmd = ("ray stop; ray teardown ~/ray_bootstrap_config.yaml "
"--yes --workers-only")
shutdown_cmd = wrap_docker(shutdown_cmd) if docker else shutdown_cmd
cmd += ("; {}; sudo shutdown -h now".format(shutdown_cmd))

_exec(
updater,
cmd,
@@ -378,7 +400,6 @@ def _exec(updater, cmd, screen, tmux, expect_error=False, port_forward=None):
cmd = " ".join(cmd)
updater.ssh_cmd(
cmd,
verbose=False,
allocate_tty=True,
expect_error=expect_error,
port_forward=port_forward)
@@ -405,14 +426,15 @@ def rsync(config_file, source, target, override_cluster_name, down):
provider = get_node_provider(config["provider"], config["cluster_name"])
try:
updater = NodeUpdaterThread(
head_node,
config["provider"],
provider,
config["auth"],
config["cluster_name"],
config["file_mounts"],
[],
"",
node_id=head_node,
provider_config=config["provider"],
provider=provider,
auth_config=config["auth"],
cluster_name=config["cluster_name"],
file_mounts=config["file_mounts"],
initialization_commands=[],
setup_commands=[],
runtime_hash="",
)
if down:
rsync = updater.rsync_down