diff --git a/python/ray/autoscaler/autoscaler.py b/python/ray/autoscaler/autoscaler.py index 9c5c379767a4..b501f52010cb 100644 --- a/python/ray/autoscaler/autoscaler.py +++ b/python/ray/autoscaler/autoscaler.py @@ -90,6 +90,7 @@ { "image": (str, OPTIONAL), # e.g. tensorflow/tensorflow:1.5.0-py3 "container_name": (str, OPTIONAL), # e.g., ray_docker + "run_options": (list, OPTIONAL), }, OPTIONAL), @@ -102,7 +103,12 @@ # Map of remote paths to local paths, e.g. {"/tmp/data": "/my/local/data"} "file_mounts": (dict, OPTIONAL), - # List of common shell commands to run to initialize nodes. + # List of commands that will be run before `setup_commands`. If docker is + # enabled, these commands will run outside the container and before docker + # is setup. + "initialization_commands": (list, OPTIONAL), + + # List of common shell commands to run to setup nodes. "setup_commands": (list, OPTIONAL), # Commands that will be run on the head node after common setup. @@ -527,13 +533,16 @@ def recover_if_needed(self, node_id, now): "{}: No heartbeat in {}s, " "restarting Ray to recover...".format(node_id, delta)) updater = NodeUpdaterThread( - node_id, - self.config["provider"], - self.provider, - self.config["auth"], - self.config["cluster_name"], {}, - with_head_node_ip(self.config["worker_start_ray_commands"]), - self.runtime_hash, + node_id=node_id, + provider_config=self.config["provider"], + provider=self.provider, + auth_config=self.config["auth"], + cluster_name=self.config["cluster_name"], + file_mounts={}, + initialization_commands=[], + setup_commands=with_head_node_ip( + self.config["worker_start_ray_commands"]), + runtime_hash=self.runtime_hash, process_runner=self.process_runner, use_internal_ip=True) updater.start() @@ -561,14 +570,16 @@ def should_update(self, node_id): def spawn_updater(self, node_id, init_commands): updater = NodeUpdaterThread( - node_id, - self.config["provider"], - self.provider, - self.config["auth"], - self.config["cluster_name"], - 
self.config["file_mounts"], - with_head_node_ip(init_commands), - self.runtime_hash, + node_id=node_id, + provider_config=self.config["provider"], + provider=self.provider, + auth_config=self.config["auth"], + cluster_name=self.config["cluster_name"], + file_mounts=self.config["file_mounts"], + initialization_commands=with_head_node_ip( + self.config["initialization_commands"]), + setup_commands=with_head_node_ip(init_commands), + runtime_hash=self.runtime_hash, process_runner=self.process_runner, use_internal_ip=True) updater.start() diff --git a/python/ray/autoscaler/aws/example-full.yaml b/python/ray/autoscaler/aws/example-full.yaml index b9d0cd38690a..e8df035266c5 100644 --- a/python/ray/autoscaler/aws/example-full.yaml +++ b/python/ray/autoscaler/aws/example-full.yaml @@ -20,6 +20,7 @@ initial_workers: 0 docker: image: "" # e.g., tensorflow/tensorflow:1.5.0-py3 container_name: "" # e.g. ray_docker + run_options: [] # Extra options to pass into "docker run" # The autoscaler will scale up the cluster to this target fraction of resource # usage. For example, if a cluster of 10 nodes is 100% busy and @@ -54,7 +55,7 @@ auth: # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances head_node: InstanceType: m5.large - ImageId: ami-3b6bce43 # Amazon Deep Learning AMI (Ubuntu) + ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0 # You can provision additional disk space with a conf as follows BlockDeviceMappings: @@ -70,7 +71,7 @@ head_node: # http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances worker_nodes: InstanceType: m5.large - ImageId: ami-3b6bce43 # Amazon Deep Learning AMI (Ubuntu) + ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0 # Run workers on spot by default. Comment this out to use on-demand. 
InstanceMarketOptions: @@ -88,6 +89,11 @@ file_mounts: { # "/path2/on/remote/machine": "/path2/on/local/machine", } +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + # List of shell commands to run to set up nodes. setup_commands: # Note: if you're developing Ray, you probably want to create an AMI that diff --git a/python/ray/autoscaler/aws/example-gpu-docker.yaml b/python/ray/autoscaler/aws/example-gpu-docker.yaml new file mode 100644 index 000000000000..9e4a071656b9 --- /dev/null +++ b/python/ray/autoscaler/aws/example-gpu-docker.yaml @@ -0,0 +1,114 @@ +# A unique identifier for the head node and workers of this cluster. +cluster_name: gpu-docker + +# The minimum number of worker nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 + +# The maximum number of worker nodes to launch in addition to the head +# node. This takes precedence over min_workers. +max_workers: 2 + +# The initial number of worker nodes to launch in addition to the head +# node. When the cluster is first brought up (or when it is refreshed with a +# subsequent `ray up`) this number of nodes will be started. +initial_workers: 0 + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. +docker: + image: "tensorflow/tensorflow:1.12.0-gpu-py3" + container_name: "ray-nvidia-docker-test" # e.g. ray_docker + run_options: + - --runtime=nvidia + +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. 
+target_utilization_fraction: 0.8 + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. +provider: + type: aws + region: us-west-2 + # Availability zone(s), comma-separated, that nodes may be launched in. + # Nodes are currently spread between zones by a round-robin approach, + # however this implementation detail should not be relied upon. + availability_zone: us-west-2a,us-west-2b + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ubuntu +# By default Ray creates a new private keypair, but you can also use your own. +# If you do so, make sure to also set "KeyName" in the head and worker node +# configurations below. +# ssh_private_key: /path/to/your/key.pem + +# Provider-specific config for the head node, e.g. instance type. By default +# Ray will auto-configure unspecified fields such as SubnetId and KeyName. +# For more documentation on available fields, see: +# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances +head_node: + InstanceType: p2.xlarge + ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0 + + # You can provision additional disk space with a conf as follows + BlockDeviceMappings: + - DeviceName: /dev/sda1 + Ebs: + VolumeSize: 100 + + # Additional options in the boto docs. + +# Provider-specific config for worker nodes, e.g. instance type. By default +# Ray will auto-configure unspecified fields such as SubnetId and KeyName. +# For more documentation on available fields, see: +# http://boto3.readthedocs.io/en/latest/reference/services/ec2.html#EC2.ServiceResource.create_instances +worker_nodes: + InstanceType: m5.large + ImageId: ami-0b294f219d14e6a82 # Deep Learning AMI (Ubuntu) Version 21.0 + + # Run workers on spot by default. Comment this out to use on-demand. + InstanceMarketOptions: + MarketType: spot + # Additional options can be found in the boto docs, e.g. 
+ # SpotOptions: + # MaxPrice: MAX_HOURLY_PRICE + + # Additional options in the boto docs. + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +# List of shell commands to run to set up nodes. +setup_commands: + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp27-cp27mu-manylinux1_x86_64.whl + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp35-cp35m-manylinux1_x86_64.whl + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp36-cp36m-manylinux1_x86_64.whl + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + - pip install boto3==1.4.8 # 1.4.8 adds InstanceMarketOptions + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. +head_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --head --redis-port=6379 --object-manager-port=8076 --autoscaling-config=~/ray_bootstrap_config.yaml + +# Command to start ray on worker nodes. You don't need to change this. 
+worker_start_ray_commands: + - ray stop + - ulimit -n 65536; ray start --redis-address=$RAY_HEAD_IP:6379 --object-manager-port=8076 diff --git a/python/ray/autoscaler/commands.py b/python/ray/autoscaler/commands.py index 096b081f2ac8..074b58fc8f01 100644 --- a/python/ray/autoscaler/commands.py +++ b/python/ray/autoscaler/commands.py @@ -26,6 +26,7 @@ TAG_RAY_NODE_NAME from ray.autoscaler.updater import NodeUpdaterThread from ray.autoscaler.log_timer import LogTimer +from ray.autoscaler.docker import with_docker_exec logger = logging.getLogger(__name__) @@ -130,9 +131,16 @@ def kill_node(config_file, yes, override_cluster_name): node = random.choice(nodes) logger.info("kill_node: Terminating worker {}".format(node)) - updater = NodeUpdaterThread(node, config["provider"], provider, - config["auth"], config["cluster_name"], - config["file_mounts"], [], "") + updater = NodeUpdaterThread( + node_id=node, + provider_config=config["provider"], + provider=provider, + auth_config=config["auth"], + cluster_name=config["cluster_name"], + file_mounts=config["file_mounts"], + initialization_commands=[], + setup_commands=[], + runtime_hash="") _exec(updater, "ray stop", False, False) @@ -222,14 +230,15 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes, config["head_start_ray_commands"]) updater = NodeUpdaterThread( - head_node, - config["provider"], - provider, - config["auth"], - config["cluster_name"], - config["file_mounts"], - init_commands, - runtime_hash, + node_id=head_node, + provider_config=config["provider"], + provider=provider, + auth_config=config["auth"], + cluster_name=config["cluster_name"], + file_mounts=config["file_mounts"], + initialization_commands=config["initialization_commands"], + setup_commands=init_commands, + runtime_hash=runtime_hash, ) updater.start() updater.join() @@ -247,19 +256,16 @@ def get_or_create_head_node(config, config_file, no_restart, restart_only, yes, provider.external_ip(head_node))) monitor_str = 
"tail -n 100 -f /tmp/ray/session_*/logs/monitor*" - for s in init_commands: - if ("ray start" in s and "docker exec" in s - and "--autoscaling-config" in s): - monitor_str = "docker exec {} /bin/sh -c {}".format( - config["docker"]["container_name"], quote(monitor_str)) + use_docker = bool(config["docker"]["container_name"]) if override_cluster_name: modifiers = " --cluster-name={}".format( quote(override_cluster_name)) else: modifiers = "" print("To monitor auto-scaling activity, you can run:\n\n" - " ray exec {} {}{}\n".format(config_file, quote(monitor_str), - modifiers)) + " ray exec {} {}{}{}\n".format( + config_file, "--docker " if use_docker else " ", + quote(monitor_str), modifiers)) print("To open a console on the cluster:\n\n" " ray attach {}{}\n".format(config_file, modifiers)) print("To ssh manually to the cluster, run:\n\n" @@ -292,17 +298,18 @@ def attach_cluster(config_file, start, use_tmux, override_cluster_name, new): else: cmd = "screen -L -xRR" - exec_cluster(config_file, cmd, False, False, False, start, + exec_cluster(config_file, cmd, False, False, False, False, start, override_cluster_name, None) -def exec_cluster(config_file, cmd, screen, tmux, stop, start, +def exec_cluster(config_file, cmd, docker, screen, tmux, stop, start, override_cluster_name, port_forward): """Runs a command on the specified cluster. 
Arguments: config_file: path to the cluster yaml cmd: command to run + docker: whether to run command in docker container of config screen: whether to run in a screen tmux: whether to run in a tmux session stop: whether to stop the cluster after command run @@ -316,25 +323,41 @@ def exec_cluster(config_file, cmd, screen, tmux, stop, start, if override_cluster_name is not None: config["cluster_name"] = override_cluster_name config = _bootstrap_config(config) + head_node = _get_head_node( config, config_file, override_cluster_name, create_if_needed=start) provider = get_node_provider(config["provider"], config["cluster_name"]) try: updater = NodeUpdaterThread( - head_node, - config["provider"], - provider, - config["auth"], - config["cluster_name"], - config["file_mounts"], - [], - "", + node_id=head_node, + provider_config=config["provider"], + provider=provider, + auth_config=config["auth"], + cluster_name=config["cluster_name"], + file_mounts=config["file_mounts"], + initialization_commands=[], + setup_commands=[], + runtime_hash="", ) + + def wrap_docker(command): + container_name = config["docker"]["container_name"] + if not container_name: + raise ValueError("Docker container not specified in config.") + return with_docker_exec( + [command], container_name=container_name)[0] + + cmd = wrap_docker(cmd) if docker else cmd + if stop: - cmd += ( - "; ray stop; ray teardown ~/ray_bootstrap_config.yaml --yes " - "--workers-only; sudo shutdown -h now") + shutdown_cmd = ( + "ray stop; ray teardown ~/ray_bootstrap_config.yaml " + "--yes --workers-only") + if docker: + shutdown_cmd = wrap_docker(shutdown_cmd) + cmd += ("; {}; sudo shutdown -h now".format(shutdown_cmd)) + _exec( updater, cmd, @@ -378,7 +401,6 @@ def _exec(updater, cmd, screen, tmux, expect_error=False, port_forward=None): cmd = " ".join(cmd) updater.ssh_cmd( cmd, - verbose=False, allocate_tty=True, expect_error=expect_error, port_forward=port_forward) @@ -405,14 +427,15 @@ def rsync(config_file, source, 
target, override_cluster_name, down): provider = get_node_provider(config["provider"], config["cluster_name"]) try: updater = NodeUpdaterThread( - head_node, - config["provider"], - provider, - config["auth"], - config["cluster_name"], - config["file_mounts"], - [], - "", + node_id=head_node, + provider_config=config["provider"], + provider=provider, + auth_config=config["auth"], + cluster_name=config["cluster_name"], + file_mounts=config["file_mounts"], + initialization_commands=[], + setup_commands=[], + runtime_hash="", ) if down: rsync = updater.rsync_down diff --git a/python/ray/autoscaler/docker.py b/python/ray/autoscaler/docker.py index 6db0fedd11d3..6d80e10ded7a 100644 --- a/python/ray/autoscaler/docker.py +++ b/python/ray/autoscaler/docker.py @@ -17,6 +17,8 @@ def dockerize_if_needed(config): return config docker_image = config["docker"].get("image") cname = config["docker"].get("container_name") + run_options = config["docker"].get("run_options", []) + ssh_user = config["auth"]["ssh_user"] if not docker_image: if cname: logger.warning( @@ -26,10 +28,11 @@ def dockerize_if_needed(config): else: assert cname, "Must provide container name!" 
docker_mounts = {dst: dst for dst in config["file_mounts"]} + config["setup_commands"] = ( - docker_install_cmds() + docker_start_cmds( - config["auth"]["ssh_user"], docker_image, docker_mounts, cname) + - with_docker_exec(config["setup_commands"], container_name=cname)) + docker_start_cmds(ssh_user, docker_image, docker_mounts, cname, + run_options) + with_docker_exec( + config["setup_commands"], container_name=cname)) config["head_setup_commands"] = with_docker_exec( config["head_setup_commands"], container_name=cname) @@ -58,13 +61,6 @@ def with_docker_exec(cmds, container_name, env_vars=None): ] -def docker_install_cmds(): - return [ - aptwait_cmd() + " && sudo apt-get update", - aptwait_cmd() + " && sudo apt-get install -y docker.io" - ] - - def aptwait_cmd(): return ("while sudo fuser" " /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock" @@ -72,13 +68,8 @@ def aptwait_cmd(): "do echo 'Waiting for release of dpkg/apt locks'; sleep 5; done") -def docker_start_cmds(user, image, mount, cname): +def docker_start_cmds(user, image, mount, cname, user_options): cmds = [] - cmds.append("sudo kill -SIGUSR1 $(pidof dockerd) || true") - cmds.append("sudo service docker start") - cmds.append("sudo usermod -a -G docker {}".format(user)) - cmds.append("docker rm -f {} || true".format(cname)) - cmds.append("docker pull {}".format(image)) # create flags # ports for the redis, object manager, and tune client @@ -94,16 +85,21 @@ def docker_start_cmds(user, image, mount, cname): env_flags = " ".join( ["-e {name}={val}".format(name=k, val=v) for k, v in env_vars.items()]) + user_options_str = " ".join(user_options) # docker run command + docker_check = [ + "docker", "inspect", "-f", "'{{.State.Running}}'", cname, "||" + ] docker_run = [ "docker", "run", "--rm", "--name {}".format(cname), "-d", "-it", - port_flags, mount_flags, env_flags, "--net=host", image, "bash" + port_flags, mount_flags, env_flags, user_options_str, "--net=host", + image, "bash" + ] + cmds.append(" 
".join(docker_check + docker_run)) + docker_update = [ + " && ".join(("apt-get -y update", "apt-get -y upgrade", + "apt-get install -y git wget cmake psmisc")) ] - cmds.append(" ".join(docker_run)) - docker_update = [] - docker_update.append("apt-get -y update") - docker_update.append("apt-get -y upgrade") - docker_update.append("apt-get install -y git wget cmake psmisc") cmds.extend(with_docker_exec(docker_update, container_name=cname)) return cmds diff --git a/python/ray/autoscaler/gcp/example-full.yaml b/python/ray/autoscaler/gcp/example-full.yaml index a7f708b4c398..2378f38ce38a 100644 --- a/python/ray/autoscaler/gcp/example-full.yaml +++ b/python/ray/autoscaler/gcp/example-full.yaml @@ -20,6 +20,7 @@ initial_workers: 0 docker: image: "" # e.g., tensorflow/tensorflow:1.5.0-py3 container_name: "" # e.g. ray_docker + run_options: [] # Extra options to pass into "docker run" # The autoscaler will scale up the cluster to this target fraction of resource @@ -66,9 +67,9 @@ head_node: # Additional options can be found in in the compute docs at # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert - # If the network interface is specified as below in both head and worker + # If the network interface is specified as below in both head and worker # nodes, the manual network config is used. Otherwise an existing subnet is - # used. To use a shared subnet, ask the subnet owner to grant permission + # used. To use a shared subnet, ask the subnet owner to grant permission # for 'compute.subnetworks.use' to the ray autoscaler account... # networkInterfaces: # - kind: compute#networkInterface @@ -100,6 +101,11 @@ file_mounts: { # "/path2/on/remote/machine": "/path2/on/local/machine", } +# List of commands that will be run before `setup_commands`. If docker is +# enabled, these commands will run outside the container and before docker +# is setup. +initialization_commands: [] + # List of shell commands to run to set up nodes. 
setup_commands: # Note: if you're developing Ray, you probably want to create an AMI that @@ -107,11 +113,6 @@ setup_commands: # below with a git checkout (and possibly a recompile). # - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc - - >- - sudo apt-get update - && sudo apt-get install -y - psmisc - # Install Anaconda. - >- wget https://repo.continuum.io/archive/Anaconda3-5.0.1-Linux-x86_64.sh -O ~/anaconda3.sh diff --git a/python/ray/autoscaler/gcp/example-gpu-docker.yaml b/python/ray/autoscaler/gcp/example-gpu-docker.yaml new file mode 100644 index 000000000000..eb8923eee472 --- /dev/null +++ b/python/ray/autoscaler/gcp/example-gpu-docker.yaml @@ -0,0 +1,159 @@ +# A unique identifier for the head node and workers of this cluster. +cluster_name: gpu-docker + +# The minimum number of worker nodes to launch in addition to the head +# node. This number should be >= 0. +min_workers: 0 + +# The maximum number of worker nodes to launch in addition to the head +# node. This takes precedence over min_workers. +max_workers: 2 + +# The initial number of worker nodes to launch in addition to the head +# node. When the cluster is first brought up (or when it is refreshed with a +# subsequent `ray up`) this number of nodes will be started. +initial_workers: 0 + +# This executes all commands on all nodes in the docker container, +# and opens all the necessary ports to support the Ray cluster. +# Empty string means disabled. +docker: + image: "tensorflow/tensorflow:1.12.0-gpu-py3" + container_name: "ray-nvidia-docker-test" # e.g. ray_docker + run_options: + - --runtime=nvidia + + +# The autoscaler will scale up the cluster to this target fraction of resource +# usage. For example, if a cluster of 10 nodes is 100% busy and +# target_utilization is 0.8, it would resize the cluster to 13. This fraction +# can be decreased to increase the aggressiveness of upscaling. +# This value must be less than 1.0 for scaling to happen. 
+target_utilization_fraction: 0.8 + +# If a node is idle for this many minutes, it will be removed. +idle_timeout_minutes: 5 + +# Cloud-provider specific configuration. +provider: + type: gcp + region: us-west1 + availability_zone: us-west1-b + project_id: # Globally unique project id + +# How Ray will authenticate with newly launched nodes. +auth: + ssh_user: ubuntu +# By default Ray creates a new private keypair, but you can also use your own. +# If you do so, make sure to also set "KeyName" in the head and worker node +# configurations below. This requires that you have added the key into the +# project wide meta-data. +# ssh_private_key: /path/to/your/key.pem + +# Provider-specific config for the head node, e.g. instance type. By default +# Ray will auto-configure unspecified fields such as subnets and ssh-keys. +# For more documentation on available fields, see: +# https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert +head_node: + machineType: custom-6-16384 + disks: + - boot: true + autoDelete: true + type: PERSISTENT + initializeParams: + diskSizeGb: 50 + # See https://cloud.google.com/compute/docs/images for more images + sourceImage: projects/deeplearning-platform-release/global/images/family/tf-latest-gpu + guestAccelerators: + - acceleratorType: projects//zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80 + acceleratorCount: 1 + metadata: + items: + - key: install-nvidia-driver + value: "True" + scheduling: + - onHostMaintenance: TERMINATE + + # Additional options can be found in in the compute docs at + # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert + +worker_nodes: + machineType: n1-standard-2 + disks: + - boot: true + autoDelete: true + type: PERSISTENT + initializeParams: + diskSizeGb: 50 + # See https://cloud.google.com/compute/docs/images for more images + sourceImage: projects/deeplearning-platform-release/global/images/family/tf-latest-gpu + guestAccelerators: + - acceleratorType: 
projects//zones/us-west1-b/acceleratorTypes/nvidia-tesla-k80 + acceleratorCount: 1 + metadata: + items: + - key: install-nvidia-driver + value: "True" + # Run workers on preemptible instance by default. + # Comment this out to use on-demand. + scheduling: + - preemptible: true + - onHostMaintenance: TERMINATE + + # Additional options can be found in the compute docs at + # https://cloud.google.com/compute/docs/reference/rest/v1/instances/insert + +# Files or directories to copy to the head and worker nodes. The format is a +# dictionary from REMOTE_PATH: LOCAL_PATH, e.g. +file_mounts: { +# "/path1/on/remote/machine": "/path1/on/local/machine", +# "/path2/on/remote/machine": "/path2/on/local/machine", +} + +initialization_commands: + # Wait until nvidia drivers are installed + - >- + timeout 300 bash -c " + command -v nvidia-smi && nvidia-smi + until [ \$? -eq 0 ]; do + command -v nvidia-smi && nvidia-smi + done" + +# List of shell commands to run to set up nodes. +setup_commands: + # Note: if you're developing Ray, you probably want to create an AMI that + # has your Ray repo pre-cloned. Then, you can replace the pip installs + # below with a git checkout (and possibly a recompile). + # - echo 'export PATH="$HOME/anaconda3/envs/tensorflow_p36/bin:$PATH"' >> ~/.bashrc + + # Install ray + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp27-cp27mu-manylinux1_x86_64.whl + - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp35-cp35m-manylinux1_x86_64.whl + # - pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-0.6.3-cp36-cp36m-manylinux1_x86_64.whl + +# Custom commands that will be run on the head node after common setup. +head_setup_commands: + - pip install google-api-python-client==1.7.8 + +# Custom commands that will be run on worker nodes after common setup. +worker_setup_commands: [] + +# Command to start ray on the head node. You don't need to change this. 
+head_start_ray_commands: + - ray stop + - >- + ulimit -n 65536; + ray start + --head + --redis-port=6379 + --object-manager-port=8076 + --autoscaling-config=~/ray_bootstrap_config.yaml + +# Command to start ray on worker nodes. You don't need to change this. +worker_start_ray_commands: + - ray stop + - >- + ulimit -n 65536; + ray start + --redis-address=$RAY_HEAD_IP:6379 + --object-manager-port=8076 diff --git a/python/ray/autoscaler/updater.py b/python/ray/autoscaler/updater.py index b79f05909bb8..e3697adba9c0 100644 --- a/python/ray/autoscaler/updater.py +++ b/python/ray/autoscaler/updater.py @@ -49,7 +49,8 @@ def __init__(self, auth_config, cluster_name, file_mounts, - setup_cmds, + initialization_commands, + setup_commands, runtime_hash, process_runner=subprocess, use_internal_ip=False): @@ -66,7 +67,8 @@ def __init__(self, remote: os.path.expanduser(local) for remote, local in file_mounts.items() } - self.setup_cmds = setup_cmds + self.initialization_commands = initialization_commands + self.setup_commands = setup_commands self.runtime_hash = runtime_hash def get_caller(self, check_error): @@ -215,13 +217,15 @@ def do_update(self): self.provider.set_node_tags(self.node_id, {TAG_RAY_NODE_STATUS: "setting-up"}) + m = "{}: Initialization commands completed".format(self.node_id) + with LogTimer("NodeUpdater: {}".format(m)): + for cmd in self.initialization_commands: + self.ssh_cmd(cmd, redirect=open("/dev/null", "w")) + m = "{}: Setup commands completed".format(self.node_id) with LogTimer("NodeUpdater: {}".format(m)): - for cmd in self.setup_cmds: - self.ssh_cmd( - cmd, - # verbose=True, - redirect=open("/dev/null", "w")) + for cmd in self.setup_commands: + self.ssh_cmd(cmd, redirect=open("/dev/null", "w")) def rsync_up(self, source, target, redirect=None, check_error=True): self.set_ssh_ip_if_required() @@ -253,7 +257,6 @@ def ssh_cmd(self, cmd, connect_timeout=120, redirect=None, - verbose=False, allocate_tty=False, emulate_interactive=True, expect_error=False, 
@@ -261,9 +264,8 @@ def ssh_cmd(self, self.set_ssh_ip_if_required() - if verbose: - logger.info("NodeUpdater: " - "Running {} on {}...".format(cmd, self.ssh_ip)) + logger.info("NodeUpdater: Running {} on {}...".format( + cmd, self.ssh_ip)) ssh = ["ssh"] if allocate_tty: ssh.append("-tt") diff --git a/python/ray/scripts/scripts.py b/python/ray/scripts/scripts.py index 3ec867dbeb9e..50a87f4e73cc 100644 --- a/python/ray/scripts/scripts.py +++ b/python/ray/scripts/scripts.py @@ -627,6 +627,11 @@ def submit(cluster_config_file, screen, tmux, stop, start, cluster_name, @cli.command() @click.argument("cluster_config_file", required=True, type=str) @click.argument("cmd", required=True, type=str) +@click.option( + "--docker", + is_flag=True, + default=False, + help="Runs command in the docker container specified in cluster_config.") @click.option( "--stop", is_flag=True, @@ -652,9 +657,9 @@ def submit(cluster_config_file, screen, tmux, stop, start, cluster_name, help="Override the configured cluster name.") @click.option( "--port-forward", required=False, type=int, help="Port to forward.") -def exec_cmd(cluster_config_file, cmd, screen, tmux, stop, start, cluster_name, - port_forward): - exec_cluster(cluster_config_file, cmd, screen, tmux, stop, start, +def exec_cmd(cluster_config_file, cmd, docker, screen, tmux, stop, start, + cluster_name, port_forward): + exec_cluster(cluster_config_file, cmd, docker, screen, tmux, stop, start, cluster_name, port_forward) diff --git a/test/autoscaler_test.py b/test/autoscaler_test.py index 6793898f259e..a33290d5dfa2 100644 --- a/test/autoscaler_test.py +++ b/test/autoscaler_test.py @@ -121,6 +121,7 @@ def terminate_node(self, node_id): "TestProp": 2, }, "file_mounts": {}, + "initialization_commands": ["cmd0"], "setup_commands": ["cmd1"], "head_setup_commands": ["cmd2"], "worker_setup_commands": ["cmd3"],