[Core] Upgrade ray to 2.3.0 #1618

Closed
wants to merge 23 commits into from
Changes from 6 commits
Commits
23 commits
ca94eba
update the patches
Michaelvll Jan 23, 2023
1651e7f
upgrade node providers
Michaelvll Jan 23, 2023
55dd5b4
Merge branch 'master' of github.com:concretevitamin/sky-experiments i…
Michaelvll Jan 24, 2023
dc3c14f
Merge branch 'master' of github.com:concretevitamin/sky-experiments i…
Michaelvll Jan 24, 2023
d4ea222
fix azure config.py
Michaelvll Jan 24, 2023
ffb6f7b
print sky queue
Michaelvll Jan 24, 2023
d33593e
add back azure disk size
Michaelvll Jan 24, 2023
380d8b6
fix job manager
Michaelvll Jan 26, 2023
16fe424
Merge branch 'master' of github.com:concretevitamin/sky-experiments i…
Michaelvll Jan 26, 2023
c21d46d
fix hash
Michaelvll Jan 26, 2023
99cc5dc
longer timeout
Michaelvll Jan 26, 2023
5ad228d
fix test smoke
Michaelvll Jan 26, 2023
e3f0c60
Remove the patch for job_manager
Michaelvll Jan 26, 2023
3e42635
longer timeout for azure_region test
Michaelvll Jan 27, 2023
e0d8e7c
Merge branch 'master' of github.com:concretevitamin/sky-experiments i…
Michaelvll Feb 1, 2023
0cb298b
address comments
Michaelvll Feb 1, 2023
366173b
Merge branch 'master' of github.com:concretevitamin/sky-experiments i…
Michaelvll Feb 13, 2023
caee0e1
format
Michaelvll Feb 13, 2023
4e280a3
fix templates
Michaelvll Feb 13, 2023
582b0ba
pip install --exists-action
Michaelvll Feb 13, 2023
4351433
Upgrade to 2.3 instead
Michaelvll Feb 27, 2023
79627b8
upgrade to ray 2.3
Michaelvll Feb 27, 2023
9cc992e
Merge branch 'master' of github.com:concretevitamin/sky-experiments i…
Michaelvll Feb 27, 2023
63 changes: 0 additions & 63 deletions sky/backends/backend_utils.py
@@ -1587,69 +1587,6 @@ def _process_cli_query(
]


def _ray_launch_hash(cluster_name: str,
                     ray_config: Dict[str, Any]) -> Optional[Set[str]]:
    """Returns a set of Ray launch config hashes, one per node type.

    This returns None if ray's _bootstrap_config() failed to return, which can
    happen if node providers' bootstrapping phase (config.py) raises an error
    (which *should* only happen on errors prior to nodes launching, e.g.,
    VPC/subnet setup).
    """
    # Use the cached Ray launch hashes if they exist.
    metadata = global_user_state.get_cluster_metadata(cluster_name)
    assert metadata is not None, cluster_name
    ray_launch_hashes = metadata.get('ray_launch_hashes', None)
    if ray_launch_hashes is not None:
        logger.debug('Using cached launch_hashes.')
        return set(ray_launch_hashes)
    try:
        with ux_utils.suppress_output():
            ray_config = ray_commands._bootstrap_config(ray_config)  # pylint: disable=protected-access
    except RuntimeError as e:
        # TODO(zongheng): is this safe? Could it be node(s) are live but somehow a
        # separate status refresh hits such errors?
        if 'SKYPILOT_ERROR_NO_NODES_LAUNCHED' in str(e):
            logger.error(f'Error found when refreshing cluster status: {e}')
            return None
        raise e
    # Adopted from https://github.com/ray-project/ray/blob/ray-2.2.0/python/ray/autoscaler/_private/node_launcher.py#L71-L81
    # TODO(zhwu): this logic is duplicated from the ray code above (keep in
    # sync).
    launch_hashes = set()
    head_node_type = ray_config['head_node_type']
    for node_type, node_config in ray_config['available_node_types'].items():
        if node_type == head_node_type:
            launch_config = ray_config.get('head_node', {})
            auth_config = ray_config['auth']
        else:
            launch_config = ray_config.get('worker_nodes', {})
            auth_config = dict(ray_config['auth'])
        # Why pop ssh_proxy_command for both head and workers:
        #
        # When we launch the head node from the local client: our call to `ray
        # up` has a monkey-patched version of hash_launch_conf(), which drops
        # this field.
        #
        # When the head node launches worker nodes: On the head node,
        # ~/ray_bootstrap_config.yaml, which has any ssh_proxy_command field
        # removed (see Ray's autoscaler/_private/commands.py), is passed to the
        # autoscaler. Therefore when Ray calculates the hash for workers,
        # ssh_proxy_command is not included. Here we follow this (otherwise our
        # hash here would not match with what's on the console for workers).
        auth_config.pop('ssh_proxy_command', None)
        launch_config = copy.deepcopy(launch_config)
        launch_config.update(node_config['node_config'])
        with ux_utils.suppress_output():
            current_hash = ray_autoscaler_private_util.hash_launch_conf(
                launch_config, auth_config)
        launch_hashes.add(current_hash)
    # Cache the launch hashes for the cluster.
    metadata['ray_launch_hashes'] = list(launch_hashes)
    global_user_state.set_cluster_metadata(cluster_name, metadata)
    return launch_hashes


def _query_status_aws(
cluster: str,
ray_config: Dict[str, Any],
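For context on the deletion above: _ray_launch_hash relied on Ray's private hash_launch_conf to compare launch configs between the local client and the head node. Below is a minimal sketch of that kind of hashing, only to illustrate why ssh_proxy_command is popped on both sides; it assumes a plain JSON + SHA-1 digest, whereas Ray's real helper differs in detail.

```python
# Hedged sketch: NOT Ray's actual hash_launch_conf, just an illustration of
# hashing a (launch_config, auth_config) pair after dropping ssh_proxy_command.
import hashlib
import json
from typing import Any, Dict


def hash_launch_conf_sketch(launch_config: Dict[str, Any],
                            auth_config: Dict[str, Any]) -> str:
    auth_config = dict(auth_config)
    # Dropped on both the client and the head node so the two sides agree.
    auth_config.pop('ssh_proxy_command', None)
    payload = json.dumps((launch_config, auth_config),
                         sort_keys=True).encode('utf-8')
    return hashlib.sha1(payload).hexdigest()
```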
4 changes: 2 additions & 2 deletions sky/setup_files/setup.py
@@ -64,9 +64,9 @@ def parse_readme(readme: str) -> str:

install_requires = [
'wheel',
# NOTE: ray 2.2.0 requires click<=8.0.4,>=7.0; We disable the
# NOTE: ray 2.2.0 requires click>=7.0; We disable the
Member:
Local ray versions may be older than 2.2. Does that mean the click<=8.0.4 constraint is still needed?

Collaborator Author:
I decided to upgrade the local ray version to 2.2.0 due to a bunch of dependency conflicts in #1734. Wdyt?

# shell completion for click<8.0 for backward compatibility.
'click<=8.0.4,>=7.0',
'click>=7.0',
# NOTE: required by awscli. To avoid ray automatically installing
# the latest version.
'colorama<0.4.5',
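On the click pin discussion above: rather than hard-coding a constraint, one can inspect what the locally installed ray actually declares. A small stdlib-only sketch (assumes ray and click are installed in the current environment):

```python
# Hedged sketch: print the installed ray/click versions and ray's declared
# click requirement, using only the standard library (Python 3.8+).
from importlib.metadata import requires, version

print('ray:', version('ray'))
print('click:', version('click'))
print([r for r in (requires('ray') or []) if r.lower().startswith('click')])
```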
2 changes: 1 addition & 1 deletion sky/skylet/providers/aws/cloudwatch/cloudwatch_helper.py
@@ -9,7 +9,7 @@

import botocore

from ray.autoscaler._private.aws.utils import client_cache, resource_cache
from sky.skylet.providers.aws.utils import client_cache, resource_cache
from ray.autoscaler.tags import NODE_KIND_HEAD, TAG_RAY_CLUSTER_NAME, TAG_RAY_NODE_KIND

logger = logging.getLogger(__name__)
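For context on the import change above: client_cache and resource_cache are now taken from SkyPilot's vendored sky.skylet.providers.aws.utils instead of ray.autoscaler._private.aws.utils. A minimal sketch of what such cached boto3 factories might look like; the real SkyPilot helpers may take extra arguments (retry config, endpoints), so treat this as illustrative only.

```python
# Hedged sketch of cached boto3 client/resource factories (illustrative only).
from functools import lru_cache

import boto3


@lru_cache(maxsize=None)
def client_cache(service: str, region: str):
    """Return one boto3 client per (service, region), reused across calls."""
    return boto3.client(service, region_name=region)


@lru_cache(maxsize=None)
def resource_cache(service: str, region: str):
    """Return one boto3 resource per (service, region), reused across calls."""
    return boto3.resource(service, region_name=region)
```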
4 changes: 2 additions & 2 deletions sky/skylet/providers/aws/node_provider.py
@@ -11,7 +11,7 @@
try:
import ray._private.ray_constants as ray_constants
except ImportError:
# SkyPilot: for local ray version lower than 2.2.0
# SkyPilot: for local ray version lower than 2.0.1
import ray.ray_constants as ray_constants
from sky.skylet.providers.aws.cloudwatch.cloudwatch_helper import (
CloudwatchHelper,
@@ -502,7 +502,7 @@ def _create_node(self, node_config, tags, count):
# SkyPilot: do not adopt the changes from upstream in
# https://github.com/ray-project/ray/commit/c2abfdb2f7eee7f3e4320cb0d9e8e3bd639d5680#diff-eeb7bc1d8342583cf12c40536240dbcc67f089466a18a37bd60f187265a2dc94
# which replaces the exception to NodeLaunchException. As we directly
# handle the exception output in
# handle the exception output in
# cloud_vm_ray_backend._update_blocklist_on_aws_error
cli_logger.abort(
"Failed to launch instances. Max attempts exceeded.",
Expand Down
5 changes: 4 additions & 1 deletion sky/skylet/providers/azure/node_provider.py
@@ -61,7 +61,10 @@ def __init__(self, provider_config, cluster_name):
# group after tearing down the cluster. To comfort the autoscaler, we need
# to create/update it here, so the resource group always exists.
from sky.skylet.providers.azure.config import _configure_resource_group
_configure_resource_group({"cluster_name": cluster_name, "provider": provider_config})

_configure_resource_group(
{"cluster_name": cluster_name, "provider": provider_config}
)
subscription_id = provider_config["subscription_id"]
self.cache_stopped_nodes = provider_config.get("cache_stopped_nodes", True)
# Sky only supports Azure CLI credential for now.
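On the Azure change above: __init__ now calls _configure_resource_group so the resource group always exists before the autoscaler touches it. A hedged sketch of an idempotent version of that step using the Azure SDK with the CLI credential (the only credential Sky supports here); the group name and location below are made up for illustration, and this is not SkyPilot's actual implementation.

```python
# Hedged sketch, not SkyPilot's actual _configure_resource_group.
from azure.identity import AzureCliCredential
from azure.mgmt.resource import ResourceManagementClient


def ensure_resource_group(subscription_id: str, name: str, location: str) -> None:
    client = ResourceManagementClient(AzureCliCredential(), subscription_id)
    # create_or_update is idempotent: it creates the group if missing and
    # leaves an existing one in place.
    client.resource_groups.create_or_update(name, {'location': location})


# ensure_resource_group('<subscription-id>', 'my-cluster-rg', 'westus2')
```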
2 changes: 1 addition & 1 deletion sky/skylet/ray_patches/log_monitor.py.patch
@@ -1,7 +1,7 @@
0a1,4
> # Adapted from https://github.com/ray-project/ray/blob/ray-2.2.0/python/ray/_private/log_monitor.py
> # Fixed the problem for progress bar, as the latest version does not preserve \r for progress bar.
> # The change is adapted from https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300
> # It is reverted to https://github.com/ray-project/ray/blob/ray-1.10.0/python/ray/_private/log_monitor.py#L299-L300
Member:
A bit confused: what is reverted to what...? Not understanding this line given L2.

Collaborator Author:
Fixed in #1734. PTAL. :)

>
351c355,356
< next_line = next_line.rstrip("\r\n")
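For the log_monitor patch above: the reason to keep "\r" is that progress bars overwrite the same terminal line with carriage returns, and stripping "\r" when forwarding log lines turns each update into a new line. A small self-contained illustration:

```python
# Demonstrates why rstrip("\n") (keep "\r") differs from rstrip("\r\n").
import sys
import time

for pct in range(0, 101, 25):
    sys.stdout.write(f'\rprogress: {pct:3d}%')  # '\r' rewrites the same line
    sys.stdout.flush()
    time.sleep(0.1)
sys.stdout.write('\n')

line = 'progress: 100%\r\n'
print(repr(line.rstrip('\n')))    # 'progress: 100%\r' -> keeps the '\r'
print(repr(line.rstrip('\r\n')))  # 'progress: 100%'   -> drops it
```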
4 changes: 2 additions & 2 deletions sky/templates/aws-ray.yml.j2
@@ -140,8 +140,8 @@ setup_commands:
(type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
(which conda > /dev/null 2>&1 && conda init > /dev/null) || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true);
source ~/.bashrc;
(pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
(pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[aws]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
(pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
(pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[aws]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
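On the grep change in the setup_commands above (grep ray → grep "ray ", and similarly for skypilot): the quoted trailing space makes the check match the exact package name rather than any package whose name merely contains it. A tiny illustration with made-up pip3 list output lines:

```python
# Hypothetical `pip3 list` output lines, for illustration only.
lines = ['ray-cpp 2.3.0', 'ray 2.3.0', 'skypilot-nightly 1.0.0.dev0']

print([l for l in lines if 'ray' in l])   # substring: also matches 'ray-cpp'
print([l for l in lines if 'ray ' in l])  # name + space: only the real 'ray'
```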
4 changes: 2 additions & 2 deletions sky/templates/azure-ray.yml.j2
@@ -124,8 +124,8 @@ setup_commands:
(type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
which conda > /dev/null 2>&1 || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(/home/azureuser/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true);
source ~/.bashrc;
(pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful;
(pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[azure]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
(pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful;
Member:
nit: spell out --exists-action wipe for clarity

Why do we need it now? It seems like the grep would've ensured that when we reach pip3 install there's no existing package?

Collaborator Author (@Michaelvll, May 23, 2023):
This is needed because otherwise the previously patched files would not be removed from the package, leaving the upgraded ray package corrupted by stale files. This makes sure that an existing VM upgraded with sky launch (upgrading the ray version) will not have stale files in the ray package.

Re "nit: spell out --exists-action wipe for clarity": it seems only w is valid, but wipe is not.

(pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[azure]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
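On the --exists-action discussion above: a hedged sketch of the same reinstall step driven from Python rather than the template's shell one-liner. Per the thread, pip only accepts the single-letter form (w for wipe), which removes files left over from a previously patched install before the upgrade:

```python
# Hedged sketch: reinstall ray, wiping any pre-existing (possibly patched)
# files so the upgraded package is not corrupted by stale leftovers.
import subprocess
import sys

subprocess.run(
    [sys.executable, '-m', 'pip', 'install', '--exists-action', 'w',
     '-U', 'ray[default]==2.3.0'],
    check=True,
)
```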
4 changes: 2 additions & 2 deletions sky/templates/gcp-ray.yml.j2
@@ -164,8 +164,8 @@ setup_commands:
test -f /home/gcpuser/miniconda3/etc/profile.d/conda.sh && source /home/gcpuser/miniconda3/etc/profile.d/conda.sh && conda activate base || true;
pip3 install --upgrade google-api-python-client;
{%- endif %}
(pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
(pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[gcp]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
(pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;
(pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[gcp]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
4 changes: 2 additions & 2 deletions sky/templates/lambda-ray.yml.j2
@@ -77,8 +77,8 @@ setup_commands:
(type -a pip | grep -q pip3) || echo 'alias pip=pip3' >> ~/.bashrc;
which conda > /dev/null 2>&1 || (wget -nc https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh && bash Miniconda3-latest-Linux-x86_64.sh -b && eval "$(~/miniconda3/bin/conda shell.bash hook)" && conda init && conda config --set auto_activate_base true);
source ~/.bashrc;
(pip3 list | grep ray | grep {{ray_version}} 2>&1 > /dev/null || pip3 install -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful;
(pip3 list | grep skypilot && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[lambda]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
(pip3 list | grep "ray " | grep {{ray_version}} 2>&1 > /dev/null || pip3 install --exists-action w -U ray[default]=={{ray_version}}) && mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app && touch ~/.sudo_as_admin_successful;
(pip3 list | grep "skypilot " && [ "$(cat {{sky_remote_path}}/current_sky_wheel_hash)" == "{{sky_wheel_hash}}" ]) || (pip3 uninstall skypilot -y; pip3 install "$(echo {{sky_remote_path}}/{{sky_wheel_hash}}/skypilot-{{sky_version}}*.whl)[lambda]" && echo "{{sky_wheel_hash}}" > {{sky_remote_path}}/current_sky_wheel_hash || exit 1);
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf'); sudo systemctl set-property user-$(id -u $(whoami)).slice TasksMax=infinity; sudo systemctl daemon-reload;
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;