Skip to content

Commit

Permalink
[Bug] compatibility test for the nightly Ray image fails (#1055)
Browse files Browse the repository at this point in the history
compatibility test for the nightly Ray image fails
  • Loading branch information
kevin85421 authored Apr 28, 2023
1 parent 2019b4b commit 2b136c9
Show file tree
Hide file tree
Showing 4 changed files with 18 additions and 35 deletions.
8 changes: 2 additions & 6 deletions tests/compatibility-test.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,13 +115,9 @@ def test_ray_serve(self):
headpod = get_head_pod(RayFTTestCase.ray_cluster_ns)
headpod_name = headpod.metadata.name

# RAY_NAMESPACE is an abstraction in Ray. It is not a Kubernetes namespace.
ray_namespace = ''.join(random.choices(string.ascii_lowercase, k=10))
logger.info('Ray namespace: %s', ray_namespace)

# Deploy a Ray Serve model.
exit_code = pod_exec_command(headpod_name, RayFTTestCase.ray_cluster_ns,
f" python samples/test_ray_serve_1.py {ray_namespace}",
"python samples/test_ray_serve_1.py",
check = False
)

Expand All @@ -148,7 +144,7 @@ def test_ray_serve(self):
headpod = get_head_pod(RayFTTestCase.ray_cluster_ns)
headpod_name = headpod.metadata.name
exit_code = pod_exec_command(headpod_name, RayFTTestCase.ray_cluster_ns,
f" python samples/test_ray_serve_2.py {ray_namespace}",
"python samples/test_ray_serve_2.py",
check = False
)

Expand Down
32 changes: 12 additions & 20 deletions tests/config/ray-cluster.ray-ft.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ spec:
headGroupSpec:
rayStartParams:
dashboard-host: "0.0.0.0"
num-cpus: "1"
num-cpus: "0"
redis-password: "5241590000000000"
#pod template
template:
Expand Down Expand Up @@ -130,7 +130,7 @@ spec:
path: test_detached_actor_2.py
workerGroupSpecs:
# the number of worker pod replicas in this group
- replicas: 2
- replicas: 1
minReplicas: 1
maxReplicas: 2
# logical group name (here called small-group); it can also be a functional name
Expand Down Expand Up @@ -167,12 +167,8 @@ metadata:
name: test-script
data:
test_ray_serve_1.py: |
import requests
from starlette.requests import Request
import ray
from ray import serve
import time
import sys

# 1: Define a Ray Serve model.
@serve.deployment(route_prefix="/")
Expand All @@ -183,7 +179,7 @@ data:
def __call__(self):
return self._msg

ray.init(address='ray://127.0.0.1:10001', namespace=sys.argv[1])
ray.init(address='ray://127.0.0.1:10001')
# 2: Deploy the model.
handle = serve.run(MyModelDeployment.bind(msg="Hello world!"))
# 3: Query the deployment and print the result.
Expand All @@ -192,10 +188,8 @@ data:
assert(val == "Hello world!")

test_ray_serve_2.py: |
import ray
import requests
import time
import sys
from ray import serve

def retry_with_timeout(func, timeout=90):
err = None
Expand All @@ -209,16 +203,14 @@ data:
time.sleep(1)
raise err

print("Execute ray.init()")
retry_with_timeout(lambda: ray.init(address='ray://127.0.0.1:10001', namespace=sys.argv[1]))

print("Execute serve.start()")
retry_with_timeout(lambda:serve.start(detached=True))
def send_req():
response = requests.get('http://127.0.0.1:8000', timeout=10)
print('Response status code:', response.status_code)
print('Response headers:', response.headers)
print('Response content:', response.text)
assert(response.text == "Hello world!")

print("Execute ray.get()")
val = retry_with_timeout(lambda: ray.get(serve.get_deployment("MyModelDeployment").get_handle().remote()))
print(val)
assert(val == "Hello world!")
retry_with_timeout(send_req, 180)

test_detached_actor_1.py: |
import ray
Expand Down Expand Up @@ -277,4 +269,4 @@ data:

# The actual value should be 1 rather than 2. Ray will launch all registered actors when
# the ray cluster restarts, but the internal state of the actors will not be restored.
assert(val == 1)
assert(val == 3)
10 changes: 1 addition & 9 deletions tests/config/ray-service.yaml.template
Original file line number Diff line number Diff line change
Expand Up @@ -44,13 +44,7 @@ spec:
headGroupSpec:
# the following params are used to complete the ray start: ray start --head --block ...
rayStartParams:
port: '6379' # should match container port named gcs-server
#include_webui: 'true'
object-store-memory: '100000000'
# webui_host: "10.1.2.60"
dashboard-host: '0.0.0.0'
num-cpus: '2' # can be auto-completed from the limits
num-cpus: "1000"
#pod template
template:
metadata:
Expand Down Expand Up @@ -88,9 +82,7 @@ spec:
# - raycluster-complete-worker-small-group-hv457
# - raycluster-complete-worker-small-group-k8tj7
# the following params are used to complete the ray start: ray start --block --node-ip-address= ...
rayStartParams:
node-ip-address: $$MY_POD_IP
num-cpus: "1000"
rayStartParams: {}
#pod template
template:
spec:
Expand Down
3 changes: 3 additions & 0 deletions tests/framework/prototype.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,9 @@ def check_pod_running(pods) -> bool:
for pod in pods:
if pod.status.phase != 'Running':
return False
for container in pod.status.container_statuses:
if not container.ready:
return False
return True

def get_expected_head_pods(custom_resource):
Expand Down

0 comments on commit 2b136c9

Please sign in to comment.