Skip to content

Commit

Permalink
[CI][GCI/3] Add variations attribute to create tests in multiple clus…
Browse files Browse the repository at this point in the history
…ter environment (ray-project#33718)

This diff adds a 'variations' attribute to release test definitions, and updates the parser to interpret this attribute.

This attribute is used when one wants to define several flavors for a test definition. Each flavor defines a set of test parameters, such as its environment, cluster compute, etc. A test will then be created for each flavor.

Signed-off-by: Cuong Nguyen <[email protected]>
Signed-off-by: elliottower <[email protected]>
  • Loading branch information
can-anyscale authored and elliottower committed Apr 22, 2023
1 parent a9aebbc commit fa2ef5a
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 86 deletions.
2 changes: 1 addition & 1 deletion release/ray_release/cluster_manager/cluster_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ def set_cluster_env(self, cluster_env: Dict[str, Any]):

self.cluster_env_name = (
f"{self.project_name}_{self.project_id[4:8]}"
f"__env__{self.test_name}__"
f"__env__{self.test_name.replace('.', '_')}__"
f"{dict_hash(self.cluster_env)}"
)

Expand Down
56 changes: 53 additions & 3 deletions release/ray_release/config.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import copy
import json
import os
import re
Expand All @@ -12,6 +13,18 @@


class Test(dict):
    """A dict subclass representing a single test to run on buildkite."""


class TestDefinition(dict):
    """
    A dict subclass representing the definition of a test, such as its name, group,
    etc. Compared to the Test class, a definition may carry additional fields, for
    example 'variations', which can be used to define several variations of a test.
    """


Expand Down Expand Up @@ -47,10 +60,47 @@ def read_and_validate_release_test_collection(
) -> List[Test]:
"""Read and validate test collection from config file"""
with open(config_file, "rt") as fp:
test_config = yaml.safe_load(fp)
tests = parse_test_definition(yaml.safe_load(fp))

validate_release_test_collection(tests, schema_file=schema_file)
return tests


validate_release_test_collection(test_config, schema_file=schema_file)
return test_config
def _test_definition_invariant(
    test_definition: TestDefinition,
    invariant: bool,
    message: str,
) -> None:
    """
    Check that an invariant holds for a test definition.

    Does nothing when 'invariant' is truthy. Otherwise raises a
    ReleaseTestConfigError whose message names the test definition (via its "name"
    field) and includes 'message'.
    """
    if not invariant:
        raise ReleaseTestConfigError(
            f'{test_definition["name"]} has invalid definition: {message}',
        )


def parse_test_definition(test_definitions: List[TestDefinition]) -> List[Test]:
    """
    Expand a list of test definitions into the list of concrete tests.

    A definition without a 'variations' field is passed through unchanged (the very
    same object is returned in the result). A definition with a 'variations' field
    yields one test per variation: a deep copy of the definition without the
    'variations' field, whose name is extended with '.<__suffix__>' and whose
    top-level fields are then overridden by the remaining fields of the variation.
    The override is a plain dict update, so a nested field given in a variation
    (e.g. 'cluster') replaces the whole corresponding field of the definition.

    The input definitions and their variations are not modified.

    Raises:
        ReleaseTestConfigError: if 'variations' is empty, or if a variation is
            missing the '__suffix__' field.
    """
    tests = []
    for test_definition in test_definitions:
        if "variations" not in test_definition:
            tests.append(test_definition)
            continue
        variations = test_definition["variations"]
        _test_definition_invariant(
            test_definition,
            variations,
            "variations field cannot be empty in a test definition",
        )
        # Work on a shallow copy so that dropping 'variations' does not alter the
        # caller's definition; each generated test deep-copies this base below.
        base_definition = copy.copy(test_definition)
        del base_definition["variations"]
        for variation in variations:
            _test_definition_invariant(
                test_definition,
                "__suffix__" in variation,
                "missing __suffix__ field in a variation",
            )
            # Copy the variation so that removing '__suffix__' leaves the input
            # intact and the generated test shares no nested objects with it.
            overrides = copy.deepcopy(variation)
            suffix = overrides.pop("__suffix__")
            test = copy.deepcopy(base_definition)
            test["name"] = f'{test["name"]}.{suffix}'
            test.update(overrides)
            tests.append(test)
    return tests


def load_schema_file(path: Optional[str] = None) -> Dict:
Expand Down
52 changes: 52 additions & 0 deletions release/ray_release/tests/test_config.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,17 @@
import os
import sys
import yaml
import pytest

from ray_release.config import (
read_and_validate_release_test_collection,
Test,
validate_cluster_compute,
load_schema_file,
parse_test_definition,
validate_test,
)
from ray_release.exception import ReleaseTestConfigError

TEST_COLLECTION_FILE = os.path.join(
os.path.dirname(__file__), "..", "..", "release_tests.yaml"
Expand Down Expand Up @@ -40,6 +43,55 @@
)


def test_parse_test_definition():
"""
Unit test for the ray_release.config.parse_test_definition function. In particular,
we check that the code correctly parses a test definition that has the 'variations'
field, and that it rejects invalid 'variations' fields.
"""
test_definitions = yaml.safe_load(
"""
- name: sample_test
working_dir: sample_dir
frequency: nightly
team: sample
cluster:
cluster_env: env.yaml
cluster_compute: compute.yaml
run:
timeout: 100
script: python script.py
variations:
- __suffix__: aws
- __suffix__: gce
cluster:
cluster_env: env_gce.yaml
cluster_compute: compute_gce.yaml
"""
)
# Check that parsing returns two tests, one for each variation (aws and gce). Check
# that both tests are valid, and that their fields are populated correctly: the name
# gets the variation suffix appended, and the gce variation overrides 'cluster'.
tests = parse_test_definition(test_definitions)
aws_test = tests[0]
gce_test = tests[1]
schema = load_schema_file()
assert not validate_test(aws_test, schema)
assert not validate_test(gce_test, schema)
assert aws_test["name"] == "sample_test.aws"
assert gce_test["cluster"]["cluster_compute"] == "compute_gce.yaml"
invalid_test_definition = test_definitions[0]
# Intentionally make the test definition invalid by creating an empty 'variations'
# field. Check that the parser raises an exception at runtime.
invalid_test_definition["variations"] = []
with pytest.raises(ReleaseTestConfigError):
parse_test_definition([invalid_test_definition])
# Intentionally make the test definition invalid by adding a 'variations' entry
# that is missing the __suffix__ field. Check that the parser raises an exception
# at runtime.
invalid_test_definition["variations"] = [{"__suffix__": "aws"}, {}]
with pytest.raises(ReleaseTestConfigError):
parse_test_definition([invalid_test_definition])


def test_schema_validation():
test = VALID_TEST.copy()

Expand Down
109 changes: 29 additions & 80 deletions release/release_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1238,12 +1238,9 @@
#######################
# Tune cloud tests
#######################
- name: tune_cloud_aws_no_sync_down
- name: tune_cloud_no_sync_down
group: Tune cloud tests
working_dir: tune_tests/cloud_tests

stable: true

frequency: nightly
team: ml

Expand All @@ -1258,38 +1255,19 @@
wait_for_nodes:
num_nodes: 4

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
cluster:
cluster_env: app_config.yaml
cluster_compute: tpl_gce_4x8.yaml

alert: tune_tests

- name: tune_cloud_gce_no_sync_down
group: Tune cloud tests
working_dir: tune_tests/cloud_tests

stable: true

frequency: nightly
team: ml
env: gce

cluster:
cluster_env: app_config.yaml
cluster_compute: tpl_gce_4x8.yaml
autosuspend_mins: 60

run:
timeout: 600
script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8
wait_for_nodes:
num_nodes: 4

alert: tune_tests

- name: tune_cloud_aws_ssh_sync
- name: tune_cloud_ssh_sync
group: Tune cloud tests
working_dir: tune_tests/cloud_tests

stable: true

frequency: nightly
team: ml

Expand All @@ -1300,42 +1278,22 @@
run:
timeout: 600
script: python workloads/run_cloud_test.py ssh_sync

wait_for_nodes:
num_nodes: 4

- name: tune_cloud_gce_ssh_sync
group: Tune cloud tests
working_dir: tune_tests/cloud_tests

stable: true
frequency: nightly
team: ml
env: gce

cluster:
cluster_env: app_config.yaml
cluster_compute: tpl_gce_4x8.yaml
autosuspend_mins: 60

run:
timeout: 600
script: python workloads/run_cloud_test.py ssh_sync --cpus-per-trial 8

wait_for_nodes:
num_nodes: 4

alert: tune_tests

variations:
- __suffix__: aws
- __suffix__: gce
env: gce
cluster:
cluster_env: app_config.yaml
cluster_compute: tpl_gce_4x8.yaml

alert: tune_tests

- name: tune_cloud_aws_durable_upload
- name: tune_cloud_durable_upload
group: Tune cloud tests
working_dir: tune_tests/cloud_tests

stable: true

frequency: nightly
team: ml

Expand All @@ -1350,27 +1308,18 @@
wait_for_nodes:
num_nodes: 4


alert: tune_tests

- name: tune_cloud_gce_durable_upload
group: Tune cloud tests
working_dir: tune_tests/cloud_tests
stable: true
frequency: nightly
team: ml
env: gce

cluster:
cluster_env: app_config.yaml
cluster_compute: tpl_gce_4x8.yaml
autosuspend_mins: 60

run:
timeout: 600
script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://tune-cloud-tests/durable_upload
wait_for_nodes:
num_nodes: 4
variations:
- __suffix__: aws
- __suffix__: gce
env: gce
cluster:
cluster_env: app_config.yaml
cluster_compute: tpl_gce_4x8.yaml
run:
timeout: 600
script: python workloads/run_cloud_test.py durable_upload --bucket gs://tune-cloud-tests/durable_upload
wait_for_nodes:
num_nodes: 4

alert: tune_tests

Expand Down
4 changes: 2 additions & 2 deletions release/tune_tests/cloud_tests/tpl_gce_4x8.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,11 @@ max_workers: 3

head_node_type:
name: head_node
instance_type: n2-standard-8
instance_type: n2-standard-2

worker_node_types:
- name: worker_node
instance_type: n2-standard-8
instance_type: n2-standard-2
min_workers: 3
max_workers: 3
use_spot: false

0 comments on commit fa2ef5a

Please sign in to comment.