diff --git a/release/ray_release/cluster_manager/cluster_manager.py b/release/ray_release/cluster_manager/cluster_manager.py index 5455360c2107..60b811fda033 100644 --- a/release/ray_release/cluster_manager/cluster_manager.py +++ b/release/ray_release/cluster_manager/cluster_manager.py @@ -62,7 +62,7 @@ def set_cluster_env(self, cluster_env: Dict[str, Any]): self.cluster_env_name = ( f"{self.project_name}_{self.project_id[4:8]}" - f"__env__{self.test_name}__" + f"__env__{self.test_name.replace('.', '_')}__" f"{dict_hash(self.cluster_env)}" ) diff --git a/release/ray_release/config.py b/release/ray_release/config.py index f6d8fbcf4c7c..47266b02f970 100644 --- a/release/ray_release/config.py +++ b/release/ray_release/config.py @@ -1,3 +1,4 @@ +import copy import json import os import re @@ -12,6 +13,18 @@ class Test(dict): + """A class that represents a test to run on buildkite""" + + pass + + +class TestDefinition(dict): + """ + A class that represents the definition of a test, such as test name, group, etc. + Compared to the Test class, it has additional fields, for example variations, + which can be used to define several variations of a test. 
+ """ + pass @@ -47,10 +60,47 @@ def read_and_validate_release_test_collection( ) -> List[Test]: """Read and validate test collection from config file""" with open(config_file, "rt") as fp: - test_config = yaml.safe_load(fp) + tests = parse_test_definition(yaml.safe_load(fp)) + + validate_release_test_collection(tests, schema_file=schema_file) + return tests + - validate_release_test_collection(test_config, schema_file=schema_file) - return test_config +def _test_definition_invariant( + test_definition: TestDefinition, + invariant: bool, + message: str, +) -> None: + if invariant: + return + raise ReleaseTestConfigError( + f'{test_definition["name"]} has invalid definition: {message}', + ) + + +def parse_test_definition(test_definitions: List[TestDefinition]) -> List[Test]: + tests = [] + for test_definition in test_definitions: + if "variations" not in test_definition: + tests.append(test_definition) + continue + variations = test_definition.pop("variations") + _test_definition_invariant( + test_definition, + variations, + "variations field cannot be empty in a test definition", + ) + for variation in variations: + _test_definition_invariant( + test_definition, + "__suffix__" in variation, + "missing __suffix__ field in a variation", + ) + test = copy.deepcopy(test_definition) + test["name"] = f'{test["name"]}.{variation.pop("__suffix__")}' + test.update(variation) + tests.append(test) + return tests def load_schema_file(path: Optional[str] = None) -> Dict: diff --git a/release/ray_release/tests/test_config.py b/release/ray_release/tests/test_config.py index f43f56362581..80915758b572 100644 --- a/release/ray_release/tests/test_config.py +++ b/release/ray_release/tests/test_config.py @@ -1,5 +1,6 @@ import os import sys +import yaml import pytest from ray_release.config import ( @@ -7,8 +8,10 @@ Test, validate_cluster_compute, load_schema_file, + parse_test_definition, validate_test, ) +from ray_release.exception import ReleaseTestConfigError TEST_COLLECTION_FILE 
= os.path.join( os.path.dirname(__file__), "..", "..", "release_tests.yaml" @@ -40,6 +43,55 @@ ) +def test_parse_test_definition(): + """ + Unit test for the ray_release.config.parse_test_definition function. In particular, + we check that the code correctly parses a test definition that has the 'variations' + field. + """ + test_definitions = yaml.safe_load( + """ + - name: sample_test + working_dir: sample_dir + frequency: nightly + team: sample + cluster: + cluster_env: env.yaml + cluster_compute: compute.yaml + run: + timeout: 100 + script: python script.py + variations: + - __suffix__: aws + - __suffix__: gce + cluster: + cluster_env: env_gce.yaml + cluster_compute: compute_gce.yaml + """ + ) + # Check that parsing returns two tests, one for each variation (aws and gce). Check + # that both tests are valid, and their fields are populated correctly + tests = parse_test_definition(test_definitions) + aws_test = tests[0] + gce_test = tests[1] + schema = load_schema_file() + assert not validate_test(aws_test, schema) + assert not validate_test(gce_test, schema) + assert aws_test["name"] == "sample_test.aws" + assert gce_test["cluster"]["cluster_compute"] == "compute_gce.yaml" + invalid_test_definition = test_definitions[0] + # Intentionally make the test definition invalid by creating an empty 'variations' + # field. Check that the parser throws an exception at runtime + invalid_test_definition["variations"] = [] + with pytest.raises(ReleaseTestConfigError): + parse_test_definition([invalid_test_definition]) + # Intentionally make the test definition invalid by making one 'variation' entry + # missing the __suffix__ entry. 
Check that the parser throws exception at runtime + invalid_test_definition["variations"] = [{"__suffix__": "aws"}, {}] + with pytest.raises(ReleaseTestConfigError): + parse_test_definition([invalid_test_definition]) + + def test_schema_validation(): test = VALID_TEST.copy() diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 1fd8c045e65d..ce229a88f21b 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -1238,12 +1238,9 @@ ####################### # Tune cloud tests ####################### -- name: tune_cloud_aws_no_sync_down +- name: tune_cloud_no_sync_down group: Tune cloud tests working_dir: tune_tests/cloud_tests - - stable: true - frequency: nightly team: ml @@ -1258,38 +1255,19 @@ wait_for_nodes: num_nodes: 4 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + cluster: + cluster_env: app_config.yaml + cluster_compute: tpl_gce_4x8.yaml alert: tune_tests -- name: tune_cloud_gce_no_sync_down - group: Tune cloud tests - working_dir: tune_tests/cloud_tests - - stable: true - - frequency: nightly - team: ml - env: gce - - cluster: - cluster_env: app_config.yaml - cluster_compute: tpl_gce_4x8.yaml - autosuspend_mins: 60 - - run: - timeout: 600 - script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8 - wait_for_nodes: - num_nodes: 4 - - alert: tune_tests - -- name: tune_cloud_aws_ssh_sync +- name: tune_cloud_ssh_sync group: Tune cloud tests working_dir: tune_tests/cloud_tests - - stable: true - frequency: nightly team: ml @@ -1300,42 +1278,22 @@ run: timeout: 600 script: python workloads/run_cloud_test.py ssh_sync - - wait_for_nodes: - num_nodes: 4 - -- name: tune_cloud_gce_ssh_sync - group: Tune cloud tests - working_dir: tune_tests/cloud_tests - - stable: true - frequency: nightly - team: ml - env: gce - - cluster: - cluster_env: app_config.yaml - cluster_compute: tpl_gce_4x8.yaml - autosuspend_mins: 60 - - run: - timeout: 600 - script: python workloads/run_cloud_test.py ssh_sync 
--cpus-per-trial 8 - wait_for_nodes: num_nodes: 4 - alert: tune_tests - + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + cluster: + cluster_env: app_config.yaml + cluster_compute: tpl_gce_4x8.yaml alert: tune_tests -- name: tune_cloud_aws_durable_upload +- name: tune_cloud_durable_upload group: Tune cloud tests working_dir: tune_tests/cloud_tests - - stable: true - frequency: nightly team: ml @@ -1350,27 +1308,18 @@ wait_for_nodes: num_nodes: 4 - - alert: tune_tests - -- name: tune_cloud_gce_durable_upload - group: Tune cloud tests - working_dir: tune_tests/cloud_tests - stable: true - frequency: nightly - team: ml - env: gce - - cluster: - cluster_env: app_config.yaml - cluster_compute: tpl_gce_4x8.yaml - autosuspend_mins: 60 - - run: - timeout: 600 - script: python workloads/run_cloud_test.py durable_upload --cpus-per-trial 8 --bucket gs://tune-cloud-tests/durable_upload - wait_for_nodes: - num_nodes: 4 + variations: + - __suffix__: aws + - __suffix__: gce + env: gce + cluster: + cluster_env: app_config.yaml + cluster_compute: tpl_gce_4x8.yaml + run: + timeout: 600 + script: python workloads/run_cloud_test.py durable_upload --bucket gs://tune-cloud-tests/durable_upload + wait_for_nodes: + num_nodes: 4 alert: tune_tests diff --git a/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml b/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml index 864291602ac4..1c62f09bb7e0 100644 --- a/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml +++ b/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml @@ -7,11 +7,11 @@ max_workers: 3 head_node_type: name: head_node - instance_type: n2-standard-8 + instance_type: n2-standard-2 worker_node_types: - name: worker_node - instance_type: n2-standard-8 + instance_type: n2-standard-2 min_workers: 3 max_workers: 3 use_spot: false