From 57d69f885f04693e6778796067b791f652124cc1 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Sat, 18 Mar 2023 10:30:15 -0700 Subject: [PATCH 1/7] Fix 'Observed wheel commit () is not expected' issue (https://github.com/ray-project/ray/issues/32156) that has been creeping into many of the CI/CD builds in our pipeline. The existing code uses a pipe to read from a rather large file (>50MB). A pipe, however, has a buffer limit, which by default is on the order of kilobytes (https://man7.org/linux/man-pages/man7/pipe.7.html), so what we look for might not be there. We can fix this by telling unzip the exact file we are looking for. That file is pretty small, so we should not hit the buffer limit. You might notice that other surprises might still happen with this fix (e.g. many files that match ^__commit__). This sanity check was added 2 years ago by our veteran Kai (https://github.com/ray-project/ray/commit/234b015b426274d461a15345a4d4724a08bc5289) to catch issues with stale artifacts from previous builds or race conditions between builds. Further investigation into how the Buildkite agent multi-tenancy is set up might or might not simplify this logic further. Signed-off-by: Cuong Nguyen --- ci/ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/ci.sh b/ci/ci.sh index 79527a0a8f23..a1229aee9483 100755 --- a/ci/ci.sh +++ b/ci/ci.sh @@ -415,7 +415,7 @@ validate_wheels_commit_str() { continue fi - WHL_COMMIT=$(unzip -p "$whl" | grep "^__commit__" | awk -F'"' '{print $2}') + WHL_COMMIT=$(unzip -p "$whl" "*ray/__init__.py" | grep "^__commit__" | awk -F'"' '{print $2}') if [ "${WHL_COMMIT}" != "${EXPECTED_COMMIT}" ]; then echo "Error: Observed wheel commit (${WHL_COMMIT}) is not expected commit (${EXPECTED_COMMIT}). Aborting." 
From 5c17ef925369a5d1021e5ff3352fe6235e941c9b Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 20 Mar 2023 09:32:55 -0700 Subject: [PATCH 2/7] Improve wheel commit validation error message Signed-off-by: Cuong Nguyen --- ci/ci.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/ci.sh b/ci/ci.sh index 664c72fd6eb3..c6581e06bf93 100755 --- a/ci/ci.sh +++ b/ci/ci.sh @@ -418,7 +418,7 @@ validate_wheels_commit_str() { WHL_COMMIT=$(unzip -p "$whl" "*ray/__init__.py" | grep "^__commit__" | awk -F'"' '{print $2}') if [ "${WHL_COMMIT}" != "${EXPECTED_COMMIT}" ]; then - echo "Error: Observed wheel commit (${WHL_COMMIT}) is not expected commit (${EXPECTED_COMMIT}). Aborting." + echo "Wheel ${basename} has incorrect commit: (${WHL_COMMIT}) is not expected commit (${EXPECTED_COMMIT}). Aborting." exit 1 fi From 2ddf2c39ca740fb415a9a627817c70092d1069b8 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 20 Mar 2023 15:52:34 -0700 Subject: [PATCH 3/7] Add a sample GCE test Signed-off-by: Cuong Nguyen --- release/release_tests.yaml | 25 +++++++++++++++++++ .../tune_tests/cloud_tests/tpl_gce_4x8.yaml | 15 +++++++++++ 2 files changed, 40 insertions(+) create mode 100644 release/tune_tests/cloud_tests/tpl_gce_4x8.yaml diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 4a3e9117cff8..156dffcd062a 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -1258,6 +1258,31 @@ alert: tune_tests +- name: tune_cloud_gce + group: Tune cloud tests + working_dir: tune_tests/cloud_tests + + stable: true + + frequency: nightly + team: ml + env: prod_v1 + + cluster: + cluster_env: app_config.yaml + cluster_compute: tpl_gce_4x8.yaml + cloud_id: cld_tPsS3nQz8p5cautbyWgEdr4y # anyscale gce + autosuspend_mins: 60 + + run: + timeout: 600 + script: python workloads/run_cloud_test.py no_sync_down --cpus-per-trial 8 + type: client + + wait_for_nodes: + num_nodes: 4 + + alert: tune_tests - name: tune_cloud_gcp_k8s_no_sync_down group: 
Tune cloud tests working_dir: tune_tests/cloud_tests diff --git a/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml b/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml new file mode 100644 index 000000000000..a7ed93058436 --- /dev/null +++ b/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml @@ -0,0 +1,15 @@ +cloud_id: cld_tPsS3nQz8p5cautbyWgEdr4y # anyscale_k8s_gcp_cloud +region: us-west1 + +max_workers: 3 + +head_node_type: + name: head_node + instance_type: n2-standard-8 + +worker_node_types: + - name: worker_node + instance_type: n2-standard-8 + min_workers: 3 + max_workers: 3 + use_spot: false \ No newline at end of file From 74bc24b1d70df48e86ff246203cd7f8c9d03ac4b Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 20 Mar 2023 16:08:03 -0700 Subject: [PATCH 4/7] Disable TEST_ATTR_REGEX_FILTERS for testing Signed-off-by: Cuong Nguyen --- release/ray_release/buildkite/settings.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/release/ray_release/buildkite/settings.py b/release/ray_release/buildkite/settings.py index fc6f2dfa3ab2..67ac6243ec51 100644 --- a/release/ray_release/buildkite/settings.py +++ b/release/ray_release/buildkite/settings.py @@ -166,10 +166,10 @@ def update_settings_from_environment(settings: Dict) -> Dict: "name:" + os.environ["TEST_NAME"] ) - if "TEST_ATTR_REGEX_FILTERS" in os.environ: - settings["test_attr_regex_filters"] = get_test_attr_regex_filters( - os.environ["TEST_ATTR_REGEX_FILTERS"] - ) +# if "TEST_ATTR_REGEX_FILTERS" in os.environ: +# settings["test_attr_regex_filters"] = get_test_attr_regex_filters( +# os.environ["TEST_ATTR_REGEX_FILTERS"] +# ) if "RELEASE_PRIORITY" in os.environ: settings["priority"] = get_priority(os.environ["RELEASE_PRIORITY"]) From d23793bbf3ea0617a79c8731199b87a83a8486a8 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 20 Mar 2023 18:00:18 -0700 Subject: [PATCH 5/7] Make tests running on staging v2 Signed-off-by: Cuong Nguyen --- release/release_tests.yaml | 2 +- 1 file 
changed, 1 insertion(+), 1 deletion(-) diff --git a/release/release_tests.yaml b/release/release_tests.yaml index 1df4765f71ad..7e574f041bc8 100644 --- a/release/release_tests.yaml +++ b/release/release_tests.yaml @@ -1321,7 +1321,7 @@ frequency: nightly team: ml - env: prod_v1 + env: staging_v2 cluster: cluster_env: app_config.yaml From 4f404a2b1ed3eb092ee1623df8e24ce0cc4c6f4e Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 20 Mar 2023 19:45:41 -0700 Subject: [PATCH 6/7] Fix compute configs Signed-off-by: Cuong Nguyen --- release/tune_tests/cloud_tests/tpl_gce_4x8.yaml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml b/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml index a7ed93058436..c6a7cb7a306a 100644 --- a/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml +++ b/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml @@ -1,5 +1,7 @@ -cloud_id: cld_tPsS3nQz8p5cautbyWgEdr4y # anyscale_k8s_gcp_cloud +cloud_id: cld_tPsS3nQz8p5cautbyWgEdr4y # anyscale_gce_cloud region: us-west1 +allowed_azs: + - us-west1-a max_workers: 3 @@ -9,7 +11,7 @@ head_node_type: worker_node_types: - name: worker_node - instance_type: n2-standard-8 + instance_type: n2-standard-16 min_workers: 3 max_workers: 3 use_spot: false \ No newline at end of file From 54519c9c27dc9c33cd47abbeb5068608766a3a37 Mon Sep 17 00:00:00 2001 From: Cuong Nguyen Date: Mon, 20 Mar 2023 20:17:52 -0700 Subject: [PATCH 7/7] Use n2-standard-8 machines Signed-off-by: Cuong Nguyen --- release/tune_tests/cloud_tests/tpl_gce_4x8.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml b/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml index c6a7cb7a306a..4f6e64f3241e 100644 --- a/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml +++ b/release/tune_tests/cloud_tests/tpl_gce_4x8.yaml @@ -11,7 +11,7 @@ head_node_type: worker_node_types: - name: worker_node - instance_type: n2-standard-16 + 
instance_type: n2-standard-8 min_workers: 3 max_workers: 3 use_spot: false \ No newline at end of file