From 28258576d0bf8bbca52dfcbdcd7fd76fc58213f1 Mon Sep 17 00:00:00 2001 From: "Jerry J. Harrow" <84593277+jerryharrow@users.noreply.github.com> Date: Tue, 13 Jun 2023 17:33:58 -0400 Subject: [PATCH] ci: FE-14 Migrate test-e2e-slurm to GCP slurmcluster (#879) Migrate the test-e2e-slurm from running on znode to running on gcp. For each circle-ci job it creates a devbox that runs the test cases for e2e_slurm. This allows multiple users to run the jobs at the same time since each devbox is unique. Acceptance Criteria The test-e2e-slurm suite content at gate runs on gcp instead of znode* as suite test-e2e-slurm-singularity-gcp The current test-e2e-slurm suite instead runs only nightly on znode (could be renamed to test-e2e-slurm-znode) Multiple users can run it at the same time without conflicting (currently I believe there is a single circleci-gcp-dev-box VM) --- .circleci/config.yml | 125 ++++++++++++++++++++++++++ e2e_tests/tests/cluster/test_slurm.py | 1 + tools/slurm/scripts/slurmcluster.sh | 2 +- tools/slurm/scripts/slurmcluster.yaml | 4 +- 4 files changed, 129 insertions(+), 3 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index 2224ac6ece3..7b0c1e6733f 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -2073,6 +2073,120 @@ jobs: steps: - run: echo "Test suite disabled." + test-e2e-slurm-gcp: + parameters: + mark: + type: string + parallelism: + type: integer + default: 1 + extra-pytest-flags: + type: string + default: "" + slack-mentions: + type: string + default: "" + slack-channel: + type: string + default: "" + instance-name: + type: string + gcloud-service-key: + default: GCLOUD_SERVICE_KEY + description: The gcloud service key + type: env_var_name + google-compute-zone: + default: GOOGLE_COMPUTE_ZONE + description: The Google compute zone to connect with via the gcloud CLI + type: env_var_name + google-project-id: + default: GOOGLE_PROJECT_ID + description: The Google project ID to connect with via the gcloud CLI + type: env_var_name + machine: + image: <> + resource_class: xlarge + parallelism: <> + steps: + - checkout + - skip-if-docs-only + - skip-if-github-only + - skip-if-webui-only + - set-slack-user-id + + - attach_workspace: + at: . + + - reinstall-go + - go-get-deps + - run: PATH=$HOME/.local/bin:$PATH make -C proto build + + - setup-python-venv: + install-python: true + determined: true + extra-requirements-file: "e2e_tests/tests/requirements.txt" + executor: <> + + # - run: + # name: Get master dependencies + # command: make -C master get-deps + + - install-devcluster + + # Must overwrite the default value ("circleci") since `slurmcluster.sh` sets the name of + # the dev box to "$USER-dev-box" + - run: echo 'export USER=circleci-job-<>' >> "$BASH_ENV" + + - gcloud/install: + version: "412.0.0" + - gcloud/initialize: + gcloud-service-key: <> + google-compute-zone: <> + google-project-id: <> + + - run: + name: Install terraform + command: | + wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg + echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list + sudo apt update && sudo apt install terraform + - run: terraform --version + + - set-google-application-credentials + + - run: + name: Make slurmcluster + # Breaks without apt-get installs for some reason + command: | + sudo apt-get update + sudo apt-get install gettext + sudo apt-get install iproute2 + yes yes | PATH=$HOME/.local/bin:$PATH make slurmcluster + background: true + + - run-e2e-tests: + mark: <> + wait-for-master: true + extra-pytest-flags: <> + + - run: + name: Make Unslurmcluster + when: always + command: | + (yes yes || true) | make unslurmcluster + # For some reason, even when `make unslurmcluster` is successful, CircleCI + # receives exit code 141, so ignore that. + EXIT_STATUS=$? + echo $EXIT_STATUS + if [[ $EXIT_STATUS -eq 141 ]]; then + echo "Ignoring exit code 141" + exit 0 + else + exit $EXIT_STATUS + fi + - store_test_results: + path: /tmp/test-results/ + test-e2e-slurm: parameters: mark: @@ -3076,6 +3190,17 @@ workflows: requires: - package-and-push-system-local + - test-e2e-slurm-gcp: + matrix: + parameters: + name: [test-e2e-slurm-singularity-gcp] + instance-name: ["${CIRCLE_WORKFLOW_JOB_ID}"] + context: ["gcp"] + mark: ["e2e_slurm and not parallel"] + extra-pytest-flags: ["-k 'not start_and_write_to_shell'"] + requires: + - build-go + - test-e2e: name: test-e2e-rbac requires: diff --git a/e2e_tests/tests/cluster/test_slurm.py b/e2e_tests/tests/cluster/test_slurm.py index 0129a7594f2..ff4e2dabdfe 100644 --- a/e2e_tests/tests/cluster/test_slurm.py +++ b/e2e_tests/tests/cluster/test_slurm.py @@ -146,6 +146,7 @@ def test_master_host() -> None: @pytest.mark.e2e_slurm +@pytest.mark.parallel def test_cifar10_pytorch_distributed() -> None: config = conf.load_config(conf.cv_examples_path("cifar10_pytorch/distributed.yaml")) config["searcher"]["max_length"] = {"epochs": 1} diff --git a/tools/slurm/scripts/slurmcluster.sh b/tools/slurm/scripts/slurmcluster.sh index 7b951151c70..374044ecbd7 100755 --- a/tools/slurm/scripts/slurmcluster.sh +++ b/tools/slurm/scripts/slurmcluster.sh @@ -51,4 +51,4 @@ echo "Generated devcluster file: $TEMPYAML" # Run devcluster. echo "Running cluster..." -devcluster -c $TEMPYAML +devcluster -c $TEMPYAML --oneshot diff --git a/tools/slurm/scripts/slurmcluster.yaml b/tools/slurm/scripts/slurmcluster.yaml index 7c2dc191f97..46ca23e311b 100644 --- a/tools/slurm/scripts/slurmcluster.yaml +++ b/tools/slurm/scripts/slurmcluster.yaml @@ -35,9 +35,9 @@ stages: - sh: > curl localhost:8080/api/v1/auth/login -X POST -d '{"username": "admin", "password": ""}' 2>/dev/null | jq -r .token > /tmp/slurmcluster-token - sh: > - curl -H "Authorization: Bearer $( - curl -H "Authorization: Bearer $(