Skip to content

Commit

Permalink
ci: FE-14 Migrate test-e2e-slurm to GCP slurmcluster (#879)
Browse files Browse the repository at this point in the history
Migrate the test-e2e-slurm suite from running on znode to running on GCP. Each CircleCI job creates its own devbox that runs the e2e_slurm test cases. Because every devbox is unique to its job, multiple users can run the jobs at the same time.

Acceptance Criteria

The content of the test-e2e-slurm suite that runs at gate now runs on GCP instead of znode*, as suite test-e2e-slurm-singularity-gcp

The current test-e2e-slurm suite instead runs only nightly on znode (could be renamed to test-e2e-slurm-znode)

Multiple users can run it at the same time without conflicting (currently I believe there is a single circleci-gcp-dev-box VM)
  • Loading branch information
jerryharrow authored and stoksc committed Jun 28, 2023
1 parent 155e1fc commit 2825857
Show file tree
Hide file tree
Showing 4 changed files with 129 additions and 3 deletions.
125 changes: 125 additions & 0 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2073,6 +2073,120 @@ jobs:
steps:
- run: echo "Test suite disabled."

# Runs the e2e test suite selected by `mark` against a Slurm cluster that is
# brought up with `make slurmcluster` and torn down with `make unslurmcluster`.
#
# Parameters:
#   mark                 pytest mark expression forwarded to run-e2e-tests.
#   parallelism          CircleCI parallelism for this job (default 1).
#   extra-pytest-flags   Extra flags forwarded to run-e2e-tests.
#   slack-mentions /     Declared for callers; not referenced by any step
#   slack-channel        visible in this job. NOTE(review): confirm usage.
#   instance-name        Suffix used to build the per-job USER value, which
#                        `slurmcluster.sh` uses to name the dev box.
#   gcloud-service-key / Names of the environment variables handed to
#   google-compute-zone / gcloud/initialize.
#   google-project-id
test-e2e-slurm-gcp:
  parameters:
    mark:
      type: string
    parallelism:
      type: integer
      default: 1
    extra-pytest-flags:
      type: string
      default: ""
    slack-mentions:
      type: string
      default: ""
    slack-channel:
      type: string
      default: ""
    instance-name:
      type: string
    gcloud-service-key:
      default: GCLOUD_SERVICE_KEY
      description: The gcloud service key
      type: env_var_name
    google-compute-zone:
      default: GOOGLE_COMPUTE_ZONE
      description: The Google compute zone to connect with via the gcloud CLI
      type: env_var_name
    google-project-id:
      default: GOOGLE_PROJECT_ID
      description: The Google project ID to connect with via the gcloud CLI
      type: env_var_name
  machine:
    image: <<pipeline.parameters.machine-image>>
  resource_class: xlarge
  parallelism: <<parameters.parallelism>>
  steps:
    - checkout
    - skip-if-docs-only
    - skip-if-github-only
    - skip-if-webui-only
    - set-slack-user-id

    - attach_workspace:
        at: .

    - reinstall-go
    - go-get-deps
    - run: PATH=$HOME/.local/bin:$PATH make -C proto build

    - setup-python-venv:
        install-python: true
        determined: true
        extra-requirements-file: "e2e_tests/tests/requirements.txt"
        executor: <<pipeline.parameters.machine-image>>

    # - run:
    #     name: Get master dependencies
    #     command: make -C master get-deps

    - install-devcluster

    # Must overwrite the default value ("circleci") since `slurmcluster.sh` sets the name of
    # the dev box to "$USER-dev-box"
    - run: echo 'export USER=circleci-job-<<parameters.instance-name>>' >> "$BASH_ENV"

    - gcloud/install:
        version: "412.0.0"
    - gcloud/initialize:
        gcloud-service-key: <<parameters.gcloud-service-key>>
        google-compute-zone: <<parameters.google-compute-zone>>
        google-project-id: <<parameters.google-project-id>>

    - run:
        name: Install terraform
        # `apt-get -y` keeps the install non-interactive; `apt` is avoided
        # because its CLI is not intended for use in scripts.
        command: |
          wget -O- https://apt.releases.hashicorp.com/gpg | sudo gpg --dearmor -o /usr/share/keyrings/hashicorp-archive-keyring.gpg
          echo "deb [signed-by=/usr/share/keyrings/hashicorp-archive-keyring.gpg] https://apt.releases.hashicorp.com $(lsb_release -cs) main" | sudo tee /etc/apt/sources.list.d/hashicorp.list
          sudo apt-get update && sudo apt-get install -y terraform
    - run: terraform --version

    - set-google-application-credentials

    - run:
        name: Make slurmcluster
        # Breaks without apt-get installs for some reason
        # `-y` prevents the install from stopping at a confirmation prompt.
        command: |
          sudo apt-get update
          sudo apt-get install -y gettext iproute2
          yes yes | PATH=$HOME/.local/bin:$PATH make slurmcluster
        # Runs in the background so the following test step can start while
        # the cluster command is still running.
        background: true

    - run-e2e-tests:
        mark: <<parameters.mark>>
        wait-for-master: true
        extra-pytest-flags: <<parameters.extra-pytest-flags>>

    - run:
        name: Make Unslurmcluster
        # Tear down even when an earlier step failed.
        when: always
        command: |
          # CircleCI runs this script with `bash -eo pipefail` by default, so
          # a failing pipeline would abort the script before `$?` could be
          # inspected. Capture the status with `||` to keep the check below
          # reachable.
          EXIT_STATUS=0
          (yes yes || true) | make unslurmcluster || EXIT_STATUS=$?
          # For some reason, even when `make unslurmcluster` is successful, CircleCI
          # receives exit code 141, so ignore that.
          # (141 = 128 + 13, i.e. termination by SIGPIPE.)
          echo "$EXIT_STATUS"
          if [[ $EXIT_STATUS -eq 141 ]]; then
            echo "Ignoring exit code 141"
            exit 0
          else
            exit "$EXIT_STATUS"
          fi
    - store_test_results:
        path: /tmp/test-results/

test-e2e-slurm:
parameters:
mark:
Expand Down Expand Up @@ -3076,6 +3190,17 @@ workflows:
requires:
- package-and-push-system-local

# Workflow entry for the GCP-backed Slurm e2e suite; it waits for build-go.
- test-e2e-slurm-gcp:
    matrix:
      parameters:
        name:
          - test-e2e-slurm-singularity-gcp
        # The workflow job ID gives every job its own instance name.
        instance-name:
          - "${CIRCLE_WORKFLOW_JOB_ID}"
        context:
          - "gcp"
        # Tests marked `parallel` are excluded from this run.
        mark:
          - "e2e_slurm and not parallel"
        extra-pytest-flags:
          - "-k 'not start_and_write_to_shell'"
    requires:
      - build-go

- test-e2e:
name: test-e2e-rbac
requires:
Expand Down
1 change: 1 addition & 0 deletions e2e_tests/tests/cluster/test_slurm.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ def test_master_host() -> None:


@pytest.mark.e2e_slurm
@pytest.mark.parallel
def test_cifar10_pytorch_distributed() -> None:
config = conf.load_config(conf.cv_examples_path("cifar10_pytorch/distributed.yaml"))
config["searcher"]["max_length"] = {"epochs": 1}
Expand Down
2 changes: 1 addition & 1 deletion tools/slurm/scripts/slurmcluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,4 @@ echo "Generated devcluster file: $TEMPYAML"

# Run devcluster.
echo "Running cluster..."
devcluster -c $TEMPYAML
devcluster -c $TEMPYAML --oneshot
4 changes: 2 additions & 2 deletions tools/slurm/scripts/slurmcluster.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,9 @@ stages:
- sh: >
curl localhost:8080/api/v1/auth/login -X POST -d '{"username": "admin", "password": ""}' 2>/dev/null | jq -r .token > /tmp/slurmcluster-token
- sh: >
curl -H "Authorization: Bearer $(</tmp/slurmcluster-token)" -X PATCH -d '{"agent_user_group": {"agent_uid": $OPT_REMOTE_UID, "agent_gid": $OPT_REMOTE_GID, "agent_user": "$OPT_REMOTE_USER", "agent_group": "$OPT_REMOTE_GROUP"}}' localhost:8080/api/v1/users/1
curl -H "Authorization: Bearer $(cat /tmp/slurmcluster-token)" -X PATCH -d '{"agent_user_group": {"agent_uid": $OPT_REMOTE_UID, "agent_gid": $OPT_REMOTE_GID, "agent_user": "$OPT_REMOTE_USER", "agent_group": "$OPT_REMOTE_GROUP"}}' localhost:8080/api/v1/users/1
- sh: >
curl -H "Authorization: Bearer $(</tmp/slurmcluster-token)" -X PATCH -d '{"agent_user_group": {"agent_uid": $OPT_REMOTE_UID, "agent_gid": $OPT_REMOTE_GID, "agent_user": "$OPT_REMOTE_USER", "agent_group": "$OPT_REMOTE_GROUP"}}' localhost:8080/api/v1/users/2
curl -H "Authorization: Bearer $(cat /tmp/slurmcluster-token)" -X PATCH -d '{"agent_user_group": {"agent_uid": $OPT_REMOTE_UID, "agent_gid": $OPT_REMOTE_GID, "agent_user": "$OPT_REMOTE_USER", "agent_group": "$OPT_REMOTE_GROUP"}}' localhost:8080/api/v1/users/2
cmdline:
- $OPT_PROJECT_ROOT/master/build/determined-master
- --config-file
Expand Down

0 comments on commit 2825857

Please sign in to comment.