From 6b0d13694ff723cc9637eeeb24d60b590da0825f Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 30 Jun 2023 10:02:18 -0500 Subject: [PATCH] ci: FE 94 + 97: Problem solve concurrent CI builds on same branch (#924) * [ALLGCP] Debugging with ps * [ALLGCP] Debugging with ps and bg * [ALLGCP] Adding a provisional make unslurmcluster before build * [ALLGCP] Testing premature cancellation * [ALLGCP] Continuing to try provisional make unslurm * [ALLGCP] Trying to delete dev box altogether * [ALLGCP] Manually changing compute zone * [ALLGCP] Getting rid of lock for circleci builds * [ALLGCP] Now testing cancel in middle of build * [ALLGCP] Cancel in middle of build worked. Added a few more things. * [ALLGCP] Sending SIGKILL to devluster before make unslurmcluster * [ALLGCP] Got rid of hacky sed workaround; added pkill det-master instead of kill 9 * [ALLGCP] Changing way that google compute zone is set in config.yml * [ALLGCP] Changing determined-master to determined-mast to reflect correct process name * [ALLGCP] Changing the way the google-compute-zone is set in config.yaml (grabs it from variables.tf). Also put a warning in variables.tf about this functionality * [ALLGCP] Adding documentation to document the CircleCI make slurmcluster workaround * [ALLGCP] Adding back 'requires: buid-go' even though I think we may be able to do away with this * [ALLGCP] Typo in README * [ALLGCP] Testing triggering new pipeline while e2e tests are running on current pipeline * [ALLGCP] Added final note in README * [ALLGCP] Changed 'projects' to 'project' for CircleCI directory naming convention --- .circleci/real_config.yml | 29 ++++++++++++++++++++++++----- tools/slurm/Makefile | 4 ++-- tools/slurm/README.md | 12 ++++++++++++ tools/slurm/terraform/Makefile | 5 +++-- tools/slurm/terraform/variables.tf | 2 ++ 5 files changed, 43 insertions(+), 9 deletions(-) diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml index 94d8b6689a8b..9c5a7f833527 100644 --- a/.circleci/real_config.yml +++ b/.circleci/real_config.yml @@ -2142,7 +2142,7 @@ jobs: or: - equal: [ true, <> ] - equal: [ "main", <> ] - - matches: { pattern: "^/release-.*/$", value: <> } + - matches: { pattern: "^release-.*$", value: <> } steps: - set-commit-message - skip-on-dev-branch @@ -2155,7 +2155,13 @@ jobs: - run: name: Create instance name hash and set OPT_DEVBOX_PREFIX env var command: | - echo "export OPT_DEVBOX_PREFIX=circleci-job-$(echo -n "${CIRCLE_BRANCH}-${CIRCLE_JOB}" | md5sum | awk '{print $1}')" >> "$BASH_ENV" + # This check is for the scenario when two pushes to main are running concurrently (we don't want this). + # In this case, we need more specificity to the hash information to make two separate VMs. + if [[ $CIRCLE_BRANCH == "main" ]]; then + echo "export OPT_DEVBOX_PREFIX=circleci-job-$(echo -n "${CIRCLE_USERNAME}-${CIRCLE_BRANCH}-${CIRCLE_JOB}-${CIRCLE_WORKFLOW_JOB_ID}" | md5sum | awk '{print $1}')" >> "$BASH_ENV" + else + echo "export OPT_DEVBOX_PREFIX=circleci-job-$(echo -n "${CIRCLE_USERNAME}-${CIRCLE_BRANCH}-${CIRCLE_JOB}" | md5sum | awk '{print $1}')" >> "$BASH_ENV" + fi - attach_workspace: at: . @@ -2189,19 +2195,30 @@ jobs: - set-google-application-credentials + - run: + name: Delete existing instance (if it exists) + command: | + yes | gcloud compute instances delete ${OPT_DEVBOX_PREFIX}-dev-box --zone=$(grep \ + -A2 'variable "zone"' ~/project/tools/slurm/terraform/variables.tf | grep 'default' \ + | awk -F' = ' '{ print $2 }' | tr -d '\"') || true + - run: name: Make slurmcluster # Breaks without apt-get installs for some reason + # NOTE: tf_lock is set to 'false' to catch the case where a developer triggered a workflow and did not + # let it run completely. This should not be an issue since we delete the instance already in the previous + # step (this is precautionary). See a similar thing in the 'make unslurmcluster' step. command: | sudo apt-get update sudo apt-get install gettext sudo apt-get install iproute2 - yes yes | PATH=$HOME/.local/bin:$PATH make slurmcluster cont=<> + yes yes | PATH=$HOME/.local/bin:$PATH make slurmcluster cont=<> tf_lock=false background: true - run: name: Wait for VM creation - command: sleep 3m + command: | + sleep 3m - run-e2e-tests: mark: <> @@ -2212,7 +2229,9 @@ jobs: name: Make Unslurmcluster when: always command: | - (yes yes || true) | make unslurmcluster + ps -e + pkill determined-mast + (yes yes || true) | make unslurmcluster tf_lock=false # For some reason, even when `make unslurmcluster` is successful, CircleCI # receives exit code 141, so ignore that. EXIT_STATUS=$? diff --git a/tools/slurm/Makefile b/tools/slurm/Makefile index 111164e693d2..de0e81c75f95 100644 --- a/tools/slurm/Makefile +++ b/tools/slurm/Makefile @@ -25,9 +25,9 @@ usage: .PHONY: slurmcluster slurmcluster: mkdir -p ~/.slurmcluster - $(MAKE) -C terraform build VMTIME=$(vmtime) + $(MAKE) -C terraform build VMTIME=$(vmtime) TF_LOCK=$(tf_lock) - ./scripts/slurmcluster.sh -c $(cont) || ./scripts/slurmcluster.sh -c singularity .PHONY: unslurmcluster unslurmcluster: - $(MAKE) -C terraform clean + $(MAKE) -C terraform clean TF_LOCK=$(tf_lock) diff --git a/tools/slurm/README.md b/tools/slurm/README.md index 082f6ef3a55d..3e34faa3c8ab 100644 --- a/tools/slurm/README.md +++ b/tools/slurm/README.md @@ -93,3 +93,15 @@ The following test suites currently run only on hardware. They do not run succes - `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite. - `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP. +## Important Workaround Explained + +Recall, a CircleCI pipeline is triggered upon a push to the remote repository. Only one pipeline can be running at once on a single branch *unless* that branch is main. If a pipeline is in progress and other changes are pushed to a developer branch, the running workflow will be **canceled**. This is a problem if the build was canceled in between the `make slurmcluster` and `make unslurmcluster` steps because then the VM and its current state (all of its experiments, queued jobs, processes, etc.) will persist to the *subsequent* pipeline on the developer branch. + +There are different aspects to the issue which are addressed in various ways, as follows. + +1. Making a completely separate VM for *each distinct* job in *each distinct* workflow would allow too many opportunities for VMs and VPC networks to become stale (zombie state). Therefore, one VM is created for each distinct `GH username` + `GH branch` + `CircleCI job name` combination. This way, if a user cancels a build in the "danger zone" (between `make/unmake slurmcluster`) then the VM will not become stale forever and can be dealt with on a subsequent push. +2. However, on said subsequent pushes, the state of the VM will persist (as mentioned before) which will cause conflicts with launching experiments and starting a `devcluster`. Therefore, we must **delete** the persistent VM *if it exists* by executing `gcloud compute instances delete `. You will notice in `config.yaml` that the zone is specified via a complicated `grep` command. This command grabs the **default** `zone` value from `terraform/variables.tf` and specifies it in the aforementioned command. This is done since we are running this deletion command *before* we get the chance to reach `make slurmcluster`, which assigns the correct zone to `GOOGLE_COMPUTE_ZONE`. Evidently, this workaround is paradoxical in and of itself. +3. This deleted instance mentioned above will have acquired a state lock on the terraform states, but now it is deleted and will never give up the lock. Thus, the final step of this workaround is to pass in the `tf_lock=false` environment variable to `make slurmcluster` which runs `terraform apply/destroy` with the flag `-lock=false`. This tells terraform to ignore the status of the lock which allows us to `make/unmake slurmcluster`. Keep in mind that this would normally be dangerous, however since the instance is deleted in the previous step, there will never be any contention in changing the terraform states. + +The issue of CircleCI not having a way to gracefully terminate a workflow upon cancellation is a [known issue](https://discuss.circleci.com/t/cancel-workflow-job-with-graceful-termination/39172). If CircleCI ever provides this feature, this workaround can be deprecated and one could simply add `make unslurmcluster` to a cleanup script that runs upon cancellation. + diff --git a/tools/slurm/terraform/Makefile b/tools/slurm/terraform/Makefile index e5eab23e8459..ebdbc4ba5233 100644 --- a/tools/slurm/terraform/Makefile +++ b/tools/slurm/terraform/Makefile @@ -1,4 +1,5 @@ # Commands for managing (creating/destroying/etc) resources. +TF_LOCK ?= true build/backend.conf: scripts/generate-backendconf.sh mkdir -p build @@ -27,11 +28,11 @@ build/default.tfvars: scripts/generate-tfvars.sh build/generate-sshkeys .PHONY: build build: init build/default.tfvars - terraform apply --var-file build/default.tfvars + terraform apply --var-file build/default.tfvars -lock=$(TF_LOCK) .PHONY: clean clean: build/default.tfvars - terraform destroy --var-file build/default.tfvars + terraform destroy --var-file build/default.tfvars -lock=$(TF_LOCK) rm -rf build # Commands for interacting with created resources. diff --git a/tools/slurm/terraform/variables.tf b/tools/slurm/terraform/variables.tf index ca58ec8203d1..1efb8aa515fa 100644 --- a/tools/slurm/terraform/variables.tf +++ b/tools/slurm/terraform/variables.tf @@ -8,6 +8,8 @@ variable "region" { default = "us-west1" } +# Note: The default value for this variable is extracted during some +# CircleCI workflows. Modify with caution. variable "zone" { type = string default = "us-west1-b"