ci: FE 94 + 97: Problem solve concurrent CI builds on same branch (#924)

* [ALLGCP] Debugging with ps * [ALLGCP] Debugging with ps and bg * [ALLGCP] Adding a provisional make unslurmcluster before build * [ALLGCP] Testing premature cancellation * [ALLGCP] Continuing to try provisional make unslurm * [ALLGCP] Trying to delete dev box altogether * [ALLGCP] Manually changing compute zone * [ALLGCP] Getting rid of lock for circleci builds * [ALLGCP] Now testing cancel in middle of build * [ALLGCP] Cancel in middle of build worked. Added a few more things. * [ALLGCP] Sending SIGKILL to devluster before make unslurmcluster * [ALLGCP] Got rid of hacky sed workaround; added pkill det-master instead of kill 9 * [ALLGCP] Changing way that google compute zone is set in config.yml * [ALLGCP] Changing determined-master to determined-mast to reflect correct process name * [ALLGCP] Changing the way the google-compute-zone is set in config.yaml (grabs it from variables.tf). Also put a warning in variables.tf about this functionality * [ALLGCP] Adding documentation to document the CircleCI make slurmcluster workaround * [ALLGCP] Adding back 'requires: buid-go' even though I think we may be able to do away with this * [ALLGCP] Typo in README * [ALLGCP] Testing triggering new pipeline while e2e tests are running on current pipeline * [ALLGCP] Added final note in README * [ALLGCP] Changed 'projects' to 'project' for CircleCI directory naming convention
determined-ai · Mar 12, 2024 · 6b0d136 · 6b0d136
1 parent d1b6de9
commit 6b0d136
Show file tree

Hide file tree

Showing 5 changed files with 43 additions and 9 deletions.
diff --git a/.circleci/real_config.yml b/.circleci/real_config.yml
@@ -2142,7 +2142,7 @@ jobs:
             or:
               - equal: [ true, <<parameters.always-run>> ]
               - equal: [ "main", <<pipeline.git.branch>> ]
-              - matches: { pattern: "^/release-.*/$", value: <<pipeline.git.branch>> }
+              - matches: { pattern: "^release-.*$", value: <<pipeline.git.branch>> }
           steps:
             - set-commit-message
             - skip-on-dev-branch
@@ -2155,7 +2155,13 @@ jobs:
       - run: 
           name: Create instance name hash and set OPT_DEVBOX_PREFIX env var
           command: |
-            echo "export OPT_DEVBOX_PREFIX=circleci-job-$(echo -n "${CIRCLE_BRANCH}-${CIRCLE_JOB}" | md5sum | awk '{print $1}')" >> "$BASH_ENV"
+            # This check is for the scenario when two pushes to main are running concurrently (we don't want this).
+            # In this case, we need more specificity to the hash information to make two separate VMs.
+            if [[ $CIRCLE_BRANCH == "main" ]]; then
+              echo "export OPT_DEVBOX_PREFIX=circleci-job-$(echo -n "${CIRCLE_USERNAME}-${CIRCLE_BRANCH}-${CIRCLE_JOB}-${CIRCLE_WORKFLOW_JOB_ID}" | md5sum | awk '{print $1}')" >> "$BASH_ENV"
+            else
+              echo "export OPT_DEVBOX_PREFIX=circleci-job-$(echo -n "${CIRCLE_USERNAME}-${CIRCLE_BRANCH}-${CIRCLE_JOB}" | md5sum | awk '{print $1}')" >> "$BASH_ENV"
+            fi
 
       - attach_workspace:
           at: . 
@@ -2189,19 +2195,30 @@ jobs:
 
       - set-google-application-credentials
 
+      - run:
+          name: Delete existing instance (if it exists)
+          command: |
+            yes | gcloud compute instances delete ${OPT_DEVBOX_PREFIX}-dev-box --zone=$(grep \
+            -A2 'variable "zone"' ~/project/tools/slurm/terraform/variables.tf | grep 'default' \
+            | awk -F' = ' '{ print $2 }' | tr -d '\"') || true
+
       - run:
           name: Make slurmcluster
           # Breaks without apt-get installs for some reason
+          # NOTE: tf_lock is set to 'false' to catch the case where a developer triggered a workflow and did not
+          # let it run completely. This should not be an issue since we delete the instance already in the previous
+          # step (this is precautionary). See a similar thing in the 'make unslurmcluster' step.
           command: |
             sudo apt-get update
             sudo apt-get install gettext
             sudo apt-get install iproute2
-            yes yes | PATH=$HOME/.local/bin:$PATH make slurmcluster cont=<<parameters.container-run-type>>
+            yes yes | PATH=$HOME/.local/bin:$PATH make slurmcluster cont=<<parameters.container-run-type>> tf_lock=false
           background: true
 
       - run:
           name: Wait for VM creation
-          command: sleep 3m
+          command: |
+            sleep 3m
             
       - run-e2e-tests:
           mark: <<parameters.mark>>
@@ -2212,7 +2229,9 @@ jobs:
           name: Make Unslurmcluster
           when: always
           command: |
-            (yes yes || true) | make unslurmcluster
+            ps -e
+            pkill determined-mast
+            (yes yes || true) | make unslurmcluster tf_lock=false
             # For some reason, even when `make unslurmcluster` is successful, CircleCI
             # receives exit code 141, so ignore that.
             EXIT_STATUS=$?

diff --git a/tools/slurm/Makefile b/tools/slurm/Makefile
@@ -25,9 +25,9 @@ usage:
 .PHONY: slurmcluster
 slurmcluster:
 	mkdir -p ~/.slurmcluster
-	$(MAKE) -C terraform build VMTIME=$(vmtime)
+	$(MAKE) -C terraform build VMTIME=$(vmtime) TF_LOCK=$(tf_lock)
 	- ./scripts/slurmcluster.sh -c $(cont) || ./scripts/slurmcluster.sh -c singularity
 
 .PHONY: unslurmcluster
 unslurmcluster:
-	$(MAKE) -C terraform clean
+	$(MAKE) -C terraform clean TF_LOCK=$(tf_lock)
diff --git a/tools/slurm/README.md b/tools/slurm/README.md
@@ -93,3 +93,15 @@ The following test suites currently run only on hardware. They do not run succes
   - `test-e2e-slurm-preemption-quarantine`: Currently runs on znode as a part of the nightly test suite.
   - `test-e2e-slurm-restart`: Dependent upon znode configuration, so not worth testing on GCP.
 
+## Important Workaround Explained
+
+Recall, a CircleCI pipeline is triggered upon a push to the remote repository. Only one pipeline can be running at once on a single branch *unless* that branch is main. If a pipeline is in progress and other changes are pushed to a developer branch, the running workflow will be **canceled**. This is a problem if the build was canceled in between the `make slurmcluster` and `make unslurmcluster` steps because then the VM and its current state (all of its experiments, queued jobs, processes, etc.) will persist to the *subsequent* pipeline on the developer branch.
+
+There are different aspects to the issue which are addressed in various ways, as follows. 
+
+1. Making a completely separate VM for *each distinct* job in *each distinct* workflow would allow too many opportunities for VMs and VPC networks to become stale (zombie state). Therefore, one VM is created for each distinct `GH username` + `GH branch` + `CircleCI job name` combination. This way, if a user cancels a build in the "danger zone" (between `make/unmake slurmcluster`) then the VM will not become stale forever and can be dealt with on a subsequent push.
+2. However, on said subsequent pushes, the state of the VM will persist (as mentioned before) which will cause conflicts with launching experiments and starting a `devcluster`. Therefore, we must **delete** the persistent VM *if it exists* by executing `gcloud compute instances delete <relevant instance>`. You will notice in `config.yaml` that the zone is specified via a complicated `grep` command. This command grabs the **default** `zone` value from `terraform/variables.tf` and specifies it in the aforementioned command. This is done since we are running this deletion command *before* we get the chance to reach `make slurmcluster`, which assigns the correct zone to `GOOGLE_COMPUTE_ZONE`. Evidently, this workaround is paradoxical in and of itself.
+3. This deleted instance mentioned above will have acquired a state lock on the terraform states, but now it is deleted and will never give up the lock. Thus, the final step of this workaround is to pass in the `tf_lock=false` environment variable to `make slurmcluster` which runs `terraform apply/destroy` with the flag `-lock=false`. This tells terraform to ignore the status of the lock which allows us to `make/unmake slurmcluster`. Keep in mind that this would normally be dangerous, however since the instance is deleted in the previous step, there will never be any contention in changing the terraform states.
+
+The issue of CircleCI not having a way to gracefully terminate a workflow upon cancellation is a [known issue](https://discuss.circleci.com/t/cancel-workflow-job-with-graceful-termination/39172). If CircleCI ever provides this feature, this workaround can be deprecated and one could simply add `make unslurmcluster` to a cleanup script that runs upon cancellation.
+
diff --git a/tools/slurm/terraform/Makefile b/tools/slurm/terraform/Makefile
@@ -1,4 +1,5 @@
 # Commands for managing (creating/destroying/etc) resources.
+TF_LOCK ?= true
 
 build/backend.conf: scripts/generate-backendconf.sh
 	mkdir -p build
@@ -27,11 +28,11 @@ build/default.tfvars: scripts/generate-tfvars.sh build/generate-sshkeys
 
 .PHONY: build
 build: init build/default.tfvars
-	terraform apply --var-file build/default.tfvars
+	terraform apply --var-file build/default.tfvars -lock=$(TF_LOCK)
 
 .PHONY: clean
 clean: build/default.tfvars
-	terraform destroy --var-file build/default.tfvars
+	terraform destroy --var-file build/default.tfvars -lock=$(TF_LOCK)
 	rm -rf build
 
 # Commands for interacting with created resources.

diff --git a/tools/slurm/terraform/variables.tf b/tools/slurm/terraform/variables.tf
@@ -8,6 +8,8 @@ variable "region" {
   default = "us-west1"
 }
 
+# Note: The default value for this variable is extracted during some
+# CircleCI workflows. Modify with caution.
 variable "zone" {
   type    = string
   default = "us-west1-b"