Skip to content

Commit

Permalink
fix: FE-116: CI fails test-e2e-*-enroot-gcp upon update of CPU image …
Browse files Browse the repository at this point in the history
…version (#947)

* [ALLGCP] Added a way for the default CPU image to be checked and default to what is already there

* [ALLGCP] Added new images slurm and pbs

* [ALLGCP] Testing out of date image

* [ALLGCP] Added correct logic for when image is out of date

* [ALLGCP] Added correct logic for when image is out of date

* [ALLGCP] Testing with out-of-date images final

* [ALLGCP] Got rid of debugging in scripts

* [ALLGCP] Added a wait for VM creation step that is less trivial than a constant sleep

* [ALLGCP] Wait for VM creation job checks every 5 seconds now

* [ALLGCP] Changing formatting of wait for vm time

* [ALLGCP] Adding some more context to logging in slurmcluster.sh
  • Loading branch information
cquil11 authored and NicholasBlaskey committed Oct 4, 2023
1 parent 10bd209 commit c7c3aca
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 9 deletions.
17 changes: 15 additions & 2 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2324,13 +2324,26 @@ jobs:
sudo apt-get update
sudo apt-get install gettext
sudo apt-get install iproute2
yes yes | PATH=$HOME/.local/bin:$PATH make slurmcluster FLAGS="-c <<parameters.container-run-type>> -w <<parameters.workload-manager>> <<parameters.agent-use>>" TF_LOCK=false
yes yes | PATH=$HOME/.local/bin:$PATH make slurmcluster FLAGS="-c <<parameters.container-run-type>> -w <<parameters.workload-manager>> <<parameters.agent-use>>" TF_LOCK=false | tee output.log
background: true

- run:
name: Wait for VM creation
command: |
sleep 3m
# Poll output.log (written via `tee output.log` by the background
# `make slurmcluster` step) until a line starting with "Running cluster"
# appears, rather than sleeping for a fixed duration.
# Checks up to 120 times with a 5-second sleep between checks; exits 1 if the
# marker line never shows up.
sleep 5
for i in {1..120}; do
# Elapsed time shown in the log: iteration number times the 5-second interval.
ELAPSED_TIME=$((i * 5))
echo "Waiting for VM creation [${ELAPSED_TIME}s]..."
# NOTE: the three dots are unescaped, so grep treats each as "any character".
if grep -q "^Running cluster..." output.log; then
echo "VM creation has finished."
break
fi
# Final iteration without a match: fail the step instead of sleeping again.
if [[ $i -eq 120 ]]; then
echo "Timeout waiting for VM creation."
exit 1
fi
sleep 5
done
# For when a user wants to use an agent instead of launcher.
- when:
Expand Down
24 changes: 19 additions & 5 deletions tools/slurm/scripts/slurmcluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ while [[ $# -gt 0 ]]; do
fi
shift 2
;;
-w | --resource-manager)
-w | --workload-manager)
export OPT_WORKLOAD_MANAGER=$2
if [[ -z $OPT_WORKLOAD_MANAGER ]]; then
echo >&2 "usage $0: Missing -r {workload_manager}"
Expand Down Expand Up @@ -126,13 +126,27 @@ export OPT_PROJECT_ROOT='../..'
export OPT_CLUSTER_INTERNAL_IP=$(terraform -chdir=terraform output --raw internal_ip)
export OPT_AUTHFILE=$LOCAL_TOKEN_DEST

CPU_IMAGE_STRING=$(grep "CPUImage" ../../master/pkg/schemas/expconf/const.go | awk -F'\"' '{print $2}')
CPU_IMAGE_FMT=${CPU_IMAGE_STRING//[\/:]/+}.sqsh
LOCAL_CPU_IMAGE_STRING=$(grep "CPUImage" ../../master/pkg/schemas/expconf/const.go | awk -F'\"' '{print $2}')
LOCAL_CPU_IMAGE_SQSH=${LOCAL_CPU_IMAGE_STRING//[\/:]/+}.sqsh

# Configuration needed for PBS + Enroot
if [[ $OPT_CONTAINER_RUN_TYPE == "enroot" ]]; then
gcloud compute ssh --zone "$ZONE" "$INSTANCE_NAME" --project "$PROJECT" -- "enroot create --force /srv/enroot/${CPU_IMAGE_FMT}"
# gcloud compute ssh --zone "$ZONE" "$INSTANCE_NAME" --project "$PROJECT" -- "sudo sed -i '/^#ENROOT_RUNTIME_PATH/c\ENROOT_RUNTIME_PATH /tmp/\$\(whoami\)' /etc/enroot/enroot.conf"
# Find the file and assign its name to CPU_IMAGE_SQSH.
# The listing of /srv/enroot/ is filtered to names that start with
# 'determinedai+environments'.
# NOTE(review): gcloud_ssh is defined outside this hunk; presumably it runs the
# quoted command on the cluster VM over `gcloud compute ssh` -- confirm.
CPU_IMAGE_SQSH=$(gcloud_ssh "ls /srv/enroot/ | grep '^determinedai+environments'")

# Compare the name found remotely with LOCAL_CPU_IMAGE_SQSH, which is the
# CPUImage string from const.go with '/' and ':' replaced by '+' and a '.sqsh'
# suffix appended.
if [[ $CPU_IMAGE_SQSH != "$LOCAL_CPU_IMAGE_SQSH" ]]; then
    # Names differ (this also covers the case where nothing was found remotely
    # but a local name exists): pull the locally specified image, then create
    # the container from the resulting .sqsh file.
    echo "WARNING: Local CPU Image specified in ../../master/pkg/schemas/expconf/const.go does not match the CPU Image found on existing ${OPT_WORKLOAD_MANAGER} image. Consider re-building the image and pushing to main"
    echo "Manually pulling updated image and creating container"
    gcloud_ssh "sudo ENROOT_RUNTIME_PATH=/srv/enroot ENROOT_TEMP_PATH=/srv/enroot manage-enroot-cache -s /srv/enroot ${LOCAL_CPU_IMAGE_STRING}"
    gcloud_ssh "enroot create --force /srv/enroot/${LOCAL_CPU_IMAGE_SQSH}"
else
    # Names are equal: create the container from the file already on the VM.
    echo "Found up-to-date CPU Image on /srv/enroot/ ... creating container"
    if [[ -n $CPU_IMAGE_SQSH ]]; then
        gcloud_ssh "enroot create --force /srv/enroot/${CPU_IMAGE_SQSH}"
    else
        # Because the two names are equal in this branch, this is reached only
        # when both the remote and the local name are empty.
        echo "No file starting with 'determinedai+environments' found in /srv/enroot/"
    fi
fi
fi

TEMPYAML=$TEMPDIR/slurmcluster.yaml
Expand Down
4 changes: 2 additions & 2 deletions tools/slurm/terraform/images.conf
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
slurm: det-environments-slurm-ci-1689107942
pbs: det-environments-pbs-ci-1689108634
slurm: det-environments-slurm-ci-1689266288
pbs: det-environments-pbs-ci-1689269947

0 comments on commit c7c3aca

Please sign in to comment.