Skip to content

Commit

Permalink
changed RESUME to POWER_DOWN and removed delete call which is now handled via Slurm that calls terminate.sh (#503)
Browse files Browse the repository at this point in the history
  • Loading branch information
XaverStiensmeier authored Jun 3, 2024
1 parent 3943d27 commit 70ceb65
Showing 1 changed file with 5 additions and 7 deletions.
12 changes: 5 additions & 7 deletions resources/playbook/roles/bibigrid/files/slurm/fail.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ process_string() {
}

# Ensure the log directory tree for the fail script exists (relative to the
# current working directory). Paths are created parent-first, one at a time.
for log_dir in \
  worker_logs \
  worker_logs/fail \
  worker_logs/fail/out \
  worker_logs/fail/err; do
  mkdir -p "$log_dir"
done

Expand All @@ -34,16 +35,13 @@ function log {

# Main body of the fail script: mark the nodes named in $1 in Slurm and record
# what was done. `log` is a helper whose definition lies outside this excerpt.
# NOTE(review): this excerpt is a rendered diff without +/- markers. Per the
# commit title, the RESUME update and the delete_server.py call look like the
# removed side and the POWER_DOWN update like the added side -- confirm against
# the actual file before relying on the statement order shown here.
log "Fail-Script started"

# $1 is in slurm node format for example: bibigrid-worker0-cid-[0-1],bibigrid-worker1-cid-0 and needs no converting
scontrol update NodeName="$1" state=RESUME reason=FailedStartup # no sudo needed cause executed by slurm user

# Expand the Slurm node expression in $1 into the individual host names.
hosts=$(scontrol show hostnames "$1")

# Report the expanded host list on stdout and through the log helper.
echo "Hosts $hosts used"
log "Hosts $hosts used"

# delete servers
# The whole host list is passed to the script as one quoted argument.
python3 /usr/local/bin/delete_server.py "${hosts}"
# $1 is in slurm node format for example: bibigrid-worker0-cid-[0-1],bibigrid-worker1-cid-0 and needs no converting
scontrol update NodeName="$1" state=POWER_DOWN reason=FailedStartup # no sudo needed cause executed by slurm user

echo "Finished delete_server.py execution."
log "Nodes $1 set to POWER_DOWN."

# NOTE(review): $? here is the exit status of the immediately preceding `log`
# call, not of the `scontrol update` above -- confirm that this is intended.
exit $?

0 comments on commit 70ceb65

Please sign in to comment.