Skip to content

Commit

Permalink
chore: add default SLURM option --no-requeue to avoid unexpected rest…
Browse files Browse the repository at this point in the history
…arts of slurm jobs. (FOUNDENG-495) (#723)

[FOUNDENG-453](https://jira-pro.its.hpecorp.net:8443/browse/FOUNDENG-453) [PR](https://github.hpe.com/hpe/hpc-ard-capsules-core/pull/765) sets the `--no-requeue` option for SLURM by default in the launcher code. Ideally, this should come from determined as this is the default enforced by determined not the launcher. Added code to determined to pass the `--no-requeue` option to the launcher. Please note that the changes in the launcher code will be reverted as part of the future issue [FOUNDENG-496](https://jira-pro.its.hpecorp.net:8443/browse/FOUNDENG-496). Also, the documentation changes for this issue can be found at #6141.
  • Loading branch information
jagadeesh545 authored and determined-ci committed Mar 12, 2024
1 parent 3410499 commit 1a99941
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 2 deletions.
16 changes: 16 additions & 0 deletions master/pkg/tasks/dispatcher_task.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,22 @@ func (t *TaskSpec) ToDispatcherManifest(
var slurmArgs []string
slurmArgs = append(slurmArgs, t.TaskContainerDefaults.Slurm.SbatchArgs()...)
slurmArgs = append(slurmArgs, t.SlurmConfig.SbatchArgs()...)

// SLURM can requeue a job if there are node level settings to specify it to do so.
// So, we have to explicitly specify NO_REQUEUE option to disable the requeueing of slurm jobs.
// Determined will manage the failed/preempted experiments by itself.
// In case, the user has already provided the NO_REQUEUE option, skip this step.
noRequeueExists := false
for _, arg := range slurmArgs {
if arg == "--no-requeue" {
noRequeueExists = true
break
}
}
if !noRequeueExists {
slurmArgs = append(slurmArgs, "--no-requeue")
}

logrus.Debugf("Custom slurm arguments: %s", slurmArgs)
errList := ValidateSlurm(slurmArgs)
if len(errList) > 0 {
Expand Down
19 changes: 17 additions & 2 deletions master/pkg/tasks/dispatcher_task_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ func Test_ToDispatcherManifest(t *testing.T) {
containerRunType: "singularity",
slotType: device.CUDA,
Slurm: []string{"--want=slurmArgs", "--X=Y"},
wantSlurmArgs: []string{"--want=slurmArgs", "--X=Y"},
wantSlurmArgs: []string{"--want=slurmArgs", "--X=Y", "--no-requeue"},
},
{
name: "Test custom pbsArgs",
Expand Down Expand Up @@ -622,6 +622,21 @@ func Test_ToDispatcherManifest(t *testing.T) {
Mounts: []mount.Mount{{Source: varTmp, Target: varTmp}},
wantData: []launcher.Data{{Target: &varTmpLocation}},
},
{
name: "Invalid Slurm Option --requeue",
containerRunType: "singularity",
slotType: device.CUDA,
Slurm: []string{"--requeue"},
wantErr: true,
errorContains: "is not configurable",
},
{
name: "Existing Slurm Option --no-requeue",
containerRunType: "singularity",
slotType: device.CUDA,
Slurm: []string{"--no-requeue"},
wantSlurmArgs: []string{"--no-requeue"},
},
}

for _, tt := range tests {
Expand Down Expand Up @@ -700,7 +715,7 @@ func Test_ToDispatcherManifest(t *testing.T) {
if len(tt.wantSlurmArgs) > 0 {
assert.DeepEqual(t, customs["slurmArgs"], tt.wantSlurmArgs)
} else {
assert.Assert(t, customs["slurmArgs"] == nil)
assert.DeepEqual(t, customs["slurmArgs"], []string{"--no-requeue"})
}

if len(tt.wantPbsArgs) > 0 {
Expand Down
1 change: 1 addition & 0 deletions master/pkg/tasks/dispatcher_verification.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ func ValidateSlurm(slurmOptions []string) []error {
"--error=", "-e",
"--output=", "-o",
"--partition=", "-p",
"--requeue",
}
errors := validateWlmOptions(wlmSlurm, slurmOptions, forbiddenArgs)

Expand Down

0 comments on commit 1a99941

Please sign in to comment.