Skip to content

Commit

Permalink
chore: Revert changes in #723 (#744)
Browse files Browse the repository at this point in the history
  • Loading branch information
jagadeesh545 authored and determined-ci committed Apr 18, 2024
1 parent 44216c3 commit 8db94b4
Show file tree
Hide file tree
Showing 4 changed files with 22 additions and 19 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ const (
pbsSchedulerType = "pbs"
slurmResourcesCarrier = "com.cray.analytics.capsules.carriers.hpc.slurm.SlurmResources"
pbsResourcesCarrier = "com.cray.analytics.capsules.carriers.hpc.pbs.PbsResources"
- launcherMinimumVersion = "3.2.3"
+ launcherMinimumVersion = "3.2.4"
root = "root"
)

Expand Down
15 changes: 0 additions & 15 deletions master/pkg/tasks/dispatcher_task.go
Original file line number Diff line number Diff line change
Expand Up @@ -219,21 +219,6 @@ func (t *TaskSpec) ToDispatcherManifest(
slurmArgs = append(slurmArgs, t.TaskContainerDefaults.Slurm.SbatchArgs()...)
slurmArgs = append(slurmArgs, t.SlurmConfig.SbatchArgs()...)

- // SLURM can requeue a job if there are node level settings to specify it to do so.
- // So, we have to explicitly specify NO_REQUEUE option to disable the requeueing of slurm jobs.
- // Determined will manage the failed/preempted experiments by itself.
- // In case, the user has already provided the NO_REQUEUE option, skip this step.
- noRequeueExists := false
- for _, arg := range slurmArgs {
- if arg == "--no-requeue" {
- noRequeueExists = true
- break
- }
- }
- if !noRequeueExists {
- slurmArgs = append(slurmArgs, "--no-requeue")
- }

logrus.Debugf("Custom slurm arguments: %s", slurmArgs)
errList := ValidateSlurm(slurmArgs)
if len(errList) > 0 {
Expand Down
23 changes: 20 additions & 3 deletions master/pkg/tasks/dispatcher_task_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -583,7 +583,7 @@ func Test_ToDispatcherManifest(t *testing.T) {
containerRunType: "singularity",
slotType: device.CUDA,
Slurm: []string{"--want=slurmArgs", "--X=Y"},
- wantSlurmArgs: []string{"--want=slurmArgs", "--X=Y", "--no-requeue"},
+ wantSlurmArgs: []string{"--want=slurmArgs", "--X=Y"},
},
{
name: "Test custom pbsArgs",
Expand Down Expand Up @@ -635,7 +635,24 @@ func Test_ToDispatcherManifest(t *testing.T) {
containerRunType: "singularity",
slotType: device.CUDA,
Slurm: []string{"--no-requeue"},
- wantSlurmArgs: []string{"--no-requeue"},
+ wantErr: true,
+ errorContains: "is not configurable",
},
+ {
+ name: "Invalid PBS Option -r",
+ containerRunType: "singularity",
+ slotType: device.CUDA,
+ Pbs: []string{"-r"},
+ wantErr: true,
+ errorContains: "is not configurable",
+ },
+ {
+ name: "Existing PBS Option -r n",
+ containerRunType: "singularity",
+ slotType: device.CUDA,
+ Pbs: []string{"-r n"},
+ wantErr: true,
+ errorContains: "is not configurable",
+ },
}

Expand Down Expand Up @@ -715,7 +732,7 @@ func Test_ToDispatcherManifest(t *testing.T) {
if len(tt.wantSlurmArgs) > 0 {
assert.DeepEqual(t, customs["slurmArgs"], tt.wantSlurmArgs)
} else {
- assert.DeepEqual(t, customs["slurmArgs"], []string{"--no-requeue"})
+ assert.Assert(t, customs["slurmArgs"] == nil)
}

if len(tt.wantPbsArgs) > 0 {
Expand Down
1 change: 1 addition & 0 deletions master/pkg/tasks/dispatcher_verification.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ func ValidateSlurm(slurmOptions []string) []error {
"--error=", "-e",
"--output=", "-o",
"--partition=", "-p",
+ "--no-requeue",
"--requeue",
}
errors := validateWlmOptions(wlmSlurm, slurmOptions, forbiddenArgs)
Expand Down

0 comments on commit 8db94b4

Please sign in to comment.