Skip to content

Commit

Permalink
fix: FOUNDENG-508 Slurm GRES causes checkpoint GC task error: sbatch:…
Browse files Browse the repository at this point in the history
… error: Invalid numeric value "0" [DET-9014] (determined-ai#732)
  • Loading branch information
rcorujo authored and determined-ci committed Jun 21, 2023
1 parent 4220555 commit 350c1b3
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 5 deletions.
19 changes: 14 additions & 5 deletions master/internal/rm/dispatcherrm/dispatcher_resource_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -927,17 +927,26 @@ func (m *dispatcherResourceManager) startLauncherJob(
ctx *actor.Context,
msg StartDispatcherResources,
) {
var err error

req, ok := m.reqList.TaskByHandler(msg.TaskActor)
if !ok {
sendResourceStateChangedErrorResponse(ctx, errors.New("no such task"), msg,
"task not found in the task list")
}

slotType, err := m.resolveSlotType(ctx, req.ResourcePool)
if err != nil {
sendResourceStateChangedErrorResponse(ctx, err, msg,
"unable to access resource pool configuration")
return
slotType := device.CPU

// Only resolve the slot type if the number of slots requested is non-zero.
// Checkpoint GC tasks will always request zero slots and they should
// remain with a slot type of "CPU".
if req.SlotsNeeded > 0 {
slotType, err = m.resolveSlotType(ctx, req.ResourcePool)
if err != nil {
sendResourceStateChangedErrorResponse(ctx, err, msg,
"unable to access resource pool configuration")
return
}
}

// Make sure we explicitly choose a partition. Use default if unspecified.
Expand Down
10 changes: 10 additions & 0 deletions master/pkg/tasks/dispatcher_task.go
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,16 @@ func (t *TaskSpec) computeResources(tresSupported bool, numSlots int, slotType d
resources := launcher.NewResourceRequirementsWithDefaults()
switch {
case slotType == device.CPU:
// Checkpoint GC tasks will always request zero slots and have a device
// type of CPU. While we could simply check for a "t.TaskType" equal to
// "CHECKPOINT_GC", there may be other use cases where the number of
// requested slots is zero, so we check for that instead.
if numSlots == 0 {
numNodes = 1
effectiveSlotsPerNode = 1
haveSlotsPerNode = false
}

resources.SetInstances(map[string]int32{"nodes": int32(numNodes)})

if haveSlotsPerNode {
Expand Down
38 changes: 38 additions & 0 deletions master/pkg/tasks/dispatcher_task_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1106,6 +1106,7 @@ func TestTaskSpec_computeResources(t *testing.T) {
type fields struct {
SlurmConfig expconf.SlurmConfig
PbsConfig expconf.PbsConfig
TaskType model.TaskType
}
type args struct {
tresSupported bool
Expand Down Expand Up @@ -1405,12 +1406,49 @@ func TestTaskSpec_computeResources(t *testing.T) {
Instances: &map[string]int32{"nodes": int32(100)},
},
},
{
name: "Task Type CHECKPOINT_GC, Slot type CPU, gres supported, Slurm",
fields: fields{
SlurmConfig: slurmConfig,
TaskType: "CHECKPOINT_GC",
},
args: args{
tresSupported: false,
slotType: device.CPU,
numSlots: 0,
gresSupported: true,
isPbsLauncher: false,
},
wantResources: &launcher.ResourceRequirements{
Instances: &map[string]int32{"nodes": 1},
Cores: &map[string]float32{"per-node": 1},
},
},
{
name: "Task Type CHECKPOINT_GC, Slot type CPU, gres supported, PBS",
fields: fields{
SlurmConfig: slurmConfig,
TaskType: "CHECKPOINT_GC",
},
args: args{
tresSupported: false,
slotType: device.CPU,
numSlots: 0,
gresSupported: true,
isPbsLauncher: true,
},
wantResources: &launcher.ResourceRequirements{
Instances: &map[string]int32{"nodes": 1},
Cores: &map[string]float32{"per-node": 1},
},
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
tr := &TaskSpec{
SlurmConfig: tt.fields.SlurmConfig,
PbsConfig: tt.fields.PbsConfig,
TaskType: tt.fields.TaskType,
}
got := tr.computeResources(tt.args.tresSupported, tt.args.numSlots, tt.args.slotType,
tt.args.gresSupported, tt.args.isPbsLauncher)
Expand Down

0 comments on commit 350c1b3

Please sign in to comment.