Skip to content

Commit

Permalink
fix: Drop incorrect max_slots support code [FOUNDENG-454] (#691)
Browse files Browse the repository at this point in the history
The expconf resources.max_slots attribute is documented as managed by
Slurm, but there was a partial (non-functional) implementation in
DispatchRM as well. If we ever submitted max_slots' worth of work,
the remainder would remain QUEUED forever. I attempted to fix
the existing support, but the obvious fixes were not sufficient. Since
this attribute is already documented as managed by Slurm, simply remove
the partial support and let the workload managers handle it.
  • Loading branch information
jerryharrow authored and determined-ci committed Mar 12, 2024
1 parent 1944b5d commit e058de2
Show file tree
Hide file tree
Showing 2 changed files with 0 additions and 20 deletions.
18 changes: 0 additions & 18 deletions master/internal/rm/dispatcherrm/dispatcher_resource_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -261,7 +261,6 @@ type dispatcherResourceManager struct {
reqList *tasklist.TaskList
groups map[*actor.Ref]*tasklist.Group
dispatchIDToAllocationID map[string]model.AllocationID
slotsUsedPerGroup map[*tasklist.Group]int
masterTLSConfig model.TLSClientConfig
loggingConfig model.LoggingConfig
jobWatcher *launcherMonitor
Expand Down Expand Up @@ -316,7 +315,6 @@ func newDispatcherResourceManager(
reqList: tasklist.New(),
groups: make(map[*actor.Ref]*tasklist.Group),
dispatchIDToAllocationID: make(map[string]model.AllocationID),
slotsUsedPerGroup: make(map[*tasklist.Group]int),

masterTLSConfig: masterTLSConfig,
loggingConfig: loggingConfig,
Expand Down Expand Up @@ -849,7 +847,6 @@ func (m *dispatcherResourceManager) receiveRequestMsg(ctx *actor.Context) error
m.getOrCreateGroup(ctx, msg.Handler).MaxSlots = msg.MaxSlots

case tasklist.GroupActorStopped:
delete(m.slotsUsedPerGroup, m.groups[msg.Ref])
delete(m.groups, msg.Ref)

case sproto.SetAllocationName:
Expand Down Expand Up @@ -1729,8 +1726,6 @@ func (m *dispatcherResourceManager) assignResources(
}
}

m.slotsUsedPerGroup[m.groups[req.Group]] += req.SlotsNeeded

if len(rID) == 0 {
rID = sproto.ResourcesID(uuid.NewString())
}
Expand Down Expand Up @@ -1781,12 +1776,6 @@ func (m *dispatcherResourceManager) assignResources(
func (m *dispatcherResourceManager) resourcesReleased(ctx *actor.Context, handler *actor.Ref) {
ctx.Log().Infof("resources are released for %s", handler.Address())
m.reqList.RemoveTaskByHandler(handler)

if req, ok := m.reqList.TaskByHandler(handler); ok {
if group := m.groups[handler]; group != nil {
m.slotsUsedPerGroup[group] -= req.SlotsNeeded
}
}
}

// Used on DEBUG startup, to queue a terminate and delete all dispatches in the DB
Expand Down Expand Up @@ -1834,7 +1823,6 @@ func (m *dispatcherResourceManager) getOrCreateGroup(
priority := config.KubernetesDefaultPriority
g := &tasklist.Group{Handler: handler, Weight: 1, Priority: &priority}
m.groups[handler] = g
m.slotsUsedPerGroup[g] = 0

if ctx != nil && handler != nil { // ctx is nil only for testing purposes.
actors.NotifyOnStop(ctx, handler, tasklist.GroupActorStopped{})
Expand All @@ -1845,14 +1833,8 @@ func (m *dispatcherResourceManager) getOrCreateGroup(
func (m *dispatcherResourceManager) schedulePendingTasks(ctx *actor.Context) {
for it := m.reqList.Iterator(); it.Next(); {
req := it.Value()
group := m.groups[req.Group]
assigned := m.reqList.Allocation(req.AllocationRef)
if !tasklist.AssignmentIsScheduled(assigned) {
if maxSlots := group.MaxSlots; maxSlots != nil {
if m.slotsUsedPerGroup[group]+req.SlotsNeeded > *maxSlots {
continue
}
}
m.assignResources(ctx, req)
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -464,7 +464,6 @@ func Test_dispatcherResourceManager_selectDefaultPools(t *testing.T) {
hpcResourcesManifest *launcher.Manifest
reqList *tasklist.TaskList
groups map[*actor.Ref]*tasklist.Group
slotsUsedPerGroup map[*tasklist.Group]int
dispatchIDToAllocationID map[string]model.AllocationID
masterTLSConfig model.TLSClientConfig
loggingConfig model.LoggingConfig
Expand Down Expand Up @@ -574,7 +573,6 @@ func Test_dispatcherResourceManager_selectDefaultPools(t *testing.T) {
hpcResourcesManifest: tt.fields.hpcResourcesManifest,
reqList: tt.fields.reqList,
groups: tt.fields.groups,
slotsUsedPerGroup: tt.fields.slotsUsedPerGroup,
dispatchIDToAllocationID: tt.fields.dispatchIDToAllocationID,
masterTLSConfig: tt.fields.masterTLSConfig,
loggingConfig: tt.fields.loggingConfig,
Expand Down

0 comments on commit e058de2

Please sign in to comment.