-
Notifications
You must be signed in to change notification settings - Fork 356
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Aaron Amanuel
committed
Oct 30, 2023
1 parent
f24748c
commit 206ceb4
Showing
8 changed files
with
348 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
package internal | ||
|
||
import ( | ||
"fmt" | ||
"time" | ||
|
||
"github.com/pkg/errors" | ||
"github.com/sirupsen/logrus" | ||
|
||
"github.com/determined-ai/determined/master/internal/config" | ||
"github.com/determined-ai/determined/master/internal/db" | ||
"github.com/determined-ai/determined/master/internal/rm" | ||
"github.com/determined-ai/determined/master/internal/rm/tasklist" | ||
"github.com/determined-ai/determined/master/internal/sproto" | ||
"github.com/determined-ai/determined/master/internal/task" | ||
"github.com/determined-ai/determined/master/pkg/logger" | ||
"github.com/determined-ai/determined/master/pkg/model" | ||
"github.com/determined-ai/determined/master/pkg/schemas/expconf" | ||
"github.com/determined-ai/determined/master/pkg/tasks" | ||
) | ||
|
||
func runTensorboardGCTask( | ||
rm rm.ResourceManager, | ||
db *db.PgDB, | ||
taskID model.TaskID, | ||
jobID model.JobID, | ||
jobSubmissionTime time.Time, | ||
taskSpec tasks.TaskSpec, | ||
expID int, | ||
legacyConfig expconf.LegacyConfig, | ||
checkpointGlobs []string, | ||
deleteTensorboards bool, | ||
agentUserGroup *model.AgentUserGroup, | ||
owner *model.User, | ||
logCtx logger.Context, | ||
) error { | ||
|
||
if !deleteTensorboards { | ||
return nil | ||
} | ||
|
||
rp, err := rm.ResolveResourcePool("", -1, 0) | ||
if err != nil { | ||
return fmt.Errorf("resolving resource pool: %w", err) | ||
} | ||
|
||
// t.Base is just a shallow copy of the m.taskSpec on the master, so | ||
// use caution when mutating it. | ||
tcd, err := rm.TaskContainerDefaults( | ||
rp, | ||
config.GetMasterConfig().TaskContainerDefaults) | ||
if err != nil { | ||
return fmt.Errorf("creating task container defaults: %v", err) | ||
} | ||
taskSpec.TaskContainerDefaults = tcd | ||
|
||
taskSpec.AgentUserGroup = agentUserGroup | ||
taskSpec.Owner = owner | ||
|
||
gcSpec := tasks.GCCkptSpec{ | ||
Base: taskSpec, | ||
ExperimentID: expID, | ||
LegacyConfig: legacyConfig, | ||
CheckpointGlobs: checkpointGlobs, | ||
DeleteTensorboards: deleteTensorboards, | ||
} | ||
|
||
logCtx = logger.MergeContexts(logCtx, logger.Context{ | ||
"task-id": taskID, | ||
"task-type": model.TaskTypeTensorboard, | ||
}) | ||
syslog := logrus.WithField("component", "tensorboardgc").WithFields(logCtx.Fields()) | ||
|
||
if err := db.AddTask(&model.Task{ | ||
TaskID: taskID, | ||
TaskType: model.TaskTypeTensorboard, | ||
StartTime: time.Now().UTC(), | ||
JobID: &jobID, | ||
LogVersion: model.CurrentTaskLogVersion, | ||
}); err != nil { | ||
return errors.Wrapf(err, "persisting GC task %s", taskID) | ||
} | ||
|
||
allocationID := model.AllocationID(fmt.Sprintf("%s.%d", taskID, 1)) | ||
gcJobID := model.JobID(fmt.Sprintf("tensorboard_gc-%s", allocationID)) | ||
|
||
resultChan := make(chan error, 1) | ||
onExit := func(ae *task.AllocationExited) { | ||
if err := db.CompleteTask(taskID, time.Now().UTC()); err != nil { | ||
syslog.WithError(err).Error("marking GC task complete") | ||
} | ||
if err := tasklist.GroupPriorityChangeRegistry.Delete(gcJobID); err != nil { | ||
syslog.WithError(err).Error("deleting group priority change registry") | ||
} | ||
resultChan <- ae.Err | ||
} | ||
|
||
if err := tasklist.GroupPriorityChangeRegistry.Add(gcJobID, nil); err != nil { | ||
return err | ||
} | ||
err = task.DefaultService.StartAllocation(logCtx, sproto.AllocateRequest{ | ||
TaskID: taskID, | ||
JobID: gcJobID, | ||
JobSubmissionTime: jobSubmissionTime, | ||
AllocationID: allocationID, | ||
Name: fmt.Sprintf("Tensorboard GC (Experiment %d)", expID), | ||
FittingRequirements: sproto.FittingRequirements{ | ||
SingleAgent: true, | ||
}, | ||
ResourcePool: rp, | ||
}, db, rm, gcSpec, onExit) | ||
if err != nil { | ||
return err | ||
} | ||
return <-resultChan | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.