Skip to content

Commit

Permalink
add tensorboard_gc.go
Browse files Browse the repository at this point in the history
  • Loading branch information
Aaron Amanuel committed Oct 30, 2023
1 parent f24748c commit 206ceb4
Show file tree
Hide file tree
Showing 8 changed files with 348 additions and 4 deletions.
17 changes: 17 additions & 0 deletions harness/determined/cli/tensorboard.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,15 @@ def open_tensorboard(args: Namespace) -> None:
)


@authentication.required
def delete_tensorboard(args: Namespace) -> None:
body = bindings.v1DeleteTensorboardRequest(
experimentId=args.experiment_id,
)

bindings.post_DeleteTensorboard(cli.setup_session(args), body=body)


args_description: ArgsDescription = [
Cmd(
"tensorboard",
Expand Down Expand Up @@ -206,6 +215,14 @@ def open_tensorboard(args: Namespace) -> None:
),
],
),
Cmd(
"delete",
delete_tensorboard,
"delete TensorBoard files associate with the proived experiment ID",
[
Arg("experiment_id", type=int, help="Experiment ID"),
],
),
],
)
]
48 changes: 48 additions & 0 deletions harness/determined/common/api/bindings.py

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

42 changes: 42 additions & 0 deletions master/internal/api_tensorboard.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ import (
"google.golang.org/grpc/status"

petname "github.com/dustinkirkland/golang-petname"
"github.com/google/uuid"
log "github.com/sirupsen/logrus"

"github.com/pkg/errors"

Expand All @@ -31,6 +33,8 @@ import (
"github.com/determined-ai/determined/master/internal/grpcutil"
"github.com/determined-ai/determined/master/internal/rbac/audit"
"github.com/determined-ai/determined/master/internal/trials"
"github.com/determined-ai/determined/master/internal/user"
"github.com/determined-ai/determined/master/internal/workspace"
"github.com/determined-ai/determined/master/pkg/actor"
"github.com/determined-ai/determined/master/pkg/archive"
"github.com/determined-ai/determined/master/pkg/check"
Expand Down Expand Up @@ -524,3 +528,41 @@ func (a *apiServer) getTensorBoardConfigsFromReq(

return configs, nil
}

func (a *apiServer) DeleteTensorboard(
ctx context.Context, req *apiv1.DeleteTensorboardRequest,
) (resp *apiv1.DeleteTensorboardResponse, err error) {
curUser, _, err := grpcutil.GetUser(ctx)
if err != nil {
return nil, err
}

exp, err := db.ExperimentByID(context.TODO(), int(req.ExperimentId))
if err != nil {
return nil, err
}

workspaceID, err := workspace.WorkspacesIDsByExperimentIDs(ctx, []int{exp.ID})
if err != nil {
return nil, err
}
agentUserGroup, err := user.GetAgentUserGroup(context.TODO(), *exp.OwnerID, workspaceID[0])
if err != nil {
return nil, err
}

var uuidList []uuid.UUID

err = runCheckpointGCTask(
a.m.rm, a.m.db, model.NewTaskID(), exp.JobID, exp.StartTime, *a.m.taskSpec, exp.ID,
exp.Config, uuidList, nil, true, agentUserGroup, curUser,
nil,
)

if err != nil {
log.WithError(err).Errorf("failed to gc tensorboard for experiment: %d", exp.ID)
return nil, err
}

return &apiv1.DeleteTensorboardResponse{}, nil
}
116 changes: 116 additions & 0 deletions master/internal/tensorboard_gc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
package internal

import (
"fmt"
"time"

"github.com/pkg/errors"
"github.com/sirupsen/logrus"

"github.com/determined-ai/determined/master/internal/config"
"github.com/determined-ai/determined/master/internal/db"
"github.com/determined-ai/determined/master/internal/rm"
"github.com/determined-ai/determined/master/internal/rm/tasklist"
"github.com/determined-ai/determined/master/internal/sproto"
"github.com/determined-ai/determined/master/internal/task"
"github.com/determined-ai/determined/master/pkg/logger"
"github.com/determined-ai/determined/master/pkg/model"
"github.com/determined-ai/determined/master/pkg/schemas/expconf"
"github.com/determined-ai/determined/master/pkg/tasks"
)

func runTensorboardGCTask(
rm rm.ResourceManager,
db *db.PgDB,
taskID model.TaskID,
jobID model.JobID,
jobSubmissionTime time.Time,
taskSpec tasks.TaskSpec,
expID int,
legacyConfig expconf.LegacyConfig,
checkpointGlobs []string,
deleteTensorboards bool,
agentUserGroup *model.AgentUserGroup,
owner *model.User,
logCtx logger.Context,
) error {

if !deleteTensorboards {
return nil
}

rp, err := rm.ResolveResourcePool("", -1, 0)
if err != nil {
return fmt.Errorf("resolving resource pool: %w", err)
}

// t.Base is just a shallow copy of the m.taskSpec on the master, so
// use caution when mutating it.
tcd, err := rm.TaskContainerDefaults(
rp,
config.GetMasterConfig().TaskContainerDefaults)
if err != nil {
return fmt.Errorf("creating task container defaults: %v", err)
}
taskSpec.TaskContainerDefaults = tcd

taskSpec.AgentUserGroup = agentUserGroup
taskSpec.Owner = owner

gcSpec := tasks.GCCkptSpec{
Base: taskSpec,
ExperimentID: expID,
LegacyConfig: legacyConfig,
CheckpointGlobs: checkpointGlobs,
DeleteTensorboards: deleteTensorboards,
}

logCtx = logger.MergeContexts(logCtx, logger.Context{
"task-id": taskID,
"task-type": model.TaskTypeTensorboard,
})
syslog := logrus.WithField("component", "tensorboardgc").WithFields(logCtx.Fields())

if err := db.AddTask(&model.Task{
TaskID: taskID,
TaskType: model.TaskTypeTensorboard,
StartTime: time.Now().UTC(),
JobID: &jobID,
LogVersion: model.CurrentTaskLogVersion,
}); err != nil {
return errors.Wrapf(err, "persisting GC task %s", taskID)
}

allocationID := model.AllocationID(fmt.Sprintf("%s.%d", taskID, 1))
gcJobID := model.JobID(fmt.Sprintf("tensorboard_gc-%s", allocationID))

resultChan := make(chan error, 1)
onExit := func(ae *task.AllocationExited) {
if err := db.CompleteTask(taskID, time.Now().UTC()); err != nil {
syslog.WithError(err).Error("marking GC task complete")
}
if err := tasklist.GroupPriorityChangeRegistry.Delete(gcJobID); err != nil {
syslog.WithError(err).Error("deleting group priority change registry")
}
resultChan <- ae.Err
}

if err := tasklist.GroupPriorityChangeRegistry.Add(gcJobID, nil); err != nil {
return err
}
err = task.DefaultService.StartAllocation(logCtx, sproto.AllocateRequest{
TaskID: taskID,
JobID: gcJobID,
JobSubmissionTime: jobSubmissionTime,
AllocationID: allocationID,
Name: fmt.Sprintf("Tensorboard GC (Experiment %d)", expID),
FittingRequirements: sproto.FittingRequirements{
SingleAgent: true,
},
ResourcePool: rp,
}, db, rm, gcSpec, onExit)
if err != nil {
return err
}
return <-resultChan
}
11 changes: 7 additions & 4 deletions master/pkg/tasks/task_gc.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,13 @@ import (
"strconv"
"strings"

"github.com/docker/docker/api/types/mount"

"github.com/determined-ai/determined/master/pkg/archive"
"github.com/determined-ai/determined/master/pkg/cproto"
"github.com/determined-ai/determined/master/pkg/etc"
"github.com/determined-ai/determined/master/pkg/model"
"github.com/determined-ai/determined/master/pkg/schemas"
"github.com/determined-ai/determined/master/pkg/schemas/expconf"
"github.com/docker/docker/api/types/mount"
)

// GCCkptSpec is a description of a task for running checkpoint GC.
Expand Down Expand Up @@ -106,9 +105,13 @@ func (g GCCkptSpec) ToTaskSpec() TaskSpec {
"--experiment-id",
strconv.Itoa(g.ExperimentID),
"--storage-config", fmt.Sprintf("/run/determined/%s", storageConfigPath),
"--delete", fmt.Sprintf("/run/determined/%s", checkpointsToDeletePath),
"--globs", fmt.Sprintf("/run/determined/%s", checkpointsGlobsPath),
}

if len(g.ToDelete) > 0 {
res.Entrypoint = append(res.Entrypoint, "--delete", fmt.Sprintf("/run/determined/%s", checkpointsToDeletePath))
res.Entrypoint = append(res.Entrypoint, "--globs", fmt.Sprintf("/run/determined/%s", checkpointsGlobsPath))
}

if g.DeleteTensorboards {
res.Entrypoint = append(res.Entrypoint, "--delete-tensorboards")
}
Expand Down
11 changes: 11 additions & 0 deletions proto/src/determined/api/v1/api.proto
Original file line number Diff line number Diff line change
Expand Up @@ -1498,6 +1498,17 @@ service Determined {
tags: "Tensorboards"
};
}
// Delete tensorboard files.
rpc DeleteTensorboard(DeleteTensorboardRequest)
returns (DeleteTensorboardResponse) {
option (google.api.http) = {
post: "/api/v1/tensorboards/delete"
body: "*"
};
option (grpc.gateway.protoc_gen_swagger.options.openapiv2_operation) = {
tags: "Tensorboards"
};
}

// Get a count of active tasks.
rpc GetActiveTasksCount(GetActiveTasksCountRequest)
Expand Down
8 changes: 8 additions & 0 deletions proto/src/determined/api/v1/tensorboard.proto
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,11 @@ message LaunchTensorboardResponse {
// List of any related warnings.
repeated LaunchWarning warnings = 3;
}

// Request to delete a tensorboard.
message DeleteTensorboardRequest {
// ID of experiment that the tensorboard is linked to.
int32 experiment_id = 1;
}
// Response to DeleteTensorboardRequest.
message DeleteTensorboardResponse {}
Loading

0 comments on commit 206ceb4

Please sign in to comment.