Skip to content

Commit

Permalink
feat: add prometheus metrics (#459)
Browse files Browse the repository at this point in the history
  • Loading branch information
garethgeorge authored Sep 10, 2024
1 parent 822ec35 commit daacf28
Show file tree
Hide file tree
Showing 16 changed files with 135 additions and 1 deletion.
2 changes: 2 additions & 0 deletions cmd/backrest/backrest.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"github.com/garethgeorge/backrest/internal/config"
"github.com/garethgeorge/backrest/internal/env"
"github.com/garethgeorge/backrest/internal/logwriter"
"github.com/garethgeorge/backrest/internal/metric"
"github.com/garethgeorge/backrest/internal/oplog"
"github.com/garethgeorge/backrest/internal/oplog/bboltstore"
"github.com/garethgeorge/backrest/internal/orchestrator"
Expand Down Expand Up @@ -116,6 +117,7 @@ func main() {
mux.Handle(backrestHandlerPath, auth.RequireAuthentication(backrestHandler, authenticator))
mux.Handle("/", webui.Handler())
mux.Handle("/download/", http.StripPrefix("/download", api.NewDownloadHandler(oplog)))
mux.Handle("/metrics", auth.RequireAuthentication(metric.GetRegistry().Handler(), authenticator))

// Serve the HTTP gateway
server := &http.Server{
Expand Down
8 changes: 8 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ require (

require (
github.com/akavel/rsrc v0.10.2 // indirect
github.com/beorn7/perks v1.0.1 // indirect
github.com/cespare/xxhash/v2 v2.3.0 // indirect
github.com/dchest/jsmin v0.0.0-20220218165748-59f39799265f // indirect
github.com/fatih/color v1.17.0 // indirect
github.com/getlantern/context v0.0.0-20220418194847-3d5e7a086201 // indirect
Expand All @@ -45,8 +47,14 @@ require (
github.com/go-stack/stack v1.8.1 // indirect
github.com/hashicorp/errwrap v1.1.0 // indirect
github.com/josephspurrier/goversioninfo v1.4.0 // indirect
github.com/klauspost/compress v1.17.9 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c // indirect
github.com/prometheus/client_golang v1.20.3 // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/randall77/makefat v0.0.0-20210315173500-7ddd0e42c844 // indirect
go.opentelemetry.io/otel v1.27.0 // indirect
go.opentelemetry.io/otel/metric v1.27.0 // indirect
Expand Down
16 changes: 16 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ github.com/akavel/rsrc v0.10.2/go.mod h1:uLoCtb9J+EyAqh+26kdrTgmzRBFPGOolLWKpdxk
github.com/alessio/shellescape v1.4.2 h1:MHPfaU+ddJ0/bYWpgIeUnQUqKrlJ1S7BfEYPM4uEoM0=
github.com/alessio/shellescape v1.4.2/go.mod h1:PZAiSCk0LJaZkiCSkPv8qIobYglO3FPpyFjDCtHLS30=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM=
github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
github.com/containrrr/shoutrrr v0.8.0 h1:mfG2ATzIS7NR2Ec6XL+xyoHzN97H8WPjir8aYzJUSec=
github.com/containrrr/shoutrrr v0.8.0/go.mod h1:ioyQAyu1LJY6sILuNyKaQaw+9Ttik5QePU8atnAdO2o=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
Expand Down Expand Up @@ -77,6 +81,8 @@ github.com/jarcoal/httpmock v1.3.0 h1:2RJ8GP0IIaWwcC9Fp2BmVi8Kog3v2Hn7VXM3fTd+nu
github.com/jarcoal/httpmock v1.3.0/go.mod h1:3yb8rc4BI7TCBhFY8ng0gjuLKJNquuDNiPaZjnENuYg=
github.com/josephspurrier/goversioninfo v1.4.0 h1:Puhl12NSHUSALHSuzYwPYQkqa2E1+7SrtAPJorKK0C8=
github.com/josephspurrier/goversioninfo v1.4.0/go.mod h1:JWzv5rKQr+MmW+LvM412ToT/IkYDZjaclF2pKDss8IY=
github.com/klauspost/compress v1.17.9 h1:6KIumPrER1LHsvBVuDa0r5xaG0Es51mhhB9BQB2qeMA=
github.com/klauspost/compress v1.17.9/go.mod h1:Di0epgTjJY877eYKx5yC51cX2A2Vl2ibi7bDH9ttBbw=
github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
Expand All @@ -87,6 +93,8 @@ github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovk
github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA=
github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ=
github.com/natefinch/atomic v1.0.1 h1:ZPYKxkqQOx3KZ+RsbnP/YsgvxWQPGxjC0oBt2AhwV0A=
github.com/natefinch/atomic v1.0.1/go.mod h1:N/D/ELrljoqDyT3rZrsUmtsuzvHkeB/wWjHV22AZRbM=
github.com/ncruces/zenity v0.10.12 h1:o4SErDa0kQijlqG6W4OYYzO6kA0fGu34uegvJGcMLBI=
Expand All @@ -100,6 +108,14 @@ github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c/go.mod h1:X07ZCGwU
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_golang v1.20.3 h1:oPksm4K8B+Vt35tUhw6GbSNSgVlVSBH0qELP/7u83l4=
github.com/prometheus/client_golang v1.20.3/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY=
github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc=
github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/randall77/makefat v0.0.0-20210315173500-7ddd0e42c844 h1:GranzK4hv1/pqTIhMTXt2X8MmMOuH3hMeUR0o9SP5yc=
github.com/randall77/makefat v0.0.0-20210315173500-7ddd0e42c844/go.mod h1:T1TLSfyWVBRXVGzWd0o9BI4kfoO9InEgfQe4NV3mLz8=
github.com/skratchdot/open-golang v0.0.0-20200116055534-eef842397966/go.mod h1:sUM3LWHvSMaG192sy56D9F7CNvL7jUJVXoqM1QKLnog=
Expand Down
1 change: 1 addition & 0 deletions internal/hook/hook.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ func newOneoffRunHookTask(title, instanceID, repoID, planID string, parentOp *v1
return &tasks.GenericOneoffTask{
OneoffTask: tasks.OneoffTask{
BaseTask: tasks.BaseTask{
TaskType: "hook",
TaskName: fmt.Sprintf("run hook %v", title),
TaskRepoID: repoID,
TaskPlanID: planID,
Expand Down
84 changes: 84 additions & 0 deletions internal/metric/metric.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
package metric

import (
"net/http"
"slices"

"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
)

// globalRegistry is the package-wide metrics registry. It is built once, by
// initRegistry, when the package is initialized.
var globalRegistry = initRegistry()

// initRegistry builds the Registry used by this package. It creates a
// dedicated Prometheus registry, constructs every collector and registers
// all of them with that registry.
//
// Registration goes through MustRegister, so a collector that cannot be
// registered causes a panic rather than a returned error.
func initRegistry() *Registry {
	// Labels shared by every metric.
	baseLabels := []string{"repo_id", "plan_id"}
	// Clone before appending so the derived label sets never share a backing
	// array with baseLabels or with each other.
	durationLabels := append(slices.Clone(baseLabels), "task_type")
	runLabels := append(slices.Clone(baseLabels), "task_type", "status")

	r := &Registry{reg: prometheus.NewRegistry()}

	// Per-backup summaries.
	r.backupBytesProcessed = prometheus.NewSummaryVec(prometheus.SummaryOpts{
		Name: "backrest_backup_bytes_processed",
		Help: "The total number of bytes processed during a backup",
	}, baseLabels)
	r.backupBytesAdded = prometheus.NewSummaryVec(prometheus.SummaryOpts{
		Name: "backrest_backup_bytes_added",
		Help: "The total number of bytes added during a backup",
	}, baseLabels)
	r.backupFileWarnings = prometheus.NewSummaryVec(prometheus.SummaryOpts{
		Name: "backrest_backup_file_warnings",
		Help: "The total number of file warnings during a backup",
	}, baseLabels)

	// Per-task metrics.
	r.tasksDuration = prometheus.NewSummaryVec(prometheus.SummaryOpts{
		Name: "backrest_tasks_duration_secs",
		Help: "The duration of a task in seconds",
	}, durationLabels)
	r.tasksRun = prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "backrest_tasks_run_total",
		Help: "The total number of tasks run",
	}, runLabels)

	// Collectors are registered in the same order as before.
	r.reg.MustRegister(
		r.backupBytesProcessed,
		r.backupBytesAdded,
		r.backupFileWarnings,
		r.tasksDuration,
		r.tasksRun,
	)

	return r
}

// GetRegistry returns the package-wide metrics Registry. Every call returns
// the same instance.
func GetRegistry() *Registry {
	return globalRegistry
}

// Registry bundles a Prometheus registry with the collectors that backrest
// records into.
type Registry struct {
	// reg is the Prometheus registry the collectors below are registered with.
	reg *prometheus.Registry

	// backupBytesProcessed observes the bytes processed by each backup.
	backupBytesProcessed *prometheus.SummaryVec
	// backupBytesAdded observes the bytes added by each backup.
	backupBytesAdded *prometheus.SummaryVec
	// backupFileWarnings observes the number of file warnings of each backup.
	backupFileWarnings *prometheus.SummaryVec

	// tasksDuration observes how long each task took, in seconds.
	tasksDuration *prometheus.SummaryVec
	// tasksRun counts the tasks that were run.
	tasksRun *prometheus.CounterVec
}

// Handler returns an http.Handler that serves the metrics held by this
// Registry's Prometheus registry, using zero-valued promhttp handler options.
func (r *Registry) Handler() http.Handler {
	var opts promhttp.HandlerOpts
	return promhttp.HandlerFor(r.reg, opts)
}

// RecordTaskRun records one completed task run: it increments the task
// counter for (repoID, planID, taskType, status) and observes durationSecs,
// the task's duration in seconds, for (repoID, planID, taskType).
//
// An empty repoID or planID is reported under the "_unassociated_" label
// value, so tasks that are not tied to a repo or plan still get a non-empty
// label.
func (r *Registry) RecordTaskRun(repoID, planID, taskType string, durationSecs float64, status string) {
	// Placeholder label value for tasks without a repo or plan.
	const unassociated = "_unassociated_"

	if repoID == "" {
		repoID = unassociated
	}
	if planID == "" {
		planID = unassociated
	}

	r.tasksRun.WithLabelValues(repoID, planID, taskType, status).Inc()
	r.tasksDuration.WithLabelValues(repoID, planID, taskType).Observe(durationSecs)
}

// RecordBackupSummary records the outcome of a single backup for the given
// repo and plan: the number of bytes processed, the number of bytes added,
// and the number of file warnings encountered.
//
// As in RecordTaskRun, an empty repoID or planID is reported under the
// "_unassociated_" label value, so both groups of metrics use the same label
// values for the same task.
func (r *Registry) RecordBackupSummary(repoID, planID string, bytesProcessed, bytesAdded, fileWarnings int64) {
	// Placeholder label value for backups without a repo or plan.
	const unassociated = "_unassociated_"

	if repoID == "" {
		repoID = unassociated
	}
	if planID == "" {
		planID = unassociated
	}

	r.backupBytesProcessed.WithLabelValues(repoID, planID).Observe(float64(bytesProcessed))
	r.backupBytesAdded.WithLabelValues(repoID, planID).Observe(float64(bytesAdded))
	r.backupFileWarnings.WithLabelValues(repoID, planID).Observe(float64(fileWarnings))
}
2 changes: 2 additions & 0 deletions internal/orchestrator/orchestrator.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
v1 "github.com/garethgeorge/backrest/gen/go/v1"
"github.com/garethgeorge/backrest/internal/config"
"github.com/garethgeorge/backrest/internal/logwriter"
"github.com/garethgeorge/backrest/internal/metric"
"github.com/garethgeorge/backrest/internal/oplog"
"github.com/garethgeorge/backrest/internal/orchestrator/logging"
"github.com/garethgeorge/backrest/internal/orchestrator/repo"
Expand Down Expand Up @@ -426,6 +427,7 @@ func (o *Orchestrator) RunTask(ctx context.Context, st tasks.ScheduledTask) erro
runner.Logger(ctx).Error("task failed", zap.Error(err), zap.Duration("duration", time.Since(start)))
} else {
runner.Logger(ctx).Info("task finished", zap.Duration("duration", time.Since(start)))
metric.GetRegistry().RecordTaskRun(st.Task.RepoID(), st.Task.PlanID(), st.Task.Type(), time.Since(start).Seconds(), "success")
}

if op != nil {
Expand Down
8 changes: 7 additions & 1 deletion internal/orchestrator/tasks/task.go
Original file line number Diff line number Diff line change
Expand Up @@ -86,18 +86,24 @@ func (s ScheduledTask) Less(other ScheduledTask) bool {
// Task is a task that can be scheduled to run at a specific time.
type Task interface {
	// Name returns a human readable name for this task.
	Name() string
	// Type returns a simple string 'type' for this task.
	Type() string
	// Next returns the next scheduled task.
	Next(now time.Time, runner TaskRunner) (ScheduledTask, error)
	// Run runs the task.
	Run(ctx context.Context, st ScheduledTask, runner TaskRunner) error
	// PlanID returns the ID of the plan this task is associated with.
	PlanID() string
	// RepoID returns the ID of the repo this task is associated with.
	RepoID() string
}

// BaseTask holds the descriptive fields of a task and provides the matching
// accessor methods.
type BaseTask struct {
	TaskType   string // returned by Type
	TaskName   string // returned by Name
	TaskPlanID string // ID of the plan the task belongs to
	TaskRepoID string // ID of the repo the task belongs to
}

// Type returns the task's TaskType field.
func (b BaseTask) Type() string {
	return b.TaskType
}

// Name returns the task's TaskName field.
func (b BaseTask) Name() string {
	return b.TaskName
}
Expand Down Expand Up @@ -164,7 +170,7 @@ type testTaskRunner struct {

var _ TaskRunner = &testTaskRunner{}

func newTestTaskRunner(t testing.TB, config *v1.Config, oplog *oplog.OpLog) *testTaskRunner {
func newTestTaskRunner(_ testing.TB, config *v1.Config, oplog *oplog.OpLog) *testTaskRunner {
return &testTaskRunner{
config: config,
oplog: oplog,
Expand Down
7 changes: 7 additions & 0 deletions internal/orchestrator/tasks/taskbackup.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (
"time"

v1 "github.com/garethgeorge/backrest/gen/go/v1"
"github.com/garethgeorge/backrest/internal/metric"
"github.com/garethgeorge/backrest/internal/oplog"
"github.com/garethgeorge/backrest/internal/protoutil"
"github.com/garethgeorge/backrest/pkg/restic"
Expand All @@ -29,6 +30,7 @@ var _ Task = &BackupTask{}
func NewScheduledBackupTask(plan *v1.Plan) *BackupTask {
return &BackupTask{
BaseTask: BaseTask{
TaskType: "backup",
TaskName: fmt.Sprintf("backup for plan %q", plan.Id),
TaskRepoID: plan.Repo,
TaskPlanID: plan.Id,
Expand All @@ -39,6 +41,7 @@ func NewScheduledBackupTask(plan *v1.Plan) *BackupTask {
func NewOneoffBackupTask(plan *v1.Plan, at time.Time) *BackupTask {
return &BackupTask{
BaseTask: BaseTask{
TaskType: "backup",
TaskName: fmt.Sprintf("backup for plan %q", plan.Id),
TaskRepoID: plan.Repo,
TaskPlanID: plan.Id,
Expand Down Expand Up @@ -132,6 +135,7 @@ func (t *BackupTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunne
var sendWg sync.WaitGroup
lastSent := time.Now() // debounce progress updates, these can endup being very frequent.
var lastFiles []string
fileErrorCount := 0
summary, err := repo.Backup(ctx, plan, func(entry *restic.BackupProgressEntry) {
sendWg.Wait()
if entry.MessageType == "status" {
Expand All @@ -145,6 +149,7 @@ func (t *BackupTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunne
backupOp.OperationBackup.LastStatus = protoutil.BackupProgressEntryToProto(entry)
} else if entry.MessageType == "error" {
l.Sugar().Warnf("an unknown error was encountered in processing item: %v", entry.Item)
fileErrorCount++
backupError, err := protoutil.BackupProgressEntryToBackupError(entry)
if err != nil {
l.Sugar().Errorf("failed to convert backup progress entry to backup error: %v", err)
Expand Down Expand Up @@ -180,6 +185,8 @@ func (t *BackupTask) Run(ctx context.Context, st ScheduledTask, runner TaskRunne
summary = &restic.BackupProgressEntry{}
}

metric.GetRegistry().RecordBackupSummary(t.RepoID(), t.PlanID(), summary.TotalBytesProcessed, summary.DataAdded, int64(fileErrorCount))

vars := HookVars{
Task: t.Name(),
SnapshotStats: summary,
Expand Down
1 change: 1 addition & 0 deletions internal/orchestrator/tasks/taskcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ type CheckTask struct {
func NewCheckTask(repoID, planID string, force bool) Task {
return &CheckTask{
BaseTask: BaseTask{
TaskType: "check",
TaskName: fmt.Sprintf("check for repo %q", repoID),
TaskRepoID: repoID,
TaskPlanID: planID,
Expand Down
1 change: 1 addition & 0 deletions internal/orchestrator/tasks/taskcollectgarbage.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ type CollectGarbageTask struct {
func NewCollectGarbageTask() *CollectGarbageTask {
return &CollectGarbageTask{
BaseTask: BaseTask{
TaskType: "collect_garbage",
TaskName: "collect garbage",
},
}
Expand Down
1 change: 1 addition & 0 deletions internal/orchestrator/tasks/taskforget.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ func NewOneoffForgetTask(repoID, planID string, flowID int64, at time.Time) Task
return &GenericOneoffTask{
OneoffTask: OneoffTask{
BaseTask: BaseTask{
TaskType: "forget",
TaskName: fmt.Sprintf("forget for plan %q in repo %q", repoID, planID),
TaskRepoID: repoID,
TaskPlanID: planID,
Expand Down
1 change: 1 addition & 0 deletions internal/orchestrator/tasks/taskforgetsnapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ func NewOneoffForgetSnapshotTask(repoID, planID string, flowID int64, at time.Ti
return &GenericOneoffTask{
OneoffTask: OneoffTask{
BaseTask: BaseTask{
TaskType: "forget_snapshot",
TaskName: fmt.Sprintf("forget snapshot %q for plan %q in repo %q", snapshotID, planID, repoID),
TaskRepoID: repoID,
TaskPlanID: planID,
Expand Down
1 change: 1 addition & 0 deletions internal/orchestrator/tasks/taskindexsnapshots.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ func NewOneoffIndexSnapshotsTask(repoID string, at time.Time) Task {
return &GenericOneoffTask{
OneoffTask: OneoffTask{
BaseTask: BaseTask{
TaskType: "index_snapshots",
TaskName: fmt.Sprintf("index snapshots for repo %q", repoID),
TaskRepoID: repoID,
},
Expand Down
1 change: 1 addition & 0 deletions internal/orchestrator/tasks/taskprune.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ type PruneTask struct {
func NewPruneTask(repoID, planID string, force bool) Task {
return &PruneTask{
BaseTask: BaseTask{
TaskType: "prune",
TaskName: fmt.Sprintf("prune repo %q", repoID),
TaskRepoID: repoID,
TaskPlanID: planID,
Expand Down
1 change: 1 addition & 0 deletions internal/orchestrator/tasks/taskrestore.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ func NewOneoffRestoreTask(repoID, planID string, flowID int64, at time.Time, sna
return &GenericOneoffTask{
OneoffTask: OneoffTask{
BaseTask: BaseTask{
TaskType: "restore",
TaskName: fmt.Sprintf("restore snapshot %q in repo %q", snapshotID, repoID),
TaskRepoID: repoID,
TaskPlanID: planID,
Expand Down
1 change: 1 addition & 0 deletions internal/orchestrator/tasks/taskstats.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ type StatsTask struct {
func NewStatsTask(repoID, planID string, force bool) Task {
return &StatsTask{
BaseTask: BaseTask{
TaskType: "stats",
TaskName: fmt.Sprintf("stats for repo %q", repoID),
TaskRepoID: repoID,
TaskPlanID: planID,
Expand Down

0 comments on commit daacf28

Please sign in to comment.