Skip to content
This repository has been archived by the owner on Nov 24, 2023. It is now read-only.

master: add metric for worker event error #833

Merged
merged 6 commits into from
Jul 28, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions dm/dm-ansible/scripts/dm.json
Original file line number Diff line number Diff line change
Expand Up @@ -5710,6 +5710,94 @@
"align": false,
"alignLevel": null
}
},
{
"aliasColors": {},
"bars": false,
"dashLength": 10,
"dashes": false,
"datasource": "${DS_TEST-CLUSTER}",
"description": "number of worker event error",
"fill": 1,
"gridPos": {
"h": 7,
"w": 6,
"x": 0,
"y": 55
},
"id": 85,
"legend": {
"avg": false,
"current": false,
"max": false,
"min": false,
"show": false,
"total": false,
"values": false
},
"lines": true,
"linewidth": 1,
"links": [],
"nullPointMode": "null",
"percentage": false,
"pointradius": 5,
"points": false,
"renderer": "flot",
"seriesOverrides": [],
"spaceLength": 10,
"stack": false,
"steppedLine": false,
"targets": [
{
"expr": "sum by (type) (dm_master_worker_event_error)",
"format": "time_series",
"instant": false,
"intervalFactor": 2,
"refId": "A"
}
],
"thresholds": [],
"timeFrom": null,
"timeRegions": [],
"timeShift": null,
"title": "number of worker event error",
"tooltip": {
"shared": true,
"sort": 0,
"value_type": "individual"
},
"transparent": true,
"type": "graph",
"xaxis": {
"buckets": null,
"mode": "time",
"name": null,
"show": true,
"values": []
},
"yaxes": [
{
"decimals": 0,
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": true
},
{
"format": "short",
"label": null,
"logBase": 1,
"max": null,
"min": null,
"show": false
}
],
"yaxis": {
"align": false,
"alignLevel": null
}
}
],
"repeat": null,
Expand Down
21 changes: 21 additions & 0 deletions dm/master/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,12 @@ const (
OpErrPutNonOwnerOp = "OperationPut - PutNonOwnerOpError"
)

// Worker event error types, used as the value of the "type" label when
// reporting a worker event error through ReportWorkerEventErr.
const (
// WorkerEventHandle marks an error that occurred while handling a worker
// status change event.
WorkerEventHandle = "handle"
// WorkerEventWatch marks an error that was received while watching worker
// status change events.
WorkerEventWatch = "watch"
)

var (
workerState = metricsproxy.NewGaugeVec(
prometheus.GaugeOpts{
Expand Down Expand Up @@ -74,6 +80,14 @@ var (
Name: "shard_ddl_error",
Help: "number of shard DDL lock/operation error",
}, []string{"task", "type"})

workerEventErrCounter = metricsproxy.NewCounterVec(
prometheus.CounterOpts{
Namespace: "dm",
Subsystem: "master",
Name: "worker_event_error",
Help: "number of error related to worker event, during handling or watching",
}, []string{"type"})
)

func collectMetrics() {
Expand Down Expand Up @@ -107,6 +121,7 @@ func RegistryMetrics() {
registry.MustRegister(cpuUsageGauge)
registry.MustRegister(ddlPendingCounter)
registry.MustRegister(ddlErrCounter)
registry.MustRegister(workerEventErrCounter)

prometheus.DefaultGatherer = registry
}
Expand Down Expand Up @@ -141,9 +156,15 @@ func ReportDDLError(task, errType string) {
ddlErrCounter.WithLabelValues(task, errType).Inc()
}

// ReportWorkerEventErr increments the worker event error counter by one for
// the given error type. errType is used as the value of the counter's "type"
// label (for example WorkerEventHandle or WorkerEventWatch).
func ReportWorkerEventErr(errType string) {
	counter := workerEventErrCounter.WithLabelValues(errType)
	counter.Inc()
}

// OnRetireLeader resets the worker state, shard DDL error, shard DDL pending
// and worker event error metrics. Judging by its name it is meant to be called
// when this master retires from leadership (confirm against callers).
func OnRetireLeader() {
	// Reset in a fixed order: worker state first, then the DDL metrics,
	// and finally the worker event error counter.
	resetters := []func(){
		workerState.Reset,
		ddlErrCounter.Reset,
		ddlPendingCounter.Reset,
		workerEventErrCounter.Reset,
	}
	for _, reset := range resetters {
		reset()
	}
}
5 changes: 3 additions & 2 deletions dm/master/scheduler/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -1128,15 +1128,16 @@ func (s *Scheduler) handleWorkerEv(ctx context.Context, evCh <-chan ha.WorkerEve
err = s.handleWorkerOnline(ev, true)
}
if err != nil {
// TODO(csuzhangxc): report the error through metrics or other methods.
s.logger.Error("fail to handle worker status change event", zap.Bool("delete", ev.IsDeleted), zap.Stringer("event", ev), zap.Error(err))
metrics.ReportWorkerEventErr(metrics.WorkerEventHandle)
}
case err, ok := <-errCh:
if !ok {
return nil
}
// TODO(csuzhangxc): we only log the `err` here, but we should update metrics and do more works for it later.
// errors here are caused by etcd errors or by failures to decode a worker event
s.logger.Error("receive error when watching worker status change event", zap.Error(err))
metrics.ReportWorkerEventErr(metrics.WorkerEventWatch)
if etcdutil.IsRetryableError(err) {
return err
}
Expand Down