From 6c2b0b83918d6f39d7e42db6a31a77d1e7de9c40 Mon Sep 17 00:00:00 2001 From: Zach Zhu Date: Wed, 25 May 2022 16:46:42 +0800 Subject: [PATCH] introduce standard controller runtime metrics --- pkg/controller/util/worker.go | 31 +++++++++++++++++- pkg/metrics/metrics.go | 59 +++++++++++++++++++++++++++++++---- 2 files changed, 83 insertions(+), 7 deletions(-) diff --git a/pkg/controller/util/worker.go b/pkg/controller/util/worker.go index 07d94b3d73..a4c04ce57d 100644 --- a/pkg/controller/util/worker.go +++ b/pkg/controller/util/worker.go @@ -23,6 +23,8 @@ import ( "k8s.io/client-go/util/flowcontrol" "k8s.io/client-go/util/workqueue" runtimeclient "sigs.k8s.io/controller-runtime/pkg/client" + + "sigs.k8s.io/kubefed/pkg/metrics" ) type ReconcileFunc func(qualifiedName QualifiedName) ReconciliationStatus @@ -128,6 +130,8 @@ func (w *asyncWorker) EnqueueWithDelay(qualifiedName QualifiedName, delay time.D } func (w *asyncWorker) Run(stopChan <-chan struct{}) { + w.initMetrics() + StartBackoffGC(w.backoff, stopChan) w.deliverer.StartWithHandler(func(item *DelayingDelivererItem) { qualifiedName, ok := item.Value.(*QualifiedName) @@ -183,16 +187,41 @@ func (w *asyncWorker) reconcileOnce() bool { return true } + metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Add(1) + defer metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Add(-1) + defer metrics.UpdateControllerRuntimeReconcileTimeFromStart(w.name, time.Now()) + status := w.reconcile(qualifiedName) switch status { case StatusAllOK: - break + metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelSuccess).Inc() case StatusError: w.EnqueueForError(qualifiedName) + metrics.ControllerRuntimeReconcileErrors.WithLabelValues(w.name).Inc() + metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelError).Inc() case StatusNeedsRecheck: w.EnqueueForRetry(qualifiedName) + metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNeedsRecheck).Inc() case StatusNotSynced: w.EnqueueForClusterSync(qualifiedName) + metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNotSynced).Inc() } return true } + +const ( + labelSuccess = "success" + labelError = "error" + labelNeedsRecheck = "needs_recheck" + labelNotSynced = "not_synced" +) + +func (w *asyncWorker) initMetrics() { + metrics.ControllerRuntimeActiveWorkers.WithLabelValues(w.name).Set(0) + metrics.ControllerRuntimeReconcileErrors.WithLabelValues(w.name).Add(0) + metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelSuccess).Add(0) + metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelError).Add(0) + metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNeedsRecheck).Add(0) + metrics.ControllerRuntimeReconcileTotal.WithLabelValues(w.name, labelNotSynced).Add(0) + metrics.ControllerRuntimeWorkerCount.WithLabelValues(w.name).Set(float64(w.maxConcurrentReconciles)) +} diff --git a/pkg/metrics/metrics.go b/pkg/metrics/metrics.go index b17d9af63d..503abd0b37 100644 --- a/pkg/metrics/metrics.go +++ b/pkg/metrics/metrics.go @@ -20,6 +20,7 @@ import ( "time" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/collectors" "k8s.io/klog/v2" "sigs.k8s.io/controller-runtime/pkg/metrics" ) @@ -58,7 +59,7 @@ var ( reconcileFederatedResourcesDuration = prometheus.NewHistogram( prometheus.HistogramOpts{ Name: "reconcile_federated_resources_duration_seconds", - Help: "Time taken to reconcile federated resources in the target clusters.", + Help: "[Deprecated] Time taken to reconcile federated resources in the target clusters. Replaced by controller_runtime_reconcile_time_seconds.", Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0}, }, ) @@ -90,7 +91,7 @@ var ( controllerRuntimeReconcileDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Name: "controller_runtime_reconcile_duration_seconds", - Help: "Time taken by various parts of Kubefed controllers reconciliation loops.", + Help: "[Deprecated] Time taken by various parts of Kubefed controllers reconciliation loops. Replaced by controller_runtime_reconcile_time_seconds.", Buckets: []float64{0.01, 0.05, 0.1, 0.5, 1.0, 2.5, 5.0, 7.5, 10.0, 12.5, 15.0, 17.5, 20.0, 22.5, 25.0, 27.5, 30.0, 50.0, 75.0, 100.0, 1000.0}, }, []string{"controller"}, ) @@ -98,10 +99,37 @@ var ( controllerRuntimeReconcileDurationSummary = prometheus.NewSummaryVec( prometheus.SummaryOpts{ Name: "controller_runtime_reconcile_quantile_seconds", - Help: "Quantiles of time taken by various parts of Kubefed controllers reconciliation loops.", + Help: "[Deprecated] Quantiles of time taken by various parts of Kubefed controllers reconciliation loops. Replaced by controller_runtime_reconcile_time_seconds.", MaxAge: time.Hour, }, []string{"controller"}, ) + + ControllerRuntimeReconcileTotal = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_total", + Help: "Total number of reconciliations per controller", + }, []string{"controller", "result"}) + + ControllerRuntimeReconcileErrors = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_errors_total", + Help: "Total number of reconciliation errors per controller", + }, []string{"controller"}) + + ControllerRuntimeReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ + Name: "controller_runtime_reconcile_time_seconds", + Help: "Length of time per reconciliation per controller", + Buckets: []float64{0.005, 0.01, 0.025, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, + 1.25, 1.5, 1.75, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 40, 50, 60}, + }, []string{"controller"}) + + ControllerRuntimeWorkerCount = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "controller_runtime_max_concurrent_reconciles", + Help: "Maximum number of concurrent reconciles per controller", + }, []string{"controller"}) + + ControllerRuntimeActiveWorkers = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Name: "controller_runtime_active_workers", + Help: "Number of currently used workers per controller", + }, []string{"controller"}) ) const ( @@ -117,6 +145,10 @@ const ( // RegisterAll registers all metrics. func RegisterAll() { metrics.Registry.MustRegister( + // expose process metrics like CPU, Memory, file descriptor usage etc. + collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}), + // expose Go runtime metrics like GC stats, memory stats etc. + collectors.NewGoCollector(), kubefedClusterTotal, joinedClusterTotal, reconcileFederatedResourcesDuration, @@ -127,6 +159,11 @@ func RegisterAll() { dispatchOperationDuration, controllerRuntimeReconcileDuration, controllerRuntimeReconcileDurationSummary, + ControllerRuntimeReconcileTotal, + ControllerRuntimeReconcileErrors, + ControllerRuntimeReconcileTime, + ControllerRuntimeWorkerCount, + ControllerRuntimeActiveWorkers, ) } @@ -203,10 +240,20 @@ func UpdateControllerReconcileDurationFromStart(controller string, start time.Ti // UpdateControllerReconcileDuration records the duration of the reconcile function of a controller func UpdateControllerReconcileDuration(controller string, duration time.Duration) { + controllerRuntimeReconcileDurationSummary.WithLabelValues(controller).Observe(duration.Seconds()) + controllerRuntimeReconcileDuration.WithLabelValues(controller).Observe(duration.Seconds()) +} + +// UpdateControllerRuntimeReconcileTimeFromStart records the duration of the reconcile loop of a controller +func UpdateControllerRuntimeReconcileTimeFromStart(controller string, start time.Time) { + duration := time.Since(start) + UpdateControllerRuntimeReconcileTime(controller, duration) +} + +// UpdateControllerRuntimeReconcileTime records the duration of the reconcile function of a controller +func UpdateControllerRuntimeReconcileTime(controller string, duration time.Duration) { if duration > LogReconcileLongDurationThreshold { klog.V(4).Infof("Reconcile loop %s took %v to complete", controller, duration) } - - controllerRuntimeReconcileDurationSummary.WithLabelValues(controller).Observe(duration.Seconds()) - controllerRuntimeReconcileDuration.WithLabelValues(controller).Observe(duration.Seconds()) + ControllerRuntimeReconcileTime.WithLabelValues(controller).Observe(duration.Seconds()) }