Skip to content

Commit

Permalink
Add metrics for degraded mode correctness
Browse files Browse the repository at this point in the history
Add a metric that reports the number of endpoints that differ between the
current endpoint calculation and the degraded mode calculation. It is not
emitted when the current endpoint calculation returns an error, because the
returned map will be empty in that case, whereas the degraded mode
calculation handles errors and returns a non-empty map.
  • Loading branch information
sawsa307 committed Apr 27, 2023
1 parent 73bd213 commit 040ba0c
Show file tree
Hide file tree
Showing 2 changed files with 53 additions and 10 deletions.
45 changes: 36 additions & 9 deletions pkg/neg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,21 +24,25 @@ import (
)

// Constants shared by the NEG controller metrics: the metric subsystem, the
// metric name keys, and a few label values.
const (
// NOTE(review): the next two runs of declarations repeat the same names; this
// looks like the removed and added sides of a diff hunk rendered together.
// Confirm against the real file, which should contain each name only once.
negControllerSubsystem = "neg_controller"
syncerLatencyKey = "syncer_sync_duration_seconds"
managerProcessLatencyKey = "manager_process_duration_seconds"
initLatencyKey = "neg_initialization_duration_seconds"
negOpLatencyKey = "neg_operation_duration_seconds"
negOpEndpointsKey = "neg_operation_endpoints"
lastSyncTimestampKey = "sync_timestamp"
syncerStalenessKey = "syncer_staleness"
epsStalenessKey = "endpointslice_staleness"
// negControllerSubsystem is the Subsystem set on the metrics defined in this file.
negControllerSubsystem = "neg_controller"
syncerLatencyKey = "syncer_sync_duration_seconds"
managerProcessLatencyKey = "manager_process_duration_seconds"
initLatencyKey = "neg_initialization_duration_seconds"
negOpLatencyKey = "neg_operation_duration_seconds"
negOpEndpointsKey = "neg_operation_endpoints"
lastSyncTimestampKey = "sync_timestamp"
syncerStalenessKey = "syncer_staleness"
epsStalenessKey = "endpointslice_staleness"
// degradedModeCorrectnessKey is the Name of the DegradeModeCorrectness histogram.
degradedModeCorrectnessKey = "degraded_mode_correctness"

// resultError is what getResult returns for a non-nil error; resultSuccess is
// presumably its counterpart for a nil error (that branch is not visible here).
resultSuccess = "success"
resultError = "error"

// GCProcess and SyncProcess are exported process names; their call sites are
// not visible in this chunk — TODO confirm how they are used as label values.
GCProcess = "GC"
SyncProcess = "Sync"

// NotInDegradedEndpoints and OnlyInDegradedEndpoints are the endpoint-type
// values passed to PublishDegradedModeCorrectnessMetrics: the former for
// endpoints reported as missing from the degraded mode result, the latter for
// endpoints reported only in the degraded mode result.
NotInDegradedEndpoints = "not_in_degraded_endpoints"
OnlyInDegradedEndpoints = "only_in_degraded_endpoints"
)

type syncType string
Expand Down Expand Up @@ -68,6 +72,11 @@ var (
"result", // result of the sync
}

// degradedModeCorrectnessLabels holds the label names of the
// DegradeModeCorrectness histogram. Callers of WithLabelValues must supply
// values in this exact order: neg_type first, then endpoint_type.
degradedModeCorrectnessLabels = []string{
"neg_type", // type of the NEG the observation belongs to
"endpoint_type", // which side of the difference is counted (e.g. NotInDegradedEndpoints or OnlyInDegradedEndpoints)
}

NegOperationLatency = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: negControllerSubsystem,
Expand Down Expand Up @@ -149,6 +158,17 @@ var (
Buckets: prometheus.ExponentialBuckets(1, 2, 14),
},
)

// DegradeModeCorrectness is a histogram of the number of endpoints that differ
// between the current endpoint calculation and the degraded mode calculation,
// partitioned by the labels in degradedModeCorrectnessLabels.
DegradeModeCorrectness = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Subsystem: negControllerSubsystem,
Name: degradedModeCorrectnessKey,
Help: "Number of endpoints differed between current endpoint calculation and degraded mode calculation",
// custom buckets - [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, +Inf]
// i.e. 20 exponential upper bounds starting at 1 with factor 2 (the last is 2^19); +Inf is the implicit overflow bucket.
Buckets: prometheus.ExponentialBuckets(1, 2, 20),
},
degradedModeCorrectnessLabels,
)
)

var register sync.Once
Expand All @@ -167,6 +187,7 @@ func RegisterMetrics() {
prometheus.MustRegister(LabelPropagationError)
prometheus.MustRegister(LabelNumber)
prometheus.MustRegister(AnnotationSize)
prometheus.MustRegister(DegradeModeCorrectness)

RegisterSyncerMetrics()
})
Expand Down Expand Up @@ -206,6 +227,12 @@ func PublishNegEPSStalenessMetrics(epsStaleness time.Duration) {
EPSStaleness.Observe(epsStaleness.Seconds())
}

// PublishDegradedModeCorrectnessMetrics records one observation in the
// DegradeModeCorrectness histogram.
//
// count is the number of endpoints that differ between the degraded mode
// calculation and the current one; endpointType and negType select the
// histogram series the observation is recorded under.
func PublishDegradedModeCorrectnessMetrics(count int, endpointType string, negType string) {
	// Label values must follow the declared label order: neg_type, endpoint_type.
	series := DegradeModeCorrectness.WithLabelValues(negType, endpointType)
	observed := float64(count)
	series.Observe(observed)
}

func getResult(err error) string {
if err != nil {
return resultError
Expand Down
18 changes: 17 additions & 1 deletion pkg/neg/syncers/transaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -274,14 +274,16 @@ func (s *transactionSyncer) syncInternalImpl() error {
return degradedModeErr
}
notInDegraded, onlyInDegraded := calculateNetworkEndpointDifference(targetMap, degradedTargetMap)
if err == nil { // we collect metrics when the normal calculation doesn't run into error
computeDegradedModeCorrectness(notInDegraded, onlyInDegraded, string(s.NegSyncerKey.NegType))
}
if s.inErrorState() {
targetMap = degradedTargetMap
endpointPodMap = degradedPodMap
if len(notInDegraded) == 0 && len(onlyInDegraded) == 0 {
s.resetErrorState()
}
}
// TODO(cheungdavid): in the else branch, publish metrics if we don't encounter error and we are not in error state
}
s.logStats(targetMap, "desired NEG endpoints")

Expand Down Expand Up @@ -767,6 +769,20 @@ func (s *transactionSyncer) computeEPSStaleness(endpointSlices []*discovery.Endp
}
}

// computeDegradedModeCorrectness publishes the degraded mode correctness
// metrics for the given NEG type.
//
// notInDegraded and onlyInDegraded are the two sides of the difference between
// the normal calculation and the degraded mode calculation. For each side, the
// sizes of all endpoint sets in the map are summed and the total is published
// under the matching endpoint type.
func computeDegradedModeCorrectness(notInDegraded, onlyInDegraded map[string]negtypes.NetworkEndpointSet, negType string) {
	// countEndpoints adds up the sizes of every endpoint set held in the map.
	countEndpoints := func(endpointSets map[string]negtypes.NetworkEndpointSet) int {
		total := 0
		for key := range endpointSets {
			total += len(endpointSets[key])
		}
		return total
	}

	missingFromDegraded := countEndpoints(notInDegraded)
	metrics.PublishDegradedModeCorrectnessMetrics(missingFromDegraded, metrics.NotInDegradedEndpoints, negType)

	extraInDegraded := countEndpoints(onlyInDegraded)
	metrics.PublishDegradedModeCorrectnessMetrics(extraInDegraded, metrics.OnlyInDegradedEndpoints, negType)
}

// getNegFromStore returns the neg associated with the provided namespace and neg name if it exists otherwise throws an error
func getNegFromStore(svcNegLister cache.Indexer, namespace, negName string) (*negv1beta1.ServiceNetworkEndpointGroup, error) {
n, exists, err := svcNegLister.GetByKey(fmt.Sprintf("%s/%s", namespace, negName))
Expand Down

0 comments on commit 040ba0c

Please sign in to comment.