Skip to content

Commit

Permalink
Merge pull request #1699 from kl52752/metric-with-retry
Browse files Browse the repository at this point in the history
Modify reporting error metrics from L4 RBS services -  only report service in error if resync deadline elapsed
  • Loading branch information
k8s-ci-robot authored May 31, 2022
2 parents e24e6c8 + bbd24a6 commit 434b718
Show file tree
Hide file tree
Showing 7 changed files with 136 additions and 37 deletions.
2 changes: 1 addition & 1 deletion pkg/context/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ func NewControllerContext(
ClusterNamer: clusterNamer,
L4Namer: namer.NewL4Namer(string(kubeSystemUID), clusterNamer),
KubeSystemUID: kubeSystemUID,
ControllerMetrics: metrics.NewControllerMetrics(flags.F.MetricsExportInterval),
ControllerMetrics: metrics.NewControllerMetrics(flags.F.MetricsExportInterval, flags.F.L4NetLBProvisionDeadline),
ControllerContextConfig: config,
IngressInformer: informernetworking.NewIngressInformer(kubeClient, config.Namespace, config.ResyncPeriod, utils.NewNamespaceIndexer()),
ServiceInformer: informerv1.NewServiceInformer(kubeClient, config.Namespace, config.ResyncPeriod, utils.NewNamespaceIndexer()),
Expand Down
3 changes: 3 additions & 0 deletions pkg/flags/flags.go
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ var (
NegGCPeriod time.Duration
NodePortRanges PortRanges
ResyncPeriod time.Duration
L4NetLBProvisionDeadline time.Duration
NumL4Workers int
NumIngressWorkers int
RunIngressController bool
Expand Down Expand Up @@ -204,6 +205,8 @@ the pod secrets for creating a Kubernetes client.`)
`Path to kubeconfig file with authorization and master location information.`)
flag.DurationVar(&F.ResyncPeriod, "sync-period", 30*time.Second,
`Relist and confirm cloud resources this often.`)
flag.DurationVar(&F.L4NetLBProvisionDeadline, "l4-netlb-provision-deadline", 20*time.Minute,
`Deadline latency for L4 NetLB provisioning.`)
flag.IntVar(&F.NumL4Workers, "num-l4-workers", 5,
`Number of parallel L4 Service worker goroutines.`)
flag.IntVar(&F.NumIngressWorkers, "num-ingress-workers", 1,
Expand Down
2 changes: 1 addition & 1 deletion pkg/l4lb/l4netlbcontroller.go
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,7 @@ func (lc *L4NetLBController) syncInternal(service *v1.Service) *loadbalancers.L4
syncResult.Error = fmt.Errorf("failed to set resource annotations, err: %w", err)
return syncResult
}
syncResult.MetricsState.InSuccess = true
syncResult.SetMetricsForSuccessfulService()
return syncResult
}

Expand Down
9 changes: 8 additions & 1 deletion pkg/loadbalancers/l4netlb.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,12 @@ type L4NetLBSyncResult struct {
StartTime time.Time
}

// SetMetricsForSuccessfulService should be call after successful sync.
func (r *L4NetLBSyncResult) SetMetricsForSuccessfulService() {
r.MetricsState.FirstSyncErrorTime = nil
r.MetricsState.InSuccess = true
}

// NewL4NetLB creates a new Handler for the given L4NetLB service.
func NewL4NetLB(service *corev1.Service, cloud *gce.Cloud, scope meta.KeyType, namer namer.L4ResourcesNamer, recorder record.EventRecorder, lock *sync.Mutex) *L4NetLB {
l4netlb := &L4NetLB{cloud: cloud,
Expand Down Expand Up @@ -100,7 +106,8 @@ func (l4netlb *L4NetLB) EnsureFrontend(nodeNames []string, svc *corev1.Service)
result := &L4NetLBSyncResult{
Annotations: make(map[string]string),
StartTime: time.Now(),
SyncType: SyncTypeCreate}
SyncType: SyncTypeCreate,
}

// If service already has an IP assigned, treat it as an update instead of a new Loadbalancer.
if len(svc.Status.LoadBalancer.Ingress) > 0 {
Expand Down
119 changes: 95 additions & 24 deletions pkg/metrics/l4metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package metrics
import (
"fmt"
"testing"
"time"

"github.com/google/go-cmp/cmp"
)
Expand Down Expand Up @@ -181,6 +182,11 @@ func newL4ILBServiceState(globalAccess, customSubnet, inSuccess bool) L4ILBServi

func TestComputeL4NetLBMetrics(t *testing.T) {
t.Parallel()

currTime := time.Now()
before5min := currTime.Add(-5 * time.Minute)
before25min := currTime.Add(-25 * time.Minute)

for _, tc := range []struct {
desc string
serviceStates []L4NetLBServiceState
Expand All @@ -195,79 +201,85 @@ func TestComputeL4NetLBMetrics(t *testing.T) {
premiumNetworkTier: 0,
success: 0,
inUserError: 0,
inError: 0,
},
},
{
desc: "one l4 Netlb service",
serviceStates: []L4NetLBServiceState{
newL4NetLBServiceState(isSuccess, unmanaged, standard, noUserError),
newL4NetLBServiceState(isSuccess, unmanaged, noUserError, standard /*resyncTime = */, nil),
},
expectL4NetLBCount: netLBFeatureCount{
service: 1,
managedStaticIP: 0,
premiumNetworkTier: 0,
success: 1,
inUserError: 0,
inError: 0,
},
},
{
desc: "one l4 Netlb service in premium network tier",
serviceStates: []L4NetLBServiceState{
newL4NetLBServiceState(isSuccess, unmanaged, premium, noUserError),
newL4NetLBServiceState(isSuccess, unmanaged, premium, noUserError /*resyncTime = */, nil),
},
expectL4NetLBCount: netLBFeatureCount{
service: 1,
managedStaticIP: 0,
premiumNetworkTier: 1,
success: 1,
inUserError: 0,
inError: 0,
},
},
{
desc: "one l4 Netlb service in premium network tier and managed static ip",
serviceStates: []L4NetLBServiceState{
newL4NetLBServiceState(isSuccess, managed, premium, noUserError),
newL4NetLBServiceState(isSuccess, managed, premium, noUserError /*resyncTime = */, nil),
},
expectL4NetLBCount: netLBFeatureCount{
service: 1,
managedStaticIP: 1,
premiumNetworkTier: 1,
success: 1,
inUserError: 0,
inError: 0,
},
},
{
desc: "l4 Netlb service in error state",
desc: "l4 Netlb service in error state with timestamp greater than resync period",
serviceStates: []L4NetLBServiceState{
newL4NetLBServiceState(isError, unmanaged, standard, noUserError),
newL4NetLBServiceState(isError, unmanaged, standard, noUserError, &before25min),
},
expectL4NetLBCount: netLBFeatureCount{
service: 1,
managedStaticIP: 0,
premiumNetworkTier: 0,
success: 0,
inUserError: 0,
inError: 1,
},
},
{
desc: "l4 Netlb service in error state should not count static ip and network tier",
serviceStates: []L4NetLBServiceState{
newL4NetLBServiceState(isError, unmanaged, standard, noUserError),
newL4NetLBServiceState(isError, managed, premium, noUserError),
newL4NetLBServiceState(isError, unmanaged, standard, noUserError, &before25min),
newL4NetLBServiceState(isError, managed, premium, noUserError, &before25min),
},
expectL4NetLBCount: netLBFeatureCount{
service: 2,
managedStaticIP: 0,
premiumNetworkTier: 0,
success: 0,
inUserError: 0,
inError: 2,
},
},
{
desc: "two l4 Netlb services with different network tier",
serviceStates: []L4NetLBServiceState{
newL4NetLBServiceState(isSuccess, unmanaged, standard, noUserError),
newL4NetLBServiceState(isSuccess, unmanaged, premium, noUserError),
newL4NetLBServiceState(isSuccess, unmanaged, standard, noUserError /*resyncTime = */, nil),
newL4NetLBServiceState(isSuccess, unmanaged, premium, noUserError, nil),
},
expectL4NetLBCount: netLBFeatureCount{
service: 2,
Expand All @@ -280,35 +292,51 @@ func TestComputeL4NetLBMetrics(t *testing.T) {
{
desc: "two l4 Netlb services with user error should not count others features",
serviceStates: []L4NetLBServiceState{
newL4NetLBServiceState(isError, unmanaged, standard, isUserError),
newL4NetLBServiceState(isError, unmanaged, premium, isUserError),
newL4NetLBServiceState(isError, unmanaged, standard, isUserError /*resyncTime = */, nil),
newL4NetLBServiceState(isError, unmanaged, premium, isUserError, nil),
},
expectL4NetLBCount: netLBFeatureCount{
service: 2,
managedStaticIP: 0,
premiumNetworkTier: 0,
success: 0,
inUserError: 2,
inError: 0,
},
},
{
desc: "Error should be measure only after retry time period",
serviceStates: []L4NetLBServiceState{
newL4NetLBServiceState(isError, unmanaged, premium, noUserError, &before5min),
},
expectL4NetLBCount: netLBFeatureCount{
service: 1,
managedStaticIP: 0,
premiumNetworkTier: 0,
success: 0,
inUserError: 0,
inError: 0,
},
},
{
desc: "multi l4 Netlb services",
serviceStates: []L4NetLBServiceState{
newL4NetLBServiceState(isSuccess, unmanaged, standard, noUserError),
newL4NetLBServiceState(isSuccess, unmanaged, premium, noUserError),
newL4NetLBServiceState(isSuccess, unmanaged, premium, noUserError),
newL4NetLBServiceState(isSuccess, managed, premium, noUserError),
newL4NetLBServiceState(isSuccess, managed, premium, noUserError),
newL4NetLBServiceState(isSuccess, managed, standard, noUserError),
newL4NetLBServiceState(isError, managed, premium, noUserError),
newL4NetLBServiceState(isError, managed, standard, noUserError),
newL4NetLBServiceState(isSuccess, unmanaged, standard, noUserError /*resyncTime = */, nil),
newL4NetLBServiceState(isSuccess, unmanaged, premium, noUserError, nil),
newL4NetLBServiceState(isSuccess, unmanaged, premium, noUserError, nil),
newL4NetLBServiceState(isSuccess, managed, premium, noUserError, nil),
newL4NetLBServiceState(isSuccess, managed, premium, noUserError, nil),
newL4NetLBServiceState(isSuccess, managed, standard, noUserError, nil),
newL4NetLBServiceState(isError, managed, premium, noUserError, &before25min),
newL4NetLBServiceState(isError, managed, standard, noUserError, &before25min),
},
expectL4NetLBCount: netLBFeatureCount{
service: 8,
managedStaticIP: 3,
premiumNetworkTier: 4,
success: 6,
inUserError: 0,
inError: 2,
},
},
} {
Expand All @@ -327,11 +355,54 @@ func TestComputeL4NetLBMetrics(t *testing.T) {
}
}

func newL4NetLBServiceState(inSuccess, managed, premium, userError bool) L4NetLBServiceState {
func newL4NetLBServiceState(inSuccess, managed, premium, userError bool, errorTimestamp *time.Time) L4NetLBServiceState {
return L4NetLBServiceState{
IsPremiumTier: premium,
IsManagedIP: managed,
InSuccess: inSuccess,
IsUserError: userError,
IsPremiumTier: premium,
IsManagedIP: managed,
InSuccess: inSuccess,
IsUserError: userError,
FirstSyncErrorTime: errorTimestamp,
}
}

func TestRetryPeriodForL4NetLBServices(t *testing.T) {
t.Parallel()
currTime := time.Now()
before5min := currTime.Add(-5 * time.Minute)
before25min := currTime.Add(-25 * time.Minute)

svcName1 := "svc1"
svcName2 := "svc2"
newMetrics := FakeControllerMetrics()
errorState := newL4NetLBServiceState(isError, managed, premium, noUserError, &currTime)
newMetrics.SetL4NetLBService(svcName1, errorState)

if err := checkMetricsComputation(newMetrics /*expErrorCount =*/, 0 /*expSvcCount =*/, 1); err != nil {
t.Fatalf("Check metrics computation failed err: %v", err)
}
errorState.FirstSyncErrorTime = &before5min
newMetrics.SetL4NetLBService(svcName1, errorState)
state, ok := newMetrics.l4NetLBServiceMap[svcName1]
if !ok {
t.Fatalf("state should be set")
}
if *state.FirstSyncErrorTime == before5min {
t.Fatal("Time Should Not be rewrite")
}
errorState.FirstSyncErrorTime = &before25min
newMetrics.SetL4NetLBService(svcName2, errorState)
if err := checkMetricsComputation(newMetrics /*expErrorCount =*/, 1 /*expSvcCount =*/, 2); err != nil {
t.Fatalf("Check metrics computation failed err: %v", err)
}
}

func checkMetricsComputation(newMetrics *ControllerMetrics, expErrorCount, expSvcCount int) error {
got := newMetrics.computeL4NetLBMetrics()
if got.inError != expErrorCount {
return fmt.Errorf("Error count mismatch expected: %v got: %v", expErrorCount, got.inError)
}
if got.service != expSvcCount {
return fmt.Errorf("Service count mismatch expected: %v got: %v", expSvcCount, got.service)
}
return nil
}
34 changes: 24 additions & 10 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ type netLBFeatureCount struct {
premiumNetworkTier int
success int
inUserError int
inError int
}

func (netlbCount *netLBFeatureCount) record() {
Expand All @@ -114,7 +115,7 @@ func (netlbCount *netLBFeatureCount) record() {
l4NetLBCount.With(prometheus.Labels{label: l4NetLBManagedStaticIP.String()}).Set(float64(netlbCount.managedStaticIP))
l4NetLBCount.With(prometheus.Labels{label: l4NetLBInSuccess.String()}).Set(float64(netlbCount.success))
l4NetLBCount.With(prometheus.Labels{label: l4NetLBInUserError.String()}).Set(float64(netlbCount.inUserError))
l4NetLBCount.With(prometheus.Labels{label: l4NetLBInError.String()}).Set(float64(netlbCount.service - netlbCount.success))
l4NetLBCount.With(prometheus.Labels{label: l4NetLBInError.String()}).Set(float64(netlbCount.inError))
}

// init registers ingress usage metrics.
Expand Down Expand Up @@ -161,24 +162,27 @@ type ControllerMetrics struct {
sync.Mutex
// duration between metrics exports
metricsInterval time.Duration
// Time during which the L4 NetLB service should be provision.
l4NetLBProvisionDeadline time.Duration
}

// NewControllerMetrics initializes ControllerMetrics and starts a go routine to compute and export metrics periodically.
func NewControllerMetrics(exportInterval time.Duration) *ControllerMetrics {
func NewControllerMetrics(exportInterval, l4NetLBProvisionDeadline time.Duration) *ControllerMetrics {
return &ControllerMetrics{
ingressMap: make(map[string]IngressState),
negMap: make(map[string]NegServiceState),
l4ILBServiceMap: make(map[string]L4ILBServiceState),
l4NetLBServiceMap: make(map[string]L4NetLBServiceState),
pscMap: make(map[string]pscmetrics.PSCState),
serviceMap: make(map[string]struct{}),
metricsInterval: exportInterval,
ingressMap: make(map[string]IngressState),
negMap: make(map[string]NegServiceState),
l4ILBServiceMap: make(map[string]L4ILBServiceState),
l4NetLBServiceMap: make(map[string]L4NetLBServiceState),
pscMap: make(map[string]pscmetrics.PSCState),
serviceMap: make(map[string]struct{}),
metricsInterval: exportInterval,
l4NetLBProvisionDeadline: l4NetLBProvisionDeadline,
}
}

// FakeControllerMetrics creates new ControllerMetrics with fixed 10 minutes metricsInterval, to be used in tests
func FakeControllerMetrics() *ControllerMetrics {
return NewControllerMetrics(10 * time.Minute)
return NewControllerMetrics(10*time.Minute, 20*time.Minute)
}

// servicePortKey defines a service port uniquely.
Expand Down Expand Up @@ -275,6 +279,13 @@ func (im *ControllerMetrics) SetL4NetLBService(svcKey string, state L4NetLBServi
if im.l4NetLBServiceMap == nil {
klog.Fatalf("L4 Net LB Metrics failed to initialize correctly.")
}

if !state.InSuccess {
if previousState, ok := im.l4NetLBServiceMap[svcKey]; ok && previousState.FirstSyncErrorTime != nil {
// If service is in Error state and retry timestamp was set then do not update it.
state.FirstSyncErrorTime = previousState.FirstSyncErrorTime
}
}
im.l4NetLBServiceMap[svcKey] = state
}

Expand Down Expand Up @@ -523,6 +534,9 @@ func (im *ControllerMetrics) computeL4NetLBMetrics() netLBFeatureCount {
continue
}
if !state.InSuccess {
if time.Since(*state.FirstSyncErrorTime) > im.l4NetLBProvisionDeadline {
counts.inError++
}
// Skip counting other features if the service is in error state.
continue
}
Expand Down
4 changes: 4 additions & 0 deletions pkg/metrics/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ limitations under the License.
package metrics

import (
"time"

v1 "k8s.io/api/networking/v1"
frontendconfigv1beta1 "k8s.io/ingress-gce/pkg/apis/frontendconfig/v1beta1"
"k8s.io/ingress-gce/pkg/utils"
Expand Down Expand Up @@ -81,6 +83,8 @@ type L4NetLBServiceState struct {
InSuccess bool
// IsUserError specifies if the error was caused by User misconfiguration.
IsUserError bool
// FirstSyncErrorTime specifies the time timestamp when the service sync ended up with error for the first time.
FirstSyncErrorTime *time.Time
}

// IngressMetricsCollector is an interface to update/delete ingress states in the cache
Expand Down

0 comments on commit 434b718

Please sign in to comment.