Skip to content

Commit

Permalink
Detect panics in e2e tests
Browse files Browse the repository at this point in the history
Signed-off-by: Stefan Büringer [email protected]
  • Loading branch information
sbueringer committed Aug 20, 2024
1 parent defa62d commit 7ee08ec
Show file tree
Hide file tree
Showing 3 changed files with 156 additions and 3 deletions.
53 changes: 51 additions & 2 deletions test/framework/deployment_helpers.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package framework

import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
Expand All @@ -31,12 +32,15 @@ import (

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"github.com/pkg/errors"
"github.com/prometheus/common/expfmt"
appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
policyv1 "k8s.io/api/policy/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
kerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/apimachinery/pkg/util/intstr"
utilversion "k8s.io/apimachinery/pkg/util/version"
"k8s.io/apimachinery/pkg/util/wait"
Expand Down Expand Up @@ -348,8 +352,8 @@ type WatchPodMetricsInput struct {

// WatchPodMetrics periodically captures metrics from all pods. It expects to find port 8080 open on the controller.
func WatchPodMetrics(ctx context.Context, input WatchPodMetricsInput) {
// Dump machine metrics every 5 seconds
ticker := time.NewTicker(time.Second * 5)
// Dump metrics periodically.
ticker := time.NewTicker(time.Second * 10)
Expect(ctx).NotTo(BeNil(), "ctx is required for dumpContainerMetrics")
Expect(input.ClientSet).NotTo(BeNil(), "input.ClientSet is required for dumpContainerMetrics")
Expect(input.Deployment).NotTo(BeNil(), "input.Deployment is required for dumpContainerMetrics")
Expand Down Expand Up @@ -397,8 +401,10 @@ func dumpPodMetrics(ctx context.Context, client *kubernetes.Clientset, metricsPa
Do(ctx)
data, err := res.Raw()

var errorRetrievingMetrics bool
if err != nil {
// Failing to dump metrics should not cause the test to fail
errorRetrievingMetrics = true
data = []byte(fmt.Sprintf("Error retrieving metrics for pod %s: %v\n%s", klog.KRef(pod.Namespace, pod.Name), err, string(data)))
metricsFile = path.Join(metricsDir, "metrics-error.txt")
}
Expand All @@ -407,7 +413,50 @@ func dumpPodMetrics(ctx context.Context, client *kubernetes.Clientset, metricsPa
// Failing to dump metrics should not cause the test to fail
log.Logf("Error writing metrics for pod %s: %v", klog.KRef(pod.Namespace, pod.Name), err)
}

if !errorRetrievingMetrics {
Expect(verifyMetrics(data)).To(Succeed())
}
}
}

// verifyMetrics parses data as Prometheus text-format metrics and returns an
// error if any controller-runtime panic counter has a value greater than zero.
//
// Two metric families are inspected, always in this order:
//   - controller_runtime_reconcile_panics_total: one error per sample with a
//     positive value, naming the controller taken from its "controller" label
//     ("unknown" if the sample carries no such label).
//   - controller_runtime_webhook_panics_total: one error per sample with a
//     positive value.
//
// It returns a wrapped error if data cannot be parsed, an aggregate of all
// detected panics if there are any, and nil otherwise (including when the
// panic metric families are not present at all).
func verifyMetrics(data []byte) error {
	var parser expfmt.TextParser
	mf, err := parser.TextToMetricFamilies(bytes.NewReader(data))
	if err != nil {
		return errors.Wrap(err, "failed to parse data to metrics families")
	}

	var errs []error

	// Look the families up by name rather than ranging over the map: map
	// iteration order is random, which would make the order of the aggregated
	// errors (and therefore the error message) non-deterministic.
	// The generated Get* accessors are nil-safe, so a missing family, counter,
	// or label field simply yields the zero value instead of panicking.
	for _, controllerPanicMetric := range mf["controller_runtime_reconcile_panics_total"].GetMetric() {
		if controllerPanicMetric.GetCounter().GetValue() > 0 {
			controllerName := "unknown"
			for _, label := range controllerPanicMetric.GetLabel() {
				if label.GetName() == "controller" {
					controllerName = label.GetValue()
				}
			}
			errs = append(errs, fmt.Errorf("panic occurred in %q controller", controllerName))
		}
	}

	for _, webhookPanicMetric := range mf["controller_runtime_webhook_panics_total"].GetMetric() {
		if webhookPanicMetric.GetCounter().GetValue() > 0 {
			errs = append(errs, fmt.Errorf("panic occurred in webhook"))
		}
	}

	if len(errs) > 0 {
		return kerrors.NewAggregate(errs)
	}

	return nil
}

// WaitForDNSUpgradeInput is the input for WaitForDNSUpgrade.
Expand Down
104 changes: 104 additions & 0 deletions test/framework/deployment_helpers_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package framework

import (
"testing"

. "github.com/onsi/gomega"
)

// Test_verifyMetrics feeds verifyMetrics Prometheus text-format payloads and
// checks the resulting error: payloads without a positive panic counter must
// verify cleanly, payloads with one must yield the exact expected message.
func Test_verifyMetrics(t *testing.T) {
	type testCase struct {
		name string
		data []byte
		// wantErr is the exact expected error message; empty means no error.
		wantErr string
	}

	testCases := []testCase{
		{
			name: "no panic metric exists",
			data: []byte(`
# HELP controller_runtime_max_concurrent_reconciles Maximum number of concurrent reconciles per controller
# TYPE controller_runtime_max_concurrent_reconciles gauge
controller_runtime_max_concurrent_reconciles{controller="cluster"} 10
controller_runtime_max_concurrent_reconciles{controller="clusterclass"} 10
`),
		},
		{
			name: "no panic occurred",
			data: []byte(`
# HELP controller_runtime_max_concurrent_reconciles Maximum number of concurrent reconciles per controller
# TYPE controller_runtime_max_concurrent_reconciles gauge
controller_runtime_max_concurrent_reconciles{controller="cluster"} 10
controller_runtime_max_concurrent_reconciles{controller="clusterclass"} 10
# HELP controller_runtime_reconcile_panics_total Total number of reconciliation panics per controller
# TYPE controller_runtime_reconcile_panics_total counter
controller_runtime_reconcile_panics_total{controller="cluster"} 0
controller_runtime_reconcile_panics_total{controller="clusterclass"} 0
# HELP controller_runtime_webhook_panics_total Total number of webhook panics
# TYPE controller_runtime_webhook_panics_total counter
controller_runtime_webhook_panics_total 0
`),
		},
		{
			name: "panic occurred in controller",
			data: []byte(`
# HELP controller_runtime_max_concurrent_reconciles Maximum number of concurrent reconciles per controller
# TYPE controller_runtime_max_concurrent_reconciles gauge
controller_runtime_max_concurrent_reconciles{controller="cluster"} 10
controller_runtime_max_concurrent_reconciles{controller="clusterclass"} 10
# HELP controller_runtime_reconcile_panics_total Total number of reconciliation panics per controller
# TYPE controller_runtime_reconcile_panics_total counter
controller_runtime_reconcile_panics_total{controller="cluster"} 1
controller_runtime_reconcile_panics_total{controller="clusterclass"} 0
# HELP controller_runtime_webhook_panics_total Total number of webhook panics
# TYPE controller_runtime_webhook_panics_total counter
controller_runtime_webhook_panics_total 0
`),
			wantErr: `panic occurred in "cluster" controller`,
		},
		{
			name: "panic occurred in webhook",
			data: []byte(`
# HELP controller_runtime_max_concurrent_reconciles Maximum number of concurrent reconciles per controller
# TYPE controller_runtime_max_concurrent_reconciles gauge
controller_runtime_max_concurrent_reconciles{controller="cluster"} 10
controller_runtime_max_concurrent_reconciles{controller="clusterclass"} 10
# HELP controller_runtime_reconcile_panics_total Total number of reconciliation panics per controller
# TYPE controller_runtime_reconcile_panics_total counter
controller_runtime_reconcile_panics_total{controller="cluster"} 0
controller_runtime_reconcile_panics_total{controller="clusterclass"} 0
# HELP controller_runtime_webhook_panics_total Total number of webhook panics
# TYPE controller_runtime_webhook_panics_total counter
controller_runtime_webhook_panics_total 1
`),
			wantErr: `panic occurred in webhook`,
		},
	}

	for _, tc := range testCases {
		t.Run(tc.name, func(t *testing.T) {
			g := NewWithT(t)

			gotErr := verifyMetrics(tc.data)

			// Cases without an expected message must verify cleanly.
			if tc.wantErr == "" {
				g.Expect(gotErr).ToNot(HaveOccurred())
				return
			}

			g.Expect(gotErr).To(HaveOccurred())
			g.Expect(gotErr.Error()).To(Equal(tc.wantErr))
		})
	}
}
2 changes: 1 addition & 1 deletion test/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ require (
github.com/onsi/gomega v1.34.1
github.com/pkg/errors v0.9.1
github.com/prometheus/client_golang v1.19.1
github.com/prometheus/common v0.55.0
github.com/spf13/pflag v1.0.5
github.com/vincent-petithory/dataurl v1.0.0
go.etcd.io/etcd/api/v3 v3.5.15
Expand Down Expand Up @@ -122,7 +123,6 @@ require (
github.com/pelletier/go-toml/v2 v2.2.2 // indirect
github.com/peterbourgon/diskv v2.0.1+incompatible // indirect
github.com/prometheus/client_model v0.6.1 // indirect
github.com/prometheus/common v0.55.0 // indirect
github.com/prometheus/procfs v0.15.1 // indirect
github.com/russross/blackfriday/v2 v2.1.0 // indirect
github.com/sagikazarmark/locafero v0.4.0 // indirect
Expand Down

0 comments on commit 7ee08ec

Please sign in to comment.