From 1e66a54974856cd31fea0e203a7b97b955ebe792 Mon Sep 17 00:00:00 2001 From: Thibault Jamet Date: Thu, 24 Feb 2022 16:08:32 +0100 Subject: [PATCH] Add a certificate info metric (#8253) When the ingress controller loads certificates (new ones or following a secret update), it performs a series of check to ensure its validity. In our systems, we detected a case where, when the secret object is compromised, for example when the certificate does not match the secret key, different pods of the ingress controller are serving a different version of the certificate. This behaviour is due to the cache mechanism of the ingress controller, keeping the last known certificate in case of corruption. When this happens, old ingress-controller pods will keep serving the old one, while new pods, by failing to load the corrupted certificates, would use the default certificate, causing invalid certificates for its clients. This generates a random error on the client side, depending on the actual pod instance it reaches. In order to allow detecting occurences of those situations, add a metric to expose, for all ingress controlller pods, detailed informations of the currently loaded certificate. This will, for example, allow setting an alert when there is a certificate discrepency across all ingress controller pods using a query similar to `sum(nginx_ingress_controller_ssl_certificate_info{host="name.tld"})by(serial_number)` This also allows to catch other exceptions loading certificates (failing to load the certificate from the k8s API, ... Co-authored-by: Daniel Ricart Co-authored-by: Daniel Ricart --- internal/ingress/controller/controller.go | 35 +++- internal/ingress/controller/nginx.go | 1 + .../ingress/metric/collectors/controller.go | 89 +++++++++- .../metric/collectors/controller_test.go | 166 +++++++++++++++++- internal/ingress/metric/dummy.go | 5 +- internal/ingress/metric/main.go | 14 +- internal/ingress/sslcert.go | 11 ++ 7 files changed, 309 insertions(+), 12 deletions(-) diff --git a/internal/ingress/controller/controller.go b/internal/ingress/controller/controller.go index 607bb5cb96..651e983a84 100644 --- a/internal/ingress/controller/controller.go +++ b/internal/ingress/controller/controller.go @@ -146,6 +146,7 @@ func (n *NGINXController) syncIngress(interface{}) error { hosts, servers, pcfg := n.getConfiguration(ings) n.metricCollector.SetSSLExpireTime(servers) + n.metricCollector.SetSSLInfo(servers) if n.runningConfig.Equal(pcfg) { klog.V(3).Infof("No configuration change detected, skipping backend reload") @@ -211,7 +212,8 @@ func (n *NGINXController) syncIngress(interface{}) error { ri := getRemovedIngresses(n.runningConfig, pcfg) re := getRemovedHosts(n.runningConfig, pcfg) - n.metricCollector.RemoveMetrics(ri, re) + rc := getRemovedCertificateSerialNumbers(n.runningConfig, pcfg) + n.metricCollector.RemoveMetrics(ri, re, rc) n.runningConfig = pcfg @@ -1625,6 +1627,37 @@ func getRemovedHosts(rucfg, newcfg *ingress.Configuration) []string { return old.Difference(new).List() } +func getRemovedCertificateSerialNumbers(rucfg, newcfg *ingress.Configuration) []string { + oldCertificates := sets.NewString() + newCertificates := sets.NewString() + + for _, server := range rucfg.Servers { + if server.SSLCert == nil { + continue + } + identifier := server.SSLCert.Identifier() + if identifier != "" { + if !oldCertificates.Has(identifier) { + oldCertificates.Insert(identifier) + } + } + } + + for _, server := range newcfg.Servers { + if server.SSLCert == nil { + continue + } + identifier := server.SSLCert.Identifier() + if identifier != "" { + if !newCertificates.Has(identifier) { + newCertificates.Insert(identifier) + } + } + } + + return oldCertificates.Difference(newCertificates).List() +} + func getRemovedIngresses(rucfg, newcfg *ingress.Configuration) []string { oldIngresses := sets.NewString() newIngresses := sets.NewString() diff --git a/internal/ingress/controller/nginx.go b/internal/ingress/controller/nginx.go index ed5590c3e7..e4037dfb05 100644 --- a/internal/ingress/controller/nginx.go +++ b/internal/ingress/controller/nginx.go @@ -275,6 +275,7 @@ func (n *NGINXController) Start() { // manually update SSL expiration metrics // (to not wait for a reload) n.metricCollector.SetSSLExpireTime(n.runningConfig.Servers) + n.metricCollector.SetSSLInfo(n.runningConfig.Servers) }, OnStoppedLeading: func() { n.metricCollector.OnStoppedLeading(electionID) diff --git a/internal/ingress/metric/collectors/controller.go b/internal/ingress/metric/collectors/controller.go index 135e74b9b8..e4e2655c80 100644 --- a/internal/ingress/metric/collectors/controller.go +++ b/internal/ingress/metric/collectors/controller.go @@ -30,7 +30,8 @@ import ( var ( operation = []string{"controller_namespace", "controller_class", "controller_pod"} ingressOperation = []string{"controller_namespace", "controller_class", "controller_pod", "namespace", "ingress"} - sslLabelHost = []string{"namespace", "class", "host"} + sslLabelHost = []string{"namespace", "class", "host", "secret_name"} + sslInfoLabels = []string{"namespace", "class", "host", "secret_name", "identifier", "issuer_organization", "issuer_common_name", "serial_number", "public_key_algorithm"} ) // Controller defines base metrics about the ingress controller @@ -46,6 +47,7 @@ type Controller struct { checkIngressOperation *prometheus.CounterVec checkIngressOperationErrors *prometheus.CounterVec sslExpireTime *prometheus.GaugeVec + sslInfo *prometheus.GaugeVec constLabels prometheus.Labels labels prometheus.Labels @@ -152,6 +154,14 @@ func NewController(pod, namespace, class string) *Controller { }, sslLabelHost, ), + sslInfo: prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Namespace: PrometheusNamespace, + Name: "ssl_certificate_info", + Help: `Hold all labels associated to a certificate`, + }, + sslInfoLabels, + ), leaderElection: prometheus.NewGaugeVec( prometheus.GaugeOpts{ Namespace: PrometheusNamespace, @@ -229,6 +239,7 @@ func (cm Controller) Describe(ch chan<- *prometheus.Desc) { cm.checkIngressOperation.Describe(ch) cm.checkIngressOperationErrors.Describe(ch) cm.sslExpireTime.Describe(ch) + cm.sslInfo.Describe(ch) cm.leaderElection.Describe(ch) cm.buildInfo.Describe(ch) } @@ -243,6 +254,7 @@ func (cm Controller) Collect(ch chan<- prometheus.Metric) { cm.checkIngressOperation.Collect(ch) cm.checkIngressOperationErrors.Collect(ch) cm.sslExpireTime.Collect(ch) + cm.sslInfo.Collect(ch) cm.leaderElection.Collect(ch) cm.buildInfo.Collect(ch) } @@ -256,20 +268,89 @@ func (cm *Controller) SetSSLExpireTime(servers []*ingress.Server) { labels[k] = v } labels["host"] = s.Hostname + labels["secret_name"] = s.SSLCert.Name cm.sslExpireTime.With(labels).Set(float64(s.SSLCert.ExpireTime.Unix())) } } } +// SetSSLInfo creates a metric with all certificates informations +func (cm *Controller) SetSSLInfo(servers []*ingress.Server) { + for _, s := range servers { + if s.SSLCert != nil && s.SSLCert.Certificate != nil && s.SSLCert.Certificate.SerialNumber != nil { + labels := make(prometheus.Labels, len(cm.labels)+1) + for k, v := range cm.labels { + labels[k] = v + } + labels["identifier"] = s.SSLCert.Identifier() + labels["host"] = s.Hostname + labels["secret_name"] = s.SSLCert.Name + labels["namespace"] = s.SSLCert.Namespace + labels["issuer_common_name"] = s.SSLCert.Certificate.Issuer.CommonName + labels["issuer_organization"] = "" + if len(s.SSLCert.Certificate.Issuer.Organization) > 0 { + labels["issuer_organization"] = s.SSLCert.Certificate.Issuer.Organization[0] + } + labels["serial_number"] = s.SSLCert.Certificate.SerialNumber.String() + labels["public_key_algorithm"] = s.SSLCert.Certificate.PublicKeyAlgorithm.String() + + cm.sslInfo.With(labels).Set(1) + } + } +} + // RemoveMetrics removes metrics for hostnames not available anymore -func (cm *Controller) RemoveMetrics(hosts []string, registry prometheus.Gatherer) { +func (cm *Controller) RemoveMetrics(hosts, certificates []string, registry prometheus.Gatherer) { cm.removeSSLExpireMetrics(true, hosts, registry) + cm.removeCertificatesMetrics(true, certificates, registry) } -// RemoveAllSSLExpireMetrics removes metrics for expiration of SSL Certificates -func (cm *Controller) RemoveAllSSLExpireMetrics(registry prometheus.Gatherer) { +// RemoveAllSSLMetrics removes metrics for expiration of SSL Certificates +func (cm *Controller) RemoveAllSSLMetrics(registry prometheus.Gatherer) { cm.removeSSLExpireMetrics(false, []string{}, registry) + cm.removeCertificatesMetrics(false, []string{}, registry) +} + +func (cm *Controller) removeCertificatesMetrics(onlyDefinedHosts bool, certificates []string, registry prometheus.Gatherer) { + mfs, err := registry.Gather() + if err != nil { + klog.Errorf("Error gathering metrics: %v", err) + return + } + + toRemove := sets.NewString(certificates...) + + for _, mf := range mfs { + metricName := mf.GetName() + if fmt.Sprintf("%v_ssl_certificate_info", PrometheusNamespace) != metricName { + continue + } + + for _, m := range mf.GetMetric() { + labels := make(map[string]string, len(m.GetLabel())) + for _, labelPair := range m.GetLabel() { + labels[*labelPair.Name] = *labelPair.Value + } + + // remove labels that are constant + deleteConstants(labels) + + identifier, ok := labels["identifier"] + if !ok { + continue + } + if onlyDefinedHosts && !toRemove.Has(identifier) { + continue + } + + klog.V(2).Infof("Removing prometheus metric from gauge %v for identifier %v", metricName, identifier) + removed := cm.sslInfo.Delete(labels) + if !removed { + klog.V(2).Infof("metric %v for identifier %v with labels not removed: %v", metricName, identifier, labels) + } + } + } } func (cm *Controller) removeSSLExpireMetrics(onlyDefinedHosts bool, hosts []string, registry prometheus.Gatherer) { diff --git a/internal/ingress/metric/collectors/controller_test.go b/internal/ingress/metric/collectors/controller_test.go index e10dafdfe3..bc9f21c60c 100644 --- a/internal/ingress/metric/collectors/controller_test.go +++ b/internal/ingress/metric/collectors/controller_test.go @@ -17,6 +17,9 @@ limitations under the License. package collectors import ( + "crypto/x509" + "crypto/x509/pkix" + "math/big" "testing" "time" @@ -96,10 +99,111 @@ func TestControllerCounters(t *testing.T) { want: ` # HELP nginx_ingress_controller_ssl_expire_time_seconds Number of seconds since 1970 to the SSL Certificate expire.\n An example to check if this certificate will expire in 10 days is: "nginx_ingress_controller_ssl_expire_time_seconds < (time() + (10 * 24 * 3600))" # TYPE nginx_ingress_controller_ssl_expire_time_seconds gauge - nginx_ingress_controller_ssl_expire_time_seconds{class="nginx",host="demo",namespace="default"} 1.351807721e+09 + nginx_ingress_controller_ssl_expire_time_seconds{class="nginx",host="demo",namespace="default",secret_name=""} 1.351807721e+09 `, metrics: []string{"nginx_ingress_controller_ssl_expire_time_seconds"}, }, + { + name: "should set SSL certificates infos metrics", + test: func(cm *Controller) { + + servers := []*ingress.Server{ + { + Hostname: "demo", + SSLCert: &ingress.SSLCert{ + Name: "secret-name", + Namespace: "ingress-namespace", + Certificate: &x509.Certificate{ + PublicKeyAlgorithm: x509.ECDSA, + Issuer: pkix.Name{ + CommonName: "certificate issuer", + SerialNumber: "abcd1234", + Organization: []string{"issuer org"}, + }, + SerialNumber: big.NewInt(100), + }, + }, + }, + { + Hostname: "invalid", + SSLCert: &ingress.SSLCert{ + ExpireTime: time.Unix(0, 0), + }, + }, + } + cm.SetSSLInfo(servers) + }, + want: ` + # HELP nginx_ingress_controller_ssl_certificate_info Hold all labels associated to a certificate + # TYPE nginx_ingress_controller_ssl_certificate_info gauge + nginx_ingress_controller_ssl_certificate_info{class="nginx",host="demo",identifier="abcd1234-100",issuer_common_name="certificate issuer",issuer_organization="issuer org",namespace="ingress-namespace",public_key_algorithm="ECDSA",secret_name="secret-name",serial_number="100"} 1 + `, + metrics: []string{"nginx_ingress_controller_ssl_certificate_info"}, + }, + { + name: "should ignore certificates without serial number", + test: func(cm *Controller) { + + servers := []*ingress.Server{ + { + Hostname: "demo", + SSLCert: &ingress.SSLCert{ + Name: "secret-name", + Namespace: "ingress-namespace", + Certificate: &x509.Certificate{ + PublicKeyAlgorithm: x509.ECDSA, + Issuer: pkix.Name{ + CommonName: "certificate issuer", + SerialNumber: "abcd1234", + }, + }, + }, + }, + } + cm.SetSSLInfo(servers) + }, + want: ``, + metrics: []string{"nginx_ingress_controller_ssl_certificate_info"}, + }, + { + name: "should ignore certificates with nil x509 pointer", + test: func(cm *Controller) { + + servers := []*ingress.Server{ + { + Hostname: "demo", + SSLCert: &ingress.SSLCert{ + Name: "secret-name", + Namespace: "ingress-namespace", + Certificate: &x509.Certificate{ + PublicKeyAlgorithm: x509.ECDSA, + Issuer: pkix.Name{ + CommonName: "certificate issuer", + SerialNumber: "abcd1234", + }, + }, + }, + }, + } + cm.SetSSLInfo(servers) + }, + want: ``, + metrics: []string{"nginx_ingress_controller_ssl_certificate_info"}, + }, + { + name: "should ignore servers without certificates", + test: func(cm *Controller) { + + servers := []*ingress.Server{ + { + Hostname: "demo", + }, + } + cm.SetSSLInfo(servers) + }, + want: ``, + metrics: []string{"nginx_ingress_controller_ssl_certificate_info"}, + }, } for _, c := range cases { @@ -137,6 +241,13 @@ func TestRemoveMetrics(t *testing.T) { Hostname: "demo", SSLCert: &ingress.SSLCert{ ExpireTime: t1, + Certificate: &x509.Certificate{ + Issuer: pkix.Name{ + CommonName: "certificate issuer", + SerialNumber: "abcd1234", + }, + SerialNumber: big.NewInt(100), + }, }, }, { @@ -147,12 +258,63 @@ func TestRemoveMetrics(t *testing.T) { }, } cm.SetSSLExpireTime(servers) + cm.SetSSLInfo(servers) - cm.RemoveMetrics([]string{"demo"}, reg) + cm.RemoveMetrics([]string{"demo"}, []string{"abcd1234-100"}, reg) if err := GatherAndCompare(cm, "", []string{"nginx_ingress_controller_ssl_expire_time_seconds"}, reg); err != nil { t.Errorf("unexpected collecting result:\n%s", err) } + if err := GatherAndCompare(cm, "", []string{"nginx_ingress_controller_ssl_certificate_info"}, reg); err != nil { + t.Errorf("unexpected collecting result:\n%s", err) + } + + reg.Unregister(cm) +} + +func TestRemoveAllSSLMetrics(t *testing.T) { + cm := NewController("pod", "default", "nginx") + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(cm); err != nil { + t.Errorf("registering collector failed: %s", err) + } + + t1, _ := time.Parse( + time.RFC3339, + "2012-11-01T22:08:41+00:00") + + servers := []*ingress.Server{ + { + Hostname: "demo", + SSLCert: &ingress.SSLCert{ + ExpireTime: t1, + Certificate: &x509.Certificate{ + Issuer: pkix.Name{ + CommonName: "certificate issuer", + SerialNumber: "abcd1234", + }, + SerialNumber: big.NewInt(100), + }, + }, + }, + { + Hostname: "invalid", + SSLCert: &ingress.SSLCert{ + ExpireTime: time.Unix(0, 0), + }, + }, + } + cm.SetSSLExpireTime(servers) + cm.SetSSLInfo(servers) + + cm.RemoveAllSSLMetrics(reg) + + if err := GatherAndCompare(cm, "", []string{"nginx_ingress_controller_ssl_expire_time_seconds"}, reg); err != nil { + t.Errorf("unexpected collecting result:\n%s", err) + } + if err := GatherAndCompare(cm, "", []string{"nginx_ingress_controller_ssl_certificate_info"}, reg); err != nil { + t.Errorf("unexpected collecting result:\n%s", err) + } reg.Unregister(cm) } diff --git a/internal/ingress/metric/dummy.go b/internal/ingress/metric/dummy.go index 922a21604d..8360dc87db 100644 --- a/internal/ingress/metric/dummy.go +++ b/internal/ingress/metric/dummy.go @@ -48,7 +48,7 @@ func (dc DummyCollector) IncCheckCount(string, string) {} func (dc DummyCollector) IncCheckErrorCount(string, string) {} // RemoveMetrics ... -func (dc DummyCollector) RemoveMetrics(ingresses, endpoints []string) {} +func (dc DummyCollector) RemoveMetrics(ingresses, endpoints, certificates []string) {} // Start ... func (dc DummyCollector) Start(admissionStatus string) {} @@ -56,6 +56,9 @@ func (dc DummyCollector) Start(admissionStatus string) {} // Stop ... func (dc DummyCollector) Stop(admissionStatus string) {} +// SetSSLInfo ... +func (dc DummyCollector) SetSSLInfo([]*ingress.Server) {} + // SetSSLExpireTime ... func (dc DummyCollector) SetSSLExpireTime([]*ingress.Server) {} diff --git a/internal/ingress/metric/main.go b/internal/ingress/metric/main.go index 644a9b690e..97a8ca7d40 100644 --- a/internal/ingress/metric/main.go +++ b/internal/ingress/metric/main.go @@ -44,9 +44,10 @@ type Collector interface { IncCheckCount(string, string) IncCheckErrorCount(string, string) - RemoveMetrics(ingresses, endpoints []string) + RemoveMetrics(ingresses, endpoints, certificates []string) SetSSLExpireTime([]*ingress.Server) + SetSSLInfo(servers []*ingress.Server) // SetHosts sets the hostnames that are being served by the ingress controller SetHosts(sets.String) @@ -128,9 +129,9 @@ func (c *collector) IncReloadErrorCount() { c.ingressController.IncReloadErrorCount() } -func (c *collector) RemoveMetrics(ingresses, hosts []string) { +func (c *collector) RemoveMetrics(ingresses, hosts, certificates []string) { c.socket.RemoveMetrics(ingresses, c.registry) - c.ingressController.RemoveMetrics(hosts, c.registry) + c.ingressController.RemoveMetrics(hosts, certificates, c.registry) } func (c *collector) Start(admissionStatus string) { @@ -175,6 +176,11 @@ func (c *collector) SetSSLExpireTime(servers []*ingress.Server) { c.ingressController.SetSSLExpireTime(servers) } +func (c *collector) SetSSLInfo(servers []*ingress.Server) { + klog.V(2).Infof("Updating ssl certificate info metrics") + c.ingressController.SetSSLInfo(servers) +} + func (c *collector) SetHosts(hosts sets.String) { c.socket.SetHosts(hosts) } @@ -200,7 +206,7 @@ func (c *collector) OnStartedLeading(electionID string) { func (c *collector) OnStoppedLeading(electionID string) { setLeader(false) c.ingressController.OnStoppedLeading(electionID) - c.ingressController.RemoveAllSSLExpireMetrics(c.registry) + c.ingressController.RemoveAllSSLMetrics(c.registry) } var ( diff --git a/internal/ingress/sslcert.go b/internal/ingress/sslcert.go index 9265b07ac8..7dee3880d2 100644 --- a/internal/ingress/sslcert.go +++ b/internal/ingress/sslcert.go @@ -18,6 +18,7 @@ package ingress import ( "crypto/x509" + "fmt" "time" "k8s.io/apimachinery/pkg/runtime/schema" @@ -69,6 +70,16 @@ func (s SSLCert) GetObjectKind() schema.ObjectKind { return schema.EmptyObjectKind } +// Identifier returns a the couple issuer / serial number if they both exist, an empty string otherwise +func (s SSLCert) Identifier() string { + if s.Certificate != nil { + if s.Certificate.SerialNumber != nil { + return fmt.Sprintf("%s-%s", s.Certificate.Issuer.SerialNumber, s.Certificate.SerialNumber.String()) + } + } + return "" +} + // HashInclude defines if a field should be used or not to calculate the hash func (s SSLCert) HashInclude(field string, v interface{}) (bool, error) { switch field {