Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Roffe/metrics polish #595

Merged
merged 5 commits into from
Dec 7, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified dashboard/dashboard.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2,875 changes: 1,533 additions & 1,342 deletions dashboard/kube-router.json

Large diffs are not rendered by default.

12 changes: 10 additions & 2 deletions docs/metrics.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ The default values unless other specified is
By enabling [Kubernetes SD](https://prometheus.io/docs/prometheus/latest/configuration/configuration/#<kubernetes_sd_config>) in the Prometheus configuration & adding the required annotations, Prometheus can automatically discover & scrape kube-router metrics

## Version notes
kube-router v0.2.4 received a metrics overhaul in which some metrics were changed into histograms and additional metrics were added. Please make sure you are using the latest dashboard version with kube-router versions >= v0.2.4

kube-router 0.1.0-rc2 and upwards supports runtime configuration for controlling where the metrics are exposed. If you are using an older version, the metrics path & port are locked to `/metrics` & `8080`

## Supported annotations
Expand Down Expand Up @@ -56,14 +58,20 @@ The following metrics is exposed by kube-router prefixed by `kube_router_`
* controller_bgp_peers
Number of BGP peers of the instance
* controller_bgp_advertisements_received
Number of total BGP advertisements received since kube-router start
Total number of BGP advertisements received since kube-router started
* controller_bgp_advertisements_sent
Total number of BGP advertisements sent since kube-router started
* controller_bgp_internal_peers_sync_time
Time it took for the BGP internal peer sync loop to complete
* controller_routes_sync_time
Time it took for controller to sync routes

### run-firewall=true

* controller_iptables_sync_time
Time it took for the iptables sync loop to complete
* controller_policy_chains_sync_time
Time it took for controller to sync policy chains

### run-service-proxy = true

Expand Down Expand Up @@ -95,7 +103,7 @@ The following metrics is exposed by kube-router prefixed by `kube_router_`
Outgoing bytes per second

To get a list of CPS grouped by service, a Prometheus query could look like this:
`sum(kube_router_service_cps) by (namespace, service_name)`
`sum(kube_router_service_cps) by (svc_namespace, service_name)`

## Grafana Dashboard

Expand Down
12 changes: 9 additions & 3 deletions pkg/controllers/netpol/network_policy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"errors"
"fmt"
"net"
"regexp"
"strconv"
"strings"
"sync"
Expand All @@ -26,7 +27,6 @@ import (
"k8s.io/client-go/kubernetes"
listers "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/tools/cache"
"regexp"
)

const (
Expand Down Expand Up @@ -215,7 +215,7 @@ func (npc *NetworkPolicyController) Sync() error {
defer func() {
endTime := time.Since(start)
if npc.MetricsEnabled {
metrics.ControllerIptablesSyncTime.WithLabelValues().Set(float64(endTime.Seconds()))
metrics.ControllerIptablesSyncTime.Observe(endTime.Seconds())
}
glog.V(1).Infof("sync iptables took %v", endTime)
}()
Expand Down Expand Up @@ -258,7 +258,12 @@ func (npc *NetworkPolicyController) Sync() error {
// policyspec is evaluated to set of matching pods, which are grouped in to a
// ipset used for source ip addr matching.
func (npc *NetworkPolicyController) syncNetworkPolicyChains(version string) (map[string]bool, map[string]bool, error) {

start := time.Now()
defer func() {
endTime := time.Since(start)
metrics.ControllerPolicyChainsSyncTime.Observe(endTime.Seconds())
glog.V(2).Infof("Syncing network policy chains took %v", endTime)
}()
activePolicyChains := make(map[string]bool)
activePolicyIpSets := make(map[string]bool)

Expand Down Expand Up @@ -1536,6 +1541,7 @@ func NewNetworkPolicyController(clientset kubernetes.Interface,
if config.MetricsEnabled {
//Register the metrics for this controller
prometheus.MustRegister(metrics.ControllerIptablesSyncTime)
prometheus.MustRegister(metrics.ControllerPolicyChainsSyncTime)
npc.MetricsEnabled = true
}

Expand Down
8 changes: 5 additions & 3 deletions pkg/controllers/proxy/network_services_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,9 @@ func (nsc *NetworkServicesController) publishMetrics(serviceInfoMap serviceInfoM
defer func() {
endTime := time.Since(start)
glog.V(2).Infof("Publishing IPVS metrics took %v", endTime)
metrics.ControllerIpvsMetricsExportTime.WithLabelValues().Set(float64(endTime.Seconds()))
if nsc.MetricsEnabled {
metrics.ControllerIpvsMetricsExportTime.Observe(float64(endTime.Seconds()))
}
}()

ipvsSvcs, err := nsc.ln.ipvsGetServices()
Expand Down Expand Up @@ -429,7 +431,7 @@ func (nsc *NetworkServicesController) publishMetrics(serviceInfoMap serviceInfoM
metrics.ServicePpsIn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PPSIn))
metrics.ServicePpsOut.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.PPSOut))
metrics.ServiceTotalConn.WithLabelValues(svc.namespace, svc.name, svcVip, svc.protocol, strconv.Itoa(svc.port)).Set(float64(ipvsSvc.Stats.Connections))
metrics.ControllerIpvsServices.WithLabelValues().Set(float64(len(ipvsSvcs)))
metrics.ControllerIpvsServices.Set(float64(len(ipvsSvcs)))
}
}
}
Expand Down Expand Up @@ -528,7 +530,7 @@ func (nsc *NetworkServicesController) syncIpvsServices(serviceInfoMap serviceInf
defer func() {
endTime := time.Since(start)
if nsc.MetricsEnabled {
metrics.ControllerIpvsServicesSyncTime.WithLabelValues().Set(float64(endTime.Seconds()))
metrics.ControllerIpvsServicesSyncTime.Observe(endTime.Seconds())
}
glog.V(1).Infof("sync ipvs services took %v", endTime)
}()
Expand Down
9 changes: 6 additions & 3 deletions pkg/controllers/routing/bgp_peers.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,9 @@ func (nrc *NetworkRoutingController) syncInternalPeers() {
start := time.Now()
defer func() {
endTime := time.Since(start)
metrics.ControllerBGPInternalPeersSyncTime.WithLabelValues().Set(float64(endTime.Seconds()))
if nrc.MetricsEnabled {
metrics.ControllerBGPInternalPeersSyncTime.Observe(endTime.Seconds())
}
glog.V(2).Infof("Syncing BGP peers for the node took %v", endTime)
}()

Expand All @@ -40,8 +42,9 @@ func (nrc *NetworkRoutingController) syncInternalPeers() {
glog.Errorf("Failed to list nodes from API server due to: %s. Can not perform BGP peer sync", err.Error())
return
}

metrics.ControllerBPGpeers.WithLabelValues().Set(float64(len(nodes.Items)))
if nrc.MetricsEnabled {
metrics.ControllerBPGpeers.Set(float64(len(nodes.Items)))
}
// establish peer and add Pod CIDRs with current set of nodes
currentNodes := make([]string, 0)
for _, node := range nodes.Items {
Expand Down
12 changes: 11 additions & 1 deletion pkg/controllers/routing/network_routes_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -325,7 +325,7 @@ func (nrc *NetworkRoutingController) watchBgpUpdates() {
case *gobgp.WatchEventBestPath:
glog.V(3).Info("Processing bgp route advertisement from peer")
if nrc.MetricsEnabled {
metrics.ControllerBGPadvertisementsReceived.WithLabelValues().Add(float64(1))
metrics.ControllerBGPadvertisementsReceived.Inc()
}
for _, path := range msg.PathList {
if path.IsLocal() {
Expand All @@ -342,6 +342,9 @@ func (nrc *NetworkRoutingController) watchBgpUpdates() {
}

func (nrc *NetworkRoutingController) advertisePodRoute() error {
if nrc.MetricsEnabled {
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For a future PR, I think we can make some changes to remove all if metrics enabled checks.

metrics.ControllerBGPadvertisementsSent.Inc()
}
cidr, err := utils.GetPodCidrFromNodeSpec(nrc.clientset, nrc.hostnameOverride)
if err != nil {
return err
Expand Down Expand Up @@ -486,6 +489,12 @@ func (nrc *NetworkRoutingController) Cleanup() {
}

func (nrc *NetworkRoutingController) syncNodeIPSets() error {
start := time.Now()
defer func() {
if nrc.MetricsEnabled {
metrics.ControllerRoutesSyncTime.Observe(time.Since(start).Seconds())
}
}()
// Get the current list of the nodes from API server
nodes, err := nrc.clientset.CoreV1().Nodes().List(metav1.ListOptions{})
if err != nil {
Expand Down Expand Up @@ -786,6 +795,7 @@ func NewNetworkRoutingController(clientset kubernetes.Interface,
prometheus.MustRegister(metrics.ControllerBGPadvertisementsReceived)
prometheus.MustRegister(metrics.ControllerBGPInternalPeersSyncTime)
prometheus.MustRegister(metrics.ControllerBPGpeers)
prometheus.MustRegister(metrics.ControllerRoutesSyncTime)
nrc.MetricsEnabled = true
}

Expand Down
100 changes: 65 additions & 35 deletions pkg/metrics/metrics_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,108 +21,138 @@ const (
)

var (
// ServiceTotalConn Total incoming connections made
ServiceTotalConn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_total_connections",
Help: "Total incoming connections made",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServicePacketsIn Total incoming packets
ServicePacketsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_packets_in",
Help: "Total incoming packets",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServicePacketsOut Total outgoing packets
ServicePacketsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_packets_out",
Help: "Total outgoing packets",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServiceBytesIn Total incoming bytes
ServiceBytesIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_bytes_in",
Help: "Total incoming bytes",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServiceBytesOut Total outgoing bytes
ServiceBytesOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_bytes_out",
Help: "Total outgoing bytes",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServicePpsIn Incoming packets per second
ServicePpsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_pps_in",
Help: "Incoming packets per second",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServicePpsOut Outgoing packets per second
ServicePpsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_pps_out",
Help: "Outgoing packets per second",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServiceCPS Service connections per second
ServiceCPS = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_cps",
Help: "Service connections per second",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServiceBpsIn Incoming bytes per second
ServiceBpsIn = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_bps_in",
Help: "Incoming bytes per second",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ServiceBpsOut Outgoing bytes per second
ServiceBpsOut = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "service_bps_out",
Help: "Outgoing bytes per second",
}, []string{"namespace", "service_name", "service_vip", "protocol", "port"})
ControllerIpvsServices = prometheus.NewGaugeVec(prometheus.GaugeOpts{
}, []string{"svc_namespace", "service_name", "service_vip", "protocol", "port"})
// ControllerIpvsServices Number of ipvs services in the instance
ControllerIpvsServices = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Name: "controller_ipvs_services",
Help: "Number of ipvs services in the instance",
}, []string{})
ControllerIptablesSyncTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
})
// ControllerIptablesSyncTime Time it took for controller to sync iptables
ControllerIptablesSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "controller_iptables_sync_time",
Help: "Time it took for controller to sync iptables",
}, []string{})
ControllerPublishMetricsTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "controller_publish_metrics_time",
Help: "Time it took to publish metrics",
}, []string{})
ControllerIpvsServicesSyncTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
})
// ControllerIpvsServicesSyncTime Time it took for controller to sync ipvs services
ControllerIpvsServicesSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "controller_ipvs_services_sync_time",
Help: "Time it took for controller to sync ipvs services",
}, []string{})
ControllerBPGpeers = prometheus.NewGaugeVec(prometheus.GaugeOpts{
})
// ControllerRoutesSyncTime Time it took for controller to sync routes
ControllerRoutesSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "controller_routes_sync_time",
Help: "Time it took for controller to sync routes",
})
// ControllerBPGpeers BGP peers in the runtime configuration
ControllerBPGpeers = prometheus.NewGauge(prometheus.GaugeOpts{
Namespace: namespace,
Name: "controller_bgp_peers",
Help: "BGP peers in the runtime configuration",
}, []string{})
ControllerBGPInternalPeersSyncTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
})
// ControllerBGPInternalPeersSyncTime Time it took to sync internal bgp peers
ControllerBGPInternalPeersSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "controller_bgp_internal_peers_sync_time",
Help: "Time it took to sync internal bgp peers",
}, []string{})
ControllerBGPadvertisementsReceived = prometheus.NewGaugeVec(prometheus.GaugeOpts{
})
// ControllerBGPadvertisementsReceived BGP advertisements received
ControllerBGPadvertisementsReceived = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Name: "controller_bgp_advertisements_received",
Help: "Time it took to sync internal bgp peers",
}, []string{})
ControllerIpvsMetricsExportTime = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Help: "BGP advertisements received",
})
// ControllerBGPadvertisementsSent BGP advertisements sent
ControllerBGPadvertisementsSent = prometheus.NewCounter(prometheus.CounterOpts{
Namespace: namespace,
Name: "controller_bgp_advertisements_sent",
Help: "BGP advertisements sent",
})
// ControllerIpvsMetricsExportTime Time it took to export metrics
ControllerIpvsMetricsExportTime = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "controller_ipvs_metrics_export_time",
Help: "Time it took to export metrics",
}, []string{})
})
// ControllerPolicyChainsSyncTime Time it took for controller to sync policy chains
ControllerPolicyChainsSyncTime = prometheus.NewHistogram(prometheus.HistogramOpts{
Namespace: namespace,
Name: "controller_policy_chains_sync_time",
Help: "Time it took for controller to sync policy chains",
})
)

// MetricsController Holds settings for the metrics controller
type MetricsController struct {
// Controller Holds settings for the metrics controller
type Controller struct {
MetricsPath string
MetricsPort uint16
mu sync.Mutex
nodeIP net.IP
}

// Run prometheus metrics controller
func (mc *MetricsController) Run(healthChan chan<- *healthcheck.ControllerHeartbeat, stopCh <-chan struct{}, wg *sync.WaitGroup) error {
func (mc *Controller) Run(healthChan chan<- *healthcheck.ControllerHeartbeat, stopCh <-chan struct{}, wg *sync.WaitGroup) error {
t := time.NewTicker(3 * time.Second)
defer wg.Done()
glog.Info("Starting metrics controller")
Expand Down Expand Up @@ -157,8 +187,8 @@ func (mc *MetricsController) Run(healthChan chan<- *healthcheck.ControllerHeartb
}

// NewMetricsController returns a new metrics Controller object
func NewMetricsController(clientset kubernetes.Interface, config *options.KubeRouterConfig) (*MetricsController, error) {
mc := MetricsController{}
func NewMetricsController(clientset kubernetes.Interface, config *options.KubeRouterConfig) (*Controller, error) {
mc := Controller{}
mc.MetricsPath = config.MetricsPath
mc.MetricsPort = config.MetricsPort
return &mc, nil
Expand Down