Skip to content

Commit

Permalink
backport(fix): Fix dashboard panels not working from #80 (#82)
Browse files Browse the repository at this point in the history
* Add `ckf` tag to the grafana dashboard.
* Fix dashboard panels not working by: 
  * Replacing unavailable metrics with available ones
  * Using a 2-minute range instead of a 1-minute one wherever rate() is used, since rate() needs more than one scraped data point in the range.
  * Removing rate() from panels that show percentages.
  * Removing labels that the metrics don't provide.

Part of canonical/bundle-kubeflow#856
Ref canonical/bundle-kubeflow#834
Ref #73
  • Loading branch information
orfeas-k committed Apr 9, 2024
1 parent d4728db commit b1f115e
Showing 1 changed file with 26 additions and 26 deletions.
52 changes: 26 additions & 26 deletions src/grafana_dashboards/envoy-service.json.tmpl
Original file line number Diff line number Diff line change
Expand Up @@ -88,28 +88,28 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(envoy_cluster_upstream_cx_total{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_cx_total{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "egress CPS",
"refId": "A"
},
{
"expr": "sum(rate(envoy_cluster_upstream_rq_total{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_rq_total{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "egress RPS",
"refId": "B"
},
{
"expr": "sum(rate(envoy_cluster_upstream_rq_pending_total{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_rq_pending_total{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "pending req to",
"legendFormat": "pending req total",
"refId": "C"
},
{
"expr": "sum(rate(envoy_cluster_lb_healthy_panic{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_lb_healthy_panic{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "lb healthy panic RPS",
Expand Down Expand Up @@ -408,10 +408,10 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(envoy_cluster_upstream_rq_xx{response_code_class=\"4\",envoy_cluster_name=~\"[[originating_service]]\"}[1m])) / sum(rate(envoy_cluster_upstream_rq_xx{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(envoy_http_downstream_rq_xx{envoy_response_code_class=\"4\"}) / sum(envoy_http_downstream_rq_xx{})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "%",
"legendFormat": "http downstream 4xx requests %",
"refId": "A"
}
],
Expand Down Expand Up @@ -495,7 +495,7 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(envoy_cluster_upstream_rq_xx{response_code_class!=\"5\",envoy_cluster_name=~\"[[originating_service]]\"}[1m])) / sum(rate(envoy_cluster_upstream_rq_xx{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(envoy_http_downstream_rq_xx{envoy_response_code_class!=\"5\"}) / sum(envoy_http_downstream_rq_xx{})",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "Success Rate %",
Expand Down Expand Up @@ -581,63 +581,63 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(envoy_cluster_upstream_cx_connect_timeout{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_cx_connect_timeout{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "connect timeout",
"refId": "A"
},
{
"expr": "sum(rate(envoy_cluster_upstream_rq_pending_failure_eject{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_rq_pending_failure_eject{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "pending failure ejection",
"refId": "B"
},
{
"expr": "sum(rate(envoy_cluster_upstream_rq_pending_overflow{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_rq_pending_overflow{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "pending overflow",
"refId": "C"
},
{
"expr": "sum(rate(envoy_cluster_upstream_rq_timeout{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_rq_timeout{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "request timeout",
"refId": "D"
},
{
"expr": "sum(rate(envoy_cluster_upstream_rq_per_try_timeout{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_rq_per_try_timeout{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "per try request timeout",
"refId": "E"
},
{
"expr": "sum(rate(envoy_cluster_upstream_rq_rx_reset{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_rq_rx_reset{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "request reset",
"refId": "F"
},
{
"expr": "sum(rate(envoy_cluster_upstream_cx_destroy_local_with_active_rq{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_cx_destroy_local_with_active_rq{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "destroy initialized from originating service",
"refId": "G"
},
{
"expr": "sum(rate(envoy_http_downstream_cx_destroy_remote_active_rq{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_http_downstream_cx_destroy_remote_active_rq{}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "destroy initialized from remote service",
"refId": "H"
},
{
"expr": "sum(rate(envoy_cluster_upstream_rq_maintenance_mode{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_rq_maintenance_mode{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "request failed maintenance mode",
Expand Down Expand Up @@ -722,29 +722,29 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(envoy_cluster_upstream_flow_control_paused_reading_total{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_flow_control_paused_reading_total{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "paused reading from destination service",
"refId": "A"
},
{
"expr": "sum(rate(envoy_cluster_upstream_flow_control_resumed_reading_total{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_flow_control_resumed_reading_total{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "resumed reading from destination service",
"refId": "B"
},
{
"expr": "sum(rate(envoy_cluster_upstream_flow_control_backed_up_total{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_flow_control_backed_up_total{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "paused reading from originating service",
"refId": "C"
},
{
"expr": "sum(rate(envoy_cluster_upstream_flow_control_drained_total{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_flow_control_drained_total{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "resumed reading from originating service",
Expand Down Expand Up @@ -829,22 +829,22 @@
"steppedLine": false,
"targets": [
{
"expr": "sum(rate(envoy_cluster_upstream_rq_retry{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_rq_retry{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"interval": "",
"intervalFactor": 2,
"legendFormat": "request retry",
"refId": "A"
},
{
"expr": "sum(rate(envoy_cluster_upstream_rq_retry_success{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_rq_retry_success{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "request retry success",
"refId": "B"
},
{
"expr": "sum(rate(envoy_cluster_upstream_rq_retry_overflow{envoy_cluster_name=~\"[[originating_service]]\"}[1m]))",
"expr": "sum(rate(envoy_cluster_upstream_rq_retry_overflow{envoy_cluster_name=~\"[[originating_service]]\"}[2m]))",
"format": "time_series",
"intervalFactor": 2,
"legendFormat": "request retry overflow",
Expand Down Expand Up @@ -896,8 +896,8 @@
"schemaVersion": 16,
"style": "dark",
"tags": [
"envoy",
"test"
"ckf",
"envoy"
],
"templating": {
"list": [
Expand Down

0 comments on commit b1f115e

Please sign in to comment.