Skip to content

Commit

Permalink
feat: use constrained topk to improve dashboard performance
Browse files Browse the repository at this point in the history
Thanks to Alessandro.Nuzzo for raising!
  • Loading branch information
cgrinds committed Apr 12, 2024
1 parent 6ab8aa8 commit 0726c41
Show file tree
Hide file tree
Showing 26 changed files with 457 additions and 447 deletions.
55 changes: 38 additions & 17 deletions cmd/tools/grafana/dashboard_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -807,12 +807,12 @@ func checkUniquePanelIDs(t *testing.T, path string, data []byte) {
})
}

// - collect all expressions that include "topk". Ignore expressions that are:
// - Collect all expressions and variables that include "topk".
// Ignore expressions that are:
// - part of a table or stat or
// - calculate a percentage
// - for each expression - check if any variable used in the expression has a topk range
// a) if it does, pass
// b) otherwise fail, printing the expression, path, dashboard
// - if the var|expression includes a `rate|deriv`, ensure the look-back is 4m
// - otherwise, the look-back should be 3h

func TestTopKRange(t *testing.T) {
VisitDashboards(
Expand Down Expand Up @@ -846,24 +846,29 @@ func checkTopKRange(t *testing.T, path string, data []byte) {
if strings.Contains(expr.expr, "/") {
continue
}
hasRange := false
vars:

for _, name := range expr.vars {
for _, v := range variables {
if v.name == name && strings.Contains(v.query, "__range") {
hasRange = true
break vars
}
v, ok := variables[name]
if !ok {
t.Errorf(`dashboard=%s path=%s is using var that does not exist. var=%s`,
ShortPath(path), expr.path, name)
continue
}
if !strings.Contains(v.query, "topk") {
continue
}

noWhitespace := strings.ReplaceAll(v.query, " ", "")
problem := ensureLookBack(noWhitespace)
if problem != "" {
t.Errorf(`dashboard=%s var=%s topk got=%s %s`, ShortPath(path), v.name, v.query, problem)
}
}

noWhitespace := strings.ReplaceAll(expr.expr, " ", "")
if strings.Contains(noWhitespace, "[$__range]@end()") {
hasRange = true
}
if !hasRange {
t.Errorf(`dashboard=%s path=%s use topk but no variable has range. expr=%s`,
ShortPath(path), expr.path, expr.expr)
problem := ensureLookBack(noWhitespace)
if problem != "" {
t.Errorf(`dashboard=%s path=%s topk got=%s %s`, ShortPath(path), expr.path, expr.expr, problem)
}
}

Expand Down Expand Up @@ -907,6 +912,22 @@ func checkTopKRange(t *testing.T, path string, data []byte) {

}

// ensureLookBack checks that a query uses the expected look-back window.
// The argument is expected to be the query text with spaces already removed.
//
// A query with no "[" in it is accepted unchanged. A query that contains
// "rate(" or "deriv(" must also contain "[4m]"; any other query containing
// "[" must contain "[3h]".
//
// It returns an empty string when the query is acceptable, otherwise a short
// message naming the look-back that was expected.
func ensureLookBack(noWhitespace string) string {
	// Nothing to validate when the query has no range selector at all.
	hasRange := strings.Contains(noWhitespace, "[")
	if !hasRange {
		return ""
	}

	usesRate := strings.Contains(noWhitespace, "rate(") || strings.Contains(noWhitespace, "deriv(")

	switch {
	case usesRate && !strings.Contains(noWhitespace, "[4m]"):
		return "rate/deriv want=[4m]"
	case !usesRate && !strings.Contains(noWhitespace, "[3h]"):
		return "range lookback want=[3h]"
	default:
		return ""
	}
}

func TestOnlyHighlightsExpanded(t *testing.T) {
exceptions := map[string]int{
"cmode/shelf.json": 2,
Expand Down
9 changes: 5 additions & 4 deletions cmd/tools/grafana/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,22 +105,23 @@ type variable struct {
options []gjson.Result
}

func allVariables(data []byte) []variable {
variables := make([]variable, 0)
func allVariables(data []byte) map[string]variable {
variables := make(map[string]variable)
gjson.GetBytes(data, "templating.list").ForEach(func(key, value gjson.Result) bool {
// The datasource variable can be ignored
if value.Get("type").String() == "datasource" {
return true
}

variables = append(variables, variable{
v := variable{
name: value.Get("name").String(),
kind: value.Get("type").String(),
query: value.Get("query.query").String(),
refresh: value.Get("refresh").String(),
options: value.Get("options").Array(),
path: key.String(),
})
}
variables[v.name] = v
return true
})
return variables
Expand Down
40 changes: 20 additions & 20 deletions grafana/dashboards/cmode/aggregate.json

Large diffs are not rendered by default.

22 changes: 11 additions & 11 deletions grafana/dashboards/cmode/cdot.json
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@
"targets": [
{
"exemplar": false,
"expr": "sum by (cluster) (node_cifs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, sum by (cluster) (avg_over_time(node_cifs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end())))",
"expr": "sum by (cluster) (node_cifs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, sum by (cluster) (avg_over_time(node_cifs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end())))",
"interval": "",
"legendFormat": "{{cluster}}",
"refId": "A"
Expand Down Expand Up @@ -260,7 +260,7 @@
"targets": [
{
"exemplar": false,
"expr": "sum by (cluster) (node_nfs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, sum by (cluster) (avg_over_time(node_nfs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end())))",
"expr": "sum by (cluster) (node_nfs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, sum by (cluster) (avg_over_time(node_nfs_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end())))",
"interval": "",
"legendFormat": "{{cluster}}",
"refId": "A"
Expand Down Expand Up @@ -351,7 +351,7 @@
"targets": [
{
"exemplar": false,
"expr": "sum by (cluster) (volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, sum by (cluster) (avg_over_time(volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end())))",
"expr": "sum by (cluster) (volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, sum by (cluster) (avg_over_time(volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end())))",
"interval": "",
"legendFormat": "{{cluster}}",
"refId": "A"
Expand Down Expand Up @@ -445,7 +445,7 @@
"targets": [
{
"exemplar": false,
"expr": "avg by (cluster) (node_avg_processor_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, avg by (cluster) (avg_over_time(node_avg_processor_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end())))",
"expr": "avg by (cluster) (node_avg_processor_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, avg by (cluster) (avg_over_time(node_avg_processor_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end())))",
"format": "time_series",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -540,7 +540,7 @@
"targets": [
{
"exemplar": false,
"expr": "avg by (cluster) (node_cpu_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, avg by (cluster) (avg_over_time(node_cpu_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end())))",
"expr": "avg by (cluster) (node_cpu_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, avg by (cluster) (avg_over_time(node_cpu_busy{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end())))",
"format": "time_series",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -633,7 +633,7 @@
"targets": [
{
"exemplar": false,
"expr": "avg by (cluster) (node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, avg by (cluster) (avg_over_time(node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end())))",
"expr": "avg by (cluster) (node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"})\nand\ntopk($TopResources, avg by (cluster) (avg_over_time(node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end())))",
"interval": "",
"legendFormat": "{{cluster}}",
"refId": "A"
Expand Down Expand Up @@ -1432,7 +1432,7 @@
"targets": [
{
"exemplar": false,
"expr": "svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}[$__range] @ end()))",
"expr": "svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}[3h] @ end()))",
"interval": "",
"legendFormat": "{{cluster}} - {{svm}}",
"refId": "A"
Expand Down Expand Up @@ -1611,7 +1611,7 @@
"targets": [
{
"exemplar": false,
"expr": "svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}[$__range] @ end()))",
"expr": "svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\"}[3h] @ end()))",
"interval": "",
"legendFormat": "{{cluster}} - {{svm}}",
"refId": "A"
Expand Down Expand Up @@ -1716,7 +1716,7 @@
{
"datasource": "${DS_PROMETHEUS}",
"exemplar": false,
"expr": "volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}\nand\ntopk($TopResources, avg_over_time(volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}[$__range] @ end()))",
"expr": "volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}\nand\ntopk($TopResources, avg_over_time(volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}[3h] @ end()))",
"interval": "",
"legendFormat": "{{cluster}} - {{svm}} - {{volume}}",
"refId": "A"
Expand Down Expand Up @@ -1807,7 +1807,7 @@
{
"datasource": "${DS_PROMETHEUS}",
"exemplar": false,
"expr": "volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}\nand\ntopk($TopResources, avg_over_time(volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}[$__range] @ end())) ",
"expr": "volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}\nand\ntopk($TopResources, avg_over_time(volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}[3h] @ end())) ",
"interval": "",
"legendFormat": "{{cluster}} - {{svm}} - {{volume}}",
"refId": "A"
Expand Down Expand Up @@ -1899,7 +1899,7 @@
{
"datasource": "${DS_PROMETHEUS}",
"exemplar": false,
"expr": "volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}\nand\ntopk($TopResources, avg_over_time(volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}[$__range] @ end()))",
"expr": "volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}\nand\ntopk($TopResources, avg_over_time(volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",svm=~\"$SVM\",volume=~\"$Volume\"}[3h] @ end()))",
"interval": "",
"legendFormat": "{{cluster}} - {{svm}} - {{volume}}",
"refId": "A"
Expand Down
20 changes: 10 additions & 10 deletions grafana/dashboards/cmode/cluster.json
Original file line number Diff line number Diff line change
Expand Up @@ -4041,7 +4041,7 @@
"targets": [
{
"exemplar": false,
"expr": "svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end()))",
"expr": "svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end()))",
"interval": "",
"legendFormat": "{{cluster}} - {{svm}}",
"refId": "A"
Expand Down Expand Up @@ -4216,7 +4216,7 @@
"targets": [
{
"exemplar": false,
"expr": "svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[$__range] @ end()))",
"expr": "svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}\nand\ntopk($TopResources, avg_over_time(svm_vol_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h] @ end()))",
"format": "time_series",
"instant": false,
"interval": "",
Expand Down Expand Up @@ -4410,7 +4410,7 @@
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "query_result(topk($TopResources, avg_over_time(svm_vol_read_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[${__range}])+avg_over_time(svm_vol_write_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[${__range}])))",
"definition": "query_result(topk($TopResources, avg_over_time(svm_vol_read_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h])+avg_over_time(svm_vol_write_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h])))",
"description": null,
"error": null,
"hide": 2,
Expand All @@ -4420,7 +4420,7 @@
"name": "TopSVMs",
"options": [],
"query": {
"query": "query_result(topk($TopResources, avg_over_time(svm_vol_read_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[${__range}])+avg_over_time(svm_vol_write_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[${__range}])))",
"query": "query_result(topk($TopResources, avg_over_time(svm_vol_read_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h])+avg_over_time(svm_vol_write_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\"}[3h])))",
"refId": "StandardVariableQuery"
},
"refresh": 2,
Expand All @@ -4433,7 +4433,7 @@
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[${__range}]))))",
"definition": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[3h]))))",
"description": null,
"error": null,
"hide": 2,
Expand All @@ -4443,7 +4443,7 @@
"name": "TopVolumeAvgLatency",
"options": [],
"query": {
"query": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[${__range}]))))",
"query": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_avg_latency{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[3h]))))",
"refId": "StandardVariableQuery"
},
"refresh": 2,
Expand All @@ -4456,7 +4456,7 @@
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[${__range}]))))",
"definition": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[3h]))))",
"description": null,
"error": null,
"hide": 2,
Expand All @@ -4466,7 +4466,7 @@
"name": "TopVolumeTotalData",
"options": [],
"query": {
"query": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[${__range}]))))",
"query": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_data{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\", node!=\"\"}[3h]))))",
"refId": "StandardVariableQuery"
},
"refresh": 2,
Expand All @@ -4479,7 +4479,7 @@
"allValue": null,
"current": {},
"datasource": "${DS_PROMETHEUS}",
"definition": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",node!=\"\"}[${__range}]))))",
"definition": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",node!=\"\"}[3h]))))",
"description": null,
"error": null,
"hide": 2,
Expand All @@ -4489,7 +4489,7 @@
"name": "TopVolumeTotalOps",
"options": [],
"query": {
"query": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",node!=\"\"}[${__range}]))))",
"query": "query_result(topk($TopResources, sum by (node) (avg_over_time(node_volume_total_ops{datacenter=~\"$Datacenter\",cluster=~\"$Cluster\",node!=\"\"}[3h]))))",
"refId": "StandardVariableQuery"
},
"refresh": 2,
Expand Down
Loading

0 comments on commit 0726c41

Please sign in to comment.