From 3d3bfc52f4d4dfa452fdf9788ac519208020af82 Mon Sep 17 00:00:00 2001 From: glorv Date: Sun, 29 Sep 2024 11:27:42 +0800 Subject: [PATCH 1/5] This is an automated cherry-pick of #56374 Signed-off-by: ti-chi-bot --- .../grafana/tidb_resource_control.json | 1474 +++++++++++++++++ .../grafana/tidb_resource_control.jsonnet | 1246 ++++++++++++++ 2 files changed, 2720 insertions(+) create mode 100644 pkg/metrics/grafana/tidb_resource_control.jsonnet diff --git a/pkg/metrics/grafana/tidb_resource_control.json b/pkg/metrics/grafana/tidb_resource_control.json index 156927f8ee136..73cffb8c8e898 100644 --- a/pkg/metrics/grafana/tidb_resource_control.json +++ b/pkg/metrics/grafana/tidb_resource_control.json @@ -780,6 +780,7 @@ "steppedLine": false, "targets": [ { +<<<<<<< HEAD "exemplar": true, "expr": "sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name)", "format": "time_series", @@ -836,6 +837,1479 @@ "max": null, "min": null, "show": true +======= + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The total CPU time cost by tasks of each priority.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 32, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(rate(tikv_resource_control_priority_task_exec_duration{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tidb_instance\"}[1m])) by (instance, priority)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{priority}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Time by Priority", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The CPU quota limiter applied to each priority.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 33, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "tikv_resource_control_priority_quota_limit{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{priority}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota Limit by Priority", + 
"tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Tasks number per second that triggers quota limiter wait", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 34, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(tikv_resource_control_priority_wait_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tidb_instance\"}[1m])) by (instance, priority)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{priority}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Tasks Wait QPS by Priority", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + 
"label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The wait Duration of tasks that triggers quota limiter wait per priority", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 35, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(tikv_resource_control_priority_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, priority, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{priority}}-P99", + "refId": "A" + }, + { + "expr": "sum(rate(tikv_resource_control_priority_wait_duration_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, priority) / sum(rate(tikv_resource_control_priority_wait_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, priority)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{priority}}-avg", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Priority Task Wait Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": 
"time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Priority Task Control", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": true, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 36, + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The total background task's request unit cost for all resource groups.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 37, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"background\"}[1m])) by (name) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"background\"}[1m])) by (name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{name}}", + "refId": "A" + }, + { + "expr": "sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
type=~\"background\"}[1m])) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"background\"}[1m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "total", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Background Task RU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 10, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The resource(CPU, IO) utilization percentage and limit of background tasks.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 38, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "tikv_resource_control_bg_resource_utilization{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{type}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Background Task Resource 
Utilization", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "percent", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The total background task's io limit for all resource groups.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 39, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "tikv_resource_control_background_quota_limiter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"io\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}-{{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Background Task IO Limit", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + 
"label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The total background task's cpu consumption for all resource groups.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 40, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(tikv_resource_control_background_resource_consumption{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"cpu\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}-{{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Background Task CPU Consumption", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The total background task's cpu limit for all resource groups.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + 
"h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "tikv_resource_control_background_quota_limiter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"cpu\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}-{{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Background Task CPU Limit", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The total background task's io consumption for all resource groups.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 42, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + 
"pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(tikv_resource_control_background_resource_consumption{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"io\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}-{{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Background Task IO Consumption", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The total background task's wait duration for all resource groups.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 43, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(tikv_resource_control_background_task_wait_duration{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$tikv_instance\"}[1m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}-{{instance}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Background Task Total Wait Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Background Task Control", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": true, + "collapsed": true, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 44, + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The Duration of sql execute for different resource group", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 45, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.999, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", 
tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (le,resource_group))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}-P999", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (le,resource_group))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}-P99", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.9, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (le,resource_group))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}-P90", + "refId": "C" + }, + { + "expr": "histogram_quantile(0.8, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (le,resource_group))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}-P80", + "refId": "D" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Query Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": 
"${DS_TEST-CLUSTER}", + "description": "MySQL commands processing numbers per second. See https://dev.mysql.com/doc/internals/en/text-protocol.html and https://dev.mysql.com/doc/internals/en/prepared-statements.html", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 46, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(tidb_server_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (result,resource_group)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}--{{result}}", + "refId": "A" + }, + { + "expr": "sum(rate(tidb_server_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",result=\"OK\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m] offset 1d)) by (result,resource_group)", + "format": "time_series", + "hide": true, + "intervalFactor": 2, + "legendFormat": "{{resource_group}}--yesterday", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Command Per Second", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 10, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + 
"max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "TiDB statement statistics", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 47, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(tidb_executor_statement_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (type,resource_group)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}--{{type}}", + "refId": "A" + }, + { + "expr": "sum(rate(tidb_executor_statement_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (resource_group)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}--total", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "QPS", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { 
+ "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The number of connections to the TiDB server", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 48, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "tidb_server_connections{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}--{{resource_group}}", + "refId": "A" + }, + { + "expr": "tidb_server_connections{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{resource_group}}--total", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Connection Count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": 
"${DS_TEST-CLUSTER}", + "description": "The number of failed queries per minute", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 49, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(increase(tidb_server_execute_error_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\",resource_group=~\"$resource_group\"}[1m])) by (type, instance,resource_group)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{type}}--{{instance}}--{{resource_group}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Failed Query OPM", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] +>>>>>>> 9f11e6f33b2 (metrics: add metrics for priority resource control (#56374)) } ], "yaxis": { diff --git a/pkg/metrics/grafana/tidb_resource_control.jsonnet b/pkg/metrics/grafana/tidb_resource_control.jsonnet new file mode 100644 index 0000000000000..81eac5685fb6e --- /dev/null +++ b/pkg/metrics/grafana/tidb_resource_control.jsonnet @@ -0,0 +1,1246 @@ +// Copyright 2024 PingCAP, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +local grafana = import "grafonnet/grafana.libsonnet"; +local dashboard = grafana.dashboard; +local row = grafana.row; +local graphPanel = grafana.graphPanel; +local tablePanel = grafana.tablePanel; +local prometheus = grafana.prometheus; +local template = grafana.template; +local transformation = grafana.transformation; + +local myNameFlag = "DS_TEST-CLUSTER"; +local myDS = "${" + myNameFlag + "}"; + +// A new dashboard +// Add the template variables +local TiDBResourceControlDash = dashboard.new( + title="Test-Cluster-TiDB-Resource-Control", + editable=true, + graphTooltip="shared_crosshair", + refresh="30s", + time_from="now-1h", +).addInput( + name=myNameFlag, + label="test-cluster", + type="datasource", + pluginId="prometheus", + pluginName="Prometheus", +).addTemplate( + // Default template for tidb-cloud + template.new( + allValues=null, + current=null, + datasource=myDS, + hide="all", + includeAll=false, + label="tidb_cluster", + multi=false, + name="tidb_cluster", + query='label_values(pd_cluster_status{k8s_cluster="$kuberentes"}, tidb_cluster)', + refresh="time", + regex="", + sort=1, + tagValuesQuery="", + ) +).addTemplate( + // Default template for tidb-cloud + template.new( + datasource=myDS, + hide=2, + label="K8s-cluster", + name="k8s_cluster", + query="label_values(pd_cluster_status, k8s_cluster)", + refresh="time", + sort=1, + ) +).addTemplate( + template.new( + allValues=".*", + 
current=null, + datasource=myDS, + hide="", + includeAll=true, + label="TiDB Instance", + multi=false, + name="tidb_instance", + query='label_values(tidb_server_connections{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}, instance)', + refresh="load", + regex="", + sort=1, + tagValuesQuery="", + ) +).addTemplate( + template.new( + allValues=".*", + current=null, + datasource=myDS, + hide="", + includeAll=true, + label="TiKV Instance", + multi=false, + name="tikv_instance", + query='label_values(tikv_engine_size_bytes{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}, instance)', + refresh="load", + regex="", + sort=1, + tagValuesQuery="", + ) +).addTemplate( + template.new( + allValues=".*", + current=null, + datasource=myDS, + hide="", + includeAll=true, + label="Resource Group", + multi=true, + name="resource_group", + query='label_values(tidb_server_connections{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}, resource_group)', + refresh="load", + regex="", + sort=1, + tagValuesQuery="", + ) +); + + +//* ==============Panel (Resource Unit)================== +//* Panel Title: Resource Unit +//* Description: The metrics about request unit(abstract unit) cost for all resource groups. 
+//* Panels: 7 +//* ==============Panel (Resource Unit)================== +local ruRow = row.new(collapse=true, title="Resource Unit"); + +local ConfigPanel = tablePanel.new( + "RU Config", + datasource=myDS, +).addTarget( + prometheus.target( + 'resource_manager_server_group_config{type="priority"}', + legendFormat="{{resource_group}}", + instant="instant", + ) +).addTarget( + prometheus.target( + 'resource_manager_server_group_config{type="ru_capacity"}', + legendFormat="{{resource_group}}", + instant="instant", + ) +).addTarget( + prometheus.target( + 'resource_manager_server_group_config{type="ru_per_sec"}', + legendFormat="{{resource_group}}", + instant="instant", + ) +).addTransformation( + transformation.new("labelsToFields", options={ + valueLabel: "type", + }) +).addTransformation( + transformation.new("organize", options={ + excludeByName: { + Time: true, + __name__: true, + instance: true, + job: true, + }, + indexByName: { + Time: 0, + __name__: 1, + instance: 2, + job: 3, + resource_group: 4, + priority: 5, + ru_per_sec: 6, + ru_capacity: 7, + }, + }) +); + +local RUPanel = graphPanel.new( + title="RU", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The metrics about request unit cost for all resource groups.", + logBase1Y=10, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (resource_group) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (resource_group)', + legendFormat="{{resource_group}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) 
+ sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m]))', + legendFormat="total", + ) +); + +local RUMaxPanel = graphPanel.new( + title="RU (Max Cost During 20s Period)", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The max request unit cost for resource groups during in a period(20s).", + logBase1Y=10, +).addTarget( + prometheus.target( + 'sum(resource_manager_resource_unit_read_request_unit_max_per_sec{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}) by (resource_group)', + legendFormat="{{resource_group}}-read", + ) +).addTarget( + prometheus.target( + 'sum(resource_manager_resource_unit_write_request_unit_max_per_sec{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}) by (resource_group)', + legendFormat="{{resource_group}}-write", + ) +); + +local RUPerQueryPanel = graphPanel.new( + title="RU Per Query", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The avg request unit cost for each query.", + logBase1Y=10, +).addTarget( + prometheus.target( + '(sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (name) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (name)) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', + legendFormat="{{name}}", + ) +).addTarget( + prometheus.target( + '(sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) + 
sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m]))) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', + legendFormat="total", + ) +); + +local RRUPanel = graphPanel.new( + title="RRU", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The read request unit cost for all resource groups.", + logBase1Y=10, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (name)', + legendFormat="{{name}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m]))', + legendFormat="total", + ) +); + +local RRUPerQueryPanel = graphPanel.new( + title="RRU Per Query", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The avg read request unit cost for each query.", + logBase1Y=10, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', + legendFormat="{{name}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", 
tidb_cluster="$tidb_cluster"}[1m]))', + legendFormat="total", + ) +); + +local WRUPanel = graphPanel.new( + title="WRU", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The write request unit cost for all resource groups.", + logBase1Y=10, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (name)', + legendFormat="{{name}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m]))', + legendFormat="total", + ) +); + +local WRUPerQueryPanel = graphPanel.new( + title="WRU Per Query", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The avg write request unit cost for each query.", + logBase1Y=10, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', + legendFormat="{{name}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', + legendFormat="total", + ) +); + + +//* ============== Panel (Resource Details)================== +//* Panel Title: Resource Details +//* Description: The metrics about actual 
resource usage for all resource groups. +//* Panels: 8 +//* ============== Panel (Resource Details)================== + +local resourceRow = row.new(collapse=true, title="Resource Details"); +local KVRequestCountPanel = graphPanel.new( + title="KV Request Count", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The metrics about kv request count for all resource groups.", + logBase1Y=2, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_request_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name, type)', + legendFormat="{{name}}-{{type}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_request_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', + legendFormat="{{name}}-total", + ) +); + +local KVRequestCountPerQueryPanel = graphPanel.new( + title="KV Request Count Per Query", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The avg kv request count for each query.", + logBase1Y=2, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_request_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type="read"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', + legendFormat="{{name}}-read", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_request_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type="write"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', + legendFormat="{{name}}-write", + ) 
+).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_request_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type="read"}[1m])) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', + legendFormat="total-read", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_request_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type="write"}[1m])) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', + legendFormat="total-write", + ) +); + +local BytesReadPanel = graphPanel.new( + title="Bytes Read", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="bytes", + description="The metrics about bytes read for all resource groups.", + logBase1Y=2, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_read_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', + legendFormat="{{name}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_read_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', + legendFormat="total", + ) +); + +local BytesReadPerQueryPanel = graphPanel.new( + title="Bytes Read Per Query", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="bytes", + description="The avg bytes read for each query.", + logBase1Y=2, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_read_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', + 
legendFormat="{{name}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_read_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', + legendFormat="total", + ) +); + +local BytesWrittenPanel = graphPanel.new( + title="Bytes Written", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="bytes", + description="The metrics about bytes written for all resource groups.", + logBase1Y=2, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_write_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', + legendFormat="{{name}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_write_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', + legendFormat="total", + ) +); + +local BytesWrittenPerQueryPanel = graphPanel.new( + title="Bytes Written Per Query", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="bytes", + description="The avg bytes written for each query.", + logBase1Y=2, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_write_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', + legendFormat="{{name}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_write_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', 
+ legendFormat="total", + ) +); + +local KVCPUTimePanel = graphPanel.new( + title="KV CPU Time", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="ms", + description="The metrics about kv cpu time for all resource groups.", + logBase1Y=1, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_kv_cpu_time_ms_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', + legendFormat="{{name}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_kv_cpu_time_ms_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', + legendFormat="total", + ) +); + +local SQLCPUTimePanel = graphPanel.new( + title="SQL CPU Time", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="ms", + description="The metrics about sql cpu time for all resource groups.", + logBase1Y=1, +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_sql_cpu_time_ms_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', + legendFormat="{{name}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_sql_cpu_time_ms_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', + legendFormat="total", + ) +); + +//* ==============Panel (Client)================== +//* Row Title: Client +//* Description: The metrics about resource control client +//* Panels: 7 +//* ==============Panel (Client)================== + +local clientRow = row.new(collapse=true, title="Client"); + +local ActiveResourceGroupPanel = graphPanel.new( + title="Active Resource Groups", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + bars=true, + format="short", + 
description="The metrics about active resource groups.", +).addTarget( + prometheus.target( + 'resource_manager_client_resource_group_status{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}', + legendFormat="{{instance}}-{{name}}", + ) +); + +local TotalKVRequestCountPanel = graphPanel.new( + title="Total KV Request Count", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The metrics about total kv request count.", +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_client_request_success_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance) + sum(rate(resource_manager_client_request_fail{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance)', + legendFormat="{{instance}}-total", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_client_request_success_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name) + sum(rate(resource_manager_client_request_fail{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name)', + legendFormat="{{instance}}-{{name}}", + ) +); + +local FailedKVRequestCountPanel = graphPanel.new( + title="Failed KV Request Count", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The metrics about failed kv request count.", +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_client_request_fail{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance)', + legendFormat="{{instance}}-total", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_client_request_fail{k8s_cluster="$k8s_cluster",
tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name)', + legendFormat="{{instance}}-{{name}}", + ) +); + +local SuccessfulKVRequestWaitDurationPanel = graphPanel.new( + title="Successful KV Request Wait Duration", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The metrics about successful kv request wait duration.", +).addTarget( + prometheus.target( + 'histogram_quantile(0.99, sum(rate(resource_manager_client_request_success_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name, le))', + legendFormat="{{instance}}-{{name}}-99", + ) +).addTarget( + prometheus.target( + 'histogram_quantile(0.9, sum(rate(resource_manager_client_request_success_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name, le))', + legendFormat="{{instance}}-{{name}}-90", + ) +); + +// Successful KV Request Count +local SuccessfulKVRequestCountPanel = graphPanel.new( + title="Successful KV Request Count", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + logBase1Y=2, + format="short", + description="The metrics about successful kv request count.", +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_client_request_success_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance)', + legendFormat="{{instance}}-total", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_client_request_success_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name)', + legendFormat="{{instance}}-{{name}}", + ) +); + +// Token Request Handle Duration +local TokenRequestHandleDurationPanel = 
graphPanel.new( + title="Token Request Handle Duration", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The metrics about token request handle duration.", +).addTarget( + prometheus.target( + 'histogram_quantile(0.99, sum(rate(resource_manager_client_token_request_duration_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, le))', + legendFormat="{{instance}}-99", + ) +).addTarget( + prometheus.target( + 'histogram_quantile(0.999, sum(rate(resource_manager_client_token_request_duration_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, le))', + legendFormat="{{instance}}-999", + ) +); + +// Token Request Count +local TokenRequestCountPanel = graphPanel.new( + title="Token Request Count", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + description="The metrics about token request count.", +).addTarget( + prometheus.target( + 'sum(delta(resource_manager_client_token_request_duration_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance)', + legendFormat="{{instance}}-total", + ) +).addTarget( + prometheus.target( + 'sum(delta(resource_manager_client_token_request_duration_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance", type="success"}[1m])) by (instance)', + legendFormat="{{instance}}-successful", + ) +).addTarget( + prometheus.target( + 'sum(delta(resource_manager_client_token_request_duration_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance", type="fail"}[1m])) by (instance)', + legendFormat="{{instance}}-failed", + ) +).addTarget( + prometheus.target( +
'sum(rate(resource_manager_client_token_request_resource_group{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name)', + legendFormat="{{instance}}-{{name}}", + ) +); + +//* ==============Panel (Runaway)================== +//* Row Title: Runaway +//* Description: The metrics about runaway resource control +//* Panels: 2 +//* ==============Panel (Runaway)================== + +local runawayRow = row.new(collapse=true, title="Runaway"); +// Query Max Duration +local QueryMaxDurationPanel = graphPanel.new( + title="Query Max Duration", + datasource=myDS, + legend_rightSide=true, + legend_avg=true, + legend_max=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + logBase1Y=2, + description="TiDB max durations for different resource group", +).addTarget( + prometheus.target( + 'histogram_quantile(1.0, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (le,resource_group))', + legendFormat="{{resource_group}}", + ) +); + +// Runaway Event +local RunawayEventPanel = graphPanel.new( + title="Runaway Event", + datasource=myDS, + legend_rightSide=true, + legend_avg=true, + legend_max=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + logBase1Y=2, + description="Runaway manager events for different resource group", +).addTarget( + prometheus.target( + 'sum(rate(tidb_server_query_runaway_check{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type="hit"}[5m])) by (resource_group)', + legendFormat="{{resource_group}}-hit", + ) +).addTarget( + prometheus.target( + 'sum(rate(tidb_server_query_runaway_check{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type!="hit"}[5m])) by (resource_group, type, action)', + legendFormat="{{resource_group}}-{{type}}-{{action}}", + ) +); + +//* ==============Panel (Priority Task Control)================== 
+//* Row Title: Priority Task Control +//* Description: The metrics about Priority Tasks Control resource control +//* Panels: 4 +//* ==============Panel (Priority Task Control)================== + +local priorityTaskRow = row.new(collapse=true, title="Priority Task Control"); + +// The CPU time used of each priority +local PriorityTaskCPUPanel = graphPanel.new( + title="CPU Time by Priority", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="µs", + logBase1Y=1, + description="The total CPU time cost by tasks of each priority.", +).addTarget( + prometheus.target( + 'sum(rate(tikv_resource_control_priority_task_exec_duration{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}[1m])) by (instance, priority)', + legendFormat="{{instance}}-{{priority}}", + ) +); + +// The CPU Limiter Quota of each priority +local PriorityTaskQuotaLimitPanel = graphPanel.new( + title="CPU Quota Limit by Priority", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="µs", + logBase1Y=1, + description="The CPU quota limiter applied to each priority.", +).addTarget( + prometheus.target( + 'tikv_resource_control_priority_quota_limit{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}', + legendFormat="{{instance}}-{{priority}}", + ) +); + +// Task QPS that triggers wait +local PriorityTaskWaitQPSPanel = graphPanel.new( + title="Tasks Wait QPS by Priority", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + logBase1Y=1, + description="Tasks number per second that triggers quota limiter wait", +).addTarget( + prometheus.target( + 'sum(rate(tikv_resource_control_priority_wait_duration_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}[1m])) by
(instance, priority)', + legendFormat="{{instance}}-{{priority}}", + ) +); + +// The task wait distribution by priority +local PriorityTaskWaitDurationPanel = graphPanel.new( + title="Priority Task Wait Duration", + datasource=myDS, + legend_rightSide=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="s", + logBase1Y=2, + description="The wait Duration of tasks that triggers quota limiter wait per priority", +).addTarget( + prometheus.target( + 'histogram_quantile(0.99, sum(rate(tikv_resource_control_priority_wait_duration_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}[1m])) by (instance, priority, le))', + legendFormat="{{instance}}-{{priority}}-P99", + ) +).addTarget( + prometheus.target( + 'sum(rate(tikv_resource_control_priority_wait_duration_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}[1m])) by (instance, priority) / sum(rate(tikv_resource_control_priority_wait_duration_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}[1m])) by (instance, priority)', + legendFormat="{{instance}}-{{priority}}-avg", + ) +); + + +//* ==============Panel (Background Task Control)================== +//* Row Title: Background Task Control +//* Description: The metrics about Background Task Control resource control +//* Panels: 7 +//* ==============Panel (Background Task Control)================== + +local backgroundTaskRow = row.new(collapse=true, title="Background Task Control"); + +// Background Tasks' RU +local BackgroundTaskRUPanel = graphPanel.new( + title="Background Task RU", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + logBase1Y=10, + description="The total background task's request unit cost for all resource groups.", 
+).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"background"}[1m])) by (name) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"background"}[1m])) by (name)', + legendFormat="{{name}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"background"}[1m])) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"background"}[1m]))', + legendFormat="total", + ) +); + +// Background Task Resource Utilization +local BackgroundTaskResourceUtilizationPanel = graphPanel.new( + title="Background Task Resource Utilization", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="percent", + logBase1Y=1, + description="The resource(CPU, IO) utilization percentage and limit of background tasks.", +).addTarget( + prometheus.target( + 'tikv_resource_control_bg_resource_utilization{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}', + legendFormat="{{instance}}-{{type}}", + ) +); + +// Background Task CPU Limit +local BackgroundTaskCPULimitPanel = graphPanel.new( + title="Background Task CPU Limit", + datasource=myDS, + legend_rightSide=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="µs", + logBase1Y=1, + description="The total background task's cpu limit for all resource groups.", +).addTarget( + prometheus.target( + 'tikv_resource_control_background_quota_limiter{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance", 
type="cpu"}', + legendFormat="{{resource_group}}-{{instance}}", + ) +); + +// Background Task IO Limit +local BackgroundTaskIOLimitPanel = graphPanel.new( + title="Background Task IO Limit", + datasource=myDS, + legend_rightSide=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="bytes", + logBase1Y=1, + description="The total background task's io limit for all resource groups.", +).addTarget( + prometheus.target( + 'tikv_resource_control_background_quota_limiter{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance", type="io"}', + legendFormat="{{resource_group}}-{{instance}}", + ) +); + +// Background Task CPU Consumption +local BackgroundTaskCPUConsumptionPanel = graphPanel.new( + title="Background Task CPU Consumption", + datasource=myDS, + legend_rightSide=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="µs", + logBase1Y=1, + description="The total background task's cpu consumption for all resource groups.", +).addTarget( + prometheus.target( + 'rate(tikv_resource_control_background_resource_consumption{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance", type="cpu"}[1m])', + legendFormat="{{resource_group}}-{{instance}}", + ) +); + +// Background Task IO Consumption +local BackgroundTaskIOConsumptionPanel = graphPanel.new( + title="Background Task IO Consumption", + datasource=myDS, + legend_rightSide=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="bytes", + logBase1Y=1, + description="The total background task's io consumption for all resource groups.", +).addTarget( + prometheus.target( + 'rate(tikv_resource_control_background_resource_consumption{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance", type="io"}[1m])', + 
legendFormat="{{resource_group}}-{{instance}}", + ) +); + +// Background Task Total Wait Duration +local BackgroundTaskTotalWaitDurationPanel = graphPanel.new( + title="Background Task Total Wait Duration", + datasource=myDS, + legend_rightSide=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="µs", + logBase1Y=1, + description="The total background task's wait duration for all resource groups.", +).addTarget( + prometheus.target( + 'rate(tikv_resource_control_background_task_wait_duration{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}[1m])', + legendFormat="{{resource_group}}-{{instance}}", + ) +); + +//* ==============Panel (Query Sumary)================== +//* Row Title: Query Sumary +//* Description: The metrics about query summary +//* Panels: 5 +//* ==============Panel (Query Sumary)================== + +local querySummaryRow = row.new(collapse=true, title="Query Summary"); + +// Query Duration +local QueryDurationPanel = graphPanel.new( + title="Query Duration", + datasource=myDS, + legend_rightSide=true, + legend_min=true, + legend_max=true, + legend_avg=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="s", + logBase1Y=2, + description="The Duration of sql execute for different resource group", +).addTarget( + prometheus.target( + 'histogram_quantile(0.999, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (le,resource_group))', + legendFormat="{{resource_group}}-P999", + ) +).addTarget( + prometheus.target( + 'histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (le,resource_group))', + 
legendFormat="{{resource_group}}-P99", + ) +).addTarget( + prometheus.target( + 'histogram_quantile(0.9, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (le,resource_group))', + legendFormat="{{resource_group}}-P90", + ) +).addTarget( + prometheus.target( + 'histogram_quantile(0.8, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (le,resource_group))', + legendFormat="{{resource_group}}-P80", + ) +); + +// Command per second +local CommandPerSecondPanel = graphPanel.new( + title="Command Per Second", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + logBase1Y=10, + description="MySQL commands processing numbers per second. See https://dev.mysql.com/doc/internals/en/text-protocol.html and https://dev.mysql.com/doc/internals/en/prepared-statements.html", +).addTarget( + prometheus.target( + 'sum(rate(tidb_server_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (result,resource_group)', + legendFormat="{{resource_group}}--{{result}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(tidb_server_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",result="OK",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m] offset 1d)) by (result,resource_group)', + legendFormat="{{resource_group}}--yesterday", + hide=true, + ) +); + +// QPS +local QPSPanel = graphPanel.new( + title="QPS", + datasource=myDS, + legend_rightSide=true, + legend_avg=true, + legend_max=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + logBase1Y=2, + description="TiDB statement 
statistics", +).addTarget( + prometheus.target( + 'sum(rate(tidb_executor_statement_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (type,resource_group)', + legendFormat="{{resource_group}}--{{type}}", + ) +).addTarget( + prometheus.target( + 'sum(rate(tidb_executor_statement_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (resource_group)', + legendFormat="{{resource_group}}--total", + ) +); + +// Connection Count +local ConnectionCountPanel = graphPanel.new( + title="Connection Count", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + logBase1Y=1, + description="The number of connections to the TiDB server", +).addTarget( + prometheus.target( + 'tidb_server_connections{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}', + legendFormat="{{instance}}--{{resource_group}}", + ) +).addTarget( + prometheus.target( + 'tidb_server_connections{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}', + legendFormat="{{resource_group}}--total", + ) +); + +// Failed Query OPM +local FailedQueryOPMPanel = graphPanel.new( + title="Failed Query OPM", + datasource=myDS, + legend_rightSide=true, + legend_current=true, + legend_max=true, + legend_alignAsTable=true, + legend_values=true, + format="short", + logBase1Y=2, + description="The number of failed queries per minute", +).addTarget( + prometheus.target( + 'sum(increase(tidb_server_execute_error_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance",resource_group=~"$resource_group"}[1m])) by (type, instance,resource_group)', + legendFormat="{{type}}--{{instance}}--{{resource_group}}", + ) +); + +//* 
============== Dashboard =============== +//* Merge together +//* ============== Dashboard =============== + +// Position definition +local panelW = 12; +local panelH = 7; +local rowW = 24; +local rowH = 1; + +local rowPos = { x: 0, y: 0, w: rowW, h: rowH }; +local leftPanelPos = { x: 0, y: 0, w: panelW, h: panelH }; +local rightPanelPos = { x: panelW, y: 0, w: panelW, h: panelH }; +local fullPanelPos = { x: 0, y: 0, w: rowW, h: panelH }; + +TiDBResourceControlDash +.addPanel( + ruRow/* Resource Unit */ + .addPanel(ConfigPanel, gridPos=fullPanelPos) + .addPanel(RUPanel, gridPos=leftPanelPos) + .addPanel(RUMaxPanel, gridPos=rightPanelPos) + .addPanel(RUPerQueryPanel, gridPos=leftPanelPos) + .addPanel(RRUPanel, gridPos=rightPanelPos) + .addPanel(RRUPerQueryPanel, gridPos=leftPanelPos) + .addPanel(WRUPanel, gridPos=rightPanelPos) + .addPanel(WRUPerQueryPanel, gridPos=leftPanelPos) + , + gridPos=rowPos +).addPanel( + resourceRow/* Resource Details */ + .addPanel(KVRequestCountPanel, gridPos=leftPanelPos) + .addPanel(KVRequestCountPerQueryPanel, gridPos=rightPanelPos) + .addPanel(BytesReadPanel, gridPos=leftPanelPos) + .addPanel(BytesReadPerQueryPanel, gridPos=rightPanelPos) + .addPanel(BytesWrittenPanel, gridPos=leftPanelPos) + .addPanel(BytesWrittenPerQueryPanel, gridPos=rightPanelPos) + .addPanel(KVCPUTimePanel, gridPos=leftPanelPos) + .addPanel(SQLCPUTimePanel, gridPos=rightPanelPos) + , + gridPos=rowPos +).addPanel( + clientRow/* Client */ + .addPanel(ActiveResourceGroupPanel, gridPos=fullPanelPos) + .addPanel(TotalKVRequestCountPanel, gridPos=leftPanelPos) + .addPanel(FailedKVRequestCountPanel, gridPos=rightPanelPos) + .addPanel(SuccessfulKVRequestWaitDurationPanel, gridPos=leftPanelPos) + .addPanel(SuccessfulKVRequestCountPanel, gridPos=rightPanelPos) + .addPanel(TokenRequestHandleDurationPanel, gridPos=leftPanelPos) + .addPanel(TokenRequestCountPanel, gridPos=rightPanelPos) + , + gridPos=rowPos +).addPanel( + runawayRow/* Runaway */ + 
.addPanel(QueryMaxDurationPanel, gridPos=leftPanelPos) + .addPanel(RunawayEventPanel, gridPos=rightPanelPos) + , + gridPos=rowPos +).addPanel( + priorityTaskRow /* Priority Task Control */ + .addPanel(PriorityTaskCPUPanel, gridPos=leftPanelPos) + .addPanel(PriorityTaskQuotaLimitPanel, gridPos=rightPanelPos) + .addPanel(PriorityTaskWaitQPSPanel, gridPos=leftPanelPos) + .addPanel(PriorityTaskWaitDurationPanel, gridPos=rightPanelPos) + , + gridPos=rowPos +).addPanel( + backgroundTaskRow/* Background Task Control */ + .addPanel(BackgroundTaskRUPanel, gridPos=leftPanelPos) + .addPanel(BackgroundTaskResourceUtilizationPanel, gridPos=rightPanelPos) + .addPanel(BackgroundTaskIOLimitPanel, gridPos=leftPanelPos) + .addPanel(BackgroundTaskCPUConsumptionPanel, gridPos=rightPanelPos) + .addPanel(BackgroundTaskCPULimitPanel, gridPos=leftPanelPos) + .addPanel(BackgroundTaskIOConsumptionPanel, gridPos=rightPanelPos) + .addPanel(BackgroundTaskTotalWaitDurationPanel, gridPos=leftPanelPos) + , + gridPos=rowPos +).addPanel( + querySummaryRow/* Query Summary */ + .addPanel(QueryDurationPanel, gridPos=leftPanelPos) + .addPanel(CommandPerSecondPanel, gridPos=rightPanelPos) + .addPanel(QPSPanel, gridPos=leftPanelPos) + .addPanel(ConnectionCountPanel, gridPos=rightPanelPos) + .addPanel(FailedQueryOPMPanel, gridPos=leftPanelPos) + , + gridPos=rowPos +) From e0bcd60894eb623a63bc058bd7bc4772138441cd Mon Sep 17 00:00:00 2001 From: glorv Date: Sun, 29 Sep 2024 12:13:33 +0800 Subject: [PATCH 2/5] resolve conflicts --- .../grafana/tidb_resource_control.json | 1850 ++++------------- .../grafana/tidb_resource_control.jsonnet | 1246 ----------- 2 files changed, 373 insertions(+), 2723 deletions(-) delete mode 100644 pkg/metrics/grafana/tidb_resource_control.jsonnet diff --git a/pkg/metrics/grafana/tidb_resource_control.json b/pkg/metrics/grafana/tidb_resource_control.json index 73cffb8c8e898..36ce7cd0b0b0e 100644 --- a/pkg/metrics/grafana/tidb_resource_control.json +++ 
b/pkg/metrics/grafana/tidb_resource_control.json @@ -780,7 +780,6 @@ "steppedLine": false, "targets": [ { -<<<<<<< HEAD "exemplar": true, "expr": "sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"|tp\"}[1m])) by (name)", "format": "time_series", @@ -837,1479 +836,6 @@ "max": null, "min": null, "show": true -======= - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The total CPU time cost by tasks of each priority.", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 32, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(tikv_resource_control_priority_task_exec_duration{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tidb_instance\"}[1m])) by (instance, priority)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{priority}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Time by Priority", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "µs", - "label": null, - 
"logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "µs", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The CPU quota limiter applied to each priority.", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 33, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "tikv_resource_control_priority_quota_limit{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{priority}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "CPU Quota Limit by Priority", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "µs", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "µs", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "Tasks number per second that triggers quota limiter wait", - "fill": 1, - "fillGradient": 0, 
- "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 34, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(tikv_resource_control_priority_wait_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tidb_instance\"}[1m])) by (instance, priority)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{priority}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Tasks Wait QPS by Priority", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The wait Duration of tasks that triggers quota limiter wait per priority", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 35, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - 
"nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.99, sum(rate(tikv_resource_control_priority_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, priority, le))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{priority}}-P99", - "refId": "A" - }, - { - "expr": "sum(rate(tikv_resource_control_priority_wait_duration_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, priority) / sum(rate(tikv_resource_control_priority_wait_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, priority)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{priority}}-avg", - "refId": "B" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Priority Task Wait Duration", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 2, - "max": null, - "min": null, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Priority Task Control", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": true, - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 36, - "panels": [ - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": 
false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The total background task's request unit cost for all resource groups.", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 37, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"background\"}[1m])) by (name) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"background\"}[1m])) by (name)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{name}}", - "refId": "A" - }, - { - "expr": "sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"background\"}[1m])) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", type=~\"background\"}[1m]))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "total", - "refId": "B" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Background Task RU", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 10, - "max": null, - "min": null, - 
"show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The resource(CPU, IO) utilization percentage and limit of background tasks.", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 38, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "tikv_resource_control_bg_resource_utilization{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}-{{type}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Background Task Resource Utilization", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "percent", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "percent", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The total background task's io limit for all resource groups.", - "fill": 1, - 
"fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 39, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "tikv_resource_control_background_quota_limiter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"io\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}-{{instance}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Background Task IO Limit", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The total background task's cpu consumption for all resource groups.", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 40, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": 
"null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(tikv_resource_control_background_resource_consumption{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"cpu\"}[1m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}-{{instance}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Background Task CPU Consumption", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "µs", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "µs", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The total background task's cpu limit for all resource groups.", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 41, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "tikv_resource_control_background_quota_limiter{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", 
instance=~\"$tikv_instance\", type=\"cpu\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}-{{instance}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Background Task CPU Limit", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "µs", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "µs", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The total background task's io consumption for all resource groups.", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 42, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(tikv_resource_control_background_resource_consumption{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\", type=\"io\"}[1m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}-{{instance}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Background Task IO Consumption", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": 
"individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The total background task's wait duration for all resource groups.", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 43, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(tikv_resource_control_background_task_wait_duration{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}-{{instance}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Background Task Total Wait Duration", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "µs", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "µs", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": 
true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": true, - "title": "Background Task Control", - "titleSize": "h6", - "type": "row" - }, - { - "collapse": true, - "collapsed": true, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 44, - "panels": [ - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The Duration of sql execute for different resource group", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 45, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": true, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "histogram_quantile(0.999, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (le,resource_group))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}-P999", - "refId": "A" - }, - { - "expr": "histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (le,resource_group))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}-P99", - "refId": "B" - }, - { - "expr": "histogram_quantile(0.9, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", 
tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (le,resource_group))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}-P90", - "refId": "C" - }, - { - "expr": "histogram_quantile(0.8, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (le,resource_group))", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}-P80", - "refId": "D" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Query Duration", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "s", - "label": null, - "logBase": 2, - "max": null, - "min": null, - "show": true - }, - { - "format": "s", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "MySQL commands processing numbers per second. 
See https://dev.mysql.com/doc/internals/en/text-protocol.html and https://dev.mysql.com/doc/internals/en/prepared-statements.html", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 46, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(tidb_server_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (result,resource_group)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}--{{result}}", - "refId": "A" - }, - { - "expr": "sum(rate(tidb_server_query_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",result=\"OK\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m] offset 1d)) by (result,resource_group)", - "format": "time_series", - "hide": true, - "intervalFactor": 2, - "legendFormat": "{{resource_group}}--yesterday", - "refId": "B" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Command Per Second", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 10, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - 
"bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "TiDB statement statistics", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 47, - "legend": { - "alignAsTable": true, - "avg": true, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(rate(tidb_executor_statement_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (type,resource_group)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}--{{type}}", - "refId": "A" - }, - { - "expr": "sum(rate(tidb_executor_statement_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}[1m])) by (resource_group)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}--total", - "refId": "B" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "QPS", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 2, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - 
"datasource": "${DS_TEST-CLUSTER}", - "description": "The number of connections to the TiDB server", - "fill": 1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 0 - }, - "id": 48, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": false, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "tidb_server_connections{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{instance}}--{{resource_group}}", - "refId": "A" - }, - { - "expr": "tidb_server_connections{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\",instance=~\"$tidb_instance\",resource_group=~\"$resource_group\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{resource_group}}--total", - "refId": "B" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Connection Count", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": { }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_TEST-CLUSTER}", - "description": "The number of failed queries per minute", - "fill": 
1, - "fillGradient": 0, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 0 - }, - "id": 49, - "legend": { - "alignAsTable": true, - "avg": false, - "current": true, - "max": true, - "min": false, - "rightSide": true, - "show": true, - "sideWidth": null, - "total": false, - "values": true - }, - "lines": true, - "linewidth": 1, - "links": [ ], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "repeat": null, - "seriesOverrides": [ ], - "spaceLength": 10, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "sum(increase(tidb_server_execute_error_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\",resource_group=~\"$resource_group\"}[1m])) by (type, instance,resource_group)", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "{{type}}--{{instance}}--{{resource_group}}", - "refId": "A" - } - ], - "thresholds": [ ], - "timeFrom": null, - "timeShift": null, - "title": "Failed Query OPM", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ ] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 2, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] ->>>>>>> 9f11e6f33b2 (metrics: add metrics for priority resource control (#56374)) } ], "yaxis": { @@ -4452,14 +2978,384 @@ "type": "row" }, { + "collapse": true, "collapsed": true, - "datasource": null, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 40 }, + "id": 39, + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The total CPU time cost by tasks of each priority.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 
7, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 40, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(tikv_resource_control_priority_task_exec_duration{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, priority)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{priority}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Time by Priority", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The CPU quota limiter applied to each priority.", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 41, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, +
"pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "tikv_resource_control_priority_quota_limit{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{priority}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "CPU Quota Limit by Priority", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "µs", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "Tasks number per second that triggers quota limiter wait", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 0 + }, + "id": 42, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(tikv_resource_control_priority_wait_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, priority)", + "format":
"time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{priority}}", + "refId": "A" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Tasks Wait QPS by Priority", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "The wait Duration of tasks that triggers quota limiter wait per priority", + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 0 + }, + "id": 43, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(tikv_resource_control_priority_wait_duration_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, priority, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{priority}}-P99", + "refId": "A" + }, + { + "expr": "sum(rate(tikv_resource_control_priority_wait_duration_sum{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, 
priority) / sum(rate(tikv_resource_control_priority_wait_duration_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$tikv_instance\"}[1m])) by (instance, priority)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}-{{priority}}-avg", + "refId": "B" + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeShift": null, + "title": "Priority Task Wait Duration", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 2, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": true, + "title": "Priority Task Control", + "type": "row" + }, + { + "collapsed": true, + "datasource": null, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 48 + }, "id": 23763573632, "panels": [ { @@ -5141,7 +4037,7 @@ "h": 1, "w": 24, "x": 0, - "y": 48 + "y": 56 }, "id": 23763573756, "panels": [ @@ -5910,4 +4806,4 @@ "title": "Test-Cluster-TiDB-Resource-Control", "uid": "000000201", "version": 1 -} +} \ No newline at end of file diff --git a/pkg/metrics/grafana/tidb_resource_control.jsonnet b/pkg/metrics/grafana/tidb_resource_control.jsonnet deleted file mode 100644 index 81eac5685fb6e..0000000000000 --- a/pkg/metrics/grafana/tidb_resource_control.jsonnet +++ /dev/null @@ -1,1246 +0,0 @@ -// Copyright 2024 PingCAP, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -local grafana = import "grafonnet/grafana.libsonnet"; -local dashboard = grafana.dashboard; -local row = grafana.row; -local graphPanel = grafana.graphPanel; -local tablePanel = grafana.tablePanel; -local prometheus = grafana.prometheus; -local template = grafana.template; -local transformation = grafana.transformation; - -local myNameFlag = "DS_TEST-CLUSTER"; -local myDS = "${" + myNameFlag + "}"; - -// A new dashboard -// Add the template variables -local TiDBResourceControlDash = dashboard.new( - title="Test-Cluster-TiDB-Resource-Control", - editable=true, - graphTooltip="shared_crosshair", - refresh="30s", - time_from="now-1h", -).addInput( - name=myNameFlag, - label="test-cluster", - type="datasource", - pluginId="prometheus", - pluginName="Prometheus", -).addTemplate( - // Default template for tidb-cloud - template.new( - allValues=null, - current=null, - datasource=myDS, - hide="all", - includeAll=false, - label="tidb_cluster", - multi=false, - name="tidb_cluster", - query='label_values(pd_cluster_status{k8s_cluster="$kuberentes"}, tidb_cluster)', - refresh="time", - regex="", - sort=1, - tagValuesQuery="", - ) -).addTemplate( - // Default template for tidb-cloud - template.new( - datasource=myDS, - hide=2, - label="K8s-cluster", - name="k8s_cluster", - query="label_values(pd_cluster_status, k8s_cluster)", - refresh="time", - sort=1, - ) -).addTemplate( - template.new( - allValues=".*", - current=null, - datasource=myDS, - hide="", - includeAll=true, - label="TiDB Instance", - multi=false, - name="tidb_instance", - 
query='label_values(tidb_server_connections{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}, instance)', - refresh="load", - regex="", - sort=1, - tagValuesQuery="", - ) -).addTemplate( - template.new( - allValues=".*", - current=null, - datasource=myDS, - hide="", - includeAll=true, - label="TiKV Instance", - multi=false, - name="tikv_instance", - query='label_values(tikv_engine_size_bytes{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}, instance)', - refresh="load", - regex="", - sort=1, - tagValuesQuery="", - ) -).addTemplate( - template.new( - allValues=".*", - current=null, - datasource=myDS, - hide="", - includeAll=true, - label="Resource Group", - multi=true, - name="resource_group", - query='label_values(tidb_server_connections{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}, resource_group)', - refresh="load", - regex="", - sort=1, - tagValuesQuery="", - ) -); - - -//* ==============Panel (Resource Unit)================== -//* Panel Title: Resource Unit -//* Description: The metrics about request unit(abstract unit) cost for all resource groups. 
-//* Panels: 7 -//* ==============Panel (Resource Unit)================== -local ruRow = row.new(collapse=true, title="Resource Unit"); - -local ConfigPanel = tablePanel.new( - "RU Config", - datasource=myDS, -).addTarget( - prometheus.target( - 'resource_manager_server_group_config{type="priority"}', - legendFormat="{{resource_group}}", - instant="instant", - ) -).addTarget( - prometheus.target( - 'resource_manager_server_group_config{type="ru_capacity"}', - legendFormat="{{resource_group}}", - instant="instant", - ) -).addTarget( - prometheus.target( - 'resource_manager_server_group_config{type="ru_per_sec"}', - legendFormat="{{resource_group}}", - instant="instant", - ) -).addTransformation( - transformation.new("labelsToFields", options={ - valueLabel: "type", - }) -).addTransformation( - transformation.new("organize", options={ - excludeByName: { - Time: true, - __name__: true, - instance: true, - job: true, - }, - indexByName: { - Time: 0, - __name__: 1, - instance: 2, - job: 3, - resource_group: 4, - priority: 5, - ru_per_sec: 6, - ru_capacity: 7, - }, - }) -); - -local RUPanel = graphPanel.new( - title="RU", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The metrics about request unit cost for all resource groups.", - logBase1Y=10, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (resource_group) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (resource_group)', - legendFormat="{{resource_group}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) 
+ sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m]))', - legendFormat="total", - ) -); - -local RUMaxPanel = graphPanel.new( - title="RU (Max Cost During 20s Period)", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The max request unit cost for resource groups during in a period(20s).", - logBase1Y=10, -).addTarget( - prometheus.target( - 'sum(resource_manager_resource_unit_read_request_unit_max_per_sec{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}) by (resource_group)', - legendFormat="{{resource_group}}-read", - ) -).addTarget( - prometheus.target( - 'sum(resource_manager_resource_unit_write_request_unit_max_per_sec{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}) by (resource_group)', - legendFormat="{{resource_group}}-write", - ) -); - -local RUPerQueryPanel = graphPanel.new( - title="RU Per Query", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The avg request unit cost for each query.", - logBase1Y=10, -).addTarget( - prometheus.target( - '(sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (name) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (name)) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', - legendFormat="{{name}}", - ) -).addTarget( - prometheus.target( - '(sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) + 
sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m]))) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', - legendFormat="total", - ) -); - -local RRUPanel = graphPanel.new( - title="RRU", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The read request unit cost for all resource groups.", - logBase1Y=10, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (name)', - legendFormat="{{name}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m]))', - legendFormat="total", - ) -); - -local RRUPerQueryPanel = graphPanel.new( - title="RRU Per Query", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The avg read request unit cost for each query.", - logBase1Y=10, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', - legendFormat="{{name}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", 
tidb_cluster="$tidb_cluster"}[1m]))', - legendFormat="total", - ) -); - -local WRUPanel = graphPanel.new( - title="WRU", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The write request unit cost for all resource groups.", - logBase1Y=10, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (name)', - legendFormat="{{name}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m]))', - legendFormat="total", - ) -); - -local WRUPerQueryPanel = graphPanel.new( - title="WRU Per Query", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The avg write request unit cost for each query.", - logBase1Y=10, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', - legendFormat="{{name}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"|tp"}[1m])) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', - legendFormat="total", - ) -); - - -//* ============== Panel (Resource Details)================== -//* Panel Title: Resource Details -//* Description: The metrics about actual 
resource usage for all resource groups. -//* Panels: 8 -//* ============== Panel (Resource Details)================== - -local resourceRow = row.new(collapse=true, title="Resource Details"); -local KVRequestCountPanel = graphPanel.new( - title="KV Request Count", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The metrics about kv request count for all resource groups.", - logBase1Y=2, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_request_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name, type)', - legendFormat="{{name}}-{{type}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_request_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', - legendFormat="{{name}}-total", - ) -); - -local KVRequestCountPerQueryPanel = graphPanel.new( - title="KV Request Count Per Query", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The avg kv request count for each query.", - logBase1Y=2, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_request_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type="read"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', - legendFormat="{{name}}-read", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_request_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type="write"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', - legendFormat="{{name}}-write", - ) 
-).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_request_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type="read"}[1m])) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', - legendFormat="total-read", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_request_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type="write"}[1m])) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', - legendFormat="total-write", - ) -); - -local BytesReadPanel = graphPanel.new( - title="Bytes Read", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="bytes", - description="The metrics about bytes read for all resource groups.", - logBase1Y=2, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_read_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', - legendFormat="{{name}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_read_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', - legendFormat="total", - ) -); - -local BytesReadPerQueryPanel = graphPanel.new( - title="Bytes Read Per Query", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="bytes", - description="The avg bytes read for each query.", - logBase1Y=2, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_read_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', - 
legendFormat="{{name}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_read_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', - legendFormat="total", - ) -); - -local BytesWrittenPanel = graphPanel.new( - title="Bytes Written", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="bytes", - description="The metrics about bytes written for all resource groups.", - logBase1Y=2, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_write_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', - legendFormat="{{name}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_write_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', - legendFormat="total", - ) -); - -local BytesWrittenPerQueryPanel = graphPanel.new( - title="Bytes Written Per Query", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="bytes", - description="The avg bytes written for each query.", - logBase1Y=2, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_write_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', - legendFormat="{{name}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_write_byte_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) / sum(rate(tidb_session_resource_group_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', 
- legendFormat="total", - ) -); - -local KVCPUTimePanel = graphPanel.new( - title="KV CPU Time", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="ms", - description="The metrics about kv cpu time for all resource groups.", - logBase1Y=1, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_kv_cpu_time_ms_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', - legendFormat="{{name}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_kv_cpu_time_ms_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', - legendFormat="total", - ) -); - -local SQLCPUTimePanel = graphPanel.new( - title="SQL CPU Time", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="ms", - description="The metrics about sql cpu time for all resource groups.", - logBase1Y=1, -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_sql_cpu_time_ms_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (name)', - legendFormat="{{name}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_sql_cpu_time_ms_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m]))', - legendFormat="total", - ) -); - -//* ==============Panel (Client)================== -//* Row Title: Client -//* Description: The metrics about resource control client -//* Panels: 7 -//* ==============Panel (Client)================== - -local clientRow = row.new(collapse=true, title="Client"); - -local ActiveResourceGroupPanel = graphPanel.new( - title="Active Resource Groups", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - bars=true, - format="short", - 
description="The metrics about active resource groups.", -).addTarget( - prometheus.target( - 'resource_manager_client_resource_group_status{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}', - legendFormat="{{instance}}-{{name}}", - ) -); - -local TotalKVRequestCountPanel = graphPanel.new( - title="Total KV Request Count", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The metrics about total kv request count.", -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_client_request_success_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance) + sum(rate(resource_manager_client_request_fail{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance)', - legendFormat="{{instance}}-total", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_client_request_success_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name) + sum(rate(resource_manager_client_request_fail{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name)', - legendFormat="total", - ) -); - -local FailedKVRequestCountPanel = graphPanel.new( - title="Failed KV Request Count", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The metrics about failed kv request count.", -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_client_request_fail{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance)', - legendFormat="{{instance}}-total", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_client_request_fail{k8s_cluster="$k8s_cluster", 
tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name)', - legendFormat="{{instance}}-{{name}}", - ) -); - -local SuccessfulKVRequestWaitDurationPanel = graphPanel.new( - title="Successful KV Request Wait Duration", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The metrics about successful kv request wait duration.", -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(resource_manager_client_request_success_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name, le))', - legendFormat="{{instance}}-{{name}}-99", - ) -).addTarget( - prometheus.target( - 'histogram_quantile(0.9, sum(rate(resource_manager_client_request_success_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name, le))', - legendFormat="{{instance}}-{{name}}-90", - ) -); - -// Successful KV Request Count -local SuccessfulKVRequestCountPanel = graphPanel.new( - title="Successful KV Request Count", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - logBase1Y=2, - format="short", - description="The metrics about successful kv request count.", -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_client_request_success_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance)', - legendFormat="{{instance}}-total", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_client_request_success_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name)', - legendFormat="{{instance}}-{{name}}", - ) -); - -// Token Request Handle Duration -local TokenRequestHandleDurationPanel = 
graphPanel.new( - title="Token Request Handle Duration", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The metrics about token request handle duration.", -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(resource_manager_client_token_request_duration_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, le))', - legendFormat="{{instance}}-{{name}}-99", - ) -).addTarget( - prometheus.target( - 'histogram_quantile(0.999, sum(rate(resource_manager_client_token_request_duration_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, le))', - legendFormat="{{instance}}-{{name}}-999", - ) -); - -// Token Request Count -local TokenRequestCountPanel = graphPanel.new( - title="Token Request Count", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - description="The metrics about token request count.", -).addTarget( - prometheus.target( - 'sum(delta(resource_manager_client_token_request_duration_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance)', - legendFormat="{{instance}}-total", - ) -).addTarget( - prometheus.target( - 'sum(delta(resource_manager_client_token_request_duration_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance", type="success"}[1m])) by (instance)', - legendFormat="{{instance}}-successful", - ) -).addTarget( - prometheus.target( - 'sum(delta(resource_manager_client_token_request_duration_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance", type="fail"}[1m])) by (instance)', - legendFormat="{{instance}}-failed", - ) -).addTarget( - prometheus.target( - 
'sum(rate(resource_manager_client_token_request_resource_group{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, name)', - legendFormat="{{instance}}-{{name}}", - ) -); - -//* ==============Panel (Runaway)================== -//* Row Title: Runaway -//* Description: The metrics about runaway resource control -//* Panels: 2 -//* ==============Panel (Runaway)================== - -local runawayRow = row.new(collapse=true, title="Runaway"); -// Query Max Duration -local QueryMaxDurationPanel = graphPanel.new( - title="Query Max Duration", - datasource=myDS, - legend_rightSide=true, - legend_avg=true, - legend_max=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - logBase1Y=2, - description="TiDB max durations for different resource group", -).addTarget( - prometheus.target( - 'histogram_quantile(1.0, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster"}[1m])) by (le,resource_group))', - legendFormat="{{resource_group}}", - ) -); - -// Runaway Event -local RunawayEventPanel = graphPanel.new( - title="Runaway Event", - datasource=myDS, - legend_rightSide=true, - legend_avg=true, - legend_max=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - logBase1Y=2, - description="Runaway manager events for different resource group", -).addTarget( - prometheus.target( - 'sum(rate(tidb_server_query_runaway_check{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type="hit"}[5m])) by (resource_group)', - legendFormat="{{resource_group}}-hit", - ) -).addTarget( - prometheus.target( - 'sum(rate(tidb_server_query_runaway_check{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type!="hit"}[5m])) by (resource_group, type, action)', - legendFormat="{{resource_group}}-{{type}}-{{action}}", - ) -); - -//* ==============Panel (Priority Task Control)================== 
-//* Row Title: Priority Task Control -//* Description: The metrics about Priority Tasks Control resource control -//* Panels: 4 -//* ==============Panel (Background Task Control)================== - -local priorityTaskRow = row.new(collapse=true, title="Priority Task Control"); - -// The CPU time used of each priority -local PriorityTaskCPUPanel = graphPanel.new( - title="CPU Time by Priority", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="µs", - logBase1Y=1, - description="The total CPU time cost by tasks of each priority.", -).addTarget( - prometheus.target( - 'sum(rate(tikv_resource_control_priority_task_exec_duration{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by (instance, priority)', - legendFormat="{{instance}}-{{priority}}", - ) -); - -// The CPU Limiter Quota of each priority -local PriorityTaskQuotaLimitPanel = graphPanel.new( - title="CPU Quota Limit by Priority", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="µs", - logBase1Y=1, - description="The CPU quota limiter applied to each priority.", -).addTarget( - prometheus.target( - 'tikv_resource_control_priority_quota_limit{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}', - legendFormat="{{instance}}-{{priority}}", - ) -); - -// Task QPS that triggers wait -local PriorityTaskWaitQPSPanel = graphPanel.new( - title="Tasks Wait QPS by Priority", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - logBase1Y=1, - description="Tasks number per second that triggers quota limiter wait", -).addTarget( - prometheus.target( - 'sum(rate(tikv_resource_control_priority_wait_duration_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tidb_instance"}[1m])) by 
(instance, priority)', - legendFormat="{{instance}}-{{priority}}", - ) -); - -// The task wait distribution by priority -local PriorityTaskWaitDurationPanel = graphPanel.new( - title="Priority Task Wait Duration", - datasource=myDS, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="s", - logBase1Y=2, - description="The wait Duration of tasks that triggers quota limiter wait per priority", -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(tikv_resource_control_priority_wait_duration_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}[1m])) by (instance, priority, le))', - legendFormat="{{instance}}-{{priority}}-P99", - ) -).addTarget( - prometheus.target( - 'sum(rate(tikv_resource_control_priority_wait_duration_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}[1m])) by (instance, priority) / sum(rate(tikv_resource_control_priority_wait_duration_count{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}[1m])) by (instance, priority)', - legendFormat="{{instance}}-{{priority}}-avg", - ) -); - - -//* ==============Panel (Background Task Control)================== -//* Row Title: Background Task Control -//* Description: The metrics about Background Task Control resource control -//* Panels: 7 -//* ==============Panel (Background Task Control)================== - -local backgroundTaskRow = row.new(collapse=true, title="Background Task Control"); - -// Background Tasks' RU -local BackgroundTaskRUPanel = graphPanel.new( - title="Background Task RU", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - logBase1Y=10, - description="The total background task's request unit cost for all resource groups.", 
-).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"background"}[1m])) by (name) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"background"}[1m])) by (name)', - legendFormat="{{name}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(resource_manager_resource_unit_read_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"background"}[1m])) + sum(rate(resource_manager_resource_unit_write_request_unit_sum{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", type=~"background"}[1m]))', - legendFormat="total", - ) -); - -// Background Task Resource Utilization -local BackgroundTaskResourceUtilizationPanel = graphPanel.new( - title="Background Task Resource Utilization", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="percent", - logBase1Y=1, - description="The resource(CPU, IO) utilization percentage and limit of background tasks.", -).addTarget( - prometheus.target( - 'tikv_resource_control_bg_resource_utilization{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}', - legendFormat="{{instance}}-{{type}}", - ) -); - -// Background Task CPU Limit -local BackgroundTaskCPULimitPanel = graphPanel.new( - title="Background Task CPU Limit", - datasource=myDS, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="µs", - logBase1Y=1, - description="The total background task's cpu limit for all resource groups.", -).addTarget( - prometheus.target( - 'tikv_resource_control_background_quota_limiter{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance", 
type="cpu"}', - legendFormat="{{resource_group}}-{{instance}}", - ) -); - -// Background Task IO Limit -local BackgroundTaskIOLimitPanel = graphPanel.new( - title="Background Task IO Limit", - datasource=myDS, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="bytes", - logBase1Y=1, - description="The total background task's io limit for all resource groups.", -).addTarget( - prometheus.target( - 'tikv_resource_control_background_quota_limiter{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance", type="io"}', - legendFormat="{{resource_group}}-{{instance}}", - ) -); - -// Background Task CPU Consumption -local BackgroundTaskCPUConsumptionPanel = graphPanel.new( - title="Background Task CPU Consumption", - datasource=myDS, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="µs", - logBase1Y=1, - description="The total background task's cpu consumption for all resource groups.", -).addTarget( - prometheus.target( - 'rate(tikv_resource_control_background_resource_consumption{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance", type="cpu"}[1m])', - legendFormat="{{resource_group}}-{{instance}}", - ) -); - -// Background Task IO Consumption -local BackgroundTaskIOConsumptionPanel = graphPanel.new( - title="Background Task IO Consumption", - datasource=myDS, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="bytes", - logBase1Y=1, - description="The total background task's io consumption for all resource groups.", -).addTarget( - prometheus.target( - 'rate(tikv_resource_control_background_resource_consumption{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance", type="io"}[1m])', - 
legendFormat="{{resource_group}}-{{instance}}", - ) -); - -// Background Task Total Wait Duration -local BackgroundTaskTotalWaitDurationPanel = graphPanel.new( - title="Background Task Total Wait Duration", - datasource=myDS, - legend_rightSide=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="µs", - logBase1Y=1, - description="The total background task's wait duration for all resource groups.", -).addTarget( - prometheus.target( - 'rate(tikv_resource_control_background_task_wait_duration{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$tikv_instance"}[1m])', - legendFormat="{{resource_group}}-{{instance}}", - ) -); - -//* ==============Panel (Query Sumary)================== -//* Row Title: Query Sumary -//* Description: The metrics about query summary -//* Panels: 5 -//* ==============Panel (Query Sumary)================== - -local querySummaryRow = row.new(collapse=true, title="Query Summary"); - -// Query Duration -local QueryDurationPanel = graphPanel.new( - title="Query Duration", - datasource=myDS, - legend_rightSide=true, - legend_min=true, - legend_max=true, - legend_avg=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="s", - logBase1Y=2, - description="The Duration of sql execute for different resource group", -).addTarget( - prometheus.target( - 'histogram_quantile(0.999, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (le,resource_group))', - legendFormat="{{resource_group}}-P999", - ) -).addTarget( - prometheus.target( - 'histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (le,resource_group))', - 
legendFormat="{{resource_group}}-P99", - ) -).addTarget( - prometheus.target( - 'histogram_quantile(0.9, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (le,resource_group))', - legendFormat="{{resource_group}}-P90", - ) -).addTarget( - prometheus.target( - 'histogram_quantile(0.8, sum(rate(tidb_server_handle_query_duration_seconds_bucket{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (le,resource_group))', - legendFormat="{{resource_group}}-P80", - ) -); - -// Command per second -local CommandPerSecondPanel = graphPanel.new( - title="Command Per Second", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - logBase1Y=10, - description="MySQL commands processing numbers per second. See https://dev.mysql.com/doc/internals/en/text-protocol.html and https://dev.mysql.com/doc/internals/en/prepared-statements.html", -).addTarget( - prometheus.target( - 'sum(rate(tidb_server_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (result,resource_group)', - legendFormat="{{resource_group}}--{{result}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(tidb_server_query_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",result="OK",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m] offset 1d)) by (result,resource_group)', - legendFormat="{{resource_group}}--yesterday", - hide=true, - ) -); - -// QPS -local QPSPanel = graphPanel.new( - title="QPS", - datasource=myDS, - legend_rightSide=true, - legend_avg=true, - legend_max=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - logBase1Y=2, - description="TiDB statement 
statistics", -).addTarget( - prometheus.target( - 'sum(rate(tidb_executor_statement_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (type,resource_group)', - legendFormat="{{resource_group}}--{{type}}", - ) -).addTarget( - prometheus.target( - 'sum(rate(tidb_executor_statement_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}[1m])) by (resource_group)', - legendFormat="{{resource_group}}--total", - ) -); - -// Connection Count -local ConnectionCountPanel = graphPanel.new( - title="Connection Count", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - logBase1Y=1, - description="The number of connections to the TiDB server", -).addTarget( - prometheus.target( - 'tidb_server_connections{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}', - legendFormat="{{instance}}--{{resource_group}}", - ) -).addTarget( - prometheus.target( - 'tidb_server_connections{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster",instance=~"$tidb_instance",resource_group=~"$resource_group"}', - legendFormat="{{resource_group}}--total", - ) -); - -// Failed Query OPM -local FailedQueryOPMPanel = graphPanel.new( - title="Failed Query OPM", - datasource=myDS, - legend_rightSide=true, - legend_current=true, - legend_max=true, - legend_alignAsTable=true, - legend_values=true, - format="short", - logBase1Y=2, - description="The number of failed queries per minute", -).addTarget( - prometheus.target( - 'sum(increase(tidb_server_execute_error_total{k8s_cluster="$k8s_cluster", tidb_cluster="$tidb_cluster", instance=~"$instance",resource_group=~"$resource_group"}[1m])) by (type, instance,resource_group)', - legendFormat="{{type}}--{{instance}}--{{resource_group}}", - ) -); - -//* 
============== Dashboard =============== -//* Merge together -//* ============== Dashboard =============== - -// Position definition -local panelW = 12; -local panelH = 7; -local rowW = 24; -local rowH = 1; - -local rowPos = { x: 0, y: 0, w: rowW, h: rowH }; -local leftPanelPos = { x: 0, y: 0, w: panelW, h: panelH }; -local rightPanelPos = { x: panelW, y: 0, w: panelW, h: panelH }; -local fullPanelPos = { x: 0, y: 0, w: rowW, h: panelH }; - -TiDBResourceControlDash -.addPanel( - ruRow/* Resource Unit */ - .addPanel(ConfigPanel, gridPos=fullPanelPos) - .addPanel(RUPanel, gridPos=leftPanelPos) - .addPanel(RUMaxPanel, gridPos=rightPanelPos) - .addPanel(RUPerQueryPanel, gridPos=leftPanelPos) - .addPanel(RRUPanel, gridPos=rightPanelPos) - .addPanel(RRUPerQueryPanel, gridPos=leftPanelPos) - .addPanel(WRUPanel, gridPos=rightPanelPos) - .addPanel(WRUPerQueryPanel, gridPos=leftPanelPos) - , - gridPos=rowPos -).addPanel( - resourceRow/* Resource Details */ - .addPanel(KVRequestCountPanel, gridPos=leftPanelPos) - .addPanel(KVRequestCountPerQueryPanel, gridPos=rightPanelPos) - .addPanel(BytesReadPanel, gridPos=leftPanelPos) - .addPanel(BytesReadPerQueryPanel, gridPos=rightPanelPos) - .addPanel(BytesWrittenPanel, gridPos=leftPanelPos) - .addPanel(BytesWrittenPerQueryPanel, gridPos=rightPanelPos) - .addPanel(KVCPUTimePanel, gridPos=leftPanelPos) - .addPanel(SQLCPUTimePanel, gridPos=rightPanelPos) - , - gridPos=rowPos -).addPanel( - clientRow/* Client */ - .addPanel(ActiveResourceGroupPanel, gridPos=fullPanelPos) - .addPanel(TotalKVRequestCountPanel, gridPos=leftPanelPos) - .addPanel(FailedKVRequestCountPanel, gridPos=rightPanelPos) - .addPanel(SuccessfulKVRequestWaitDurationPanel, gridPos=leftPanelPos) - .addPanel(SuccessfulKVRequestCountPanel, gridPos=rightPanelPos) - .addPanel(TokenRequestHandleDurationPanel, gridPos=leftPanelPos) - .addPanel(TokenRequestCountPanel, gridPos=rightPanelPos) - , - gridPos=rowPos -).addPanel( - runawayRow/* Runaway */ - 
.addPanel(QueryMaxDurationPanel, gridPos=leftPanelPos) - .addPanel(RunawayEventPanel, gridPos=rightPanelPos) - , - gridPos=rowPos -).addPanel( - priorityTaskRow /* Priority Task Control */ - .addPanel(PriorityTaskCPUPanel, gridPos=leftPanelPos) - .addPanel(PriorityTaskQuotaLimitPanel, gridPos=rightPanelPos) - .addPanel(PriorityTaskWaitQPSPanel, gridPos=leftPanelPos) - .addPanel(PriorityTaskWaitDurationPanel, gridPos=rightPanelPos) - , - gridPos=rowPos -).addPanel( - backgroundTaskRow/* Background Task Control */ - .addPanel(BackgroundTaskRUPanel, gridPos=leftPanelPos) - .addPanel(BackgroundTaskResourceUtilizationPanel, gridPos=rightPanelPos) - .addPanel(BackgroundTaskIOLimitPanel, gridPos=leftPanelPos) - .addPanel(BackgroundTaskCPUConsumptionPanel, gridPos=rightPanelPos) - .addPanel(BackgroundTaskCPULimitPanel, gridPos=leftPanelPos) - .addPanel(BackgroundTaskIOConsumptionPanel, gridPos=rightPanelPos) - .addPanel(BackgroundTaskTotalWaitDurationPanel, gridPos=leftPanelPos) - , - gridPos=rowPos -).addPanel( - querySummaryRow/* Query Summary */ - .addPanel(QueryDurationPanel, gridPos=leftPanelPos) - .addPanel(CommandPerSecondPanel, gridPos=rightPanelPos) - .addPanel(QPSPanel, gridPos=leftPanelPos) - .addPanel(ConnectionCountPanel, gridPos=rightPanelPos) - .addPanel(FailedQueryOPMPanel, gridPos=leftPanelPos) - , - gridPos=rowPos -) From 8ae553ecc5770ec08c74dfe539c2c8ec98b5a837 Mon Sep 17 00:00:00 2001 From: glorv Date: Sun, 29 Sep 2024 12:17:13 +0800 Subject: [PATCH 3/5] fix --- pkg/metrics/grafana/tidb_resource_control.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pkg/metrics/grafana/tidb_resource_control.json b/pkg/metrics/grafana/tidb_resource_control.json index 36ce7cd0b0b0e..fde485013a9a2 100644 --- a/pkg/metrics/grafana/tidb_resource_control.json +++ b/pkg/metrics/grafana/tidb_resource_control.json @@ -2978,7 +2978,6 @@ "type": "row" }, { - "collapse": true, "collapsed": true, "gridPos": { "h": 1, @@ -4806,4 +4805,4 @@ "title": 
"Test-Cluster-TiDB-Resource-Control", "uid": "000000201", "version": 1 -} \ No newline at end of file +} From 8a6995187ea095967c0766d74b9d1157805cf718 Mon Sep 17 00:00:00 2001 From: glorv Date: Sun, 29 Sep 2024 12:18:35 +0800 Subject: [PATCH 4/5] fix --- pkg/metrics/grafana/tidb_resource_control.json | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/metrics/grafana/tidb_resource_control.json b/pkg/metrics/grafana/tidb_resource_control.json index fde485013a9a2..f0739b7e1206f 100644 --- a/pkg/metrics/grafana/tidb_resource_control.json +++ b/pkg/metrics/grafana/tidb_resource_control.json @@ -2979,6 +2979,7 @@ }, { "collapsed": true, + "datasource": null, "gridPos": { "h": 1, "w": 24, From d56d2e91f55eea42f3e96907a8f66d5516a48453 Mon Sep 17 00:00:00 2001 From: glorv Date: Mon, 30 Sep 2024 11:43:12 +0800 Subject: [PATCH 5/5] fix duplicate id --- pkg/metrics/grafana/tidb_resource_control.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/metrics/grafana/tidb_resource_control.json b/pkg/metrics/grafana/tidb_resource_control.json index f0739b7e1206f..0e765c73c1240 100644 --- a/pkg/metrics/grafana/tidb_resource_control.json +++ b/pkg/metrics/grafana/tidb_resource_control.json @@ -3003,7 +3003,7 @@ "x": 0, "y": 0 }, - "id": 40, + "id": 43, "legend": { "alignAsTable": true, "avg": false, @@ -3261,7 +3261,7 @@ "x": 12, "y": 0 }, - "id": 43, + "id": 44, "legend": { "alignAsTable": true, "avg": true,