From a1bed8cedcafd26a00c4d01bab3e0962230f936c Mon Sep 17 00:00:00 2001 From: Chao Wang Date: Wed, 13 Apr 2022 15:28:11 +0800 Subject: [PATCH 01/11] *: Add grafana for PD http api request duration --- metrics/grafana/tidb.json | 123 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) diff --git a/metrics/grafana/tidb.json b/metrics/grafana/tidb.json index c0f3f4b771ec7..eb5babb5af98f 100644 --- a/metrics/grafana/tidb.json +++ b/metrics/grafana/tidb.json @@ -8802,6 +8802,129 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "pd http request durations by type within 99.9 percent buckets", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 8, + "x": 16, + "y": 23 + }, + "hiddenSeries": false, + "id": 246, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.10", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(tidb_server_pd_api_execution_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le, type))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "999-{{type}}", + "refId": "A", + "step": 10 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, 
sum(rate(tidb_server_pd_api_execution_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le, type))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "99-{{type}}", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.90, sum(rate(tidb_server_pd_api_execution_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le, type))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "90-{{type}}", + "refId": "C" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "PD HTTP Request Duration", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "repeat": null, From 42071b4b8eece3431931c9daf5a83e0fafc7b8fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=8E=8B=E8=B6=85?= Date: Wed, 13 Apr 2022 15:37:23 +0800 Subject: [PATCH 02/11] Update metrics/grafana/tidb.json --- metrics/grafana/tidb.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/grafana/tidb.json b/metrics/grafana/tidb.json index eb5babb5af98f..f71f2e7982ab5 100644 --- a/metrics/grafana/tidb.json +++ b/metrics/grafana/tidb.json @@ -8809,7 +8809,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "pd http request durations by type within 99.9 percent buckets", + "description": "pd http request durations", "editable": true, "error": false, "fieldConfig": { From 
8c11366ee07a7b152eae3a5a177681052e4bced0 Mon Sep 17 00:00:00 2001 From: Chao Wang Date: Wed, 13 Apr 2022 15:54:06 +0800 Subject: [PATCH 03/11] modify --- metrics/grafana/tidb.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/grafana/tidb.json b/metrics/grafana/tidb.json index f71f2e7982ab5..25973564fcb3d 100644 --- a/metrics/grafana/tidb.json +++ b/metrics/grafana/tidb.json @@ -8809,7 +8809,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "pd http request durations", + "description": "The duration of a client sending HTTP request to PD util received the response.", "editable": true, "error": false, "fieldConfig": { From e5d426e1c648b6d55f26802cf32f412fc01dfd43 Mon Sep 17 00:00:00 2001 From: Chao Wang Date: Wed, 13 Apr 2022 15:55:05 +0800 Subject: [PATCH 04/11] update --- metrics/grafana/tidb.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metrics/grafana/tidb.json b/metrics/grafana/tidb.json index 25973564fcb3d..699178d86901a 100644 --- a/metrics/grafana/tidb.json +++ b/metrics/grafana/tidb.json @@ -8809,7 +8809,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "The duration of a client sending HTTP request to PD util received the response.", + "description": "The duration of a client sending one HTTP request to PD until it receives the response.", "editable": true, "error": false, "fieldConfig": { From eadf0951441eb7a8d72dbc559d181d7285ec29c4 Mon Sep 17 00:00:00 2001 From: Chao Wang Date: Wed, 13 Apr 2022 17:51:23 +0800 Subject: [PATCH 05/11] *: Add more information to pd http api request --- domain/infosync/info.go | 9 +- domain/infosync/label_manager.go | 8 +- domain/infosync/placement_manager.go | 6 +- domain/infosync/region.go | 2 +- domain/infosync/tiflash_manager.go | 12 +- metrics/grafana/tidb.json | 271 ++++++++++++++++++++++++++- metrics/server.go | 8 + metrics/session.go | 1 + store/helper/helper.go | 20 +- 9 files
changed, 308 insertions(+), 29 deletions(-) diff --git a/domain/infosync/info.go b/domain/infosync/info.go index a1c1695886cfb..baae4cc3cab19 100644 --- a/domain/infosync/info.go +++ b/domain/infosync/info.go @@ -368,7 +368,7 @@ func GetTiFlashTableSyncProgress(ctx context.Context) (map[int64]float64, error) return progressMap, nil } -func doRequest(ctx context.Context, addrs []string, route, method string, body io.Reader) ([]byte, error) { +func doRequest(ctx context.Context, apiName string, addrs []string, route, method string, body io.Reader) ([]byte, error) { var err error var req *http.Request var res *http.Response @@ -383,8 +383,9 @@ func doRequest(ctx context.Context, addrs []string, route, method string, body i } start := time.Now() res, err = doRequestWithFailpoint(req) - metrics.PDApiExecutionHistogram.WithLabelValues("placement").Observe(time.Since(start).Seconds()) if err == nil { + metrics.PDApiExecutionHistogram.WithLabelValues(apiName).Observe(time.Since(start).Seconds()) + metrics.PDApiRequestCounter.WithLabelValues(apiName, res.Status).Inc() bodyBytes, err := io.ReadAll(res.Body) if err != nil { terror.Log(res.Body.Close()) @@ -407,8 +408,10 @@ func doRequest(ctx context.Context, addrs []string, route, method string, body i terror.Log(res.Body.Close()) return bodyBytes, err } - logutil.BgLogger().Warn("fail to doRequest, retry next address", + metrics.PDApiRequestCounter.WithLabelValues(apiName, "network error").Inc() + logutil.BgLogger().Warn("fail to doRequest", zap.Error(err), + zap.Bool("retry next address", idx == len(addrs)-1), zap.String("method", method), zap.String("hosts", addr), zap.String("url", url), diff --git a/domain/infosync/label_manager.go b/domain/infosync/label_manager.go index c6a3ef98da268..b8bf8940caf74 100644 --- a/domain/infosync/label_manager.go +++ b/domain/infosync/label_manager.go @@ -45,7 +45,7 @@ func (lm *PDLabelManager) PutLabelRule(ctx context.Context, rule *label.Rule) er if err != nil { return err } - _, err = 
doRequest(ctx, lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rule"), "POST", bytes.NewReader(r)) + _, err = doRequest(ctx, "put-label-rule", lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rule"), "POST", bytes.NewReader(r)) return err } @@ -56,14 +56,14 @@ func (lm *PDLabelManager) UpdateLabelRules(ctx context.Context, patch *label.Rul return err } - _, err = doRequest(ctx, lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rules"), "PATCH", bytes.NewReader(r)) + _, err = doRequest(ctx, "update-label-rules", lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rules"), "PATCH", bytes.NewReader(r)) return err } // GetAllLabelRules implements GetAllLabelRules func (lm *PDLabelManager) GetAllLabelRules(ctx context.Context) ([]*label.Rule, error) { var rules []*label.Rule - res, err := doRequest(ctx, lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rules"), "GET", nil) + res, err := doRequest(ctx, "get-all-label-rules", lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rules"), "GET", nil) if err == nil && res != nil { err = json.Unmarshal(res, &rules) @@ -79,7 +79,7 @@ func (lm *PDLabelManager) GetLabelRules(ctx context.Context, ruleIDs []string) ( } rules := []*label.Rule{} - res, err := doRequest(ctx, lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rules", "ids"), "GET", bytes.NewReader(ids)) + res, err := doRequest(ctx, "get-label-rules", lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rules", "ids"), "GET", bytes.NewReader(ids)) if err == nil && res != nil { err = json.Unmarshal(res, &rules) diff --git a/domain/infosync/placement_manager.go b/domain/infosync/placement_manager.go index 0a36de70715a8..1d2e13e9fbff7 100644 --- a/domain/infosync/placement_manager.go +++ b/domain/infosync/placement_manager.go @@ -44,7 +44,7 @@ type PDPlacementManager struct { // GetRuleBundle is used to get one specific rule bundle from PD. 
func (m *PDPlacementManager) GetRuleBundle(ctx context.Context, name string) (*placement.Bundle, error) { bundle := &placement.Bundle{ID: name} - res, err := doRequest(ctx, m.etcdCli.Endpoints(), path.Join(pdapi.Config, "placement-rule", name), "GET", nil) + res, err := doRequest(ctx, "get-placement-rule", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "placement-rule", name), "GET", nil) if err == nil && res != nil { err = json.Unmarshal(res, bundle) } @@ -54,7 +54,7 @@ func (m *PDPlacementManager) GetRuleBundle(ctx context.Context, name string) (*p // GetAllRuleBundles is used to get all rule bundles from PD. It is used to load full rules from PD while fullload infoschema. func (m *PDPlacementManager) GetAllRuleBundles(ctx context.Context) ([]*placement.Bundle, error) { var bundles []*placement.Bundle - res, err := doRequest(ctx, m.etcdCli.Endpoints(), path.Join(pdapi.Config, "placement-rule"), "GET", nil) + res, err := doRequest(ctx, "get-all-placement-rules", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "placement-rule"), "GET", nil) if err == nil && res != nil { err = json.Unmarshal(res, &bundles) } @@ -72,7 +72,7 @@ func (m *PDPlacementManager) PutRuleBundles(ctx context.Context, bundles []*plac return err } - _, err = doRequest(ctx, m.etcdCli.Endpoints(), path.Join(pdapi.Config, "placement-rule")+"?partial=true", "POST", bytes.NewReader(b)) + _, err = doRequest(ctx, "put-placement-rules", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "placement-rule")+"?partial=true", "POST", bytes.NewReader(b)) return err } diff --git a/domain/infosync/region.go b/domain/infosync/region.go index 8f72d289b57da..08866c306d8c4 100644 --- a/domain/infosync/region.go +++ b/domain/infosync/region.go @@ -66,7 +66,7 @@ func GetReplicationState(ctx context.Context, startKey []byte, endKey []byte) (P return PlacementScheduleStatePending, errors.Errorf("pd unavailable") } - res, err := doRequest(ctx, addrs, fmt.Sprintf("%s/replicated?startKey=%s&endKey=%s", pdapi.Regions, 
hex.EncodeToString(startKey), hex.EncodeToString(endKey)), "GET", nil) + res, err := doRequest(ctx, "get-replication-state", addrs, fmt.Sprintf("%s/replicated?startKey=%s&endKey=%s", pdapi.Regions, hex.EncodeToString(startKey), hex.EncodeToString(endKey)), "GET", nil) if err == nil && res != nil { st := PlacementScheduleStatePending // it should not fail diff --git a/domain/infosync/tiflash_manager.go b/domain/infosync/tiflash_manager.go index 53c664091b6cc..ba5c7e0015aad 100644 --- a/domain/infosync/tiflash_manager.go +++ b/domain/infosync/tiflash_manager.go @@ -76,7 +76,7 @@ func (m *TiFlashPDPlacementManager) SetPlacementRule(ctx context.Context, rule p } j, _ := json.Marshal(rule) buf := bytes.NewBuffer(j) - res, err := doRequest(ctx, m.etcdCli.Endpoints(), path.Join(pdapi.Config, "rule"), "POST", buf) + res, err := doRequest(ctx, "set-rule", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "rule"), "POST", buf) if err != nil { return errors.Trace(err) } @@ -88,7 +88,7 @@ func (m *TiFlashPDPlacementManager) SetPlacementRule(ctx context.Context, rule p // DeletePlacementRule is to delete placement rule for certain group. func (m *TiFlashPDPlacementManager) DeletePlacementRule(ctx context.Context, group string, ruleID string) error { - res, err := doRequest(ctx, m.etcdCli.Endpoints(), path.Join(pdapi.Config, "rule", group, ruleID), "DELETE", nil) + res, err := doRequest(ctx, "delete-rule", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "rule", group, ruleID), "DELETE", nil) if err != nil { return errors.Trace(err) } @@ -100,7 +100,7 @@ func (m *TiFlashPDPlacementManager) DeletePlacementRule(ctx context.Context, gro // GetGroupRules to get all placement rule in a certain group. 
func (m *TiFlashPDPlacementManager) GetGroupRules(ctx context.Context, group string) ([]placement.TiFlashRule, error) { - res, err := doRequest(ctx, m.etcdCli.Endpoints(), path.Join(pdapi.Config, "rules", "group", group), "GET", nil) + res, err := doRequest(ctx, "get-group-rules", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "rules", "group", group), "GET", nil) if err != nil { return nil, errors.Trace(err) } @@ -133,7 +133,7 @@ func (m *TiFlashPDPlacementManager) PostAccelerateSchedule(ctx context.Context, return errors.Trace(err) } buf := bytes.NewBuffer(j) - res, err := doRequest(ctx, m.etcdCli.Endpoints(), "/pd/api/v1/regions/accelerate-schedule", "POST", buf) + res, err := doRequest(ctx, "accelerate-schedule", m.etcdCli.Endpoints(), "/pd/api/v1/regions/accelerate-schedule", "POST", buf) if err != nil { return errors.Trace(err) } @@ -153,7 +153,7 @@ func (m *TiFlashPDPlacementManager) GetPDRegionRecordStats(ctx context.Context, p := fmt.Sprintf("/pd/api/v1/stats/region?start_key=%s&end_key=%s", url.QueryEscape(string(startKey)), url.QueryEscape(string(endKey))) - res, err := doRequest(ctx, m.etcdCli.Endpoints(), p, "GET", nil) + res, err := doRequest(ctx, "get-region-stats", m.etcdCli.Endpoints(), p, "GET", nil) if err != nil { return errors.Trace(err) } @@ -171,7 +171,7 @@ func (m *TiFlashPDPlacementManager) GetPDRegionRecordStats(ctx context.Context, // GetStoresStat gets the TiKV store information by accessing PD's api. 
func (m *TiFlashPDPlacementManager) GetStoresStat(ctx context.Context) (*helper.StoresStat, error) { var storesStat helper.StoresStat - res, err := doRequest(ctx, m.etcdCli.Endpoints(), pdapi.Stores, "GET", nil) + res, err := doRequest(ctx, "get-stores", m.etcdCli.Endpoints(), pdapi.Stores, "GET", nil) if err != nil { return nil, errors.Trace(err) } diff --git a/metrics/grafana/tidb.json b/metrics/grafana/tidb.json index 699178d86901a..4eb123f3fc9ac 100644 --- a/metrics/grafana/tidb.json +++ b/metrics/grafana/tidb.json @@ -8855,33 +8855,66 @@ "stack": false, "steppedLine": false, "targets": [ + { + "exemplar": true, + "expr": "histogram_quantile(0.999, sum(rate(tidb_server_pd_api_execution_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "999-all", + "refId": "A", + "step": 10 + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.99, sum(rate(tidb_server_pd_api_execution_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "99-all", + "refId": "B" + }, + { + "exemplar": true, + "expr": "histogram_quantile(0.90, sum(rate(tidb_server_pd_api_execution_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le))", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "90-all", + "refId": "C" + }, { "exemplar": true, "expr": "histogram_quantile(0.999, sum(rate(tidb_server_pd_api_execution_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le, type))", "format": "time_series", + "hide": true, "interval": "", "intervalFactor": 2, "legendFormat": "999-{{type}}", - "refId": "A", + "refId": "D", "step": 10 }, { "exemplar": true, "expr": 
"histogram_quantile(0.99, sum(rate(tidb_server_pd_api_execution_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le, type))", "format": "time_series", + "hide": true, "interval": "", "intervalFactor": 2, "legendFormat": "99-{{type}}", - "refId": "B" + "refId": "E" }, { "exemplar": true, "expr": "histogram_quantile(0.90, sum(rate(tidb_server_pd_api_execution_duration_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (le, type))", "format": "time_series", + "hide": true, "interval": "", "intervalFactor": 2, "legendFormat": "90-{{type}}", - "refId": "C" + "refId": "F" } ], "thresholds": [], @@ -8925,6 +8958,238 @@ "align": false, "alignLevel": null } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "PD HTTP API requests count per second.", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 8, + "x": 0, + "y": 30 + }, + "hiddenSeries": false, + "id": 247, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.10", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(rate(tidb_server_pd_api_request_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "all", + "refId": "A", + "step": 10 + }, + { + "exemplar": true, + 
"expr": "sum(rate(tidb_server_pd_api_request_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\"}[1m])) by (type)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}}", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "PD HTTP Request OPS", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "${DS_TEST-CLUSTER}", + "description": "pd http requests failed count per second", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 8, + "x": 8, + "y": 30 + }, + "hiddenSeries": false, + "id": 248, + "legend": { + "alignAsTable": true, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [], + "nullPointMode": "null as zero", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.5.10", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "exemplar": true, + "expr": 
"sum(rate(pd_client_cmd_handle_failed_cmds_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", http_status=~\"200.*\"}[1m]))", + "format": "time_series", + "interval": "", + "intervalFactor": 2, + "legendFormat": "all", + "refId": "A", + "step": 10 + }, + { + "exemplar": true, + "expr": "sum(rate(pd_client_cmd_handle_failed_cmds_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", http_status=~\"200.*\"}[1m])) by (type, http_status)", + "format": "time_series", + "hide": false, + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{type}} {{http_status}}", + "refId": "B", + "step": 10 + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "PD HTTP Request Fail OPS", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": "0", + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } } ], "repeat": null, diff --git a/metrics/server.go b/metrics/server.go index 72dc9e4e4b8ba..1955147ba5a08 100644 --- a/metrics/server.go +++ b/metrics/server.go @@ -247,6 +247,14 @@ var ( Buckets: prometheus.ExponentialBuckets(0.001, 2, 20), // 1ms ~ 524s }, []string{LblType}) + PDApiRequestCounter = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: "tidb", + Subsystem: "server", + Name: "pd_api_request_total", + Help: "Counter of the pd http api requests", + }, []string{LblType, LblHTTPStatus}) + CPUProfileCounter = prometheus.NewCounter( prometheus.CounterOpts{ Namespace: "tidb", diff --git a/metrics/session.go b/metrics/session.go index 
83df91439d311..4b76fb5ac7390 100644 --- a/metrics/session.go +++ b/metrics/session.go @@ -157,4 +157,5 @@ const ( LblVersion = "version" LblHash = "hash" LblCTEType = "cte_type" + LblHTTPStatus = "http_status" ) diff --git a/store/helper/helper.go b/store/helper/helper.go index 95c1c302394da..e6316f0b78000 100644 --- a/store/helper/helper.go +++ b/store/helper/helper.go @@ -227,7 +227,7 @@ func (h *Helper) ScrapeHotInfo(rw string, allSchemas []*model.DBInfo) ([]HotTabl // FetchHotRegion fetches the hot region information from PD's http api. func (h *Helper) FetchHotRegion(rw string) (map[uint64]RegionMetric, error) { var regionResp StoreHotRegionInfos - if err := h.requestPD("GET", rw, nil, ®ionResp); err != nil { + if err := h.requestPD("get-hot-region", "GET", rw, nil, ®ionResp); err != nil { return nil, err } metricCnt := 0 @@ -794,28 +794,28 @@ func bytesKeyToHex(key []byte) string { // GetRegionsInfo gets the region information of current store by using PD's api. func (h *Helper) GetRegionsInfo() (*RegionsInfo, error) { var regionsInfo RegionsInfo - err := h.requestPD("GET", pdapi.Regions, nil, ®ionsInfo) + err := h.requestPD("get-region", "GET", pdapi.Regions, nil, ®ionsInfo) return ®ionsInfo, err } // GetStoreRegionsInfo gets the region in given store. func (h *Helper) GetStoreRegionsInfo(storeID uint64) (*RegionsInfo, error) { var regionsInfo RegionsInfo - err := h.requestPD("GET", pdapi.StoreRegions+"/"+strconv.FormatUint(storeID, 10), nil, ®ionsInfo) + err := h.requestPD("get-store-regions", "GET", pdapi.StoreRegions+"/"+strconv.FormatUint(storeID, 10), nil, ®ionsInfo) return ®ionsInfo, err } // GetRegionInfoByID gets the region information of the region ID by using PD's api. 
func (h *Helper) GetRegionInfoByID(regionID uint64) (*RegionInfo, error) { var regionInfo RegionInfo - err := h.requestPD("GET", pdapi.RegionByID+"/"+strconv.FormatUint(regionID, 10), nil, ®ionInfo) + err := h.requestPD("get-region-by-id", "GET", pdapi.RegionByID+"/"+strconv.FormatUint(regionID, 10), nil, ®ionInfo) return ®ionInfo, err } // GetRegionsInfoByRange scans region by key range func (h *Helper) GetRegionsInfoByRange(sk, ek []byte) (*RegionsInfo, error) { var regionsInfo RegionsInfo - err := h.requestPD("GET", fmt.Sprintf("%v?key=%s&end_key=%s", pdapi.ScanRegions, + err := h.requestPD("get-regions-by-range", "GET", fmt.Sprintf("%v?key=%s&end_key=%s", pdapi.ScanRegions, url.QueryEscape(string(sk)), url.QueryEscape(string(ek))), nil, ®ionsInfo) return ®ionsInfo, err } @@ -823,12 +823,12 @@ func (h *Helper) GetRegionsInfoByRange(sk, ek []byte) (*RegionsInfo, error) { // GetRegionByKey gets regioninfo by key func (h *Helper) GetRegionByKey(k []byte) (*RegionInfo, error) { var regionInfo RegionInfo - err := h.requestPD("GET", fmt.Sprintf("%v/%v", pdapi.RegionKey, url.QueryEscape(string(k))), nil, ®ionInfo) + err := h.requestPD("get-region-by-key", "GET", fmt.Sprintf("%v/%v", pdapi.RegionKey, url.QueryEscape(string(k))), nil, ®ionInfo) return ®ionInfo, err } // request PD API, decode the response body into res -func (h *Helper) requestPD(method, uri string, body io.Reader, res interface{}) error { +func (h *Helper) requestPD(apiName, method, uri string, body io.Reader, res interface{}) error { etcd, ok := h.Store.(kv.EtcdBackend) if !ok { return errors.WithStack(errors.New("not implemented")) @@ -858,9 +858,11 @@ func (h *Helper) requestPD(method, uri string, body io.Reader, res interface{}) start := time.Now() resp, err := util.InternalHTTPClient().Do(req) if err != nil { + metrics.PDApiRequestCounter.WithLabelValues(apiName, "network error").Inc() return errors.Trace(err) } - 
metrics.PDApiExecutionHistogram.WithLabelValues("common").Observe(time.Since(start).Seconds()) + metrics.PDApiExecutionHistogram.WithLabelValues(apiName).Observe(time.Since(start).Seconds()) + metrics.PDApiRequestCounter.WithLabelValues(apiName, resp.Status).Inc() defer func() { err = resp.Body.Close() @@ -928,7 +930,7 @@ type StoreDetailStat struct { // GetStoresStat gets the TiKV store information by accessing PD's api. func (h *Helper) GetStoresStat() (*StoresStat, error) { var storesStat StoresStat - err := h.requestPD("GET", pdapi.Stores, nil, &storesStat) + err := h.requestPD("get-stores-stat", "GET", pdapi.Stores, nil, &storesStat) return &storesStat, err } From ceecb2e28e66cea3774e961ca2405db9483ffbc2 Mon Sep 17 00:00:00 2001 From: Chao Wang Date: Wed, 13 Apr 2022 17:59:10 +0800 Subject: [PATCH 06/11] register collector --- metrics/metrics.go | 1 + 1 file changed, 1 insertion(+) diff --git a/metrics/metrics.go b/metrics/metrics.go index 5bbb2ab4a3c01..c11b3064e7a2a 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -161,6 +161,7 @@ func RegisterMetrics() { prometheus.MustRegister(TopSQLReportDurationHistogram) prometheus.MustRegister(TopSQLReportDataHistogram) prometheus.MustRegister(PDApiExecutionHistogram) + prometheus.MustRegister(PDApiRequestCounter) prometheus.MustRegister(CPUProfileCounter) prometheus.MustRegister(ReadFromTableCacheCounter) prometheus.MustRegister(LoadTableCacheDurationHistogram) From 2ad39f4105801e2c4670091b6281e1413d6b17ed Mon Sep 17 00:00:00 2001 From: Chao Wang Date: Wed, 13 Apr 2022 18:08:48 +0800 Subject: [PATCH 07/11] update --- metrics/grafana/tidb.json | 6 +++--- metrics/server.go | 2 +- metrics/session.go | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/metrics/grafana/tidb.json b/metrics/grafana/tidb.json index 4eb123f3fc9ac..3b837b55aee93 100644 --- a/metrics/grafana/tidb.json +++ b/metrics/grafana/tidb.json @@ -9129,7 +9129,7 @@ "targets": [ { "exemplar": true, - "expr": 
"sum(rate(pd_client_cmd_handle_failed_cmds_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", http_status=~\"200.*\"}[1m]))", + "expr": "sum(rate(pd_client_cmd_handle_failed_cmds_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", result=~\"200.*\"}[1m]))", "format": "time_series", "interval": "", "intervalFactor": 2, @@ -9139,12 +9139,12 @@ }, { "exemplar": true, - "expr": "sum(rate(pd_client_cmd_handle_failed_cmds_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", http_status=~\"200.*\"}[1m])) by (type, http_status)", + "expr": "sum(rate(pd_client_cmd_handle_failed_cmds_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", result=~\"200.*\"}[1m])) by (type, result)", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, - "legendFormat": "{{type}} {{http_status}}", + "legendFormat": "{{type}} - {{result}}", "refId": "B", "step": 10 } diff --git a/metrics/server.go b/metrics/server.go index 1955147ba5a08..c380ff5cb9f1d 100644 --- a/metrics/server.go +++ b/metrics/server.go @@ -253,7 +253,7 @@ var ( Subsystem: "server", Name: "pd_api_request_total", Help: "Counter of the pd http api requests", - }, []string{LblType, LblHTTPStatus}) + }, []string{LblType, LblResult}) CPUProfileCounter = prometheus.NewCounter( prometheus.CounterOpts{ diff --git a/metrics/session.go b/metrics/session.go index 4b76fb5ac7390..83df91439d311 100644 --- a/metrics/session.go +++ b/metrics/session.go @@ -157,5 +157,4 @@ const ( LblVersion = "version" LblHash = "hash" LblCTEType = "cte_type" - LblHTTPStatus = "http_status" ) From f98a7fd4a3eca674900a66e56f7ffbfaa5ae21bf Mon Sep 17 00:00:00 2001 From: Chao Wang Date: Wed, 13 Apr 2022 18:14:26 +0800 Subject: [PATCH 08/11] update --- domain/infosync/label_manager.go | 8 ++++---- domain/infosync/placement_manager.go | 6 +++--- domain/infosync/region.go | 2 +- domain/infosync/tiflash_manager.go 
| 12 ++++++------ store/helper/helper.go | 14 +++++++------- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/domain/infosync/label_manager.go b/domain/infosync/label_manager.go index b8bf8940caf74..84babc3380f1e 100644 --- a/domain/infosync/label_manager.go +++ b/domain/infosync/label_manager.go @@ -45,7 +45,7 @@ func (lm *PDLabelManager) PutLabelRule(ctx context.Context, rule *label.Rule) er if err != nil { return err } - _, err = doRequest(ctx, "put-label-rule", lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rule"), "POST", bytes.NewReader(r)) + _, err = doRequest(ctx, "PutLabelRule", lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rule"), "POST", bytes.NewReader(r)) return err } @@ -56,14 +56,14 @@ func (lm *PDLabelManager) UpdateLabelRules(ctx context.Context, patch *label.Rul return err } - _, err = doRequest(ctx, "update-label-rules", lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rules"), "PATCH", bytes.NewReader(r)) + _, err = doRequest(ctx, "UpdateLabelRules", lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rules"), "PATCH", bytes.NewReader(r)) return err } // GetAllLabelRules implements GetAllLabelRules func (lm *PDLabelManager) GetAllLabelRules(ctx context.Context) ([]*label.Rule, error) { var rules []*label.Rule - res, err := doRequest(ctx, "get-all-label-rules", lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rules"), "GET", nil) + res, err := doRequest(ctx, "GetAllLabelRules", lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rules"), "GET", nil) if err == nil && res != nil { err = json.Unmarshal(res, &rules) @@ -79,7 +79,7 @@ func (lm *PDLabelManager) GetLabelRules(ctx context.Context, ruleIDs []string) ( } rules := []*label.Rule{} - res, err := doRequest(ctx, "get-label-rules", lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rules", "ids"), "GET", bytes.NewReader(ids)) + res, err := doRequest(ctx, "GetLabelRules", 
lm.etcdCli.Endpoints(), path.Join(pdapi.Config, "region-label", "rules", "ids"), "GET", bytes.NewReader(ids)) if err == nil && res != nil { err = json.Unmarshal(res, &rules) diff --git a/domain/infosync/placement_manager.go b/domain/infosync/placement_manager.go index 1d2e13e9fbff7..5ae047a80cf2c 100644 --- a/domain/infosync/placement_manager.go +++ b/domain/infosync/placement_manager.go @@ -44,7 +44,7 @@ type PDPlacementManager struct { // GetRuleBundle is used to get one specific rule bundle from PD. func (m *PDPlacementManager) GetRuleBundle(ctx context.Context, name string) (*placement.Bundle, error) { bundle := &placement.Bundle{ID: name} - res, err := doRequest(ctx, "get-placement-rule", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "placement-rule", name), "GET", nil) + res, err := doRequest(ctx, "GetPlacementRule", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "placement-rule", name), "GET", nil) if err == nil && res != nil { err = json.Unmarshal(res, bundle) } @@ -54,7 +54,7 @@ func (m *PDPlacementManager) GetRuleBundle(ctx context.Context, name string) (*p // GetAllRuleBundles is used to get all rule bundles from PD. It is used to load full rules from PD while fullload infoschema. 
func (m *PDPlacementManager) GetAllRuleBundles(ctx context.Context) ([]*placement.Bundle, error) { var bundles []*placement.Bundle - res, err := doRequest(ctx, "get-all-placement-rules", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "placement-rule"), "GET", nil) + res, err := doRequest(ctx, "GetAllPlacementRules", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "placement-rule"), "GET", nil) if err == nil && res != nil { err = json.Unmarshal(res, &bundles) } @@ -72,7 +72,7 @@ func (m *PDPlacementManager) PutRuleBundles(ctx context.Context, bundles []*plac return err } - _, err = doRequest(ctx, "put-placement-rules", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "placement-rule")+"?partial=true", "POST", bytes.NewReader(b)) + _, err = doRequest(ctx, "PutPlacementRules", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "placement-rule")+"?partial=true", "POST", bytes.NewReader(b)) return err } diff --git a/domain/infosync/region.go b/domain/infosync/region.go index 08866c306d8c4..cdda3e1a73d68 100644 --- a/domain/infosync/region.go +++ b/domain/infosync/region.go @@ -66,7 +66,7 @@ func GetReplicationState(ctx context.Context, startKey []byte, endKey []byte) (P return PlacementScheduleStatePending, errors.Errorf("pd unavailable") } - res, err := doRequest(ctx, "get-replication-state", addrs, fmt.Sprintf("%s/replicated?startKey=%s&endKey=%s", pdapi.Regions, hex.EncodeToString(startKey), hex.EncodeToString(endKey)), "GET", nil) + res, err := doRequest(ctx, "GetReplicationState", addrs, fmt.Sprintf("%s/replicated?startKey=%s&endKey=%s", pdapi.Regions, hex.EncodeToString(startKey), hex.EncodeToString(endKey)), "GET", nil) if err == nil && res != nil { st := PlacementScheduleStatePending // it should not fail diff --git a/domain/infosync/tiflash_manager.go b/domain/infosync/tiflash_manager.go index ba5c7e0015aad..48d526840ea40 100644 --- a/domain/infosync/tiflash_manager.go +++ b/domain/infosync/tiflash_manager.go @@ -76,7 +76,7 @@ func (m *TiFlashPDPlacementManager) 
SetPlacementRule(ctx context.Context, rule p } j, _ := json.Marshal(rule) buf := bytes.NewBuffer(j) - res, err := doRequest(ctx, "set-rule", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "rule"), "POST", buf) + res, err := doRequest(ctx, "SetPlacementRule", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "rule"), "POST", buf) if err != nil { return errors.Trace(err) } @@ -88,7 +88,7 @@ func (m *TiFlashPDPlacementManager) SetPlacementRule(ctx context.Context, rule p // DeletePlacementRule is to delete placement rule for certain group. func (m *TiFlashPDPlacementManager) DeletePlacementRule(ctx context.Context, group string, ruleID string) error { - res, err := doRequest(ctx, "delete-rule", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "rule", group, ruleID), "DELETE", nil) + res, err := doRequest(ctx, "DeletePlacementRule", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "rule", group, ruleID), "DELETE", nil) if err != nil { return errors.Trace(err) } @@ -100,7 +100,7 @@ func (m *TiFlashPDPlacementManager) DeletePlacementRule(ctx context.Context, gro // GetGroupRules to get all placement rule in a certain group. 
func (m *TiFlashPDPlacementManager) GetGroupRules(ctx context.Context, group string) ([]placement.TiFlashRule, error) { - res, err := doRequest(ctx, "get-group-rules", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "rules", "group", group), "GET", nil) + res, err := doRequest(ctx, "GetGroupRules", m.etcdCli.Endpoints(), path.Join(pdapi.Config, "rules", "group", group), "GET", nil) if err != nil { return nil, errors.Trace(err) } @@ -133,7 +133,7 @@ func (m *TiFlashPDPlacementManager) PostAccelerateSchedule(ctx context.Context, return errors.Trace(err) } buf := bytes.NewBuffer(j) - res, err := doRequest(ctx, "accelerate-schedule", m.etcdCli.Endpoints(), "/pd/api/v1/regions/accelerate-schedule", "POST", buf) + res, err := doRequest(ctx, "PostAccelerateSchedule", m.etcdCli.Endpoints(), "/pd/api/v1/regions/accelerate-schedule", "POST", buf) if err != nil { return errors.Trace(err) } @@ -153,7 +153,7 @@ func (m *TiFlashPDPlacementManager) GetPDRegionRecordStats(ctx context.Context, p := fmt.Sprintf("/pd/api/v1/stats/region?start_key=%s&end_key=%s", url.QueryEscape(string(startKey)), url.QueryEscape(string(endKey))) - res, err := doRequest(ctx, "get-region-stats", m.etcdCli.Endpoints(), p, "GET", nil) + res, err := doRequest(ctx, "GetPDRegionStats", m.etcdCli.Endpoints(), p, "GET", nil) if err != nil { return errors.Trace(err) } @@ -171,7 +171,7 @@ func (m *TiFlashPDPlacementManager) GetPDRegionRecordStats(ctx context.Context, // GetStoresStat gets the TiKV store information by accessing PD's api. 
func (m *TiFlashPDPlacementManager) GetStoresStat(ctx context.Context) (*helper.StoresStat, error) { var storesStat helper.StoresStat - res, err := doRequest(ctx, "get-stores", m.etcdCli.Endpoints(), pdapi.Stores, "GET", nil) + res, err := doRequest(ctx, "GetStoresStat", m.etcdCli.Endpoints(), pdapi.Stores, "GET", nil) if err != nil { return nil, errors.Trace(err) } diff --git a/store/helper/helper.go b/store/helper/helper.go index e6316f0b78000..68b9c3e569d2e 100644 --- a/store/helper/helper.go +++ b/store/helper/helper.go @@ -227,7 +227,7 @@ func (h *Helper) ScrapeHotInfo(rw string, allSchemas []*model.DBInfo) ([]HotTabl // FetchHotRegion fetches the hot region information from PD's http api. func (h *Helper) FetchHotRegion(rw string) (map[uint64]RegionMetric, error) { var regionResp StoreHotRegionInfos - if err := h.requestPD("get-hot-region", "GET", rw, nil, &regionResp); err != nil { + if err := h.requestPD("FetchHotRegion", "GET", rw, nil, &regionResp); err != nil { return nil, err } metricCnt := 0 @@ -794,28 +794,28 @@ func bytesKeyToHex(key []byte) string { // GetRegionsInfo gets the region information of current store by using PD's api. func (h *Helper) GetRegionsInfo() (*RegionsInfo, error) { var regionsInfo RegionsInfo - err := h.requestPD("get-region", "GET", pdapi.Regions, nil, &regionsInfo) + err := h.requestPD("GetRegions", "GET", pdapi.Regions, nil, &regionsInfo) return &regionsInfo, err } // GetStoreRegionsInfo gets the region in given store. func (h *Helper) GetStoreRegionsInfo(storeID uint64) (*RegionsInfo, error) { var regionsInfo RegionsInfo - err := h.requestPD("get-store-regions", "GET", pdapi.StoreRegions+"/"+strconv.FormatUint(storeID, 10), nil, &regionsInfo) + err := h.requestPD("GetStoreRegions", "GET", pdapi.StoreRegions+"/"+strconv.FormatUint(storeID, 10), nil, &regionsInfo) return &regionsInfo, err } // GetRegionInfoByID gets the region information of the region ID by using PD's api. 
func (h *Helper) GetRegionInfoByID(regionID uint64) (*RegionInfo, error) { var regionInfo RegionInfo - err := h.requestPD("get-region-by-id", "GET", pdapi.RegionByID+"/"+strconv.FormatUint(regionID, 10), nil, &regionInfo) + err := h.requestPD("GetRegionByID", "GET", pdapi.RegionByID+"/"+strconv.FormatUint(regionID, 10), nil, &regionInfo) return &regionInfo, err } // GetRegionsInfoByRange scans region by key range func (h *Helper) GetRegionsInfoByRange(sk, ek []byte) (*RegionsInfo, error) { var regionsInfo RegionsInfo - err := h.requestPD("get-regions-by-range", "GET", fmt.Sprintf("%v?key=%s&end_key=%s", pdapi.ScanRegions, + err := h.requestPD("GetRegionByRange", "GET", fmt.Sprintf("%v?key=%s&end_key=%s", pdapi.ScanRegions, url.QueryEscape(string(sk)), url.QueryEscape(string(ek))), nil, &regionsInfo) return &regionsInfo, err } @@ -823,7 +823,7 @@ func (h *Helper) GetRegionsInfoByRange(sk, ek []byte) (*RegionsInfo, error) { // GetRegionByKey gets regioninfo by key func (h *Helper) GetRegionByKey(k []byte) (*RegionInfo, error) { var regionInfo RegionInfo - err := h.requestPD("get-region-by-key", "GET", fmt.Sprintf("%v/%v", pdapi.RegionKey, url.QueryEscape(string(k))), nil, &regionInfo) + err := h.requestPD("GetRegionByKey", "GET", fmt.Sprintf("%v/%v", pdapi.RegionKey, url.QueryEscape(string(k))), nil, &regionInfo) return &regionInfo, err } @@ -930,7 +930,7 @@ type StoreDetailStat struct { // GetStoresStat gets the TiKV store information by accessing PD's api. 
func (h *Helper) GetStoresStat() (*StoresStat, error) { var storesStat StoresStat - err := h.requestPD("get-stores-stat", "GET", pdapi.Stores, nil, &storesStat) + err := h.requestPD("GetStoresStat", "GET", pdapi.Stores, nil, &storesStat) return &storesStat, err } From 9600103c21c4ae16d15f82534434c22bf88e19e5 Mon Sep 17 00:00:00 2001 From: Chao Wang Date: Wed, 13 Apr 2022 18:22:56 +0800 Subject: [PATCH 09/11] update some text --- metrics/grafana/tidb.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metrics/grafana/tidb.json b/metrics/grafana/tidb.json index 14ac2d1af49c4..bf8836f7078c9 100644 --- a/metrics/grafana/tidb.json +++ b/metrics/grafana/tidb.json @@ -8966,7 +8966,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "PD HTTP API requests count per second.", + "description": "PD HTTP API request count per second.", "editable": true, "error": false, "fieldConfig": { @@ -9082,7 +9082,7 @@ "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", - "description": "pd http requests failed count per second", + "description": "PD failed HTTP request count per second.", "editable": true, "error": false, "fieldConfig": { From 571e87a20850319bbc8a842f722c649a3d7c6588 Mon Sep 17 00:00:00 2001 From: Chao Wang Date: Wed, 13 Apr 2022 18:55:23 +0800 Subject: [PATCH 10/11] update --- metrics/grafana/tidb.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metrics/grafana/tidb.json b/metrics/grafana/tidb.json index bf8836f7078c9..f2b58738a16e4 100644 --- a/metrics/grafana/tidb.json +++ b/metrics/grafana/tidb.json @@ -9130,7 +9130,7 @@ "targets": [ { "exemplar": true, - "expr": "sum(rate(pd_client_cmd_handle_failed_cmds_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", result=~\"200.*\"}[1m]))", + "expr": "sum(rate(tidb_server_pd_api_request_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", result!~\"200.*\"}[1m]))", 
"format": "time_series", "interval": "", "intervalFactor": 2, @@ -9140,7 +9140,7 @@ }, { "exemplar": true, - "expr": "sum(rate(pd_client_cmd_handle_failed_cmds_duration_seconds_count{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", result=~\"200.*\"}[1m])) by (type, result)", + "expr": "sum(rate(tidb_server_pd_api_request_total{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", result!~\"200.*\"}[1m])) by (type, result)", "format": "time_series", "hide": false, "interval": "", From d1282d44b038eaba65ea1172a9c89e069427e54e Mon Sep 17 00:00:00 2001 From: Chao Wang Date: Thu, 14 Apr 2022 11:25:58 +0800 Subject: [PATCH 11/11] update name --- domain/infosync/info.go | 6 +++--- metrics/metrics.go | 4 ++-- metrics/server.go | 4 ++-- store/helper/helper.go | 6 +++--- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/domain/infosync/info.go b/domain/infosync/info.go index 25cf81108e669..5b8edd2b4a53b 100644 --- a/domain/infosync/info.go +++ b/domain/infosync/info.go @@ -392,8 +392,8 @@ func doRequest(ctx context.Context, apiName string, addrs []string, route, metho start := time.Now() res, err = doRequestWithFailpoint(req) if err == nil { - metrics.PDApiExecutionHistogram.WithLabelValues(apiName).Observe(time.Since(start).Seconds()) - metrics.PDApiRequestCounter.WithLabelValues(apiName, res.Status).Inc() + metrics.PDAPIExecutionHistogram.WithLabelValues(apiName).Observe(time.Since(start).Seconds()) + metrics.PDAPIRequestCounter.WithLabelValues(apiName, res.Status).Inc() bodyBytes, err := io.ReadAll(res.Body) if err != nil { terror.Log(res.Body.Close()) @@ -416,7 +416,7 @@ func doRequest(ctx context.Context, apiName string, addrs []string, route, metho terror.Log(res.Body.Close()) return bodyBytes, err } - metrics.PDApiRequestCounter.WithLabelValues(apiName, "network error").Inc() + metrics.PDAPIRequestCounter.WithLabelValues(apiName, "network error").Inc() logutil.BgLogger().Warn("fail to doRequest", zap.Error(err), zap.Bool("retry next 
address", idx == len(addrs)-1), diff --git a/metrics/metrics.go b/metrics/metrics.go index c11b3064e7a2a..1803ebe127cdb 100644 --- a/metrics/metrics.go +++ b/metrics/metrics.go @@ -160,8 +160,8 @@ func RegisterMetrics() { prometheus.MustRegister(TopSQLIgnoredCounter) prometheus.MustRegister(TopSQLReportDurationHistogram) prometheus.MustRegister(TopSQLReportDataHistogram) - prometheus.MustRegister(PDApiExecutionHistogram) - prometheus.MustRegister(PDApiRequestCounter) + prometheus.MustRegister(PDAPIExecutionHistogram) + prometheus.MustRegister(PDAPIRequestCounter) prometheus.MustRegister(CPUProfileCounter) prometheus.MustRegister(ReadFromTableCacheCounter) prometheus.MustRegister(LoadTableCacheDurationHistogram) diff --git a/metrics/server.go b/metrics/server.go index c380ff5cb9f1d..086321548c5a9 100644 --- a/metrics/server.go +++ b/metrics/server.go @@ -238,7 +238,7 @@ var ( Help: "Counter of TiFlash queries.", }, []string{LblType, LblResult}) - PDApiExecutionHistogram = prometheus.NewHistogramVec( + PDAPIExecutionHistogram = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: "tidb", Subsystem: "server", @@ -247,7 +247,7 @@ var ( Buckets: prometheus.ExponentialBuckets(0.001, 2, 20), // 1ms ~ 524s }, []string{LblType}) - PDApiRequestCounter = prometheus.NewCounterVec( + PDAPIRequestCounter = prometheus.NewCounterVec( prometheus.CounterOpts{ Namespace: "tidb", Subsystem: "server", diff --git a/store/helper/helper.go b/store/helper/helper.go index 68b9c3e569d2e..e3912ac9aff26 100644 --- a/store/helper/helper.go +++ b/store/helper/helper.go @@ -858,11 +858,11 @@ func (h *Helper) requestPD(apiName, method, uri string, body io.Reader, res inte start := time.Now() resp, err := util.InternalHTTPClient().Do(req) if err != nil { - metrics.PDApiRequestCounter.WithLabelValues(apiName, "network error").Inc() + metrics.PDAPIRequestCounter.WithLabelValues(apiName, "network error").Inc() return errors.Trace(err) } - 
metrics.PDApiExecutionHistogram.WithLabelValues(apiName).Observe(time.Since(start).Seconds()) - metrics.PDApiRequestCounter.WithLabelValues(apiName, resp.Status).Inc() + metrics.PDAPIExecutionHistogram.WithLabelValues(apiName).Observe(time.Since(start).Seconds()) + metrics.PDAPIRequestCounter.WithLabelValues(apiName, resp.Status).Inc() defer func() { err = resp.Body.Close()