*: add HTTP API to generate TiDB metric profile (#18272) (#18531)

pingcap · Jul 29, 2020 · 1dd6ade
1 parent 0a3b8e8
commit 1dd6ade
Show file tree

Hide file tree

Showing 8 changed files with 642 additions and 61 deletions.
diff --git a/executor/inspection_profile.go b/executor/inspection_profile.go
diff --git a/executor/inspection_summary.go b/executor/inspection_summary.go
@@ -71,22 +71,22 @@ var inspectionSummaryRules = map[string][]string{
 		"tidb_kv_backoff_duration",
 		"tidb_kv_request_duration",
 		"pd_client_cmd_duration",
-		"tikv_grpc_messge_duration",
+		"tikv_grpc_message_duration",
 		"tikv_average_grpc_messge_duration",
 		"tikv_channel_full",
 		"tikv_scheduler_is_busy",
 		"tikv_coprocessor_is_busy",
 		"tikv_engine_write_stall",
-		"tikv_apply_log_avg_duration",
-		"tikv_apply_log_duration",
-		"tikv_append_log_avg_duration",
-		"tikv_append_log_duration",
-		"tikv_commit_log_avg_duration",
-		"tikv_commit_log_duration",
-		"tikv_process_duration",
-		"tikv_propose_wait_duration",
+		"tikv_raftstore_apply_log_avg_duration",
+		"tikv_raftstore_apply_log_duration",
+		"tikv_raftstore_append_log_avg_duration",
+		"tikv_raftstore_append_log_duration",
+		"tikv_raftstore_commit_log_avg_duration",
+		"tikv_raftstore_commit_log_duration",
+		"tikv_raftstore_process_duration",
+		"tikv_raftstore_propose_wait_duration",
 		"tikv_propose_avg_wait_duration",
-		"tikv_apply_wait_duration",
+		"tikv_raftstore_apply_wait_duration",
 		"tikv_apply_avg_wait_duration",
 		"tikv_check_split_duration",
 		"tikv_storage_async_request_duration",
@@ -155,7 +155,7 @@ var inspectionSummaryRules = map[string][]string{
 		"tikv_grpc_avg_req_batch_size",
 		"tikv_grpc_avg_resp_batch_size",
 		"tikv_grpc_errors",
-		"tikv_grpc_messge_duration",
+		"tikv_grpc_message_duration",
 		"tikv_grpc_qps",
 		"tikv_grpc_req_batch_size",
 		"tikv_grpc_resp_batch_size",
@@ -219,7 +219,7 @@ var inspectionSummaryRules = map[string][]string{
 		"tikv_grpc_avg_req_batch_size",
 		"tikv_grpc_avg_resp_batch_size",
 		"tikv_grpc_errors",
-		"tikv_grpc_messge_duration",
+		"tikv_grpc_message_duration",
 		"tikv_grpc_qps",
 		"tikv_grpc_req_batch_size",
 		"tikv_grpc_resp_batch_size",
@@ -238,15 +238,15 @@ var inspectionSummaryRules = map[string][]string{
 		"tikv_scheduler_stage",
 		"tikv_scheduler_writing_bytes",
 		"tikv_propose_avg_wait_duration",
-		"tikv_propose_wait_duration",
-		"tikv_append_log_avg_duration",
-		"tikv_append_log_duration",
-		"tikv_commit_log_avg_duration",
-		"tikv_commit_log_duration",
+		"tikv_raftstore_propose_wait_duration",
+		"tikv_raftstore_append_log_avg_duration",
+		"tikv_raftstore_append_log_duration",
+		"tikv_raftstore_commit_log_avg_duration",
+		"tikv_raftstore_commit_log_duration",
 		"tikv_apply_avg_wait_duration",
-		"tikv_apply_log_avg_duration",
-		"tikv_apply_log_duration",
-		"tikv_apply_wait_duration",
+		"tikv_raftstore_apply_log_avg_duration",
+		"tikv_raftstore_apply_log_duration",
+		"tikv_raftstore_apply_wait_duration",
 		"tikv_engine_wal_sync_operations",
 		"tikv_engine_write_duration",
 		"tikv_engine_write_operations",
@@ -388,18 +388,18 @@ var inspectionSummaryRules = map[string][]string{
 		"tikv_approximate_avg_region_size",
 		"tikv_approximate_region_size_histogram",
 		"tikv_approximate_region_size",
-		"tikv_append_log_avg_duration",
-		"tikv_append_log_duration",
-		"tikv_commit_log_avg_duration",
-		"tikv_commit_log_duration",
+		"tikv_raftstore_append_log_avg_duration",
+		"tikv_raftstore_append_log_duration",
+		"tikv_raftstore_commit_log_avg_duration",
+		"tikv_raftstore_commit_log_duration",
 		"tikv_apply_avg_wait_duration",
-		"tikv_apply_log_avg_duration",
-		"tikv_apply_log_duration",
-		"tikv_apply_wait_duration",
-		"tikv_process_duration",
-		"tikv_process_handled",
+		"tikv_raftstore_apply_log_avg_duration",
+		"tikv_raftstore_apply_log_duration",
+		"tikv_raftstore_apply_wait_duration",
+		"tikv_raftstore_process_duration",
+		"tikv_raftstore_process_handled",
 		"tikv_propose_avg_wait_duration",
-		"tikv_propose_wait_duration",
+		"tikv_raftstore_propose_wait_duration",
 		"tikv_raft_dropped_messages",
 		"tikv_raft_log_speed",
 		"tikv_raft_message_avg_batch_size",

diff --git a/infoschema/metric_table_def.go b/infoschema/metric_table_def.go
@@ -655,6 +655,22 @@ var MetricTableMap = map[string]MetricTableDef{
 		Labels:   []string{"instance"},
 		Quantile: 0.95,
 	},
+	"tidb_batch_client_wait_conn_duration": {
+		PromQL:   "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_batch_client_wait_connection_establish_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance))",
+		Labels:   []string{"instance"},
+		Quantile: 0.95,
+		Comment:  "The quantile of batch client wait new connection establish durations",
+	},
+	"tidb_batch_client_wait_conn_total_count": {
+		PromQL:  "sum(increase(tidb_tikvclient_batch_client_wait_connection_establish_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
+		Labels:  []string{"instance"},
+		Comment: "The total count of batch client wait new connection establish",
+	},
+	"tidb_batch_client_wait_conn_total_time": {
+		PromQL:  "sum(increase(tidb_tikvclient_batch_client_wait_connection_establish_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
+		Labels:  []string{"instance"},
+		Comment: "The total time of batch client wait new connection establish",
+	},
 	"tidb_batch_client_unavailable_duration": {
 		Comment:  "The quantile of kv storage batch processing unvailable durations",
 		PromQL:   "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_batch_client_unavailable_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance))",
@@ -1069,7 +1085,7 @@ var MetricTableMap = map[string]MetricTableDef{
 		Comment:  "The quantile size of requests into request batch per TiKV instance",
 	},
 
-	"tikv_grpc_messge_duration": {
+	"tikv_grpc_message_duration": {
 		PromQL:   `histogram_quantile($QUANTILE, sum(rate(tikv_grpc_msg_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type,instance))`,
 		Labels:   []string{"instance", "type"},
 		Quantile: 0.99,
@@ -1138,33 +1154,33 @@ var MetricTableMap = map[string]MetricTableDef{
 		Labels:  []string{"instance", "type"},
 		Comment: "The total number of peers validated by the PD worker",
 	},
-	"tikv_apply_log_avg_duration": {
+	"tikv_raftstore_apply_log_avg_duration": {
 		PromQL:  `sum(rate(tikv_raftstore_apply_log_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) / sum(rate(tikv_raftstore_apply_log_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) `,
 		Labels:  []string{"instance"},
 		Comment: "The average time consumed when Raft applies log",
 	},
-	"tikv_apply_log_duration": {
+	"tikv_raftstore_apply_log_duration": {
 		PromQL:   `histogram_quantile($QUANTILE, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))`,
 		Labels:   []string{"instance"},
 		Quantile: 0.99,
 		Comment:  "The quantile time consumed when Raft applies log",
 	},
-	"tikv_append_log_avg_duration": {
+	"tikv_raftstore_append_log_avg_duration": {
 		PromQL:  `sum(rate(tikv_raftstore_append_log_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) / sum(rate(tikv_raftstore_append_log_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION]))`,
 		Labels:  []string{"instance"},
 		Comment: "The avg time consumed when Raft appends log",
 	},
-	"tikv_append_log_duration": {
+	"tikv_raftstore_append_log_duration": {
 		PromQL:   `histogram_quantile($QUANTILE, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))`,
 		Labels:   []string{"instance"},
 		Quantile: 0.99,
 		Comment:  "The quantile time consumed when Raft appends log",
 	},
-	"tikv_commit_log_avg_duration": {
+	"tikv_raftstore_commit_log_avg_duration": {
 		PromQL:  `sum(rate(tikv_raftstore_commit_log_duration_seconds_sum[$RANGE_DURATION])) / sum(rate(tikv_raftstore_commit_log_duration_seconds_count[$RANGE_DURATION]))`,
 		Comment: "The time consumed when Raft commits log",
 	},
-	"tikv_commit_log_duration": {
+	"tikv_raftstore_commit_log_duration": {
 		PromQL:   `histogram_quantile($QUANTILE, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))`,
 		Labels:   []string{"instance"},
 		Quantile: 0.99,
@@ -1175,12 +1191,12 @@ var MetricTableMap = map[string]MetricTableDef{
 		Labels:  []string{"instance"},
 		Comment: "The count of ready handled of Raft",
 	},
-	"tikv_process_handled": {
+	"tikv_raftstore_process_handled": {
 		PromQL:  `sum(rate(tikv_raftstore_raft_process_duration_secs_count{$LABEL_CONDITIONS}[$RANGE_DURATION]))`,
 		Labels:  []string{"instance", "type"},
 		Comment: "The count of different process type of Raft",
 	},
-	"tikv_process_duration": {
+	"tikv_raftstore_process_duration": {
 		PromQL:   `histogram_quantile($QUANTILE, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance,type))`,
 		Labels:   []string{"instance", "type"},
 		Quantile: 0.99,
@@ -1249,7 +1265,7 @@ var MetricTableMap = map[string]MetricTableDef{
 		Labels:  []string{"instance", "type"},
 		Comment: "The total number of proposals per type in raft",
 	},
-	"tikv_propose_wait_duration": {
+	"tikv_raftstore_propose_wait_duration": {
 		PromQL:   `histogram_quantile($QUANTILE, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))`,
 		Labels:   []string{"instance"},
 		Quantile: 0.99,
@@ -1260,7 +1276,7 @@ var MetricTableMap = map[string]MetricTableDef{
 		Labels:  []string{"instance"},
 		Comment: "The average wait time of each proposal",
 	},
-	"tikv_apply_wait_duration": {
+	"tikv_raftstore_apply_wait_duration": {
 		PromQL:   `histogram_quantile($QUANTILE, sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))`,
 		Labels:   []string{"instance"},
 		Quantile: 0.99,
@@ -1354,6 +1370,22 @@ var MetricTableMap = map[string]MetricTableDef{
 		Labels:  []string{"instance", "type"},
 		Comment: "The average time which is caused by latch wait in command",
 	},
+	"tikv_scheduler_processing_read_duration": {
+		PromQL:   `histogram_quantile($QUANTILE, sum(rate(tikv_scheduler_processing_read_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance,type))`,
+		Labels:   []string{"instance", "type"},
+		Quantile: 0.99,
+		Comment:  "The quantile time of scheduler processing read in command",
+	},
+	"tikv_scheduler_processing_read_total_count": {
+		PromQL:  "sum(increase(tikv_scheduler_processing_read_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
+		Labels:  []string{"instance", "type"},
+		Comment: "The total count of scheduler processing read in command",
+	},
+	"tikv_scheduler_processing_read_total_time": {
+		PromQL:  "sum(increase(tikv_scheduler_processing_read_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
+		Labels:  []string{"instance", "type"},
+		Comment: "The total time of scheduler processing read in command",
+	},
 
 	"tikv_scheduler_keys_read": {
 		PromQL:   `histogram_quantile($QUANTILE, sum(rate(tikv_scheduler_kv_command_key_read_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance,type))`,
@@ -1573,8 +1605,8 @@ var MetricTableMap = map[string]MetricTableDef{
 		Comment:  "The quantile of time consumed when handling coprocessor requests",
 	},
 	"tikv_cop_wait_duration": {
-		PromQL:   `histogram_quantile($QUANTILE, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,req,instance))`,
-		Labels:   []string{"instance", "req"},
+		PromQL:   `histogram_quantile($QUANTILE, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,req,type,instance))`,
+		Labels:   []string{"instance", "req", "type"},
 		Quantile: 1,
 		Comment:  "The quantile of time consumed when coprocessor requests are wait for being handled",
 	},
@@ -2583,6 +2615,22 @@ var MetricTableMap = map[string]MetricTableDef{
 		Labels:  []string{"instance", "sql_type"},
 		Comment: "The total time of TiDB query durations(second)",
 	},
+	"tidb_txn_cmd_duration": {
+		PromQL:   `histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type,instance))`,
+		Labels:   []string{"instance", "type"},
+		Quantile: 0.90,
+		Comment:  "The quantile of TiDB transaction command durations(second)",
+	},
+	"tidb_txn_cmd_total_count": {
+		PromQL:  "sum(increase(tidb_tikvclient_txn_cmd_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
+		Labels:  []string{"instance", "type"},
+		Comment: "The total count of TiDB transaction command",
+	},
+	"tidb_txn_cmd_total_time": {
+		PromQL:  "sum(increase(tidb_tikvclient_txn_cmd_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
+		Labels:  []string{"instance", "type"},
+		Comment: "The total time of TiDB transaction command",
+	},
 	"tidb_slow_query_cop_process_total_count": {
 		PromQL:  "sum(increase(tidb_server_slow_query_cop_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
 		Labels:  []string{"instance"},
@@ -2643,31 +2691,31 @@ var MetricTableMap = map[string]MetricTableDef{
 		Labels:  []string{"instance", "type", "sql_type"},
 		Comment: "The total time of transaction execution durations, including retry(second)",
 	},
-	"tikv_append_log_total_count": {
+	"tikv_raftstore_append_log_total_count": {
 		PromQL:  "sum(increase(tikv_raftstore_append_log_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
 		Labels:  []string{"instance"},
 		Comment: "The total count of Raft appends log",
 	},
-	"tikv_append_log_total_time": {
+	"tikv_raftstore_append_log_total_time": {
 		PromQL:  "sum(increase(tikv_raftstore_append_log_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
 		Labels:  []string{"instance"},
 		Comment: "The total time of Raft appends log",
 	},
-	"tikv_apply_log_total_count": {
+	"tikv_raftstore_apply_log_total_count": {
 		PromQL:  "sum(increase(tikv_raftstore_apply_log_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
 		Labels:  []string{"instance"},
 		Comment: "The total count of Raft applies log",
 	},
-	"tikv_apply_log_total_time": {
+	"tikv_raftstore_apply_log_total_time": {
 		PromQL:  "sum(increase(tikv_raftstore_apply_log_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
 		Labels:  []string{"instance"},
 		Comment: "The total time of Raft applies log",
 	},
-	"tikv_apply_wait_total_count": {
+	"tikv_raftstore_apply_wait_total_count": {
 		PromQL: "sum(increase(tikv_raftstore_apply_wait_time_duration_secs_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
 		Labels: []string{"instance"},
 	},
-	"tikv_apply_wait_total_time": {
+	"tikv_raftstore_apply_wait_total_time": {
 		PromQL: "sum(increase(tikv_raftstore_apply_wait_time_duration_secs_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
 		Labels: []string{"instance"},
 	},
@@ -2697,12 +2745,12 @@ var MetricTableMap = map[string]MetricTableDef{
 		Labels:  []string{"instance"},
 		Comment: "The total time of time consumed when running split check in .9999",
 	},
-	"tikv_commit_log_total_count": {
+	"tikv_raftstore_commit_log_total_count": {
 		PromQL:  "sum(increase(tikv_raftstore_commit_log_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
 		Labels:  []string{"instance"},
 		Comment: "The total count of Raft commits log",
 	},
-	"tikv_commit_log_total_time": {
+	"tikv_raftstore_commit_log_total_time": {
 		PromQL:  "sum(increase(tikv_raftstore_commit_log_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
 		Labels:  []string{"instance"},
 		Comment: "The total time of Raft commits log",
@@ -2728,13 +2776,13 @@ var MetricTableMap = map[string]MetricTableDef{
 		Comment: "The total time of time consumed to handle coprocessor read requests",
 	},
 	"tikv_cop_wait_total_count": {
-		PromQL:  "sum(increase(tikv_coprocessor_request_wait_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,req)",
-		Labels:  []string{"instance", "req"},
+		PromQL:  "sum(increase(tikv_coprocessor_request_wait_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,req,type)",
+		Labels:  []string{"instance", "req", "type"},
 		Comment: "The total count of coprocessor requests that wait for being handled",
 	},
 	"tikv_cop_wait_total_time": {
-		PromQL:  "sum(increase(tikv_coprocessor_request_wait_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,req)",
-		Labels:  []string{"instance", "req"},
+		PromQL:  "sum(increase(tikv_coprocessor_request_wait_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,req,type)",
+		Labels:  []string{"instance", "req", "type"},
 		Comment: "The total time of time consumed when coprocessor requests are wait for being handled",
 	},
 	"tikv_raft_store_events_total_count": {
@@ -2757,12 +2805,12 @@ var MetricTableMap = map[string]MetricTableDef{
 		Labels:  []string{"instance", "task"},
 		Comment: "The total time of time consumed when executing GC tasks",
 	},
-	"tikv_grpc_messge_total_count": {
+	"tikv_grpc_message_total_count": {
 		PromQL:  "sum(increase(tikv_grpc_msg_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
 		Labels:  []string{"instance", "type"},
 		Comment: "The total count of tikv execution gRPC message",
 	},
-	"tikv_grpc_messge_total_time": {
+	"tikv_grpc_message_total_time": {
 		PromQL:  "sum(increase(tikv_grpc_msg_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
 		Labels:  []string{"instance", "type"},
 		Comment: "The total time of execution time of gRPC message",
@@ -2803,22 +2851,22 @@ var MetricTableMap = map[string]MetricTableDef{
 		PromQL: "sum(increase(tikv_lock_manager_waiter_lifetime_duration_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
 		Labels: []string{"instance"},
 	},
-	"tikv_process_total_count": {
+	"tikv_raftstore_process_total_count": {
 		PromQL:  "sum(increase(tikv_raftstore_raft_process_duration_secs_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
 		Labels:  []string{"instance", "type"},
 		Comment: "The total count of peer processes in Raft",
 	},
-	"tikv_process_total_time": {
+	"tikv_raftstore_process_total_time": {
 		PromQL:  "sum(increase(tikv_raftstore_raft_process_duration_secs_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
 		Labels:  []string{"instance", "type"},
 		Comment: "The total time of peer processes in Raft",
 	},
-	"tikv_propose_wait_total_count": {
+	"tikv_raftstore_propose_wait_total_count": {
 		PromQL:  "sum(increase(tikv_raftstore_request_wait_time_duration_secs_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
 		Labels:  []string{"instance"},
 		Comment: "The total count of each proposal",
 	},
-	"tikv_propose_wait_total_time": {
+	"tikv_raftstore_propose_wait_total_time": {
 		PromQL:  "sum(increase(tikv_raftstore_request_wait_time_duration_secs_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
 		Labels:  []string{"instance"},
 		Comment: "The total time of wait time of each proposal",

diff --git a/metrics/session.go b/metrics/session.go
@@ -140,4 +140,6 @@ const (
 	LblOptimistic  = "optimistic"
 	LblStore       = "store"
 	LblAddress     = "address"
+	LblBatchGet    = "batch_get"
+	LblGet         = "get"
 )