Skip to content

Commit

Permalink
*: add HTTP API to generate TiDB metric profile (#18272) (#18531)
Browse files (browse the repository at this point in the history)
  • Loading branch information
ti-srebot authored Jul 29, 2020
1 parent 0a3b8e8 commit 1dd6ade
Show file tree
Hide file tree
Showing 8 changed files with 642 additions and 61 deletions.
479 changes: 479 additions & 0 deletions executor/inspection_profile.go

Large diffs are not rendered by default.

60 changes: 30 additions & 30 deletions executor/inspection_summary.go
Original file line number Diff line number Diff line change
Expand Up @@ -71,22 +71,22 @@ var inspectionSummaryRules = map[string][]string{
"tidb_kv_backoff_duration",
"tidb_kv_request_duration",
"pd_client_cmd_duration",
"tikv_grpc_messge_duration",
"tikv_grpc_message_duration",
"tikv_average_grpc_messge_duration",
"tikv_channel_full",
"tikv_scheduler_is_busy",
"tikv_coprocessor_is_busy",
"tikv_engine_write_stall",
"tikv_apply_log_avg_duration",
"tikv_apply_log_duration",
"tikv_append_log_avg_duration",
"tikv_append_log_duration",
"tikv_commit_log_avg_duration",
"tikv_commit_log_duration",
"tikv_process_duration",
"tikv_propose_wait_duration",
"tikv_raftstore_apply_log_avg_duration",
"tikv_raftstore_apply_log_duration",
"tikv_raftstore_append_log_avg_duration",
"tikv_raftstore_append_log_duration",
"tikv_raftstore_commit_log_avg_duration",
"tikv_raftstore_commit_log_duration",
"tikv_raftstore_process_duration",
"tikv_raftstore_propose_wait_duration",
"tikv_propose_avg_wait_duration",
"tikv_apply_wait_duration",
"tikv_raftstore_apply_wait_duration",
"tikv_apply_avg_wait_duration",
"tikv_check_split_duration",
"tikv_storage_async_request_duration",
Expand Down Expand Up @@ -155,7 +155,7 @@ var inspectionSummaryRules = map[string][]string{
"tikv_grpc_avg_req_batch_size",
"tikv_grpc_avg_resp_batch_size",
"tikv_grpc_errors",
"tikv_grpc_messge_duration",
"tikv_grpc_message_duration",
"tikv_grpc_qps",
"tikv_grpc_req_batch_size",
"tikv_grpc_resp_batch_size",
Expand Down Expand Up @@ -219,7 +219,7 @@ var inspectionSummaryRules = map[string][]string{
"tikv_grpc_avg_req_batch_size",
"tikv_grpc_avg_resp_batch_size",
"tikv_grpc_errors",
"tikv_grpc_messge_duration",
"tikv_grpc_message_duration",
"tikv_grpc_qps",
"tikv_grpc_req_batch_size",
"tikv_grpc_resp_batch_size",
Expand All @@ -238,15 +238,15 @@ var inspectionSummaryRules = map[string][]string{
"tikv_scheduler_stage",
"tikv_scheduler_writing_bytes",
"tikv_propose_avg_wait_duration",
"tikv_propose_wait_duration",
"tikv_append_log_avg_duration",
"tikv_append_log_duration",
"tikv_commit_log_avg_duration",
"tikv_commit_log_duration",
"tikv_raftstore_propose_wait_duration",
"tikv_raftstore_append_log_avg_duration",
"tikv_raftstore_append_log_duration",
"tikv_raftstore_commit_log_avg_duration",
"tikv_raftstore_commit_log_duration",
"tikv_apply_avg_wait_duration",
"tikv_apply_log_avg_duration",
"tikv_apply_log_duration",
"tikv_apply_wait_duration",
"tikv_raftstore_apply_log_avg_duration",
"tikv_raftstore_apply_log_duration",
"tikv_raftstore_apply_wait_duration",
"tikv_engine_wal_sync_operations",
"tikv_engine_write_duration",
"tikv_engine_write_operations",
Expand Down Expand Up @@ -388,18 +388,18 @@ var inspectionSummaryRules = map[string][]string{
"tikv_approximate_avg_region_size",
"tikv_approximate_region_size_histogram",
"tikv_approximate_region_size",
"tikv_append_log_avg_duration",
"tikv_append_log_duration",
"tikv_commit_log_avg_duration",
"tikv_commit_log_duration",
"tikv_raftstore_append_log_avg_duration",
"tikv_raftstore_append_log_duration",
"tikv_raftstore_commit_log_avg_duration",
"tikv_raftstore_commit_log_duration",
"tikv_apply_avg_wait_duration",
"tikv_apply_log_avg_duration",
"tikv_apply_log_duration",
"tikv_apply_wait_duration",
"tikv_process_duration",
"tikv_process_handled",
"tikv_raftstore_apply_log_avg_duration",
"tikv_raftstore_apply_log_duration",
"tikv_raftstore_apply_wait_duration",
"tikv_raftstore_process_duration",
"tikv_raftstore_process_handled",
"tikv_propose_avg_wait_duration",
"tikv_propose_wait_duration",
"tikv_raftstore_propose_wait_duration",
"tikv_raft_dropped_messages",
"tikv_raft_log_speed",
"tikv_raft_message_avg_batch_size",
Expand Down
110 changes: 79 additions & 31 deletions infoschema/metric_table_def.go
Original file line number Diff line number Diff line change
Expand Up @@ -655,6 +655,22 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance"},
Quantile: 0.95,
},
"tidb_batch_client_wait_conn_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_batch_client_wait_connection_establish_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance))",
Labels: []string{"instance"},
Quantile: 0.95,
Comment: "The quantile of batch client wait new connection establish durations",
},
"tidb_batch_client_wait_conn_total_count": {
PromQL: "sum(increase(tidb_tikvclient_batch_client_wait_connection_establish_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total count of batch client wait new connection establish",
},
"tidb_batch_client_wait_conn_total_time": {
PromQL: "sum(increase(tidb_tikvclient_batch_client_wait_connection_establish_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total time of batch client wait new connection establish",
},
"tidb_batch_client_unavailable_duration": {
Comment: "The quantile of kv storage batch processing unvailable durations",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_batch_client_unavailable_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le, instance))",
Expand Down Expand Up @@ -1069,7 +1085,7 @@ var MetricTableMap = map[string]MetricTableDef{
Comment: "The quantile size of requests into request batch per TiKV instance",
},

"tikv_grpc_messge_duration": {
"tikv_grpc_message_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_grpc_msg_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type,instance))`,
Labels: []string{"instance", "type"},
Quantile: 0.99,
Expand Down Expand Up @@ -1138,33 +1154,33 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance", "type"},
Comment: "The total number of peers validated by the PD worker",
},
"tikv_apply_log_avg_duration": {
"tikv_raftstore_apply_log_avg_duration": {
PromQL: `sum(rate(tikv_raftstore_apply_log_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) / sum(rate(tikv_raftstore_apply_log_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) `,
Labels: []string{"instance"},
Comment: "The average time consumed when Raft applies log",
},
"tikv_apply_log_duration": {
"tikv_raftstore_apply_log_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))`,
Labels: []string{"instance"},
Quantile: 0.99,
Comment: "The quantile time consumed when Raft applies log",
},
"tikv_append_log_avg_duration": {
"tikv_raftstore_append_log_avg_duration": {
PromQL: `sum(rate(tikv_raftstore_append_log_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) / sum(rate(tikv_raftstore_append_log_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION]))`,
Labels: []string{"instance"},
Comment: "The avg time consumed when Raft appends log",
},
"tikv_append_log_duration": {
"tikv_raftstore_append_log_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))`,
Labels: []string{"instance"},
Quantile: 0.99,
Comment: "The quantile time consumed when Raft appends log",
},
"tikv_commit_log_avg_duration": {
"tikv_raftstore_commit_log_avg_duration": {
PromQL: `sum(rate(tikv_raftstore_commit_log_duration_seconds_sum[$RANGE_DURATION])) / sum(rate(tikv_raftstore_commit_log_duration_seconds_count[$RANGE_DURATION]))`,
Comment: "The time consumed when Raft commits log",
},
"tikv_commit_log_duration": {
"tikv_raftstore_commit_log_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))`,
Labels: []string{"instance"},
Quantile: 0.99,
Expand All @@ -1175,12 +1191,12 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance"},
Comment: "The count of ready handled of Raft",
},
"tikv_process_handled": {
"tikv_raftstore_process_handled": {
PromQL: `sum(rate(tikv_raftstore_raft_process_duration_secs_count{$LABEL_CONDITIONS}[$RANGE_DURATION]))`,
Labels: []string{"instance", "type"},
Comment: "The count of different process type of Raft",
},
"tikv_process_duration": {
"tikv_raftstore_process_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance,type))`,
Labels: []string{"instance", "type"},
Quantile: 0.99,
Expand Down Expand Up @@ -1249,7 +1265,7 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance", "type"},
Comment: "The total number of proposals per type in raft",
},
"tikv_propose_wait_duration": {
"tikv_raftstore_propose_wait_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))`,
Labels: []string{"instance"},
Quantile: 0.99,
Expand All @@ -1260,7 +1276,7 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance"},
Comment: "The average wait time of each proposal",
},
"tikv_apply_wait_duration": {
"tikv_raftstore_apply_wait_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))`,
Labels: []string{"instance"},
Quantile: 0.99,
Expand Down Expand Up @@ -1354,6 +1370,22 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance", "type"},
Comment: "The average time which is caused by latch wait in command",
},
"tikv_scheduler_processing_read_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_scheduler_processing_read_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance,type))`,
Labels: []string{"instance", "type"},
Quantile: 0.99,
Comment: "The quantile time of scheduler processing read in command",
},
"tikv_scheduler_processing_read_total_count": {
PromQL: "sum(increase(tikv_scheduler_processing_read_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total count of scheduler processing read in command",
},
"tikv_scheduler_processing_read_total_time": {
PromQL: "sum(increase(tikv_scheduler_processing_read_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total time of scheduler processing read in command",
},

"tikv_scheduler_keys_read": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_scheduler_kv_command_key_read_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance,type))`,
Expand Down Expand Up @@ -1573,8 +1605,8 @@ var MetricTableMap = map[string]MetricTableDef{
Comment: "The quantile of time consumed when handling coprocessor requests",
},
"tikv_cop_wait_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,req,instance))`,
Labels: []string{"instance", "req"},
PromQL: `histogram_quantile($QUANTILE, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,req,type,instance))`,
Labels: []string{"instance", "req", "type"},
Quantile: 1,
Comment: "The quantile of time consumed when coprocessor requests are wait for being handled",
},
Expand Down Expand Up @@ -2583,6 +2615,22 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance", "sql_type"},
Comment: "The total time of TiDB query durations(second)",
},
"tidb_txn_cmd_duration": {
PromQL: `histogram_quantile($QUANTILE, sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,type,instance))`,
Labels: []string{"instance", "type"},
Quantile: 0.90,
Comment: "The quantile of TiDB transaction command durations(second)",
},
"tidb_txn_cmd_total_count": {
PromQL: "sum(increase(tidb_tikvclient_txn_cmd_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total count of TiDB transaction command",
},
"tidb_txn_cmd_total_time": {
PromQL: "sum(increase(tidb_tikvclient_txn_cmd_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total time of TiDB transaction command",
},
"tidb_slow_query_cop_process_total_count": {
PromQL: "sum(increase(tidb_server_slow_query_cop_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Expand Down Expand Up @@ -2643,31 +2691,31 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance", "type", "sql_type"},
Comment: "The total time of transaction execution durations, including retry(second)",
},
"tikv_append_log_total_count": {
"tikv_raftstore_append_log_total_count": {
PromQL: "sum(increase(tikv_raftstore_append_log_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total count of Raft appends log",
},
"tikv_append_log_total_time": {
"tikv_raftstore_append_log_total_time": {
PromQL: "sum(increase(tikv_raftstore_append_log_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total time of Raft appends log",
},
"tikv_apply_log_total_count": {
"tikv_raftstore_apply_log_total_count": {
PromQL: "sum(increase(tikv_raftstore_apply_log_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total count of Raft applies log",
},
"tikv_apply_log_total_time": {
"tikv_raftstore_apply_log_total_time": {
PromQL: "sum(increase(tikv_raftstore_apply_log_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total time of Raft applies log",
},
"tikv_apply_wait_total_count": {
"tikv_raftstore_apply_wait_total_count": {
PromQL: "sum(increase(tikv_raftstore_apply_wait_time_duration_secs_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
},
"tikv_apply_wait_total_time": {
"tikv_raftstore_apply_wait_total_time": {
PromQL: "sum(increase(tikv_raftstore_apply_wait_time_duration_secs_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
},
Expand Down Expand Up @@ -2697,12 +2745,12 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance"},
Comment: "The total time of time consumed when running split check in .9999",
},
"tikv_commit_log_total_count": {
"tikv_raftstore_commit_log_total_count": {
PromQL: "sum(increase(tikv_raftstore_commit_log_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total count of Raft commits log",
},
"tikv_commit_log_total_time": {
"tikv_raftstore_commit_log_total_time": {
PromQL: "sum(increase(tikv_raftstore_commit_log_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total time of Raft commits log",
Expand All @@ -2728,13 +2776,13 @@ var MetricTableMap = map[string]MetricTableDef{
Comment: "The total time of time consumed to handle coprocessor read requests",
},
"tikv_cop_wait_total_count": {
PromQL: "sum(increase(tikv_coprocessor_request_wait_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,req)",
Labels: []string{"instance", "req"},
PromQL: "sum(increase(tikv_coprocessor_request_wait_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,req,type)",
Labels: []string{"instance", "req", "type"},
Comment: "The total count of coprocessor requests that wait for being handled",
},
"tikv_cop_wait_total_time": {
PromQL: "sum(increase(tikv_coprocessor_request_wait_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,req)",
Labels: []string{"instance", "req"},
PromQL: "sum(increase(tikv_coprocessor_request_wait_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,req,type)",
Labels: []string{"instance", "req", "type"},
Comment: "The total time of time consumed when coprocessor requests are wait for being handled",
},
"tikv_raft_store_events_total_count": {
Expand All @@ -2757,12 +2805,12 @@ var MetricTableMap = map[string]MetricTableDef{
Labels: []string{"instance", "task"},
Comment: "The total time of time consumed when executing GC tasks",
},
"tikv_grpc_messge_total_count": {
"tikv_grpc_message_total_count": {
PromQL: "sum(increase(tikv_grpc_msg_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total count of tikv execution gRPC message",
},
"tikv_grpc_messge_total_time": {
"tikv_grpc_message_total_time": {
PromQL: "sum(increase(tikv_grpc_msg_duration_seconds_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total time of execution time of gRPC message",
Expand Down Expand Up @@ -2803,22 +2851,22 @@ var MetricTableMap = map[string]MetricTableDef{
PromQL: "sum(increase(tikv_lock_manager_waiter_lifetime_duration_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
},
"tikv_process_total_count": {
"tikv_raftstore_process_total_count": {
PromQL: "sum(increase(tikv_raftstore_raft_process_duration_secs_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total count of peer processes in Raft",
},
"tikv_process_total_time": {
"tikv_raftstore_process_total_time": {
PromQL: "sum(increase(tikv_raftstore_raft_process_duration_secs_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type)",
Labels: []string{"instance", "type"},
Comment: "The total time of peer processes in Raft",
},
"tikv_propose_wait_total_count": {
"tikv_raftstore_propose_wait_total_count": {
PromQL: "sum(increase(tikv_raftstore_request_wait_time_duration_secs_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total count of each proposal",
},
"tikv_propose_wait_total_time": {
"tikv_raftstore_propose_wait_total_time": {
PromQL: "sum(increase(tikv_raftstore_request_wait_time_duration_secs_sum{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance)",
Labels: []string{"instance"},
Comment: "The total time of wait time of each proposal",
Expand Down
2 changes: 2 additions & 0 deletions metrics/session.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,4 +140,6 @@ const (
LblOptimistic = "optimistic"
LblStore = "store"
LblAddress = "address"
LblBatchGet = "batch_get"
LblGet = "get"
)
Loading

0 comments on commit 1dd6ade

Please sign in to comment.