Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[close #287] fix alertmanager for tikv-cdc #288

Merged
merged 3 commits into from
Nov 3, 2022
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 29 additions & 53 deletions cdc/metrics/alertmanager/ticdc.rules.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,111 +2,99 @@ groups:
- name: alert.rules
rules:
- alert: cdc_multiple_owners
expr: sum(rate(ticdc_owner_ownership_counter[30s])) >= 2
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestions:

  1. Rename the file to tikv-cdc.rules.yml.
  2. Change the alert names to avoid conflict with TiCDC.
  3. Change "cdc" to "TiKV-CDC" in the summaries to avoid misunderstanding by users.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good suggestion, modified, thanks!

expr: sum(rate(tikv_cdc_owner_ownership_counter[30s])) >= 2
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: sum(rate(ticdc_owner_ownership_counter[30s])) >= 2
expr: sum(rate(tikv_cdc_owner_ownership_counter[30s])) >= 2
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc cluster has multiple owners

- alert: cdc_no_owner
expr: sum(rate(ticdc_owner_ownership_counter[30s])) < 0.5
expr: sum(rate(tikv_cdc_owner_ownership_counter[30s])) < 0.5
for: 10m
labels:
env: ENV_LABELS_ENV
level: warning
expr: sum(rate(ticdc_owner_ownership_counter[30s])) < 0.5
expr: sum(rate(tikv_cdc_owner_ownership_counter[30s])) < 0.5
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc cluster has no owner for more than 10 minutes

- alert: cdc_checkpoint_high_delay
expr: ticdc_processor_checkpoint_ts_lag > 600
expr: tikv_cdc_processor_checkpoint_ts_lag > 600
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
expr: ticdc_processor_checkpoint_ts_lag > 600
expr: tikv_cdc_processor_checkpoint_ts_lag > 600
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc processor checkpoint delay more than 10 minutes

- alert: cdc_resolvedts_high_delay
expr: ticdc_processor_resolved_ts_lag > 300
expr: tikv_cdc_processor_resolved_ts_lag > 300
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
expr: ticdc_processor_resolved_ts_lag > 300
expr: tikv_cdc_processor_resolved_ts_lag > 300
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc processor resolved ts delay more than 5 minutes

- alert: ticdc_mounter_unmarshal_and_mount_time_more_than_1s
expr: histogram_quantile(0.9, rate(ticdc_mounter_unmarshal_and_mount_bucket[1m])) * 1000 > 1000
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: histogram_quantile(0.9, rate(ticdc_mounter_unmarshal_and_mount_bucket[1m])) * 1000 > 1000
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc_mounter unmarshal and mount time more than 1s

- alert: cdc_sink_execute_duration_time_more_than_10s
expr: histogram_quantile(0.9, rate(ticdc_sink_txn_exec_duration_bucket[1m])) > 10
expr: histogram_quantile(0.9, rate(tikv_cdc_sink_txn_exec_duration_bucket[1m])) > 10
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: histogram_quantile(0.9, rate(ticdc_sink_txn_exec_duration_bucket[1m])) > 10
expr: histogram_quantile(0.9, rate(tikv_cdc_sink_txn_exec_duration_bucket[1m])) > 10
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc sink execute_duration_time_more_than_10s

- alert: cdc_processor_checkpoint_tso_no_change_for_1m
expr: changes(ticdc_processor_checkpoint_ts[1m]) < 1
expr: changes(tikv_cdc_processor_checkpoint_ts[1m]) < 1
labels:
env: ENV_LABELS_ENV
level: warning
expr: changes(ticdc_processor_checkpoint_ts[1m]) < 1
expr: changes(tikv_cdc_processor_checkpoint_ts[1m]) < 1
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc processor checkpoint tso no change for 1m

- alert: ticdc_puller_entry_sorter_sort_bucket
expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_sort_bucket{}[1m])) > 1
- alert: cdc_puller_entry_sorter_sort_bucket
expr: histogram_quantile(0.9, rate(tikv_cdc_puller_entry_sorter_sort_bucket{}[1m])) > 1
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_sort_bucket{}[1m]))
expr: histogram_quantile(0.9, rate(tikv_cdc_puller_entry_sorter_sort_bucket{}[1m]))
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: ticdc puller entry sorter sort latency is too high
summary: cdc puller entry sorter sort latency is too high

- alert: ticdc_puller_entry_sorter_merge_bucket
expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_merge_bucket{}[1m])) > 1
- alert: cdc_puller_entry_sorter_merge_bucket
expr: histogram_quantile(0.9, rate(tikv_cdc_puller_entry_sorter_merge_bucket{}[1m])) > 1
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: histogram_quantile(0.9, rate(ticdc_puller_entry_sorter_merge_bucket{}[1m]))
expr: histogram_quantile(0.9, rate(tikv_cdc_puller_entry_sorter_merge_bucket{}[1m]))
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: ticdc puller entry sorter merge latency is too high
summary: cdc puller entry sorter merge latency is too high

- alert: tikv_cdc_min_resolved_ts_no_change_for_1m
expr: changes(tikv_cdc_min_resolved_ts[1m]) < 1 and ON (instance) tikv_cdc_region_resolve_status{status="resolved"} > 0
Expand All @@ -118,7 +106,7 @@ groups:
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $labels.instance }}'
summary: tikv cdc min resolved ts no change for 1m
summary: tikv cdc component min resolved ts no change for 1m

- alert: tikv_cdc_scan_duration_seconds_more_than_10min
expr: histogram_quantile(0.9, rate(tikv_cdc_scan_duration_seconds_bucket{}[1m])) > 600
Expand All @@ -130,40 +118,28 @@ groups:
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: tikv cdc scan duration seconds more than 10 min

- alert: ticdc_sink_mysql_execution_error
expr: changes(ticdc_sink_mysql_execution_error[1m]) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: changes(ticdc_sink_mysql_execution_error[1m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc sink mysql execution meets errors
summary: tikv cdc component scan duration seconds more than 10 min

- alert: ticdc_processor_exit_with_error_count
expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
- alert: cdc_processor_exit_with_error_count
expr: changes(tikv_cdc_processor_exit_with_error_count[1m]) > 0
for: 1m
labels:
env: ENV_LABELS_ENV
level: critical
expr: changes(ticdc_processor_exit_with_error_count[1m]) > 0
expr: changes(tikv_cdc_processor_exit_with_error_count[1m]) > 0
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
value: '{{ $value }}'
summary: cdc processor exits with error

- alert: ticdc_memory_abnormal
expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10
- alert: tikv_cdc_memory_abnormal
expr: go_memstats_heap_alloc_bytes{job="tikv-cdc"} > 1e+10
for: 1m
labels:
env: ENV_LABELS_ENV
level: warning
expr: go_memstats_heap_alloc_bytes{job="ticdc"} > 1e+10
expr: go_memstats_heap_alloc_bytes{job="tikv-cdc"} > 1e+10
annotations:
description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
value: '{{ $value }}'
summary: TiCDC heap memory usage is over 10 GB
summary: TiKV-CDC heap memory usage is over 10 GB