-
Notifications
You must be signed in to change notification settings - Fork 46
/
alert.yml
642 lines (642 loc) · 31 KB
/
alert.yml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: k8s
role: alert-rules
name: node-rules
namespace: monitoring
spec:
groups:
- name: external_node_alarm
rules:
- alert: node_host_lost
expr: up{job="external-node"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "{{$labels.instance}}:service is down"
description: "{{$labels.instance}}: lost contact for 1 minutes"
- alert: HostOutOfMemory10
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 1m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例内存不足
description: "节点内存已用完 (剩余小于10%)已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfMemory2
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 2
for: 1m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例内存不足,可能会发生内存溢出
description: "节点内存已用完 (剩余小于2%)已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 1m
labels:
severity: warning
annotations:
summary: (instance {{ $labels.instance }})实例内存压力很大
description: "节点内存压力很大major page错误率高已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputIn
expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 2m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例出现异常的input网络吞吐量
description: "{{ $labels.instance }}网络接口接收超过100MB/s的数据已持续2分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualNetworkThroughputOut
expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 2m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例出现异常的output网络吞吐量
description: "{{ $labels.instance }}实例网络发送了超过100MB/s的数据已持续2分钟 (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRatecritical
expr: sum by (instance) (rate(node_disk_read_bytes_total[1m])) / 1024 / 1024 > 100
for: 1m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例磁盘读取过高
description: "{{ $labels.instance }}实例磁盘读取超过>100MB/s avg [HDD]已持续1分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadRatewarning
expr: sum by (instance) (rate(node_disk_read_bytes_total[1m])) / 1024 / 1024 > 60
for: 1m
labels:
severity: warning
annotations:
summary: (instance {{ $labels.instance }})实例磁盘读取过高
description: "{{ $labels.instance }}实例磁盘读取超过>60MB/s avg [HDD]已持续1分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteRatewarning
expr: sum by (instance) (rate(node_disk_written_bytes_total[1m])) / 1024 / 1024 > 60
for: 1m
labels:
severity: warning
annotations:
summary: (instance {{ $labels.instance }})实例磁盘写入过高
description: "{{ $labels.instance }}实例磁盘写入超过60MB/s avg [HDD]已持续1分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteRatecritical
expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 60
for: 2m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例磁盘写入过高
description: "{{ $labels.instance }}实例磁盘写入超过60MB/s avg [HDD]已持续2分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpacefree
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例磁盘空间不足
description: "{{ $labels.instance }}磁盘快满了,剩余不足10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# Please add ignored mountpoints in node_exporter parameters like
# "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)".
# Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users.
- alert: HostOutOfDiskSpacehigh
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 5 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 1m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例磁盘空间不足
description: "{{ $labels.instance }}磁盘快满了,剩余不足5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})在24小时内就会写满
description: "{{ $labels.instance }}在当前的写入速率下,预计文件系统将会在24小时内耗尽磁盘空间\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfInodes
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例inodes不足
description: "{{ $labels.instance }}磁盘几乎用完了inodes,剩余不足10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostInodesWillFillIn24Hours
expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 *3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0
for: 2m
labels:
severity: critical
annotations:
summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }})
description: "{{ $labels.instance }}以当前写入速率,文件系统预计将在未来 24小时内耗尽inode\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.2 and rate(node_disk_reads_completed_total[1m]) > 0
for: 1m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例磁盘读取延迟过高
description: "磁盘读取延迟正在增长,当下大于200ms已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
for: 1m
labels:
severity: warning
annotations:
summary: (instance {{ $labels.instance }})实例磁盘写入延迟过高
description: "磁盘写入延迟正在增长,当下大于100ms已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad85
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) > 85
for: 1m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例CPU负载过高
description: "当前CPU load高于85%持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostHighCpuLoad75
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) > 75
for: 1m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例CPU负载过高
description: "当前CPU load高于75%持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
for: 5m
labels:
severity: warning
annotations:
summary: (instance {{ $labels.instance }})CPU steal time开始增长
description: "CPU steal time大于10%已持续5分钟. 这意味着CPU资源不足或vm资源竞争。当前实例计算能力将开始下降.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
# 1000 context switches is an arbitrary number.
# Alert threshold depends on nature of application.
# Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58
- alert: HostContextSwitching
expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000
for: 0m
labels:
severity: warning
annotations:
summary: (instance {{ $labels.instance }})主机上下文切换异常
description: "实例上下文切换正在增长,当下超过10000/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: (instance {{ $labels.instance }})swap内存即将用完
description: "Swap使用超过80% \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostSystemdServiceCrashed
expr: node_systemd_unit_state{state="failed"} == 1
for: 0m
labels:
severity: critical
annotations:
summary: 请注意,(instance {{ $labels.instance }})当前实例systemd service服务已崩溃
description: "当前实例systemd service崩溃\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostKernelVersionDeviations
expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
for: 6h
labels:
severity: warning
annotations:
summary: (instance {{ $labels.instance }})实例内核版本偏差
description: "正在运行的内核版本:\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: increase(node_vmstat_oom_kill[1m]) > 0
for: 0m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})实例发生OOM kill事件
description: "检测到OOM kill\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkReceiveErrors
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: (instance {{ $labels.instance }})主机网络接收错误
description: "主机 {{ $labels.instance }} interface {{ $labels.device }} 在过去五分钟内遇到了 {{ printf \"%.0f\" $value }} 接收错误。\n VALUE = { { $value }}\n 标签 = {{ $labels }}"
- alert: HostNetworkTransmitErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: (instance {{ $labels.instance }})主机网络传输错误
description: "主机 {{ $labels.instance }} interface {{ $labels.device }} 在过去五分钟内遇到了 {{ printf \"%.0f\" $value }} 传输错误。\n VALUE = { { $value }}\n 标签 = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
for: 1m
labels:
severity: critical
annotations:
summary: (instance {{ $labels.instance }})网络接口接近饱和
description: "\"{{ $labels.instance }}\" 上的网络接口 \"{{ $labels.interface }}\" 正在过载。\n VALUE = {{ $value }}\n LABELS = { { $labels }}"
- alert: HostConntrackLimit
expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
for: 5m
labels:
severity: warning
annotations:
summary: 主机 conntrack 限制(instance {{ $labels.instance }})
description: "conntrack 数量接近限制\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostClockSkew
expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
for: 2m
labels:
severity: warning
annotations:
summary: 主机时钟偏差(instance {{ $labels.instance }})
description: "检测到时钟偏差。时钟不同步。\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacCorrectableErrorsDetected
expr: increase(node_edac_correctable_errors_total[1m]) > 0
for: 0m
labels:
severity: info
annotations:
summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostEdacUncorrectableErrorsDetected
expr: node_edac_uncorrectable_errors_total > 0
for: 0m
labels:
severity: warning
annotations:
summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: node_disk_Utilization_high
expr: rate(node_disk_io_time_seconds_total[5m]) * 100 > 90
for: 1m
labels:
severity: critical
annotations:
summary: "{{$labels.instance}}: disk Utilization > {{ $value }} % "
description: "{{$labels.instance}}: disk Utilization > {{ $value }}. this should attract attention . disk performance seems to be problematic .for 1 minutes"
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: k8s
role: alert-rules
name: redis-rules
namespace: monitoring
spec:
groups:
- name: redisStatsAlert
rules:
- alert: redis is down
expr: up{job="external-redis"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }} redis is down"
description: "{{ $labels.instance }} redis is down "
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: k8s
role: alert-rules
name: rabbitmq-rules
namespace: monitoring
spec:
groups:
- name: rabbitmqStatsAlert
rules:
- alert: rabbitmq is down
expr: up{job="external-rabbitmq"} == 0
for: 1m
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }} rabbitmq is down"
description: "{{ $labels.instance }} rabbitmq is down "
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: k8s
role: alert-rules
name: mysql-rules
namespace: monitoring
spec:
groups:
- name: MySQLStatsAlert
rules:
- alert: MySQL is down
expr: mysql_up == 0
for: 1m
labels:
severity: critical
annotations:
summary: "{{ $labels.instance }} MySQL is down"
description: "MySQL database is down. This requires immediate action!"
- alert: MysqlTooManyConnections(>80%)
expr: avg by (instance) (rate(mysql_global_status_threads_connected[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
description: "超过 80% 的 MySQL 连接在 {{ $labels.instance }} 上使用\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlHighThreadsRunning
expr: avg by (instance) (rate(mysql_global_status_threads_running[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60
for: 2m
labels:
severity: warning
annotations:
summary: MySQL high threads running (instance {{ $labels.instance }})
description: "超过 60% 的 MySQL 连接在 {{ $labels.instance }} 上处于运行状态\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MysqlRestarted
expr: mysql_global_status_uptime < 60
for: 0m
labels:
severity: critical
annotations:
summary: MySQL重新启动 (instance {{ $labels.instance }})
description: "MySQL 刚刚重新启动,不到一分钟前在 {{ $labels.instance }} 上。\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## MySQL Slave IO 线程未运行
#- alert: MysqlSlaveIoThreadNotRunning
# expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: MySQL Slave IO thread not running (instance {{ $labels.instance }})
# description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## MySQL Slave SQL 线程未运行
#- alert: MysqlSlaveSqlThreadNotRunning
# expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
# for: 0m
# labels:
# severity: critical
# annotations:
# summary: MySQL Slave SQL thread not running (instance {{ $labels.instance }})
# description: "MySQL Slave SQL thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## MySQL Slave 复制滞后
#- alert: MysqlSlaveReplicationLag
# expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30
# for: 1m
# labels:
# severity: critical
# annotations:
# summary: MySQL Slave replication lag (instance {{ $labels.instance }})
# description: "MySQL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## MySQL 慢查询
#- alert: MysqlSlowQueries
# expr: increase(mysql_global_status_slow_queries[1m]) > 0
# for: 2m
# labels:
# severity: warning
# annotations:
# summary: MySQL slow queries (instance {{ $labels.instance }})
# description: "MySQL server mysql has some new slow query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
## MySQL innodb 日志写入停滞 MySQL InnoDB 日志等待
#- alert: MysqlInnodbLogWaits
# expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
# for: 0m
# labels:
# severity: warning
# annotations:
# summary: MySQL InnoDB log waits (instance {{ $labels.instance }})
# description: "MySQL innodb log writes stalling\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: k8s
role: alert-rules
name: kafka-rules
namespace: monitoring
spec:
groups:
- name: external_kafka_alarm
rules:
- alert: KafkaTopicsReplicas
expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 3
for: 0m
labels:
severity: critical
annotations:
summary: Kafka topics replicas (instance {{ $labels.instance }})
description: "Kafka 副本分区\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: kafka_is_down
expr: up{job="external-kafka"} == 0
for: 0m
labels:
severity: critical
annotations:
summary: Kafka server is down (instance {{ $labels.instance }})
description: " {{ $labels.instance }} Kafka server is down now!"
- alert: KafkaConsumersGroup
expr: sum(kafka_consumergroup_lag) by (consumergroup) > 8888
for: 1m
labels:
severity: critical
annotations:
summary: Kafka consumers group (instance {{ $labels.instance }})
description: "Kafka 消费者组\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KafkaTopicOffsetDecreased
expr: delta(kafka_burrow_partition_current_offset[1m]) < 0
for: 0m
labels:
severity: warning
annotations:
summary: Kafka topic offset decreased (instance {{ $labels.instance }})
description: "Kafka 主题偏移量已减少\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: KafkaConsumerLag
expr: 'kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m)
AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0'
for: 15m
labels:
severity: critical
annotations:
summary: Kafka consumer lag (instance {{ $labels.instance }})
description: "卡夫卡消费者有 30 分钟的延迟并且越来越多\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: k8s
role: alert-rules
name: mongodb-rules
namespace: monitoring
spec:
groups:
- name: external_mongodb_alarm
rules:
- alert: MongodbDown
expr: mongodb_up == 0
for: 0m
labels:
severity: critical
annotations:
summary: MongoDB Down (instance {{ $labels.instance }})
description: "MongoDB 实例已关闭\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbReplicationLag
#expr: mongodb_mongod_replset_member_optime_date{state="PRIMARY"} - ON (set) mongodb_mongod_replset_member_optime_date{state="SECONDARY"} > 10
expr: avg(mongodb_mongod_replset_member_optime_date{state="PRIMARY"})-avg(mongodb_mongod_replset_member_optime_date{state="SECONDARY"}) > 10
for: 0m
labels:
severity: critical
annotations:
summary: MongoDB replication lag (instance {{ $labels.instance }})
description: "Mongodb 复制延迟超过 10 秒\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbNumberCursorsOpen
expr: mongodb_mongod_metrics_cursor_open{state="total"} > 10 * 1000
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB number cursors open (instance {{ $labels.instance }})
description: "MongoDB 为客户端打开了太多cursors (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbCursorsTimeouts
expr: increase(mongodb_mongod_metrics_cursor_timed_out_total[1m]) > 100
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB cursors timeouts (instance {{ $labels.instance }})
description: "太多的cursors 超时\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: MongodbTooManyConnections
expr: avg by(instance) (rate(mongodb_connections{state="current"}[1m])) / avg by(instance) (sum (mongodb_connections) by (instance)) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
summary: MongoDB too many connections (instance {{ $labels.instance }})
description: "连接过多 (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
prometheus: k8s
role: alert-rules
name: etcd-rules
namespace: monitoring
spec:
groups:
- name: external_etcd_alarm
rules:
- alert: EtcdInsufficientMembers
expr: count(etcd_server_id) % 2 == 0
for: 0m
labels:
severity: critical
annotations:
summary: etcd cluster 已崩溃 (instance {{ $labels.instance }})
description: "Etcd 集群应该有奇数个成员, \n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdNoLeader
expr: etcd_server_has_leader == 0
for: 0m
labels:
severity: critical
annotations:
summary: Etcd 选举失败 (instance {{ $labels.instance }})
description: "Etcd cluster have no leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighNumberOfLeaderChanges
expr: increase(etcd_server_leader_changes_seen_total[10m]) > 2
for: 0m
labels:
severity: warning
annotations:
summary: Etcd leader 切换异常 (instance {{ $labels.instance }})
description: "10分钟内leader切换了两次\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
#- alert: EtcdHighNumberOfFailedGrpcRequestswarning
# expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))> 1
# for: 10m
# labels:
# severity: warning
# annotations:
# summary: Etcd 大量失败的 GRPC 请求 (instance {{ $labels.instance }})
# description: "在 Etcd 中检测到超过 1% 的 GRPC 请求失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
#- alert: EtcdHighNumberOfFailedGrpcRequestscritical
# expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5
# for: 5m
# labels:
# severity: critical
# annotations:
# summary: Etcd 大量失败的 GRPC 请求 (instance {{ $labels.instance }})
# description: "在 Etcd 中检测到超过 5% 的 GRPC 请求失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdGrpcRequestsSlow
expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15
for: 2m
labels:
severity: critical
annotations:
summary: Etcd GRPC 请求缓慢(instance {{ $labels.instance }})
description: "GRPC 请求变慢,99% 超过 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighNumberOfFailedHttpRequestswarning
expr: sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01
for: 2m
labels:
severity: warning
annotations:
summary: Etcd 大量失败的 HTTP 请求 (instance {{ $labels.instance }})
description: "在 Etcd 中检测到超过 1% 的 HTTP 失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighNumberOfFailedHttpRequestscritical
expr: sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05
for: 2m
labels:
severity: critical
annotations:
summary: Etcd 大量失败的 HTTP 请求 (instance {{ $labels.instance }})
description: "在 Etcd 中检测到超过 5% 的 HTTP 失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHttpRequestsSlow
expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15
for: 2m
labels:
severity: warning
annotations:
summary: Etcd HTTP 请求缓慢 (instance {{ $labels.instance }})
description: "Etcd HTTP 请求变慢,99% 超过 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdMemberCommunicationSlow
expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.2
for: 2m
labels:
severity: critical
annotations:
summary: Etcd成员通讯缓慢 (instance {{ $labels.instance }})
description: "Etcd 成员通信变慢,99% 超过 0.2s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighNumberOfFailedProposals
expr: increase(etcd_server_proposals_failed_total[1h]) > 5
for: 2m
labels:
severity: warning
annotations:
summary: Etcd 大量失败的proposals (instance {{ $labels.instance }})
description: "Etcd 服务器在过去一小时收到了超过 5 个失败的proposals\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighFsyncDurations
expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5
for: 2m
labels:
severity: critical
annotations:
summary: Etcd fsync 持续时间变高 (instance {{ $labels.instance }})
description: "Etcd WAL fsync 持续时间增加,99% 超过 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: EtcdHighCommitDurations
expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25
for: 2m
labels:
severity: warning
annotations:
summary: Etcd 提交持续时间较高 (instance {{ $labels.instance }})
description: "Etcd 提交持续时间增加,99% 超过 0.25s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"