diff --git a/alerts/absent_alerts.libsonnet b/alerts/absent_alerts.libsonnet new file mode 100644 index 0000000..c690b64 --- /dev/null +++ b/alerts/absent_alerts.libsonnet @@ -0,0 +1,43 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'ceph-mgr-status', + rules: [ + { + alert: 'CephMgrIsAbsent', + expr: ||| + absent(up{%(cephExporterSelector)s} == 1) + ||| % $._config, + 'for': $._config.mgrIsAbsentAlertTime, + labels: { + severity: 'warning', + }, + annotations: { + message: 'Storage metrics collector service not available anymore.', + description: 'Ceph Manager has disappeared from Prometheus target discovery.', + storage_type: $._config.storageType, + severity_level: 'warning', + }, + }, + { + alert: 'CephMgrIsMissingReplicas', + expr: ||| + sum(up{%(cephExporterSelector)s}) < %(cephMgrCount)d + ||| % $._config, + 'for': $._config.mgrMissingReplicasAlertTime, + labels: { + severity: 'warning', + }, + annotations: { + message: "Storage metrics collector service doesn't have required no of replicas.", + description: 'Ceph Manager is missing replicas.', + storage_type: $._config.storageType, + severity_level: 'warning', + }, + }, + ], + }, + ], + }, +} diff --git a/alerts/alerts.libsonnet b/alerts/alerts.libsonnet index 98a9629..601a27b 100644 --- a/alerts/alerts.libsonnet +++ b/alerts/alerts.libsonnet @@ -1,3 +1,4 @@ +(import 'absent_alerts.libsonnet') + (import 'monquorum.libsonnet') + (import 'node.libsonnet') + (import 'osd.libsonnet') + diff --git a/config.libsonnet b/config.libsonnet index 6b1bce3..bb1ec9f 100644 --- a/config.libsonnet +++ b/config.libsonnet @@ -3,6 +3,9 @@ // Selectors are inserted between {} in Prometheus queries. cephExporterSelector: 'job="rook-ceph-mgr"', + // Expected number of Ceph Managers which are reporting metrics + cephMgrCount: 1, + // Duration to raise various Alerts cephNodeDownAlertTime: '30s', clusterStateAlertTime: '10m', @@ -15,6 +18,8 @@ osdDiskAlertTime: '1m', osdDownAlertTime: '5m', PGRepairAlertTime: '1h', + mgrMissingReplicasAlertTime: '5m', + mgrIsAbsentAlertTime: '5m', // Constants storageType: 'ceph', diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml index 0609465..5c140e2 100644 --- a/extras/manifests/prometheus-ceph-rules.yaml +++ b/extras/manifests/prometheus-ceph-rules.yaml @@ -16,6 +16,30 @@ spec: - expr: | (sum((max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node)) * on (node) group_right() (label_replace(max by(pod_ip,node) (kube_pod_info{pod=~"node-exporter.*"}), "instance", "$1:9100", "pod_ip", "(.*)")) * on (instance) group_right() (irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (irate(node_disk_reads_completed_total[1m]) + irate(node_disk_writes_completed_total[1m]))>0))) record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m + - name: ceph-mgr-status + rules: + - alert: CephMgrIsAbsent + annotations: + description: Ceph Manager has disappeared from Prometheus target discovery. + message: Storage metrics collector service not available anymore. + severity_level: warning + storage_type: ceph + expr: | + absent(up{job="rook-ceph-mgr"} == 1) + for: 5m + labels: + severity: warning + - alert: CephMgrIsMissingReplicas + annotations: + description: Ceph Manager is missing replicas. + message: Storage metrics collector service doesn't have required no of replicas. + severity_level: warning + storage_type: ceph + expr: | + sum(up{job="rook-ceph-mgr"}) < 1 + for: 5m + labels: + severity: warning - name: quorum-alert.rules rules: - alert: CephMonQuorumAtRisk @@ -115,6 +139,30 @@ spec: for: 10m labels: severity: warning + - alert: CephOSDVersionMismatch + annotations: + description: There are {{ $value }} different versions of Ceph OSD components + running. + message: There are multiple versions of storage services running. + severity_level: warning + storage_type: ceph + expr: | + count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1 + for: 10m + labels: + severity: warning + - alert: CephMonVersionMismatch + annotations: + description: There are {{ $value }} different versions of Ceph Mon components + running. + message: There are multiple versions of storage services running. + severity_level: warning + storage_type: ceph + expr: | + count(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1 + for: 10m + labels: + severity: warning - name: cluster-utilization-alert.rules rules: - alert: CephClusterNearFull