From 2396405059d54bd860005e70660cd541821e6345 Mon Sep 17 00:00:00 2001 From: Jonas Date: Wed, 24 Apr 2019 07:58:12 +0300 Subject: [PATCH 1/5] Add absent alerts --- alerts/absent_alerts.libsonnet | 46 ++++++++++++++++++++++++++++++++++ alerts/alerts.libsonnet | 1 + config.libsonnet | 16 ++++++++++++ 3 files changed, 63 insertions(+) create mode 100644 alerts/absent_alerts.libsonnet diff --git a/alerts/absent_alerts.libsonnet b/alerts/absent_alerts.libsonnet new file mode 100644 index 0000000..7716629 --- /dev/null +++ b/alerts/absent_alerts.libsonnet @@ -0,0 +1,46 @@ +{ + prometheusAlerts+:: { + groups+: [ + { + name: 'ceph-absent', + rules: [ + { + alert: 'CephMgrIsAbsent', + expr: ||| + absent(up{%(cephExporterSelector)s} == 1) + ||| % $._config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Ceph Manager has disappeared from Prometheus target discovery.', + component: 'ceph-manager', + grafana_url: '%(grafanaMgrDashboardURL)s' % $._config, + }, + }, + ], + }, + { + name: 'ceph-down', + rules: [ + { + alert: 'CephMgrIsMissingReplicas', + expr: ||| + sum(up{%(cephExporterSelector)s}) != %(cephMgrCount)d + ||| % $._config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + message: 'Ceph Manager is missing replicas.', + component: 'ceph-manager', + grafana_url: '%(grafanaMgrDashboardURL)s' % $._config, + }, + }, + ], + }, + ], + }, +} diff --git a/alerts/alerts.libsonnet b/alerts/alerts.libsonnet index 98a9629..601a27b 100644 --- a/alerts/alerts.libsonnet +++ b/alerts/alerts.libsonnet @@ -1,3 +1,4 @@ +(import 'absent_alerts.libsonnet') + (import 'monquorum.libsonnet') + (import 'node.libsonnet') + (import 'osd.libsonnet') + diff --git a/config.libsonnet b/config.libsonnet index 6b1bce3..db9aabd 100644 --- a/config.libsonnet +++ b/config.libsonnet @@ -3,6 +3,22 @@ // Selectors are inserted between {} in Prometheus queries. cephExporterSelector: 'job="rook-ceph-mgr"', + // Number of Ceph Managers which are reporting metrics + cephMgrCount: 3, + // Number of Ceph Monitors + cephMonCount: 3, + // Number of Ceph OSDs + cephOsdCount: 3, + + // Grafana url for Ceph-Cluster dashboard + grafanaClusterDashboardURL: '', + // Grafana url for OSD overview dashboard + grafanaOSDDashboardURL: '', + // Grafana url for Ceph-Cluster dashboard + grafanaMonDashboardURL: '', + // Grafana url for Ceph-Cluster dashboard + grafanaMgrDashboardURL: '', + // Duration to raise various Alerts cephNodeDownAlertTime: '30s', clusterStateAlertTime: '10m', From 8a89011b77e0688ed59be88c593ccbaf380a13ae Mon Sep 17 00:00:00 2001 From: Jonas Date: Sat, 4 May 2019 08:47:01 +0300 Subject: [PATCH 2/5] Fixes after review --- alerts/absent_alerts.libsonnet | 14 ++++++++------ config.libsonnet | 13 ------------- 2 files changed, 8 insertions(+), 19 deletions(-) diff --git a/alerts/absent_alerts.libsonnet b/alerts/absent_alerts.libsonnet index 7716629..31bd015 100644 --- a/alerts/absent_alerts.libsonnet +++ b/alerts/absent_alerts.libsonnet @@ -14,9 +14,10 @@ severity: 'warning', }, annotations: { - message: 'Ceph Manager has disappeared from Prometheus target discovery.', - component: 'ceph-manager', - grafana_url: '%(grafanaMgrDashboardURL)s' % $._config, + message: 'Storage metrics collector service not available anymore.', + description: 'Ceph Manager has disappeared from Prometheus target discovery.', + storage_type: $._config.storageType, + severity_level: 'warning', }, }, ], @@ -34,9 +35,10 @@ severity: 'warning', }, annotations: { - message: 'Ceph Manager is missing replicas.', - component: 'ceph-manager', - grafana_url: '%(grafanaMgrDashboardURL)s' % $._config, + message: 'Storage metrics collector service not available anymore.', + description: 'Ceph Manager is missing replicas.', + storage_type: $._config.storageType, + severity_level: 'warning', }, }, ], diff --git a/config.libsonnet b/config.libsonnet index db9aabd..2e9660a 100644 --- a/config.libsonnet +++ b/config.libsonnet @@ -5,19 +5,6 @@ // Number of Ceph Managers which are reporting metrics cephMgrCount: 3, - // Number of Ceph Monitors - cephMonCount: 3, - // Number of Ceph OSDs - cephOsdCount: 3, - - // Grafana url for Ceph-Cluster dashboard - grafanaClusterDashboardURL: '', - // Grafana url for OSD overview dashboard - grafanaOSDDashboardURL: '', - // Grafana url for Ceph-Cluster dashboard - grafanaMonDashboardURL: '', - // Grafana url for Ceph-Cluster dashboard - grafanaMgrDashboardURL: '', // Duration to raise various Alerts cephNodeDownAlertTime: '30s', From 85014a6e95ef8856a901a3b9c76e279f0fccdc51 Mon Sep 17 00:00:00 2001 From: Jonas Date: Sat, 4 May 2019 08:49:24 +0300 Subject: [PATCH 3/5] Make alert time configurable --- alerts/absent_alerts.libsonnet | 4 ++-- config.libsonnet | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/alerts/absent_alerts.libsonnet b/alerts/absent_alerts.libsonnet index 31bd015..8ec70f4 100644 --- a/alerts/absent_alerts.libsonnet +++ b/alerts/absent_alerts.libsonnet @@ -9,7 +9,7 @@ expr: ||| absent(up{%(cephExporterSelector)s} == 1) ||| % $._config, - 'for': '5m', + 'for': $._config.mgrIsAbsentAlertTime, labels: { severity: 'warning', }, @@ -30,7 +30,7 @@ expr: ||| sum(up{%(cephExporterSelector)s}) != %(cephMgrCount)d ||| % $._config, - 'for': '5m', + 'for': $._config.mgrMissingReplicasAlertTime, labels: { severity: 'warning', }, diff --git a/config.libsonnet b/config.libsonnet index 2e9660a..bb4582f 100644 --- a/config.libsonnet +++ b/config.libsonnet @@ -18,6 +18,8 @@ osdDiskAlertTime: '1m', osdDownAlertTime: '5m', PGRepairAlertTime: '1h', + mgrMissingReplicasAlertTime: '5m', + mgrIsAbsentAlertTime: '5m', // Constants storageType: 'ceph', From fe2ca9d682b588954a97bc6a564f468f28943984 Mon Sep 17 00:00:00 2001 From: Jonas Date: Sat, 11 May 2019 08:06:01 +0300 Subject: [PATCH 4/5] Move alerts into ceph-mgr-status group --- alerts/absent_alerts.libsonnet | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/alerts/absent_alerts.libsonnet b/alerts/absent_alerts.libsonnet index 8ec70f4..3c6349d 100644 --- a/alerts/absent_alerts.libsonnet +++ b/alerts/absent_alerts.libsonnet @@ -2,7 +2,7 @@ prometheusAlerts+:: { groups+: [ { - name: 'ceph-absent', + name: 'ceph-mgr-status', rules: [ { alert: 'CephMgrIsAbsent', @@ -20,11 +20,6 @@ severity_level: 'warning', }, }, - ], - }, - { - name: 'ceph-down', - rules: [ { alert: 'CephMgrIsMissingReplicas', expr: ||| From 501fb486241c8aa31b5f86379f3e2f5d038e5ef0 Mon Sep 17 00:00:00 2001 From: Jonas Date: Wed, 15 May 2019 17:36:52 +0300 Subject: [PATCH 5/5] Fixes after review --- alerts/absent_alerts.libsonnet | 4 +- config.libsonnet | 4 +- extras/manifests/prometheus-ceph-rules.yaml | 48 +++++++++++++++++++++ 3 files changed, 52 insertions(+), 4 deletions(-) diff --git a/alerts/absent_alerts.libsonnet b/alerts/absent_alerts.libsonnet index 3c6349d..c690b64 100644 --- a/alerts/absent_alerts.libsonnet +++ b/alerts/absent_alerts.libsonnet @@ -23,14 +23,14 @@ { alert: 'CephMgrIsMissingReplicas', expr: ||| - sum(up{%(cephExporterSelector)s}) != %(cephMgrCount)d + sum(up{%(cephExporterSelector)s}) < %(cephMgrCount)d ||| % $._config, 'for': $._config.mgrMissingReplicasAlertTime, labels: { severity: 'warning', }, annotations: { - message: 'Storage metrics collector service not available anymore.', + message: "Storage metrics collector service doesn't have required no of replicas.", description: 'Ceph Manager is missing replicas.', storage_type: $._config.storageType, severity_level: 'warning', diff --git a/config.libsonnet b/config.libsonnet index bb4582f..bb1ec9f 100644 --- a/config.libsonnet +++ b/config.libsonnet @@ -3,8 +3,8 @@ // Selectors are inserted between {} in Prometheus queries. cephExporterSelector: 'job="rook-ceph-mgr"', - // Number of Ceph Managers which are reporting metrics - cephMgrCount: 3, + // Expected number of Ceph Managers which are reporting metrics + cephMgrCount: 1, // Duration to raise various Alerts cephNodeDownAlertTime: '30s', diff --git a/extras/manifests/prometheus-ceph-rules.yaml b/extras/manifests/prometheus-ceph-rules.yaml index 0609465..5c140e2 100644 --- a/extras/manifests/prometheus-ceph-rules.yaml +++ b/extras/manifests/prometheus-ceph-rules.yaml @@ -16,6 +16,30 @@ spec: - expr: | (sum((max(label_replace(ceph_disk_occupation{job="rook-ceph-mgr"},"node","$1","exported_instance","(.*)")) by (node)) * on (node) group_right() (label_replace(max by(pod_ip,node) (kube_pod_info{pod=~"node-exporter.*"}), "instance", "$1:9100", "pod_ip", "(.*)")) * on (instance) group_right() (irate(node_disk_read_time_seconds_total[1m]) + irate(node_disk_write_time_seconds_total[1m]) / (irate(node_disk_reads_completed_total[1m]) + irate(node_disk_writes_completed_total[1m]))>0))) record: cluster:ceph_disk_latency:join_ceph_node_disk_irate1m + - name: ceph-mgr-status + rules: + - alert: CephMgrIsAbsent + annotations: + description: Ceph Manager has disappeared from Prometheus target discovery. + message: Storage metrics collector service not available anymore. + severity_level: warning + storage_type: ceph + expr: | + absent(up{job="rook-ceph-mgr"} == 1) + for: 5m + labels: + severity: warning + - alert: CephMgrIsMissingReplicas + annotations: + description: Ceph Manager is missing replicas. + message: Storage metrics collector service doesn't have required no of replicas. + severity_level: warning + storage_type: ceph + expr: | + sum(up{job="rook-ceph-mgr"}) < 1 + for: 5m + labels: + severity: warning - name: quorum-alert.rules rules: - alert: CephMonQuorumAtRisk @@ -115,6 +139,30 @@ spec: for: 10m labels: severity: warning + - alert: CephOSDVersionMismatch + annotations: + description: There are {{ $value }} different versions of Ceph OSD components + running. + message: There are multiple versions of storage services running. + severity_level: warning + storage_type: ceph + expr: | + count(count(ceph_osd_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1 + for: 10m + labels: + severity: warning + - alert: CephMonVersionMismatch + annotations: + description: There are {{ $value }} different versions of Ceph Mon components + running. + message: There are multiple versions of storage services running. + severity_level: warning + storage_type: ceph + expr: | + count(count(ceph_mon_metadata{job="rook-ceph-mgr"}) by (ceph_version)) > 1 + for: 10m + labels: + severity: warning - name: cluster-utilization-alert.rules rules: - alert: CephClusterNearFull