From daea8a27490fb613378fc8a831925d7048127172 Mon Sep 17 00:00:00 2001 From: 7840vz <122374011+7840vz@users.noreply.github.com> Date: Thu, 20 Apr 2023 16:00:26 +0300 Subject: [PATCH 1/2] feat: map ems severity to prom sev Uses label 'severity' coming from EMS and maps to recommended prometheus scale of info/warning/critical: https://monitoring.mixins.dev/#guidelines-for-alert-names-labels-and-annotations --- container/prometheus/ems_alert_rules.yml | 921 +++++++++++++++++++++-- 1 file changed, 865 insertions(+), 56 deletions(-) diff --git a/container/prometheus/ems_alert_rules.yml b/container/prometheus/ems_alert_rules.yml index b8cb14ca5..ff6da70bc 100644 --- a/container/prometheus/ems_alert_rules.yml +++ b/container/prometheus/ems_alert_rules.yml @@ -1,5 +1,4 @@ # Example Harvest ems alerts - groups: - name: Harvest Ems Alert rules: @@ -8,7 +7,22 @@ groups: - alert: Volume Anti-ransomware Monitoring State Changed expr: last_over_time(ems_events{message="arw.volume.state"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Anti-ransomware state was changed to [{{ $labels.op }}] for Volume uuid [{{ $labels.volume_uuid }}]." 
@@ -16,7 +30,22 @@ groups: - alert: Storage VM Anti-ransomware Monitoring State Changed expr: last_over_time(ems_events{message="arw.vserver.state"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Anti-ransomware state was changed to [{{ $labels.op }}] for SVM name [{{ $labels.vserverName }}]." @@ -24,7 +53,22 @@ groups: - alert: Ransomware Activity Detected expr: last_over_time(ems_events{message="callhome.arw.activity.seen"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Ransomware activity detected for Volume uuid [{{ $labels.volume_uuid }}]." 
@@ -32,7 +76,22 @@ groups: - alert: NVRAM Battery Low expr: last_over_time(ems_events{message="callhome.battery.low"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVRAM battery low for Node uuid [{{ $labels.node_uuid }}]" @@ -40,7 +99,22 @@ groups: - alert: HA Interconnect Down expr: last_over_time(ems_events{message="callhome.hainterconnect.down"} [1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "HA interconnect down for Node uuid [{{ $labels.node_uuid }}]." @@ -48,7 +122,22 @@ groups: - alert: Shadow Copy Failed expr: last_over_time(ems_events{message="cifs.shadowcopy.failure"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Shadow copy failed for Object uuid [{{ $labels.object_uuid }}]." 
@@ -56,7 +145,22 @@ groups: - alert: AWS Credentials Not Initialized expr: last_over_time(ems_events{message="cloud.aws.iamNotInitialized"} [5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "AWS credentials not initialized on Node uuid [{{ $labels.node_uuid }}]." @@ -64,7 +168,22 @@ groups: - alert: Storage Switch Power Supplies Failed expr: last_over_time(ems_events{message="cluster.switch.pwr.fail"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Storage switch power supplies failed on Cluster uuid [{{ $labels.cluster_uuid }}]." @@ -72,7 +191,22 @@ groups: - alert: Disk Out of Service expr: last_over_time(ems_events{message="disk.outOfService"} [5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Disk out of service for Node uuid [{{ $labels.node_uuid }}]." 
@@ -80,7 +214,22 @@ groups: - alert: FabricPool Space Usage Limit Reached expr: last_over_time(ems_events{message="fabricpool.full"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "FabricPool space usage limit reached for Cluster uuid [{{ $labels.cluster_uuid }}]." @@ -88,7 +237,22 @@ groups: - alert: FabricPool Space Usage Limit Nearly Reached expr: last_over_time(ems_events{message="fabricpool.nearly.full"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "FabricPool space usage limit nearly reached for Cluster uuid [{{ $labels.cluster_uuid }}]." 
@@ -96,7 +260,22 @@ groups: - alert: Giveback of Aggregate Failed expr: last_over_time(ems_events{message="gb.netra.ca.check.failed"} [4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Giveback of aggregate failed of Aggregate uuid [{{ $labels.aggr_uuid }}]." @@ -104,7 +283,22 @@ groups: - alert: LUN Destroyed expr: last_over_time(ems_events{message="LUN.destroy"} [5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "LUN destroyed of Object uuid [{{ $labels.object_uuid }}]." 
@@ -112,7 +306,22 @@ groups: - alert: LUN Offline expr: last_over_time(ems_events{message="LUN.offline"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Lun offline for Object uuid [{{ $labels.object_uuid }}]" @@ -120,7 +329,22 @@ groups: - alert: Node Root Volume Space Low expr: last_over_time(ems_events{message="mgmtgwd.rootvolrec.low.space"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Node root volume space low on Node uuid [{{ $labels.node_uuid }}]" @@ -128,7 +352,22 @@ groups: - alert: System Cannot Operate Due to Main Unit Fan Failure expr: last_over_time(ems_events{message="monitor.fan.critical"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "System cannot operate due to main unit fan failure for Node uuid [{{ $labels.node_uuid }}]" @@ -136,7 +375,22 
@@ groups: - alert: Main Unit Fan Failed expr: last_over_time(ems_events{message="monitor.fan.failed"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Main unit fan failed for Node uuid [{{ $labels.node_uuid }}]" @@ -144,7 +398,22 @@ groups: - alert: Main Unit Fan in Warning State expr: last_over_time(ems_events{message="monitor.fan.warning"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Main unit fan in warning state for Node uuid [{{ $labels.node_uuid }}]" @@ -152,7 +421,22 @@ groups: - alert: Too Many CIFS Authentication expr: last_over_time(ems_events{message="Nblade.cifsManyAuths"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Too many CIFS authentication on Object uuid [{{ $labels.object_uuid }}]" @@ -160,7 +444,22 @@ groups: - alert: Max Times Open 
Per File Exceeded expr: last_over_time(ems_events{message="Nblade.cifsMaxOpenSameFile"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Max times open per file exceeded on Object uuid [{{ $labels.object_uuid }}]" @@ -168,7 +467,22 @@ groups: - alert: Max Sessions Per User Exceeded expr: last_over_time(ems_events{message="Nblade.cifsMaxSessPerUsrConn"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Max sessions per user exceeded for Object uuid [{{ $labels.object_uuid }}]" @@ -176,7 +490,22 @@ groups: - alert: NetBIOS Name Conflict expr: last_over_time(ems_events{message="Nblade.cifsNbNameConflict"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NetBIOS name conflict for Object uuid [{{ $labels.object_uuid }}]" @@ -184,7 +513,22 @@ groups: - alert: Nonexistent 
Admin Share expr: last_over_time(ems_events{message="Nblade.cifsNoPrivShare"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Nonexistent admin share for Object uuid [{{ $labels.object_uuid }}]" @@ -192,7 +536,22 @@ groups: - alert: NFSv4 Store Pool Exhausted expr: last_over_time(ems_events{message="Nblade.nfsV4PoolExhaust"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NFSv4 store pool exhausted for Object uuid [{{ $labels.object_uuid }}]" @@ -200,7 +559,22 @@ groups: - alert: Unauthorized User Access to Admin Share expr: last_over_time(ems_events{message="Nblade.vscanBadUserPrivAccess"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Unauthorized user access to admin share for Object uuid [{{ $labels.object_uuid }}]" @@ -208,7 +582,22 @@ groups: - alert: 
Antivirus Server Busy expr: last_over_time(ems_events{message="Nblade.vscanConnBackPressure"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Antivirus server busy for Object uuid [{{ $labels.object_uuid }}]" @@ -216,7 +605,22 @@ groups: - alert: No Registered Scan Engine expr: last_over_time(ems_events{message="Nblade.vscanNoRegdScanner"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "No registered scan engine for Object uuid [{{ $labels.object_uuid }}]" @@ -224,7 +628,22 @@ groups: - alert: No Vscan Connection expr: last_over_time(ems_events{message="Nblade.vscanNoScannerConn"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "No vscan connection for Object uuid [{{ $labels.object_uuid }}]" @@ -232,7 +651,22 @@ groups: - alert: Virus Detected expr: 
last_over_time(ems_events{message="Nblade.vscanVirusDetected"}[1w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Virus detected for Object uuid [{{ $labels.object_uuid }}]" @@ -240,7 +674,22 @@ groups: - alert: Non-responsive AntiVirus Server expr: last_over_time(ems_events{message="Nblade.vscanConnInactive"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Non-responsive antiVirus server for Object uuid [{{ $labels.object_uuid }}]" @@ -248,7 +697,22 @@ groups: - alert: NVMe Namespace Destroyed expr: last_over_time(ems_events{message="NVMeNS.destroy"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVMe namespace destroyed for Object uuid [{{ $labels.object_uuid }}]" @@ -256,7 +720,22 @@ groups: - alert: NVMe Namespace Offline expr: 
last_over_time(ems_events{message="NVMeNS.offline"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVMe namespace offline for Object uuid [{{ $labels.object_uuid }}]" @@ -264,7 +743,22 @@ groups: - alert: NVMe Namespace Online expr: last_over_time(ems_events{message="NVMeNS.online"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVMe namespace online for Object uuid [{{ $labels.object_uuid }}]" @@ -272,7 +766,22 @@ groups: - alert: NVMe-oF Grace Period Active expr: last_over_time(ems_events{message="nvmf.graceperiod.active"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVMe-oF grace period active for Cluster uuid [{{ $labels.cluster_uuid }}]" @@ -280,7 +789,22 @@ groups: - alert: NVMe-oF Grace Period Expired expr: 
last_over_time(ems_events{message="nvmf.graceperiod.expired"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVMe-oF grace period expired for Cluster uuid [{{ $labels.cluster_uuid }}]" @@ -288,7 +812,22 @@ groups: - alert: NVMe-oF Grace Period Start expr: last_over_time(ems_events{message="nvmf.graceperiod.start"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "NVMe-oF grace period start for Cluster uuid [{{ $labels.cluster_uuid }}]" @@ -296,7 +835,22 @@ groups: - alert: Cloud Tier Unreachable expr: last_over_time(ems_events{message="object.store.unavailable"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Cloud tier unreachable for Node uuid [{{ $labels.node_uuid }}]" @@ -304,7 +858,22 @@ groups: - alert: Object Store Host Unresolvable expr: 
last_over_time(ems_events{message="objstore.host.unresolvable"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Object store host unresolvable for Node uuid [{{ $labels.node_uuid }}]" @@ -312,7 +881,22 @@ groups: - alert: Object Store Intercluster LIF Down expr: last_over_time(ems_events{message="objstore.interclusterlifDown"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Object store intercluster LIF down for Node uuid [{{ $labels.node_uuid }}]" @@ -320,7 +904,22 @@ groups: - alert: Object Store Signature Mismatch expr: last_over_time(ems_events{message="osc.signatureMismatch"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Object store signature mismatch for Node uuid [{{ $labels.node_uuid }}]" @@ -328,7 +927,22 @@ groups: - alert: QoS Monitor Memory Maxed 
Out expr: last_over_time(ems_events{message="qos.monitor.memory.maxed"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "QoS monitor memory maxed out for Object uuid [{{ $labels.object_uuid }}]" @@ -336,7 +950,22 @@ groups: - alert: SAN [active-active] State Changed expr: last_over_time(ems_events{message="scsiblade.san.config.active"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "SAN [active-active] state changed for Object uuid [{{ $labels.object_uuid }}]" @@ -344,7 +973,22 @@ groups: - alert: FC Target Port Commands Exceeded expr: last_over_time(ems_events{message="scsitarget.fct.port.full"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "FC target port commands exceeded for Port Name [{{ $labels.portname }}]" @@ -352,7 +996,22 @@ groups: - alert: Shelf Fan 
Failed expr: last_over_time(ems_events{message="ses.status.fanError"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Shelf fan failed for Node uuid [{{ $labels.node_uuid }}]" @@ -360,7 +1019,22 @@ groups: - alert: Node Panic expr: last_over_time(ems_events{message="sk.panic"}[1d]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Node panic for Node uuid [{{ $labels.node_uuid }}]" @@ -368,7 +1042,22 @@ groups: - alert: SnapMirror Relationship Out of Sync expr: last_over_time(ems_events{message="sms.status.out.of.sync"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "SnapMirror relationship out of sync for Relationship id [{{ $labels.relationship_id }}]" @@ -376,7 +1065,22 @@ groups: - alert: Service Processor Offline expr: 
last_over_time(ems_events{message="sp.ipmi.lost.shutdown"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Service processor offline for Node uuid [{{ $labels.node_uuid }}]" @@ -384,7 +1088,22 @@ groups: - alert: Service Processor Not Configured expr: last_over_time(ems_events{message="sp.notConfigured"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Service processor not configured for Node uuid [{{ $labels.node_uuid }}]" @@ -392,7 +1111,22 @@ groups: - alert: Unassigned Disks expr: last_over_time(ems_events{message="unowned.disk.reminder"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Unassigned disks for Cluster uuid [{{ $labels.cluster_uuid }}]" @@ -400,15 +1134,45 @@ groups: - alert: Storage VM Stop Succeeded expr: 
last_over_time(ems_events{message="vserver.stop.succeeded"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: - summary: "Storage VM stop succeeded for instance uuid [{{ $labels.inst_uuid }}]" + summary: "Storage VM stop succeeded for instance uuid [{{ $labels.inst_uuid }}]" # Alert for READDIR timeout ems - alert: READDIR Timeout expr: last_over_time(ems_events{message="wafl.readdir.expired"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "READDIR timeout for Object uuid [{{ $labels.object_uuid }}]" @@ -416,7 +1180,22 @@ groups: - alert: Volume Automatic Resizing Succeeded expr: last_over_time(ems_events{message="wafl.vol.autoSize.done"}[5m]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Volume automatic resizing succeeded for Object uuid [{{ $labels.object_uuid }}]" @@ -424,7 
+1203,22 @@ groups: - alert: Volume Offline expr: last_over_time(ems_events{message="wafl.vvol.offline"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Volume offline for instance uuid [{{ $labels.inst_uuid }}]" @@ -432,6 +1226,21 @@ groups: - alert: Volume Restricted expr: last_over_time(ems_events{message="wafl.vvol.restrict"}[4w]) == 1 labels: - severity: "error" + severity: > + {{- if $labels.severity -}} + {{- if eq $labels.severity "alert" -}} + critical + {{- else if eq $labels.severity "error" -}} + warning + {{- else if eq $labels.severity "emergency" -}} + critical + {{- else if eq $labels.severity "notice" -}} + info + {{- else if eq $labels.severity "informational" -}} + info + {{- else -}} + {{ $labels.severity }} + {{- end -}} + {{- end -}} annotations: summary: "Volume restricted for instance uuid [{{ $labels.inst_uuid }}]" From 80d9cf722363293f4a9be4e61cd12c63f60500aa Mon Sep 17 00:00:00 2001 From: 7840vz <122374011+7840vz@users.noreply.github.com> Date: Thu, 20 Apr 2023 16:06:28 +0300 Subject: [PATCH 2/2] fix: revert whitespace --- container/prometheus/ems_alert_rules.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/container/prometheus/ems_alert_rules.yml b/container/prometheus/ems_alert_rules.yml index ff6da70bc..880da26b3 100644 --- a/container/prometheus/ems_alert_rules.yml +++ b/container/prometheus/ems_alert_rules.yml @@ -1,4 +1,5 @@ # Example Harvest ems alerts + groups: - name: Harvest Ems Alert rules: