diff --git a/CHANGELOG.md b/CHANGELOG.md index b68640aac1..6b1b261ef1 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ We use *breaking* word for marking changes that are not backward compatible (rel - [#2970](https://github.com/thanos-io/thanos/pull/2970) Store: Upgrade minio-go/v7 to fix slowness when running on EKS. - [#2957](https://github.com/thanos-io/thanos/pull/2957) Rule: now sets all of the relevant fields properly; avoids a panic when `/api/v1/rules` is called and the time zone is _not_ UTC; `rules` field is an empty array now if no rules have been defined in a rule group. - [#2976](https://github.com/thanos-io/thanos/pull/2976) Query: Better rounding for incoming query timestamps. +- [#2929](https://github.com/thanos-io/thanos/pull/2929) Mixin: Fix expression for 'unhealthy sidecar' alert and also increase the timeout to 10 minutes. ### Added diff --git a/examples/alerts/alerts.md b/examples/alerts/alerts.md index cb40e2a9f7..89e567a07d 100644 --- a/examples/alerts/alerts.md +++ b/examples/alerts/alerts.md @@ -275,7 +275,7 @@ rules: message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds. expr: | - count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0 + time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600 labels: severity: critical ``` diff --git a/examples/alerts/alerts.yaml b/examples/alerts/alerts.yaml index 72c3279e49..ad5f75301b 100644 --- a/examples/alerts/alerts.yaml +++ b/examples/alerts/alerts.yaml @@ -258,7 +258,7 @@ groups: message: Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.
expr: | - count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 300) > 0 + time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job=~"thanos-sidecar.*"}) by (job, pod) >= 600 labels: severity: critical - name: thanos-store.rules diff --git a/examples/alerts/tests.yaml b/examples/alerts/tests.yaml index 25df0414e4..adac87b9a4 100644 --- a/examples/alerts/tests.yaml +++ b/examples/alerts/tests.yaml @@ -22,47 +22,47 @@ tests: exp_samples: - labels: '{}' value: 120 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, pod) eval_time: 2m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' value: 43 - - labels: '{pod="thanos-sidecar-pod-1"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' value: 42 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 5m + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, pod) + eval_time: 10m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' value: 0 - - labels: '{pod="thanos-sidecar-pod-1"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' value: 0 - - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 6m + - expr: max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, pod) + eval_time: 11m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' value: 0 - - labels: '{pod="thanos-sidecar-pod-1"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' value: 0 - - expr: time() - 
max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 5m + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, pod) + eval_time: 10m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' - value: 300 - - labels: '{pod="thanos-sidecar-pod-1"}' - value: 300 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) - eval_time: 6m + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' + value: 600 + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' + value: 600 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, pod) + eval_time: 11m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' - value: 360 - - labels: '{pod="thanos-sidecar-pod-1"}' - value: 360 - - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (pod) >= 300 + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' + value: 660 + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' + value: 660 + - expr: time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{job="thanos-sidecar"}) by (job, pod) >= 600 eval_time: 12m exp_samples: - - labels: '{pod="thanos-sidecar-pod-0"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-0"}' value: 720 - - labels: '{pod="thanos-sidecar-pod-1"}' + - labels: '{job="thanos-sidecar", pod="thanos-sidecar-pod-1"}' value: 720 alert_rule_test: - eval_time: 1m @@ -71,24 +71,48 @@ tests: alertname: ThanosSidecarUnhealthy - eval_time: 3m alertname: ThanosSidecarUnhealthy - - eval_time: 5m + - eval_time: 10m alertname: ThanosSidecarUnhealthy exp_alerts: - exp_labels: severity: critical + job: thanos-sidecar + pod: thanos-sidecar-pod-0 exp_annotations: - message: 'Thanos Sidecar is unhealthy for 2 seconds.' 
- - eval_time: 6m + message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-0 is unhealthy for 600 seconds.' + - exp_labels: + severity: critical + job: thanos-sidecar + pod: thanos-sidecar-pod-1 + exp_annotations: + message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-1 is unhealthy for 600 seconds.' + - eval_time: 11m alertname: ThanosSidecarUnhealthy exp_alerts: - exp_labels: severity: critical + job: thanos-sidecar + pod: thanos-sidecar-pod-0 exp_annotations: - message: 'Thanos Sidecar is unhealthy for 2 seconds.' + message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-0 is unhealthy for 660 seconds.' + - exp_labels: + severity: critical + job: thanos-sidecar + pod: thanos-sidecar-pod-1 + exp_annotations: + message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-1 is unhealthy for 660 seconds.' - eval_time: 12m alertname: ThanosSidecarUnhealthy exp_alerts: - exp_labels: severity: critical + job: thanos-sidecar + pod: thanos-sidecar-pod-0 + exp_annotations: + message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-0 is unhealthy for 720 seconds.' + - exp_labels: + severity: critical + job: thanos-sidecar + pod: thanos-sidecar-pod-1 exp_annotations: - message: 'Thanos Sidecar is unhealthy for 2 seconds.' + message: 'Thanos Sidecar thanos-sidecar thanos-sidecar-pod-1 is unhealthy for 720 seconds.' diff --git a/mixin/alerts/sidecar.libsonnet b/mixin/alerts/sidecar.libsonnet index c81e2ba0a9..e1790dbac6 100644 --- a/mixin/alerts/sidecar.libsonnet +++ b/mixin/alerts/sidecar.libsonnet @@ -27,7 +27,7 @@ message: 'Thanos Sidecar {{$labels.job}} {{$labels.pod}} is unhealthy for {{ $value }} seconds.', }, expr: ||| - count(time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) by (job, pod) >= 300) > 0 + time() - max(thanos_sidecar_last_heartbeat_success_time_seconds{%(selector)s}) by (job, pod) >= 600 ||| % thanos.sidecar, labels: { severity: 'critical',