Skip to content

Commit

Permalink
Add namespaces to alerts
Browse files Browse the repository at this point in the history
Signed-off-by: Kemal Akkoyun <[email protected]>
  • Loading branch information
kakkoyun committed Mar 4, 2021
1 parent befb025 commit ad13ea2
Show file tree
Hide file tree
Showing 14 changed files with 533 additions and 498 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ We use _breaking :warning:_ to mark changes that are not backward compatible (re
- [#3792](https://github.com/thanos-io/thanos/pull/3792) Receiver: Added `--tsdb.allow-overlapping-blocks` flag to allow overlapping tsdb blocks and enable vertical compaction
- [#3031](https://github.com/thanos-io/thanos/pull/3031) Compact/Sidecar/other writers: added `--hash-func`. If some function has been specified, writers calculate hashes using that function of each file in a block before uploading them. If those hashes exist in the `meta.json` file then Compact does not download the files if they already exist on disk and with the same hash. This also means that the data directory passed to Thanos Compact is only *cleared once at boot* or *if everything succeeds*. So, if you, for example, use persistent volumes on k8s and your Thanos Compact crashes or fails to make an iteration properly then the last downloaded files are not wiped from the disk. The directories that were created the last time are only wiped again after a successful iteration or if the previously picked up blocks have disappeared.
- [#3686](https://github.com/thanos-io/thanos/pull/3686) Query: Added federated metric metadata support.
- [#3856](https://github.com/thanos-io/thanos/pull/3856) Mixin: Add namespace to the alerting rules and annotations.

### Fixed

Expand Down
330 changes: 167 additions & 163 deletions examples/alerts/alerts.md

Large diffs are not rendered by default.

333 changes: 168 additions & 165 deletions examples/alerts/alerts.yaml

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions examples/alerts/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,17 +77,31 @@ tests:
- exp_labels:
severity: critical
job: thanos-sidecar
<<<<<<< HEAD
instance: thanos-sidecar-0
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-0 is unhealthy for 600 seconds.'
=======
namespace: production
pod: thanos-sidecar-pod-0
exp_annotations:
description: 'Thanos Sidecar production/thanos-sidecar/thanos-sidecar-pod-0 is unhealthy for 600 seconds.'
>>>>>>> 21ef5f66 (Add namespaces to alerts)
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- exp_labels:
severity: critical
job: thanos-sidecar
<<<<<<< HEAD
instance: thanos-sidecar-1
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-1 is unhealthy for 600 seconds.'
=======
namespace: production
pod: thanos-sidecar-pod-1
exp_annotations:
description: 'Thanos Sidecar production/thanos-sidecar/thanos-sidecar-pod-1 is unhealthy for 600 seconds.'
>>>>>>> 21ef5f66 (Add namespaces to alerts)
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- eval_time: 11m
Expand All @@ -96,17 +110,31 @@ tests:
- exp_labels:
severity: critical
job: thanos-sidecar
<<<<<<< HEAD
instance: thanos-sidecar-0
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-0 is unhealthy for 660 seconds.'
=======
namespace: production
pod: thanos-sidecar-pod-0
exp_annotations:
description: 'Thanos Sidecar production/thanos-sidecar/thanos-sidecar-pod-0 is unhealthy for 660 seconds.'
>>>>>>> 21ef5f66 (Add namespaces to alerts)
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- exp_labels:
severity: critical
job: thanos-sidecar
<<<<<<< HEAD
instance: thanos-sidecar-1
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-1 is unhealthy for 660 seconds.'
=======
namespace: production
pod: thanos-sidecar-pod-1
exp_annotations:
description: 'Thanos Sidecar production/thanos-sidecar/thanos-sidecar-pod-1 is unhealthy for 660 seconds.'
>>>>>>> 21ef5f66 (Add namespaces to alerts)
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- eval_time: 12m
Expand All @@ -115,16 +143,30 @@ tests:
- exp_labels:
severity: critical
job: thanos-sidecar
<<<<<<< HEAD
instance: thanos-sidecar-0
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-0 is unhealthy for 720 seconds.'
=======
namespace: production
pod: thanos-sidecar-pod-0
exp_annotations:
description: 'Thanos Sidecar production/thanos-sidecar/thanos-sidecar-pod-0 is unhealthy for 720 seconds.'
>>>>>>> 21ef5f66 (Add namespaces to alerts)
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
- exp_labels:
severity: critical
job: thanos-sidecar
<<<<<<< HEAD
instance: thanos-sidecar-1
exp_annotations:
description: 'Thanos Sidecar thanos-sidecar thanos-sidecar-1 is unhealthy for 720 seconds.'
=======
namespace: production
pod: thanos-sidecar-pod-1
exp_annotations:
description: 'Thanos Sidecar production/thanos-sidecar/thanos-sidecar-pod-1 is unhealthy for 720 seconds.'
>>>>>>> 21ef5f66 (Add namespaces to alerts)
runbook_url: 'https://github.com/thanos-io/thanos/tree/main/mixin/runbook.md#alert-name-thanossidecarunhealthy'
summary: 'Thanos Sidecar is unhealthy.'
4 changes: 2 additions & 2 deletions mixin/alerts/absent.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@ local titlize(str) = std.join('', std.map(capitalize, std.split(str, '_')));
severity: 'critical',
},
annotations: {
description: '%s has disappeared from Prometheus target discovery.' % name,
summary: 'thanos component has disappeared from Prometheus target discovery.',
description: '%s has disappeared from {{$labels.namespace}}. Prometheus target for the component cannot be discovered.' % name,
summary: 'Thanos component has disappeared from {{$labels.namespace}}.',
},
}
for name in std.objectFields(thanos.jobs)
Expand Down
28 changes: 7 additions & 21 deletions mixin/alerts/bucket_replicate.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -10,31 +10,17 @@
{
name: 'thanos-bucket-replicate',
rules: [
{
alert: 'ThanosBucketReplicateIsDown',
expr: |||
absent(up{%(selector)s})
||| % thanos.bucket_replicate,
'for': '5m',
labels: {
severity: 'critical',
},
annotations: {
description: 'Thanos Replicate has disappeared from Prometheus target discovery.',
summary: 'Thanos Replicate has disappeared from Prometheus target discovery.',
},
},
{
alert: 'ThanosBucketReplicateErrorRate',
annotations: {
description: 'Thanos Replicate failing to run, {{ $value | humanize }}% of attempts failed.',
summary: 'Thanos Replicate is failing to run.',
description: 'Thanos Replicate in {{$labels.namespace}} failing to run, {{$value | humanize}}% of attempts failed.',
summary: 'Thanos Replicate in {{$labels.namespace}} is failing to run.',
},
expr: |||
(
sum(rate(thanos_replicate_replication_runs_total{result="error", %(selector)s}[5m]))
sum by (namespace, job) (rate(thanos_replicate_replication_runs_total{result="error", %(selector)s}[5m]))
/ on (namespace) group_left
sum(rate(thanos_replicate_replication_runs_total{%(selector)s}[5m]))
sum by (namespace, job) (rate(thanos_replicate_replication_runs_total{%(selector)s}[5m]))
) * 100 >= %(errorThreshold)s
||| % thanos.bucket_replicate,
'for': '5m',
Expand All @@ -45,14 +31,14 @@
{
alert: 'ThanosBucketReplicateRunLatency',
annotations: {
description: 'Thanos Replicate {{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.',
description: 'Thanos Replicate {{$labels.namespace}}/{{$labels.job}} has a 99th percentile latency of {{ $value }} seconds for the replicate operations.',
summary: 'Thanos Replicate has a high latency for replicate operations.',
},
expr: |||
(
histogram_quantile(0.99, sum by (job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m]))) > %(p99LatencyThreshold)s
histogram_quantile(0.99, sum by (namespace, job, le) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m]))) > %(p99LatencyThreshold)s
and
sum by (job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m])) > 0
sum by (namespace, job) (rate(thanos_replicate_replication_run_duration_seconds_bucket{%(selector)s}[5m])) > 0
)
||| % thanos.bucket_replicate,
'for': '5m',
Expand Down
20 changes: 10 additions & 10 deletions mixin/alerts/compact.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@
{
alert: 'ThanosCompactMultipleRunning',
annotations: {
description: 'No more than one Thanos Compact instance should be running at once. There are {{ $value }}',
description: 'No more than one Thanos Compact instance should be running at once. There are {{$value}} in {{$labels.namespace}}',
summary: 'Thanos Compact has multiple instances running.',
},
expr: 'sum(up{%(selector)s}) > 1' % thanos.compact,
expr: 'sum by (namespace) (up{%(selector)s}) > 1' % thanos.compact,
'for': '5m',
labels: {
severity: 'warning',
Expand All @@ -37,14 +37,14 @@
{
alert: 'ThanosCompactHighCompactionFailures',
annotations: {
description: 'Thanos Compact {{$labels.job}} is failing to execute {{ $value | humanize }}% of compactions.',
description: 'Thanos Compact {{$labels.namespace}}/{{$labels.job}} is failing to execute {{$value | humanize}}% of compactions.',
summary: 'Thanos Compact is failing to execute compactions.',
},
expr: |||
(
sum by (job) (rate(thanos_compact_group_compactions_failures_total{%(selector)s}[5m]))
sum by (namespace, job) (rate(thanos_compact_group_compactions_failures_total{%(selector)s}[5m]))
/
sum by (job) (rate(thanos_compact_group_compactions_total{%(selector)s}[5m]))
sum by (namespace, job) (rate(thanos_compact_group_compactions_total{%(selector)s}[5m]))
* 100 > %(compactionErrorThreshold)s
)
||| % thanos.compact,
Expand All @@ -56,14 +56,14 @@
{
alert: 'ThanosCompactBucketHighOperationFailures',
annotations: {
description: 'Thanos Compact {{$labels.job}} Bucket is failing to execute {{ $value | humanize }}% of operations.',
description: 'Thanos Compact {{$labels.namespace}}/{{$labels.job}} Bucket is failing to execute {{$value | humanize}}% of operations.',
summary: 'Thanos Compact Bucket is having a high number of operation failures.',
},
expr: |||
(
sum by (job) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m]))
sum by (namespace, job) (rate(thanos_objstore_bucket_operation_failures_total{%(selector)s}[5m]))
/
sum by (job) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m]))
sum by (namespace, job) (rate(thanos_objstore_bucket_operations_total{%(selector)s}[5m]))
* 100 > %(bucketOpsErrorThreshold)s
)
||| % thanos.compact,
Expand All @@ -75,10 +75,10 @@
{
alert: 'ThanosCompactHasNotRun',
annotations: {
description: 'Thanos Compact {{$labels.job}} has not uploaded anything for 24 hours.',
description: 'Thanos Compact {{$labels.namespace}}/{{$labels.job}} has not uploaded anything for 24 hours.',
summary: 'Thanos Compact has not uploaded anything for last 24 hours.',
},
expr: '(time() - max(max_over_time(thanos_objstore_bucket_last_successful_upload_time{%(selector)s}[24h]))) / 60 / 60 > 24' % thanos.compact,
expr: '(time() - max by (namespace, job) (max_over_time(thanos_objstore_bucket_last_successful_upload_time{%(selector)s}[24h]))) / 60 / 60 > 24' % thanos.compact,
labels: {
severity: 'warning',
},
Expand Down
Loading

0 comments on commit ad13ea2

Please sign in to comment.