From e8589758064e8a3f0486074d7d82de92b12f5ef2 Mon Sep 17 00:00:00 2001 From: QuentinBisson Date: Mon, 15 Apr 2024 12:00:21 +0200 Subject: [PATCH] Fix mixin generation when cluster label is changed Signed-off-by: QuentinBisson --- .../loki-mixin-compiled-ssd/alerts.yaml | 80 ++++++++--------- production/loki-mixin-compiled-ssd/rules.yaml | 90 ++++++++----------- production/loki-mixin-compiled/alerts.yaml | 80 ++++++++--------- production/loki-mixin-compiled/rules.yaml | 90 ++++++++----------- production/loki-mixin/alerts.libsonnet | 8 +- .../loki-canary-dashboard.libsonnet | 48 +++++----- .../loki-mixin/dashboards/loki-logs.libsonnet | 6 +- .../dashboards/loki-operational.libsonnet | 26 ++++-- .../dashboards/loki-reads.libsonnet | 3 +- .../dashboards/loki-writes.libsonnet | 3 +- .../dashboards/recording-rules.libsonnet | 2 +- 11 files changed, 210 insertions(+), 226 deletions(-) diff --git a/production/loki-mixin-compiled-ssd/alerts.yaml b/production/loki-mixin-compiled-ssd/alerts.yaml index 77f285b99c06..af06880cf698 100644 --- a/production/loki-mixin-compiled-ssd/alerts.yaml +++ b/production/loki-mixin-compiled-ssd/alerts.yaml @@ -1,41 +1,41 @@ groups: -- name: loki_alerts - rules: - - alert: LokiRequestErrors - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) - / - sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - message: | - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. - expr: | - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - expr: | - cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 - for: 15m - labels: - severity: critical - - alert: LokiTooManyCompactorsRunning - annotations: - message: | - {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. - expr: | - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 - for: 5m - labels: - severity: warning + - name: loki_alerts + rules: + - alert: LokiRequestErrors + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + expr: | + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) + / + sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) + > 10 + for: 15m + labels: + severity: critical + - alert: LokiRequestPanics + annotations: + message: | + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + expr: | + sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + labels: + severity: critical + - alert: LokiRequestLatency + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. 
+ expr: | + cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + for: 15m + labels: + severity: critical + - alert: LokiTooManyCompactorsRunning + annotations: + message: | + {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. + expr: | + sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 + for: 5m + labels: + severity: warning diff --git a/production/loki-mixin-compiled-ssd/rules.yaml b/production/loki-mixin-compiled-ssd/rules.yaml index 2a54ed4fb2e5..5893770570f6 100644 --- a/production/loki-mixin-compiled-ssd/rules.yaml +++ b/production/loki-mixin-compiled-ssd/rules.yaml @@ -1,53 +1,39 @@ groups: -- name: loki_rules - rules: - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) - by (cluster, job) - record: cluster_job:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, - route) - record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: 
cluster_namespace_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate + - name: loki_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: 
cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate diff --git a/production/loki-mixin-compiled/alerts.yaml b/production/loki-mixin-compiled/alerts.yaml index 77f285b99c06..af06880cf698 100644 --- a/production/loki-mixin-compiled/alerts.yaml +++ b/production/loki-mixin-compiled/alerts.yaml @@ -1,41 +1,41 @@ groups: -- name: loki_alerts - rules: - - alert: LokiRequestErrors - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. - expr: | - 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) - / - sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) - > 10 - for: 15m - labels: - severity: critical - - alert: LokiRequestPanics - annotations: - message: | - {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. - expr: | - sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 - labels: - severity: critical - - alert: LokiRequestLatency - annotations: - message: | - {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. - expr: | - cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 - for: 15m - labels: - severity: critical - - alert: LokiTooManyCompactorsRunning - annotations: - message: | - {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. - expr: | - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 - for: 5m - labels: - severity: warning + - name: loki_alerts + rules: + - alert: LokiRequestErrors + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors. + expr: | + 100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route) + / + sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route) + > 10 + for: 15m + labels: + severity: critical + - alert: LokiRequestPanics + annotations: + message: | + {{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics. + expr: | + sum(increase(loki_panic_total[10m])) by (namespace, job) > 0 + labels: + severity: critical + - alert: LokiRequestLatency + annotations: + message: | + {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency. + expr: | + cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1 + for: 15m + labels: + severity: critical + - alert: LokiTooManyCompactorsRunning + annotations: + message: | + {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. 
+ expr: | + sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 + for: 5m + labels: + severity: warning diff --git a/production/loki-mixin-compiled/rules.yaml b/production/loki-mixin-compiled/rules.yaml index 2a54ed4fb2e5..5893770570f6 100644 --- a/production/loki-mixin-compiled/rules.yaml +++ b/production/loki-mixin-compiled/rules.yaml @@ -1,53 +1,39 @@ groups: -- name: loki_rules - rules: - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job)) - record: cluster_job:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) - by (cluster, job) - record: cluster_job:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) - record: cluster_job:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) - record: cluster_job:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, job, route)) - record: cluster_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, - route) - record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) - record: cluster_job_route:loki_request_duration_seconds_count:sum_rate - - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile - - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) - by (le, cluster, namespace, job, route)) - record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, - namespace, job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds:avg - - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate - - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate - - expr: 
sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, - job, route) - record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate + - name: loki_rules + rules: + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job)) + record: cluster_job:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job) + record: cluster_job:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job) + record: cluster_job:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route)) + record: cluster_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, job, route) + record: cluster_job_route:loki_request_duration_seconds_count:sum_rate + - expr: histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:99quantile + - expr: histogram_quantile(0.50, sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route)) + record: cluster_namespace_job_route:loki_request_duration_seconds:50quantile + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds:avg + - expr: sum(rate(loki_request_duration_seconds_bucket[1m])) by (le, cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_bucket:sum_rate + - expr: sum(rate(loki_request_duration_seconds_sum[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_sum:sum_rate + - expr: sum(rate(loki_request_duration_seconds_count[1m])) by (cluster, namespace, job, route) + record: cluster_namespace_job_route:loki_request_duration_seconds_count:sum_rate diff --git a/production/loki-mixin/alerts.libsonnet 
b/production/loki-mixin/alerts.libsonnet index 0045cc194ba3..e38673d728dc 100644 --- a/production/loki-mixin/alerts.libsonnet +++ b/production/loki-mixin/alerts.libsonnet @@ -54,16 +54,16 @@ { alert: 'LokiTooManyCompactorsRunning', expr: ||| - sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1 - |||, + sum(loki_boltdb_shipper_compactor_running) by (namespace, %s) > 1 + ||| % $._config.per_cluster_label, 'for': '5m', labels: { severity: 'warning', }, annotations: { - message: ||| + message: std.strReplace(||| {{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time. - |||, + |||, 'cluster', $._config.per_cluster_label), }, }, ], diff --git a/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet b/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet index 6539a34d77e4..94e07deb236b 100644 --- a/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet +++ b/production/loki-mixin/dashboards/loki-canary-dashboard.libsonnet @@ -24,8 +24,8 @@ local grafana = import 'grafonnet/grafana.libsonnet'; // This logic is inherited from mimir-mixin. dashboard.dashboard('Canary') // We can't make use of simplified template selectors from the loki dashboard utils until we port the cortex dashboard utils panel/grid functionality. - .addTemplate('cluster', 'loki_build_info', 'cluster') - .addTemplate('namespace', 'loki_build_info{cluster=~"$cluster"}', 'namespace') + .addTemplate('cluster', 'loki_build_info', $._config.per_cluster_label) + .addTemplate('namespace', 'loki_build_info{' + $._config.per_cluster_label + '=~"$cluster"}', 'namespace') + { // This dashboard uses the new grid system in order to place panels (using gridPos). // Because of this we can't use the mixin's addRow() and addPanel(). 
@@ -33,7 +33,7 @@ local grafana = import 'grafonnet/grafana.libsonnet';
     rows: null,
     // ugly hack, copy pasta the tag/link
     // code from the loki-mixin
-    tags: ['loki'],
+    tags: $._config.tags,
     links: [
       {
         asDropdown: true,
@@ -49,60 +49,60 @@ local grafana = import 'grafonnet/grafana.libsonnet';
     panels: [
       // grid row 1
       dashboard.panel('Canary Entries Total') +
-      dashboard.newStatPanel('sum(count(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}))', unit='short') +
+      dashboard.newStatPanel('sum(count(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster", namespace=~"$namespace"}))', unit='short') +
       { gridPos: { h: 4, w: 3, x: 0, y: 0 } },

       dashboard.panel('Canary Logs Total') +
-      dashboard.newStatPanel('sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
+      dashboard.newStatPanel('sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
       { gridPos: { h: 4, w: 3, x: 3, y: 0 } },

       dashboard.panel('Missing') +
-      dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
+      dashboard.newStatPanel('sum(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
       { gridPos: { h: 4, w: 3, x: 6, y: 0 } },

       dashboard.panel('Spotcheck Missing') +
-      dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
+      dashboard.newStatPanel('sum(increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
       { gridPos: { h: 4, w: 3, x: 9, y: 0 } },

       // grid row 2
       dashboard.panel('Spotcheck Total') +
-      dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
+      dashboard.newStatPanel('sum(increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
       { gridPos: { h: 4, w: 3, x: 0, y: 4 } },

       dashboard.panel('Metric Test Error %') +
-      dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"}))) * 100') +
+      dashboard.newStatPanel('((sum(loki_canary_metric_test_expected{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}) - sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}))/(sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}))) * 100') +
       { gridPos: { h: 4, w: 3, x: 3, y: 4 } },

       dashboard.panel('Missing %') +
-      dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
+      dashboard.newStatPanel('(sum(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
       { gridPos: { h: 4, w: 3, x: 6, y: 4 } },

       dashboard.panel('Spotcheck Missing %') +
-      dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') +
+      dashboard.newStatPanel('(sum(increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))) * 100') +
       { gridPos: { h: 4, w: 3, x: 9, y: 4 } },

       // grid row 3
       dashboard.panel('Metric Test Expected') +
-      dashboard.newStatPanel('sum(loki_canary_metric_test_expected{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') +
+      dashboard.newStatPanel('sum(loki_canary_metric_test_expected{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"})', unit='short') +
       { gridPos: { h: 4, w: 3, x: 0, y: 8 } },

       dashboard.panel('Metric Test Actual') +
-      dashboard.newStatPanel('sum(loki_canary_metric_test_actual{cluster=~"$cluster",namespace=~"$namespace"})', unit='short') +
+      dashboard.newStatPanel('sum(loki_canary_metric_test_actual{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"})', unit='short') +
       { gridPos: { h: 4, w: 3, x: 3, y: 8 } },

       dashboard.panel('Websocket Missing') +
-      dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
+      dashboard.newStatPanel('sum(increase(loki_canary_websocket_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))', unit='short') +
       { gridPos: { h: 4, w: 3, x: 6, y: 8 } },

       dashboard.panel('Websocket Missing %') +
-      dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
+      dashboard.newStatPanel('(sum(increase(loki_canary_websocket_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range]))/sum(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__range])))*100') +
       { gridPos: { h: 4, w: 3, x: 9, y: 8 } },

       // end of grid
       dashboard.panel('Log Write to read Latency Percentiles') +
       dashboard.queryPanel([
-        'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
-        'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
+        'histogram_quantile(0.95, sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
+        'histogram_quantile(0.50, sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
       ], ['p95', 'p50']) +
       { gridPos: { h: 6, w: 12, x: 12, y: 0 } },
@@ -115,7 +115,7 @@ local grafana = import 'grafonnet/grafana.libsonnet';
       ).addTargets(
         [
           grafana.prometheus.target(
-            'sum(rate(loki_canary_response_latency_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)',
+            'sum(rate(loki_canary_response_latency_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le)',
             legendFormat='{{le}}',
             format='heatmap',
           ),
@@ -125,24 +125,24 @@ local grafana = import 'grafonnet/grafana.libsonnet';
       dashboard.panel('Spot Check Query') +
       dashboard.queryPanel([
-        'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
-        'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
+        'histogram_quantile(0.99, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
+        'histogram_quantile(0.50, sum(rate(loki_canary_spot_check_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) by (le))',
       ], ['p99', 'p95']) +
       { gridPos: { h: 6, w: 12, x: 0, y: 14 } },

       dashboard.panel('Metric Test Query') +
       dashboard.queryPanel([
-        'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
-        'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{cluster=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
+        'histogram_quantile(0.99, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
+        'histogram_quantile(0.50, sum(rate(loki_canary_metric_test_request_duration_seconds_bucket{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[15m])) by (le))',
       ], ['p99', 'p95'],) +
       { gridPos: { h: 6, w: 12, x: 12, y: 14 } },

       dashboard.panel('Spot Check Missing %') +
-      dashboard.queryPanel('topk(20, (sum by (cluster, pod) (increase(loki_canary_spot_check_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod) (increase(loki_canary_spot_check_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') +
+      dashboard.queryPanel('topk(20, (sum by (' + $._config.per_cluster_label + ', pod) (increase(loki_canary_spot_check_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (' + $._config.per_cluster_label + ', pod) (increase(loki_canary_spot_check_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])) * 100)) > 0', '') +
       { gridPos: { h: 6, w: 12, x: 0, y: 20 } },

       g.panel('Missing logs') +
-      g.queryPanel('topk(20,(sum by (cluster, pod)(increase(loki_canary_missing_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (cluster, pod)(increase(loki_canary_entries_total{cluster=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ cluster }} {{ pod }}') +
+      g.queryPanel('topk(20,(sum by (' + $._config.per_cluster_label + ', pod)(increase(loki_canary_missing_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval]))/sum by (' + $._config.per_cluster_label + ', pod)(increase(loki_canary_entries_total{' + $._config.per_cluster_label + '=~"$cluster",namespace=~"$namespace"}[$__rate_interval])))*100) > 0', 'Missing {{ ' + $._config.per_cluster_label + ' }} {{ pod }}') +
       { gridPos: { h: 6, w: 12, x: 12, y: 20 } },
     ],
diff --git a/production/loki-mixin/dashboards/loki-logs.libsonnet b/production/loki-mixin/dashboards/loki-logs.libsonnet
index 9fd6eee58950..b28d74e94366 100644
--- a/production/loki-mixin/dashboards/loki-logs.libsonnet
+++ b/production/loki-mixin/dashboards/loki-logs.libsonnet
@@ -48,7 +48,6 @@ local template = import 'grafonnet/template.libsonnet';
       local cfg = self,

       showMultiCluster:: true,
-      clusterLabel:: $._config.per_cluster_label,
     }
     + lokiLogs
     + $.dashboard('Loki / Logs', uid='logs')
@@ -61,8 +60,9 @@ local template = import 'grafonnet/template.libsonnet';
         p {
           targets: [
             e {
-              expr: if dashboards['loki-logs.json'].showMultiCluster then super.expr
-              else std.strReplace(super.expr, $._config.per_cluster_label + '="$cluster", ', ''),
+              expr: if dashboards['loki-logs.json'].showMultiCluster
+                then std.strReplace(super.expr, 'cluster="$cluster"', $._config.per_cluster_label + '="$cluster"')
+                else std.strReplace(super.expr, 'cluster="$cluster", ', ''),
             }
             for e in p.targets
           ],
diff --git a/production/loki-mixin/dashboards/loki-operational.libsonnet b/production/loki-mixin/dashboards/loki-operational.libsonnet
index e8f5d9824874..e20d7dc2d562 100644
--- a/production/loki-mixin/dashboards/loki-operational.libsonnet
+++ b/production/loki-mixin/dashboards/loki-operational.libsonnet
@@ -11,7 +11,6 @@ local utils = import 'mixin-utils/utils.libsonnet';
     showAnnotations:: true,
     showLinks:: true,
     showMultiCluster:: true,
-    clusterLabel:: $._config.per_cluster_label,

     hiddenRows:: [
       'Cassandra',
@@ -62,7 +61,22 @@ local utils = import 'mixin-utils/utils.libsonnet';
     local replaceClusterMatchers(expr) =
       if dashboards['loki-operational.json'].showMultiCluster
-      then expr
+      // Replace the recording rules cluster label with the per-cluster label
+      then std.strReplace(
+        // Replace the cluster label for equality matchers with the per-cluster label
+        std.strReplace(
+          // Replace the cluster label for regex matchers with the per-cluster label
+          std.strReplace(
+            expr,
+            'cluster=~"$cluster"',
+            $._config.per_cluster_label + '=~"$cluster"'
+          ),
+          'cluster="$cluster"',
+          $._config.per_cluster_label + '="$cluster"'
+        ),
+        'cluster_',
+        $._config.per_cluster_label + '_'
+      )
       else
         std.strReplace(
           std.strReplace(
@@ -143,7 +157,7 @@ local utils = import 'mixin-utils/utils.libsonnet';

     local replaceAllMatchers(expr) =
-      replaceMatchers(replaceClusterMatchers(expr)),
+      replaceMatchers(expr),

     local selectDatasource(ds) =
       if ds == null || ds == '' then ds
@@ -179,7 +193,7 @@ local utils = import 'mixin-utils/utils.libsonnet';
         datasource: selectDatasource(super.datasource),
         targets: if std.objectHas(p, 'targets') then [
           e {
-            expr: removeInternalComponents(p.title, e.expr),
+            expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)),
           }
           for e in p.targets
         ] else [],
@@ -188,7 +202,7 @@
           datasource: selectDatasource(super.datasource),
           targets: if std.objectHas(sp, 'targets') then [
             e {
-              expr: removeInternalComponents(p.title, e.expr),
+              expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)),
             }
             for e in sp.targets
           ] else [],
@@ -197,7 +211,7 @@
             datasource: selectDatasource(super.datasource),
             targets: if std.objectHas(ssp, 'targets') then [
               e {
-                expr: removeInternalComponents(p.title, e.expr),
+                expr: removeInternalComponents(p.title, replaceClusterMatchers(e.expr)),
               }
               for e in ssp.targets
             ] else [],
diff --git a/production/loki-mixin/dashboards/loki-reads.libsonnet b/production/loki-mixin/dashboards/loki-reads.libsonnet
index 3da4e200e1ab..0b636fa5e6eb 100644
--- a/production/loki-mixin/dashboards/loki-reads.libsonnet
+++ b/production/loki-mixin/dashboards/loki-reads.libsonnet
@@ -31,10 +31,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
    local cfg = self,

    showMultiCluster:: true,
-   clusterLabel:: $._config.per_cluster_label,
    clusterMatchers::
      if cfg.showMultiCluster then
-       [utils.selector.re(cfg.clusterLabel, '$cluster')]
+       [utils.selector.re($._config.per_cluster_label, '$cluster')]
      else
        [],
diff --git a/production/loki-mixin/dashboards/loki-writes.libsonnet b/production/loki-mixin/dashboards/loki-writes.libsonnet
index bedb9ca10825..8227cc383492 100644
--- a/production/loki-mixin/dashboards/loki-writes.libsonnet
+++ b/production/loki-mixin/dashboards/loki-writes.libsonnet
@@ -9,10 +9,9 @@ local utils = import 'mixin-utils/utils.libsonnet';
    local cfg = self,

    showMultiCluster:: true,
-   clusterLabel:: $._config.per_cluster_label,
    clusterMatchers::
      if cfg.showMultiCluster then
-       [utils.selector.re(cfg.clusterLabel, '$cluster')]
+       [utils.selector.re($._config.per_cluster_label, '$cluster')]
      else
        [],
diff --git a/production/loki-mixin/dashboards/recording-rules.libsonnet b/production/loki-mixin/dashboards/recording-rules.libsonnet
index 2d943807c648..46618da952dc 100644
--- a/production/loki-mixin/dashboards/recording-rules.libsonnet
+++ b/production/loki-mixin/dashboards/recording-rules.libsonnet
@@ -7,7 +7,7 @@ local template = import 'grafonnet/template.libsonnet';
     template.new(
       'tenant',
       '$datasource',
-      'query_result(sum by (id) (grafanacloud_logs_instance_info) and sum(label_replace(loki_tenant:active_streams{cluster="$cluster",namespace="$namespace"},"id","$1","tenant","(.*)")) by(id))',
+      'query_result(sum by (id) (grafanacloud_logs_instance_info) and sum(label_replace(loki_tenant:active_streams{' + $._config.per_cluster_label + '="$cluster",namespace="$namespace"},"id","$1","tenant","(.*)")) by(id))',
      regex='/"([^"]+)"/',
      sort=1,
      includeAll=true,