Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add a version of the mixin dashboards for meta monitoring #12700

Merged
merged 10 commits into from
Apr 26, 2024
6 changes: 6 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,7 @@ clients/cmd/promtail/promtail-debug:
MIXIN_PATH := production/loki-mixin
MIXIN_OUT_PATH := production/loki-mixin-compiled
MIXIN_OUT_PATH_SSD := production/loki-mixin-compiled-ssd
MIXIN_OUT_PATH_META_MONITORING := production/loki-mixin-compiled-meta-monitoring

loki-mixin: ## compile the loki mixin
ifeq ($(BUILD_IN_CONTAINER),true)
Expand All @@ -273,12 +274,17 @@ else
@rm -rf $(MIXIN_OUT_PATH_SSD) && mkdir $(MIXIN_OUT_PATH_SSD)
@cd $(MIXIN_PATH) && jb install
@mixtool generate all --output-alerts $(MIXIN_OUT_PATH_SSD)/alerts.yaml --output-rules $(MIXIN_OUT_PATH_SSD)/rules.yaml --directory $(MIXIN_OUT_PATH_SSD)/dashboards ${MIXIN_PATH}/mixin-ssd.libsonnet

@rm -rf $(MIXIN_OUT_PATH_META_MONITORING) && mkdir $(MIXIN_OUT_PATH_META_MONITORING)
@cd $(MIXIN_PATH) && jb install
@mixtool generate all --output-alerts $(MIXIN_OUT_PATH_META_MONITORING)/alerts.yaml --output-rules $(MIXIN_OUT_PATH_META_MONITORING)/rules.yaml --directory $(MIXIN_OUT_PATH_META_MONITORING)/dashboards ${MIXIN_PATH}/mixin-meta-monitoring.libsonnet
endif

loki-mixin-check: loki-mixin ## check the loki mixin is up to date
@echo "Checking diff"
@git diff --exit-code -- $(MIXIN_OUT_PATH) || (echo "Please build mixin by running 'make loki-mixin'" && false)
@git diff --exit-code -- $(MIXIN_OUT_PATH_SSD) || (echo "Please build mixin by running 'make loki-mixin'" && false)
@git diff --exit-code -- $(MIXIN_OUT_PATH_META_MONITORING) || (echo "Please build mixin by running 'make loki-mixin'" && false)

###############
# Migrate #
Expand Down
45 changes: 45 additions & 0 deletions production/loki-mixin-compiled-meta-monitoring/alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
groups:
- name: loki_alerts
rules:
- alert: LokiRequestErrors
annotations:
description: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}% errors.
summary: Loki request error rate is high.
expr: |
100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[2m])) by (namespace, job, route)
/
sum(rate(loki_request_duration_seconds_count[2m])) by (namespace, job, route)
> 10
for: 15m
labels:
severity: critical
- alert: LokiRequestPanics
annotations:
description: |
{{ $labels.job }} is experiencing {{ printf "%.2f" $value }}% increase of panics.
summary: Loki requests are causing code panics.
expr: |
sum(increase(loki_panic_total[10m])) by (namespace, job) > 0
labels:
severity: critical
- alert: LokiRequestLatency
annotations:
description: |
{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf "%.2f" $value }}s 99th percentile latency.
summary: Loki request error latency is high.
expr: |
cluster_namespace_job_route:loki_request_duration_seconds:99quantile{route!~"(?i).*tail.*|/schedulerpb.SchedulerForQuerier/QuerierLoop"} > 1
for: 15m
labels:
severity: critical
- alert: LokiTooManyCompactorsRunning
annotations:
description: |
{{ $labels.cluster }} {{ $labels.namespace }} has had {{ printf "%.0f" $value }} compactors running for more than 5m. Only one compactor should run at a time.
summary: Loki deployment is running more than one compactor.
expr: |
sum(loki_boltdb_shipper_compactor_running) by (namespace, cluster) > 1
for: 5m
labels:
severity: warning
Loading
Loading