Commit: add prometheus module

Mic92 committed Aug 14, 2023
1 parent b7407c2 commit 80617bf
Showing 3 changed files with 360 additions and 0 deletions.
1 change: 1 addition & 0 deletions nixos/default.nix
@@ -55,5 +55,6 @@ exposeModules ./. [
./mixins/trusted-nix-caches.nix
./roles/github-actions-runner.nix
./roles/nix-remote-builder.nix
./roles/prometheus
./server
]
229 changes: 229 additions & 0 deletions nixos/roles/prometheus/default-alerts.nix
@@ -0,0 +1,229 @@
{
srvos.prometheus.ruleGroups.srvosPrometheusAlerts = {
alertRules = {
MonitoringTooManyRestarts = {
expr = "changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager|telegraf\"}[15m]) > 2";
annotations.description = "Service has restarted more than twice in the last 15 minutes. It might be crashlooping";
};

AlertManagerConfigNotSynced = {
expr = "count(count_values(\"config_hash\", alertmanager_config_hash)) > 1";
annotations.description = "Configurations of AlertManager cluster instances are out of sync";
};

PrometheusNotConnectedToAlertmanager = {
expr = "prometheus_notifications_alertmanagers_discovered < 1";
annotations.description = "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
};

PrometheusRuleEvaluationFailures = {
expr = "increase(prometheus_rule_evaluation_failures_total[3m]) > 0";
annotations.description = "Prometheus encountered {{ $value }} rule evaluation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
};

PrometheusTemplateExpansionFailures = {
expr = "increase(prometheus_template_text_expansion_failures_total[3m]) > 0";
for = "0m";
annotations.description = "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
};

PromtailRequestsErrors = {
expr = "100 * sum(rate(promtail_request_duration_seconds_count{status_code=~\"5..|failed\"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10";
for = "15m";
annotations.description = "{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors";
};

PromtailFileLagging = {
expr = "abs(promtail_file_bytes_total - promtail_read_bytes_total) > 1e6";
for = "15m";
annotations.description = "{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m";
};

Filesystem80percentFull = {
expr = ''disk_used_percent{mode!="ro"} >= 80'';
for = "10m";
annotations.description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 20% space left on its filesystem";
};

FilesystemInodesFull = {
expr = ''disk_inodes_free / disk_inodes_total < 0.10'';
time = "10m";
annotations.description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 10% inodes left on its filesystem";
};

DailyTaskNotRun = {
expr = ''time() - task_last_run{state="ok",frequency="daily"} > (24 + 6) * 60 * 60'';
annotations.description = "{{$labels.host}}: {{$labels.name}} was not run in the last 24h";
};

TenMinutesTaskNotRun = {
# NOTE: the original expression duplicated the daily alert; the frequency label value here is an assumption.
expr = ''time() - task_last_run{state="ok",frequency="ten_minutes"} > 15 * 60'';
annotations.description = "{{$labels.host}}: {{$labels.name}} was not run in the last 15 minutes";
};

TaskFailed = {
expr = ''task_last_run{state="fail"}'';
annotations.description = "{{$labels.host}}: {{$labels.name}} failed to run";
};

NixpkgsOutOfDate = {
expr = ''(time() - flake_input_last_modified{input="nixpkgs",host!="matchbox"}) / (60*60*24) > 7'';
annotations.description = "{{$labels.host}}: nixpkgs flake is older than a week";
};

SwapUsing30Percent = {
expr = ''mem_swap_total{host!="eva"} - (mem_swap_cached + mem_swap_free) > mem_swap_total * 0.3'';
for = "30m";
annotations.description = "{{$labels.host}} is using 30% of its swap space for at least 30 minutes";
};

# user@$uid.service and similar sometimes fail, we don't care about those services.
SystemdServiceFailed = {
expr = ''systemd_units_active_code{name!~"user@\\d+.service"} == 3'';
annotations.description = "{{$labels.host}} failed to (re)start service {{$labels.name}}";
};

NfsExportNotPresent = {
expr = "nfs_export_present == 0";
for = "1h";
annotations.description = "{{$labels.host}} cannot reach nfs export [{{$labels.server}}]:{{$labels.path}}";
};

RamUsing95Percent = {
expr = "mem_buffered + mem_free + mem_cached < mem_total * 0.05";
for = "1h";
annotations.description = "{{$labels.host}} is using at least 95% of its RAM for at least 1 hour";
};

Load15 = {
expr = ''system_load15 / system_n_cpus{org!="nix-community"} >= 2.0'';
for = "10m";
annotations.description = "{{$labels.host}} is running with load15 > 1 for at least 5 minutes: {{$value}}";
};

Reboot = {
expr = "system_uptime < 300";
annotations.description = "{{$labels.host}} just rebooted";
};

Uptime = {
expr = "system_uptime > 2592000";
annotations.description = "{{$labels.host}} has been up for more than 30 days";
};

TelegrafDown = {
expr = "min(up{job=~\"telegraf\",type!='mobile'}) by (source, job, instance, org) == 0";
time = "3m";
annotations.description = "{{$labels.instance}}: {{$labels.job}} telegraf exporter from {{$labels.source}} is down";
};

Ping = {
expr = "ping_result_code{type!='mobile'} != 0";
annotations.description = "{{$labels.url}}: ping from {{$labels.instance}} has failed";
};

PingHighLatency = {
expr = "ping_average_response_ms{type!='mobile'} > 5000";
annotations.description = "{{$labels.instance}}: ping probe from {{$labels.source}} is encountering high latency";
};

Http = {
expr = "http_response_result_code != 0";
annotations.description = "{{$labels.url}}: http request from {{$labels.instance}} has failed";
};

HttpMatchFailed = {
expr = "http_response_response_string_match == 0";
annotations.description = "{{$labels.url}}: http body not as expected; status code: {{$labels.status_code}}";
};

DnsQuery = {
expr = "dns_query_result_code != 0";
annotations.description = "{{$labels.domain}} : could retrieve A record {{$labels.instance}} from server {{$labels.server}}: {{$labels.result}}";
};

SecureDnsQuery = {
expr = "secure_dns_state != 0";
annotations.description = "{{$labels.domain}} : could retrieve A record {{$labels.instance}} from server {{$labels.server}}: {{$labels.result}} for protocol {{$labels.protocol}}";
};

ConnectionFailed = {
expr = "net_response_result_code != 0";
annotations.description = "{{$labels.server}}: connection to {{$labels.port}}({{$labels.protocol}}) failed from {{$labels.instance}}";
};

# https://healthchecks.io/
Healthchecks = {
expr = "hc_check_up == 0";
annotations.description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails";
};

CertExpiry = {
expr = "x509_cert_expiry < 7*24*3600";
annotations.description = "{{$labels.instance}}: The TLS certificate from {{$labels.source}} will expire in less than 7 days: {{$value}}s";
};

PostfixQueueLength = {
expr = "avg_over_time(postfix_queue_length[1h]) > 10";
annotations.description = "{{$labels.instance}}: postfix mail queue has undelivered {{$value}} items";
};

ZfsErrors = {
expr = "zfs_arcstats_l2_io_error + zfs_dmu_tx_error + zfs_arcstats_l2_writes_error > 0";
annotations.description = "{{$labels.instance}} reports: {{$value}} ZFS IO errors";
};

ZpoolErrors = {
expr = "zpool_status_errors > 0";
annotations.description = "{{$labels.instance}} reports: zpool {{$labels.name}} has {{$value}} errors";
};

MdRaidDegradedDisks = {
expr = "mdstat_degraded_disks > 0";
annotations.description = "{{$labels.instance}}: raid {{$labels.dev}} has failed disks";
};

# Ignore devices that have S.M.A.R.T. disabled (for example, when attached via USB).
# Also ignore the nix-community CI server until its NVMe actually fails.
SmartErrors = {
expr = ''smart_device_health_ok{enabled!="Disabled"} != 1'';
annotations.description = "{{$labels.instance}}: S.M.A.R.T. reports: {{$labels.device}} ({{$labels.model}}) has errors";
};

OomKills = {
expr = "increase(kernel_vmstat_oom_kill[5m]) > 0";
annotations.description = "{{$labels.instance}}: OOM kill detected";
};

UnusualDiskReadLatency = {
expr = "rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0";
annotations.description = "{{$labels.instance}}: Disk latency is growing (read operations > 100ms)";
};

UnusualDiskWriteLatency = {
expr = "rate(diskio_write_time[1m]) / rate(diskio_write[1m]) > 0.1 and rate(diskio_write[1m]) > 0";
annotations.description = "{{$labels.instance}}: Disk latency is growing (write operations > 100ms)";
};

Ipv6DadCheck = {
expr = "ipv6_dad_failures_count > 0";
annotations.description = "{{$labels.host}}: {{$value}} assigned ipv6 addresses have failed duplicate address check";
};

HostMemoryUnderMemoryPressure = {
expr = "rate(node_vmstat_pgmajfault[1m]) > 1000";
annotations.description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
};

Ext4Errors = {
expr = "ext4_errors_value > 0";
annotations.description = "{{$labels.instance}}: ext4 has reported {{$value}} I/O errors: check /sys/fs/ext4/*/errors_count";
};

AlertmanagerSilencesChanged = {
expr = "abs(delta(alertmanager_silences{state=\"active\"}[1h])) >= 1";
annotations.description = "alertmanager: number of active silences has changed: {{$value}}";
};
};
};
}
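
Every alert above flows through the per-rule `enable` flag defined by the module in default.nix below, so a downstream configuration can switch individual default alerts off or re-tune them without forking this file. A minimal sketch of such an override; the choice of `Uptime` and the `30m` value are illustrative, not part of this commit:

{ lib, ... }:
{
  # Assumes the nixos/roles/prometheus module is already imported.
  srvos.prometheus.ruleGroups.srvosPrometheusAlerts.alertRules = {
    # Silence the 30-day uptime reminder on this host.
    Uptime.enable = false;
    # Allow sustained load for longer before Load15 fires (overrides the "10m" above).
    Load15.for = lib.mkForce "30m";
  };
}
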
130 changes: 130 additions & 0 deletions nixos/roles/prometheus/default.nix
@@ -0,0 +1,130 @@
{ lib, pkgs, config, ... }:
let
filterEnabled = lib.filterAttrs (n: v: v.enable);
rules = lib.mapAttrsToList
(name: group: {
inherit name;
rules =
(lib.mapAttrsToList
(name: rule: {
alert = rule.name;
expr = rule.expr;
for = rule.for;
labels = rule.labels;
annotations = rule.annotations;
})
(filterEnabled group.alertRules)) ++
# Prometheus recording rules support only the record, expr and labels fields.
(lib.mapAttrsToList
(name: rule: {
record = rule.name;
expr = rule.expr;
labels = rule.labels;
})
(filterEnabled group.recordingRules));
})
(filterEnabled config.srvos.prometheus.ruleGroups);
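# For reference, `rules` evaluates to a list in the shape of Prometheus'
# rule-file schema (rendered to YAML in the config section below). A sketch of
# one generated entry, derived from the default alerts shipped above:
#
#   - name: srvosPrometheusAlerts
#     rules:
#       - alert: Reboot
#         expr: system_uptime < 300
#         for: 2m
#         labels: {}
#         annotations:
#           description: '{{$labels.host}} just rebooted'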
in
{
imports = [
./default-alerts.nix
];
options = {
# XXX: we may eventually move this upstream to nixpkgs; expect this interface to be replaced with the upstream equivalent.
srvos.prometheus.ruleGroups = lib.mkOption {
type = lib.types.attrsOf (lib.types.submodule ({ name, ... }: {
name = lib.mkOption {
type = lib.types.str;
default = name;
};
enable = lib.mkEnableOption (lib.mdDoc "this rule group") // { default = true; };
alertRules = lib.mkOption {
type = lib.types.attrsOf (lib.types.submodule ({ name, ... }: {
name = lib.mkOption {
type = lib.types.str;
default = name;
};
enable = lib.mkEnableOption (lib.mdDoc "this alert rule") // { default = true; };
expr = lib.mkOption {
type = lib.types.str;
};
for = lib.mkOption {
type = lib.types.str;
default = "2m";
};
labels = lib.mkOption {
type = lib.types.attrsOf lib.types.str;
default = { };
};
annotations = lib.mkOption {
type = lib.types.attrsOf lib.types.str;
default = { };
};
}));
default = { };
};
recordingRules = lib.mkOption {
type = lib.types.attrsOf (lib.types.submodule ({ name, ... }: {
name = lib.mkOption {
type = lib.types.str;
default = name;
};
enable = lib.mkEnableOption (lib.mdDoc "this recording rule") // { default = true; };
expr = lib.mkOption {
type = lib.types.str;
};
labels = lib.mkOption {
type = lib.types.attrsOf lib.types.str;
default = { };
};
}));
default = { };
};
}));
example = {
prometheusAlerts = {
alertRules = {
ExampleAlert = {
expr = "up == 0";
for = "2m";
labels = {
severity = "critical";
};
annotations = {
summary = "Instance {{ $labels.instance }} down";
description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.";
};
};
};
recordingRules = {
RecordingExample = {
expr = "up";
labels = {
severity = "critical";
};
};
};
};
};
};
};
config = {
services.prometheus = {
enable = lib.mkDefault true;
# checks fail because of missing secrets in the sandbox
checkConfig = false;
ruleFiles = [ (pkgs.writers.writeYAML "rules.yaml" { groups = rules; }) ];
};
};
}
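
Beyond the built-in alerts, the options above let a consumer register site-specific rule groups next to the shipped defaults. A hedged sketch of such a consumer module; the group name, job label, and thresholds are invented for illustration:

{ ... }:
{
  srvos.prometheus.ruleGroups.myServiceAlerts = {
    alertRules.MyServiceDown = {
      expr = ''up{job="my-service"} == 0'';
      for = "5m";
      labels.severity = "critical";
      annotations.description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes";
    };
    # Recording rules precompute an expression; only record, expr and labels apply.
    recordingRules."job:up:avg" = {
      expr = "avg by (job) (up)";
    };
  };
}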
