Commit: add prometheus module

Mic92 committed Aug 14, 2023
1 parent b7407c2 commit 80617bf
Showing 3 changed files with 360 additions and 0 deletions.
1 change: 1 addition & 0 deletions nixos/default.nix
@@ -55,5 +55,6 @@ exposeModules ./. [
./mixins/trusted-nix-caches.nix
./roles/github-actions-runner.nix
./roles/nix-remote-builder.nix
./roles/prometheus
./server
]
229 changes: 229 additions & 0 deletions nixos/roles/prometheus/default-alerts.nix
@@ -0,0 +1,229 @@
{
srvos.prometheus.ruleGroups.srvosPrometheusAlerts = {
alertRules = {
MonitoringTooManyRestarts = {
expr = "changes(process_start_time_seconds{job=~\"prometheus|pushgateway|alertmanager|telegraf\"}[15m]) > 2";
annotations.description = "Service has restarted more than twice in the last 15 minutes. It might be crashlooping";
};

AlertManagerConfigNotSynced = {
expr = "count(count_values(\"config_hash\", alertmanager_config_hash)) > 1";
annotations.description = "Configurations of AlertManager cluster instances are out of sync";
};

PrometheusNotConnectedToAlertmanager = {
expr = "prometheus_notifications_alertmanagers_discovered < 1";
annotations.description = "Prometheus cannot connect the alertmanager\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
};

PrometheusRuleEvaluationFailures = {
expr = "increase(prometheus_rule_evaluation_failures_total[3m]) > 0";
annotations.description = "Prometheus encountered {{ $value }} rule evaluation failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
};

PrometheusTemplateExpansionFailures = {
expr = "increase(prometheus_template_text_expansion_failures_total[3m]) > 0";
for = "0m";
annotations.description = "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS = {{ $labels }}";
};

PromtailRequestsErrors = {
expr = "100 * sum(rate(promtail_request_duration_seconds_count{status_code=~\"5..|failed\"}[1m])) by (namespace, job, route, instance) / sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10";
for = "15m";
annotations.description = "{{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors";
};

PromtailFileLagging = {
expr = "abs(promtail_file_bytes_total - promtail_read_bytes_total) > 1e6";
for = "15m";
annotations.description = "{{ $labels.instance }} {{ $labels.job }} {{ $labels.path }} has been lagging by more than 1MB for more than 15m";
};

Filesystem80percentFull = {
expr = ''disk_used_percent{mode!="ro"} >= 80'';
for = "10m";
annotations.description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 20% space left on its filesystem";
};

FilesystemInodesFull = {
expr = ''disk_inodes_free / disk_inodes_total < 0.10'';
time = "10m";
annotations.description = "{{$labels.instance}} device {{$labels.device}} on {{$labels.path}} got less than 10% inodes left on its filesystem";
};

DailyTaskNotRun = {
expr = ''time() - task_last_run{state="ok",frequency="daily"} > (24 + 6) * 60 * 60'';
annotations.description = "{{$labels.host}}: {{$labels.name}} was not run in the last 24h";
};

TenMinutesTaskNotRun = {
# NOTE: the original expression duplicated the daily alert; the frequency label value here is an assumption.
expr = ''time() - task_last_run{state="ok",frequency="ten_minutes"} > 15 * 60'';
annotations.description = "{{$labels.host}}: {{$labels.name}} was not run in the last 15 minutes";
};

TaskFailed = {
expr = ''task_last_run{state="fail"}'';
annotations.description = "{{$labels.host}}: {{$labels.name}} failed to run";
};

NixpkgsOutOfDate = {
expr = ''(time() - flake_input_last_modified{input="nixpkgs",host!="matchbox"}) / (60*60*24) > 7'';
annotations.description = "{{$labels.host}}: nixpkgs flake is older than a week";
};

SwapUsing30Percent = {
expr = ''mem_swap_total{host!="eva"} - (mem_swap_cached + mem_swap_free) > mem_swap_total * 0.3'';
for = "30m";
annotations.description = "{{$labels.host}} is using 30% of its swap space for at least 30 minutes";
};

# user@$uid.service and similar sometimes fail, we don't care about those services.
SystemdServiceFailed = {
expr = ''systemd_units_active_code{name!~"user@\\d+.service"} == 3'';
annotations.description = "{{$labels.host}} failed to (re)start service {{$labels.name}}";
};

NfsExportNotPresent = {
expr = "nfs_export_present == 0";
for = "1h";
annotations.description = "{{$labels.host}} cannot reach nfs export [{{$labels.server}}]:{{$labels.path}}";
};

RamUsing95Percent = {
expr = "mem_buffered + mem_free + mem_cached < mem_total * 0.05";
for = "1h";
annotations.description = "{{$labels.host}} is using at least 95% of its RAM for at least 1 hour";
};

Load15 = {
expr = ''system_load15 / system_n_cpus{org!="nix-community"} >= 2.0'';
for = "10m";
annotations.description = "{{$labels.host}} is running with load15 > 1 for at least 5 minutes: {{$value}}";
};

Reboot = {
expr = "system_uptime < 300";
annotations.description = "{{$labels.host}} just rebooted";
};

Uptime = {
expr = "system_uptime > 2592000";
annotations.description = "{{$labels.host}} has been up for more than 30 days";
};

TelegrafDown = {
expr = "min(up{job=~\"telegraf\",type!='mobile'}) by (source, job, instance, org) == 0";
time = "3m";
annotations.description = "{{$labels.instance}}: {{$labels.job}} telegraf exporter from {{$labels.source}} is down";
};

Ping = {
expr = "ping_result_code{type!='mobile'} != 0";
annotations.description = "{{$labels.url}}: ping from {{$labels.instance}} has failed";
};

PingHighLatency = {
expr = "ping_average_response_ms{type!='mobile'} > 5000";
annotations.description = "{{$labels.instance}}: ping probe from {{$labels.source}} is encountering high latency";
};

Http = {
expr = "http_response_result_code != 0";
annotations.description = "{{$labels.url}}: http request from {{$labels.instance}} has failed";
};

HttpMatchFailed = {
expr = "http_response_response_string_match == 0";
annotations.description = "{{$labels.url}}: http body not as expected; status code: {{$labels.status_code}}";
};

DnsQuery = {
expr = "dns_query_result_code != 0";
annotations.description = "{{$labels.domain}} : could retrieve A record {{$labels.instance}} from server {{$labels.server}}: {{$labels.result}}";
};

SecureDnsQuery = {
expr = "secure_dns_state != 0";
annotations.description = "{{$labels.domain}} : could retrieve A record {{$labels.instance}} from server {{$labels.server}}: {{$labels.result}} for protocol {{$labels.protocol}}";
};

ConnectionFailed = {
expr = "net_response_result_code != 0";
annotations.description = "{{$labels.server}}: connection to {{$labels.port}}({{$labels.protocol}}) failed from {{$labels.instance}}";
};

# https://healthchecks.io/
Healthchecks = {
expr = "hc_check_up == 0";
annotations.description = "{{$labels.instance}}: healtcheck {{$labels.job}} fails";
};

CertExpiry = {
expr = "x509_cert_expiry < 7*24*3600";
annotations.description = "{{$labels.instance}}: The TLS certificate from {{$labels.source}} will expire in less than 7 days: {{$value}}s";
};

PostfixQueueLength = {
expr = "avg_over_time(postfix_queue_length[1h]) > 10";
annotations.description = "{{$labels.instance}}: postfix mail queue has undelivered {{$value}} items";
};

ZfsErrors = {
expr = "zfs_arcstats_l2_io_error + zfs_dmu_tx_error + zfs_arcstats_l2_writes_error > 0";
annotations.description = "{{$labels.instance}} reports: {{$value}} ZFS IO errors";
};

ZpoolErrors = {
expr = "zpool_status_errors > 0";
annotations.description = "{{$labels.instance}} reports: zpool {{$labels.name}} has {{$value}} errors";
};

MdRaidDegradedDisks = {
expr = "mdstat_degraded_disks > 0";
annotations.description = "{{$labels.instance}}: raid {{$labels.dev}} has failed disks";
};

# Ignore devices that have S.M.A.R.T. disabled (for example, when attached via USB).
# Also ignore the nix-community CI server until its NVMe actually fails.
SmartErrors = {
expr = ''smart_device_health_ok{enabled!="Disabled"} != 1'';
annotations.description = "{{$labels.instance}}: S.M.A.R.T. reports: {{$labels.device}} ({{$labels.model}}) has errors";
};

OomKills = {
expr = "increase(kernel_vmstat_oom_kill[5m]) > 0";
annotations.description = "{{$labels.instance}}: OOM kill detected";
};

UnusualDiskReadLatency = {
expr = "rate(diskio_read_time[1m]) / rate(diskio_reads[1m]) > 0.1 and rate(diskio_reads[1m]) > 0";
annotations.description = "{{$labels.instance}}: Disk latency is growing (read operations > 100ms)";
};

UnusualDiskWriteLatency = {
expr = "rate(diskio_write_time[1m]) / rate(diskio_write[1m]) > 0.1 and rate(diskio_write[1m]) > 0";
annotations.description = "{{$labels.instance}}: Disk latency is growing (write operations > 100ms)";
};

Ipv6DadCheck = {
expr = "ipv6_dad_failures_count > 0";
annotations.description = "{{$labels.host}}: {{$value}} assigned ipv6 addresses have failed duplicate address check";
};

HostMemoryUnderMemoryPressure = {
expr = "rate(node_vmstat_pgmajfault[1m]) > 1000";
annotations.description = "{{$labels.instance}}: The node is under heavy memory pressure. High rate of major page faults: {{$value}}";
};

Ext4Errors = {
expr = "ext4_errors_value > 0";
annotations.description = "{{$labels.instance}}: ext4 has reported {{$value}} I/O errors: check /sys/fs/ext4/*/errors_count";
};

AlertmanagerSilencesChanged = {
expr = "abs(delta(alertmanager_silences{state=\"active\"}[1h])) >= 1";
annotations.description = "alertmanager: number of active silences has changed: {{$value}}";
};
};
};
}
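
Every alert above flows through the per-rule `enable` flag defined by the module in default.nix below, so a downstream configuration can switch individual default alerts off or re-tune them without forking this file. A minimal sketch of such an override; the choice of `Uptime` and the `30m` value are illustrative, not part of this commit:

{ lib, ... }:
{
  # Assumes the nixos/roles/prometheus module is already imported.
  srvos.prometheus.ruleGroups.srvosPrometheusAlerts.alertRules = {
    # Silence the 30-day uptime reminder on this host.
    Uptime.enable = false;
    # Allow sustained load for longer before Load15 fires (overrides the "10m" above).
    Load15.for = lib.mkForce "30m";
  };
}
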
130 changes: 130 additions & 0 deletions nixos/roles/prometheus/default.nix
@@ -0,0 +1,130 @@
{ lib, pkgs, config, ... }:
let
filterEnabled = lib.filterAttrs (n: v: v.enable);
rules = lib.mapAttrsToList
(name: group: {
inherit name;
rules =
(lib.mapAttrsToList
(name: rule: {
alert = rule.name;
expr = rule.expr;
for = rule.for;
labels = rule.labels;
annotations = rule.annotations;
})
(filterEnabled group.alertRules)) ++
# Prometheus recording rules support only the record, expr and labels fields.
(lib.mapAttrsToList
(name: rule: {
record = rule.name;
expr = rule.expr;
labels = rule.labels;
})
(filterEnabled group.recordingRules));
})
(filterEnabled config.srvos.prometheus.ruleGroups);
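# For reference, `rules` evaluates to a list in the shape of Prometheus'
# rule-file schema (rendered to YAML in the config section below). A sketch of
# one generated entry, derived from the default alerts shipped above:
#
#   - name: srvosPrometheusAlerts
#     rules:
#       - alert: Reboot
#         expr: system_uptime < 300
#         for: 2m
#         labels: {}
#         annotations:
#           description: '{{$labels.host}} just rebooted'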
in
{
imports = [
./default-alerts.nix
];
options = {
# XXX: we may eventually move this upstream to nixpkgs; expect this interface to be replaced with the upstream equivalent.
srvos.prometheus.ruleGroups = lib.mkOption {
type = lib.types.attrsOf (lib.types.submodule ({ name, ... }: {
name = lib.mkOption {
type = lib.types.str;
default = name;
};
enable = lib.mkEnableOption (lib.mdDoc "this rule group") // { default = true; };
alertRules = lib.mkOption {
type = lib.types.attrsOf (lib.types.submodule ({ name, ... }: {
name = lib.mkOption {
type = lib.types.str;
default = name;
};
enable = lib.mkEnableOption (lib.mdDoc "this alert rule") // { default = true; };
expr = lib.mkOption {
type = lib.types.str;
};
for = lib.mkOption {
type = lib.types.str;
default = "2m";
};
labels = lib.mkOption {
type = lib.types.attrsOf lib.types.str;
default = { };
};
annotations = lib.mkOption {
type = lib.types.attrsOf lib.types.str;
default = { };
};
}));
default = { };
};
recordingRules = lib.mkOption {
type = lib.types.attrsOf (lib.types.submodule ({ name, ... }: {
name = lib.mkOption {
type = lib.types.str;
default = name;
};
enable = lib.mkEnableOption (lib.mdDoc "this recording rule") // { default = true; };
expr = lib.mkOption {
type = lib.types.str;
};
labels = lib.mkOption {
type = lib.types.attrsOf lib.types.str;
default = { };
};
}));
default = { };
};
}));
example = {
prometheusAlerts = {
alertRules = {
ExampleAlert = {
expr = "up == 0";
for = "2m";
labels = {
severity = "critical";
};
annotations = {
summary = "Instance {{ $labels.instance }} down";
description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes.";
};
};
};
recordingRules = {
RecordingExample = {
expr = "up";
labels = {
severity = "critical";
};
};
};
};
};
};
};
config = {
services.prometheus = {
enable = lib.mkDefault true;
# checks fail because of missing secrets in the sandbox
checkConfig = false;
ruleFiles = [ (pkgs.writers.writeYAML "rules.yaml" { groups = rules; }) ];
};
};
}
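
Beyond the built-in alerts, the options above let a consumer register site-specific rule groups next to the shipped defaults. A hedged sketch of such a consumer module; the group name, job label, and thresholds are invented for illustration:

{ ... }:
{
  srvos.prometheus.ruleGroups.myServiceAlerts = {
    alertRules.MyServiceDown = {
      expr = ''up{job="my-service"} == 0'';
      for = "5m";
      labels.severity = "critical";
      annotations.description = "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes";
    };
    # Recording rules precompute an expression; only record, expr and labels apply.
    recordingRules."job:up:avg" = {
      expr = "avg by (job) (up)";
    };
  };
}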
