forked from rancher/charts
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
(dev-v2.6) Merge pull request rancher#1274 from aiyengar2/rebase_monitoring
Rebase Monitoring to 16.x.x (partially cherry picked from commit 89d8ad2)
- Loading branch information
Showing
90 changed files
with
2,187 additions
and
911 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,4 +1,4 @@ | ||
url: https://github.com/kubernetes/kube-state-metrics.git | ||
url: https://github.com/prometheus-community/helm-charts.git | ||
subdirectory: charts/kube-state-metrics | ||
commit: bd6335b43ef3eb97639dc41dd605dc12422f60b6 | ||
commit: 086f1f7f0870e110abf30aa6bfe7c141e83cc950 | ||
version: 100.0.0 |
12 changes: 6 additions & 6 deletions
12
packages/rancher-monitoring/generated-changes/exclude/Chart.lock
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,12 @@ | ||
dependencies: | ||
- name: kube-state-metrics | ||
repository: https://kubernetes.github.io/kube-state-metrics | ||
version: 2.13.0 | ||
repository: https://prometheus-community.github.io/helm-charts | ||
version: 3.1.1 | ||
- name: prometheus-node-exporter | ||
repository: https://prometheus-community.github.io/helm-charts | ||
version: 1.16.2 | ||
version: 1.18.1 | ||
- name: grafana | ||
repository: https://grafana.github.io/helm-charts | ||
version: 6.6.3 | ||
digest: sha256:52acbef377da70248ae3fa926dc7f6601df9022b1b1e17224a8fe99e6995d3af | ||
generated: "2021-03-19T17:50:36.8566658+01:00" | ||
version: 6.12.0 | ||
digest: sha256:11886645ff1ade77d0fefdca90afba4a92f2b535997280074a59828e8d1dab4e | ||
generated: "2021-06-09T16:56:40.364303181+02:00" |
2 changes: 1 addition & 1 deletion
2
packages/rancher-monitoring/generated-changes/exclude/hack/requirements.txt
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,2 +1,2 @@ | ||
PyYAML==5.1.2 | ||
PyYAML==5.4 | ||
requests==2.22.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
63 changes: 63 additions & 0 deletions
63
...r-monitoring/generated-changes/exclude/templates/prometheus/rules/alertmanager.rules.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
{{- /* | ||
Generated from 'alertmanager.rules' group from https://raw.githubusercontent.com/prometheus-operator/kube-prometheus/release-0.1/manifests/prometheus-rules.yaml | ||
Do not change in-place! In order to change this file first read following link: | ||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack | ||
*/ -}} | ||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} | ||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.defaultRules.rules.alertmanager }} | ||
{{- $operatorJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "operator" }} | ||
{{- $alertmanagerJob := printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager" }} | ||
{{- $namespace := printf "%s" (include "kube-prometheus-stack.namespace" .) }} | ||
apiVersion: monitoring.coreos.com/v1 | ||
kind: PrometheusRule | ||
metadata: | ||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "alertmanager.rules" | trunc 63 | trimSuffix "-" }} | ||
namespace: {{ template "kube-prometheus-stack.namespace" . }} | ||
labels: | ||
app: {{ template "kube-prometheus-stack.name" . }} | ||
{{ include "kube-prometheus-stack.labels" . | indent 4 }} | ||
{{- if .Values.defaultRules.labels }} | ||
{{ toYaml .Values.defaultRules.labels | indent 4 }} | ||
{{- end }} | ||
{{- if .Values.defaultRules.annotations }} | ||
annotations: | ||
{{ toYaml .Values.defaultRules.annotations | indent 4 }} | ||
{{- end }} | ||
spec: | ||
groups: | ||
- name: alertmanager.rules | ||
rules: | ||
- alert: AlertmanagerConfigInconsistent | ||
annotations: | ||
message: The configuration of the instances of the Alertmanager cluster `{{`{{`}}$labels.service{{`}}`}}` are out of sync. | ||
expr: count_values("config_hash", alertmanager_config_hash{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) BY (service) / ON(service) GROUP_LEFT() label_replace(max(prometheus_operator_spec_replicas{job="{{ $operatorJob }}",namespace="{{ $namespace }}",controller="alertmanager"}) by (name, job, namespace, controller), "service", "$1", "name", "(.*)") != 1 | ||
for: 5m | ||
labels: | ||
severity: critical | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: AlertmanagerFailedReload | ||
annotations: | ||
message: Reloading Alertmanager's configuration has failed for {{`{{`}} $labels.namespace {{`}}`}}/{{`{{`}} $labels.pod{{`}}`}}. | ||
expr: alertmanager_config_last_reload_successful{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} == 0 | ||
for: 10m | ||
labels: | ||
severity: warning | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: AlertmanagerMembersInconsistent | ||
annotations: | ||
message: Alertmanager has not found all other members of the cluster. | ||
expr: |- | ||
alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"} | ||
!= on (service) GROUP_LEFT() | ||
count by (service) (alertmanager_cluster_members{job="{{ $alertmanagerJob }}",namespace="{{ $namespace }}"}) | ||
for: 5m | ||
labels: | ||
severity: critical | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
{{- end }} |
179 changes: 179 additions & 0 deletions
179
packages/rancher-monitoring/generated-changes/exclude/templates/prometheus/rules/etcd.yaml
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,179 @@ | ||
{{- /* | ||
Generated from 'etcd' group from https://raw.githubusercontent.com/etcd-io/website/master/content/en/docs/v3.4/op-guide/etcd3_alert.rules.yml | ||
Do not change in-place! In order to change this file first read following link: | ||
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack/hack | ||
*/ -}} | ||
{{- $kubeTargetVersion := default .Capabilities.KubeVersion.GitVersion .Values.kubeTargetVersionOverride }} | ||
{{- if and (semverCompare ">=1.10.0-0" $kubeTargetVersion) (semverCompare "<1.14.0-0" $kubeTargetVersion) .Values.defaultRules.create .Values.kubeEtcd.enabled .Values.defaultRules.rules.etcd }} | ||
apiVersion: monitoring.coreos.com/v1 | ||
kind: PrometheusRule | ||
metadata: | ||
name: {{ printf "%s-%s" (include "kube-prometheus-stack.fullname" .) "etcd" | trunc 63 | trimSuffix "-" }} | ||
namespace: {{ template "kube-prometheus-stack.namespace" . }} | ||
labels: | ||
app: {{ template "kube-prometheus-stack.name" . }} | ||
{{ include "kube-prometheus-stack.labels" . | indent 4 }} | ||
{{- if .Values.defaultRules.labels }} | ||
{{ toYaml .Values.defaultRules.labels | indent 4 }} | ||
{{- end }} | ||
{{- if .Values.defaultRules.annotations }} | ||
annotations: | ||
{{ toYaml .Values.defaultRules.annotations | indent 4 }} | ||
{{- end }} | ||
spec: | ||
groups: | ||
- name: etcd | ||
rules: | ||
- alert: etcdInsufficientMembers | ||
annotations: | ||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": insufficient members ({{`{{`}} $value {{`}}`}}).' | ||
expr: sum(up{job=~".*etcd.*"} == bool 1) by (job) < ((count(up{job=~".*etcd.*"}) by (job) + 1) / 2) | ||
for: 3m | ||
labels: | ||
severity: critical | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: etcdNoLeader | ||
annotations: | ||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member {{`{{`}} $labels.instance {{`}}`}} has no leader.' | ||
expr: etcd_server_has_leader{job=~".*etcd.*"} == 0 | ||
for: 1m | ||
labels: | ||
severity: critical | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: etcdHighNumberOfLeaderChanges | ||
annotations: | ||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": instance {{`{{`}} $labels.instance {{`}}`}} has seen {{`{{`}} $value {{`}}`}} leader changes within the last hour.' | ||
expr: rate(etcd_server_leader_changes_seen_total{job=~".*etcd.*"}[15m]) > 3 | ||
for: 15m | ||
labels: | ||
severity: warning | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: etcdHighNumberOfFailedGRPCRequests | ||
annotations: | ||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' | ||
expr: |- | ||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) | ||
/ | ||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) | ||
> 1 | ||
for: 10m | ||
labels: | ||
severity: warning | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: etcdHighNumberOfFailedGRPCRequests | ||
annotations: | ||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.grpc_method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' | ||
expr: |- | ||
100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) BY (job, instance, grpc_service, grpc_method) | ||
/ | ||
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) BY (job, instance, grpc_service, grpc_method) | ||
> 5 | ||
for: 5m | ||
labels: | ||
severity: critical | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: etcdGRPCRequestsSlow | ||
annotations: | ||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": gRPC requests to {{`{{`}} $labels.grpc_method {{`}}`}} are taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' | ||
expr: |- | ||
histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_type="unary"}[5m])) by (job, instance, grpc_service, grpc_method, le)) | ||
> 0.15 | ||
for: 10m | ||
labels: | ||
severity: critical | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: etcdMemberCommunicationSlow | ||
annotations: | ||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": member communication with {{`{{`}} $labels.To {{`}}`}} is taking {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' | ||
expr: |- | ||
histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) | ||
> 0.15 | ||
for: 10m | ||
labels: | ||
severity: warning | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: etcdHighNumberOfFailedProposals | ||
annotations: | ||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": {{`{{`}} $value {{`}}`}} proposal failures within the last hour on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' | ||
expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5 | ||
for: 15m | ||
labels: | ||
severity: warning | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: etcdHighFsyncDurations | ||
annotations: | ||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile fsync durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' | ||
expr: |- | ||
histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket{job=~".*etcd.*"}[5m])) | ||
> 0.5 | ||
for: 10m | ||
labels: | ||
severity: warning | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: etcdHighCommitDurations | ||
annotations: | ||
message: 'etcd cluster "{{`{{`}} $labels.job {{`}}`}}": 99th percentile commit durations are {{`{{`}} $value {{`}}`}}s on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' | ||
expr: |- | ||
histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket{job=~".*etcd.*"}[5m])) | ||
> 0.25 | ||
for: 10m | ||
labels: | ||
severity: warning | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: etcdHighNumberOfFailedHTTPRequests | ||
annotations: | ||
message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' | ||
expr: |- | ||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) | ||
BY (method) > 0.01 | ||
for: 10m | ||
labels: | ||
severity: warning | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: etcdHighNumberOfFailedHTTPRequests | ||
annotations: | ||
message: '{{`{{`}} $value {{`}}`}}% of requests for {{`{{`}} $labels.method {{`}}`}} failed on etcd instance {{`{{`}} $labels.instance {{`}}`}}.' | ||
expr: |- | ||
sum(rate(etcd_http_failed_total{job=~".*etcd.*", code!="404"}[5m])) BY (method) / sum(rate(etcd_http_received_total{job=~".*etcd.*"}[5m])) | ||
BY (method) > 0.05 | ||
for: 10m | ||
labels: | ||
severity: critical | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
- alert: etcdHTTPRequestsSlow | ||
annotations: | ||
message: etcd instance {{`{{`}} $labels.instance {{`}}`}} HTTP requests to {{`{{`}} $labels.method {{`}}`}} are slow. | ||
expr: |- | ||
histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[5m])) | ||
> 0.15 | ||
for: 10m | ||
labels: | ||
severity: warning | ||
{{- if .Values.defaultRules.additionalRuleLabels }} | ||
{{ toYaml .Values.defaultRules.additionalRuleLabels | indent 8 }} | ||
{{- end }} | ||
{{- end }} |
Oops, something went wrong.