diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..81c0aa83 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,29 @@ +--- +name: CI +'on': + pull_request: + push: + branches: + - master + +jobs: + + lint: + name: Lint + runs-on: ubuntu-latest + + steps: + - name: Check out the codebase. + uses: actions/checkout@v2 + + - name: Set up Python 3. + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Install test dependencies. + run: pip3 install yamllint + + - name: Lint all the YAMLs. + working-directory: ./ansible + run: yamllint . diff --git a/ansible/group_vars/k3s_cluster.yml b/ansible/group_vars/k3s_cluster.yml index 90873385..a004c53f 100644 --- a/ansible/group_vars/k3s_cluster.yml +++ b/ansible/group_vars/k3s_cluster.yml @@ -7,8 +7,41 @@ k3s_master_ip: 10.0.0.11 k3s_token: s1cret0 # Extra arguments for k3s installation scripts -k3s_server_extra_args: "--write-kubeconfig-mode '0644' --disable 'servicelb' --node-taint 'node-role.kubernetes.io/master=true:NoSchedule'" +k3s_server_extra_args: >- + --write-kubeconfig-mode '0644' + --disable 'servicelb' + --node-taint 'node-role.kubernetes.io/master=true:NoSchedule' + --kube-controller-manager-arg 'bind-address=0.0.0.0' + --kube-controller-manager-arg 'address=0.0.0.0' + --kube-proxy-arg 'metrics-bind-address=0.0.0.0' + --kube-scheduler-arg 'bind-address=0.0.0.0' + --kube-scheduler-arg 'address=0.0.0.0' + + k3s_worker_extra_args: "--node-label 'node_type=worker'" +# Namespaces +k3s_metallb_namespace: metallb-system +k3s_traefik_namespace: traefik-system +k3s_longhorn_namespace: longhorn-system +k3s_certmanager_namespace: certmanager-system +k3s_logging_namespace: k3s-logging +k3s_monitoring_namespace: k3s-monitoring + +# DNS service end-points + +traefik_dashboard_dns: traefik.picluster.ricsanfre.com +longhorn_dashboard_dns: storage.picluster.ricsanfre.com +kibana_dashboard_dns: kibana.picluster.ricsanfre.com +grafana_dashboard_dns: 
grafana.picluster.ricsanfre.com +prometheus_dashboard_dns: prometheus.picluster.ricsanfre.com +alertmanager_dashboard_dns: alertmanager.picluster.ricsanfre.com + +# MetalLB configuration # k3s external ip range: Metal LB pool configuration k3s_external_ip_range: "10.0.0.100-10.0.0.200" + +# Traefik configuration +# HTTP Basic auth credentials +traefik_basic_auth_user: admin +traefik_basic_auth_passwd: s1cret0 diff --git a/ansible/roles/basic_setup/tasks/remove_snap_packages.yml b/ansible/roles/basic_setup/tasks/remove_snap_packages.yml index 34ed1005..31560422 100644 --- a/ansible/roles/basic_setup/tasks/remove_snap_packages.yml +++ b/ansible/roles/basic_setup/tasks/remove_snap_packages.yml @@ -16,9 +16,12 @@ register: snap_remove_output with_items: "{{ snap_packages.stdout_lines }}" rescue: - - fail: + - name: Check number of retries and fail if greater that 3 + fail: msg: Maximum retries of grouped tasks reached when: retry_count | int == 3 - - debug: + - name: printing retry message + debug: msg: "Removing snap package failed, let's give it another shot" - - include_tasks: remove_snap_packages.yml + - name: retrying deletion + include_tasks: remove_snap_packages.yml diff --git a/ansible/roles/certmanager/defaults/main.yml b/ansible/roles/certmanager/defaults/main.yml new file mode 100644 index 00000000..c136a589 --- /dev/null +++ b/ansible/roles/certmanager/defaults/main.yml @@ -0,0 +1,3 @@ +--- +# Namespace for cert-manager +k3s_certmanager_namespace: certmanager-system diff --git a/ansible/roles/certmanager/tasks/main.yml b/ansible/roles/certmanager/tasks/main.yml index 7f31b8e9..9884e648 100644 --- a/ansible/roles/certmanager/tasks/main.yml +++ b/ansible/roles/certmanager/tasks/main.yml @@ -1,7 +1,7 @@ --- - name: Create cert-manager namespace. 
kubernetes.core.k8s: - name: certmanager-system + name: "{{ k3s_certmanager_namespace }}" api_version: v1 kind: Namespace state: present @@ -15,8 +15,8 @@ kubernetes.core.helm: name: certmanager chart_ref: jetstack/cert-manager - chart_version: "1.5.3" - release_namespace: certmanager-system + update_repo_cache: true + release_namespace: "{{ k3s_certmanager_namespace }}" state: present release_values: installCRDs: true @@ -26,4 +26,4 @@ definition: "{{ lookup('template', 'templates/' + item ) }}" state: present with_items: - - selfsigned_issuer.yml + - selfsigned_issuer.yml.j2 diff --git a/ansible/roles/certmanager/templates/selfsigned_issuer.yml b/ansible/roles/certmanager/templates/selfsigned_issuer.yml.j2 similarity index 71% rename from ansible/roles/certmanager/templates/selfsigned_issuer.yml rename to ansible/roles/certmanager/templates/selfsigned_issuer.yml.j2 index 46d0f8a7..77f907a5 100644 --- a/ansible/roles/certmanager/templates/selfsigned_issuer.yml +++ b/ansible/roles/certmanager/templates/selfsigned_issuer.yml.j2 @@ -3,5 +3,6 @@ apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: name: self-signed-issuer + namespace: {{ k3s_certmanager_namespace }} spec: selfSigned: {} diff --git a/ansible/roles/logging/k3s/defaults/main.yml b/ansible/roles/logging/k3s/defaults/main.yml index 0c7304de..3a5128db 100644 --- a/ansible/roles/logging/k3s/defaults/main.yml +++ b/ansible/roles/logging/k3s/defaults/main.yml @@ -20,3 +20,6 @@ efk_fluentd_image: "fluent/fluentd-kubernetes-daemonset:v1.14-debian-elasticsear # Configured Timezone efk_node_timezone: "Europe/Madrid" + +# Endpoint Service DNS name +kibana_dashboard_dns: kibana.picluster.ricsanfre.com diff --git a/ansible/roles/logging/k3s/tasks/main.yml b/ansible/roles/logging/k3s/tasks/main.yml index f1cbb7db..cb1ce7be 100644 --- a/ansible/roles/logging/k3s/tasks/main.yml +++ b/ansible/roles/logging/k3s/tasks/main.yml @@ -2,7 +2,7 @@ # namespace for hosting EFK stack - name: Create k3s-logging namespace. 
kubernetes.core.k8s: - name: k3s-logging + name: "{{ k3s_logging_namespace }}" api_version: v1 kind: Namespace state: present @@ -25,6 +25,7 @@ name: elastic-operator chart_ref: elastic/eck-operator release_namespace: elastic-system + update_repo_cache: true state: present - name: Deploy elasticsearch, kibana and fluentd @@ -32,8 +33,8 @@ definition: "{{ lookup('template', 'templates/' + item ) }}" state: present with_items: - - elasticsearch_secret.yml - - elasticsearch.yml - - kibana.yml - - fluentd.yml - - kibana_ingress.yml + - elasticsearch_secret.yml.j2 + - elasticsearch.yml.j2 + - kibana.yml.j2 + - fluentd.yml.j2 + - kibana_ingress.yml.j2 diff --git a/ansible/roles/logging/k3s/templates/elasticsearch.yml b/ansible/roles/logging/k3s/templates/elasticsearch.yml.j2 similarity index 100% rename from ansible/roles/logging/k3s/templates/elasticsearch.yml rename to ansible/roles/logging/k3s/templates/elasticsearch.yml.j2 diff --git a/ansible/roles/logging/k3s/templates/elasticsearch_secret.yml b/ansible/roles/logging/k3s/templates/elasticsearch_secret.yml.j2 similarity index 100% rename from ansible/roles/logging/k3s/templates/elasticsearch_secret.yml rename to ansible/roles/logging/k3s/templates/elasticsearch_secret.yml.j2 diff --git a/ansible/roles/logging/k3s/templates/filebeat.yml b/ansible/roles/logging/k3s/templates/filebeat.yml.j2 similarity index 100% rename from ansible/roles/logging/k3s/templates/filebeat.yml rename to ansible/roles/logging/k3s/templates/filebeat.yml.j2 diff --git a/ansible/roles/logging/k3s/templates/fluentd.yml b/ansible/roles/logging/k3s/templates/fluentd.yml.j2 similarity index 100% rename from ansible/roles/logging/k3s/templates/fluentd.yml rename to ansible/roles/logging/k3s/templates/fluentd.yml.j2 diff --git a/ansible/roles/logging/k3s/templates/kibana.yml b/ansible/roles/logging/k3s/templates/kibana.yml.j2 similarity index 100% rename from ansible/roles/logging/k3s/templates/kibana.yml rename to 
ansible/roles/logging/k3s/templates/kibana.yml.j2 diff --git a/ansible/roles/logging/k3s/templates/kibana_ingress.yml b/ansible/roles/logging/k3s/templates/kibana_ingress.yml.j2 similarity index 90% rename from ansible/roles/logging/k3s/templates/kibana_ingress.yml rename to ansible/roles/logging/k3s/templates/kibana_ingress.yml.j2 index 5ddab642..6bbc606d 100644 --- a/ansible/roles/logging/k3s/templates/kibana_ingress.yml +++ b/ansible/roles/logging/k3s/templates/kibana_ingress.yml.j2 @@ -8,7 +8,7 @@ metadata: kubernetes.io/ingress.class: traefik spec: rules: - - host: kibana.picluster.ricsanfre.com + - host: {{ kibana_dashboard_dns }} http: paths: - path: / diff --git a/ansible/roles/longhorn/defaults/main.yml b/ansible/roles/longhorn/defaults/main.yml new file mode 100644 index 00000000..a9fd69ef --- /dev/null +++ b/ansible/roles/longhorn/defaults/main.yml @@ -0,0 +1,6 @@ +--- +# Namespace for Longhorn +k3s_longhorn_namespace: longhorn-system + +# Endpoint Service DNS name +longhorn_dashboard_dns: storage.picluster.ricsanfre.com diff --git a/ansible/roles/longhorn/tasks/main.yml b/ansible/roles/longhorn/tasks/main.yml index 845dfc29..20bacde4 100644 --- a/ansible/roles/longhorn/tasks/main.yml +++ b/ansible/roles/longhorn/tasks/main.yml @@ -1,7 +1,7 @@ --- - name: Create longhorn namespace. 
kubernetes.core.k8s: - name: longhorn-system + name: "{{ k3s_longhorn_namespace }}" api_version: v1 kind: Namespace state: present @@ -16,7 +16,7 @@ name: longhorn chart_ref: longhorn/longhorn update_repo_cache: true - release_namespace: longhorn-system + release_namespace: "{{ k3s_longhorn_namespace }}" state: present release_values: defaultSettings: @@ -27,7 +27,7 @@ definition: "{{ lookup('template', 'templates/' + item ) }}" state: present with_items: - - longhorn_ingress.yml + - longhorn_ingress.yml.j2 - name: Remove Local-Path as default storage class command: diff --git a/ansible/roles/longhorn/templates/longhorn_ingress.yml b/ansible/roles/longhorn/templates/longhorn_ingress.yml.j2 similarity index 75% rename from ansible/roles/longhorn/templates/longhorn_ingress.yml rename to ansible/roles/longhorn/templates/longhorn_ingress.yml.j2 index 77957106..76316e14 100644 --- a/ansible/roles/longhorn/templates/longhorn_ingress.yml +++ b/ansible/roles/longhorn/templates/longhorn_ingress.yml.j2 @@ -4,24 +4,24 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: longhorn-ingress - namespace: longhorn-system + namespace: {{ k3s_longhorn_namespace }} annotations: # HTTPS as entry point traefik.ingress.kubernetes.io/router.entrypoints: websecure # Enable TLS traefik.ingress.kubernetes.io/router.tls: "true" # Use Basic Auth Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-basic-auth@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-basic-auth@kubernetescrd # Enable cert-manager to create automatically the SSL certificate and store in Secret cert-manager.io/cluster-issuer: self-signed-issuer cert-manager.io/common-name: longhorn spec: tls: - hosts: - - storage.picluster.ricsanfre.com + - {{ longhorn_dashboard_dns }} secretName: storage-tls rules: - - host: storage.picluster.ricsanfre.com + - host: {{ longhorn_dashboard_dns }} http: paths: - path: / @@ -38,15 +38,15 @@ kind: 
Ingress apiVersion: networking.k8s.io/v1 metadata: name: longhorn-redirect - namespace: longhorn-system + namespace: {{ k3s_longhorn_namespace }} annotations: # Use redirect Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-redirect@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-redirect@kubernetescrd # HTTP as entrypoint traefik.ingress.kubernetes.io/router.entrypoints: web spec: rules: - - host: storage.picluster.ricsanfre.com + - host: {{ longhorn_dashboard_dns }} http: paths: - path: / diff --git a/ansible/roles/metallb/defaults/main.yml b/ansible/roles/metallb/defaults/main.yml new file mode 100644 index 00000000..d8150212 --- /dev/null +++ b/ansible/roles/metallb/defaults/main.yml @@ -0,0 +1,6 @@ +--- +# Namespace for metallb +k3s_metallb_namespace: metallb-system + +# k3s external ip range: Metal LB pool configuration +k3s_external_ip_range: diff --git a/ansible/roles/metallb/tasks/main.yml b/ansible/roles/metallb/tasks/main.yml index 74ef394c..29f34ae8 100644 --- a/ansible/roles/metallb/tasks/main.yml +++ b/ansible/roles/metallb/tasks/main.yml @@ -1,7 +1,7 @@ --- - name: Create Metal LB namespace. 
kubernetes.core.k8s: - name: metallb-system + name: "{{ k3s_metallb_namespace }}" api_version: v1 kind: Namespace state: present @@ -15,7 +15,8 @@ kubernetes.core.helm: name: metallb chart_ref: metallb/metallb - release_namespace: metallb-system + release_namespace: "{{ k3s_metallb_namespace }}" + update_repo_cache: true state: present release_values: configInline: diff --git a/ansible/roles/prometheus/defaults/main.yml b/ansible/roles/prometheus/defaults/main.yml index b894f3ab..c9cbdfa8 100644 --- a/ansible/roles/prometheus/defaults/main.yml +++ b/ansible/roles/prometheus/defaults/main.yml @@ -1,5 +1,12 @@ --- +k3s_monitoring_namespace: k3s-monitoring + +# Endpoint Service DNS name +grafana_dashboard_dns: grafana.picluster.ricsanfre.com +prometheus_dashboard_dns: prometheus.picluster.ricsanfre.com +alertmanager_dashboard_dns: alertmanager.picluster.ricsanfre.com + # Storage Settings prometheus_storage_size: "5Gi" prometheus_storage_class: "longhorn" diff --git a/ansible/roles/prometheus/files/k3s-controllermanager-dashboard.json b/ansible/roles/prometheus/files/k3s-controllermanager-dashboard.json new file mode 100644 index 00000000..7ab23496 --- /dev/null +++ b/ansible/roles/prometheus/files/k3s-controllermanager-dashboard.json @@ -0,0 +1,1154 @@ +{ + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + 
"mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(up{cluster=\"$cluster\", job=\"k3s-metrics-service\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Up", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "min" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} {{name}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + 
"title": "Work Queue Add Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} {{name}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Work Queue Depth", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + 
"format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} {{name}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Work Queue Latency", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } 
+ ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(rest_client_requests_total{job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "2xx", + "refId": "A" + }, + { + "expr": "sum(rate(rest_client_requests_total{job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "3xx", + "refId": "B" + }, + { + "expr": "sum(rate(rest_client_requests_total{job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "4xx", + "refId": "C" + }, + { + "expr": "sum(rate(rest_client_requests_total{job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5xx", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Kube API Request Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + 
"type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 8, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Post Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + 
"showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Get Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 
10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], 
+ "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": 
"time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"k3s-metrics-service\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(up{cluster=\"$cluster\", job=\"k3s-metrics-service\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Kubernetes / 
Controller Manager", + "uid": "72e0e05bef5099e5f049b05fdc429ed4", + "version": 0 +} diff --git a/ansible/roles/prometheus/files/k3s-proxy-dashboard.json b/ansible/roles/prometheus/files/k3s-proxy-dashboard.json new file mode 100644 index 00000000..2c37544a --- /dev/null +++ b/ansible/roles/prometheus/files/k3s-proxy-dashboard.json @@ -0,0 +1,1234 @@ +{ + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(up{cluster=\"$cluster\", job=\"k3s-metrics-service\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Up", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } 
+ ], + "valueName": "min" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "rate", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rules Sync Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": 
false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rule Sync Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"sum(rate(kubeproxy_network_programming_duration_seconds_count{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "rate", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Programming Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Programming Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + 
"type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "2xx", + "refId": "A" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "3xx", + "refId": "B" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "4xx", + "refId": "C" + }, 
+ { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5xx", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Kube API Request Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 8, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Post Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" 
+ }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Get Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + 
"min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + 
"id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"go_goroutines{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\"}, instance)", + "refresh": 2, + "regex": "", 
+ "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Kubernetes / Proxy", + "uid": "632e265de029684c40b21cb76bca4f94", + "version": 0 +} diff --git a/ansible/roles/prometheus/files/k3s-scheduler-dashboard.json b/ansible/roles/prometheus/files/k3s-scheduler-dashboard.json new file mode 100644 index 00000000..8f2a7eb4 --- /dev/null +++ b/ansible/roles/prometheus/files/k3s-scheduler-dashboard.json @@ -0,0 +1,1077 @@ +{ + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + 
"lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(up{cluster=\"$cluster\", job=\"k3s-metrics-service\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Up", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "min" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} e2e", + "refId": "A" + }, + { + "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} binding", + "refId": "B" + }, + { + "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance)", + "format": "time_series", + 
"intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} scheduling algorithm", + "refId": "C" + }, + { + "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} volume", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Scheduling Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} e2e", + "refId": "A" + }, + { + "expr": 
"histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} binding", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} scheduling algorithm", + "refId": "C" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} volume", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Scheduling latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": false, + 
"avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "2xx", + "refId": "A" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "3xx", + "refId": "B" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "4xx", + "refId": "C" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5xx", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Kube API Request Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + 
"aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 8, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Post Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": 
true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Get Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + 
"links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + 
"timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + 
"repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"k3s-metrics-service\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(process_cpu_seconds_total{cluster=\"$cluster\", job=\"k3s-metrics-service\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Kubernetes / Scheduler", + "uid": "2e6b6a3b4bddf1427b3a55aa1311c656", + "version": 0 +} diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml index 833fb6b4..09aa2d67 100644 --- a/ansible/roles/prometheus/tasks/main.yml +++ b/ansible/roles/prometheus/tasks/main.yml @@ -1,7 +1,7 @@ --- - name: Create 
prometheus namespace. kubernetes.core.k8s: - name: k3s-monitoring + name: "{{ k3s_monitoring_namespace }}" api_version: v1 kind: Namespace state: present @@ -15,7 +15,8 @@ kubernetes.core.helm: name: kube-prometheus-stack chart_ref: prometheus-community/kube-prometheus-stack - release_namespace: k3s-monitoring + release_namespace: "{{ k3s_monitoring_namespace }}" + update_repo_cache: true state: present release_values: alertmanager: @@ -42,23 +43,41 @@ adminPassword: "{{ prometheus_grafana_password }}" plugins: - grafana-piechart-panel + kubeApiServer: + enabled: true + kubeControllerManager: + enabled: false + kubeScheduler: + enabled: false + kubeProxy: + enabled: false + kubeEtcd: + enabled: false + +- name: Create k3s metrics service + kubernetes.core.k8s: + definition: "{{ lookup('template', 'templates/' + item ) }}" + state: present + with_items: + - k3s_service_metrics.yml.j2 - name: Create Ingress rule for Prometheus, Alertmanager and Graphana UI kubernetes.core.k8s: definition: "{{ lookup('template', 'templates/' + item ) }}" state: present with_items: - - prometheus_ingress.yml - - grafana_ingress.yml - - alertmanager_ingress.yml + - prometheus_ingress.yml.j2 + - grafana_ingress.yml.j2 + - alertmanager_ingress.yml.j2 - name: Configure Service Monitors kubernetes.core.k8s: definition: "{{ lookup('template', 'templates/' + item ) }}" state: present with_items: - - traefik_servicemonitor.yml - - longhorn_servicemonitor.yml + - traefik_servicemonitor.yml.j2 + - longhorn_servicemonitor.yml.j2 + - k3s_servicemonitor.yml.j2 - name: Configure Grafana Dashboards include_tasks: configure_grafana_dashboards.yml @@ -72,3 +91,9 @@ file: traefik-dashboard.json - name: dashboard-longhorn file: longhorn-dashboard.json + - name: k3s-controller-manager + file: k3s-controllermanager-dashboard.json + - name: k3s-scheduler + file: k3s-scheduler-dashboard.json + - name: k3s-proxy + file: k3s-proxy-dashboard.json diff --git 
a/ansible/roles/prometheus/templates/alertmanager_ingress.yml b/ansible/roles/prometheus/templates/alertmanager_ingress.yml.j2 similarity index 75% rename from ansible/roles/prometheus/templates/alertmanager_ingress.yml rename to ansible/roles/prometheus/templates/alertmanager_ingress.yml.j2 index 00c4d7ca..bbd77068 100644 --- a/ansible/roles/prometheus/templates/alertmanager_ingress.yml +++ b/ansible/roles/prometheus/templates/alertmanager_ingress.yml.j2 @@ -4,24 +4,24 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: alertmanager-ingress - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} annotations: # HTTPS as entry point traefik.ingress.kubernetes.io/router.entrypoints: websecure # Enable TLS traefik.ingress.kubernetes.io/router.tls: "true" # Use Basic Auth Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-basic-auth@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-basic-auth@kubernetescrd # Enable cert-manager to create automatically the SSL certificate and store in Secret cert-manager.io/cluster-issuer: self-signed-issuer cert-manager.io/common-name: alertmanager spec: tls: - hosts: - - alertmanager.picluster.ricsanfre.com + - {{ alertmanager_dashboard_dns }} secretName: prometheus-tls rules: - - host: alertmanager.picluster.ricsanfre.com + - host: {{ alertmanager_dashboard_dns }} http: paths: - path: / @@ -38,15 +38,15 @@ kind: Ingress apiVersion: networking.k8s.io/v1 metadata: name: alertmanager-redirect - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} annotations: # Use redirect Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-redirect@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-redirect@kubernetescrd # HTTP as entrypoint traefik.ingress.kubernetes.io/router.entrypoints: web spec: rules: - - host: 
alertmanager.picluster.ricsanfre.com + - host: {{ alertmanager_dashboard_dns }} http: paths: - path: / diff --git a/ansible/roles/prometheus/templates/grafana_dashboard.yml.j2 b/ansible/roles/prometheus/templates/grafana_dashboard.yml.j2 index 97a7139f..88e19f8a 100644 --- a/ansible/roles/prometheus/templates/grafana_dashboard.yml.j2 +++ b/ansible/roles/prometheus/templates/grafana_dashboard.yml.j2 @@ -3,7 +3,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: {{ dashboard_name }} - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} labels: grafana_dashboard: "1" data: diff --git a/ansible/roles/prometheus/templates/grafana_ingress.yml b/ansible/roles/prometheus/templates/grafana_ingress.yml.j2 similarity index 80% rename from ansible/roles/prometheus/templates/grafana_ingress.yml rename to ansible/roles/prometheus/templates/grafana_ingress.yml.j2 index 3e3bff85..e23d4641 100644 --- a/ansible/roles/prometheus/templates/grafana_ingress.yml +++ b/ansible/roles/prometheus/templates/grafana_ingress.yml.j2 @@ -4,7 +4,7 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: grafana-ingress - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} annotations: # HTTPS as entry point traefik.ingress.kubernetes.io/router.entrypoints: websecure @@ -16,10 +16,10 @@ metadata: spec: tls: - hosts: - - grafana.picluster.ricsanfre.com + - {{ grafana_dashboard_dns }} secretName: grafana-tls rules: - - host: grafana.picluster.ricsanfre.com + - host: {{ grafana_dashboard_dns }} http: paths: - path: / @@ -36,15 +36,15 @@ kind: Ingress apiVersion: networking.k8s.io/v1 metadata: name: grafana-redirect - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} annotations: # Use redirect Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-redirect@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-redirect@kubernetescrd # HTTP as entrypoint 
traefik.ingress.kubernetes.io/router.entrypoints: web spec: rules: - - host: grafana.picluster.ricsanfre.com + - host: {{ grafana_dashboard_dns }} http: paths: - path: / diff --git a/ansible/roles/prometheus/templates/k3s_service_metrics.yml.j2 b/ansible/roles/prometheus/templates/k3s_service_metrics.yml.j2 new file mode 100644 index 00000000..eeba21b5 --- /dev/null +++ b/ansible/roles/prometheus/templates/k3s_service_metrics.yml.j2 @@ -0,0 +1,32 @@ +--- +# Headless service for K3S metrics. No Selector +apiVersion: v1 +kind: Service +metadata: + name: k3s-metrics-service + labels: + app: k3s-metrics + namespace: kube-system +spec: + clusterIP: None + ports: + - name: http-metrics + port: 10249 + protocol: TCP + targetPort: 10249 + type: ClusterIP + +--- +# Endpoint for the headless service without selector: targets the K3S master node (k3s_master_ip) +apiVersion: v1 +kind: Endpoints +metadata: + name: k3s-metrics-service + namespace: kube-system +subsets: +- addresses: + - ip: {{ k3s_master_ip }} + ports: + - name: http-metrics + port: 10249 + protocol: TCP diff --git a/ansible/roles/prometheus/templates/k3s_servicemonitor.yml.j2 b/ansible/roles/prometheus/templates/k3s_servicemonitor.yml.j2 new file mode 100644 index 00000000..c0fdae4f --- /dev/null +++ b/ansible/roles/prometheus/templates/k3s_servicemonitor.yml.j2 @@ -0,0 +1,19 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app: k3s + release: kube-prometheus-stack + name: k3s-prometheus-servicemonitor + namespace: {{ k3s_monitoring_namespace }} +spec: + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app: k3s-metrics + endpoints: + - port: http-metrics + path: /metrics diff --git a/ansible/roles/prometheus/templates/longhorn_servicemonitor.yml b/ansible/roles/prometheus/templates/longhorn_servicemonitor.yml.j2 similarity index 88% rename from ansible/roles/prometheus/templates/longhorn_servicemonitor.yml rename to ansible/roles/prometheus/templates/longhorn_servicemonitor.yml.j2 index
42d0ef3d..2e43007c 100644 --- a/ansible/roles/prometheus/templates/longhorn_servicemonitor.yml +++ b/ansible/roles/prometheus/templates/longhorn_servicemonitor.yml.j2 @@ -6,7 +6,7 @@ metadata: app: longhorn release: kube-prometheus-stack name: longhorn-prometheus-servicemonitor - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} spec: selector: matchLabels: diff --git a/ansible/roles/prometheus/templates/prometheus_ingress.yml b/ansible/roles/prometheus/templates/prometheus_ingress.yml.j2 similarity index 75% rename from ansible/roles/prometheus/templates/prometheus_ingress.yml rename to ansible/roles/prometheus/templates/prometheus_ingress.yml.j2 index f4aa894b..83cfcfda 100644 --- a/ansible/roles/prometheus/templates/prometheus_ingress.yml +++ b/ansible/roles/prometheus/templates/prometheus_ingress.yml.j2 @@ -4,24 +4,24 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: prometheus-ingress - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} annotations: # HTTPS as entry point traefik.ingress.kubernetes.io/router.entrypoints: websecure # Enable TLS traefik.ingress.kubernetes.io/router.tls: "true" # Use Basic Auth Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-basic-auth@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-basic-auth@kubernetescrd # Enable cert-manager to create automatically the SSL certificate and store in Secret cert-manager.io/cluster-issuer: self-signed-issuer cert-manager.io/common-name: prometheus spec: tls: - hosts: - - prometheus.picluster.ricsanfre.com + - {{ prometheus_dashboard_dns }} secretName: prometheus-tls rules: - - host: prometheus.picluster.ricsanfre.com + - host: {{ prometheus_dashboard_dns }} http: paths: - path: / @@ -38,15 +38,15 @@ kind: Ingress apiVersion: networking.k8s.io/v1 metadata: name: prometheus-redirect - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} 
annotations: # Use redirect Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-redirect@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-redirect@kubernetescrd # HTTP as entrypoint traefik.ingress.kubernetes.io/router.entrypoints: web spec: rules: - - host: prometheus.picluster.ricsanfre.com + - host: {{ prometheus_dashboard_dns }} http: paths: - path: / diff --git a/ansible/roles/prometheus/templates/traefik_servicemonitor.yml b/ansible/roles/prometheus/templates/traefik_servicemonitor.yml.j2 similarity index 90% rename from ansible/roles/prometheus/templates/traefik_servicemonitor.yml rename to ansible/roles/prometheus/templates/traefik_servicemonitor.yml.j2 index 28cc3943..478afa60 100644 --- a/ansible/roles/prometheus/templates/traefik_servicemonitor.yml +++ b/ansible/roles/prometheus/templates/traefik_servicemonitor.yml.j2 @@ -6,7 +6,7 @@ metadata: app: traefik release: kube-prometheus-stack name: traefik-prometheus-servicemonitor - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} spec: endpoints: - port: traefik diff --git a/ansible/roles/traefik/defaults/main.yml b/ansible/roles/traefik/defaults/main.yml new file mode 100644 index 00000000..e44322b8 --- /dev/null +++ b/ansible/roles/traefik/defaults/main.yml @@ -0,0 +1,13 @@ +--- +# Namespace for Traefik +k3s_traefik_namespace: traefik-system + +# Endpoint Service DNS name +traefik_dashboard_dns: traefik.picluster.ricsanfre.com + + +# Basic auth user/passwd +traefik_basic_auth_user: admin +traefik_basic_auth_passwd: s1cret0 + +traefik_auth_htpasswd_pair: KKYWRtaW46JGFwcjEkWkRkMWIvNC4kUG9RR244RW5Gc0lWUUFDS3p3VHJrLgoK diff --git a/ansible/roles/traefik/templates/traefik-config.yaml b/ansible/roles/traefik/files/traefik-config.yaml similarity index 100% rename from ansible/roles/traefik/templates/traefik-config.yaml rename to ansible/roles/traefik/files/traefik-config.yaml diff --git
a/ansible/roles/traefik/tasks/create_basic_auth_credentials.yml b/ansible/roles/traefik/tasks/create_basic_auth_credentials.yml new file mode 100644 index 00000000..98a78ba5 --- /dev/null +++ b/ansible/roles/traefik/tasks/create_basic_auth_credentials.yml @@ -0,0 +1,21 @@ +--- + +- name: Ensure htpasswd utility is installed + package: + name: 'apache2-utils' + state: 'present' + update_cache: true + become: true + +- name: htpasswd utility + # Credentials are shell-quoted; -w 0 keeps the base64 output on a single line + shell: + cmd: >- + htpasswd -nb {{ traefik_basic_auth_user | quote }} {{ traefik_basic_auth_passwd | quote }} | base64 -w 0 + register: htpasswd + changed_when: false + no_log: true + +- name: Set htpasswd pair + set_fact: + traefik_auth_htpasswd_pair: "{{ htpasswd.stdout }}" diff --git a/ansible/roles/traefik/tasks/main.yml b/ansible/roles/traefik/tasks/main.yml index 68fa5315..62ebbbd7 100644 --- a/ansible/roles/traefik/tasks/main.yml +++ b/ansible/roles/traefik/tasks/main.yml @@ -3,7 +3,7 @@ - name: Configure K3S embedded Traefik Helm Chart copy: dest: "/var/lib/rancher/k3s/server/manifests/traefik-config.yaml" - src: templates/traefik-config.yaml + src: files/traefik-config.yaml owner: root group: root mode: 0600 @@ -11,15 +11,19 @@ - name: Create traefik namespace.
kubernetes.core.k8s: - name: traefik-system + name: "{{ k3s_traefik_namespace }}" api_version: v1 kind: Namespace state: present + +- name: Create Basic authentication credentials + include_tasks: create_basic_auth_credentials.yml + - name: Configura Traefik kubernetes.core.k8s: definition: "{{ lookup('template', 'templates/' + item ) }}" state: present with_items: - - redirect_middleware.yml - - basicauth_middleware.yml - - traefik_dashboard.yml + - redirect_middleware.yml.j2 + - basicauth_middleware.yml.j2 + - traefik_dashboard.yml.j2 diff --git a/ansible/roles/traefik/templates/basicauth_middleware.yml b/ansible/roles/traefik/templates/basicauth_middleware.yml.j2 similarity index 80% rename from ansible/roles/traefik/templates/basicauth_middleware.yml rename to ansible/roles/traefik/templates/basicauth_middleware.yml.j2 index cec52c31..98b6fe09 100644 --- a/ansible/roles/traefik/templates/basicauth_middleware.yml +++ b/ansible/roles/traefik/templates/basicauth_middleware.yml.j2 @@ -7,10 +7,10 @@ apiVersion: v1 kind: Secret metadata: name: basic-auth-secret - namespace: traefik-system + namespace: {{ k3s_traefik_namespace }} data: users: |2 - b3NzOiRhcHIxJDNlZTVURy83JFpmY1NRQlV6SFpIMFZTak9NZGJ5UDANCg0K + {{ traefik_auth_htpasswd_pair }} --- # Basic-auth middleware @@ -18,7 +18,7 @@ apiVersion: traefik.containo.us/v1alpha1 kind: Middleware metadata: name: basic-auth - namespace: traefik-system + namespace: {{ k3s_traefik_namespace }} spec: basicAuth: secret: basic-auth-secret diff --git a/ansible/roles/traefik/templates/redirect_middleware.yml b/ansible/roles/traefik/templates/redirect_middleware.yml.j2 similarity index 80% rename from ansible/roles/traefik/templates/redirect_middleware.yml rename to ansible/roles/traefik/templates/redirect_middleware.yml.j2 index 115ecada..b8402d58 100644 --- a/ansible/roles/traefik/templates/redirect_middleware.yml +++ b/ansible/roles/traefik/templates/redirect_middleware.yml.j2 @@ -4,7 +4,7 @@ apiVersion: 
traefik.containo.us/v1alpha1 kind: Middleware metadata: name: redirect - namespace: traefik-system + namespace: {{ k3s_traefik_namespace }} spec: redirectScheme: scheme: https diff --git a/ansible/roles/traefik/templates/traefik_dashboard.yml b/ansible/roles/traefik/templates/traefik_dashboard.yml.j2 similarity index 83% rename from ansible/roles/traefik/templates/traefik_dashboard.yml rename to ansible/roles/traefik/templates/traefik_dashboard.yml.j2 index d6d0c1e4..42c423e9 100644 --- a/ansible/roles/traefik/templates/traefik_dashboard.yml +++ b/ansible/roles/traefik/templates/traefik_dashboard.yml.j2 @@ -31,17 +31,17 @@ metadata: # Enable TLS traefik.ingress.kubernetes.io/router.tls: "true" # Use Basic Auth Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-basic-auth@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-basic-auth@kubernetescrd # Enable cert-manager to create automatically the SSL certificate and store in Secret cert-manager.io/cluster-issuer: self-signed-issuer cert-manager.io/common-name: traefik spec: tls: - hosts: - - traefik.picluster.ricsanfre.com + - {{ traefik_dashboard_dns }} secretName: prometheus-tls rules: - - host: traefik.picluster.ricsanfre.com + - host: {{ traefik_dashboard_dns }} http: paths: - path: / @@ -61,12 +61,12 @@ metadata: namespace: kube-system annotations: # Use redirect Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-redirect@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-redirect@kubernetescrd # HTTP as entrypoint traefik.ingress.kubernetes.io/router.entrypoints: web spec: rules: - - host: traefik.picluster.ricsanfre.com + - host: {{ traefik_dashboard_dns }} http: paths: - path: / diff --git a/ansible/tasks/cleaning.yml b/ansible/tasks/cleaning.yml index a899d6d2..d20f4211 100644 --- a/ansible/tasks/cleaning.yml +++ b/ansible/tasks/cleaning.yml @@ -19,10 
+19,12 @@ shell: "rm -rf /var/log/pods /var/log/containers" args: executable: /bin/bash + ignore_errors: true changed_when: true - name: Clean fluentd pos files shell: "rm /var/log/*.pos" args: executable: /bin/bash + ignore_errors: true changed_when: true diff --git a/documentation/gateway.md b/documentation/gateway.md index ce5ac9de..2b3e3d38 100644 --- a/documentation/gateway.md +++ b/documentation/gateway.md @@ -126,9 +126,204 @@ Package can be installed with apt: And it can be configured using command line or a configuration file `/etc/nftables.conf`. -``` -TBD: CONTENT nftables.conf -``` + +As a modular example: + +- Global Configuration File + + `/etc/nftables.conf` + ``` + #!/usr/sbin/nft -f + # Ansible managed + + # clean + flush ruleset + + include "/etc/nftables.d/defines.nft" + + table inet filter { + chain global { + # 005 state management + ct state established,related accept + ct state invalid drop + } + include "/etc/nftables.d/sets.nft" + include "/etc/nftables.d/filter-input.nft" + include "/etc/nftables.d/filter-output.nft" + include "/etc/nftables.d/filter-forward.nft" + } + + # Additionnal table for Network Address Translation (NAT) + table ip nat { + include "/etc/nftables.d/sets.nft" + include "/etc/nftables.d/nat-prerouting.nft" + include "/etc/nftables.d/nat-postrouting.nft" + } + + ``` +- Variables Variables containing the IP address and ports to be used by the rules files + + `/etc/nftables.d/defines.nft` + ``` + # broadcast and multicast + define badcast_addr = { 255.255.255.255, 224.0.0.1, 224.0.0.251 } + + # broadcast and multicast + define ip6_badcast_addr = { ff02::16 } + + # in_tcp_accept + define in_tcp_accept = { ssh, https, http } + + # in_udp_accept + define in_udp_accept = { snmp, domain, ntp, bootps } + + # out_tcp_accept + define out_tcp_accept = { http, https, ssh } + + # out_udp_accept + define out_udp_accept = { domain, bootps , ntp } + + # lan_interface + define lan_interface = eth0 + + # wan_interface + define 
wan_interface = wlan0 + + # lan_network + define lan_network = 10.0.0.0/24 + + # forward_tcp_accept + define forward_tcp_accept = { http, https, ssh } + + # forward_udp_accept + define forward_udp_accept = { domain, ntp } + + ``` +- Nftables typed and tagged variables, [sets](https://wiki.nftables.org/wiki-nftables/index.php/Sets). + + `/etc/nftables.d/sets.nft` + ``` + set blackhole { + type ipv4_addr; + elements = $badcast_addr + } + + set forward_tcp_accept { + type inet_service; flags interval; + elements = $forward_tcp_accept + } + + set forward_udp_accept { + type inet_service; flags interval; + elements = $forward_udp_accept + } + + set in_tcp_accept { + type inet_service; flags interval; + elements = $in_tcp_accept + } + + set in_udp_accept { + type inet_service; flags interval; + elements = $in_udp_accept + } + + set ip6blackhole { + type ipv6_addr; + elements = $ip6_badcast_addr + } + + set out_tcp_accept { + type inet_service; flags interval; + elements = $out_tcp_accept + } + + set out_udp_accept { + type inet_service; flags interval; + elements = $out_udp_accept + } + + ``` +- Input traffic filtering rules + + `/etc/nftables.d/filter-input.nft` + ``` + chain input { + # 000 policy + type filter hook input priority 0; policy drop; + # 005 global + jump global + # 010 drop unwanted + # (none) + # 011 drop unwanted ipv6 + # (none) + # 015 localhost + iif lo accept + # 050 icmp + meta l4proto {icmp,icmpv6} accept + # 200 input udp accepted + udp dport @in_udp_accept ct state new accept + # 210 input tcp accepted + tcp dport @in_tcp_accept ct state new accept + } + + ``` + +- Output traffic filtering rules + + `/etc/nftables.d/filter-output.nft` + ``` + chain output { + # 000 policy: Allow any output traffic + type filter hook output priority 0; + } + ``` + +- Forwarding traffic rules + + `/etc/nftables.d/filter-forward.nft` + ``` + chain forward { + # 000 policy + type filter hook forward priority 0; policy drop; + # 005 global + jump global + # 200 lan to 
wan tcp + iifname $lan_interface ip saddr $lan_network oifname $wan_interface tcp dport @forward_tcp_accept ct state new accept + # 210 lan to wan udp + iifname $lan_interface ip saddr $lan_network oifname $wan_interface udp dport @forward_udp_accept ct state new accept + # 220 ssh from wan + iifname $wan_interface oifname $lan_interface ip daddr $lan_network tcp dport ssh ct state new accept + # 230 http from wan + iifname $wan_interface oifname $lan_interface ip daddr $lan_network tcp dport {http, https} ct state new accept + } + + ``` + +- NAT pre-routing rules + + `/etc/nftables.d/nat-prerouting.nft` + ``` + chain prerouting { + # 000 policy + type nat hook prerouting priority 0; + } + + ``` + +- NAT post-routing rules + `/etc/nftables.d/nat-postrouting.nft` + ``` + chain postrouting { + # 000 policy + type nat hook postrouting priority 100; + # 005 masquerade lan to wan + ip saddr $lan_network oifname $wan_interface masquerade + } + + ``` + +
diff --git a/documentation/installing_k3s.md b/documentation/installing_k3s.md index ce6e2b37..0fb24c64 100644 --- a/documentation/installing_k3s.md +++ b/documentation/installing_k3s.md @@ -35,12 +35,14 @@ Enable cgroup via boot commandline if not already enabled for Ubuntu on a Raspbe - Step 1: Installing K3S control plane node For installing the master node execute the following command: ``` - curl -sfL https://get.k3s.io | K3S_TOKEN= sh -s - server --write-kubeconfig-mode '0644' --node-taint 'node-role.kubernetes.io/master=true:NoSchedule' --disable 'servicelb' + curl -sfL https://get.k3s.io | K3S_TOKEN= sh -s - server --write-kubeconfig-mode '0644' --node-taint 'node-role.kubernetes.io/master=true:NoSchedule' --disable 'servicelb' --kube-controller-manager-arg 'bind-address=0.0.0.0' --kube-controller-manager-arg 'address=0.0.0.0' --kube-proxy-arg 'metrics-bind-address=0.0.0.0' --kube-scheduler-arg 'bind-address=0.0.0.0' --kube-scheduler-arg 'address=0.0.0.0' ``` - **server_token** is shared secret within the cluster for allowing connection of worker nodes - **--write-kubeconfig-mode '0644'** gives read permissions to kubeconfig file located in `/etc/rancher/k3s/k3s.yaml` - **--node-taint 'node-role.kubernetes.io/master=true:NoSchedule'** makes master node not schedulable to run any pod. Only pods marked with specific tolerance will be scheduled on master node. - **--disable servicelb** to disable default service load balancer installed by K3S (Klipper Load Balancer) +- **--kube-controller-manager-arg**, **--kube-scheduler-arg** and **--kube-proxy-arg** to bind those components to all interfaces (0.0.0.0) instead of only 127.0.0.1, enabling metrics scraping from external nodes.
+ > NOTE 1: diff --git a/documentation/monitoring.md b/documentation/monitoring.md index af76e2b0..f342bb4e 100644 --- a/documentation/monitoring.md +++ b/documentation/monitoring.md @@ -32,7 +32,7 @@ Kube-prometheus stack can be installed using helm [kube-prometheus-stack](https: ``` kubectl create namespace monitoring ``` -- Step 3: Create values.yml for configuring VolumeClaimTemplates using longhorn and Grafana's admin password and list of plugins to be installed +- Step 3: Create values.yml for configuring VolumeClaimTemplates using longhorn and Grafana's admin password, list of plugins to be installed and disabling the monitoring of Kubernetes components (Scheduler, Controller Manager and Proxy). See issue [#22](https://github.com/ricsanfre/pi-cluster/issues/22) ```yml alertmanager: @@ -61,6 +61,16 @@ Kube-prometheus stack can be installed using helm [kube-prometheus-stack](https: # List of grafana plugins to be installed plugins: - grafana-piechart-panel + kubeApiServer: + enabled: true + kubeControllerManager: + enabled: false + kubeScheduler: + enabled: false + kubeProxy: + enabled: false + kubeEtcd: + enabled: false ```yml - Step 3: Install kube-Prometheus-stack in the monitoring namespace with the overriden values @@ -272,6 +282,95 @@ spec: kubectl apply -f prometheus_ingress.yml grafana_ingress.yml alertmanager_ingress.yml + +## K3S components monitoring + +The default resources created by kube-prometheus-operator (headless service, service monitor and Grafana dashboards) are not valid for monitoring the K3S Kubernetes components (Scheduler, Controller Manager and Proxy), because K3S emits the same metrics on all three endpoints. Scraping all of them makes Prometheus consume a lot of memory, which can cause worker node outages. See issue [#22](https://github.com/ricsanfre/pi-cluster/issues/22) for more details. + + +- Create a manifest file `k3s-metrics-service.yml` for creating the Kubernetes service used by Prometheus to scrape K3S metrics.
+ + This service must be a [headless service](https://kubernetes.io/docs/concepts/services-networking/service/#headless-services), so that the Prometheus service discovery process can find each of the endpoints behind the service. Since the metrics are exposed not by a pod but by a k3s process, the service needs to be defined [`without selector`](https://kubernetes.io/docs/concepts/services-networking/service/#services-without-selectors) and the `endpoints` must be defined explicitly. + + The service will use the k3s-proxy endpoint (TCP port 10249) for scraping all metrics. + + ```yml + --- + # Headless service for K3S metrics. No Selector + apiVersion: v1 + kind: Service + metadata: + name: k3s-metrics-service + labels: + app: k3s-metrics + namespace: kube-system + spec: + clusterIP: None + ports: + - name: http-metrics + port: 10249 + protocol: TCP + targetPort: 10249 + type: ClusterIP + + --- + # Endpoint for the headless service without selector + apiVersion: v1 + kind: Endpoints + metadata: + name: k3s-metrics-service + namespace: kube-system + subsets: + - addresses: + - ip: 10.0.0.11 + ports: + - name: http-metrics + port: 10249 + protocol: TCP + ``` + +- Create a manifest file defining the service monitor resource to let Prometheus discover this target + + The Prometheus custom resource definition (CRD), `ServiceMonitor`, will be used to automatically discover the K3S metrics endpoint as a Prometheus target.
+ + ```yml + apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + labels: + app: k3s + release: kube-prometheus-stack + name: k3s-prometheus-servicemonitor + namespace: k3s-monitoring + spec: + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app: k3s-metrics + endpoints: + - port: http-metrics + path: /metrics + ``` + + +- Apply manifest files + + kubectl apply -f k3s-metrics-service.yml -f k3s-servicemonitor.yml + +- Check target is automatically discovered in Prometheus UI + + http://prometheus.picluster.ricsanfre.com/targets + +### K3S Grafana dashboards + +Kubernetes-controller-manager, kubernetes-proxy and kubernetes-scheduler dashboards can be downloaded from grafana.com: + +- Kube Proxy: https://grafana.com/grafana/dashboards/12129 +- Kube Controller Manager: https://grafana.com/grafana/dashboards/12122 +- Kube Scheduler: https://grafana.com/grafana/dashboards/12130 + ## Traefik Monitoring The Prometheus custom resource definition (CRD), `ServiceMonitoring` will be used to automatically discover Traefik metrics endpoint as a Prometheus target.