From 7e59be9686f795f86dc3d6aad828ba71d1affc8c Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Wed, 1 Dec 2021 17:06:05 +0100 Subject: [PATCH 01/17] Refactoring: add jinja2 extension to kubernetes manifest templates --- ansible/roles/certmanager/tasks/main.yml | 2 +- ...{selfsigned_issuer.yml => selfsigned_issuer.yml.j2} | 0 ansible/roles/logging/k3s/tasks/main.yml | 10 +++++----- .../{elasticsearch.yml => elasticsearch.yml.j2} | 0 ...icsearch_secret.yml => elasticsearch_secret.yml.j2} | 0 .../k3s/templates/{filebeat.yml => filebeat.yml.j2} | 0 .../k3s/templates/{fluentd.yml => fluentd.yml.j2} | 0 .../k3s/templates/{kibana.yml => kibana.yml.j2} | 0 .../{kibana_ingress.yml => kibana_ingress.yml.j2} | 0 ansible/roles/longhorn/tasks/main.yml | 2 +- .../{longhorn_ingress.yml => longhorn_ingress.yml.j2} | 0 ansible/roles/prometheus/tasks/main.yml | 10 +++++----- ...manager_ingress.yml => alertmanager_ingress.yml.j2} | 0 .../{grafana_ingress.yml => grafana_ingress.yml.j2} | 0 ...rvicemonitor.yml => longhorn_servicemonitor.yml.j2} | 0 ...rometheus_ingress.yml => prometheus_ingress.yml.j2} | 0 ...ervicemonitor.yml => traefik_servicemonitor.yml.j2} | 0 ansible/roles/traefik/tasks/main.yml | 6 +++--- ...auth_middleware.yml => basicauth_middleware.yml.j2} | 0 ...irect_middleware.yml => redirect_middleware.yml.j2} | 0 ...{traefik_dashboard.yml => traefik_dashboard.yml.j2} | 0 21 files changed, 15 insertions(+), 15 deletions(-) rename ansible/roles/certmanager/templates/{selfsigned_issuer.yml => selfsigned_issuer.yml.j2} (100%) rename ansible/roles/logging/k3s/templates/{elasticsearch.yml => elasticsearch.yml.j2} (100%) rename ansible/roles/logging/k3s/templates/{elasticsearch_secret.yml => elasticsearch_secret.yml.j2} (100%) rename ansible/roles/logging/k3s/templates/{filebeat.yml => filebeat.yml.j2} (100%) rename ansible/roles/logging/k3s/templates/{fluentd.yml => fluentd.yml.j2} (100%) rename ansible/roles/logging/k3s/templates/{kibana.yml => kibana.yml.j2} (100%) rename 
ansible/roles/logging/k3s/templates/{kibana_ingress.yml => kibana_ingress.yml.j2} (100%) rename ansible/roles/longhorn/templates/{longhorn_ingress.yml => longhorn_ingress.yml.j2} (100%) rename ansible/roles/prometheus/templates/{alertmanager_ingress.yml => alertmanager_ingress.yml.j2} (100%) rename ansible/roles/prometheus/templates/{grafana_ingress.yml => grafana_ingress.yml.j2} (100%) rename ansible/roles/prometheus/templates/{longhorn_servicemonitor.yml => longhorn_servicemonitor.yml.j2} (100%) rename ansible/roles/prometheus/templates/{prometheus_ingress.yml => prometheus_ingress.yml.j2} (100%) rename ansible/roles/prometheus/templates/{traefik_servicemonitor.yml => traefik_servicemonitor.yml.j2} (100%) rename ansible/roles/traefik/templates/{basicauth_middleware.yml => basicauth_middleware.yml.j2} (100%) rename ansible/roles/traefik/templates/{redirect_middleware.yml => redirect_middleware.yml.j2} (100%) rename ansible/roles/traefik/templates/{traefik_dashboard.yml => traefik_dashboard.yml.j2} (100%) diff --git a/ansible/roles/certmanager/tasks/main.yml b/ansible/roles/certmanager/tasks/main.yml index 7f31b8e9..485c5026 100644 --- a/ansible/roles/certmanager/tasks/main.yml +++ b/ansible/roles/certmanager/tasks/main.yml @@ -26,4 +26,4 @@ definition: "{{ lookup('template', 'templates/' + item ) }}" state: present with_items: - - selfsigned_issuer.yml + - selfsigned_issuer.yml.j2 diff --git a/ansible/roles/certmanager/templates/selfsigned_issuer.yml b/ansible/roles/certmanager/templates/selfsigned_issuer.yml.j2 similarity index 100% rename from ansible/roles/certmanager/templates/selfsigned_issuer.yml rename to ansible/roles/certmanager/templates/selfsigned_issuer.yml.j2 diff --git a/ansible/roles/logging/k3s/tasks/main.yml b/ansible/roles/logging/k3s/tasks/main.yml index f1cbb7db..9e6e16e6 100644 --- a/ansible/roles/logging/k3s/tasks/main.yml +++ b/ansible/roles/logging/k3s/tasks/main.yml @@ -32,8 +32,8 @@ definition: "{{ lookup('template', 'templates/' + item ) 
}}" state: present with_items: - - elasticsearch_secret.yml - - elasticsearch.yml - - kibana.yml - - fluentd.yml - - kibana_ingress.yml + - elasticsearch_secret.yml.j2 + - elasticsearch.yml.j2 + - kibana.yml.j2 + - fluentd.yml.j2 + - kibana_ingress.yml.j2 diff --git a/ansible/roles/logging/k3s/templates/elasticsearch.yml b/ansible/roles/logging/k3s/templates/elasticsearch.yml.j2 similarity index 100% rename from ansible/roles/logging/k3s/templates/elasticsearch.yml rename to ansible/roles/logging/k3s/templates/elasticsearch.yml.j2 diff --git a/ansible/roles/logging/k3s/templates/elasticsearch_secret.yml b/ansible/roles/logging/k3s/templates/elasticsearch_secret.yml.j2 similarity index 100% rename from ansible/roles/logging/k3s/templates/elasticsearch_secret.yml rename to ansible/roles/logging/k3s/templates/elasticsearch_secret.yml.j2 diff --git a/ansible/roles/logging/k3s/templates/filebeat.yml b/ansible/roles/logging/k3s/templates/filebeat.yml.j2 similarity index 100% rename from ansible/roles/logging/k3s/templates/filebeat.yml rename to ansible/roles/logging/k3s/templates/filebeat.yml.j2 diff --git a/ansible/roles/logging/k3s/templates/fluentd.yml b/ansible/roles/logging/k3s/templates/fluentd.yml.j2 similarity index 100% rename from ansible/roles/logging/k3s/templates/fluentd.yml rename to ansible/roles/logging/k3s/templates/fluentd.yml.j2 diff --git a/ansible/roles/logging/k3s/templates/kibana.yml b/ansible/roles/logging/k3s/templates/kibana.yml.j2 similarity index 100% rename from ansible/roles/logging/k3s/templates/kibana.yml rename to ansible/roles/logging/k3s/templates/kibana.yml.j2 diff --git a/ansible/roles/logging/k3s/templates/kibana_ingress.yml b/ansible/roles/logging/k3s/templates/kibana_ingress.yml.j2 similarity index 100% rename from ansible/roles/logging/k3s/templates/kibana_ingress.yml rename to ansible/roles/logging/k3s/templates/kibana_ingress.yml.j2 diff --git a/ansible/roles/longhorn/tasks/main.yml b/ansible/roles/longhorn/tasks/main.yml index 
845dfc29..d8f68f75 100644 --- a/ansible/roles/longhorn/tasks/main.yml +++ b/ansible/roles/longhorn/tasks/main.yml @@ -27,7 +27,7 @@ definition: "{{ lookup('template', 'templates/' + item ) }}" state: present with_items: - - longhorn_ingress.yml + - longhorn_ingress.yml.j2 - name: Remove Local-Path as default storage class command: diff --git a/ansible/roles/longhorn/templates/longhorn_ingress.yml b/ansible/roles/longhorn/templates/longhorn_ingress.yml.j2 similarity index 100% rename from ansible/roles/longhorn/templates/longhorn_ingress.yml rename to ansible/roles/longhorn/templates/longhorn_ingress.yml.j2 diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml index 833fb6b4..854077c1 100644 --- a/ansible/roles/prometheus/tasks/main.yml +++ b/ansible/roles/prometheus/tasks/main.yml @@ -48,17 +48,17 @@ definition: "{{ lookup('template', 'templates/' + item ) }}" state: present with_items: - - prometheus_ingress.yml - - grafana_ingress.yml - - alertmanager_ingress.yml + - prometheus_ingress.yml.j2 + - grafana_ingress.yml.j2 + - alertmanager_ingress.yml.j2 - name: Configure Service Monitors kubernetes.core.k8s: definition: "{{ lookup('template', 'templates/' + item ) }}" state: present with_items: - - traefik_servicemonitor.yml - - longhorn_servicemonitor.yml + - traefik_servicemonitor.yml.j2 + - longhorn_servicemonitor.yml.j2 - name: Configure Grafana Dashboards include_tasks: configure_grafana_dashboards.yml diff --git a/ansible/roles/prometheus/templates/alertmanager_ingress.yml b/ansible/roles/prometheus/templates/alertmanager_ingress.yml.j2 similarity index 100% rename from ansible/roles/prometheus/templates/alertmanager_ingress.yml rename to ansible/roles/prometheus/templates/alertmanager_ingress.yml.j2 diff --git a/ansible/roles/prometheus/templates/grafana_ingress.yml b/ansible/roles/prometheus/templates/grafana_ingress.yml.j2 similarity index 100% rename from ansible/roles/prometheus/templates/grafana_ingress.yml rename 
to ansible/roles/prometheus/templates/grafana_ingress.yml.j2 diff --git a/ansible/roles/prometheus/templates/longhorn_servicemonitor.yml b/ansible/roles/prometheus/templates/longhorn_servicemonitor.yml.j2 similarity index 100% rename from ansible/roles/prometheus/templates/longhorn_servicemonitor.yml rename to ansible/roles/prometheus/templates/longhorn_servicemonitor.yml.j2 diff --git a/ansible/roles/prometheus/templates/prometheus_ingress.yml b/ansible/roles/prometheus/templates/prometheus_ingress.yml.j2 similarity index 100% rename from ansible/roles/prometheus/templates/prometheus_ingress.yml rename to ansible/roles/prometheus/templates/prometheus_ingress.yml.j2 diff --git a/ansible/roles/prometheus/templates/traefik_servicemonitor.yml b/ansible/roles/prometheus/templates/traefik_servicemonitor.yml.j2 similarity index 100% rename from ansible/roles/prometheus/templates/traefik_servicemonitor.yml rename to ansible/roles/prometheus/templates/traefik_servicemonitor.yml.j2 diff --git a/ansible/roles/traefik/tasks/main.yml b/ansible/roles/traefik/tasks/main.yml index 68fa5315..057bbc99 100644 --- a/ansible/roles/traefik/tasks/main.yml +++ b/ansible/roles/traefik/tasks/main.yml @@ -20,6 +20,6 @@ definition: "{{ lookup('template', 'templates/' + item ) }}" state: present with_items: - - redirect_middleware.yml - - basicauth_middleware.yml - - traefik_dashboard.yml + - redirect_middleware.yml.j2 + - basicauth_middleware.yml.j2 + - traefik_dashboard.yml.j2 diff --git a/ansible/roles/traefik/templates/basicauth_middleware.yml b/ansible/roles/traefik/templates/basicauth_middleware.yml.j2 similarity index 100% rename from ansible/roles/traefik/templates/basicauth_middleware.yml rename to ansible/roles/traefik/templates/basicauth_middleware.yml.j2 diff --git a/ansible/roles/traefik/templates/redirect_middleware.yml b/ansible/roles/traefik/templates/redirect_middleware.yml.j2 similarity index 100% rename from ansible/roles/traefik/templates/redirect_middleware.yml rename to 
ansible/roles/traefik/templates/redirect_middleware.yml.j2 diff --git a/ansible/roles/traefik/templates/traefik_dashboard.yml b/ansible/roles/traefik/templates/traefik_dashboard.yml.j2 similarity index 100% rename from ansible/roles/traefik/templates/traefik_dashboard.yml rename to ansible/roles/traefik/templates/traefik_dashboard.yml.j2 From 807a30f23ae080cc7b37be8193384f125f8bee53 Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Wed, 1 Dec 2021 17:17:24 +0100 Subject: [PATCH 02/17] Adding CI workflow: triggering Yamllint validation with pull requests on any branch and push requests on master branch --- .github/workflows/ci.yml | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/workflows/ci.yml diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 00000000..81c0aa83 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,29 @@ +--- +name: CI +'on': + pull_request: + push: + branches: + - master + +jobs: + + lint: + name: Lint + runs-on: ubuntu-latest + + steps: + - name: Check out the codebase. + uses: actions/checkout@v2 + + - name: Set up Python 3. + uses: actions/setup-python@v2 + with: + python-version: '3.x' + + - name: Install test dependencies. + run: pip3 install yamllint + + - name: Lint all the YAMLs. + working-directory: ./ansible + run: yamllint . 
From df46754c7bb8a8c3efcb080297cb4d454e7608c8 Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Wed, 1 Dec 2021 17:29:06 +0100 Subject: [PATCH 03/17] Making cert-manager namespace configurable, removing chart_version from Helm and updating Helm repo cache --- ansible/roles/certmanager/defaults/main.yml | 3 +++ ansible/roles/certmanager/tasks/main.yml | 6 +++--- 2 files changed, 6 insertions(+), 3 deletions(-) create mode 100644 ansible/roles/certmanager/defaults/main.yml diff --git a/ansible/roles/certmanager/defaults/main.yml b/ansible/roles/certmanager/defaults/main.yml new file mode 100644 index 00000000..c136a589 --- /dev/null +++ b/ansible/roles/certmanager/defaults/main.yml @@ -0,0 +1,3 @@ +--- +# Namespace for cert-manager +k3s_certmanager_namespace: certmanager-system diff --git a/ansible/roles/certmanager/tasks/main.yml b/ansible/roles/certmanager/tasks/main.yml index 485c5026..9884e648 100644 --- a/ansible/roles/certmanager/tasks/main.yml +++ b/ansible/roles/certmanager/tasks/main.yml @@ -1,7 +1,7 @@ --- - name: Create cert-manager namespace. 
kubernetes.core.k8s: - name: certmanager-system + name: "{{ k3s_certmanager_namespace }}" api_version: v1 kind: Namespace state: present @@ -15,8 +15,8 @@ kubernetes.core.helm: name: certmanager chart_ref: jetstack/cert-manager - chart_version: "1.5.3" - release_namespace: certmanager-system + update_repo_cache: true + release_namespace: "{{ k3s_certmanager_namespace }}" state: present release_values: installCRDs: true From fbf429504578a83708d65a3eed81697e371a8bc1 Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Wed, 1 Dec 2021 17:57:22 +0100 Subject: [PATCH 04/17] Making traefik namespace and dashboard dns configurable --- ansible/roles/traefik/defaults/main.yml | 6 ++++++ .../traefik/templates/basicauth_middleware.yml.j2 | 4 ++-- .../roles/traefik/templates/redirect_middleware.yml.j2 | 2 +- .../roles/traefik/templates/traefik_dashboard.yml.j2 | 10 +++++----- 4 files changed, 14 insertions(+), 8 deletions(-) create mode 100644 ansible/roles/traefik/defaults/main.yml diff --git a/ansible/roles/traefik/defaults/main.yml b/ansible/roles/traefik/defaults/main.yml new file mode 100644 index 00000000..aedd2b75 --- /dev/null +++ b/ansible/roles/traefik/defaults/main.yml @@ -0,0 +1,6 @@ +--- +# Namespace for traefik +k3s_traefik_namespace: traefik-system + +# Endpoint Service DNS name +traefik_dashboard_dns: traefik.picluster.ricsanfre.com diff --git a/ansible/roles/traefik/templates/basicauth_middleware.yml.j2 b/ansible/roles/traefik/templates/basicauth_middleware.yml.j2 index cec52c31..520b0ad9 100644 --- a/ansible/roles/traefik/templates/basicauth_middleware.yml.j2 +++ b/ansible/roles/traefik/templates/basicauth_middleware.yml.j2 @@ -7,7 +7,7 @@ apiVersion: v1 kind: Secret metadata: name: basic-auth-secret - namespace: traefik-system + namespace: {{ k3s_traefik_namespace }} data: users: |2 b3NzOiRhcHIxJDNlZTVURy83JFpmY1NRQlV6SFpIMFZTak9NZGJ5UDANCg0K @@ -18,7 +18,7 @@ apiVersion: traefik.containo.us/v1alpha1 kind: Middleware metadata: name: basic-auth - namespace: 
traefik-system + namespace: {{ k3s_traefik_namespace }} spec: basicAuth: secret: basic-auth-secret diff --git a/ansible/roles/traefik/templates/redirect_middleware.yml.j2 b/ansible/roles/traefik/templates/redirect_middleware.yml.j2 index 115ecada..b8402d58 100644 --- a/ansible/roles/traefik/templates/redirect_middleware.yml.j2 +++ b/ansible/roles/traefik/templates/redirect_middleware.yml.j2 @@ -4,7 +4,7 @@ apiVersion: traefik.containo.us/v1alpha1 kind: Middleware metadata: name: redirect - namespace: traefik-system + namespace: {{ k3s_traefik_namespace }} spec: redirectScheme: scheme: https diff --git a/ansible/roles/traefik/templates/traefik_dashboard.yml.j2 b/ansible/roles/traefik/templates/traefik_dashboard.yml.j2 index d6d0c1e4..42c423e9 100644 --- a/ansible/roles/traefik/templates/traefik_dashboard.yml.j2 +++ b/ansible/roles/traefik/templates/traefik_dashboard.yml.j2 @@ -31,17 +31,17 @@ metadata: # Enable TLS traefik.ingress.kubernetes.io/router.tls: "true" # Use Basic Auth Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-basic-auth@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-basic-auth@kubernetescrd # Enable cert-manager to create automatically the SSL certificate and store in Secret cert-manager.io/cluster-issuer: self-signed-issuer cert-manager.io/common-name: traefik spec: tls: - hosts: - - traefik.picluster.ricsanfre.com + - {{ traefik_dashboard_dns }} secretName: prometheus-tls rules: - - host: traefik.picluster.ricsanfre.com + - host: {{ traefik_dashboard_dns }} http: paths: - path: / @@ -61,12 +61,12 @@ metadata: namespace: kube-system annotations: # Use redirect Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-redirect@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-redirect@kubernetescrd # HTTP as entrypoint traefik.ingress.kubernetes.io/router.entrypoints: web spec: rules: - - 
host: traefik.picluster.ricsanfre.com + - host: {{ traefik_dashboard_dns }} http: paths: - path: / From b39fa8b30082a9840281c927776a2e446725acba Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Wed, 1 Dec 2021 18:02:03 +0100 Subject: [PATCH 05/17] Moving traefik overriden helm deployment to files --- .../roles/traefik/{templates => files}/traefik-config.yaml | 0 ansible/roles/traefik/tasks/main.yml | 5 +++-- 2 files changed, 3 insertions(+), 2 deletions(-) rename ansible/roles/traefik/{templates => files}/traefik-config.yaml (100%) diff --git a/ansible/roles/traefik/templates/traefik-config.yaml b/ansible/roles/traefik/files/traefik-config.yaml similarity index 100% rename from ansible/roles/traefik/templates/traefik-config.yaml rename to ansible/roles/traefik/files/traefik-config.yaml diff --git a/ansible/roles/traefik/tasks/main.yml b/ansible/roles/traefik/tasks/main.yml index 057bbc99..893cc5dc 100644 --- a/ansible/roles/traefik/tasks/main.yml +++ b/ansible/roles/traefik/tasks/main.yml @@ -3,7 +3,7 @@ - name: Configure K3S embedded Traefik Helm Chart copy: dest: "/var/lib/rancher/k3s/server/manifests/traefik-config.yaml" - src: templates/traefik-config.yaml + src: files/traefik-config.yaml owner: root group: root mode: 0600 @@ -11,10 +11,11 @@ - name: Create traefik namespace. 
kubernetes.core.k8s: - name: traefik-system + name: "{{ k3s_traefik_namespace }}" api_version: v1 kind: Namespace state: present + - name: Configura Traefik kubernetes.core.k8s: definition: "{{ lookup('template', 'templates/' + item ) }}" From 7a6cb07053ed13f843ebbc012a0ecb4eccb65266 Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Thu, 2 Dec 2021 15:21:18 +0100 Subject: [PATCH 06/17] Putting clusterissuer resource in the proper namespace --- ansible/roles/certmanager/templates/selfsigned_issuer.yml.j2 | 1 + 1 file changed, 1 insertion(+) diff --git a/ansible/roles/certmanager/templates/selfsigned_issuer.yml.j2 b/ansible/roles/certmanager/templates/selfsigned_issuer.yml.j2 index 46d0f8a7..77f907a5 100644 --- a/ansible/roles/certmanager/templates/selfsigned_issuer.yml.j2 +++ b/ansible/roles/certmanager/templates/selfsigned_issuer.yml.j2 @@ -3,5 +3,6 @@ apiVersion: cert-manager.io/v1 kind: ClusterIssuer metadata: name: self-signed-issuer + namespace: {{ k3s_certmanager_namespace }} spec: selfSigned: {} From d6c042df7c74c798ee5a1d717191fb87f215e926 Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Thu, 2 Dec 2021 15:22:57 +0100 Subject: [PATCH 07/17] Making metallb namespace configurable and updating Helm repo cache before deploying chart --- ansible/roles/metallb/defaults/main.yml | 6 ++++++ ansible/roles/metallb/tasks/main.yml | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) create mode 100644 ansible/roles/metallb/defaults/main.yml diff --git a/ansible/roles/metallb/defaults/main.yml b/ansible/roles/metallb/defaults/main.yml new file mode 100644 index 00000000..d8150212 --- /dev/null +++ b/ansible/roles/metallb/defaults/main.yml @@ -0,0 +1,6 @@ +--- +# Namespace for metallb +k3s_metallb_namespace: metallb-system + +# k3s external ip range: Metal LB pool configuration +k3s_external_ip_range: diff --git a/ansible/roles/metallb/tasks/main.yml b/ansible/roles/metallb/tasks/main.yml index 74ef394c..29f34ae8 100644 --- a/ansible/roles/metallb/tasks/main.yml +++ 
b/ansible/roles/metallb/tasks/main.yml @@ -1,7 +1,7 @@ --- - name: Create Metal LB namespace. kubernetes.core.k8s: - name: metallb-system + name: "{{ k3s_metallb_namespace }}" api_version: v1 kind: Namespace state: present @@ -15,7 +15,8 @@ kubernetes.core.helm: name: metallb chart_ref: metallb/metallb - release_namespace: metallb-system + release_namespace: "{{ k3s_metallb_namespace }}" + update_repo_cache: true state: present release_values: configInline: From 046e33db08ea87ce5c98a95163656f89b9dcbb28 Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Thu, 2 Dec 2021 15:40:57 +0100 Subject: [PATCH 08/17] Making kibana dashboard dns endpoint configurable and updating helm repo cache before deploying --- ansible/roles/logging/k3s/defaults/main.yml | 3 +++ ansible/roles/logging/k3s/tasks/main.yml | 3 ++- ansible/roles/logging/k3s/templates/kibana_ingress.yml.j2 | 2 +- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/ansible/roles/logging/k3s/defaults/main.yml b/ansible/roles/logging/k3s/defaults/main.yml index 0c7304de..3a5128db 100644 --- a/ansible/roles/logging/k3s/defaults/main.yml +++ b/ansible/roles/logging/k3s/defaults/main.yml @@ -20,3 +20,6 @@ efk_fluentd_image: "fluent/fluentd-kubernetes-daemonset:v1.14-debian-elasticsear # Configured Timezone efk_node_timezone: "Europe/Madrid" + +# Endpoint Service DNS name +kibana_dashboard_dns: kibana.picluster.ricsanfre.com diff --git a/ansible/roles/logging/k3s/tasks/main.yml b/ansible/roles/logging/k3s/tasks/main.yml index 9e6e16e6..cb1ce7be 100644 --- a/ansible/roles/logging/k3s/tasks/main.yml +++ b/ansible/roles/logging/k3s/tasks/main.yml @@ -2,7 +2,7 @@ # namespace for hosting EFK stack - name: Create k3s-logging namespace. 
kubernetes.core.k8s: - name: k3s-logging + name: "{{ k3s_logging_namespace }}" api_version: v1 kind: Namespace state: present @@ -25,6 +25,7 @@ name: elastic-operator chart_ref: elastic/eck-operator release_namespace: elastic-system + update_repo_cache: true state: present - name: Deploy elasticsearch, kibana and fluentd diff --git a/ansible/roles/logging/k3s/templates/kibana_ingress.yml.j2 b/ansible/roles/logging/k3s/templates/kibana_ingress.yml.j2 index 5ddab642..6bbc606d 100644 --- a/ansible/roles/logging/k3s/templates/kibana_ingress.yml.j2 +++ b/ansible/roles/logging/k3s/templates/kibana_ingress.yml.j2 @@ -8,7 +8,7 @@ metadata: kubernetes.io/ingress.class: traefik spec: rules: - - host: kibana.picluster.ricsanfre.com + - host: {{ kibana_dashboard_dns }} http: paths: - path: / From 43c6ba114ccf0a8be31bc823bebb517c094c3088 Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Thu, 2 Dec 2021 16:03:47 +0100 Subject: [PATCH 09/17] Making longhorn dashboard dns endpoint and namespace configurable --- ansible/group_vars/k3s_cluster.yml | 8 ++++++++ ansible/roles/longhorn/defaults/main.yml | 6 ++++++ ansible/roles/longhorn/tasks/main.yml | 4 ++-- .../longhorn/templates/longhorn_ingress.yml.j2 | 14 +++++++------- 4 files changed, 23 insertions(+), 9 deletions(-) create mode 100644 ansible/roles/longhorn/defaults/main.yml diff --git a/ansible/group_vars/k3s_cluster.yml b/ansible/group_vars/k3s_cluster.yml index 90873385..fb5d7b37 100644 --- a/ansible/group_vars/k3s_cluster.yml +++ b/ansible/group_vars/k3s_cluster.yml @@ -10,5 +10,13 @@ k3s_token: s1cret0 k3s_server_extra_args: "--write-kubeconfig-mode '0644' --disable 'servicelb' --node-taint 'node-role.kubernetes.io/master=true:NoSchedule'" k3s_worker_extra_args: "--node-label 'node_type=worker'" +# Namespaces +k3s_metallb_namespace: metallb-system +k3s_traefik_namespace: traefik-system +k3s_longhorn_namespace: longhorn-system +k3s_certmanager_namespace: certmanager-system +k3s_logging_namespace: k3s-logging 
+k3s_monitoring_namespace: k3s-monitoring + # k3s external ip range: Metal LB pool configuration k3s_external_ip_range: "10.0.0.100-10.0.0.200" diff --git a/ansible/roles/longhorn/defaults/main.yml b/ansible/roles/longhorn/defaults/main.yml new file mode 100644 index 00000000..a9fd69ef --- /dev/null +++ b/ansible/roles/longhorn/defaults/main.yml @@ -0,0 +1,6 @@ +--- +# Namespace for Longhorn +k3s_longhorn_namespace: longhorn-system + +# Endpoint Service DNS name +longhorn_dashboard_dns: storage.picluster.ricsanfre.com diff --git a/ansible/roles/longhorn/tasks/main.yml b/ansible/roles/longhorn/tasks/main.yml index d8f68f75..20bacde4 100644 --- a/ansible/roles/longhorn/tasks/main.yml +++ b/ansible/roles/longhorn/tasks/main.yml @@ -1,7 +1,7 @@ --- - name: Create longhorn namespace. kubernetes.core.k8s: - name: longhorn-system + name: "{{ k3s_longhorn_namespace }}" api_version: v1 kind: Namespace state: present @@ -16,7 +16,7 @@ name: longhorn chart_ref: longhorn/longhorn update_repo_cache: true - release_namespace: longhorn-system + release_namespace: "{{ k3s_longhorn_namespace }}" state: present release_values: defaultSettings: diff --git a/ansible/roles/longhorn/templates/longhorn_ingress.yml.j2 b/ansible/roles/longhorn/templates/longhorn_ingress.yml.j2 index 77957106..76316e14 100644 --- a/ansible/roles/longhorn/templates/longhorn_ingress.yml.j2 +++ b/ansible/roles/longhorn/templates/longhorn_ingress.yml.j2 @@ -4,24 +4,24 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: longhorn-ingress - namespace: longhorn-system + namespace: {{ k3s_longhorn_namespace }} annotations: # HTTPS as entry point traefik.ingress.kubernetes.io/router.entrypoints: websecure # Enable TLS traefik.ingress.kubernetes.io/router.tls: "true" # Use Basic Auth Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-basic-auth@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-basic-auth@kubernetescrd # 
Enable cert-manager to create automatically the SSL certificate and store in Secret cert-manager.io/cluster-issuer: self-signed-issuer cert-manager.io/common-name: longhorn spec: tls: - hosts: - - storage.picluster.ricsanfre.com + - {{ longhorn_dashboard_dns }} secretName: storage-tls rules: - - host: storage.picluster.ricsanfre.com + - host: {{ longhorn_dashboard_dns }} http: paths: - path: / @@ -38,15 +38,15 @@ kind: Ingress apiVersion: networking.k8s.io/v1 metadata: name: longhorn-redirect - namespace: longhorn-system + namespace: {{ k3s_longhorn_namespace }} annotations: # Use redirect Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-redirect@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-redirect@kubernetescrd # HTTP as entrypoint traefik.ingress.kubernetes.io/router.entrypoints: web spec: rules: - - host: storage.picluster.ricsanfre.com + - host: {{ longhorn_dashboard_dns }} http: paths: - path: / From 495862b47fff767aadb3359e0ee639017d20774e Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Thu, 2 Dec 2021 16:31:28 +0100 Subject: [PATCH 10/17] Making prometheus dns endpoints and namespace configurable --- ansible/roles/prometheus/defaults/main.yml | 7 +++++++ ansible/roles/prometheus/tasks/main.yml | 5 +++-- .../templates/alertmanager_ingress.yml.j2 | 14 +++++++------- .../prometheus/templates/grafana_dashboard.yml.j2 | 2 +- .../prometheus/templates/grafana_ingress.yml.j2 | 12 ++++++------ .../templates/longhorn_servicemonitor.yml.j2 | 2 +- .../prometheus/templates/prometheus_ingress.yml.j2 | 14 +++++++------- .../templates/traefik_servicemonitor.yml.j2 | 2 +- 8 files changed, 33 insertions(+), 25 deletions(-) diff --git a/ansible/roles/prometheus/defaults/main.yml b/ansible/roles/prometheus/defaults/main.yml index b894f3ab..c9cbdfa8 100644 --- a/ansible/roles/prometheus/defaults/main.yml +++ b/ansible/roles/prometheus/defaults/main.yml @@ -1,5 +1,12 @@ --- 
+k3s_monitoring_namespace: k3s-monitoring + +# Endpoint Service DNS name +grafana_dashboard_dns: grafana.picluster.ricsanfre.com +prometheus_dashboard_dns: prometheus.picluster.ricsanfre.com +alertmanager_dashboard_dns: alertmanager.picluster.ricsanfre.com + # Storage Settings prometheus_storage_size: "5Gi" prometheus_storage_class: "longhorn" diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml index 854077c1..b7f3900e 100644 --- a/ansible/roles/prometheus/tasks/main.yml +++ b/ansible/roles/prometheus/tasks/main.yml @@ -1,7 +1,7 @@ --- - name: Create prometheus namespace. kubernetes.core.k8s: - name: k3s-monitoring + name: "{{ k3s_monitoring_namespace }}" api_version: v1 kind: Namespace state: present @@ -15,7 +15,8 @@ kubernetes.core.helm: name: kube-prometheus-stack chart_ref: prometheus-community/kube-prometheus-stack - release_namespace: k3s-monitoring + release_namespace: "{{ k3s_monitoring_namespace }}" + update_repo_cache: true state: present release_values: alertmanager: diff --git a/ansible/roles/prometheus/templates/alertmanager_ingress.yml.j2 b/ansible/roles/prometheus/templates/alertmanager_ingress.yml.j2 index 00c4d7ca..bbd77068 100644 --- a/ansible/roles/prometheus/templates/alertmanager_ingress.yml.j2 +++ b/ansible/roles/prometheus/templates/alertmanager_ingress.yml.j2 @@ -4,24 +4,24 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: alertmanager-ingress - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} annotations: # HTTPS as entry point traefik.ingress.kubernetes.io/router.entrypoints: websecure # Enable TLS traefik.ingress.kubernetes.io/router.tls: "true" # Use Basic Auth Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-basic-auth@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-basic-auth@kubernetescrd # Enable cert-manager to create automatically the SSL certificate and store in 
Secret cert-manager.io/cluster-issuer: self-signed-issuer cert-manager.io/common-name: alertmanager spec: tls: - hosts: - - alertmanager.picluster.ricsanfre.com + - {{ alertmanager_dashboard_dns }} secretName: prometheus-tls rules: - - host: alertmanager.picluster.ricsanfre.com + - host: {{ alertmanager_dashboard_dns }} http: paths: - path: / @@ -38,15 +38,15 @@ kind: Ingress apiVersion: networking.k8s.io/v1 metadata: name: alertmanager-redirect - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} annotations: # Use redirect Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-redirect@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-redirect@kubernetescrd # HTTP as entrypoint traefik.ingress.kubernetes.io/router.entrypoints: web spec: rules: - - host: alertmanager.picluster.ricsanfre.com + - host: {{ alertmanager_dashboard_dns }} http: paths: - path: / diff --git a/ansible/roles/prometheus/templates/grafana_dashboard.yml.j2 b/ansible/roles/prometheus/templates/grafana_dashboard.yml.j2 index 97a7139f..88e19f8a 100644 --- a/ansible/roles/prometheus/templates/grafana_dashboard.yml.j2 +++ b/ansible/roles/prometheus/templates/grafana_dashboard.yml.j2 @@ -3,7 +3,7 @@ apiVersion: v1 kind: ConfigMap metadata: name: {{ dashboard_name }} - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} labels: grafana_dashboard: "1" data: diff --git a/ansible/roles/prometheus/templates/grafana_ingress.yml.j2 b/ansible/roles/prometheus/templates/grafana_ingress.yml.j2 index 3e3bff85..e23d4641 100644 --- a/ansible/roles/prometheus/templates/grafana_ingress.yml.j2 +++ b/ansible/roles/prometheus/templates/grafana_ingress.yml.j2 @@ -4,7 +4,7 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: grafana-ingress - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} annotations: # HTTPS as entry point 
traefik.ingress.kubernetes.io/router.entrypoints: websecure @@ -16,10 +16,10 @@ metadata: spec: tls: - hosts: - - grafana.picluster.ricsanfre.com + - {{ grafana_dashboard_dns }} secretName: grafana-tls rules: - - host: grafana.picluster.ricsanfre.com + - host: {{ grafana_dashboard_dns }} http: paths: - path: / @@ -36,15 +36,15 @@ kind: Ingress apiVersion: networking.k8s.io/v1 metadata: name: grafana-redirect - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} annotations: # Use redirect Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-redirect@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-redirect@kubernetescrd # HTTP as entrypoint traefik.ingress.kubernetes.io/router.entrypoints: web spec: rules: - - host: grafana.picluster.ricsanfre.com + - host: {{ grafana_dashboard_dns }} http: paths: - path: / diff --git a/ansible/roles/prometheus/templates/longhorn_servicemonitor.yml.j2 b/ansible/roles/prometheus/templates/longhorn_servicemonitor.yml.j2 index 42d0ef3d..2e43007c 100644 --- a/ansible/roles/prometheus/templates/longhorn_servicemonitor.yml.j2 +++ b/ansible/roles/prometheus/templates/longhorn_servicemonitor.yml.j2 @@ -6,7 +6,7 @@ metadata: app: longhorn release: kube-prometheus-stack name: longhorn-prometheus-servicemonitor - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} spec: selector: matchLabels: diff --git a/ansible/roles/prometheus/templates/prometheus_ingress.yml.j2 b/ansible/roles/prometheus/templates/prometheus_ingress.yml.j2 index f4aa894b..83cfcfda 100644 --- a/ansible/roles/prometheus/templates/prometheus_ingress.yml.j2 +++ b/ansible/roles/prometheus/templates/prometheus_ingress.yml.j2 @@ -4,24 +4,24 @@ apiVersion: networking.k8s.io/v1 kind: Ingress metadata: name: prometheus-ingress - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} annotations: # HTTPS as entry point 
traefik.ingress.kubernetes.io/router.entrypoints: websecure # Enable TLS traefik.ingress.kubernetes.io/router.tls: "true" # Use Basic Auth Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-basic-auth@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-basic-auth@kubernetescrd # Enable cert-manager to create automatically the SSL certificate and store in Secret cert-manager.io/cluster-issuer: self-signed-issuer cert-manager.io/common-name: prometheus spec: tls: - hosts: - - prometheus.picluster.ricsanfre.com + - {{ prometheus_dashboard_dns }} secretName: prometheus-tls rules: - - host: prometheus.picluster.ricsanfre.com + - host: {{ prometheus_dashboard_dns }} http: paths: - path: / @@ -38,15 +38,15 @@ kind: Ingress apiVersion: networking.k8s.io/v1 metadata: name: prometheus-redirect - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} annotations: # Use redirect Midleware configured - traefik.ingress.kubernetes.io/router.middlewares: traefik-system-redirect@kubernetescrd + traefik.ingress.kubernetes.io/router.middlewares: {{ k3s_traefik_namespace }}-redirect@kubernetescrd # HTTP as entrypoint traefik.ingress.kubernetes.io/router.entrypoints: web spec: rules: - - host: prometheus.picluster.ricsanfre.com + - host: {{ prometheus_dashboard_dns }} http: paths: - path: / diff --git a/ansible/roles/prometheus/templates/traefik_servicemonitor.yml.j2 b/ansible/roles/prometheus/templates/traefik_servicemonitor.yml.j2 index 28cc3943..478afa60 100644 --- a/ansible/roles/prometheus/templates/traefik_servicemonitor.yml.j2 +++ b/ansible/roles/prometheus/templates/traefik_servicemonitor.yml.j2 @@ -6,7 +6,7 @@ metadata: app: traefik release: kube-prometheus-stack name: traefik-prometheus-servicemonitor - namespace: k3s-monitoring + namespace: {{ k3s_monitoring_namespace }} spec: endpoints: - port: traefik From a3e95bd568df919aee5c6be7210a9782ace7d9bf Mon Sep 17 00:00:00 2001 From: 
ricsanfre Date: Fri, 3 Dec 2021 17:41:22 +0100 Subject: [PATCH 11/17] Ignore errors when cleaning pod logs and fluentd pos files --- ansible/tasks/cleaning.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ansible/tasks/cleaning.yml b/ansible/tasks/cleaning.yml index a899d6d2..4c65f2f7 100644 --- a/ansible/tasks/cleaning.yml +++ b/ansible/tasks/cleaning.yml @@ -19,10 +19,12 @@ shell: "rm -rf /var/log/pods /var/log/containers" args: executable: /bin/bash + ignore_errors: yes changed_when: true - name: Clean fluentd pos files shell: "rm /var/log/*.pos" args: executable: /bin/bash + ignore_errors: yes changed_when: true From a1b200a9b8c889c79543c1cbbf7e6b10530b6682 Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Wed, 8 Dec 2021 17:54:48 +0100 Subject: [PATCH 12/17] Fix #22. Adding monitoring of K3S controller, scheduler and proxy components --- ansible/group_vars/k3s_cluster.yml | 12 +- .../k3s-controllermanager-dashboard.json | 1154 +++++++++++++++ .../prometheus/files/k3s-proxy-dashboard.json | 1234 +++++++++++++++++ .../files/k3s-scheduler-dashboard.json | 1077 ++++++++++++++ ansible/roles/prometheus/tasks/main.yml | 24 + .../templates/k3s_service_metrics.yml.j2 | 32 + .../templates/k3s_servicemonitor.yml.j2 | 19 + documentation/installing_k3s.md | 4 +- documentation/monitoring.md | 101 +- 9 files changed, 3654 insertions(+), 3 deletions(-) create mode 100644 ansible/roles/prometheus/files/k3s-controllermanager-dashboard.json create mode 100644 ansible/roles/prometheus/files/k3s-proxy-dashboard.json create mode 100644 ansible/roles/prometheus/files/k3s-scheduler-dashboard.json create mode 100644 ansible/roles/prometheus/templates/k3s_service_metrics.yml.j2 create mode 100644 ansible/roles/prometheus/templates/k3s_servicemonitor.yml.j2 diff --git a/ansible/group_vars/k3s_cluster.yml b/ansible/group_vars/k3s_cluster.yml index fb5d7b37..7c73fc0d 100644 --- a/ansible/group_vars/k3s_cluster.yml +++ b/ansible/group_vars/k3s_cluster.yml @@ -7,7 +7,17 @@ 
k3s_master_ip: 10.0.0.11 k3s_token: s1cret0 # Extra arguments for k3s installation scripts -k3s_server_extra_args: "--write-kubeconfig-mode '0644' --disable 'servicelb' --node-taint 'node-role.kubernetes.io/master=true:NoSchedule'" +k3s_server_extra_args: >- + --write-kubeconfig-mode '0644' + --disable 'servicelb' + --node-taint 'node-role.kubernetes.io/master=true:NoSchedule' + --kube-controller-manager-arg 'bind-address=0.0.0.0' + --kube-controller-manager-arg 'address=0.0.0.0' + --kube-proxy-arg 'metrics-bind-address=0.0.0.0' + --kube-scheduler-arg 'bind-address=0.0.0.0' + --kube-scheduler-arg 'address=0.0.0.0' + + k3s_worker_extra_args: "--node-label 'node_type=worker'" # Namespaces diff --git a/ansible/roles/prometheus/files/k3s-controllermanager-dashboard.json b/ansible/roles/prometheus/files/k3s-controllermanager-dashboard.json new file mode 100644 index 00000000..7ab23496 --- /dev/null +++ b/ansible/roles/prometheus/files/k3s-controllermanager-dashboard.json @@ -0,0 +1,1154 @@ +{ + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + 
"prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(up{cluster=\"$cluster\", job=\"k3s-metrics-service\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Up", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "min" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(workqueue_adds_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} {{name}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Work Queue Add Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + 
"format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(workqueue_depth{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance, name)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} {{name}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Work Queue Depth", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + 
"repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(workqueue_queue_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance, name, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} {{name}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Work Queue Latency", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + 
"bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(rest_client_requests_total{job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "2xx", + "refId": "A" + }, + { + "expr": "sum(rate(rest_client_requests_total{job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "3xx", + "refId": "B" + }, + { + "expr": "sum(rate(rest_client_requests_total{job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "4xx", + "refId": "C" + }, + { + "expr": "sum(rate(rest_client_requests_total{job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5xx", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Kube API Request Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": 
"ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 8, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Post Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": 
"$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Get Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + 
"show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}[5m])", + "format": "time_series", + 
"intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + 
"max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"k3s-metrics-service\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(up{cluster=\"$cluster\", job=\"k3s-metrics-service\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Kubernetes / Controller Manager", + "uid": "72e0e05bef5099e5f049b05fdc429ed4", + "version": 0 +} diff --git a/ansible/roles/prometheus/files/k3s-proxy-dashboard.json b/ansible/roles/prometheus/files/k3s-proxy-dashboard.json new file mode 100644 index 
00000000..2c37544a --- /dev/null +++ b/ansible/roles/prometheus/files/k3s-proxy-dashboard.json @@ -0,0 +1,1234 @@ +{ + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(up{cluster=\"$cluster\", job=\"k3s-metrics-service\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": "", + "title": "Up", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "min" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 3, + "legend": { + 
"alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubeproxy_sync_proxy_rules_duration_seconds_count{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "rate", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rules Sync Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": 
"histogram_quantile(0.99,rate(kubeproxy_sync_proxy_rules_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Rule Sync Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(kubeproxy_network_programming_duration_seconds_count{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "rate", + "refId": "A" + } + ], + 
"thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Programming Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 6, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Network Programming Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + 
"format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "2xx", + "refId": "A" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "3xx", + "refId": "B" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "4xx", + "refId": "C" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5xx", + "refId": "D" + } 
+ ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Kube API Request Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 8, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\",verb=\"POST\"}[5m])) by (verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Post Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + 
}, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Get Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + 
"title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 11, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + 
"linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 12, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": 
null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + "text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(kube_pod_info, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(kubeproxy_network_programming_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + 
"30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Kubernetes / Proxy", + "uid": "632e265de029684c40b21cb76bca4f94", + "version": 0 +} diff --git a/ansible/roles/prometheus/files/k3s-scheduler-dashboard.json b/ansible/roles/prometheus/files/k3s-scheduler-dashboard.json new file mode 100644 index 00000000..8f2a7eb4 --- /dev/null +++ b/ansible/roles/prometheus/files/k3s-scheduler-dashboard.json @@ -0,0 +1,1077 @@ +{ + "__inputs": [ + + ], + "__requires": [ + + ], + "annotations": { + "list": [ + + ] + }, + "editable": false, + "gnetId": null, + "graphTooltip": 0, + "hideControls": false, + "id": null, + "links": [ + + ], + "refresh": "10s", + "rows": [ + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "cacheTimeout": null, + "colorBackground": false, + "colorValue": false, + "colors": [ + "#299c46", + "rgba(237, 129, 40, 0.89)", + "#d44a3a" + ], + "datasource": "$datasource", + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + + }, + "id": 2, + "interval": null, + "links": [ + + ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "span": 2, + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": false, + "lineColor": "rgb(31, 120, 193)", + "show": false + }, + "tableColumn": "", + "targets": [ + { + "expr": "sum(up{cluster=\"$cluster\", job=\"k3s-metrics-service\"})", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "", + 
"refId": "A" + } + ], + "thresholds": "", + "title": "Up", + "tooltip": { + "shared": false + }, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "min" + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(scheduler_e2e_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} e2e", + "refId": "A" + }, + { + "expr": "sum(rate(scheduler_binding_duration_seconds_count{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} binding", + "refId": "B" + }, + { + "expr": "sum(rate(scheduler_scheduling_algorithm_duration_seconds_count{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])) by (cluster, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} scheduling algorithm", + "refId": "C" + }, + { + "expr": "sum(rate(scheduler_volume_scheduling_duration_seconds_count{cluster=\"$cluster\", job=\"k3s-metrics-service\", 
instance=~\"$instance\"}[5m])) by (cluster, instance)", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} volume", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Scheduling Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 5, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_e2e_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} e2e", + "refId": "A" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_binding_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", + "format": "time_series", + "intervalFactor": 2, + 
"legendFormat": "{{cluster}} {{instance}} binding", + "refId": "B" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_scheduling_algorithm_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} scheduling algorithm", + "refId": "C" + }, + { + "expr": "histogram_quantile(0.99, sum(rate(scheduler_volume_scheduling_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}[5m])) by (cluster, instance, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{cluster}} {{instance}} volume", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Scheduling latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 5, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": 
"null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"2..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "2xx", + "refId": "A" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"3..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "3xx", + "refId": "B" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"4..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "4xx", + "refId": "C" + }, + { + "expr": "sum(rate(rest_client_requests_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\",code=~\"5..\"}[5m]))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "5xx", + "refId": "D" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Kube API Request Rate", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "ops", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 6, + "legend": { + "alignAsTable": false, + "avg": false, + "current": 
false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 8, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\", verb=\"POST\"}[5m])) by (verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Post Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 7, + "legend": { + "alignAsTable": true, + "avg": false, + "current": true, + "max": false, + "min": false, + "rightSide": true, + "show": true, + "sideWidth": null, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": 
"null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "histogram_quantile(0.99, sum(rate(rest_client_request_duration_seconds_bucket{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\", verb=\"GET\"}[5m])) by (verb, url, le))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{verb}} {{url}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Get Request Latency 99th Quantile", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "s", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + }, + { + "collapse": false, + "collapsed": false, + "panels": [ + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 8, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": 
false, + "targets": [ + { + "expr": "process_resident_memory_bytes{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Memory", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 9, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(process_cpu_seconds_total{cluster=\"$cluster\", job=\"k3s-metrics-service\", instance=~\"$instance\"}[5m])", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "CPU usage", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + 
"values": [ + + ] + }, + "yaxes": [ + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "bytes", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + } + ] + }, + { + "aliasColors": { + + }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fill": 1, + "fillGradient": 0, + "gridPos": { + + }, + "id": 10, + "legend": { + "alignAsTable": false, + "avg": false, + "current": false, + "max": false, + "min": false, + "rightSide": false, + "show": true, + "sideWidth": null, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "links": [ + + ], + "nullPointMode": "null", + "percentage": false, + "pointradius": 5, + "points": false, + "renderer": "flot", + "repeat": null, + "seriesOverrides": [ + + ], + "spaceLength": 10, + "span": 4, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "go_goroutines{cluster=\"$cluster\", job=\"k3s-metrics-service\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{instance}}", + "refId": "A" + } + ], + "thresholds": [ + + ], + "timeFrom": null, + "timeShift": null, + "title": "Goroutines", + "tooltip": { + "shared": false, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ + + ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ] + } + ], + "repeat": null, + "repeatIteration": null, + "repeatRowId": null, + "showTitle": false, + "title": "Dashboard Row", + "titleSize": "h6", + "type": "row" + } + ], + "schemaVersion": 14, + "style": "dark", + "tags": [ + "kubernetes-mixin" + ], + "templating": { + "list": [ + { + "current": { + 
"text": "default", + "value": "default" + }, + "hide": 0, + "label": null, + "name": "datasource", + "options": [ + + ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "type": "datasource" + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 2, + "includeAll": false, + "label": "cluster", + "multi": false, + "name": "cluster", + "options": [ + + ], + "query": "label_values(up{job=\"k3s-metrics-service\"}, cluster)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": null, + "current": { + + }, + "datasource": "$datasource", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [ + + ], + "query": "label_values(process_cpu_seconds_total{cluster=\"$cluster\", job=\"k3s-metrics-service\"}, instance)", + "refresh": 2, + "regex": "", + "sort": 1, + "tagValuesQuery": "", + "tags": [ + + ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "utc", + "title": "Kubernetes / Scheduler", + "uid": "2e6b6a3b4bddf1427b3a55aa1311c656", + "version": 0 +} diff --git a/ansible/roles/prometheus/tasks/main.yml b/ansible/roles/prometheus/tasks/main.yml index b7f3900e..09aa2d67 100644 --- a/ansible/roles/prometheus/tasks/main.yml +++ b/ansible/roles/prometheus/tasks/main.yml @@ -43,6 +43,23 @@ adminPassword: "{{ prometheus_grafana_password }}" plugins: - grafana-piechart-panel + kubeApiServer: + enabled: true + kubeControllerManager: + enabled: false + kubeScheduler: + enabled: false + kubeProxy: + enabled: false + kubeEtcd: + enabled: false + +- name: Create 
k3s metrics service + kubernetes.core.k8s: + definition: "{{ lookup('template', 'templates/' + item ) }}" + state: present + with_items: + - k3s_service_metrics.yml.j2 - name: Create Ingress rule for Prometheus, Alertmanager and Graphana UI kubernetes.core.k8s: @@ -60,6 +77,7 @@ with_items: - traefik_servicemonitor.yml.j2 - longhorn_servicemonitor.yml.j2 + - k3s_servicemonitor.yml.j2 - name: Configure Grafana Dashboards include_tasks: configure_grafana_dashboards.yml @@ -73,3 +91,9 @@ file: traefik-dashboard.json - name: dashboard-longhorn file: longhorn-dashboard.json + - name: k3s-controller-manager + file: k3s-controllermanager-dashboard.json + - name: k3s-scheduler + file: k3s-scheduler-dashboard.json + - name: k3s-proxy + file: k3s-proxy-dashboard.json diff --git a/ansible/roles/prometheus/templates/k3s_service_metrics.yml.j2 b/ansible/roles/prometheus/templates/k3s_service_metrics.yml.j2 new file mode 100644 index 00000000..eeba21b5 --- /dev/null +++ b/ansible/roles/prometheus/templates/k3s_service_metrics.yml.j2 @@ -0,0 +1,32 @@ +--- +# Headless service for K3S metrics. 
No Selector +apiVersion: v1 +kind: Service +metadata: + name: k3s-metrics-service + labels: + app: k3s-metrics + namespace: kube-system +spec: + clusterIP: None + ports: + - name: http-metrics + port: 10249 + protocol: TCP + targetPort: 10249 + type: ClusterIP + +--- +# Endpoint for the headless service without selector +apiVersion: v1 +kind: Endpoints +metadata: + name: k3s-metrics-service + namespace: kube-system +subsets: +- addresses: + - ip: 10.0.0.11 + ports: + - name: http-metrics + port: 10249 + protocol: TCP diff --git a/ansible/roles/prometheus/templates/k3s_servicemonitor.yml.j2 b/ansible/roles/prometheus/templates/k3s_servicemonitor.yml.j2 new file mode 100644 index 00000000..c0fdae4f --- /dev/null +++ b/ansible/roles/prometheus/templates/k3s_servicemonitor.yml.j2 @@ -0,0 +1,19 @@ +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + labels: + app: k3s + release: kube-prometheus-stack + name: k3s-prometheus-servicemonitor + namespace: {{ k3s_monitoring_namespace }} +spec: + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app: k3s-metrics + endpoints: + - port: http-metrics + path: /metrics diff --git a/documentation/installing_k3s.md b/documentation/installing_k3s.md index ce6e2b37..0fb24c64 100644 --- a/documentation/installing_k3s.md +++ b/documentation/installing_k3s.md @@ -35,12 +35,14 @@ Enable cgroup via boot commandline if not already enabled for Ubuntu on a Raspbe - Step 1: Installing K3S control plane node For installing the master node execute the following command: ``` - curl -sfL https://get.k3s.io | K3S_TOKEN= sh -s - server --write-kubeconfig-mode '0644' --node-taint 'node-role.kubernetes.io/master=true:NoSchedule' --disable 'servicelb' + curl -sfL https://get.k3s.io | K3S_TOKEN= sh -s - server --write-kubeconfig-mode '0644' --node-taint 'node-role.kubernetes.io/master=true:NoSchedule' --disable 'servicelb' --kube-controller-manager-arg 'bind-address=0.0.0.0' 
--kube-controller-manager-arg 'address=0.0.0.0' --kube-proxy-arg 'metrics-bind-address=0.0.0.0' --kube-scheduler-arg 'bind-address=0.0.0.0' --kube-scheduler-arg 'address=0.0.0.0' ``` - **server_token** is shared secret within the cluster for allowing connection of worker nodes - **--write-kubeconfig-mode '0644'** gives read permissions to kubeconfig file located in `/etc/rancher/k3s/k3s.yaml` - **--node-taint 'node-role.kubernetes.io/master=true:NoSchedule'** makes master node not schedulable to run any pod. Only pods marked with specific tolerance will be scheduled on master node. - **--disable servicelb** to disable default service load balancer installed by K3S (Klipper Load Balancer) +- **--kube-controller-manager-arg**, **--kube-scheduler-arg** and **--kube-proxy-arg** to bind those components to all interfaces (not only to 127.0.0.1) and enable metrics scraping from external nodes. + > NOTE 1: diff --git a/documentation/monitoring.md b/documentation/monitoring.md index af76e2b0..f342bb4e 100644 --- a/documentation/monitoring.md +++ b/documentation/monitoring.md @@ -32,7 +32,7 @@ Kube-prometheus stack can be installed using helm [kube-prometheus-stack](https: ``` kubectl create namespace monitoring ``` -- Step 3: Create values.yml for configuring VolumeClaimTemplates using longhorn and Grafana's admin password and list of plugins to be installed +- Step 3: Create values.yml for configuring VolumeClaimTemplates using longhorn and Grafana's admin password, list of plugins to be installed and disabling the monitoring of kubernetes components (Scheduler, Controller Manager and Proxy).
See issue [#22](https://github.com/ricsanfre/pi-cluster/issues/22) ```yml alertmanager: @@ -61,6 +61,16 @@ Kube-prometheus stack can be installed using helm [kube-prometheus-stack](https: # List of grafana plugins to be installed plugins: - grafana-piechart-panel + kubeApiServer: + enabled: true + kubeControllerManager: + enabled: false + kubeScheduler: + enabled: false + kubeProxy: + enabled: false + kubeEtcd: + enabled: false ```yml - Step 3: Install kube-Prometheus-stack in the monitoring namespace with the overriden values @@ -272,6 +282,95 @@ spec: kubectl apply -f prometheus_ingress.yml grafana_ingress.yml alertmanager_ingress.yml + +## K3S components monitoring + +The default resources created by kube-prometheus-operator (headless service, service monitor and grafana dashboards) for monitoring the Kubernetes components (Scheduler, Controller Manager and Proxy) are not valid for monitoring K3S, because K3S emits the same metrics on the three end-points. This makes Prometheus consume so much memory that it causes worker node outages. See issue [#22](https://github.com/ricsanfre/pi-cluster/issues/22) for more details. + + +- Create a manifest file `k3s-metrics-service.yml` for creating the Kubernetes service used by Prometheus to scrape K3S metrics. + + This service must be a [headless service](https://kubernetes.io/docs/concepts/services-networking/service/#headless-services), for allowing Prometheus service discovery process of each of the pods behind the service. Since the metrics are exposed not by a pod but by a k3s process, the service needs to be defined [`without selector`](https://kubernetes.io/docs/concepts/services-networking/service/#services-without-selectors) and the `endpoints` must be defined explicitly. + + The service will use the k3s-proxy endpoint (TCP port 10249) for scraping all metrics. + + ```yml + --- + # Headless service for K3S metrics.
No Selector + apiVersion: v1 + kind: Service + metadata: + name: k3s-metrics-service + labels: + app: k3s-metrics + namespace: kube-system + spec: + clusterIP: None + ports: + - name: http-metrics + port: 10249 + protocol: TCP + targetPort: 10249 + type: ClusterIP + + --- + # Endpoint for the headless service without selector + apiVersion: v1 + kind: Endpoints + metadata: + name: k3s-metrics-service + namespace: kube-system + subsets: + - addresses: + - ip: 10.0.0.11 + ports: + - name: http-metrics + port: 10249 + protocol: TCP + ``` + +- Create manifest file `k3s-servicemonitor.yml` defining the service monitor resource that lets Prometheus discover this target + + The Prometheus custom resource definition (CRD), `ServiceMonitor` will be used to automatically discover K3S metrics endpoint as a Prometheus target. + + ```yml + apiVersion: monitoring.coreos.com/v1 + kind: ServiceMonitor + metadata: + labels: + app: k3s + release: kube-prometheus-stack + name: k3s-prometheus-servicemonitor + namespace: k3s-monitoring + spec: + namespaceSelector: + matchNames: + - kube-system + selector: + matchLabels: + app: k3s-metrics + endpoints: + - port: http-metrics + path: /metrics + ``` + + +- Apply manifest files + + kubectl apply -f k3s-metrics-service.yml -f k3s-servicemonitor.yml + +- Check target is automatically discovered in Prometheus UI + + http://prometheus.picluster.ricsanfre.com/targets + +### K3S Grafana dashboards + +Kubernetes-controller-manager, kubernetes-proxy and kubernetes-scheduler dashboards can be downloaded from grafana.com: + +- Kube Proxy: https://grafana.com/grafana/dashboards/12129 +- Kube Controller Manager: https://grafana.com/grafana/dashboards/12122 +- Kube Scheduler: https://grafana.com/grafana/dashboards/12130 + ## Traefik Monitoring The Prometheus custom resource definition (CRD), `ServiceMonitoring` will be used to automatically discover Traefik metrics endpoint as a Prometheus target.
From dae52d1f28df68ec10657500a8c4a3950dc969d1 Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Sat, 11 Dec 2021 10:42:16 +0100 Subject: [PATCH 13/17] Solving yamllint issue --- ansible/tasks/cleaning.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ansible/tasks/cleaning.yml b/ansible/tasks/cleaning.yml index 4c65f2f7..d20f4211 100644 --- a/ansible/tasks/cleaning.yml +++ b/ansible/tasks/cleaning.yml @@ -19,12 +19,12 @@ shell: "rm -rf /var/log/pods /var/log/containers" args: executable: /bin/bash - ignore_errors: yes + ignore_errors: true changed_when: true - name: Clean fluentd pos files shell: "rm /var/log/*.pos" args: executable: /bin/bash - ignore_errors: yes + ignore_errors: true changed_when: true From 434b9be4d6b29cc172d9719f87a0caf28f3877ec Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Sat, 11 Dec 2021 10:53:59 +0100 Subject: [PATCH 14/17] Fix #20. Making basic http authentication credenctials configurable --- ansible/roles/traefik/defaults/main.yml | 7 +++++++ .../tasks/create_basic_auth_credentials.yml | 24 ++++++++++++++++++++++++ ansible/roles/traefik/tasks/main.yml | 3 +++ .../templates/basicauth_middleware.yml.j2 | 2 +- 4 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 ansible/roles/traefik/tasks/create_basic_auth_credentials.yml diff --git a/ansible/roles/traefik/defaults/main.yml b/ansible/roles/traefik/defaults/main.yml index aedd2b75..e44322b8 100644 --- a/ansible/roles/traefik/defaults/main.yml +++ b/ansible/roles/traefik/defaults/main.yml @@ -4,3 +4,10 @@ k3s_traefik_namespace: traefik-system # Endpoint Service DNS name traefik_dashboard_dns: traefik.picluster.ricsanfre.com + + +# Basic auth user/passwd +traefik_basic_auth_user: admin +traefik_basic_auth_passwd: s1cret0 + +traefik_auth_htpasswd_pair: KKYWRtaW46JGFwcjEkWkRkMWIvNC4kUG9RR244RW5Gc0lWUUFDS3p3VHJrLgoK diff --git a/ansible/roles/traefik/tasks/create_basic_auth_credentials.yml b/ansible/roles/traefik/tasks/create_basic_auth_credentials.yml new file
mode 100644 index 00000000..98a78ba5 --- /dev/null +++ b/ansible/roles/traefik/tasks/create_basic_auth_credentials.yml @@ -0,0 +1,24 @@ +--- + +- name: Ensure htpasswd utility is installed + package: + name: 'apache2-utils' + state: 'present' + update_cache: true + become: true + +# Build the base64-encoded htpasswd entry used by basicauth_middleware.yml.j2. +# Credentials are shell-quoted, pipefail surfaces htpasswd errors and no_log keeps the password out of the task output. +- name: htpasswd utility + shell: + cmd: >- + set -o pipefail && + htpasswd -nb {{ traefik_basic_auth_user | quote }} {{ traefik_basic_auth_passwd | quote }} | base64 + executable: /bin/bash + register: htpasswd + changed_when: false + no_log: true + +- name: Set htpasswd pair + set_fact: + traefik_auth_htpasswd_pair: "{{ htpasswd.stdout }}" diff --git a/ansible/roles/traefik/tasks/main.yml b/ansible/roles/traefik/tasks/main.yml index 893cc5dc..62ebbbd7 100644 --- a/ansible/roles/traefik/tasks/main.yml +++ b/ansible/roles/traefik/tasks/main.yml @@ -16,6 +16,9 @@ kind: Namespace state: present +- name: Create Basic authentication credentials + include_tasks: create_basic_auth_credentials.yml + - name: Configura Traefik kubernetes.core.k8s: definition: "{{ lookup('template', 'templates/' + item ) }}" diff --git a/ansible/roles/traefik/templates/basicauth_middleware.yml.j2 b/ansible/roles/traefik/templates/basicauth_middleware.yml.j2 index 520b0ad9..98b6fe09 100644 --- a/ansible/roles/traefik/templates/basicauth_middleware.yml.j2 +++ b/ansible/roles/traefik/templates/basicauth_middleware.yml.j2 @@ -10,7 +10,7 @@ metadata: namespace: {{ k3s_traefik_namespace }} data: users: |2 - b3NzOiRhcHIxJDNlZTVURy83JFpmY1NRQlV6SFpIMFZTak9NZGJ5UDANCg0K + {{ traefik_auth_htpasswd_pair }} --- # Basic-auth middleware From 9e5355c9840657fea1788894b499f82de6ad097e Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Sat, 11 Dec 2021 11:03:47 +0100 Subject: [PATCH 15/17] Adding dns variables for available services --- ansible/group_vars/k3s_cluster.yml | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/ansible/group_vars/k3s_cluster.yml b/ansible/group_vars/k3s_cluster.yml index 7c73fc0d..a004c53f 100644 --- a/ansible/group_vars/k3s_cluster.yml +++ b/ansible/group_vars/k3s_cluster.yml @@ -28,5
+28,20 @@ k3s_certmanager_namespace: certmanager-system k3s_logging_namespace: k3s-logging k3s_monitoring_namespace: k3s-monitoring +# DNS service end-points + +traefik_dashboard_dns: traefik.picluster.ricsanfre.com +longhorn_dashboard_dns: storage.picluster.ricsanfre.com +kibana_dashboard_dns: kibana.picluster.ricsanfre.com +grafana_dashboard_dns: grafana.picluster.ricsanfre.com +prometheus_dashboard_dns: prometheus.picluster.ricsanfre.com +alertmanager_dashboard_dns: alertmanager.picluster.ricsanfre.com + +# MetalLB configuration # k3s external ip range: Metal LB pool configuration k3s_external_ip_range: "10.0.0.100-10.0.0.200" + +# Traefik configuration +# HTTP Basic auth credentials +traefik_basic_auth_user: admin +traefik_basic_auth_passwd: s1cret0 From fa3fbf8df4d1526a2ec5783f41a6585f1ddd66ec Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Sat, 11 Dec 2021 11:37:26 +0100 Subject: [PATCH 16/17] Fix #12. Adding nftables configuration details in the documentation --- documentation/gateway.md | 201 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 198 insertions(+), 3 deletions(-) diff --git a/documentation/gateway.md b/documentation/gateway.md index ce5ac9de..2b3e3d38 100644 --- a/documentation/gateway.md +++ b/documentation/gateway.md @@ -126,9 +126,204 @@ Package can be installed with apt: And it can be configured using command line or a configuration file `/etc/nftables.conf`. 
-``` -TBD: CONTENT nftables.conf -``` + +As a modular example: + +- Global Configuration File + + `/etc/nftables.conf` + ``` + #!/usr/sbin/nft -f + # Ansible managed + + # clean + flush ruleset + + include "/etc/nftables.d/defines.nft" + + table inet filter { + chain global { + # 005 state management + ct state established,related accept + ct state invalid drop + } + include "/etc/nftables.d/sets.nft" + include "/etc/nftables.d/filter-input.nft" + include "/etc/nftables.d/filter-output.nft" + include "/etc/nftables.d/filter-forward.nft" + } + + # Additional table for Network Address Translation (NAT) + table ip nat { + include "/etc/nftables.d/sets.nft" + include "/etc/nftables.d/nat-prerouting.nft" + include "/etc/nftables.d/nat-postrouting.nft" + } + + ``` +- Variables containing the IP addresses and ports to be used by the rules files + + `/etc/nftables.d/defines.nft` + ``` + # broadcast and multicast + define badcast_addr = { 255.255.255.255, 224.0.0.1, 224.0.0.251 } + + # broadcast and multicast + define ip6_badcast_addr = { ff02::16 } + + # in_tcp_accept + define in_tcp_accept = { ssh, https, http } + + # in_udp_accept + define in_udp_accept = { snmp, domain, ntp, bootps } + + # out_tcp_accept + define out_tcp_accept = { http, https, ssh } + + # out_udp_accept + define out_udp_accept = { domain, bootps , ntp } + + # lan_interface + define lan_interface = eth0 + + # wan_interface + define wan_interface = wlan0 + + # lan_network + define lan_network = 10.0.0.0/24 + + # forward_tcp_accept + define forward_tcp_accept = { http, https, ssh } + + # forward_udp_accept + define forward_udp_accept = { domain, ntp } + + ``` +- Nftables typed and tagged variables, [sets](https://wiki.nftables.org/wiki-nftables/index.php/Sets).
+ + `/etc/nftables.d/sets.nft` + ``` + set blackhole { + type ipv4_addr; + elements = $badcast_addr + } + + set forward_tcp_accept { + type inet_service; flags interval; + elements = $forward_tcp_accept + } + + set forward_udp_accept { + type inet_service; flags interval; + elements = $forward_udp_accept + } + + set in_tcp_accept { + type inet_service; flags interval; + elements = $in_tcp_accept + } + + set in_udp_accept { + type inet_service; flags interval; + elements = $in_udp_accept + } + + set ip6blackhole { + type ipv6_addr; + elements = $ip6_badcast_addr + } + + set out_tcp_accept { + type inet_service; flags interval; + elements = $out_tcp_accept + } + + set out_udp_accept { + type inet_service; flags interval; + elements = $out_udp_accept + } + + ``` +- Input traffic filtering rules + + `/etc/nftables.d/filter-input.nft` + ``` + chain input { + # 000 policy + type filter hook input priority 0; policy drop; + # 005 global + jump global + # 010 drop unwanted + # (none) + # 011 drop unwanted ipv6 + # (none) + # 015 localhost + iif lo accept + # 050 icmp + meta l4proto {icmp,icmpv6} accept + # 200 input udp accepted + udp dport @in_udp_accept ct state new accept + # 210 input tcp accepted + tcp dport @in_tcp_accept ct state new accept + } + + ``` + +- Output traffic filtering rules + + `/etc/nftables.d/filter-output.nft` + ``` + chain output { + # 000 policy: Allow any output traffic + type filter hook output priority 0; + } + ``` + +- Forwarding traffic rules + + `/etc/nftables.d/filter-forward.nft` + ``` + chain forward { + # 000 policy + type filter hook forward priority 0; policy drop; + # 005 global + jump global + # 200 lan to wan tcp + iifname $lan_interface ip saddr $lan_network oifname $wan_interface tcp dport @forward_tcp_accept ct state new accept + # 210 lan to wan udp + iifname $lan_interface ip saddr $lan_network oifname $wan_interface udp dport @forward_udp_accept ct state new accept + # 220 ssh from wan + iifname $wan_interface oifname
$lan_interface ip daddr $lan_network tcp dport ssh ct state new accept + # 230 http from wan + iifname $wan_interface oifname $lan_interface ip daddr $lan_network tcp dport {http, https} ct state new accept + } + + ``` + +- NAT pre-routing rules + + `/etc/nftables.d/nat-prerouting.nft` + ``` + chain prerouting { + # 000 policy + type nat hook prerouting priority 0; + } + + ``` + +- NAT post-routing rules + `/etc/nftables.d/nat-postrouting.nft` + ``` + chain postrouting { + # 000 policy + type nat hook postrouting priority 100; + # 005 masquerade lan to wan + ip saddr $lan_network oifname $wan_interface masquerade + } + + ``` + +
From ee6b9840410cbe6a1592a4b16ca3bba8a4a2867d Mon Sep 17 00:00:00 2001 From: ricsanfre Date: Sat, 11 Dec 2021 11:59:34 +0100 Subject: [PATCH 17/17] Adding tasks names. ansible-lint issue --- ansible/roles/basic_setup/tasks/remove_snap_packages.yml | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ansible/roles/basic_setup/tasks/remove_snap_packages.yml b/ansible/roles/basic_setup/tasks/remove_snap_packages.yml index 34ed1005..31560422 100644 --- a/ansible/roles/basic_setup/tasks/remove_snap_packages.yml +++ b/ansible/roles/basic_setup/tasks/remove_snap_packages.yml @@ -16,9 +16,13 @@ register: snap_remove_output with_items: "{{ snap_packages.stdout_lines }}" rescue: - - fail: + # Abort once retry_count reaches 3; otherwise log the failure and re-include this file to retry + - name: Fail when the maximum number of retries (3) is reached + fail: msg: Maximum retries of grouped tasks reached when: retry_count | int == 3 - - debug: + - name: Print retry message + debug: msg: "Removing snap package failed, let's give it another shot" - - include_tasks: remove_snap_packages.yml + - name: Retry deletion of snap packages + include_tasks: remove_snap_packages.yml