diff --git a/bin/kube-client b/bin/kube-client index af12a19b..24731175 100755 --- a/bin/kube-client +++ b/bin/kube-client @@ -65,6 +65,7 @@ containerfull=ghcr.io/wfau/atolmis/${containername:?} kubectlproxy=127.0.0.1::8001 + monitorproxy=127.0.0.1::3001 podman run \ --rm \ @@ -73,6 +74,7 @@ --name "${clientname:?}" \ --hostname "${clientname:?}" \ --publish "${kubectlproxy:?}" \ + --publish "${monitorproxy:?}" \ --env "cloudname=${cloudname:?}" \ --env "cloudsite=${cloudsite:?}" \ --env "SSH_AUTH_SOCK=/mnt/ssh_auth_sock" \ diff --git a/deployments/cluster-api/ansible/00-create-all.yml b/deployments/cluster-api/ansible/00-create-all.yml index e4739d83..273cbb43 100644 --- a/deployments/cluster-api/ansible/00-create-all.yml +++ b/deployments/cluster-api/ansible/00-create-all.yml @@ -29,7 +29,7 @@ - import_playbook: 22-install-capi-provider.yml - import_playbook: 23-install-capi-helm-charts.yml - import_playbook: 25-create-work-cluster.yml -- import_playbook: 26-secure-work-cluster.yml +# import_playbook: 26-secure-work-cluster.yml - import_playbook: 30-install-aglais.yml diff --git a/deployments/cluster-api/ansible/21-create-kind-cluster.yml b/deployments/cluster-api/ansible/21-create-kind-cluster.yml index a20c1bf8..98709fc4 100644 --- a/deployments/cluster-api/ansible/21-create-kind-cluster.yml +++ b/deployments/cluster-api/ansible/21-create-kind-cluster.yml @@ -46,7 +46,7 @@ dest: "{{ aglais.kubernetes.cluster.kind.conf }}" flat: yes -- name: "Set local file permissions" +- name: "Update localhost" gather_facts: false hosts: localhost vars_files: @@ -61,3 +61,11 @@ path: "{{ aglais.kubernetes.cluster.kind.conf }}" mode: "u=rw,g=,o=" + - name: "Update timestamp in [{{ agstatusfile }}]" + yedit: + src: "{{ agstatuspath }}" + key: aglais.kubernetes.cluster.kind.debug + value: + created: "{{ now('%Y-%m-%dT%H:%M:%S%:z') }}" + + diff --git a/deployments/cluster-api/ansible/25-create-work-cluster.yml b/deployments/cluster-api/ansible/25-create-work-cluster.yml index 
cb11e0f0..5dc1a39c 100644 --- a/deployments/cluster-api/ansible/25-create-work-cluster.yml +++ b/deployments/cluster-api/ansible/25-create-work-cluster.yml @@ -77,7 +77,7 @@ dest: "{{ aglais.kubernetes.cluster.work.conf }}" flat: yes -- name: "Set local file permissions" +- name: "Update localhost" gather_facts: false hosts: localhost vars_files: @@ -92,4 +92,11 @@ path: "{{ aglais.kubernetes.cluster.work.conf }}" mode: "u=rw,g=,o=" + - name: "Update timestamp in [{{ agstatusfile }}]" + yedit: + src: "{{ agstatuspath }}" + key: aglais.kubernetes.cluster.work.debug + value: + created: "{{ now('%Y-%m-%dT%H:%M:%S%:z') }}" + diff --git a/deployments/cluster-api/ansible/templates/clusterapi-config.j2 b/deployments/cluster-api/ansible/templates/clusterapi-config.j2 index f35c16c2..639fe998 100644 --- a/deployments/cluster-api/ansible/templates/clusterapi-config.j2 +++ b/deployments/cluster-api/ansible/templates/clusterapi-config.j2 @@ -93,8 +93,7 @@ addons: # Settings for the CNI addon cni: - - # Indicates if a CNI should be deployed + # Indicates if a CNI should be deployed (default true) enabled: true # The CNI to deploy - supported values are calico or cilium @@ -108,8 +107,53 @@ addons: name: tigera-operator version: v3.26.0 - # Include the Kubernetes dashboard + # Settings for the OpenStack integrations + openstack: + # Indicates if the OpenStack integrations should be enabled (default false) + enabled: false + + # Settings for the metrics server + # https://github.com/kubernetes-sigs/metrics-server#helm-chart + metricsServer: + # Indicates if the metrics server should be deployed (default true) + enabled: true + + # Settings for the Kubernetes dashboard + # https://github.com/kubernetes/dashboard/tree/master/charts/helm-chart/kubernetes-dashboard kubernetesDashboard: + # Indicates if the Kubernetes dashboard should be enabled (default false) + enabled: true + + # Settings for ingress controllers + ingress: + # Indicates if ingress controllers should be enabled 
(default false) + enabled: false + + # Settings for cluster monitoring + monitoring: + # Indicates if the cluster monitoring should be enabled (default false) + enabled: true + + # Settings for node feature discovery + # https://github.com/kubernetes-sigs/node-feature-discovery/tree/master/deployment/helm/node-feature-discovery + nodeFeatureDiscovery: + # Indicates if node feature discovery should be enabled (default true) + enabled: true + + # Settings for the NVIDIA GPU operator + nvidiaGPUOperator: + # Indicates if the NVIDIA GPU operator should be enabled (default true) + # Note that because it uses node feature discovery to run only on nodes + # with an NVIDIA GPU available, the overhead of enabling this on clusters + # that do not need it now but may need it in the future is low + enabled: true + + # Settings for the Mellanox network operator + mellanoxNetworkOperator: + # Indicates if the network operator should be enabled (default true) + # Note that because it uses node feature discovery to run only on nodes + # with a Mellanox NIC available, the overhead of enabling this on clusters + # that do not need it now but may need it in the future is low enabled: true diff --git a/deployments/cluster-api/ansible/templates/init-status.j2 b/deployments/cluster-api/ansible/templates/init-status.j2 index 72b83643..edd4adb7 100644 --- a/deployments/cluster-api/ansible/templates/init-status.j2 +++ b/deployments/cluster-api/ansible/templates/init-status.j2 @@ -26,6 +26,8 @@ aglais: type: cluster-api name: {{ deployname }} date: {{ deploydate }} + debug: + started: "{{ now('%Y-%m-%dT%H:%M:%S%:z') }}" openstack: cloud: name: {{ cloudname }} diff --git a/notes/zrq/20240219-03-jade-reconnect.txt b/notes/zrq/20240219-03-jade-reconnect.txt index 8fbce528..0af2db41 100644 --- a/notes/zrq/20240219-03-jade-reconnect.txt +++ b/notes/zrq/20240219-03-jade-reconnect.txt @@ -61,12 +61,10 @@ ' date hostname - ls -al /opt/aglais/aglais-status.yml ' > Mon Feb 19 05:58:30 PM UTC 2024 > 
somerville-jade-20240219-bootstrap-node.novalocal - > -rw-r--r--. 1 root root 1970 Feb 19 17:02 /opt/aglais/aglais-status.yml # ----------------------------------------------------- @@ -112,3 +110,42 @@ # Client configured :-D # + +# ----------------------------------------------------- +# Fetch the kubectl configuration files. +#[root@ansibler] + + yq '.aglais.kubernetes.cluster.kind.conf' \ + '/opt/aglais/aglais-status.yml' + + > /opt/aglais/somerville-jade-20240221-kind.yml + + + yq '.aglais.kubernetes.cluster.work.conf' \ + '/opt/aglais/aglais-status.yml' + + > /opt/aglais/somerville-jade-20240221-work.yml + + + kindclusterconf=$( + yq '.aglais.kubernetes.cluster.kind.conf' \ + '/opt/aglais/aglais-status.yml' + ) + + workclusterconf=$( + yq '.aglais.kubernetes.cluster.work.conf' \ + '/opt/aglais/aglais-status.yml' + ) + + scp "root@bootstrap:${kindclusterconf}" \ + "${kindclusterconf}" + + scp "root@bootstrap:${workclusterconf}" \ + "${workclusterconf}" + + # + # This doesn't work because the kubectl API is blocked by Somerville firewall. + # Only ssh is allowed to public IP addresses ? + # + + diff --git a/notes/zrq/20240220-01-jade-debug.txt b/notes/zrq/20240220-01-jade-debug.txt new file mode 100644 index 00000000..b65aec98 --- /dev/null +++ b/notes/zrq/20240220-01-jade-debug.txt @@ -0,0 +1,725 @@ +# +# +# +# Copyright (c) 2024, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. 
If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# +# AIMetrics: [] +# + + Target: + + Collecting more details about the failed deployment from yesterday. + + Result: + + Embarrassing/irritating outcome. + While I was collecting all of this information, I discovered that the deployment was working. + Some time in the 12hrs between when I did the initial deploy and when I was collecting the data + the cluster finally managed to fix itself. + + Discovered some more things to check if/when we get the same problem again. + + We need to come up with a way of measuring how long it takes for a cluster to resolve itself. + Something like `watch`, but one that periodically checks the cluster status and can detect + if/when it becomes healthy. + + +# ----------------------------------------------------- + + Message from Scott Davidson on Slack + Hi + @Dave Morris + I hope you don’t mind me jumping in here but I was just looking at your notes from the Kubernetes issues + you were having yesterday and these lines stand out to me. + It looks like 172.24.0.1 might be the internal IP address of the Kubernetes API server (I think you can + check with kubectl get svc -n default ) in which case a HTTP 500 from the Kubernetes API server itself + looks pretty suspicious. + Do you (or anyone else here) happen to know if the OpenStack VMs that make up the cluster are volume-backed? + The reason I ask is that there are known issues around running etcd on slow(er) storage devices such as + network-attached storage and sometimes even with HDD local disks. + If the storage backing the VMs is too slow (or is somehow sensitive to other workloads happening on the cloud) + then that might explain the intermittent issues you have been seeing.
+ +# ----------------------------------------------------- +# ----------------------------------------------------- +#[user@desktop] + + # + # Re-connect a client using notes. + # notes/zrq/20240219-03-jade-reconnect.txt + # + +# ----------------------------------------------------- +# List the nodes, flavors and images. +#[root@ansibler] + + openstack \ + --os-cloud "${cloudname:?}" \ + server list + + > +--------------------------------------+------------------------------------------------------------+--------+----------------------------------------------------------------------------+-----------------------------------+----------------+ + > | ID | Name | Status | Networks | Image | Flavor | + > +--------------------------------------+------------------------------------------------------------+--------+----------------------------------------------------------------------------+-----------------------------------+----------------+ + > | 32bdadee-9ab1-4f7e-ade4-463908234aa5 | somerville-jade-20240219-work-md-0-fb50a5e8-fhxtc | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240219-work=192.168.3.157 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | 7f6c01ab-059f-4709-8149-36ce4864570b | somerville-jade-20240219-work-md-0-fb50a5e8-bh6d5 | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240219-work=192.168.3.223 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | ed9a635d-ed0a-4b2e-a054-41160b3feb80 | somerville-jade-20240219-work-md-0-fb50a5e8-9bsxs | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240219-work=192.168.3.225 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | d7c145c8-b699-4555-bde6-4ec7973a5ba7 | somerville-jade-20240219-work-md-0-fb50a5e8-zjwtj | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240219-work=192.168.3.243 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | f380cceb-6385-48e5-bf70-030ace07b8e7 | somerville-jade-20240219-work-control-plane-ac9af912-v42dq | 
ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240219-work=192.168.3.171 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.2vcpu | + > | 2ff8c8d9-34a1-444d-b32c-8db1a806e833 | somerville-jade-20240219-work-control-plane-ac9af912-m4vdt | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240219-work=192.168.3.47 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.2vcpu | + > | d64cc19f-f0eb-4044-b28d-476fc39208f9 | somerville-jade-20240219-work-md-0-fb50a5e8-ntqbd | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240219-work=192.168.3.129 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | d05a0082-33f7-4d8d-bc57-b33757c67cd2 | somerville-jade-20240219-work-md-0-fb50a5e8-whft4 | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240219-work=192.168.3.113 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | 3eca33f6-b2f0-414a-b7ca-c2a35541022e | somerville-jade-20240219-work-control-plane-ac9af912-gjv45 | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240219-work=192.168.3.186 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.2vcpu | + > | d8d63532-0ca9-4a0c-9e84-93644df8af49 | somerville-jade-20240219-bootstrap-node | ACTIVE | somerville-jade-20240219-bootstrap-network=10.10.0.211, 192.41.122.174 | gaia-dmp-fedora-cloud-38-1.6 | gaia.vm.2vcpu | + > +--------------------------------------+------------------------------------------------------------+--------+----------------------------------------------------------------------------+-----------------------------------+----------------+ + + + openstack \ + --os-cloud "${cloudname:?}" \ + server show \ + somerville-jade-20240219-work-md-0-fb50a5e8-fhxtc + + > +-------------------------------------+----------------------------------------------------------------------------+ + > | Field | Value | + > +-------------------------------------+----------------------------------------------------------------------------+ + > | OS-DCF:diskConfig | MANUAL | + > | 
OS-EXT-AZ:availability_zone | nova | + > | OS-EXT-SRV-ATTR:host | sv-hpe-0-6 | + > | OS-EXT-SRV-ATTR:hypervisor_hostname | sv-hpe-0-6 | + > | OS-EXT-SRV-ATTR:instance_name | instance-000075e9 | + > | OS-EXT-STS:power_state | Running | + > | OS-EXT-STS:task_state | None | + > | OS-EXT-STS:vm_state | active | + > | OS-SRV-USG:launched_at | 2024-02-19T17:13:10.000000 | + > | OS-SRV-USG:terminated_at | None | + > | accessIPv4 | | + > | accessIPv6 | | + > | addresses | k8s-clusterapi-cluster-default-somerville-jade-20240219-work=192.168.3.157 | + > | config_drive | | + > | created | 2024-02-19T17:13:06Z | + > | flavor | gaia.vm.26vcpu (f5bf7c55-d6aa-4ef7-ba91-6e15683ab557) | + > | hostId | f790b78efb6cb4355ad73dd6a6f953627fb3e8c2a0457196852611a8 | + > | id | 32bdadee-9ab1-4f7e-ade4-463908234aa5 | + > | image | gaia-dmp-ubuntu-2204-kube-v1.26.7 (2bfecf33-9fd4-4687-bf6a-569e43c47999) | + > | key_name | somerville-jade-20240219-keypair | + > | name | somerville-jade-20240219-work-md-0-fb50a5e8-fhxtc | + > | progress | 0 | + > | project_id | be227fe0300b4ce5b03f44264df615df | + > | properties | | + > | security_groups | name='k8s-cluster-default-somerville-jade-20240219-work-secgroup-worker' | + > | status | ACTIVE | + > | updated | 2024-02-19T17:13:10Z | + > | user_id | c4aad146ab7acaf44819e90e3e67a4d0490c164fbb02d388823c1ac9f0ae2e13 | + > | volumes_attached | | + > +-------------------------------------+----------------------------------------------------------------------------+ + + + openstack \ + --os-cloud "${cloudname:?}" \ + image show \ + gaia-dmp-ubuntu-2204-kube-v1.26.7 + + > +------------------+---------------------------------------------------------------------------------+ + > | Field | Value | + > +------------------+---------------------------------------------------------------------------------+ + > | checksum | eb33d889f410ee521e87d313f1b200ce | + > | container_format | bare | + > | created_at | 2024-01-06T03:39:13Z | + > | disk_format | qcow2 | + > | 
file | /v2/images/2bfecf33-9fd4-4687-bf6a-569e43c47999/file | + > | id | 2bfecf33-9fd4-4687-bf6a-569e43c47999 | + > | min_disk | 0 | + > | min_ram | 0 | + > | name | gaia-dmp-ubuntu-2204-kube-v1.26.7 | + > | owner | be227fe0300b4ce5b03f44264df615df | + > | properties | direct_url='rbd://84c5........7999/snap', | + > | | os_hash_algo='sha512', | + > | | os_hash_value='7015........147e', | + > | | os_hidden='False', | + > | | owner_specified.openstack.md5='', | + > | | owner_specified.openstack.object='images/gaia-dmp-ubuntu-2204-kube-v1.26.7', | + > | | owner_specified.openstack.sha256='', | + > | | stores='rbd' | + > | protected | False | + > | schema | /v2/schemas/image | + > | size | 10737418240 | + > | status | active | + > | tags | | + > | updated_at | 2024-01-06T05:45:22Z | + > | visibility | shared | + > +------------------+---------------------------------------------------------------------------------+ + + + openstack \ + --os-cloud "${cloudname:?}" \ + flavor show \ + gaia.vm.26vcpu + + > +----------------------------+--------------------------------------+ + > | Field | Value | + > +----------------------------+--------------------------------------+ + > | OS-FLV-DISABLED:disabled | False | + > | OS-FLV-EXT-DATA:ephemeral | 0 | + > | access_project_ids | None | + > | description | None | + > | disk | 20 | + > | id | f5bf7c55-d6aa-4ef7-ba91-6e15683ab557 | + > | name | gaia.vm.26vcpu | + > | os-flavor-access:is_public | True | + > | properties | trait:CUSTOM_SSD_DEV='forbidden' | + > | ram | 44032 | + > | rxtx_factor | 1.0 | + > | swap | | + > | vcpus | 26 | + > +----------------------------+--------------------------------------+ + + +# ----------------------------------------------------- +# List the kube-system Pods in the tenant (work) cluster. 
+#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + get pods \ + --namespace kube-system + ' + + > NAME READY STATUS RESTARTS AGE + > coredns-787d4945fb-dv6px 1/1 Running 0 18h + > coredns-787d4945fb-svl9q 1/1 Running 0 18h + > etcd-somerville-jade-20240219-work-control-plane-ac9af912-gjv45 1/1 Running 0 18h + > etcd-somerville-jade-20240219-work-control-plane-ac9af912-m4vdt 1/1 Running 0 18h + > etcd-somerville-jade-20240219-work-control-plane-ac9af912-v42dq 1/1 Running 0 18h + > kube-apiserver-somerville-jade-20240219-work-control-plane-ac9af912-gjv45 1/1 Running 0 18h + > kube-apiserver-somerville-jade-20240219-work-control-plane-ac9af912-m4vdt 1/1 Running 0 18h + > kube-apiserver-somerville-jade-20240219-work-control-plane-ac9af912-v42dq 1/1 Running 0 18h + > kube-controller-manager-somerville-jade-20240219-work-control-plane-ac9af912-gjv45 1/1 Running 4 (18h ago) 18h + > kube-controller-manager-somerville-jade-20240219-work-control-plane-ac9af912-m4vdt 1/1 Running 2 (18h ago) 18h + > kube-controller-manager-somerville-jade-20240219-work-control-plane-ac9af912-v42dq 1/1 Running 0 18h + > kube-proxy-6ccvh 1/1 Running 0 18h + > kube-proxy-6vr7b 1/1 Running 0 18h + > kube-proxy-7qwtb 1/1 Running 0 18h + > kube-proxy-8pn9v 1/1 Running 0 18h + > kube-proxy-dpzg8 1/1 Running 0 18h + > kube-proxy-ppr9v 1/1 Running 0 18h + > kube-proxy-qn22t 1/1 Running 0 18h + > kube-proxy-rj9qh 1/1 Running 0 18h + > kube-proxy-vpskm 1/1 Running 0 18h + > kube-scheduler-somerville-jade-20240219-work-control-plane-ac9af912-gjv45 1/1 Running 4 (18h ago) 18h + > kube-scheduler-somerville-jade-20240219-work-control-plane-ac9af912-m4vdt 1/1 Running 2 (18h ago) 18h + > kube-scheduler-somerville-jade-20240219-work-control-plane-ac9af912-v42dq 1/1 Running 0 18h + > metrics-server-65cccfc7bb-k594p 1/1 Running 0 18h + + +# ----------------------------------------------------- +# Check the 'kube-apiserver' Pod in the 
'kube-system' namespace. +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + get pods \ + --namespace kube-system \ + --output json \ + | jq -r ".items[].metadata.name | select(. | startswith(\"kube-apiserver\"))" + ' + + > kube-apiserver-somerville-jade-20240219-work-control-plane-ac9af912-gjv45 + > kube-apiserver-somerville-jade-20240219-work-control-plane-ac9af912-m4vdt + > kube-apiserver-somerville-jade-20240219-work-control-plane-ac9af912-v42dq + + + ssh bootstrap -t \ + ' + source loadconfig + for podname in $( + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + get pods \ + --namespace kube-system \ + --output json \ + | jq -r ".items[].metadata.name | select(. | startswith(\"kube-apiserver\"))" + ) + do + echo "" + echo "---- ---- ---- ----" + echo "Podname [${podname}]" + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + describe pod \ + --namespace kube-system \ + "${podname}" + done + ' + + > ---- ---- ---- ---- + > Podname [kube-apiserver-somerville-jade-20240219-work-control-plane-ac9af912-gjv45] + > Name: kube-apiserver-somerville-jade-20240219-work-control-plane-ac9af912-gjv45 + > Namespace: kube-system + > Priority: 2000001000 + > Priority Class Name: system-node-critical + > Node: somerville-jade-20240219-work-control-plane-ac9af912-gjv45/192.168.3.186 + > Start Time: Mon, 19 Feb 2024 17:02:41 +0000 + > Labels: component=kube-apiserver + > tier=control-plane + > Annotations: kubeadm.kubernetes.io/kube-apiserver.advertise-address.endpoint: 192.168.3.186:6443 + > kubernetes.io/config.hash: 46c32cf496946e8498634f34e761d972 + > kubernetes.io/config.mirror: 46c32cf496946e8498634f34e761d972 + > kubernetes.io/config.seen: 2024-02-19T17:02:40.871951952Z + > kubernetes.io/config.source: file + > Status: Running + > SeccompProfile: RuntimeDefault + > IP: 192.168.3.186 + > IPs: + > IP: 192.168.3.186 + > Controlled By: Node/somerville-jade-20240219-work-control-plane-ac9af912-gjv45 + > 
Containers: + > kube-apiserver: + > Container ID: containerd://620263b18a07caf23fd79658055a6d5ef32ca555c8c203ae054322aa7afc0adf + > Image: registry.k8s.io/kube-apiserver:v1.26.7 + > Image ID: registry.k8s.io/kube-apiserver@sha256:c3b8fbd0418e29e8a3d49fbeebc187ffba6d0b2e437fc6c4db2cfb69b19163bf + > Port: + > Host Port: + > Command: + > kube-apiserver + > --advertise-address=192.168.3.186 + > --allow-privileged=true + > --authorization-mode=Node,RBAC + > --client-ca-file=/etc/kubernetes/pki/ca.crt + > --cloud-provider=external + > --enable-admission-plugins=NodeRestriction + > --enable-bootstrap-token-auth=true + > --etcd-cafile=/etc/kubernetes/pki/etcd/ca.crt + > --etcd-certfile=/etc/kubernetes/pki/apiserver-etcd-client.crt + > --etcd-keyfile=/etc/kubernetes/pki/apiserver-etcd-client.key + > --etcd-servers=https://127.0.0.1:2379 + > --kubelet-client-certificate=/etc/kubernetes/pki/apiserver-kubelet-client.crt + > --kubelet-client-key=/etc/kubernetes/pki/apiserver-kubelet-client.key + > --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname + > --proxy-client-cert-file=/etc/kubernetes/pki/front-proxy-client.crt + > --proxy-client-key-file=/etc/kubernetes/pki/front-proxy-client.key + > --requestheader-allowed-names=front-proxy-client + > --requestheader-client-ca-file=/etc/kubernetes/pki/front-proxy-ca.crt + > --requestheader-extra-headers-prefix=X-Remote-Extra- + > --requestheader-group-headers=X-Remote-Group + > --requestheader-username-headers=X-Remote-User + > --secure-port=6443 + > --service-account-issuer=https://kubernetes.default.svc.cluster.local + > --service-account-key-file=/etc/kubernetes/pki/sa.pub + > --service-account-signing-key-file=/etc/kubernetes/pki/sa.key + > --service-cluster-ip-range=172.24.0.0/13 + > --tls-cert-file=/etc/kubernetes/pki/apiserver.crt + > --tls-private-key-file=/etc/kubernetes/pki/apiserver.key + > State: Running + > Started: Mon, 19 Feb 2024 17:01:51 +0000 + > Ready: True + > Restart Count: 0 + > Requests: + > cpu: 
250m + > Liveness: http-get https://192.168.3.186:6443/livez delay=10s timeout=15s period=10s #success=1 #failure=8 + > Readiness: http-get https://192.168.3.186:6443/readyz delay=0s timeout=15s period=1s #success=1 #failure=3 + > Startup: http-get https://192.168.3.186:6443/livez delay=10s timeout=15s period=10s #success=1 #failure=24 + > Environment: + > Mounts: + > /etc/ca-certificates from etc-ca-certificates (ro) + > /etc/kubernetes/pki from k8s-certs (ro) + > /etc/ssl/certs from ca-certs (ro) + > /usr/local/share/ca-certificates from usr-local-share-ca-certificates (ro) + > /usr/share/ca-certificates from usr-share-ca-certificates (ro) + > Conditions: + > Type Status + > Initialized True + > Ready True + > ContainersReady True + > PodScheduled True + > Volumes: + > ca-certs: + > Type: HostPath (bare host directory volume) + > Path: /etc/ssl/certs + > HostPathType: DirectoryOrCreate + > etc-ca-certificates: + > Type: HostPath (bare host directory volume) + > Path: /etc/ca-certificates + > HostPathType: DirectoryOrCreate + > k8s-certs: + > Type: HostPath (bare host directory volume) + > Path: /etc/kubernetes/pki + > HostPathType: DirectoryOrCreate + > usr-local-share-ca-certificates: + > Type: HostPath (bare host directory volume) + > Path: /usr/local/share/ca-certificates + > HostPathType: DirectoryOrCreate + > usr-share-ca-certificates: + > Type: HostPath (bare host directory volume) + > Path: /usr/share/ca-certificates + > HostPathType: DirectoryOrCreate + > QoS Class: Burstable + > Node-Selectors: + > Tolerations: :NoExecute op=Exists + > Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Warning Unhealthy 93m (x21 over 18h) kubelet Liveness probe failed: HTTP probe failed with statuscode: 500 + > Warning Unhealthy 4m10s (x251 over 18h) kubelet Readiness probe failed: HTTP probe failed with statuscode: 500 + > + > ---- ---- ---- ---- + > Podname [kube-apiserver-somerville-jade-20240219-work-control-plane-ac9af912-m4vdt] + > Name: 
kube-apiserver-somerville-jade-20240219-work-control-plane-ac9af912-m4vdt + > Namespace: kube-system + > Priority: 2000001000 + > Priority Class Name: system-node-critical + > Node: somerville-jade-20240219-work-control-plane-ac9af912-m4vdt/192.168.3.47 + > Start Time: Mon, 19 Feb 2024 17:05:09 +0000 + > Labels: component=kube-apiserver + > tier=control-plane + > Annotations: kubeadm.kubernetes.io/kube-apiserver.advertise-address.endpoint: 192.168.3.47:6443 + > kubernetes.io/config.hash: f6125d635ed29a6fe511cb611ace8915 + > kubernetes.io/config.mirror: f6125d635ed29a6fe511cb611ace8915 + > kubernetes.io/config.seen: 2024-02-19T17:05:08.276846295Z + > kubernetes.io/config.source: file + > Status: Running + > SeccompProfile: RuntimeDefault + > IP: 192.168.3.47 + > IPs: + > IP: 192.168.3.47 + > Controlled By: Node/somerville-jade-20240219-work-control-plane-ac9af912-m4vdt + > Containers: + > kube-apiserver: + > Container ID: containerd://5d8bc6b204199714638784346538cd226ebd6ae9f9321d3181fbae2e4b429733 + > Image: registry.k8s.io/kube-apiserver:v1.26.7 + > Image ID: registry.k8s.io/kube-apiserver@sha256:c3b8fbd0418e29e8a3d49fbeebc187ffba6d0b2e437fc6c4db2cfb69b19163bf + > Port: + > Host Port: + > Command: + > kube-apiserver + > --advertise-address=192.168.3.47 + > --allow-privileged=true + > --authorization-mode=Node,RBAC + > --client-ca-file=/etc/kubernetes/pki/ca.crt + > --cloud-provider=external + > --enable-admission-plugins=NodeRestriction + > --enable-bootstrap-token-auth=true + > --etcd-cafile=/etc/kubernetes/pki/etcd/ca.crt + > --etcd-certfile=/etc/kubernetes/pki/apiserver-etcd-client.crt + > --etcd-keyfile=/etc/kubernetes/pki/apiserver-etcd-client.key + > --etcd-servers=https://127.0.0.1:2379 + > --kubelet-client-certificate=/etc/kubernetes/pki/apiserver-kubelet-client.crt + > --kubelet-client-key=/etc/kubernetes/pki/apiserver-kubelet-client.key + > --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname + > 
--proxy-client-cert-file=/etc/kubernetes/pki/front-proxy-client.crt + > --proxy-client-key-file=/etc/kubernetes/pki/front-proxy-client.key + > --requestheader-allowed-names=front-proxy-client + > --requestheader-client-ca-file=/etc/kubernetes/pki/front-proxy-ca.crt + > --requestheader-extra-headers-prefix=X-Remote-Extra- + > --requestheader-group-headers=X-Remote-Group + > --requestheader-username-headers=X-Remote-User + > --secure-port=6443 + > --service-account-issuer=https://kubernetes.default.svc.cluster.local + > --service-account-key-file=/etc/kubernetes/pki/sa.pub + > --service-account-signing-key-file=/etc/kubernetes/pki/sa.key + > --service-cluster-ip-range=172.24.0.0/13 + > --tls-cert-file=/etc/kubernetes/pki/apiserver.crt + > --tls-private-key-file=/etc/kubernetes/pki/apiserver.key + > State: Running + > Started: Mon, 19 Feb 2024 17:05:24 +0000 + > Ready: True + > Restart Count: 0 + > Requests: + > cpu: 250m + > Liveness: http-get https://192.168.3.47:6443/livez delay=10s timeout=15s period=10s #success=1 #failure=8 + > Readiness: http-get https://192.168.3.47:6443/readyz delay=0s timeout=15s period=1s #success=1 #failure=3 + > Startup: http-get https://192.168.3.47:6443/livez delay=10s timeout=15s period=10s #success=1 #failure=24 + > Environment: + > Mounts: + > /etc/ca-certificates from etc-ca-certificates (ro) + > /etc/kubernetes/pki from k8s-certs (ro) + > /etc/ssl/certs from ca-certs (ro) + > /usr/local/share/ca-certificates from usr-local-share-ca-certificates (ro) + > /usr/share/ca-certificates from usr-share-ca-certificates (ro) + > Conditions: + > Type Status + > Initialized True + > Ready True + > ContainersReady True + > PodScheduled True + > Volumes: + > ca-certs: + > Type: HostPath (bare host directory volume) + > Path: /etc/ssl/certs + > HostPathType: DirectoryOrCreate + > etc-ca-certificates: + > Type: HostPath (bare host directory volume) + > Path: /etc/ca-certificates + > HostPathType: DirectoryOrCreate + > k8s-certs: + > Type: HostPath 
(bare host directory volume) + > Path: /etc/kubernetes/pki + > HostPathType: DirectoryOrCreate + > usr-local-share-ca-certificates: + > Type: HostPath (bare host directory volume) + > Path: /usr/local/share/ca-certificates + > HostPathType: DirectoryOrCreate + > usr-share-ca-certificates: + > Type: HostPath (bare host directory volume) + > Path: /usr/share/ca-certificates + > HostPathType: DirectoryOrCreate + > QoS Class: Burstable + > Node-Selectors: + > Tolerations: :NoExecute op=Exists + > Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Normal Pulled 18h kubelet Container image "registry.k8s.io/kube-apiserver:v1.26.7" already present on machine + > Normal Created 18h kubelet Created container kube-apiserver + > Normal Started 18h kubelet Started container kube-apiserver + > Warning Unhealthy 18h kubelet Startup probe failed: HTTP probe failed with statuscode: 403 + > Warning Unhealthy 18h (x4 over 18h) kubelet Startup probe failed: HTTP probe failed with statuscode: 500 + > Warning Unhealthy 13m (x139 over 18h) kubelet Readiness probe failed: HTTP probe failed with statuscode: 500 + > Warning Unhealthy 3m17s (x21 over 18h) kubelet Liveness probe failed: HTTP probe failed with statuscode: 500 + > + > ---- ---- ---- ---- + > Podname [kube-apiserver-somerville-jade-20240219-work-control-plane-ac9af912-v42dq] + > Name: kube-apiserver-somerville-jade-20240219-work-control-plane-ac9af912-v42dq + > Namespace: kube-system + > Priority: 2000001000 + > Priority Class Name: system-node-critical + > Node: somerville-jade-20240219-work-control-plane-ac9af912-v42dq/192.168.3.171 + > Start Time: Mon, 19 Feb 2024 17:09:40 +0000 + > Labels: component=kube-apiserver + > tier=control-plane + > Annotations: kubeadm.kubernetes.io/kube-apiserver.advertise-address.endpoint: 192.168.3.171:6443 + > kubernetes.io/config.hash: 94787ecd0b24bc15df444d00e1ead91e + > kubernetes.io/config.mirror: 94787ecd0b24bc15df444d00e1ead91e + > kubernetes.io/config.seen: 
2024-02-19T17:09:35.786020973Z + > kubernetes.io/config.source: file + > Status: Running + > SeccompProfile: RuntimeDefault + > IP: 192.168.3.171 + > IPs: + > IP: 192.168.3.171 + > Controlled By: Node/somerville-jade-20240219-work-control-plane-ac9af912-v42dq + > Containers: + > kube-apiserver: + > Container ID: containerd://b6ecd2f7c06f03576508e0617ac9e8ce93b5321b1905079da9a97a015d0869c7 + > Image: registry.k8s.io/kube-apiserver:v1.26.7 + > Image ID: registry.k8s.io/kube-apiserver@sha256:c3b8fbd0418e29e8a3d49fbeebc187ffba6d0b2e437fc6c4db2cfb69b19163bf + > Port: + > Host Port: + > Command: + > kube-apiserver + > --advertise-address=192.168.3.171 + > --allow-privileged=true + > --authorization-mode=Node,RBAC + > --client-ca-file=/etc/kubernetes/pki/ca.crt + > --cloud-provider=external + > --enable-admission-plugins=NodeRestriction + > --enable-bootstrap-token-auth=true + > --etcd-cafile=/etc/kubernetes/pki/etcd/ca.crt + > --etcd-certfile=/etc/kubernetes/pki/apiserver-etcd-client.crt + > --etcd-keyfile=/etc/kubernetes/pki/apiserver-etcd-client.key + > --etcd-servers=https://127.0.0.1:2379 + > --kubelet-client-certificate=/etc/kubernetes/pki/apiserver-kubelet-client.crt + > --kubelet-client-key=/etc/kubernetes/pki/apiserver-kubelet-client.key + > --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname + > --proxy-client-cert-file=/etc/kubernetes/pki/front-proxy-client.crt + > --proxy-client-key-file=/etc/kubernetes/pki/front-proxy-client.key + > --requestheader-allowed-names=front-proxy-client + > --requestheader-client-ca-file=/etc/kubernetes/pki/front-proxy-ca.crt + > --requestheader-extra-headers-prefix=X-Remote-Extra- + > --requestheader-group-headers=X-Remote-Group + > --requestheader-username-headers=X-Remote-User + > --secure-port=6443 + > --service-account-issuer=https://kubernetes.default.svc.cluster.local + > --service-account-key-file=/etc/kubernetes/pki/sa.pub + > --service-account-signing-key-file=/etc/kubernetes/pki/sa.key + > 
--service-cluster-ip-range=172.24.0.0/13 + > --tls-cert-file=/etc/kubernetes/pki/apiserver.crt + > --tls-private-key-file=/etc/kubernetes/pki/apiserver.key + > State: Running + > Started: Mon, 19 Feb 2024 17:09:56 +0000 + > Ready: True + > Restart Count: 0 + > Requests: + > cpu: 250m + > Liveness: http-get https://192.168.3.171:6443/livez delay=10s timeout=15s period=10s #success=1 #failure=8 + > Readiness: http-get https://192.168.3.171:6443/readyz delay=0s timeout=15s period=1s #success=1 #failure=3 + > Startup: http-get https://192.168.3.171:6443/livez delay=10s timeout=15s period=10s #success=1 #failure=24 + > Environment: + > Mounts: + > /etc/ca-certificates from etc-ca-certificates (ro) + > /etc/kubernetes/pki from k8s-certs (ro) + > /etc/ssl/certs from ca-certs (ro) + > /usr/local/share/ca-certificates from usr-local-share-ca-certificates (ro) + > /usr/share/ca-certificates from usr-share-ca-certificates (ro) + > Conditions: + > Type Status + > Initialized True + > Ready True + > ContainersReady True + > PodScheduled True + > Volumes: + > ca-certs: + > Type: HostPath (bare host directory volume) + > Path: /etc/ssl/certs + > HostPathType: DirectoryOrCreate + > etc-ca-certificates: + > Type: HostPath (bare host directory volume) + > Path: /etc/ca-certificates + > HostPathType: DirectoryOrCreate + > k8s-certs: + > Type: HostPath (bare host directory volume) + > Path: /etc/kubernetes/pki + > HostPathType: DirectoryOrCreate + > usr-local-share-ca-certificates: + > Type: HostPath (bare host directory volume) + > Path: /usr/local/share/ca-certificates + > HostPathType: DirectoryOrCreate + > usr-share-ca-certificates: + > Type: HostPath (bare host directory volume) + > Path: /usr/share/ca-certificates + > HostPathType: DirectoryOrCreate + > QoS Class: Burstable + > Node-Selectors: + > Tolerations: :NoExecute op=Exists + > Events: + > Type Reason Age From Message + > ---- ------ ---- ---- ------- + > Normal Pulled 18h kubelet Container image 
"registry.k8s.io/kube-apiserver:v1.26.7" already present on machine + > Normal Created 18h kubelet Created container kube-apiserver + > Normal Started 18h kubelet Started container kube-apiserver + > Warning Unhealthy 18h (x5 over 18h) kubelet Startup probe failed: Get "https://192.168.3.171:6443/livez": dial tcp 192.168.3.171:6443: connect: connection refused + > Warning Unhealthy 18h kubelet Startup probe failed: Get "https://192.168.3.171:6443/livez": net/http: TLS handshake timeout + > Warning Unhealthy 18h kubelet Startup probe failed: HTTP probe failed with statuscode: 403 + > Warning Unhealthy 27m (x28 over 18h) kubelet Liveness probe failed: HTTP probe failed with statuscode: 500 + > Warning Unhealthy 9m40s (x144 over 18h) kubelet Readiness probe failed: HTTP probe failed with statuscode: 500 + + + +# ----------------------------------------------------- +# Slow down and read Scott's comment again. +# He actually tells us how to find out what the IP address is. +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + get services \ + --namespace default + ' + + > NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + > kubernetes ClusterIP 172.24.0.1 443/TCP 19h + + +# ----------------------------------------------------- +# Found some documentation about debugging services. 
+# https://kubernetes.io/docs/tasks/debug/debug-application/debug-service/ +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + describe service \ + --namespace default \ + kubernetes + ' + + > Name: kubernetes + > Namespace: default + > Labels: component=apiserver + > provider=kubernetes + > Annotations: + > Selector: + > Type: ClusterIP + > IP Family Policy: SingleStack + > IP Families: IPv4 + > IP: 172.24.0.1 + > IPs: 172.24.0.1 + > Port: https 443/TCP + > TargetPort: 6443/TCP + > Endpoints: 192.168.3.171:6443,192.168.3.186:6443,192.168.3.47:6443 + > Session Affinity: None + > Events: + + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + get service \ + --namespace default \ + kubernetes \ + --output json + ' + + > { + > "apiVersion": "v1", + > "kind": "Service", + > "metadata": { + > "creationTimestamp": "2024-02-19T17:01:57Z", + > "labels": { + > "component": "apiserver", + > "provider": "kubernetes" + > }, + > "name": "kubernetes", + > "namespace": "default", + > "resourceVersion": "191", + > "uid": "7080949b-a9fc-48bf-89fd-bc9098ed2132" + > }, + > "spec": { + > "clusterIP": "172.24.0.1", + > "clusterIPs": [ + > "172.24.0.1" + > ], + > "internalTrafficPolicy": "Cluster", + > "ipFamilies": [ + > "IPv4" + > ], + > "ipFamilyPolicy": "SingleStack", + > "ports": [ + > { + > "name": "https", + > "port": 443, + > "protocol": "TCP", + > "targetPort": 6443 + > } + > ], + > "sessionAffinity": "None", + > "type": "ClusterIP" + > }, + > "status": { + > "loadBalancer": {} + > } + > } + diff --git a/notes/zrq/20240220-02-bookmarks.txt b/notes/zrq/20240220-02-bookmarks.txt new file mode 100644 index 00000000..5c6bdc0c --- /dev/null +++ b/notes/zrq/20240220-02-bookmarks.txt @@ -0,0 +1,56 @@ +# +# +# +# Copyright (c) 2024, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU 
General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# +# AIMetrics: [] +# + + Target: + + Just some bookmarks that might be useful. + + Result: + + Work in progress ... + +# ----------------------------------------------------- + + Cinder volumes and Kubernetes - slow + https://stackhpc.github.io/azimuth-config/configuration/01-prerequisites/#cinder-volumes-and-kubernetes + + Accessing the Kubernetes API from a Pod + https://kubernetes.io/docs/tasks/run-application/access-api-from-pod/ + + Kubernetes Debugging Services + https://kubernetes.io/docs/tasks/debug/debug-application/debug-service/ + + Using Kubectl Logs | Complete Guide to viewing Kubernetes Pod Logs + https://signoz.io/blog/kubectl-logs/ + + StackHPC capi-helm-charts Monitoring logging and + https://github.com/stackhpc/capi-helm-charts/tree/main/charts/cluster-addons#monitoring-and-logging + + diff --git a/notes/zrq/20240221-01-jade-deploy.txt b/notes/zrq/20240221-01-jade-deploy.txt new file mode 100644 index 00000000..49f71932 --- /dev/null +++ b/notes/zrq/20240221-01-jade-deploy.txt @@ -0,0 +1,1476 @@ +# +# +# +# Copyright (c) 2024, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# +# AIMetrics: [] +# + + Target: + + Try a new deployment, keeping track of how long it takes to become healthy. + + Result: + + Deploy failed. + Left it for 30hrs and still not resolved. + + +# ----------------------------------------------------- +# Run our local client. +#[user@desktop] + + source "${HOME:?}/aglais.env" + export PATH=${PATH}:${AGLAIS_CODE}/bin + + kube-client jade + + > .... + > .... + + +# ----------------------------------------------------- +# Delete and create everything. +#[root@ansibler] + + export cloudsite=somerville-jade + + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + ansible-playbook \ + --inventory 'bootstrap,' \ + '/deployments/cluster-api/ansible/00-create-all.yml' + + > .... + > .... + + +# ----------------------------------------------------- +# Check the deployment configuration. +#[root@ansibler] + + cat /opt/aglais/aglais-status.yml + + > .... + > .... + + +# ----------------------------------------------------- +# Check the cluster status. 
+#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + clusterctl \ + --kubeconfig "${kindclusterconf:?}" \ + describe cluster \ + --grouping=false \ + --show-conditions all \ + "${workclustername:?}" + ' + + > NAME READY SEVERITY REASON SINCE MESSAGE + > Cluster/somerville-jade-20240221-work False Warning ScalingUp 12m Scaling up control plane to 3 replicas (actual 1) + > │ ├─ControlPlaneInitialized True 10m + > │ ├─ControlPlaneReady False Warning ScalingUp 12m Scaling up control plane to 3 replicas (actual 1) + > │ └─InfrastructureReady True 13m + > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240221-work + > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240221-work-control-plane False Warning ScalingUp 12m Scaling up control plane to 3 replicas (actual 1) + > │ │ ├─Available True 10m + > │ │ ├─CertificatesAvailable True 13m + > │ │ ├─MachinesReady False Warning NodeStartupTimeout @ /somerville-jade-20240221-work-control-plane-bdfrr 52s Node failed to report startup in 10m0s + > │ │ └─Resized False Warning ScalingUp 12m Scaling up control plane to 3 replicas (actual 1) + > │ └─Machine/somerville-jade-20240221-work-control-plane-bdfrr False Warning NodeStartupTimeout 55s Node failed to report startup in 10m0s + > │ ├─BootstrapReady True 13m + > │ ├─HealthCheckSucceeded False Warning NodeStartupTimeout 55s Node failed to report startup in 10m0s + > │ ├─InfrastructureReady True 12m + > │ ├─NodeHealthy False Warning NodeProvisioning 10m + > │ └─OwnerRemediated False Warning WaitingForRemediation 53s KCP can't remediate if current replicas are less or equal to 1 + > └─Workers + > └─MachineDeployment/somerville-jade-20240221-work-md-0 False Warning WaitingForAvailableMachines 14m Minimum availability requires 5 replicas, current 0 available + > │ └─Available False Warning WaitingForAvailableMachines 14m Minimum availability requires 5 replicas, current 0 available + > ├─Machine/somerville-jade-20240221-work-md-0-bc4ps-jqcjp True 14s 
+ > │ ├─BootstrapReady True 27s + > │ ├─InfrastructureReady True 14s + > │ └─NodeHealthy False Warning NodeProvisioning 13s + > ├─Machine/somerville-jade-20240221-work-md-0-bc4ps-sc5pc True 17s + > │ ├─BootstrapReady True 29s + > │ ├─InfrastructureReady True 17s + > │ └─NodeHealthy False Warning NodeProvisioning 16s + > ├─Machine/somerville-jade-20240221-work-md-0-bc4ps-sc5xt True 14s + > │ ├─BootstrapReady True 26s + > │ ├─InfrastructureReady True 14s + > │ └─NodeHealthy False Warning NodeProvisioning 14s + > ├─Machine/somerville-jade-20240221-work-md-0-bc4ps-spnmh True 23s + > │ ├─BootstrapReady True 35s + > │ ├─InfrastructureReady True 23s + > │ └─NodeHealthy False Warning NodeProvisioning 22s + > ├─Machine/somerville-jade-20240221-work-md-0-bc4ps-xccp4 True 11s + > │ ├─BootstrapReady True 24s + > │ ├─InfrastructureReady True 11s + > │ └─NodeHealthy False Warning NodeProvisioning 10s + > └─Machine/somerville-jade-20240221-work-md-0-bc4ps-xmx67 True 18s + > ├─BootstrapReady True 32s + > ├─InfrastructureReady True 18s + > └─NodeHealthy False Warning NodeProvisioning 18s + + +# ----------------------------------------------------- +# List our machines in Openstack. 
+#[root@ansibler] + + openstack \ + --os-cloud "${cloudname:?}" \ + server list + + > +--------------------------------------+------------------------------------------------------------+--------+----------------------------------------------------------------------------+-----------------------------------+----------------+ + > | ID | Name | Status | Networks | Image | Flavor | + > +--------------------------------------+------------------------------------------------------------+--------+----------------------------------------------------------------------------+-----------------------------------+----------------+ + > | 9930656b-57eb-4455-b697-5e135a55679a | somerville-jade-20240221-work-md-0-f538b732-x9gr9 | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240221-work=192.168.3.115 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | b21a5408-3ceb-45de-89ff-ed50d2bdf2e3 | somerville-jade-20240221-work-md-0-f538b732-9cflr | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240221-work=192.168.3.96 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | eb4702e1-29d8-43f4-bfb5-b8da42edcc54 | somerville-jade-20240221-work-md-0-f538b732-s46n8 | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240221-work=192.168.3.39 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | 0babc155-fc63-483b-8df2-e027cd751110 | somerville-jade-20240221-work-md-0-f538b732-6xkbt | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240221-work=192.168.3.119 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | d5d50923-dcbe-4d9f-958e-6991fb595d42 | somerville-jade-20240221-work-md-0-f538b732-ptksh | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240221-work=192.168.3.31 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | bd6abaff-27ad-4010-bba2-e78d29c94f6b | somerville-jade-20240221-work-md-0-f538b732-9x7bk | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240221-work=192.168.3.191 | 
gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.26vcpu | + > | 073c0051-8805-4719-a589-3769c66339b1 | somerville-jade-20240221-work-control-plane-c6b6f2d1-f5k5w | ACTIVE | k8s-clusterapi-cluster-default-somerville-jade-20240221-work=192.168.3.176 | gaia-dmp-ubuntu-2204-kube-v1.26.7 | gaia.vm.2vcpu | + > | 086da559-fea9-4875-aa79-53cc0dbe7011 | somerville-jade-20240221-bootstrap-node | ACTIVE | somerville-jade-20240221-bootstrap-network=10.10.2.166, 192.41.122.71 | gaia-dmp-fedora-cloud-38-1.6 | gaia.vm.2vcpu | + > +--------------------------------------+------------------------------------------------------------+--------+----------------------------------------------------------------------------+-----------------------------------+----------------+ + + +# ----------------------------------------------------- +# List our machines in the KinD cluster. +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + get machines \ + --all-namespaces + ' + + > NAMESPACE NAME CLUSTER NODENAME PROVIDERID PHASE AGE VERSION + > default somerville-jade-20240221-work-control-plane-bdfrr somerville-jade-20240221-work openstack:///073c0051-8805-4719-a589-3769c66339b1 Provisioned 19m v1.26.7 + > default somerville-jade-20240221-work-md-0-bc4ps-jqcjp somerville-jade-20240221-work openstack:///eb4702e1-29d8-43f4-bfb5-b8da42edcc54 Provisioned 6m32s v1.26.7 + > default somerville-jade-20240221-work-md-0-bc4ps-sc5pc somerville-jade-20240221-work openstack:///0babc155-fc63-483b-8df2-e027cd751110 Provisioned 6m34s v1.26.7 + > default somerville-jade-20240221-work-md-0-bc4ps-sc5xt somerville-jade-20240221-work openstack:///b21a5408-3ceb-45de-89ff-ed50d2bdf2e3 Provisioned 6m31s v1.26.7 + > default somerville-jade-20240221-work-md-0-bc4ps-spnmh somerville-jade-20240221-work openstack:///bd6abaff-27ad-4010-bba2-e78d29c94f6b Provisioned 6m39s v1.26.7 + > default somerville-jade-20240221-work-md-0-bc4ps-xccp4 somerville-jade-20240221-work 
openstack:///9930656b-57eb-4455-b697-5e135a55679a Provisioned 6m29s v1.26.7 + > default somerville-jade-20240221-work-md-0-bc4ps-xmx67 somerville-jade-20240221-work openstack:///d5d50923-dcbe-4d9f-958e-6991fb595d42 Provisioned 6m36s v1.26.7 + + +# ----------------------------------------------------- +# List our nodes in the work cluster. +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + get nodes \ + --all-namespaces + ' + + > E0221 07:03:35.168872 15478 memcache.go:287] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > E0221 07:03:35.210309 15478 memcache.go:121] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > E0221 07:03:35.220496 15478 memcache.go:121] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > E0221 07:03:35.227506 15478 memcache.go:121] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > NAME STATUS ROLES AGE VERSION + > somerville-jade-20240221-work-control-plane-c6b6f2d1-f5k5w Ready control-plane 17m v1.26.7 + > somerville-jade-20240221-work-md-0-f538b732-474d8 NotReady 15m v1.26.7 + > somerville-jade-20240221-work-md-0-f538b732-5tgnn NotReady 15m v1.26.7 + > somerville-jade-20240221-work-md-0-f538b732-6xkbt Ready 6m5s v1.26.7 + > somerville-jade-20240221-work-md-0-f538b732-8d8n2 NotReady 16m v1.26.7 + > somerville-jade-20240221-work-md-0-f538b732-9cflr Ready 5m34s v1.26.7 + > somerville-jade-20240221-work-md-0-f538b732-9x7bk Ready 5m50s v1.26.7 + > somerville-jade-20240221-work-md-0-f538b732-fbpln NotReady 15m v1.26.7 + > somerville-jade-20240221-work-md-0-f538b732-phgtl NotReady 16m v1.26.7 + > somerville-jade-20240221-work-md-0-f538b732-ptksh Ready 6m30s v1.26.7 + > somerville-jade-20240221-work-md-0-f538b732-s46n8 Ready 5m4s v1.26.7 + 
> somerville-jade-20240221-work-md-0-f538b732-vfckp NotReady 16m v1.26.7 + > somerville-jade-20240221-work-md-0-f538b732-x9gr9 Ready 6m18s v1.26.7 + + + + +# ----------------------------------------------------- +# Get the Kubernetes service status +# https://kubernetes.io/docs/tasks/debug/debug-application/debug-service/ +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + describe service \ + --namespace default \ + kubernetes + ' + + > E0221 07:13:43.703538 15770 memcache.go:287] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > E0221 07:13:43.732918 15770 memcache.go:121] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > E0221 07:13:43.737366 15770 memcache.go:121] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > E0221 07:13:43.741154 15770 memcache.go:121] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > Name: kubernetes + > Namespace: default + > Labels: component=apiserver + > provider=kubernetes + > Annotations: + > Selector: + > Type: ClusterIP + > IP Family Policy: SingleStack + > IP Families: IPv4 + > IP: 172.24.0.1 + > IPs: 172.24.0.1 + > Port: https 443/TCP + > TargetPort: 6443/TCP + > Endpoints: 192.168.3.176:6443 + > Session Affinity: None + > Events: + + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + get service \ + --namespace default \ + kubernetes \ + --output json + ' + > { + > "apiVersion": "v1", + > "kind": "Service", + > "metadata": { + > "creationTimestamp": "2024-02-21T06:45:23Z", + > "labels": { + > "component": "apiserver", + > "provider": "kubernetes" + > }, + > "name": "kubernetes", + > "namespace": "default", + > "resourceVersion": "195", + > "uid": 
"12c7b70c-cff8-4cf3-9f37-2a01ddf50193" + > }, + > "spec": { + > "clusterIP": "172.24.0.1", + > "clusterIPs": [ + > "172.24.0.1" + > ], + > "internalTrafficPolicy": "Cluster", + > "ipFamilies": [ + > "IPv4" + > ], + > "ipFamilyPolicy": "SingleStack", + > "ports": [ + > { + > "name": "https", + > "port": 443, + > "protocol": "TCP", + > "targetPort": 6443 + > } + > ], + > "sessionAffinity": "None", + > "type": "ClusterIP" + > }, + > "status": { + > "loadBalancer": {} + > } + > } + + +# ----------------------------------------------------- +# List stuff in the KinD cluster. +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + api-resources + ' + + > NAME SHORTNAMES APIVERSION NAMESPACED KIND + > bindings v1 true Binding + > componentstatuses cs v1 false ComponentStatus + > configmaps cm v1 true ConfigMap + > endpoints ep v1 true Endpoints + > events ev v1 true Event + > limitranges limits v1 true LimitRange + > namespaces ns v1 false Namespace + > nodes no v1 false Node + > persistentvolumeclaims pvc v1 true PersistentVolumeClaim + > persistentvolumes pv v1 false PersistentVolume + > pods po v1 true Pod + > podtemplates v1 true PodTemplate + > replicationcontrollers rc v1 true ReplicationController + > resourcequotas quota v1 true ResourceQuota + > secrets v1 true Secret + > serviceaccounts sa v1 true ServiceAccount + > services svc v1 true Service + > challenges acme.cert-manager.io/v1 true Challenge + > orders acme.cert-manager.io/v1 true Order + > clusterresourcesetbindings addons.cluster.x-k8s.io/v1beta1 true ClusterResourceSetBinding + > clusterresourcesets addons.cluster.x-k8s.io/v1beta1 true ClusterResourceSet + > helmreleases addons.stackhpc.com/v1alpha1 true HelmRelease + > manifests addons.stackhpc.com/v1alpha1 true Manifests + > mutatingwebhookconfigurations admissionregistration.k8s.io/v1 false MutatingWebhookConfiguration + > validatingwebhookconfigurations admissionregistration.k8s.io/v1 
false ValidatingWebhookConfiguration + > customresourcedefinitions crd,crds apiextensions.k8s.io/v1 false CustomResourceDefinition + > apiservices apiregistration.k8s.io/v1 false APIService + > controllerrevisions apps/v1 true ControllerRevision + > daemonsets ds apps/v1 true DaemonSet + > deployments deploy apps/v1 true Deployment + > replicasets rs apps/v1 true ReplicaSet + > statefulsets sts apps/v1 true StatefulSet + > tokenreviews authentication.k8s.io/v1 false TokenReview + > localsubjectaccessreviews authorization.k8s.io/v1 true LocalSubjectAccessReview + > selfsubjectaccessreviews authorization.k8s.io/v1 false SelfSubjectAccessReview + > selfsubjectrulesreviews authorization.k8s.io/v1 false SelfSubjectRulesReview + > subjectaccessreviews authorization.k8s.io/v1 false SubjectAccessReview + > horizontalpodautoscalers hpa autoscaling/v2 true HorizontalPodAutoscaler + > cronjobs cj batch/v1 true CronJob + > jobs batch/v1 true Job + > kubeadmconfigs bootstrap.cluster.x-k8s.io/v1beta1 true KubeadmConfig + > kubeadmconfigtemplates bootstrap.cluster.x-k8s.io/v1beta1 true KubeadmConfigTemplate + > certificaterequests cr,crs cert-manager.io/v1 true CertificateRequest + > certificates cert,certs cert-manager.io/v1 true Certificate + > clusterissuers cert-manager.io/v1 false ClusterIssuer + > issuers cert-manager.io/v1 true Issuer + > certificatesigningrequests csr certificates.k8s.io/v1 false CertificateSigningRequest + > clusterclasses cc cluster.x-k8s.io/v1beta1 true ClusterClass + > clusters cl cluster.x-k8s.io/v1beta1 true Cluster + > machinedeployments md cluster.x-k8s.io/v1beta1 true MachineDeployment + > machinehealthchecks mhc,mhcs cluster.x-k8s.io/v1beta1 true MachineHealthCheck + > machinepools mp cluster.x-k8s.io/v1beta1 true MachinePool + > machines ma cluster.x-k8s.io/v1beta1 true Machine + > machinesets ms cluster.x-k8s.io/v1beta1 true MachineSet + > providers clusterctl.cluster.x-k8s.io/v1alpha3 true Provider + > kubeadmcontrolplanes kcp 
controlplane.cluster.x-k8s.io/v1beta1 true KubeadmControlPlane + > kubeadmcontrolplanetemplates controlplane.cluster.x-k8s.io/v1beta1 true KubeadmControlPlaneTemplate + > leases coordination.k8s.io/v1 true Lease + > endpointslices discovery.k8s.io/v1 true EndpointSlice + > events ev events.k8s.io/v1 true Event + > flowschemas flowcontrol.apiserver.k8s.io/v1beta3 false FlowSchema + > prioritylevelconfigurations flowcontrol.apiserver.k8s.io/v1beta3 false PriorityLevelConfiguration + > openstackclusters osc infrastructure.cluster.x-k8s.io/v1alpha7 true OpenStackCluster + > openstackclustertemplates osct infrastructure.cluster.x-k8s.io/v1alpha7 true OpenStackClusterTemplate + > openstackmachines osm infrastructure.cluster.x-k8s.io/v1alpha7 true OpenStackMachine + > openstackmachinetemplates osmt infrastructure.cluster.x-k8s.io/v1alpha7 true OpenStackMachineTemplate + > ipaddressclaims ipam.cluster.x-k8s.io/v1beta1 true IPAddressClaim + > ipaddresses ipam.cluster.x-k8s.io/v1beta1 true IPAddress + > ingressclasses networking.k8s.io/v1 false IngressClass + > ingresses ing networking.k8s.io/v1 true Ingress + > networkpolicies netpol networking.k8s.io/v1 true NetworkPolicy + > runtimeclasses node.k8s.io/v1 false RuntimeClass + > poddisruptionbudgets pdb policy/v1 true PodDisruptionBudget + > clusterrolebindings rbac.authorization.k8s.io/v1 false ClusterRoleBinding + > clusterroles rbac.authorization.k8s.io/v1 false ClusterRole + > rolebindings rbac.authorization.k8s.io/v1 true RoleBinding + > roles rbac.authorization.k8s.io/v1 true Role + > extensionconfigs ext runtime.cluster.x-k8s.io/v1alpha1 false ExtensionConfig + > priorityclasses pc scheduling.k8s.io/v1 false PriorityClass + > csidrivers storage.k8s.io/v1 false CSIDriver + > csinodes storage.k8s.io/v1 false CSINode + > csistoragecapacities storage.k8s.io/v1 true CSIStorageCapacity + > storageclasses sc storage.k8s.io/v1 false StorageClass + > volumeattachments storage.k8s.io/v1 false VolumeAttachment + + + ssh 
bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + get clusters \ + --all-namespaces + ' + + > NAMESPACE NAME CLUSTERCLASS PHASE AGE VERSION + > default somerville-jade-20240221-work Provisioned 38m + + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + get cluster \ + --namespace default \ + 'somerville-jade-20240221-work' + ' + + > NAME CLUSTERCLASS PHASE AGE VERSION + > somerville-jade-20240221-work Provisioned 39m + + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + get cluster \ + --namespace default \ + --output json \ + 'somerville-jade-20240221-work' + ' + + > { + > "apiVersion": "cluster.x-k8s.io/v1beta1", + > "kind": "Cluster", + > "metadata": { + > "annotations": { + > "meta.helm.sh/release-name": "somerville-jade-20240221-work", + > "meta.helm.sh/release-namespace": "default" + > }, + > "creationTimestamp": "2024-02-21T06:42:08Z", + > "finalizers": [ + > "cluster.cluster.x-k8s.io" + > ], + > "generation": 3, + > "labels": { + > "app.kubernetes.io/managed-by": "Helm", + > "capi.stackhpc.com/cluster": "somerville-jade-20240221-work", + > "capi.stackhpc.com/infrastructure-provider": "openstack", + > "capi.stackhpc.com/managed-by": "Helm", + > "helm.sh/chart": "openstack-cluster-0.1.0" + > }, + > "name": "somerville-jade-20240221-work", + > "namespace": "default", + > "resourceVersion": "4059", + > "uid": "5d43f46b-4b5f-4a88-a27c-f2b446434a89" + > }, + > "spec": { + > "clusterNetwork": { + > "pods": { + > "cidrBlocks": [ + > "172.16.0.0/13" + > ] + > }, + > "serviceDomain": "cluster.local", + > "services": { + > "cidrBlocks": [ + > "172.24.0.0/13" + > ] + > } + > }, + > "controlPlaneEndpoint": { + > "host": "192.41.122.207", + > "port": 6443 + > }, + > "controlPlaneRef": { + > "apiVersion": "controlplane.cluster.x-k8s.io/v1beta1", + > "kind": "KubeadmControlPlane", + > "name": 
"somerville-jade-20240221-work-control-plane", + > "namespace": "default" + > }, + > "infrastructureRef": { + > "apiVersion": "infrastructure.cluster.x-k8s.io/v1alpha7", + > "kind": "OpenStackCluster", + > "name": "somerville-jade-20240221-work", + > "namespace": "default" + > } + > }, + > "status": { + > "conditions": [ + > { + > "lastTransitionTime": "2024-02-21T06:44:13Z", + > "message": "Scaling up control plane to 3 replicas (actual 1)", + > "reason": "ScalingUp", + > "severity": "Warning", + > "status": "False", + > "type": "Ready" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:45:58Z", + > "status": "True", + > "type": "ControlPlaneInitialized" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:44:13Z", + > "message": "Scaling up control plane to 3 replicas (actual 1)", + > "reason": "ScalingUp", + > "severity": "Warning", + > "status": "False", + > "type": "ControlPlaneReady" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:43:49Z", + > "status": "True", + > "type": "InfrastructureReady" + > } + > ], + > "controlPlaneReady": true, + > "failureDomains": { + > "nova": {}, + > "testbed": {} + > }, + > "infrastructureReady": true, + > "observedGeneration": 3, + > "phase": "Provisioned" + > } + > } + + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + get openstackcluster \ + --namespace default \ + --output json \ + 'somerville-jade-20240221-work' + ' + + > { + > "apiVersion": "infrastructure.cluster.x-k8s.io/v1alpha7", + > "kind": "OpenStackCluster", + > "metadata": { + > "annotations": { + > "helm.sh/resource-policy": "keep", + > "meta.helm.sh/release-name": "somerville-jade-20240221-work", + > "meta.helm.sh/release-namespace": "default" + > }, + > "creationTimestamp": "2024-02-21T06:42:10Z", + > "finalizers": [ + > "openstackcluster.infrastructure.cluster.x-k8s.io" + > ], + > "generation": 3, + > "labels": { + > "app.kubernetes.io/managed-by": "Helm", + > "capi.stackhpc.com/cluster": 
"somerville-jade-20240221-work", + > "capi.stackhpc.com/infrastructure-provider": "openstack", + > "capi.stackhpc.com/managed-by": "Helm", + > "cluster.x-k8s.io/cluster-name": "somerville-jade-20240221-work", + > "helm.sh/chart": "openstack-cluster-0.1.0" + > }, + > "name": "somerville-jade-20240221-work", + > "namespace": "default", + > "ownerReferences": [ + > { + > "apiVersion": "cluster.x-k8s.io/v1beta1", + > "blockOwnerDeletion": true, + > "controller": true, + > "kind": "Cluster", + > "name": "somerville-jade-20240221-work", + > "uid": "5d43f46b-4b5f-4a88-a27c-f2b446434a89" + > } + > ], + > "resourceVersion": "3818", + > "uid": "13e936cc-625b-40f5-8cac-1773818a067f" + > }, + > "spec": { + > "allowAllInClusterTraffic": true, + > "apiServerLoadBalancer": { + > "allowedCidrs": [ + > "192.41.122.71/32", + > "90.155.51.57/32" + > ], + > "enabled": true + > }, + > "apiServerPort": 6443, + > "cloudName": "openstack", + > "controlPlaneEndpoint": { + > "host": "192.41.122.207", + > "port": 6443 + > }, + > "controlPlaneOmitAvailabilityZone": true, + > "disableAPIServerFloatingIP": false, + > "externalNetworkId": "1875828a-ccc3-419b-87fd-856aaa781492", + > "identityRef": { + > "kind": "Secret", + > "name": "somerville-jade-20240221-work-cloud-credentials" + > }, + > "managedSecurityGroups": true, + > "network": {}, + > "nodeCidr": "192.168.3.0/24", + > "subnet": {} + > }, + > "status": { + > "apiServerLoadBalancer": { + > "allowedCIDRs": [ + > "192.168.3.0/24", + > "192.41.122.71/32", + > "192.41.122.98/32", + > "90.155.51.57/32" + > ], + > "id": "71b5524d-a508-496b-9f49-06edbf8ed358", + > "internalIP": "192.168.3.46", + > "ip": "192.41.122.207", + > "name": "k8s-clusterapi-cluster-default-somerville-jade-20240221-work-kubeapi" + > }, + > "controlPlaneSecurityGroup": { + > "id": "02053952-b7da-4e4a-a659-31c0ead42ef9", + > "name": "k8s-cluster-default-somerville-jade-20240221-work-secgroup-controlplane", + > "rules": [ + > { + > "description": "Full open", + > 
"direction": "egress", + > "etherType": "IPv4", + > "name": "93649003-4943-4480-8218-0ea309640a49", + > "portRangeMax": 0, + > "portRangeMin": 0, + > "protocol": "", + > "remoteGroupID": "", + > "remoteIPPrefix": "", + > "securityGroupID": "02053952-b7da-4e4a-a659-31c0ead42ef9" + > }, + > { + > "description": "Full open", + > "direction": "egress", + > "etherType": "IPv6", + > "name": "5285f124-624f-420e-9a80-2035b0351715", + > "portRangeMax": 0, + > "portRangeMin": 0, + > "protocol": "", + > "remoteGroupID": "", + > "remoteIPPrefix": "", + > "securityGroupID": "02053952-b7da-4e4a-a659-31c0ead42ef9" + > }, + > { + > "description": "Kubernetes API", + > "direction": "ingress", + > "etherType": "IPv4", + > "name": "7bcfc55e-4a0b-4be0-b89f-663444420860", + > "portRangeMax": 6443, + > "portRangeMin": 6443, + > "protocol": "tcp", + > "remoteGroupID": "", + > "remoteIPPrefix": "", + > "securityGroupID": "02053952-b7da-4e4a-a659-31c0ead42ef9" + > }, + > { + > "description": "In-cluster Ingress", + > "direction": "ingress", + > "etherType": "IPv4", + > "name": "3c813479-0661-4ae2-bc69-02338ef64c12", + > "portRangeMax": 0, + > "portRangeMin": 0, + > "protocol": "", + > "remoteGroupID": "02053952-b7da-4e4a-a659-31c0ead42ef9", + > "remoteIPPrefix": "", + > "securityGroupID": "02053952-b7da-4e4a-a659-31c0ead42ef9" + > }, + > { + > "description": "In-cluster Ingress", + > "direction": "ingress", + > "etherType": "IPv4", + > "name": "e3b737e6-4928-4b05-900c-320ce89f664a", + > "portRangeMax": 0, + > "portRangeMin": 0, + > "protocol": "", + > "remoteGroupID": "416d64bf-8b76-4602-a497-6773b48dc3df", + > "remoteIPPrefix": "", + > "securityGroupID": "02053952-b7da-4e4a-a659-31c0ead42ef9" + > } + > ] + > }, + > "externalNetwork": { + > "id": "1875828a-ccc3-419b-87fd-856aaa781492", + > "name": "external" + > }, + > "failureDomains": { + > "nova": {}, + > "testbed": {} + > }, + > "network": { + > "id": "401237d6-1385-4b2b-9167-c0abd0d0fe48", + > "name": 
"k8s-clusterapi-cluster-default-somerville-jade-20240221-work", + > "subnets": [ + > { + > "cidr": "192.168.3.0/24", + > "id": "94f4ece7-4439-4de9-a863-903c6428b6e5", + > "name": "k8s-clusterapi-cluster-default-somerville-jade-20240221-work" + > } + > ] + > }, + > "ready": true, + > "router": { + > "id": "a242ea47-3d3d-4215-9350-453da310614b", + > "ips": [ + > "192.41.122.98" + > ], + > "name": "k8s-clusterapi-cluster-default-somerville-jade-20240221-work" + > }, + > "workerSecurityGroup": { + > "id": "416d64bf-8b76-4602-a497-6773b48dc3df", + > "name": "k8s-cluster-default-somerville-jade-20240221-work-secgroup-worker", + > "rules": [ + > { + > "description": "Full open", + > "direction": "egress", + > "etherType": "IPv4", + > "name": "59056e61-a9d7-4797-b2ba-210740d3ab0d", + > "portRangeMax": 0, + > "portRangeMin": 0, + > "protocol": "", + > "remoteGroupID": "", + > "remoteIPPrefix": "", + > "securityGroupID": "416d64bf-8b76-4602-a497-6773b48dc3df" + > }, + > { + > "description": "Full open", + > "direction": "egress", + > "etherType": "IPv6", + > "name": "8f1feb2b-6514-4b53-8c00-dd5b1176fca5", + > "portRangeMax": 0, + > "portRangeMin": 0, + > "protocol": "", + > "remoteGroupID": "", + > "remoteIPPrefix": "", + > "securityGroupID": "416d64bf-8b76-4602-a497-6773b48dc3df" + > }, + > { + > "description": "Node Port Services", + > "direction": "ingress", + > "etherType": "IPv4", + > "name": "272f1fce-ec60-43e1-a885-3ec119006a0c", + > "portRangeMax": 32767, + > "portRangeMin": 30000, + > "protocol": "tcp", + > "remoteGroupID": "", + > "remoteIPPrefix": "", + > "securityGroupID": "416d64bf-8b76-4602-a497-6773b48dc3df" + > }, + > { + > "description": "Node Port Services", + > "direction": "ingress", + > "etherType": "IPv4", + > "name": "848147ab-df94-4b59-aecc-55bc61e4ca6a", + > "portRangeMax": 32767, + > "portRangeMin": 30000, + > "protocol": "udp", + > "remoteGroupID": "", + > "remoteIPPrefix": "", + > "securityGroupID": "416d64bf-8b76-4602-a497-6773b48dc3df" + > }, + 
> { + > "description": "In-cluster Ingress", + > "direction": "ingress", + > "etherType": "IPv4", + > "name": "93075903-fbcd-4819-b348-f0a086cd093e", + > "portRangeMax": 0, + > "portRangeMin": 0, + > "protocol": "", + > "remoteGroupID": "416d64bf-8b76-4602-a497-6773b48dc3df", + > "remoteIPPrefix": "", + > "securityGroupID": "416d64bf-8b76-4602-a497-6773b48dc3df" + > }, + > { + > "description": "In-cluster Ingress", + > "direction": "ingress", + > "etherType": "IPv4", + > "name": "145a6db0-18f6-4ed4-b274-39f215790616", + > "portRangeMax": 0, + > "portRangeMin": 0, + > "protocol": "", + > "remoteGroupID": "02053952-b7da-4e4a-a659-31c0ead42ef9", + > "remoteIPPrefix": "", + > "securityGroupID": "416d64bf-8b76-4602-a497-6773b48dc3df" + > } + > ] + > } + > } + > } + + + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + get machinehealthchecks \ + --namespace default \ + --output json + ' + + > { + > "apiVersion": "v1", + > "items": [ + > { + > "apiVersion": "cluster.x-k8s.io/v1beta1", + > "kind": "MachineHealthCheck", + > "metadata": { + > "annotations": { + > "meta.helm.sh/release-name": "somerville-jade-20240221-work", + > "meta.helm.sh/release-namespace": "default" + > }, + > "creationTimestamp": "2024-02-21T06:42:10Z", + > "generation": 1, + > "labels": { + > "app.kubernetes.io/managed-by": "Helm", + > "capi.stackhpc.com/cluster": "somerville-jade-20240221-work", + > "capi.stackhpc.com/component": "control-plane", + > "capi.stackhpc.com/infrastructure-provider": "openstack", + > "capi.stackhpc.com/managed-by": "Helm", + > "cluster.x-k8s.io/cluster-name": "somerville-jade-20240221-work", + > "helm.sh/chart": "openstack-cluster-0.1.0" + > }, + > "name": "somerville-jade-20240221-work-control-plane", + > "namespace": "default", + > "ownerReferences": [ + > { + > "apiVersion": "cluster.x-k8s.io/v1beta1", + > "kind": "Cluster", + > "name": "somerville-jade-20240221-work", + > "uid": "5d43f46b-4b5f-4a88-a27c-f2b446434a89" + 
> } + > ], + > "resourceVersion": "2273", + > "uid": "c8992d33-517e-4d9b-b54d-6d2546640d3e" + > }, + > "spec": { + > "clusterName": "somerville-jade-20240221-work", + > "maxUnhealthy": "100%", + > "nodeStartupTimeout": "10m0s", + > "selector": { + > "matchLabels": { + > "capi.stackhpc.com/cluster": "somerville-jade-20240221-work", + > "capi.stackhpc.com/component": "control-plane" + > } + > }, + > "unhealthyConditions": [ + > { + > "status": "Unknown", + > "timeout": "5m0s", + > "type": "Ready" + > }, + > { + > "status": "False", + > "timeout": "5m0s", + > "type": "Ready" + > } + > ] + > }, + > "status": { + > "conditions": [ + > { + > "lastTransitionTime": "2024-02-21T06:42:10Z", + > "status": "True", + > "type": "RemediationAllowed" + > } + > ], + > "expectedMachines": 1, + > "observedGeneration": 1, + > "targets": [ + > "somerville-jade-20240221-work-control-plane-bdfrr" + > ] + > } + > }, + > { + > "apiVersion": "cluster.x-k8s.io/v1beta1", + > "kind": "MachineHealthCheck", + > "metadata": { + > "annotations": { + > "meta.helm.sh/release-name": "somerville-jade-20240221-work", + > "meta.helm.sh/release-namespace": "default" + > }, + > "creationTimestamp": "2024-02-21T06:42:10Z", + > "generation": 1, + > "labels": { + > "app.kubernetes.io/managed-by": "Helm", + > "capi.stackhpc.com/cluster": "somerville-jade-20240221-work", + > "capi.stackhpc.com/component": "worker", + > "capi.stackhpc.com/infrastructure-provider": "openstack", + > "capi.stackhpc.com/managed-by": "Helm", + > "capi.stackhpc.com/node-group": "md-0", + > "cluster.x-k8s.io/cluster-name": "somerville-jade-20240221-work", + > "helm.sh/chart": "openstack-cluster-0.1.0" + > }, + > "name": "somerville-jade-20240221-work-md-0", + > "namespace": "default", + > "ownerReferences": [ + > { + > "apiVersion": "cluster.x-k8s.io/v1beta1", + > "kind": "Cluster", + > "name": "somerville-jade-20240221-work", + > "uid": "5d43f46b-4b5f-4a88-a27c-f2b446434a89" + > } + > ], + > "resourceVersion": "11655", + > "uid": 
"de48c1ad-7401-4027-bc7a-6f5b385c8b63" + > }, + > "spec": { + > "clusterName": "somerville-jade-20240221-work", + > "maxUnhealthy": "100%", + > "nodeStartupTimeout": "10m0s", + > "selector": { + > "matchLabels": { + > "capi.stackhpc.com/cluster": "somerville-jade-20240221-work", + > "capi.stackhpc.com/component": "worker", + > "capi.stackhpc.com/node-group": "md-0" + > } + > }, + > "unhealthyConditions": [ + > { + > "status": "Unknown", + > "timeout": "5m0s", + > "type": "Ready" + > }, + > { + > "status": "False", + > "timeout": "5m0s", + > "type": "Ready" + > } + > ] + > }, + > "status": { + > "conditions": [ + > { + > "lastTransitionTime": "2024-02-21T06:42:10Z", + > "status": "True", + > "type": "RemediationAllowed" + > } + > ], + > "expectedMachines": 6, + > "observedGeneration": 1, + > "targets": [ + > "somerville-jade-20240221-work-md-0-bc4ps-2vwsp", + > "somerville-jade-20240221-work-md-0-bc4ps-64l4h", + > "somerville-jade-20240221-work-md-0-bc4ps-bn992", + > "somerville-jade-20240221-work-md-0-bc4ps-dqbpb", + > "somerville-jade-20240221-work-md-0-bc4ps-jxnvk", + > "somerville-jade-20240221-work-md-0-bc4ps-zvgb7" + > ] + > } + > } + > ], + > "kind": "List", + > "metadata": { + > "resourceVersion": "" + > } + > } + + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + get machinedeployments \ + --namespace default \ + --output json + ' + + > { + > "apiVersion": "v1", + > "items": [ + > { + > "apiVersion": "cluster.x-k8s.io/v1beta1", + > "kind": "MachineDeployment", + > "metadata": { + > "annotations": { + > "machinedeployment.clusters.x-k8s.io/revision": "1", + > "meta.helm.sh/release-name": "somerville-jade-20240221-work", + > "meta.helm.sh/release-namespace": "default" + > }, + > "creationTimestamp": "2024-02-21T06:42:09Z", + > "generation": 2, + > "labels": { + > "app.kubernetes.io/managed-by": "Helm", + > "capi.stackhpc.com/cluster": "somerville-jade-20240221-work", + > "capi.stackhpc.com/component": 
"worker", + > "capi.stackhpc.com/infrastructure-provider": "openstack", + > "capi.stackhpc.com/managed-by": "Helm", + > "capi.stackhpc.com/node-group": "md-0", + > "cluster.x-k8s.io/cluster-name": "somerville-jade-20240221-work", + > "helm.sh/chart": "openstack-cluster-0.1.0" + > }, + > "name": "somerville-jade-20240221-work-md-0", + > "namespace": "default", + > "ownerReferences": [ + > { + > "apiVersion": "cluster.x-k8s.io/v1beta1", + > "kind": "Cluster", + > "name": "somerville-jade-20240221-work", + > "uid": "5d43f46b-4b5f-4a88-a27c-f2b446434a89" + > } + > ], + > "resourceVersion": "11702", + > "uid": "e9d2ee01-aded-4f47-af74-f4d93aca4cdc" + > }, + > "spec": { + > "clusterName": "somerville-jade-20240221-work", + > "minReadySeconds": 0, + > "progressDeadlineSeconds": 600, + > "replicas": 6, + > "revisionHistoryLimit": 1, + > "selector": { + > "matchLabels": { + > "capi.stackhpc.com/cluster": "somerville-jade-20240221-work", + > "capi.stackhpc.com/component": "worker", + > "capi.stackhpc.com/node-group": "md-0", + > "cluster.x-k8s.io/cluster-name": "somerville-jade-20240221-work" + > } + > }, + > "strategy": { + > "rollingUpdate": { + > "deletePolicy": "Random", + > "maxSurge": 0, + > "maxUnavailable": 1 + > }, + > "type": "RollingUpdate" + > }, + > "template": { + > "metadata": { + > "labels": { + > "capi.stackhpc.com/cluster": "somerville-jade-20240221-work", + > "capi.stackhpc.com/component": "worker", + > "capi.stackhpc.com/node-group": "md-0", + > "cluster.x-k8s.io/cluster-name": "somerville-jade-20240221-work" + > } + > }, + > "spec": { + > "bootstrap": { + > "configRef": { + > "apiVersion": "bootstrap.cluster.x-k8s.io/v1beta1", + > "kind": "KubeadmConfigTemplate", + > "name": "somerville-jade-20240221-work-md-0-99910806" + > } + > }, + > "clusterName": "somerville-jade-20240221-work", + > "infrastructureRef": { + > "apiVersion": "infrastructure.cluster.x-k8s.io/v1alpha7", + > "kind": "OpenStackMachineTemplate", + > "name": 
"somerville-jade-20240221-work-md-0-f538b732" + > }, + > "nodeDrainTimeout": "5m0s", + > "version": "v1.26.7" + > } + > } + > }, + > "status": { + > "conditions": [ + > { + > "lastTransitionTime": "2024-02-21T06:42:12Z", + > "message": "Minimum availability requires 5 replicas, current 0 available", + > "reason": "WaitingForAvailableMachines", + > "severity": "Warning", + > "status": "False", + > "type": "Ready" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:42:12Z", + > "message": "Minimum availability requires 5 replicas, current 0 available", + > "reason": "WaitingForAvailableMachines", + > "severity": "Warning", + > "status": "False", + > "type": "Available" + > } + > ], + > "observedGeneration": 2, + > "phase": "ScalingUp", + > "replicas": 6, + > "selector": "capi.stackhpc.com/cluster=somerville-jade-20240221-work,capi.stackhpc.com/component=worker,capi.stackhpc.com/node-group=md-0,cluster.x-k8s.io/cluster-name=somerville-jade-20240221-work", + > "unavailableReplicas": 6, + > "updatedReplicas": 6 + > } + > } + > ], + > "kind": "List", + > "metadata": { + > "resourceVersion": "" + > } + > } + + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + get machinedeployment \ + --namespace default \ + --output json \ + "somerville-jade-20240221-work-md-0" \ + | jq ".status" + ' + + > { + > "conditions": [ + > { + > "lastTransitionTime": "2024-02-21T06:42:12Z", + > "message": "Minimum availability requires 5 replicas, current 0 available", + > "reason": "WaitingForAvailableMachines", + > "severity": "Warning", + > "status": "False", + > "type": "Ready" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:42:12Z", + > "message": "Minimum availability requires 5 replicas, current 0 available", + > "reason": "WaitingForAvailableMachines", + > "severity": "Warning", + > "status": "False", + > "type": "Available" + > } + > ], + > "observedGeneration": 2, + > "phase": "ScalingUp", + > "replicas": 6, + > "selector": 
"capi.stackhpc.com/cluster=somerville-jade-20240221-work,capi.stackhpc.com/component=worker,capi.stackhpc.com/node-group=md-0,cluster.x-k8s.io/cluster-name=somerville-jade-20240221-work", + > "unavailableReplicas": 6, + > "updatedReplicas": 6 + > } + + + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + get KubeadmControlPlanes \ + --namespace default \ + --output json + ' + + > { + > "apiVersion": "v1", + > "items": [ + > { + > "apiVersion": "controlplane.cluster.x-k8s.io/v1beta1", + > "kind": "KubeadmControlPlane", + > "metadata": { + > "annotations": { + > "helm.sh/resource-policy": "keep", + > "meta.helm.sh/release-name": "somerville-jade-20240221-work", + > "meta.helm.sh/release-namespace": "default" + > }, + > "creationTimestamp": "2024-02-21T06:42:09Z", + > "finalizers": [ + > "kubeadm.controlplane.cluster.x-k8s.io" + > ], + > "generation": 2, + > "labels": { + > "app.kubernetes.io/managed-by": "Helm", + > "capi.stackhpc.com/cluster": "somerville-jade-20240221-work", + > "capi.stackhpc.com/component": "control-plane", + > "capi.stackhpc.com/infrastructure-provider": "openstack", + > "capi.stackhpc.com/managed-by": "Helm", + > "cluster.x-k8s.io/cluster-name": "somerville-jade-20240221-work", + > "helm.sh/chart": "openstack-cluster-0.1.0" + > }, + > "name": "somerville-jade-20240221-work-control-plane", + > "namespace": "default", + > "ownerReferences": [ + > { + > "apiVersion": "cluster.x-k8s.io/v1beta1", + > "blockOwnerDeletion": true, + > "controller": true, + > "kind": "Cluster", + > "name": "somerville-jade-20240221-work", + > "uid": "5d43f46b-4b5f-4a88-a27c-f2b446434a89" + > } + > ], + > "resourceVersion": "4570", + > "uid": "2cf20dae-b637-41d6-9097-5d004aff1206" + > }, + > "spec": { + > "kubeadmConfigSpec": { + > "clusterConfiguration": { + > "apiServer": { + > "extraArgs": { + > "cloud-provider": "external" + > } + > }, + > "controllerManager": { + > "extraArgs": { + > "bind-address": "0.0.0.0", + > 
"cloud-provider": "external" + > } + > }, + > "dns": {}, + > "etcd": { + > "local": { + > "extraArgs": { + > "listen-metrics-urls": "http://0.0.0.0:2381" + > } + > } + > }, + > "networking": {}, + > "scheduler": { + > "extraArgs": { + > "bind-address": "0.0.0.0" + > } + > } + > }, + > "files": [ + > { + > "content": "# This file is created by the capi-helm-chart to\n# ensure that its parent directory exists. *.toml\n# files in this directory are included in containerd\n# config when /etc/containerd/config.toml is parsed.\n", + > "owner": "root:root", + > "path": "/etc/containerd/conf.d/.keepdir", + > "permissions": "0644" + > }, + > { + > "content": "---\napiVersion: kubeproxy.config.k8s.io/v1alpha1\nkind: KubeProxyConfiguration\nmetricsBindAddress: 0.0.0.0:10249\n", + > "owner": "root:root", + > "path": "/run/kubeadm/kube-proxy-configuration.yaml", + > "permissions": "0644" + > } + > ], + > "format": "cloud-config", + > "initConfiguration": { + > "localAPIEndpoint": {}, + > "nodeRegistration": { + > "imagePullPolicy": "IfNotPresent", + > "kubeletExtraArgs": { + > "cloud-provider": "external" + > }, + > "name": "{{ local_hostname }}" + > } + > }, + > "joinConfiguration": { + > "discovery": {}, + > "nodeRegistration": { + > "imagePullPolicy": "IfNotPresent", + > "kubeletExtraArgs": { + > "cloud-provider": "external" + > }, + > "name": "{{ local_hostname }}" + > } + > }, + > "preKubeadmCommands": [ + > "cat /run/kubeadm/kube-proxy-configuration.yaml \u003e\u003e /run/kubeadm/kubeadm.yaml" + > ] + > }, + > "machineTemplate": { + > "infrastructureRef": { + > "apiVersion": "infrastructure.cluster.x-k8s.io/v1alpha7", + > "kind": "OpenStackMachineTemplate", + > "name": "somerville-jade-20240221-work-control-plane-c6b6f2d1", + > "namespace": "default" + > }, + > "metadata": { + > "labels": { + > "capi.stackhpc.com/cluster": "somerville-jade-20240221-work", + > "capi.stackhpc.com/component": "control-plane" + > } + > }, + > "nodeDrainTimeout": "5m0s" + > }, + > "replicas": 
3, + > "rolloutStrategy": { + > "rollingUpdate": { + > "maxSurge": 1 + > }, + > "type": "RollingUpdate" + > }, + > "version": "v1.26.7" + > }, + > "status": { + > "conditions": [ + > { + > "lastTransitionTime": "2024-02-21T06:44:12Z", + > "message": "Scaling up control plane to 3 replicas (actual 1)", + > "reason": "ScalingUp", + > "severity": "Warning", + > "status": "False", + > "type": "Ready" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:45:58Z", + > "status": "True", + > "type": "Available" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:43:51Z", + > "status": "True", + > "type": "CertificatesAvailable" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:56:02Z", + > "message": "Node failed to report startup in 10m0s", + > "reason": "NodeStartupTimeout @ /somerville-jade-20240221-work-control-plane-bdfrr", + > "severity": "Warning", + > "status": "False", + > "type": "MachinesReady" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:44:12Z", + > "message": "Scaling up control plane to 3 replicas (actual 1)", + > "reason": "ScalingUp", + > "severity": "Warning", + > "status": "False", + > "type": "Resized" + > } + > ], + > "initialized": true, + > "observedGeneration": 2, + > "ready": true, + > "readyReplicas": 1, + > "replicas": 1, + > "selector": "cluster.x-k8s.io/cluster-name=somerville-jade-20240221-work,cluster.x-k8s.io/control-plane", + > "unavailableReplicas": 0, + > "updatedReplicas": 1 + > } + > } + > ], + > "kind": "List", + > "metadata": { + > "resourceVersion": "" + > } + > } + + + ssh bootstrap -t \ + ' + source loadconfig + kubectl \ + --kubeconfig "${kindclusterconf:?}" \ + get KubeadmControlPlane \ + --namespace default \ + --output json \ + "somerville-jade-20240221-work-control-plane" \ + | jq ".status" + ' + + > { + > "conditions": [ + > { + > "lastTransitionTime": "2024-02-21T06:44:12Z", + > "message": "Scaling up control plane to 3 replicas (actual 1)", + > "reason": "ScalingUp", + > "severity": "Warning", + > "status": 
"False", + > "type": "Ready" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:45:58Z", + > "status": "True", + > "type": "Available" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:43:51Z", + > "status": "True", + > "type": "CertificatesAvailable" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:56:02Z", + > "message": "Node failed to report startup in 10m0s", + > "reason": "NodeStartupTimeout @ /somerville-jade-20240221-work-control-plane-bdfrr", + > "severity": "Warning", + > "status": "False", + > "type": "MachinesReady" + > }, + > { + > "lastTransitionTime": "2024-02-21T06:44:12Z", + > "message": "Scaling up control plane to 3 replicas (actual 1)", + > "reason": "ScalingUp", + > "severity": "Warning", + > "status": "False", + > "type": "Resized" + > } + > ], + > "initialized": true, + > "observedGeneration": 2, + > "ready": true, + > "readyReplicas": 1, + > "replicas": 1, + > "selector": "cluster.x-k8s.io/cluster-name=somerville-jade-20240221-work,cluster.x-k8s.io/control-plane", + > "unavailableReplicas": 0, + > "updatedReplicas": 1 + > } + + +# ----------------------------------------------------- +# List stuff in the work cluster .... +#[root@ansibler] + + + +# ----------------------------------------------------- +# Try connect to the metrics endpoint. +# https://github.com/stackhpc/capi-helm-charts/tree/main/charts/cluster-addons#monitoring-and-logging +#[root@ansibler] + + source /deployments/cluster-api/ansible/files/aglais/bin/loadconfig + + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + --namespace monitoring-system \ + port-forward \ + svc/kube-prometheus-stack-grafana \ + 3000:80 + + # + # Need to use ssh port forwarding as well. + # podman -> client + # ssh socks desktop -> bootstrap + # kubectl bootstrap -> work + # .... + # + # Added port 3001 to the podman command + # + +# ----------------------------------------------------- +# Watch the cluster status ... 
+#[root@bootstrap] + + clusterctl \ + --kubeconfig "${kindclusterconf:?}" \ + describe cluster \ + "${workclustername:?}" + + > NAME READY SEVERITY REASON SINCE MESSAGE + > Cluster/somerville-jade-20240221-work False Warning ScalingUp 3h33m Scaling up control plane to 3 replicas (actual 1) + > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240221-work + > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240221-work-control-plane False Warning ScalingUp 3h33m Scaling up control plane to 3 replicas (actual 1) + > │ └─Machine/somerville-jade-20240221-work-control-plane-bdfrr False Warning NodeStartupTimeout 3h22m Node failed to report startup in 10m0s + > └─Workers + > └─MachineDeployment/somerville-jade-20240221-work-md-0 False Warning WaitingForAvailableMachines 3h35m Minimum availability requires 5 replicas, current 0 available + > └─6 Machines... True 6m21s See somerville-jade-20240221-work-md-0-bc4ps-5bvnv, somerville-jade-20240221-work-md-0-bc4ps-dd6rs, ... + + # + # 5hrs and counting ... + # 6hrs and counting ... + # 30hrs .... + # + + clusterctl \ + --kubeconfig "${kindclusterconf:?}" \ + describe cluster \ + "${workclustername:?}" + + > NAME READY SEVERITY REASON SINCE MESSAGE + > Cluster/somerville-jade-20240221-work False Warning ScalingUp 30h Scaling up control plane to 3 replicas (actual 1) + > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240221-work + > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240221-work-control-plane False Warning ScalingUp 30h Scaling up control plane to 3 replicas (actual 1) + > │ └─Machine/somerville-jade-20240221-work-control-plane-bdfrr False Warning NodeStartupTimeout 30h Node failed to report startup in 10m0s + > └─Workers + > └─MachineDeployment/somerville-jade-20240221-work-md-0 False Warning WaitingForAvailableMachines 30h Minimum availability requires 5 replicas, current 0 available + > └─6 Machines... 
True 5m50s See somerville-jade-20240221-work-md-0-bc4ps-6x8lk, somerville-jade-20240221-work-md-0-bc4ps-nlwb7, ... + + + diff --git a/notes/zrq/20240222-01-jade-deploy.txt b/notes/zrq/20240222-01-jade-deploy.txt new file mode 100644 index 00000000..15b55331 --- /dev/null +++ b/notes/zrq/20240222-01-jade-deploy.txt @@ -0,0 +1,282 @@ +# +# +# +# Copyright (c) 2024, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# +# AIMetrics: [] +# + + Target: + + Try a new deployment, adding proxies to access the metrics. + + Result: + + Nope, still broken. + + +# ----------------------------------------------------- +# Run our local client. +#[user@desktop] + + source "${HOME:?}/aglais.env" + export PATH=${PATH}:${AGLAIS_CODE}/bin + + kube-client jade + + > .... + > .... + + +# ----------------------------------------------------- +# Delete and create everything. +#[root@ansibler] + + export cloudsite=somerville-jade + + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + ansible-playbook \ + --inventory 'bootstrap,' \ + '/deployments/cluster-api/ansible/00-create-all.yml' + + > .... + > .... + + +# ----------------------------------------------------- +# Check the deployment configuration. 
+#[root@ansibler] + + cat /opt/aglais/aglais-status.yml + + > aglais: + > ansibler: + > external: + > ipv4: 90.155.51.57 + > deployment: + > date: 20240222 + > debug: + > started: '2024-02-22 17:44:10.779956' + > name: somerville-jade-20240222 + > type: cluster-api + > kubernetes: + > cluster: + > kind: + > conf: /opt/aglais/somerville-jade-20240222-kind.yml + > debug: + > created: '2024-02-22 17:50:39.471363' + > name: somerville-jade-20240222-kind + > work: + > conf: /opt/aglais/somerville-jade-20240222-work.yml + > debug: + > created: '2024-02-22 17:58:09.520860' + > name: somerville-jade-20240222-work + > version: 1.26.7 + > openstack: + > cloud: + > name: somerville-jade + > site: somerville-jade + > keypair: + > fingerprint: 2e:84:98:98:df:70:06:0e:4c:ed:bd:d4:d6:6b:eb:16 + > id: somerville-jade-20240222-keypair + > name: somerville-jade-20240222-keypair + > networks: + > bootstrap: + > network: + > id: 63675b2c-6491-4f43-a678-d45245b531d7 + > name: somerville-jade-20240222-bootstrap-network + > router: + > id: 1abb80d5-09a7-4cc6-9693-51554140a915 + > name: somerville-jade-20240222-bootstrap-network-router + > subnet: + > cidr: 10.10.0.0/16 + > id: 1f97702c-ace1-4cde-b97d-27f531931419 + > name: somerville-jade-20240222-bootstrap-network-subnet + > external: + > network: + > id: 1875828a-ccc3-419b-87fd-856aaa781492 + > name: external + > project: + > id: be227fe0300b4ce5b03f44264df615df, + > name: Somerville-Gaia-Jade + > servers: + > bootstrap: + > float: + > external: 192.41.122.188 + > id: a33d9502-6af3-45ed-8278-448f16fe8b67 + > internal: 10.10.0.235 + > server: + > address: + > ipv4: 10.10.0.235 + > flavor: + > name: gaia.vm.2vcpu + > hostname: bootstrap + > id: 799411c4-000b-46c7-b32f-3971715958ef + > image: + > id: ce533fcf-75a6-4267-a622-d0227e6940b0 + > name: gaia-dmp-fedora-cloud-38-1.6 + > name: somerville-jade-20240222-bootstrap-node + > user: + > id: c4aad146ab7acaf44819e90e3e67a4d0490c164fbb02d388823c1ac9f0ae2e13, + > name: Dave Morris + + +# 
----------------------------------------------------- +# Check the cluster status. +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + watch \ + clusterctl \ + --kubeconfig "${kindclusterconf:?}" \ + describe cluster \ + --grouping=false \ + --show-conditions all \ + "${workclustername:?}" + ' + + > NAME READY SEVERITY REASON SINCE MESSAGE + > Cluster/somerville-jade-20240222-work False Warning ScalingUp 4m28s Scaling up control plane to 3 replicas (actual 1) + > │ ├─ControlPlaneInitialized True 3m51s + > │ ├─ControlPlaneReady False Warning ScalingUp 4m28s Scaling up control plane to 3 replicas (actual 1) + > │ └─InfrastructureReady True 4m53s + > ├─ClusterInfrastructure - OpenStackCluster/somerville-jade-20240222-work + > ├─ControlPlane - KubeadmControlPlane/somerville-jade-20240222-work-control-plane False Warning ScalingUp 4m28s Scaling up control plane to 3 replicas (actual 1) + > │ │ ├─Available True 3m51s + > │ │ ├─CertificatesAvailable True 4m50s + > │ │ ├─MachinesReady True 4m18s + > │ │ └─Resized False Warning ScalingUp 4m28s Scaling up control plane to 3 replicas (actual 1) + > │ └─Machine/somerville-jade-20240222-work-control-plane-7vdtz True 4m31s + > │ ├─BootstrapReady True 4m49s + > │ ├─InfrastructureReady True 4m31s + > │ └─NodeHealthy False Warning NodeProvisioning 3m52s + > └─Workers + > └─MachineDeployment/somerville-jade-20240222-work-md-0 False Warning WaitingForAvailableMachines 6m33s Minimum availability requires 5 replicas, current 0 available + > │ └─Available False Warning WaitingForAvailableMachines 6m33s Minimum availability requires 5 replicas, current 0 available + > ├─Machine/somerville-jade-20240222-work-md-0-fb2mb-52csp True 3m8s + > │ ├─BootstrapReady True 3m21s + > │ ├─InfrastructureReady True 3m8s + > │ └─NodeHealthy False Warning NodeProvisioning 3m8s + > ├─Machine/somerville-jade-20240222-work-md-0-fb2mb-9fxhc True 3m8s + > │ ├─BootstrapReady True 3m21s + > │ ├─InfrastructureReady True 3m8s + > │ └─NodeHealthy 
False Warning NodeProvisioning 3m7s + > ├─Machine/somerville-jade-20240222-work-md-0-fb2mb-gbs2n True 3m8s + > │ ├─BootstrapReady True 3m21s + > │ ├─InfrastructureReady True 3m8s + > │ └─NodeHealthy False Warning NodeProvisioning 3m8s + > ├─Machine/somerville-jade-20240222-work-md-0-fb2mb-gxkq7 True 3m7s + > │ ├─BootstrapReady True 3m21s + > │ ├─InfrastructureReady True 3m7s + > │ └─NodeHealthy False Warning NodeProvisioning 3m6s + > ├─Machine/somerville-jade-20240222-work-md-0-fb2mb-j7q9g True 3m6s + > │ ├─BootstrapReady True 3m21s + > │ ├─InfrastructureReady True 3m6s + > │ └─NodeHealthy False Warning NodeProvisioning 3m6s + > └─Machine/somerville-jade-20240222-work-md-0-fb2mb-qptvh True 3m6s + > ├─BootstrapReady True 3m21s + > ├─InfrastructureReady True 3m6s + > └─NodeHealthy False Warning NodeProvisioning 3m6s + + +# ----------------------------------------------------- +# Setup the proxies to access the monitoring endpoint. +# https://github.com/stackhpc/capi-helm-charts/tree/main/charts/cluster-addons#monitoring-and-logging +#[root@ansibler] + + # + # Based on the monitoring-and-logging section in their documentation, + # we should be looking for a service called 'kube-prometheus-stack-grafana'. 
+ # + + ssh bootstrap \ + ' + source loadconfig + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + get services \ + --all-namespaces + ' + + > E0222 18:03:18.397556 13404 memcache.go:287] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > E0222 18:03:18.439676 13404 memcache.go:121] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > E0222 18:03:18.444345 13404 memcache.go:121] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > E0222 18:03:18.448025 13404 memcache.go:121] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > NAMESPACE NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE + > calico-system calico-typha ClusterIP 172.31.125.139 5473/TCP 3m15s + > default kubernetes ClusterIP 172.24.0.1 443/TCP 5m15s + > kube-system kube-dns ClusterIP 172.24.0.10 53/UDP,53/TCP,9153/TCP 4m54s + > kube-system metrics-server ClusterIP 172.24.139.107 443/TCP 4m50s + > kubernetes-dashboard kubernetes-dashboard ClusterIP 172.30.195.58 443/TCP 4m47s + > monitoring-system loki-stack ClusterIP 172.24.244.80 3100/TCP 4m35s + > monitoring-system loki-stack-headless ClusterIP None 3100/TCP 4m35s + > monitoring-system loki-stack-memberlist ClusterIP None 7946/TCP 4m35s + > node-feature-discovery node-feature-discovery-master ClusterIP 172.24.77.107 8080/TCP 4m47s + + # + # .... but there isn't one. 
 + # + + ssh bootstrap \ + ' + source loadconfig + kubectl \ + --kubeconfig "${workclusterconf:?}" \ + get deployments \ + --all-namespaces + ' + + > E0222 18:10:31.007049 13650 memcache.go:287] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > E0222 18:10:31.009879 13650 memcache.go:121] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > E0222 18:10:31.013548 13650 memcache.go:121] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > E0222 18:10:31.016791 13650 memcache.go:121] couldn't get resource list for metrics.k8s.io/v1beta1: the server is currently unable to handle the request + > NAMESPACE NAME READY UP-TO-DATE AVAILABLE AGE + > calico-system calico-kube-controllers 0/1 1 0 10m + > calico-system calico-typha 0/3 3 0 10m + > gpu-operator gpu-operator 0/1 1 0 12m + > kube-system coredns 0/2 2 0 12m + > kube-system metrics-server 0/1 1 0 12m + > kubernetes-dashboard kubernetes-dashboard 0/1 1 0 12m + > network-operator mellanox-network-operator 0/1 1 0 12m + > node-feature-discovery node-feature-discovery-master 0/1 1 0 12m + > tigera-operator tigera-operator 0/1 1 0 10m + + # + # I'd assume there is a limit on which components can run in the cluster + # when the cluster itself is broken ? + # I understand the kube-system and calico-system deployments should be there + # because they are part of the cluster infrastructure. + # But I'm wondering how much of the Prometheus and Grafana deployments + # we can run on a broken cluster ? + # + # One way to find out is to deploy this on the Cambridge Arcus system, + # get the proxies connected up and learn how it works on a working + # cluster, and then come back to deploy it on the Somerville system. 
+ # + diff --git a/notes/zrq/20240222-02-jade-deploy.txt b/notes/zrq/20240222-02-jade-deploy.txt new file mode 100644 index 00000000..1fce572b --- /dev/null +++ b/notes/zrq/20240222-02-jade-deploy.txt @@ -0,0 +1,104 @@ +# +# +# +# Copyright (c) 2024, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# +# AIMetrics: [] +# + + Target: + + We added the firewall rules because Cambridge don't want us to leave the kubectl endpoint public, + but no such rules apply to Somerville (despite what they do to the Openstack API). + So might as well try the deploy without the firewall rules patch on the kubectl endpoint. + + Result: + + Work in progress ... + + +# ----------------------------------------------------- +# Remove the firewall rules patch. +#[user@desktop] + + source "${HOME:?}/aglais.env" + pushd "${AGLAIS_CODE}" + + gedit deployments/cluster-api/ansible/00-create-all.yml & + + - import_playbook: 25-create-work-cluster.yml + ~ # import_playbook: 26-secure-work-cluster.yml + + popd + +# ----------------------------------------------------- +# Run our local client. +#[user@desktop] + + source "${HOME:?}/aglais.env" + export PATH=${PATH}:${AGLAIS_CODE}/bin + + kube-client jade + + > .... + > ....
+ + +# ----------------------------------------------------- +# Delete and create everything. +#[root@ansibler] + + export cloudsite=somerville-jade + + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + ansible-playbook \ + --inventory 'bootstrap,' \ + '/deployments/cluster-api/ansible/00-create-all.yml' + + > .... + > .... + + +# ----------------------------------------------------- +# Watch the cluster status. +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + watch \ + clusterctl \ + --kubeconfig "${kindclusterconf:?}" \ + describe cluster \ + --grouping=false \ + --show-conditions all \ + "${workclustername:?}" + ' + + > .... + > .... + diff --git a/notes/zrq/20240222-03-arcus-deploy.txt b/notes/zrq/20240222-03-arcus-deploy.txt new file mode 100644 index 00000000..97ad1b3d --- /dev/null +++ b/notes/zrq/20240222-03-arcus-deploy.txt @@ -0,0 +1,95 @@ +# +# +# +# Copyright (c) 2024, ROE (http://www.roe.ac.uk/) +# +# This information is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This information is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# +# +#zrq-notes-time +#zrq-notes-indent +#zrq-notes-crypto +#zrq-notes-ansible +#zrq-notes-osformat +#zrq-notes-zeppelin +# +# AIMetrics: [] +# + + Target: + + Comparison test. + Run the ClusterAPI deploy on Arcus blue. + First to check that it still works on the Cambridge platform. + Also to learn how to use the monitoring components on a working cluster + before we try to use them on a broken cluster.
+ + Result: + + Work in progress ... + + +# ----------------------------------------------------- +# Run our local client. +#[user@desktop] + + source "${HOME:?}/aglais.env" + export PATH=${PATH}:${AGLAIS_CODE}/bin + + kube-client blue + + > .... + > .... + + +# ----------------------------------------------------- +# Delete and create everything. +#[root@ansibler] + + /deployments/openstack/bin/delete-all.sh \ + "${cloudname:?}" + + ansible-playbook \ + --inventory 'bootstrap,' \ + '/deployments/cluster-api/ansible/00-create-all.yml' + + > .... + > .... + + +# ----------------------------------------------------- +# Watch the cluster status. +#[root@ansibler] + + ssh bootstrap -t \ + ' + source loadconfig + watch \ + clusterctl \ + --kubeconfig "${kindclusterconf:?}" \ + describe cluster \ + "${workclustername:?}" + ' + + > .... + > .... + + + + # + # create delete create delete ....... + # create on red, blue, jade + # It works, it doesn't it doesn't it does ... then it doesn't +