Skip to content

Commit

Permalink
chore: restructure k8s component health checks
Browse files Browse the repository at this point in the history
Re-structure k8s components health checks so that K8s health can be
independently checked without auxiliary components being up.

Signed-off-by: Noel Georgi <[email protected]>
  • Loading branch information
frezbo committed Aug 19, 2024
1 parent e193e7d commit c312a46
Showing 1 changed file with 60 additions and 48 deletions.
108 changes: 60 additions & 48 deletions pkg/cluster/check/default.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ package check

import (
"context"
"slices"
"time"

"github.com/siderolabs/talos/pkg/conditions"
Expand All @@ -14,21 +15,71 @@ import (

// DefaultClusterChecks returns a set of default Talos cluster readiness checks.
func DefaultClusterChecks() []ClusterCheck {
return append(PreBootSequenceChecks(), []ClusterCheck{
return slices.Concat(
PreBootSequenceChecks(),
K8sComponentsReadinessChecks(),
[]ClusterCheck{
// wait for all the nodes to report ready at k8s level
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("all k8s nodes to report ready", func(ctx context.Context) error {
return K8sAllNodesReadyAssertion(ctx, cluster)
}, 10*time.Minute, 5*time.Second)
},

// wait for kube-proxy to report ready
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("kube-proxy to report ready", func(ctx context.Context) error {
present, err := DaemonSetPresent(ctx, cluster, "kube-system", "k8s-app=kube-proxy")
if err != nil {
return err
}

if !present {
return conditions.ErrSkipAssertion
}

return K8sPodReadyAssertion(ctx, cluster, "kube-system", "k8s-app=kube-proxy")
}, 3*time.Minute, 5*time.Second)
},

// wait for coredns to report ready
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("coredns to report ready", func(ctx context.Context) error {
present, err := ReplicaSetPresent(ctx, cluster, "kube-system", "k8s-app=kube-dns")
if err != nil {
return err
}

if !present {
return conditions.ErrSkipAssertion
}

return K8sPodReadyAssertion(ctx, cluster, "kube-system", "k8s-app=kube-dns")
}, 3*time.Minute, 5*time.Second)
},

// wait for all the nodes to be schedulable
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("all k8s nodes to report schedulable", func(ctx context.Context) error {
return K8sAllNodesSchedulableAssertion(ctx, cluster)
}, 5*time.Minute, 5*time.Second)
},
},
)
}

// K8sComponentsReadinessChecks returns a set of K8s cluster readiness checks which are specific to the k8s components
// being up and running. This test can be skipped if the cluster is set to use a custom CNI, as the checks won't be healthy
// until the CNI is up and running.
func K8sComponentsReadinessChecks() []ClusterCheck {
return []ClusterCheck{
// wait for all the nodes to report in at k8s level
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("all k8s nodes to report", func(ctx context.Context) error {
return K8sAllNodesReportedAssertion(ctx, cluster)
}, 5*time.Minute, 30*time.Second) // give more time per each attempt, as this check is going to build and cache kubeconfig
},

// wait for all the nodes to report ready at k8s level
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("all k8s nodes to report ready", func(ctx context.Context) error {
return K8sAllNodesReadyAssertion(ctx, cluster)
}, 10*time.Minute, 5*time.Second)
},

// wait for k8s control plane static pods
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("all control plane static pods to be running", func(ctx context.Context) error {
Expand All @@ -42,46 +93,7 @@ func DefaultClusterChecks() []ClusterCheck {
return K8sFullControlPlaneAssertion(ctx, cluster)
}, 5*time.Minute, 5*time.Second)
},

// wait for kube-proxy to report ready
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("kube-proxy to report ready", func(ctx context.Context) error {
present, err := DaemonSetPresent(ctx, cluster, "kube-system", "k8s-app=kube-proxy")
if err != nil {
return err
}

if !present {
return conditions.ErrSkipAssertion
}

return K8sPodReadyAssertion(ctx, cluster, "kube-system", "k8s-app=kube-proxy")
}, 3*time.Minute, 5*time.Second)
},

// wait for coredns to report ready
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("coredns to report ready", func(ctx context.Context) error {
present, err := ReplicaSetPresent(ctx, cluster, "kube-system", "k8s-app=kube-dns")
if err != nil {
return err
}

if !present {
return conditions.ErrSkipAssertion
}

return K8sPodReadyAssertion(ctx, cluster, "kube-system", "k8s-app=kube-dns")
}, 3*time.Minute, 5*time.Second)
},

// wait for all the nodes to be schedulable
func(cluster ClusterInfo) conditions.Condition {
return conditions.PollingCondition("all k8s nodes to report schedulable", func(ctx context.Context) error {
return K8sAllNodesSchedulableAssertion(ctx, cluster)
}, 5*time.Minute, 5*time.Second)
},
}...)
}
}

// ExtraClusterChecks returns a set of additional Talos cluster readiness checks which work only for newer versions of Talos.
Expand Down

0 comments on commit c312a46

Please sign in to comment.