Skip to content

Commit

Permalink
Fix control plane node remediation
Browse files Browse the repository at this point in the history
When the remediator puts a finalizer on the remediation CR, NHC missed
the actual deletion of the CR and so never remediated another control
plane node. Also, a restarted NHC pods didn't watch CRs, and NHC didn't
watch CRs for classic remediation.

Signed-off-by: Marc Sluiter <[email protected]>
  • Loading branch information
slintes committed Jul 22, 2023
1 parent d55c44c commit 3fe0af7
Show file tree
Hide file tree
Showing 2 changed files with 110 additions and 10 deletions.
20 changes: 11 additions & 9 deletions controllers/nodehealthcheck_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -479,6 +479,13 @@ func (r *NodeHealthCheckReconciler) remediate(node *v1.Node, nhc *remediationv1a
// always update status, in case patching it failed during last reconcile
resources.UpdateStatusRemediationStarted(node, nhc, remediationCR)

// Adding a watch for 2 usecases:
// - watch for conditions (e.g. "Succeeded" for escalating remediations)
// - watch for removed finalizers / CR deletion, for potentially start the next control plane node remediation
if err = r.addWatch(remediationCR); err != nil {
return nil, errors.Wrapf(err, "failed to add watch for %s", remediationCR.GroupVersionKind().String())
}

// ensure to provide correct metrics in case the CR existed already after a pod restart
metrics.ObserveNodeHealthCheckRemediationCreated(node.GetName(), remediationCR.GetNamespace(), remediationCR.GetKind())

Expand All @@ -499,13 +506,6 @@ func (r *NodeHealthCheckReconciler) remediate(node *v1.Node, nhc *remediationv1a
return nil, nil
}

// Having a timeout also means we are using escalating remediations, for which we need to look at the "Succeeded"
// condition, which can accelerate switching to the next remediator.
// So let's start watching the CRs, so we don't need to poll.
if err = r.addWatch(remediationCR); err != nil {
return nil, errors.Wrapf(err, "failed to add watch for %s", remediationCR.GroupVersionKind().String())
}

startedRemediation := resources.FindStatusRemediation(node, nhc, func(r *remediationv1alpha1.Remediation) bool {
return r.Resource.GroupVersionKind() == remediationCR.GroupVersionKind()
})
Expand Down Expand Up @@ -653,15 +653,17 @@ func (r *NodeHealthCheckReconciler) addWatch(remediationCR *unstructured.Unstruc
&source.Kind{Type: remediationCR},
handler.EnqueueRequestsFromMapFunc(utils.NHCByRemediationCRMapperFunc(r.Log)),
predicate.Funcs{
// we are just interested in update events for now
// we are just interested in update and delete events for now:
// update: for conditions
// delete: when control plane node CRs are deleted for remediation the next control plane node
CreateFunc: func(_ event.CreateEvent) bool { return false },
DeleteFunc: func(_ event.DeleteEvent) bool { return false },
GenericFunc: func(_ event.GenericEvent) bool { return false },
},
); err != nil {
return err
}
r.watches[key] = struct{}{}
r.Log.Info("added watch for CR", "kind", remediationCR.GetKind())
return nil
}

Expand Down
100 changes: 99 additions & 1 deletion controllers/nodehealthcheck_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package controllers
import (
"context"
"fmt"
"strings"
"time"

. "github.com/onsi/ginkgo/v2"
Expand Down Expand Up @@ -611,7 +612,7 @@ var _ = Describe("Node Health Check CR", func() {
})

Context("control plane nodes", func() {
When("two control plane nodes are unhealthy, just one should be remediated", func() {
When("two control plane nodes are unhealthy, they should be remediated one after another", func() {
BeforeEach(func() {
objects = newNodes(2, 1, true, true)
objects = append(objects, newNodes(1, 5, false, true)...)
Expand Down Expand Up @@ -657,6 +658,103 @@ var _ = Describe("Node Health Check CR", func() {
)),
),
))

var unhealthyCPNodeName string
for _, unhealthyNode := range underTest.Status.UnhealthyNodes {
if strings.Contains(unhealthyNode.Name, "unhealthy-control-plane-node") {
unhealthyCPNodeName = unhealthyNode.Name
break
}
}
Expect(unhealthyCPNodeName).ToNot(BeEmpty())

By("simulating remediator by putting a finalizer on the cp remediation CR")
for _, cr := range crList.Items {
if cr.GetName() == unhealthyCPNodeName {
cr.SetFinalizers([]string{"dummy"})
Expect(k8sClient.Update(context.Background(), &cr)).To(Succeed())
break
}
}

By("make cp node healthy")
unhealthyCPNode := &v1.Node{}
unhealthyCPNode.Name = unhealthyCPNodeName
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(unhealthyCPNode), unhealthyCPNode)).To(Succeed())
unhealthyCPNode.Status.Conditions = []v1.NodeCondition{
{
Type: v1.NodeReady,
Status: v1.ConditionTrue,
},
}
Expect(k8sClient.Status().Update(context.Background(), unhealthyCPNode))

By("waiting for remediation end of cp node")
Eventually(func() []*v1alpha1.UnhealthyNode {
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed())
return underTest.Status.UnhealthyNodes

}, "2s", "100ms").Should(HaveLen(1))

By("ensuring other cp node isn't remediated yet")
Consistently(func() []*v1alpha1.UnhealthyNode {
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed())
return underTest.Status.UnhealthyNodes

}, "5s", "1s").Should(HaveLen(1))

By("simulating remediator finished by removing finalizer on the cp remediation CR")
for _, cr := range crList.Items {
if cr.GetName() == unhealthyCPNodeName {
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(&cr), &cr)).To(Succeed())
cr.SetFinalizers([]string{})
Expect(k8sClient.Update(context.Background(), &cr)).To(Succeed())
break
}
}

By("ensuring other cp node is remediated now")
Eventually(func() []*v1alpha1.UnhealthyNode {
Expect(k8sClient.Get(context.Background(), client.ObjectKeyFromObject(underTest), underTest)).To(Succeed())
return underTest.Status.UnhealthyNodes
}, "2s", "100ms").Should(ContainElements(
And(
HaveField("Name", "unhealthy-worker-node-1"),
HaveField("Remediations", ContainElement(
And(
HaveField("Resource.Name", "unhealthy-worker-node-1"),
HaveField("Started", Not(BeNil())),
HaveField("TimedOut", BeNil()),
),
)),
),
And(
HaveField("Name", ContainSubstring("unhealthy-control-plane-node")),
// ensure it's the other cp node now
Not(HaveField("Name", unhealthyCPNodeName)),
HaveField("Remediations", ContainElement(
And(
HaveField("Resource.Name", ContainSubstring("unhealthy-control-plane-node")),
HaveField("Started", Not(BeNil())),
HaveField("TimedOut", BeNil()),
),
)),
),
))
crList = &unstructured.UnstructuredList{Object: cr.Object}
Expect(k8sClient.List(context.Background(), crList)).To(Succeed())
Expect(len(crList.Items)).To(BeNumerically("==", 2), "expected 2 remediations, one for control plane, one for worker")
Expect(crList.Items).To(ContainElements(
// the unhealthy worker
HaveField("Object", HaveKeyWithValue("metadata", HaveKeyWithValue("name", "unhealthy-worker-node-1"))),
// the other unhealthy control plane nodes
HaveField("Object", HaveKeyWithValue("metadata", HaveKeyWithValue("name", ContainSubstring("unhealthy-control-plane-node")))),
))
Expect(crList.Items).ToNot(ContainElements(
// the old unhealthy control plane node
HaveField("Object", HaveKeyWithValue("metadata", HaveKeyWithValue("name", unhealthyCPNodeName))),
))

})
})
})
Expand Down

0 comments on commit 3fe0af7

Please sign in to comment.