From d57c3173d27a91e294f9c960042132da64a1b4e4 Mon Sep 17 00:00:00 2001 From: Andres Morey Date: Wed, 4 Aug 2021 10:07:22 +0300 Subject: [PATCH] Add support for graceful removal/addition from external load balancers Previously, kured issued the system reboot command without first removing the node from any connected external load balancers (ELBs). This behavior caused downtime on restart because ELBs send traffic to kube-proxy pods running on nodes until the ELB health checks fail or the node is de-registered explicitly. This patch solves the problem by adding a "node.kubernetes.io/exclude-from -external-load-balancers" label to the node before restart which tells the Kubernetes control plane to de-register the node from any connected ELBs. The node label is removed after restart which causes the control plane to re-register the node with the ELBs. Close #358 --- cmd/kured/main.go | 63 ++++++++++++++++++++++++++++++++++++++++++ cmd/kured/main_test.go | 61 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 124 insertions(+) diff --git a/cmd/kured/main.go b/cmd/kured/main.go index d7759f66b..967291d8b 100644 --- a/cmd/kured/main.go +++ b/cmd/kured/main.go @@ -88,6 +88,15 @@ const ( KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed" ) +const ( + // ExcludeFromELBsLabelKey is a label key that tells the K8S control plane to exclude a node from external load balancers + ExcludeFromELBsLabelKey = "node.kubernetes.io/exclude-from-external-load-balancers" + // ExcludeFromELBsLabelVal is a label value used to track label placement by kured + ExcludeFromELBsLabelVal = "kured-remove-after-reboot" + // ExcludeFromELBsLabelKeyEscaped is the escaped label key value passed to the Patch() function + ExcludeFromELBsLabelKeyEscaped = "node.kubernetes.io~1exclude-from-external-load-balancers" +) + func init() { prometheus.MustRegister(rebootRequiredGauge) } @@ -348,6 +357,54 @@ func release(lock *daemonsetlock.DaemonSetLock) { } } +func enableExcludeFromELBs(client kubernetes.Interface, nodeID string) error { + log.Infof("Adding ExcludeFromELBs label to node") + + // Add ExcludeFromELBs node label + labelPatch := fmt.Sprintf(`[{"op":"add","path":"/metadata/labels/%s","value":"%s" }]`, ExcludeFromELBsLabelKeyEscaped, ExcludeFromELBsLabelVal) + _, err := client.CoreV1().Nodes().Patch(context.Background(), nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{}) + if err != nil { + log.Errorf("Unable to add ExcludeFromELBs label to node: %s", err.Error()) + return err + } + + return nil +} + +func disableExcludeFromELBs(client kubernetes.Interface, nodeID string) error { + ctx := context.Background() + + // Get node + node, err := client.CoreV1().Nodes().Get(ctx, nodeID, metav1.GetOptions{}) + if err != nil { + log.Warnf("Unable to find node: %s", nodeID) + return err + } + + // Check ExcludeFromELBs node label + labelVal, ok := node.Labels[ExcludeFromELBsLabelKey] + if !ok { + return nil + } + + // Different label value found + if labelVal != ExcludeFromELBsLabelVal { + log.Debugf("Found ExcludeFromELBs label on node with value: '%s' (no action taken)", labelVal) + return nil + } + + // Remove ExcludeFromELBs node label + labelPatch := fmt.Sprintf(`[{"op":"remove","path":"/metadata/labels/%s"}]`, ExcludeFromELBsLabelKeyEscaped) + _, err = client.CoreV1().Nodes().Patch(ctx, nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{}) + if err != nil { + log.Errorf("Unable to remove ExcludeFromELBs label from node: %s", err.Error()) + return err + } + + log.Infof("Removed ExcludeFromELBs label from node") + return nil +} + func drain(client *kubernetes.Clientset, node *v1.Node) { nodename := node.GetName() @@ -492,6 +549,9 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s log.Fatal(err) } + // Remove ExcludeFromELBs label immediately to allow ELB registration + disableExcludeFromELBs(client, nodeID) + lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation) nodeMeta := nodeMeta{} @@ -500,6 +560,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s if err != nil { log.Fatalf("Error retrieving node object via k8s API: %v", err) } + if !nodeMeta.Unschedulable { uncordon(client, node) } @@ -513,6 +574,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation) } } + throttle(releaseDelay) release(lock) } @@ -583,6 +645,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s continue } + enableExcludeFromELBs(client, nodeID) drain(client, node) if rebootDelay > 0 { diff --git a/cmd/kured/main_test.go b/cmd/kured/main_test.go index 78dab6546..c4de3d85d 100644 --- a/cmd/kured/main_test.go +++ b/cmd/kured/main_test.go @@ -1,12 +1,16 @@ package main import ( + "context" "reflect" "testing" log "github.com/sirupsen/logrus" "github.com/weaveworks/kured/pkg/alerts" assert "gotest.tools/v3/assert" + k8sv1 "k8s.io/api/core/v1" + k8smetav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + k8sfake "k8s.io/client-go/kubernetes/fake" papi "github.com/prometheus/client_golang/api" ) @@ -223,3 +227,60 @@ func Test_rebootRequired_fatals(t *testing.T) { } } + +func newK8STestClient() *k8sfake.Clientset { + // Init test client and add a node + client := k8sfake.NewSimpleClientset() + + node := &k8sv1.Node{ + ObjectMeta: k8smetav1.ObjectMeta{ + Name: "test-node-id", + Labels: map[string]string{"key1": "val1"}, // seed map so patch `add` operation will work + }, + } + + _, err := client.CoreV1().Nodes().Create(context.TODO(), node, k8smetav1.CreateOptions{}) + if err != nil { + log.Fatal(err.Error()) + } + + return client +} + +func Test_enableExcludeFromELBs(t *testing.T) { + testclient := newK8STestClient() + + // Test that method returns error if node doesn't exist + err := enableExcludeFromELBs(testclient, "doesnt-exist") + assert.Error(t, err, "nodes \"doesnt-exist\" not found") + + // Test that method adds ExcludeFromELBs label if node exists + err = enableExcludeFromELBs(testclient, "test-node-id") + assert.Assert(t, err == nil) + + nodes, err := testclient.CoreV1().Nodes().List(context.TODO(), k8smetav1.ListOptions{}) + labels := nodes.Items[0].Labels + assert.Equal(t, labels[ExcludeFromELBsLabelKey], ExcludeFromELBsLabelVal) +} + +func Test_disableExcludeFromELBs(t *testing.T) { + testclient := newK8STestClient() + + // Test that method returns error if node doesn't exist + err := disableExcludeFromELBs(testclient, "doesnt-exist") + assert.Error(t, err, "nodes \"doesnt-exist\" not found") + + // Test that method executes silently on existing node without pre-existing ExcludeFromELBs label + err = disableExcludeFromELBs(testclient, "test-node-id") + assert.Assert(t, err == nil) + + // Add ExcludeFromELBs label and check that method removes it + enableExcludeFromELBs(testclient, "test-node-id") + err = disableExcludeFromELBs(testclient, "test-node-id") + assert.Assert(t, err == nil) + + nodes, err := testclient.CoreV1().Nodes().List(context.TODO(), k8smetav1.ListOptions{}) + labels := nodes.Items[0].Labels + _, ok := labels[ExcludeFromELBsLabelKey] + assert.Assert(t, !ok) +}