Skip to content

Commit

Permalink
Add support for graceful removal/addition from external load balancers
Browse files Browse the repository at this point in the history
Previously, kured issued the system reboot command without first
removing the node from any connected external load balancers (ELBs).

This behavior caused downtime on restart because ELBs send traffic to
kube-proxy pods running on nodes until the ELB health checks fail or the
node is de-registered explicitly.

This patch solves the problem by adding a "node.kubernetes.io/exclude-from
-external-load-balancers" label to the node before restart which tells
the Kubernetes control plane to de-register the node from any connected
ELBs. The node label is removed after restart which causes the control
plane to re-register the node with the ELBs.

Close kubereboot#358
  • Loading branch information
amorey committed Aug 25, 2021
1 parent 3682eb3 commit d57c317
Show file tree
Hide file tree
Showing 2 changed files with 124 additions and 0 deletions.
63 changes: 63 additions & 0 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,15 @@ const (
KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed"
)

const (
// ExcludeFromELBsLabelKey is a label key that tells the K8S control plane to exclude a node from external load balancers
ExcludeFromELBsLabelKey = "node.kubernetes.io/exclude-from-external-load-balancers"
// ExcludeFromELBsLabelVal is a label value used to track label placement by kured
ExcludeFromELBsLabelVal = "kured-remove-after-reboot"
// ExcludeFromELBsLabelKeyEscaped is the escaped label key value passed to the Patch() function
ExcludeFromELBsLabelKeyEscaped = "node.kubernetes.io~1exclude-from-external-load-balancers"
)

func init() {
prometheus.MustRegister(rebootRequiredGauge)
}
Expand Down Expand Up @@ -348,6 +357,54 @@ func release(lock *daemonsetlock.DaemonSetLock) {
}
}

func enableExcludeFromELBs(client kubernetes.Interface, nodeID string) error {
log.Infof("Adding ExcludeFromELBs label to node")

// Add ExcludeFromELBs node label
labelPatch := fmt.Sprintf(`[{"op":"add","path":"/metadata/labels/%s","value":"%s" }]`, ExcludeFromELBsLabelKeyEscaped, ExcludeFromELBsLabelVal)
_, err := client.CoreV1().Nodes().Patch(context.Background(), nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{})
if err != nil {
log.Errorf("Unable to add ExcludeFromELBs label to node: %s", err.Error())
return err
}

return nil
}

func disableExcludeFromELBs(client kubernetes.Interface, nodeID string) error {
ctx := context.Background()

// Get node
node, err := client.CoreV1().Nodes().Get(ctx, nodeID, metav1.GetOptions{})
if err != nil {
log.Warnf("Unable to find node: %s", nodeID)
return err
}

// Check ExcludeFromELBs node label
labelVal, ok := node.Labels[ExcludeFromELBsLabelKey]
if !ok {
return nil
}

// Different label value found
if labelVal != ExcludeFromELBsLabelVal {
log.Debugf("Found ExcludeFromELBs label on node with value: '%s' (no action taken)", labelVal)
return nil
}

// Remove ExcludeFromELBs node label
labelPatch := fmt.Sprintf(`[{"op":"remove","path":"/metadata/labels/%s"}]`, ExcludeFromELBsLabelKeyEscaped)
_, err = client.CoreV1().Nodes().Patch(ctx, nodeID, types.JSONPatchType, []byte(labelPatch), metav1.PatchOptions{})
if err != nil {
log.Errorf("Unable to remove ExcludeFromELBs label from node: %s", err.Error())
return err
}

log.Infof("Removed ExcludeFromELBs label from node")
return nil
}

func drain(client *kubernetes.Clientset, node *v1.Node) {
nodename := node.GetName()

Expand Down Expand Up @@ -492,6 +549,9 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
log.Fatal(err)
}

// Remove ExcludeFromELBs label immediately to allow ELB registration
disableExcludeFromELBs(client, nodeID)

lock := daemonsetlock.New(client, nodeID, dsNamespace, dsName, lockAnnotation)

nodeMeta := nodeMeta{}
Expand All @@ -500,6 +560,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
if err != nil {
log.Fatalf("Error retrieving node object via k8s API: %v", err)
}

if !nodeMeta.Unschedulable {
uncordon(client, node)
}
Expand All @@ -513,6 +574,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
deleteNodeAnnotation(client, nodeID, KuredRebootInProgressAnnotation)
}
}

throttle(releaseDelay)
release(lock)
}
Expand Down Expand Up @@ -583,6 +645,7 @@ func rebootAsRequired(nodeID string, rebootCommand []string, sentinelCommand []s
continue
}

enableExcludeFromELBs(client, nodeID)
drain(client, node)

if rebootDelay > 0 {
Expand Down
61 changes: 61 additions & 0 deletions cmd/kured/main_test.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
package main

import (
"context"
"reflect"
"testing"

log "github.com/sirupsen/logrus"
"github.com/weaveworks/kured/pkg/alerts"
assert "gotest.tools/v3/assert"
k8sv1 "k8s.io/api/core/v1"
k8smetav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
k8sfake "k8s.io/client-go/kubernetes/fake"

papi "github.com/prometheus/client_golang/api"
)
Expand Down Expand Up @@ -223,3 +227,60 @@ func Test_rebootRequired_fatals(t *testing.T) {
}

}

func newK8STestClient() *k8sfake.Clientset {
// Init test client and add a node
client := k8sfake.NewSimpleClientset()

node := &k8sv1.Node{
ObjectMeta: k8smetav1.ObjectMeta{
Name: "test-node-id",
Labels: map[string]string{"key1": "val1"}, // seed map so patch `add` operation will work
},
}

_, err := client.CoreV1().Nodes().Create(context.TODO(), node, k8smetav1.CreateOptions{})
if err != nil {
log.Fatal(err.Error())
}

return client
}

func Test_enableExcludeFromELBs(t *testing.T) {
testclient := newK8STestClient()

// Test that method returns error if node doesn't exist
err := enableExcludeFromELBs(testclient, "doesnt-exist")
assert.Error(t, err, "nodes \"doesnt-exist\" not found")

// Test that method adds ExcludeFromELBs label if node exists
err = enableExcludeFromELBs(testclient, "test-node-id")
assert.Assert(t, err == nil)

nodes, err := testclient.CoreV1().Nodes().List(context.TODO(), k8smetav1.ListOptions{})
labels := nodes.Items[0].Labels
assert.Equal(t, labels[ExcludeFromELBsLabelKey], ExcludeFromELBsLabelVal)
}

func Test_disableExcludeFromELBs(t *testing.T) {
testclient := newK8STestClient()

// Test that method returns error if node doesn't exist
err := disableExcludeFromELBs(testclient, "doesnt-exist")
assert.Error(t, err, "nodes \"doesnt-exist\" not found")

// Test that method executes silently on existing node without pre-existing ExcludeFromELBs label
err = disableExcludeFromELBs(testclient, "test-node-id")
assert.Assert(t, err == nil)

// Add ExcludeFromELBs label and check that method removes it
enableExcludeFromELBs(testclient, "test-node-id")
err = disableExcludeFromELBs(testclient, "test-node-id")
assert.Assert(t, err == nil)

nodes, err := testclient.CoreV1().Nodes().List(context.TODO(), k8smetav1.ListOptions{})
labels := nodes.Items[0].Labels
_, ok := labels[ExcludeFromELBsLabelKey]
assert.Assert(t, !ok)
}

0 comments on commit d57c317

Please sign in to comment.