Fix / improve reboot duration calculation and add unit tests
- deduplicate batch size calculations
- better structure and comments for reboot duration calculation
- fix usage of MaxTimeForNoPeersResponse in calculation
- unit tests for both

Signed-off-by: Marc Sluiter <[email protected]>
slintes committed Jun 18, 2024
1 parent 6b2e057 commit a487c23
Showing 7 changed files with 380 additions and 55 deletions.
20 changes: 4 additions & 16 deletions pkg/apicheck/check.go
@@ -22,6 +22,7 @@ import (
"github.com/medik8s/self-node-remediation/pkg/peerhealth"
"github.com/medik8s/self-node-remediation/pkg/peers"
"github.com/medik8s/self-node-remediation/pkg/reboot"
"github.com/medik8s/self-node-remediation/pkg/utils"
)

type ApiConnectivityCheck struct {
@@ -142,22 +143,8 @@ func (c *ApiConnectivityCheck) getWorkerPeersResponse() peers.Response {
// nodesToAsk is reduced in every iteration; iterate until no nodes are left to ask
for i := 0; len(nodesToAsk) > 0; i++ {

// start asking a few nodes only in first iteration to cover the case we get a healthy / unhealthy result
nodesBatchCount := reboot.MinNodesNumberInBatch
if i > 0 {
// after that ask 10% of the cluster each time to check the api problem case
nodesBatchCount = len(nodesToAsk) / reboot.MaxBatchesAfterFirst
if nodesBatchCount < reboot.MinNodesNumberInBatch {
nodesBatchCount = reboot.MinNodesNumberInBatch
}
}

// but do not ask more than we have
if len(nodesToAsk) < nodesBatchCount {
nodesBatchCount = len(nodesToAsk)
}

chosenNodesAddresses := c.popNodes(&nodesToAsk, nodesBatchCount)
batchSize := utils.GetNextBatchSize(nrAllNodes, len(nodesToAsk))
chosenNodesAddresses := c.popNodes(&nodesToAsk, batchSize)
healthyResponses, unhealthyResponses, apiErrorsResponses, _ := c.getHealthStatusFromPeers(chosenNodesAddresses)
if healthyResponses+unhealthyResponses+apiErrorsResponses > 0 {
c.timeOfLastPeerResponse = time.Now()
@@ -189,6 +176,7 @@ func (c *ApiConnectivityCheck) getWorkerPeersResponse() peers.Response {

//we asked all peers
now := time.Now()
// MaxTimeForNoPeersResponse check prevents the node from being considered unhealthy in case of short network outages
if now.After(c.timeOfLastPeerResponse.Add(c.config.MaxTimeForNoPeersResponse)) {
c.config.Log.Error(fmt.Errorf("failed health check"), "Failed to get health status peers. Assuming unhealthy")
return peers.Response{IsHealthy: false, Reason: peers.UnHealthyBecauseNodeIsIsolated}
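Note: utils.GetNextBatchSize is introduced by this commit, but its file is not expanded on this page. Below is a minimal sketch of the helper, assuming it simply centralizes the inline logic removed above (a small first batch, then roughly 10% of the cluster per batch, never more than the nodes still left to ask); the file name, constants, and exact signature are assumptions.

// pkg/utils/batches.go (sketch only, not the actual file from this commit)
package utils

const (
	// assumed to have moved here from pkg/reboot
	MinNodesNumberInBatch = 3
	MaxBatchesAfterFirst  = 10
)

// GetNextBatchSize returns how many peers to ask next: a small first batch to
// get a clear healthy / unhealthy answer quickly, afterwards roughly 10% of
// the cluster per batch, and never more nodes than are still left to ask.
func GetNextBatchSize(allNodes, remainingNodes int) int {
	var batchSize int
	if remainingNodes == allNodes {
		// first batch: keep it small
		batchSize = MinNodesNumberInBatch
	} else {
		// later batches: ~10% of the cluster, but at least the minimum
		batchSize = allNodes / MaxBatchesAfterFirst
		if batchSize < MinNodesNumberInBatch {
			batchSize = MinNodesNumberInBatch
		}
	}
	// do not ask more nodes than are left
	if batchSize > remainingNodes {
		batchSize = remainingNodes
	}
	return batchSize
}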
72 changes: 37 additions & 35 deletions pkg/reboot/calculator.go
@@ -2,6 +2,7 @@ package reboot

import (
"context"
"sync"
"time"

"github.com/go-logr/logr"
@@ -19,8 +20,6 @@ import (

const (
MaxTimeForNoPeersResponse = 30 * time.Second
MinNodesNumberInBatch = 3
MaxBatchesAfterFirst = 10
)

type Calculator interface {
@@ -39,7 +38,8 @@ type calculator struct {
k8sClient client.Client
log logr.Logger
// storing the config here when reconciling it increases resilience in case of issues during remediation
snrConfig *v1alpha1.SelfNodeRemediationConfig
snrConfig *v1alpha1.SelfNodeRemediationConfig
snrConfigLock sync.RWMutex
}

func NewCalculator(k8sClient client.Client, log logr.Logger) Calculator {
@@ -50,12 +50,16 @@ func NewCalculator(k8sClient client.Client, log logr.Logger) Calculator {
}

func (r *calculator) SetConfig(config *v1alpha1.SelfNodeRemediationConfig) {
r.snrConfigLock.Lock()
defer r.snrConfigLock.Unlock()
r.snrConfig = config
}

// TODO add unit test!
func (r *calculator) GetRebootDuration(ctx context.Context, node *v1.Node) (time.Duration, error) {

r.snrConfigLock.Lock()
defer r.snrConfigLock.Unlock()
if r.snrConfig == nil {
return 0, errors.New("SelfNodeRemediationConfig not set yet, can't calculate minimum reboot duration")
}
@@ -96,57 +100,55 @@ func (r *calculator) calculateMinimumRebootDuration(ctx context.Context, watchdo

spec := r.snrConfig.Spec

// The minimum reboot duration consists of the duration to identify a node issue, and to trigger the reboot
// The minimum reboot duration consists of the duration
// 1) to detect API connectivity issue
// 2) to confirm issue with peers
// 3) to trigger the reboot

// 1. time for determine node issue
// a) API check duration ...
minTime := spec.ApiCheckInterval.Duration + spec.ApiServerTimeout.Duration
// 1. detect API connectivity issue
// a) max API check duration ...
apiCheckDuration := spec.ApiCheckInterval.Duration + spec.ApiServerTimeout.Duration
// b) ... times error threshold ...
minTime *= time.Duration(spec.MaxApiErrorThreshold)
// c) ... plus peer timeout
minTime += MaxTimeForNoPeersResponse
apiCheckDuration *= time.Duration(spec.MaxApiErrorThreshold)

// 2. plus time for asking peers (10% batches + 1st smaller batch)
// 2. confirm issue with peers
// a) nr of peer request batches (10% batches + 1st smaller batch) ...
numBatches, err := r.calcNumOfBatches(r.k8sClient, ctx)
if err != nil {
return 0, errors.Wrap(err, "failed to calculate number of batches")
}
minTime += time.Duration(numBatches)*spec.PeerDialTimeout.Duration + spec.PeerRequestTimeout.Duration

// 3. plus watchdog timeout
minTime += watchdogTimeout
peerRequestsDuration := time.Duration(numBatches)
// b) ... times max peer request duration
peerRequestsDuration *= spec.PeerDialTimeout.Duration + spec.PeerRequestTimeout.Duration
// c) in order to prevent false positives in case of temporary network issues,
// we don't consider nodes being unhealthy before MaxTimeForNoPeersResponse.
// So that's the minimum time we need for the peers check.
if peerRequestsDuration < MaxTimeForNoPeersResponse {
peerRequestsDuration = MaxTimeForNoPeersResponse
}

// 4. plus some buffer
minTime += 15 * time.Second
// 3. trigger the reboot
// a) watchdog timeout ...
rebootDuration := watchdogTimeout
// b) ... plus some buffer for actually rebooting
rebootDuration += 15 * time.Second

return minTime, nil
return apiCheckDuration + peerRequestsDuration + rebootDuration, nil
}

func (r *calculator) calcNumOfBatches(k8sClient client.Client, ctx context.Context) (int, error) {

// get all worker nodes
reqPeers, _ := labels.NewRequirement(commonlabels.WorkerRole, selection.Exists, []string{})
selector := labels.NewSelector()
selector = selector.Add(*reqPeers)

nodes := &v1.NodeList{}
// time for asking peers (10% batches + 1st smaller batch)
maxNumberOfBatches := MaxBatchesAfterFirst + 1
if err := k8sClient.List(ctx, nodes, client.MatchingLabelsSelector{Selector: selector}); err != nil {
return maxNumberOfBatches, errors.Wrap(err, "failed to list worker nodes")
return 0, errors.Wrap(err, "failed to list worker nodes")
}
workerNodesCount := len(nodes.Items)

var numberOfBatches int
switch {
//high number of workers: we need max batches (for example 53 nodes will be done in 11 batches -> 1 * 3 + 10 * 5 )
case workerNodesCount > maxNumberOfBatches*MinNodesNumberInBatch:
numberOfBatches = maxNumberOfBatches
//there are few enough nodes to use the min batch (for example 20 nodes will be done in 7 batches -> 1 * 3 + 6 * 3 )
default:
numberOfBatches = workerNodesCount / MinNodesNumberInBatch
if workerNodesCount%MinNodesNumberInBatch != 0 {
numberOfBatches++
}
}
return numberOfBatches, nil
workerNodesCount := len(nodes.Items)
batchCount := utils.GetNrOfBatches(workerNodesCount)
return batchCount, nil
}
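utils.GetNrOfBatches is also new in this commit and not shown here. Based on the comments above (one smaller first batch plus up to MaxBatchesAfterFirst 10%-batches) and the examples in the removed code (53 nodes -> 11 batches, 20 nodes -> 7 batches), a plausible sketch follows; the exact implementation is an assumption.

// GetNrOfBatches returns how many batches the peer check needs in the worst
// case: one small first batch, then ceil(rest / batchSize) further batches,
// using the same constants as GetNextBatchSize above.
func GetNrOfBatches(allNodes int) int {
	if allNodes <= MinNodesNumberInBatch {
		// everything fits into the small first batch
		return 1
	}
	batchSize := allNodes / MaxBatchesAfterFirst
	if batchSize < MinNodesNumberInBatch {
		batchSize = MinNodesNumberInBatch
	}
	rest := allNodes - MinNodesNumberInBatch
	batches := 1 + rest/batchSize
	if rest%batchSize != 0 {
		batches++
	}
	return batches
}

With these assumptions the numbers line up with both the removed comments and the new test below: 20 worker nodes -> 1 batch of 3 plus 6 batches of 3 = 7 batches; 53 nodes -> 1 batch of 3 plus 10 batches of 5 = 11 batches.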
128 changes: 128 additions & 0 deletions pkg/reboot/calculator_test.go
@@ -0,0 +1,128 @@
package reboot_test

import (
"context"
"strconv"
"time"

"github.com/medik8s/common/pkg/labels"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/medik8s/self-node-remediation/api/v1alpha1"
"github.com/medik8s/self-node-remediation/pkg/utils"
)

var _ = Describe("Calculator tests", func() {

var snrConfig *v1alpha1.SelfNodeRemediationConfig
var watchdogTimeoutSeconds int
var unhealthyNode *v1.Node
var nrOfPeers int
var expectedRebootDurationSeconds int

BeforeEach(func() {
// just defaults, override values in tests
snrConfig = &v1alpha1.SelfNodeRemediationConfig{
ObjectMeta: metav1.ObjectMeta{
Namespace: "default",
Name: v1alpha1.ConfigCRName,
},
Spec: v1alpha1.SelfNodeRemediationConfigSpec{},
}
})

JustBeforeEach(func() {
Expect(k8sClient.Create(context.Background(), snrConfig)).To(Succeed())
DeferCleanup(func() {
Expect(k8sClient.Delete(context.Background(), snrConfig)).To(Succeed())
Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(snrConfig), &v1alpha1.SelfNodeRemediationConfig{})
g.Expect(errors.IsNotFound(err)).To(BeTrue())
}, "5s", "1s").Should(Succeed())
calculator.SetConfig(nil)
})

unhealthyNode = getNode("unhealthy-node")
unhealthyNode.Annotations = map[string]string{
utils.WatchdogTimeoutSecondsAnnotation: strconv.Itoa(watchdogTimeoutSeconds),
}
Expect(k8sClient.Create(context.Background(), unhealthyNode)).To(Succeed())
DeferCleanup(func() {
Expect(k8sClient.Delete(context.Background(), unhealthyNode)).To(Succeed())
Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(unhealthyNode), &v1.Node{})
g.Expect(errors.IsNotFound(err)).To(BeTrue())
}, "5s", "1s").Should(Succeed())
})

for i := 0; i < nrOfPeers; i++ {
peer := getNode("peer-node-" + strconv.Itoa(i))
peer.Labels[labels.WorkerRole] = ""
Expect(k8sClient.Create(context.Background(), peer)).To(Succeed())
DeferCleanup(func() {
Expect(k8sClient.Delete(context.Background(), peer)).To(Succeed())
Eventually(func(g Gomega) {
err := k8sClient.Get(context.Background(), client.ObjectKeyFromObject(peer), &v1.Node{})
g.Expect(errors.IsNotFound(err)).To(BeTrue())
}, "5s", "1s").Should(Succeed())
})
}
})

Context("with default SNRConfig, 2 peers, and 10s watchdog timeout", func() {
BeforeEach(func() {
watchdogTimeoutSeconds = 10
nrOfPeers = 2
// 3 * (15 + 5) (API server)
// + 30 (MaxTimeForNoPeersResponse)
// + 10 (Watchdog)
// + 15
expectedRebootDurationSeconds = 115
})
It("GetRebootTime should return correct value", func() {
Eventually(func() (time.Duration, error) {
return calculator.GetRebootDuration(context.Background(), unhealthyNode)
}, "5s", "200ms").Should(Equal(time.Duration(expectedRebootDurationSeconds) * time.Second))
})
})

Context("with modified SNRConfig, 20 peers, and 25s watchdog timeout", func() {
BeforeEach(func() {
// modify all values used by calculator
snrConfig.Spec.ApiCheckInterval = &metav1.Duration{Duration: 25 * time.Second}
snrConfig.Spec.ApiServerTimeout = &metav1.Duration{Duration: 7 * time.Second}
snrConfig.Spec.MaxApiErrorThreshold = 4

snrConfig.Spec.PeerDialTimeout = &metav1.Duration{Duration: 11 * time.Second}
snrConfig.Spec.PeerRequestTimeout = &metav1.Duration{Duration: 13 * time.Second}

watchdogTimeoutSeconds = 25
nrOfPeers = 20 // 7 batches

// 4 * (25 + 7) = 128 (API server)
// + 7 * (11 + 13) = 168 (Peers)
// + 25 (Watchdog)
// + 15
expectedRebootDurationSeconds = 336
})
It("GetRebootTime should return correct value", func() {
Eventually(func() (time.Duration, error) {
return calculator.GetRebootDuration(context.Background(), unhealthyNode)
}, "5s", "200ms").Should(Equal(time.Duration(expectedRebootDurationSeconds) * time.Second))
})
})
})

func getNode(name string) *v1.Node {
node := &v1.Node{}
node.Name = name
node.Labels = make(map[string]string)
node.Labels["kubernetes.io/hostname"] = name
return node
}
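The test file uses package-level k8sClient and calculator variables that come from a suite setup file which is not part of the visible diff. A rough sketch of how they could be wired up; envtest details are omitted and the names here are assumptions, including the expectation that the suite also runs the config reconciler so calculator.SetConfig is called once a test creates the SelfNodeRemediationConfig.

// suite_test.go (sketch only, not part of the visible diff)
package reboot_test

import (
	. "github.com/onsi/ginkgo/v2"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/medik8s/self-node-remediation/pkg/reboot"
)

var (
	k8sClient  client.Client
	calculator reboot.Calculator
)

var _ = BeforeSuite(func() {
	// build k8sClient against envtest or a fake client here (omitted)
	calculator = reboot.NewCalculator(k8sClient, ctrl.Log.WithName("calculator-test"))
	// a SelfNodeRemediationConfig reconciler started here would call
	// calculator.SetConfig() when the tests create or update the config
})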