Skip to content

Commit

Permalink
using "safet" minimum calculated value
Browse files Browse the repository at this point in the history
Signed-off-by: Michael Shitrit <[email protected]>
  • Loading branch information
mshitrit committed Jun 4, 2024
1 parent bf88778 commit 04a6964
Show file tree
Hide file tree
Showing 9 changed files with 300 additions and 2 deletions.
8 changes: 8 additions & 0 deletions api/v1alpha1/selfnoderemediationconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,14 @@ type SelfNodeRemediationConfigStatus struct {
// +listMapKey=type
// +optional
Conditions []metav1.Condition `json:"conditions,omitempty"`

// LastUpdateTime is the last time the status was updated.
//
//+optional
//+kubebuilder:validation:Type=string
//+kubebuilder:validation:Format=date-time
//+operator-sdk:csv:customresourcedefinitions:type=status
LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"`
}

//+kubebuilder:object:root=true
Expand Down
4 changes: 4 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ spec:
path: conditions
x-descriptors:
- urn:alm:descriptor:io.kubernetes.conditions
- description: LastUpdateTime is the last time the status was updated.
displayName: Last Update Time
path: lastUpdateTime
version: v1alpha1
- description: SelfNodeRemediation is the Schema for the selfnoderemediations
API
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,10 @@ spec:
x-kubernetes-list-map-keys:
- type
x-kubernetes-list-type: map
lastUpdateTime:
description: LastUpdateTime is the last time the status was updated.
format: date-time
type: string
minSafeTimeToAssumeNodeRebootedSeconds:
description: MinSafeTimeToAssumeNodeRebootedSeconds is the minimum
value that can be assigned to SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,10 @@ spec:
x-kubernetes-list-map-keys:
- type
x-kubernetes-list-type: map
lastUpdateTime:
description: LastUpdateTime is the last time the status was updated.
format: date-time
type: string
minSafeTimeToAssumeNodeRebootedSeconds:
description: MinSafeTimeToAssumeNodeRebootedSeconds is the minimum
value that can be assigned to SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ spec:
path: conditions
x-descriptors:
- urn:alm:descriptor:io.kubernetes.conditions
- description: LastUpdateTime is the last time the status was updated.
displayName: Last Update Time
path: lastUpdateTime
version: v1alpha1
- description: SelfNodeRemediation is the Schema for the selfnoderemediations
API
Expand Down
13 changes: 12 additions & 1 deletion pkg/reboot/calculator.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,11 @@ const (
MaxBatchesAfterFirst = 10
)

var (
//timeBeforeAssumingRecentUpdate is the time the timeframe in which we assume multiple updates belong to the same configuration change and are applied by different agents
timeBeforeAssumingRecentUpdate = time.Second * 15
)

type SafeTimeCalculator interface {
// GetTimeToAssumeNodeRebooted returns the safe time to assume node was already rebooted
// note that this time must include the time for a unhealthy node without api-server access to reach the conclusion that it's unhealthy
Expand Down Expand Up @@ -133,9 +138,14 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(ctx context.Con
if err != nil {
return err
}

orgConfig := config.DeepCopy()
prevMinRebootTimeSec := config.Status.MinSafeTimeToAssumeNodeRebootedSeconds
if prevMinRebootTimeSec != minTimeSec {
isUpdatedRecently := config.Status.LastUpdateTime != nil && (*config.Status.LastUpdateTime).Add(timeBeforeAssumingRecentUpdate).After(time.Now())
//Use safer value
if minTimeSec > prevMinRebootTimeSec ||
// we can update even though value may be less safe because the config has changed
(!isUpdatedRecently && prevMinRebootTimeSec != minTimeSec) {
config.Status.MinSafeTimeToAssumeNodeRebootedSeconds = minTimeSec
}

Expand All @@ -160,6 +170,7 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(ctx context.Con
}

if !reflect.DeepEqual(config, orgConfig) {
config.Status.LastUpdateTime = &metav1.Time{Time: time.Now()}
if err := s.k8sClient.Status().Patch(ctx, config, client.MergeFrom(orgConfig)); err != nil {
return err
}
Expand Down
198 changes: 198 additions & 0 deletions pkg/reboot/calculator_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
package reboot

import (
"context"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"

apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/medik8s/self-node-remediation/api/v1alpha1"
)

var _ = Describe("Calculator Tests", func() {
var config *v1alpha1.SelfNodeRemediationConfig
var configKey client.ObjectKey
//105 is just the calculated value according to current specs fill free to change in case the Specs or calculation changes
var defaultCalculatedMinSafeTime = 105
var increasedCalculatedMinSafeTime = 108
var decreasedCalculatedMinSafeTime = 102
Context("Agent calculator", func() {
config = createDefaultSelfNodeRemediationConfigCR()
configKey = client.ObjectKeyFromObject(config)

AfterEach(func() {
Expect(k8sClient.Delete(context.TODO(), config)).To(Succeed())
Eventually(func(g Gomega) bool {
tmpConfig := &v1alpha1.SelfNodeRemediationConfig{}
err := k8sClient.Get(context.TODO(), configKey, tmpConfig)
g.Expect(err).To(HaveOccurred())
g.Expect(apierrors.IsNotFound(err)).To(BeTrue())
return true
}, 5*time.Second, 250*time.Millisecond).Should(BeTrue())
})

BeforeEach(func() {
//Initial configuration creation
createConfig(config)

//Simulating agent starting which will set Status fields
simulateSNRAgentStart(configKey)

})
It("Status should be updated after agent is initialized", func() {
Eventually(func(g Gomega) bool {
tmpConfig := &v1alpha1.SelfNodeRemediationConfig{}
g.Expect(k8sClient.Get(context.TODO(), configKey, tmpConfig)).To(Succeed())
//Configuration status fields are now filled by the agent
g.Expect(tmpConfig.Status.LastUpdateTime).ToNot(BeNil())
g.Expect(tmpConfig.Status.MinSafeTimeToAssumeNodeRebootedSeconds).To(Equal(defaultCalculatedMinSafeTime))
return true
}, 5*time.Second, 250*time.Millisecond).Should(BeTrue())
})

Context("A recent configuration change", func() {
When("the change increase the min calculated value", func() {
BeforeEach(func() {
increaseApiCheckInConfig(configKey)
simulateSNRAgentStart(configKey)
})
It("Status should be updated by the agent", func() {
verifyMinSafeTimeToAssumeNodeRebootedSecondsValue(configKey, increasedCalculatedMinSafeTime)
})

})
When("the change decrease the min calculated value", func() {
BeforeEach(func() {
decreaseApiCheckInConfig(configKey)
simulateSNRAgentStart(configKey)
})
It("Status should NOT be updated by the agent", func() {
verifyMinSafeTimeToAssumeNodeRebootedSecondsValue(configKey, defaultCalculatedMinSafeTime)
})

})
})

Context("A Non recent configuration change", func() {
BeforeEach(func() {
originalTimeBeforeAssumingRecentUpdate := timeBeforeAssumingRecentUpdate
//mocking the time for faster tests
timeBeforeAssumingRecentUpdate = time.Second
//simulating non recent update
time.Sleep(time.Millisecond * 1100)
DeferCleanup(func() {
timeBeforeAssumingRecentUpdate = originalTimeBeforeAssumingRecentUpdate
})
})
When("the change increase the min calculated value", func() {
BeforeEach(func() {
increaseApiCheckInConfig(configKey)
simulateSNRAgentStart(configKey)
})
It("Status should be updated by the agent", func() {
verifyMinSafeTimeToAssumeNodeRebootedSecondsValue(configKey, increasedCalculatedMinSafeTime)
})

})
When("the change decrease the min calculated value", func() {
BeforeEach(func() {
decreaseApiCheckInConfig(configKey)
simulateSNRAgentStart(configKey)
})
It("Status should be updated by the agent", func() {
verifyMinSafeTimeToAssumeNodeRebootedSecondsValue(configKey, decreasedCalculatedMinSafeTime)
})

})

})

})
})

func verifyMinSafeTimeToAssumeNodeRebootedSecondsValue(configKey client.ObjectKey, expectedValue int) {
Eventually(func(g Gomega) bool {
tmpConfig := &v1alpha1.SelfNodeRemediationConfig{}
g.Expect(k8sClient.Get(context.TODO(), configKey, tmpConfig)).To(Succeed())
//Configuration status fields are now filled by the agent
g.Expect(tmpConfig.Status.MinSafeTimeToAssumeNodeRebootedSeconds).To(Equal(expectedValue))
return true
}, 5*time.Second, 250*time.Millisecond).Should(BeTrue())
}

func increaseApiCheckInConfig(configKey client.ObjectKey) {
modifyApiCheckInConfig(configKey, true)
}

func decreaseApiCheckInConfig(configKey client.ObjectKey) {
modifyApiCheckInConfig(configKey, false)
}

func modifyApiCheckInConfig(configKey client.ObjectKey, isIncrease bool) {
tmpConfig := &v1alpha1.SelfNodeRemediationConfig{}
Expect(k8sClient.Get(context.TODO(), configKey, tmpConfig)).To(Succeed())
originalCheckInterval := tmpConfig.Spec.ApiCheckInterval
var newVal time.Duration
if isIncrease {
newVal = originalCheckInterval.Duration + time.Second
} else {
newVal = originalCheckInterval.Duration - time.Second
}

tmpConfig.Spec.ApiCheckInterval = &metav1.Duration{Duration: newVal}
Expect(k8sClient.Update(context.TODO(), tmpConfig)).To(Succeed())

//Wait for value update in config
Eventually(func(g Gomega) bool {
tmpConfig := &v1alpha1.SelfNodeRemediationConfig{}
g.Expect(k8sClient.Get(context.TODO(), configKey, tmpConfig)).To(Succeed())
g.Expect(tmpConfig.Spec.ApiCheckInterval.Duration).To(Equal(newVal))
return true
}, 5*time.Second, 250*time.Millisecond).Should(BeTrue())

}

func createDefaultSelfNodeRemediationConfigCR() *v1alpha1.SelfNodeRemediationConfig {
snrc := &v1alpha1.SelfNodeRemediationConfig{}
snrc.Name = "test"
snrc.Namespace = "default"

//default values for time fields
snrc.Spec.PeerApiServerTimeout = &metav1.Duration{Duration: 5 * time.Second}
snrc.Spec.ApiServerTimeout = &metav1.Duration{Duration: 5 * time.Second}
snrc.Spec.PeerDialTimeout = &metav1.Duration{Duration: 5 * time.Second}
snrc.Spec.PeerRequestTimeout = &metav1.Duration{Duration: 5 * time.Second}
snrc.Spec.ApiCheckInterval = &metav1.Duration{Duration: 15 * time.Second}
snrc.Spec.PeerUpdateInterval = &metav1.Duration{Duration: 15 * time.Minute}

return snrc
}

func simulateSNRAgentStart(configKey client.ObjectKey) {
config := &v1alpha1.SelfNodeRemediationConfig{}
Expect(k8sClient.Get(context.TODO(), configKey, config)).To(Succeed())
calc := NewAgentSafeTimeCalculator(k8sClient, nil, nil, config.Spec.MaxApiErrorThreshold, config.Spec.ApiCheckInterval.Duration, config.Spec.ApiServerTimeout.Duration, config.Spec.PeerDialTimeout.Duration, config.Spec.PeerRequestTimeout.Duration, 0)

//Simulating agent starting according to config
Expect(calc.Start(context.TODO())).To(Succeed())
//wait for agent to finish updating the config
time.Sleep(time.Second)

}

func createConfig(config *v1alpha1.SelfNodeRemediationConfig) {
//Initial configuration creation
Expect(k8sClient.Create(context.TODO(), config.DeepCopy())).To(Succeed())
//Wait for config to be created
Eventually(func(g Gomega) bool {
tmpConfig := &v1alpha1.SelfNodeRemediationConfig{}
err := k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(config), tmpConfig)
g.Expect(err).To(Succeed())
return true
}, 5*time.Second, 250*time.Millisecond).Should(BeTrue())
}
65 changes: 64 additions & 1 deletion pkg/reboot/suite_test.go
Original file line number Diff line number Diff line change
@@ -1,21 +1,84 @@
package reboot

import (
"context"
"path/filepath"
"testing"
"time"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"go.uber.org/zap/zapcore"

"k8s.io/client-go/kubernetes/scheme"
"k8s.io/client-go/rest"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"
"sigs.k8s.io/controller-runtime/pkg/envtest"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/log/zap"

selfnoderemediationv1alpha1 "github.com/medik8s/self-node-remediation/api/v1alpha1"
)

var testEnv *envtest.Environment
var k8sClient client.Client
var cfg *rest.Config
var cancelFunc context.CancelFunc

func TestWatchdog(t *testing.T) {
RegisterFailHandler(Fail)
RunSpecs(t, "Rebooter Suite")
}

var _ = BeforeSuite(func() {
By("bootstrapping test environment")
opts := zap.Options{
Development: true,
TimeEncoder: zapcore.RFC3339NanoTimeEncoder,
}
logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseFlagOptions(&opts)))

By("bootstrapping test environment")
testEnv = &envtest.Environment{
CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")},
ErrorIfCRDPathMissing: true,
}

var err error
cfg, err = testEnv.Start()
Expect(err).NotTo(HaveOccurred())
Expect(cfg).NotTo(BeNil())

err = selfnoderemediationv1alpha1.AddToScheme(scheme.Scheme)
Expect(err).NotTo(HaveOccurred())

//+kubebuilder:scaffold:scheme

gracefulShutdown := 0 * time.Second
Expect(err).ToNot(HaveOccurred())
k8sManager, err := ctrl.NewManager(cfg, ctrl.Options{
Scheme: scheme.Scheme,
LeaderElection: false,
MetricsBindAddress: "0",
GracefulShutdownTimeout: &gracefulShutdown,
})
Expect(err).ToNot(HaveOccurred())

k8sClient = k8sManager.GetClient()
Expect(k8sClient).ToNot(BeNil())

var ctx context.Context
ctx, cancelFunc = context.WithCancel(ctrl.SetupSignalHandler())
go func() {
defer GinkgoRecover()
err = k8sManager.Start(ctx)
Expect(err).ToNot(HaveOccurred())
}()
})

var _ = AfterSuite(func() {

By("tearing down the test environment")
cancelFunc()
Expect(testEnv.Stop()).To(Succeed())
})

0 comments on commit 04a6964

Please sign in to comment.