From 617924b0446d6ec2b3628ba8994466cf09f09bf0 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Thu, 18 Apr 2024 14:52:17 +0300 Subject: [PATCH 01/21] - Setting an annotation with min reboot time on configuration - overrriding SafeTimeToReboot value in configuration in case it's invalid Signed-off-by: Michael Shitrit --- .../selfnoderemediationconfig_types.go | 3 +- ...medik8s.io_selfnoderemediationconfigs.yaml | 3 +- ...medik8s.io_selfnoderemediationconfigs.yaml | 3 +- go.mod | 2 +- go.sum | 4 +- main.go | 5 +- pkg/reboot/calculator.go | 70 +++++++++++++++++-- pkg/utils/annotations.go | 1 + .../medik8s/common/pkg/events/events.go | 40 ++++++++--- vendor/modules.txt | 2 +- 10 files changed, 107 insertions(+), 26 deletions(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index dd4b967df..bbf7326f1 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -69,7 +69,8 @@ type SelfNodeRemediationConfigSpec struct { // Valid time units are "ms", "s", "m", "h". // +optional // +kubebuilder:default:="15m" - // +kubebuilder:validation:Pattern="^(0|([0-9]+(\\.[0-9]+)?(ms|s|m|h)))$" + // +kubebuilder:validation:Pattern="^(0|([0-9]+(\\.[0-9]+)?(ms|s|m|h)))$|^([1-9][0-9]*m0s)$" + // +kubebuilder:validation:Type:=string PeerUpdateInterval *metav1.Duration `json:"peerUpdateInterval,omitempty"` diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 16905f068..2fe356728 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -148,8 +148,7 @@ spec: type: string peerUpdateInterval: default: 15m - description: Valid time units are "ms", "s", "m", "h". - pattern: ^(0|([0-9]+(\.[0-9]+)?(ms|s|m|h)))$ + pattern: ^(0|([0-9]+(\.[0-9]+)?(ms|s|m|h)))$|^([1-9][0-9]*m0s)$ type: string safeTimeToAssumeNodeRebootedSeconds: default: 180 diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 2faa4cc02..2d0767e96 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -146,8 +146,7 @@ spec: type: string peerUpdateInterval: default: 15m - description: Valid time units are "ms", "s", "m", "h". - pattern: ^(0|([0-9]+(\.[0-9]+)?(ms|s|m|h)))$ + pattern: ^(0|([0-9]+(\.[0-9]+)?(ms|s|m|h)))$|^([1-9][0-9]*m0s)$ type: string safeTimeToAssumeNodeRebootedSeconds: default: 180 diff --git a/go.mod b/go.mod index c857cb846..9ebe8705b 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/Masterminds/sprig v2.22.0+incompatible github.com/go-logr/logr v1.2.3 github.com/go-ping/ping v1.1.0 - github.com/medik8s/common v1.15.1 + github.com/medik8s/common v1.17.0 github.com/onsi/ginkgo/v2 v2.9.1 github.com/onsi/gomega v1.27.4 github.com/openshift/api v0.0.0-20230414143018-3367bc7e6ac7 // release-4.13 diff --git a/go.sum b/go.sum index c11e53de3..41396d4d6 100644 --- a/go.sum +++ b/go.sum @@ -216,8 +216,8 @@ github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJ github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/matttproud/golang_protobuf_extensions v1.0.2 h1:hAHbPm5IJGijwng3PWk09JkG9WeqChjprR5s9bBZ+OM= github.com/matttproud/golang_protobuf_extensions v1.0.2/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4= -github.com/medik8s/common v1.15.1 h1:qo2FBZGSegf5q35AZlWzrmgwW1GfUPNmmWaPhb/Uurc= -github.com/medik8s/common v1.15.1/go.mod h1:A9jYldC6PZcAuBowNNm712FqWdASB2ey5Vjp8MYN/PY= +github.com/medik8s/common v1.17.0 h1:AmJKx0tzqGZF27Ot0A4ak85q0F0zqUkVyCvYmm67rtY= +github.com/medik8s/common v1.17.0/go.mod h1:A9jYldC6PZcAuBowNNm712FqWdASB2ey5Vjp8MYN/PY= github.com/mitchellh/copystructure v1.1.2 h1:Th2TIvG1+6ma3e/0/bopBKohOTY7s4dA8V2q4EUcBJ0= github.com/mitchellh/copystructure v1.1.2/go.mod h1:EBArHfARyrSWO/+Wyr9zwEkc6XMFB9XyNgFNmRkZZU4= github.com/mitchellh/reflectwalk v1.0.1 h1:FVzMWA5RllMAKIdUSC8mdWo3XtwoecrH79BY70sEEpE= diff --git a/main.go b/main.go index 07178eb13..96ef4aeb4 100644 --- a/main.go +++ b/main.go @@ -305,7 +305,8 @@ func initSelfNodeRemediationAgent(mgr manager.Manager) { timeToAssumeNodeRebootedInSeconds := getIntEnvVarOrDie("TIME_TO_ASSUME_NODE_REBOOTED") peerHealthDefaultPort := getIntEnvVarOrDie("HOST_PORT") - safeRebootCalc := reboot.NewAgentSafeTimeCalculator(mgr.GetClient(), wd, maxErrorThreshold, apiCheckInterval, apiServerTimeout, peerDialTimeout, peerRequestTimeout, time.Duration(timeToAssumeNodeRebootedInSeconds)*time.Second) + snrRecorder := mgr.GetEventRecorderFor("SelfNodeRemediation") + safeRebootCalc := reboot.NewAgentSafeTimeCalculator(mgr.GetClient(), snrRecorder, wd, maxErrorThreshold, apiCheckInterval, apiServerTimeout, peerDialTimeout, peerRequestTimeout, time.Duration(timeToAssumeNodeRebootedInSeconds)*time.Second) if err = mgr.Add(safeRebootCalc); err != nil { setupLog.Error(err, "failed to add safe reboot time calculator to the manager") os.Exit(1) @@ -358,7 +359,7 @@ func initSelfNodeRemediationAgent(mgr manager.Manager) { Client: mgr.GetClient(), Log: ctrl.Log.WithName("controllers").WithName("SelfNodeRemediation"), Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("SelfNodeRemediation"), + Recorder: snrRecorder, Rebooter: rebooter, MyNodeName: myNodeName, RestoreNodeAfter: restoreNodeAfter, diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index f591bcbc2..5a2eb35d2 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -3,17 +3,22 @@ package reboot import ( "context" "fmt" + "strconv" "time" "github.com/go-logr/logr" + "github.com/medik8s/common/pkg/events" commonlabels "github.com/medik8s/common/pkg/labels" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/selection" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + "github.com/medik8s/self-node-remediation/api/v1alpha1" + "github.com/medik8s/self-node-remediation/pkg/utils" "github.com/medik8s/self-node-remediation/pkg/watchdog" ) @@ -41,11 +46,12 @@ type safeTimeCalculator struct { apiCheckInterval, apiServerTimeout, peerDialTimeout, peerRequestTimeout time.Duration log logr.Logger k8sClient client.Client + recorder record.EventRecorder highestCalculatedBatchNumber int isAgent bool } -func NewAgentSafeTimeCalculator(k8sClient client.Client, wd watchdog.Watchdog, maxErrorThreshold int, apiCheckInterval, apiServerTimeout, peerDialTimeout, peerRequestTimeout, timeToAssumeNodeRebooted time.Duration) SafeTimeCalculator { +func NewAgentSafeTimeCalculator(k8sClient client.Client, recorder record.EventRecorder, wd watchdog.Watchdog, maxErrorThreshold int, apiCheckInterval, apiServerTimeout, peerDialTimeout, peerRequestTimeout, timeToAssumeNodeRebooted time.Duration) SafeTimeCalculator { return &safeTimeCalculator{ wd: wd, maxErrorThreshold: maxErrorThreshold, @@ -55,6 +61,7 @@ func NewAgentSafeTimeCalculator(k8sClient client.Client, wd watchdog.Watchdog, m peerRequestTimeout: peerRequestTimeout, timeToAssumeNodeRebooted: timeToAssumeNodeRebooted, k8sClient: k8sClient, + recorder: recorder, isAgent: true, log: ctrl.Log.WithName("safe-time-calculator"), } @@ -106,11 +113,66 @@ func (s *safeTimeCalculator) calcMinTimeAssumeRebooted() error { s.log.Info("calculated minTimeToAssumeNodeRebooted is:", "minTimeToAssumeNodeRebooted", minTime) s.minTimeToAssumeNodeRebooted = minTime - if s.timeToAssumeNodeRebooted < minTime { - err := fmt.Errorf("snr agent can't start: the requested value for SafeTimeToAssumeNodeRebootedSeconds is too low") - s.log.Error(err, err.Error(), "requested SafeTimeToAssumeNodeRebootedSeconds", s.timeToAssumeNodeRebooted, "minimal calculated value for SafeTimeToAssumeNodeRebootedSeconds", minTime) + //update related logic of min time on configuration if necessary + if err := s.manageSafeRebootTimeInConfiguration(minTime); err != nil { return err } + + return nil +} + +// manageSafeRebootTimeInConfiguration does two things: +// 1. It sets an annotation on the configuration with the most updated value of minTimeToAssumeNodeRebooted in seconds (this value will be used by the webhook to verify user doesn't set an invalid value for SafeTimeToAssumeNodeRebootedSeconds). +// 2. In case SafeTimeToAssumeNodeRebootedSeconds is too low (either because of the default configuration or because minvalue has changed due to an update of another field) it'll set it to a valid value and issue an event. +func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Duration) error { + minTimeSec := int(minTime.Seconds()) + //set it on configuration + confList := &v1alpha1.SelfNodeRemediationConfigList{} + if err := s.k8sClient.List(context.Background(), confList); err != nil { + s.log.Error(err, "failed to get snr configuration") + + return err + } + + for _, snrConf := range confList.Items { + var isUpdateRequired bool + if snrConf.GetAnnotations() == nil { + snrConf.Annotations = make(map[string]string) + } + prevMinRebootTimeSec, isSet := snrConf.GetAnnotations()[utils.MinSafeTimeAnnotation] + + minTimeSecString := strconv.Itoa(minTimeSec) + if !isSet || prevMinRebootTimeSec != minTimeSecString { + if !isSet { + s.log.Info("snr agent about to modify config adding an annotation", "annotation key", utils.MinSafeTimeAnnotation, "annotation value", minTimeSecString) + } else { + s.log.Info("snr agent about to modify config updating annotation value", "annotation key", utils.MinSafeTimeAnnotation, "annotation previous value", prevMinRebootTimeSec, "annotation new value", minTimeSecString) + } + snrConf.GetAnnotations()[utils.MinSafeTimeAnnotation] = minTimeSecString + isUpdateRequired = true + } + + if snrConf.Spec.SafeTimeToAssumeNodeRebootedSeconds < minTimeSec { + isUpdateRequired = true + snrConf.Spec.SafeTimeToAssumeNodeRebootedSeconds = minTimeSec + s.SetTimeToAssumeNodeRebooted(time.Duration(minTimeSec) * time.Second) + msg, reason := "Automatic update since value isn't valid anymore", "SafeTimeToAssumeNodeRebootedSecondsModified" + if !isSet { + msg = "Default value was lower than minimum value" + } else if prevMinRebootTimeSec != minTimeSecString { + msg = "Minimum value has changed and now is greater than configured value" + } + s.log.Info(fmt.Sprintf("%s:%s", reason, msg)) + events.NormalEvent(s.recorder, &snrConf, reason, msg) + } + + if isUpdateRequired { + if err := s.k8sClient.Update(context.Background(), &snrConf); err != nil { + s.log.Error(err, "failed to update SafeTimeToAssumeNodeRebootedSeconds with min value") + return err + } + } + } return nil } diff --git a/pkg/utils/annotations.go b/pkg/utils/annotations.go index 8e432a567..8054959be 100644 --- a/pkg/utils/annotations.go +++ b/pkg/utils/annotations.go @@ -16,6 +16,7 @@ const ( // IsRebootCapableAnnotation value is the key name for the node's annotation that will determine if node is reboot capable IsRebootCapableAnnotation = "is-reboot-capable.self-node-remediation.medik8s.io" IsSoftwareRebootEnabledEnvVar = "IS_SOFTWARE_REBOOT_ENABLED" + MinSafeTimeAnnotation = "minimum-safe-reboot-sec.self-node-remediation.medik8s.io" ) // UpdateNodeWithIsRebootCapableAnnotation updates the is-reboot-capable node annotation to be true if any kind diff --git a/vendor/github.com/medik8s/common/pkg/events/events.go b/vendor/github.com/medik8s/common/pkg/events/events.go index cdc6620c4..07d50a0db 100644 --- a/vendor/github.com/medik8s/common/pkg/events/events.go +++ b/vendor/github.com/medik8s/common/pkg/events/events.go @@ -9,9 +9,20 @@ import ( ) // Event message format "medik8s " -const customFmt = "[remediation] %s" +const ( + customFmt = "[remediation] %s" -// NormalEvent will record an event with type Normal and fixed message. + RemediationStartedEventReason = "RemediationStarted" + RemediationStoppedEventReason = "RemediationStopped" + RemediationFinishedEventReason = "RemediationFinished" + RemediationCannotStartEventReason = "RemediationCannotStart" + remediationStartedEventMessage = "Remediation started" + remediationStoppedEventMessage = "NHC added the timed-out annotation, remediation will be stopped" + remediationFinishedEventMessage = "Remediation finished" + remediationCannotStartTargetNodeFailedEventMessage = "Could not get remediation target Node" +) + +// NormalEvent will record an event with type Normal and custom message. func NormalEvent(recorder record.EventRecorder, object runtime.Object, reason, message string) { recorder.Event(object, corev1.EventTypeNormal, reason, fmt.Sprintf(customFmt, message)) } @@ -22,7 +33,7 @@ func NormalEventf(recorder record.EventRecorder, object runtime.Object, reason, recorder.Event(object, corev1.EventTypeNormal, reason, fmt.Sprintf(customFmt, message)) } -// WarningEvent will record an event with type Warning and fixed message. +// WarningEvent will record an event with type Warning and custom message. func WarningEvent(recorder record.EventRecorder, object runtime.Object, reason, message string) { recorder.Event(object, corev1.EventTypeWarning, reason, fmt.Sprintf(customFmt, message)) } @@ -35,22 +46,29 @@ func WarningEventf(recorder record.EventRecorder, object runtime.Object, reason, // Special case events -// RemediationStarted will record a Normal event with reason RemediationStarted and message "Remediation started". +// RemediationStarted will record a Normal event to signal that the remediation has started. func RemediationStarted(recorder record.EventRecorder, object runtime.Object) { - NormalEvent(recorder, object, "RemediationStarted", "Remediation started") + NormalEvent(recorder, object, RemediationStartedEventReason, remediationStartedEventMessage) } -// RemediationStoppedByNHC will record a Normal event with reason RemediationStopped and message "NHC added the timed-out annotation, remediation will be stopped". +// RemediationStoppedByNHC will record a Normal event to signal that the remediation was stopped by the Node Healthcheck operator. func RemediationStoppedByNHC(recorder record.EventRecorder, object runtime.Object) { - NormalEvent(recorder, object, "RemediationStopped", "NHC added the timed-out annotation, remediation will be stopped") + NormalEvent(recorder, object, RemediationStoppedEventReason, remediationStoppedEventMessage) } -// RemediationFinished will record a Normal event with reason RemediationFinished and message "Remediation finished". +// RemediationFinished will record a Normal event to signal that the remediation has finished. func RemediationFinished(recorder record.EventRecorder, object runtime.Object) { - NormalEvent(recorder, object, "RemediationFinished", "Remediation finished") + NormalEvent(recorder, object, RemediationFinishedEventReason, remediationFinishedEventMessage) +} + +// RemediationCannotStart will record a Warning event to signal that the remediation cannot start. A custom message can +// be used to further explain the reason. +func RemediationCannotStart(recorder record.EventRecorder, object runtime.Object, message string) { + WarningEvent(recorder, object, RemediationCannotStartEventReason, message) } -// GetTargetNodeFailed will record a Warning event with reason RemediationFailed and message "Could not get remediation target node". +// GetTargetNodeFailed will record a Warning event to signal that the remediation cannot start because the target Node +// could not be found. func GetTargetNodeFailed(recorder record.EventRecorder, object runtime.Object) { - WarningEvent(recorder, object, "RemediationCannotStart", "Could not get remediation target Node") + RemediationCannotStart(recorder, object, remediationCannotStartTargetNodeFailedEventMessage) } diff --git a/vendor/modules.txt b/vendor/modules.txt index 1f54dc954..e901dc888 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -107,7 +107,7 @@ github.com/mailru/easyjson/jwriter # github.com/matttproud/golang_protobuf_extensions v1.0.2 ## explicit; go 1.9 github.com/matttproud/golang_protobuf_extensions/pbutil -# github.com/medik8s/common v1.15.1 +# github.com/medik8s/common v1.17.0 ## explicit; go 1.20 github.com/medik8s/common/pkg/events github.com/medik8s/common/pkg/labels From f01a3391baad7ce270e6913d8e934b47b284cc5f Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Thu, 18 Apr 2024 14:53:33 +0300 Subject: [PATCH 02/21] Using the webhook to prevent user setting an invalid value for Safe Time to reboot Signed-off-by: Michael Shitrit --- .../selfnoderemediationconfig_webhook.go | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/api/v1alpha1/selfnoderemediationconfig_webhook.go b/api/v1alpha1/selfnoderemediationconfig_webhook.go index 74d2660d3..588fc38cc 100644 --- a/api/v1alpha1/selfnoderemediationconfig_webhook.go +++ b/api/v1alpha1/selfnoderemediationconfig_webhook.go @@ -18,14 +18,19 @@ package v1alpha1 import ( "fmt" + "strconv" "time" + pkgerrors "github.com/pkg/errors" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/errors" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook" + + "github.com/medik8s/self-node-remediation/pkg/utils" ) // fields names @@ -85,6 +90,7 @@ func (r *SelfNodeRemediationConfig) ValidateUpdate(_ runtime.Object) error { return errors.NewAggregate([]error{ r.validateTimes(), r.validateCustomTolerations(), + r.validateMinRebootTime(), }) } @@ -172,3 +178,19 @@ func validateToleration(toleration v1.Toleration) error { } return nil } + +func (r *SelfNodeRemediationConfig) validateMinRebootTime() error { + annotations := r.GetAnnotations() + if annotations == nil { + return fmt.Errorf("failed to verify min value of SafeRebootTimeSec, %s annotation should not be empty", utils.MinSafeTimeAnnotation) + + } + if minValString, isSet := annotations[utils.MinSafeTimeAnnotation]; isSet { + if minVal, err := strconv.Atoi(minValString); err != nil { + return pkgerrors.Wrapf(err, "failed to verify min value of SafeRebootTimeSec") + } else if r.Spec.SafeTimeToAssumeNodeRebootedSeconds < minVal { + return fmt.Errorf("can not set SafeTimeToAssumeNodeRebootedSeconds value below the calculated minimum value of: %s", minValString) + } + } + return nil +} From 14105633fbbc5f5c729eb6929b28ed858d9fb7b6 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Thu, 18 Apr 2024 14:53:51 +0300 Subject: [PATCH 03/21] UT & e2e test Signed-off-by: Michael Shitrit --- .../selfnoderemediationconfig_webhook_test.go | 39 +++++++++++++++++++ e2e/self_node_remediation_test.go | 19 +++++++++ 2 files changed, 58 insertions(+) diff --git a/api/v1alpha1/selfnoderemediationconfig_webhook_test.go b/api/v1alpha1/selfnoderemediationconfig_webhook_test.go index f2a156cb5..a98c58fed 100644 --- a/api/v1alpha1/selfnoderemediationconfig_webhook_test.go +++ b/api/v1alpha1/selfnoderemediationconfig_webhook_test.go @@ -10,6 +10,8 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/pointer" + + "github.com/medik8s/self-node-remediation/pkg/utils" ) // default CR fields durations @@ -71,6 +73,41 @@ var _ = Describe("SelfNodeRemediationConfig Validation", func() { // test update validation on a valid CR testValidCR("update") + Context("SafeTimeToAssumeNodeRebootedSeconds Validation", func() { + var oldSnrc, newSnrc *SelfNodeRemediationConfig + BeforeEach(func() { + oldSnrc = createDefaultSelfNodeRemediationConfigCR() + newSnrc = oldSnrc.DeepCopy() + newSnrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = 200 + }) + When("Annotation does not exit", func() { + It("validation should fail", func() { + err := newSnrc.ValidateUpdate(oldSnrc) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(MatchRegexp("annotation should not be empty")) + }) + }) + When("Annotation value is higher than user assigned value", func() { + BeforeEach(func() { + newSnrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "220"} + }) + It("validation should fail", func() { + err := newSnrc.ValidateUpdate(oldSnrc) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(MatchRegexp("value below the calculated minimum value of: 220")) + }) + }) + When("Annotation value is lower than user assigned value", func() { + BeforeEach(func() { + newSnrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "190"} + }) + It("validation should pass", func() { + Expect(newSnrc.ValidateUpdate(oldSnrc)).To(Succeed()) + }) + }) + + }) + }) }) @@ -174,6 +211,8 @@ func testValidCR(validationType string) { snrc.Spec.ApiCheckInterval = &metav1.Duration{Duration: 10*time.Second + 500*time.Millisecond} snrc.Spec.PeerUpdateInterval = &metav1.Duration{Duration: 10 * time.Second} snrc.Spec.CustomDsTolerations = []v1.Toleration{{Key: "validValue", Effect: v1.TaintEffectNoExecute}, {}, {Operator: v1.TolerationOpEqual, TolerationSeconds: pointer.Int64(-5)}, {Value: "SomeValidValue"}} + snrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = DefaultSafeToAssumeNodeRebootTimeout + snrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "150"} Context("for valid CR", func() { It("should not be rejected", func() { diff --git a/e2e/self_node_remediation_test.go b/e2e/self_node_remediation_test.go index 5678c2860..31b4722a7 100644 --- a/e2e/self_node_remediation_test.go +++ b/e2e/self_node_remediation_test.go @@ -38,6 +38,25 @@ const ( var _ = Describe("Self Node Remediation E2E", func() { + Describe("Setup", func() { + It("Verify min reboot annotation", func() { + configs := v1alpha1.SelfNodeRemediationConfigList{} + Expect(k8sClient.List(context.Background(), &configs)).To(Succeed()) + Expect(len(configs.Items)).To(Equal(1)) + config := configs.Items[0] + annotations := config.GetAnnotations() + Expect(annotations).ToNot(BeNil()) + minSafeTimeAnnotation := "minimum-safe-reboot-sec.self-node-remediation.medik8s.io" + minSafeTimeValueString, isExist := annotations[minSafeTimeAnnotation] + Expect(isExist).To(BeTrue()) + minSafeTimeValue, err := strconv.Atoi(minSafeTimeValueString) + Expect(err).To(Succeed()) + Expect(minSafeTimeValue).To(BeNumerically(">", 0)) + safeTimeValue := config.Spec.SafeTimeToAssumeNodeRebootedSeconds + Expect(safeTimeValue).To(BeNumerically(">=", minSafeTimeValue)) + }) + }) + Describe("Workers Remediation", func() { var node *v1.Node workers := &v1.NodeList{} From 46c47443ee2bdb1e668a3aea2e9ee39969dd3cd4 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Thu, 25 Apr 2024 16:25:23 +0300 Subject: [PATCH 04/21] updating var names & typo fix Signed-off-by: Michael Shitrit --- .../selfnoderemediationconfig_webhook_test.go | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_webhook_test.go b/api/v1alpha1/selfnoderemediationconfig_webhook_test.go index a98c58fed..ef8c554aa 100644 --- a/api/v1alpha1/selfnoderemediationconfig_webhook_test.go +++ b/api/v1alpha1/selfnoderemediationconfig_webhook_test.go @@ -74,35 +74,35 @@ var _ = Describe("SelfNodeRemediationConfig Validation", func() { testValidCR("update") Context("SafeTimeToAssumeNodeRebootedSeconds Validation", func() { - var oldSnrc, newSnrc *SelfNodeRemediationConfig + var originalSnrc, updatedSnrc *SelfNodeRemediationConfig BeforeEach(func() { - oldSnrc = createDefaultSelfNodeRemediationConfigCR() - newSnrc = oldSnrc.DeepCopy() - newSnrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = 200 + originalSnrc = createDefaultSelfNodeRemediationConfigCR() + updatedSnrc = originalSnrc.DeepCopy() + updatedSnrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = 200 }) - When("Annotation does not exit", func() { + When("Annotation does not exist", func() { It("validation should fail", func() { - err := newSnrc.ValidateUpdate(oldSnrc) + err := updatedSnrc.ValidateUpdate(originalSnrc) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(MatchRegexp("annotation should not be empty")) }) }) When("Annotation value is higher than user assigned value", func() { BeforeEach(func() { - newSnrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "220"} + updatedSnrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "220"} }) It("validation should fail", func() { - err := newSnrc.ValidateUpdate(oldSnrc) + err := updatedSnrc.ValidateUpdate(originalSnrc) Expect(err).To(HaveOccurred()) Expect(err.Error()).To(MatchRegexp("value below the calculated minimum value of: 220")) }) }) When("Annotation value is lower than user assigned value", func() { BeforeEach(func() { - newSnrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "190"} + updatedSnrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "190"} }) It("validation should pass", func() { - Expect(newSnrc.ValidateUpdate(oldSnrc)).To(Succeed()) + Expect(updatedSnrc.ValidateUpdate(originalSnrc)).To(Succeed()) }) }) From 98b9a510cc502dac86bebe2e573845beb394f757 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Mon, 15 Apr 2024 15:44:27 +0300 Subject: [PATCH 05/21] remove default value Signed-off-by: Michael Shitrit --- api/v1alpha1/selfnoderemediationconfig_types.go | 13 +++++-------- .../selfnoderemediationconfig_webhook_test.go | 3 +-- ...ation.medik8s.io_selfnoderemediationconfigs.yaml | 1 - ...ation.medik8s.io_selfnoderemediationconfigs.yaml | 1 - controllers/selfnoderemediationconfig_controller.go | 3 --- .../selfnoderemediationconfig_controller_test.go | 1 - main.go | 2 +- pkg/reboot/calculator.go | 9 ++++----- 8 files changed, 11 insertions(+), 22 deletions(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index bbf7326f1..e9da275d0 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -25,10 +25,9 @@ import ( // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. const ( - ConfigCRName = "self-node-remediation-config" - defaultWatchdogPath = "/dev/watchdog" - DefaultSafeToAssumeNodeRebootTimeout = 180 - defaultIsSoftwareRebootEnabled = true + ConfigCRName = "self-node-remediation-config" + defaultWatchdogPath = "/dev/watchdog" + defaultIsSoftwareRebootEnabled = true ) // SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig @@ -47,7 +46,6 @@ type SelfNodeRemediationConfigSpec struct { // In an effort to prevent this, the operator ignores values lower than a minimum calculated from the // ApiCheckInterval, ApiServerTimeout, MaxApiErrorThreshold, PeerDialTimeout, and PeerRequestTimeout fields. // +kubebuilder:validation:Minimum=0 - // +kubebuilder:default=180 SafeTimeToAssumeNodeRebootedSeconds int `json:"safeTimeToAssumeNodeRebootedSeconds,omitempty"` // Valid time units are "ms", "s", "m", "h". @@ -162,9 +160,8 @@ func NewDefaultSelfNodeRemediationConfig() SelfNodeRemediationConfig { return SelfNodeRemediationConfig{ ObjectMeta: metav1.ObjectMeta{Name: ConfigCRName}, Spec: SelfNodeRemediationConfigSpec{ - WatchdogFilePath: defaultWatchdogPath, - SafeTimeToAssumeNodeRebootedSeconds: DefaultSafeToAssumeNodeRebootTimeout, - IsSoftwareRebootEnabled: defaultIsSoftwareRebootEnabled, + WatchdogFilePath: defaultWatchdogPath, + IsSoftwareRebootEnabled: defaultIsSoftwareRebootEnabled, }, } } diff --git a/api/v1alpha1/selfnoderemediationconfig_webhook_test.go b/api/v1alpha1/selfnoderemediationconfig_webhook_test.go index ef8c554aa..1a26c2777 100644 --- a/api/v1alpha1/selfnoderemediationconfig_webhook_test.go +++ b/api/v1alpha1/selfnoderemediationconfig_webhook_test.go @@ -211,9 +211,8 @@ func testValidCR(validationType string) { snrc.Spec.ApiCheckInterval = &metav1.Duration{Duration: 10*time.Second + 500*time.Millisecond} snrc.Spec.PeerUpdateInterval = &metav1.Duration{Duration: 10 * time.Second} snrc.Spec.CustomDsTolerations = []v1.Toleration{{Key: "validValue", Effect: v1.TaintEffectNoExecute}, {}, {Operator: v1.TolerationOpEqual, TolerationSeconds: pointer.Int64(-5)}, {Value: "SomeValidValue"}} - snrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = DefaultSafeToAssumeNodeRebootTimeout snrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "150"} - + snrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = 160 Context("for valid CR", func() { It("should not be rejected", func() { var err error diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 2fe356728..311d9629c 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -151,7 +151,6 @@ spec: pattern: ^(0|([0-9]+(\.[0-9]+)?(ms|s|m|h)))$|^([1-9][0-9]*m0s)$ type: string safeTimeToAssumeNodeRebootedSeconds: - default: 180 description: |- SafeTimeToAssumeNodeRebootedSeconds is the time after which the healthy self node remediation agents will assume the unhealthy node has been rebooted, and it is safe to recover affected workloads. diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 2d0767e96..558b6e76a 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -149,7 +149,6 @@ spec: pattern: ^(0|([0-9]+(\.[0-9]+)?(ms|s|m|h)))$|^([1-9][0-9]*m0s)$ type: string safeTimeToAssumeNodeRebootedSeconds: - default: 180 description: |- SafeTimeToAssumeNodeRebootedSeconds is the time after which the healthy self node remediation agents will assume the unhealthy node has been rebooted, and it is safe to recover affected workloads. diff --git a/controllers/selfnoderemediationconfig_controller.go b/controllers/selfnoderemediationconfig_controller.go index 888d0e474..e90aa72f4 100644 --- a/controllers/selfnoderemediationconfig_controller.go +++ b/controllers/selfnoderemediationconfig_controller.go @@ -138,9 +138,6 @@ func (r *SelfNodeRemediationConfigReconciler) syncConfigDaemonSet(ctx context.Co data.Data["HostPort"] = snrConfig.Spec.HostPort safeTimeToAssumeNodeRebootedSeconds := snrConfig.Spec.SafeTimeToAssumeNodeRebootedSeconds - if safeTimeToAssumeNodeRebootedSeconds == 0 { - safeTimeToAssumeNodeRebootedSeconds = selfnoderemediationv1alpha1.DefaultSafeToAssumeNodeRebootTimeout - } data.Data["TimeToAssumeNodeRebooted"] = fmt.Sprintf("\"%d\"", safeTimeToAssumeNodeRebootedSeconds) data.Data["IsSoftwareRebootEnabled"] = fmt.Sprintf("\"%t\"", snrConfig.Spec.IsSoftwareRebootEnabled) diff --git a/controllers/tests/config/selfnoderemediationconfig_controller_test.go b/controllers/tests/config/selfnoderemediationconfig_controller_test.go index 8327f526b..b883ac383 100644 --- a/controllers/tests/config/selfnoderemediationconfig_controller_test.go +++ b/controllers/tests/config/selfnoderemediationconfig_controller_test.go @@ -232,7 +232,6 @@ var _ = Describe("SNR Config Test", func() { }, 5*time.Second, 250*time.Millisecond).Should(BeNil()) Expect(createdConfig.Spec.WatchdogFilePath).To(Equal("/dev/watchdog")) - Expect(createdConfig.Spec.SafeTimeToAssumeNodeRebootedSeconds).To(Equal(180)) Expect(createdConfig.Spec.MaxApiErrorThreshold).To(Equal(3)) Expect(createdConfig.Spec.PeerApiServerTimeout.Seconds()).To(BeEquivalentTo(5)) diff --git a/main.go b/main.go index 96ef4aeb4..bd61c87ad 100644 --- a/main.go +++ b/main.go @@ -180,7 +180,7 @@ func initSelfNodeRemediationManager(mgr manager.Manager, enableHTTP2 bool) { os.Exit(1) } - safeRebootCalc := reboot.NewManagerSafeTimeCalculator(mgr.GetClient(), selfnoderemediationv1alpha1.DefaultSafeToAssumeNodeRebootTimeout*time.Second) + safeRebootCalc := reboot.NewManagerSafeTimeCalculator(mgr.GetClient()) if err = mgr.Add(safeRebootCalc); err != nil { setupLog.Error(err, "failed to add safe reboot time calculator to the manager") os.Exit(1) diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index 5a2eb35d2..59375bea5 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -67,12 +67,11 @@ func NewAgentSafeTimeCalculator(k8sClient client.Client, recorder record.EventRe } } -func NewManagerSafeTimeCalculator(k8sClient client.Client, timeToAssumeNodeRebooted time.Duration) SafeTimeCalculator { +func NewManagerSafeTimeCalculator(k8sClient client.Client) SafeTimeCalculator { return &safeTimeCalculator{ - timeToAssumeNodeRebooted: timeToAssumeNodeRebooted, - k8sClient: k8sClient, - isAgent: false, - log: ctrl.Log.WithName("safe-time-calculator"), + k8sClient: k8sClient, + isAgent: false, + log: ctrl.Log.WithName("safe-time-calculator"), } } From f47da601f19fac5c5c719cdad4d3e3224e0114d4 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Sun, 5 May 2024 19:17:59 +0300 Subject: [PATCH 06/21] Get min value for non-agent from config Signed-off-by: Michael Shitrit --- pkg/reboot/calculator.go | 44 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index 59375bea5..b49bbc85c 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -2,6 +2,7 @@ package reboot import ( "context" + "errors" "fmt" "strconv" "time" @@ -9,6 +10,7 @@ import ( "github.com/go-logr/logr" "github.com/medik8s/common/pkg/events" commonlabels "github.com/medik8s/common/pkg/labels" + pkgerrors "github.com/pkg/errors" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" @@ -76,12 +78,14 @@ func NewManagerSafeTimeCalculator(k8sClient client.Client) SafeTimeCalculator { } func (s *safeTimeCalculator) GetTimeToAssumeNodeRebooted() time.Duration { + minTime := s.minTimeToAssumeNodeRebooted if !s.isAgent { - return s.timeToAssumeNodeRebooted + //TODO mshitrit handle error + minTime, _ = s.getMinSafeTimeFromConfig() } - if s.timeToAssumeNodeRebooted < s.minTimeToAssumeNodeRebooted { - return s.minTimeToAssumeNodeRebooted + if s.timeToAssumeNodeRebooted < minTime { + return minTime } return s.timeToAssumeNodeRebooted } @@ -153,6 +157,7 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Du if snrConf.Spec.SafeTimeToAssumeNodeRebootedSeconds < minTimeSec { isUpdateRequired = true + //TODO mshitrit don't update the spec but create a warning in the status instead snrConf.Spec.SafeTimeToAssumeNodeRebootedSeconds = minTimeSec s.SetTimeToAssumeNodeRebooted(time.Duration(minTimeSec) * time.Second) msg, reason := "Automatic update since value isn't valid anymore", "SafeTimeToAssumeNodeRebootedSecondsModified" @@ -175,6 +180,39 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Du return nil } +func (s *safeTimeCalculator) getMinSafeTimeFromConfig() (time.Duration, error) { + //TODO mshitrit add some logs + confList := &v1alpha1.SelfNodeRemediationConfigList{} + if err := s.k8sClient.List(context.Background(), confList); err != nil { + s.log.Error(err, "failed to get snr configuration") + + return 0, err + } + + var config v1alpha1.SelfNodeRemediationConfig + if len(confList.Items) < 1 { + return 0, errors.New("SNR config not found") + } else { + config = confList.Items[0] + } + + if config.GetAnnotations() == nil { + return 0, errors.New("failed getting MinSafeTimeFromConfig config does not contain annotations") + } + + minTimeSecString, isFound := config.GetAnnotations()[utils.MinSafeTimeAnnotation] + if !isFound { + return 0, fmt.Errorf("failed getting MinSafeTimeFromConfig config does not contain %s annotation", utils.MinSafeTimeAnnotation) + } + + minTimeSec, err := strconv.Atoi(minTimeSecString) + if err != nil { + return 0, pkgerrors.Wrapf(err, "failed getting MinSafeTimeFromConfig") + } + return time.Duration(minTimeSec) * time.Second, nil + +} + func (s *safeTimeCalculator) IsAgent() bool { return s.isAgent } From 9f47e0afa736ed1bdf8ad6f09d48283536bf2d59 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Sun, 5 May 2024 20:20:22 +0300 Subject: [PATCH 07/21] Prevent non emty value for safe time on creation - at this point the value can't yet be verified Signed-off-by: Michael Shitrit --- .../selfnoderemediationconfig_webhook.go | 21 +++++++++++++------ .../selfnoderemediationconfig_webhook_test.go | 11 +++++++++- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_webhook.go b/api/v1alpha1/selfnoderemediationconfig_webhook.go index 588fc38cc..4d7a2bc5a 100644 --- a/api/v1alpha1/selfnoderemediationconfig_webhook.go +++ b/api/v1alpha1/selfnoderemediationconfig_webhook.go @@ -35,12 +35,13 @@ import ( // fields names const ( - peerApiServerTimeout = "PeerApiServerTimeout" - apiServerTimeout = "ApiServerTimeout" - peerDialTimeout = "PeerDialTimeout" - peerRequestTimeout = "PeerRequestTimeout" - apiCheckInterval = "ApiCheckInterval" - peerUpdateInterval = "PeerUpdateInterval" + peerApiServerTimeout = "PeerApiServerTimeout" + apiServerTimeout = "ApiServerTimeout" + peerDialTimeout = "PeerDialTimeout" + peerRequestTimeout = "PeerRequestTimeout" + apiCheckInterval = "ApiCheckInterval" + peerUpdateInterval = "PeerUpdateInterval" + safeTimeToAssumeNodeRebootedSeconds = "SafeTimeToAssumeNodeRebootedSeconds" ) // minimal time durations allowed for fields @@ -79,6 +80,7 @@ func (r *SelfNodeRemediationConfig) ValidateCreate() error { return errors.NewAggregate([]error{ r.validateTimes(), r.validateCustomTolerations(), + r.validateSafeTimeEmpty(), }) } @@ -194,3 +196,10 @@ func (r *SelfNodeRemediationConfig) validateMinRebootTime() error { } return nil } + +func (r *SelfNodeRemediationConfig) validateSafeTimeEmpty() error { + if r.Spec.SafeTimeToAssumeNodeRebootedSeconds > 0 { + return fmt.Errorf("SafeTimeToAssumeNodeRebootedSeconds can only be set after configuration is created") + } + return nil +} diff --git a/api/v1alpha1/selfnoderemediationconfig_webhook_test.go b/api/v1alpha1/selfnoderemediationconfig_webhook_test.go index 1a26c2777..814654eed 100644 --- a/api/v1alpha1/selfnoderemediationconfig_webhook_test.go +++ b/api/v1alpha1/selfnoderemediationconfig_webhook_test.go @@ -182,6 +182,11 @@ func testMultipleInvalidFields(validationType string) { snrc.Spec.CustomDsTolerations = []v1.Toleration{{Key: "validValue", Operator: "dummyInvalidOperatorValue"}} errorMsg += ", invalid operator for toleration: dummyInvalidOperatorValue" + if validationType == "create" { + setFieldValue(snrc, safeTimeToAssumeNodeRebootedSeconds, 180) + errorMsg += ", SafeTimeToAssumeNodeRebootedSeconds can only be set after configuration is created" + } + Context("for CR multiple invalid fields", func() { It("should be rejected", func() { var err error @@ -212,7 +217,9 @@ func testValidCR(validationType string) { snrc.Spec.PeerUpdateInterval = &metav1.Duration{Duration: 10 * time.Second} snrc.Spec.CustomDsTolerations = []v1.Toleration{{Key: "validValue", Effect: v1.TaintEffectNoExecute}, {}, {Operator: v1.TolerationOpEqual, TolerationSeconds: pointer.Int64(-5)}, {Value: "SomeValidValue"}} snrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "150"} - snrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = 160 + if validationType == "update" { + snrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = 160 + } Context("for valid CR", func() { It("should not be rejected", func() { var err error @@ -269,5 +276,7 @@ func setFieldValue(snrc *SelfNodeRemediationConfig, fieldName string, value time snrc.Spec.ApiCheckInterval = timeValue case peerUpdateInterval: snrc.Spec.PeerUpdateInterval = timeValue + case safeTimeToAssumeNodeRebootedSeconds: + snrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = int(value) } } From 916e9f1cff578def9cb105e24aa63570cb38196e Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Mon, 6 May 2024 11:20:41 +0300 Subject: [PATCH 08/21] Replacing annotation with Status field Signed-off-by: Michael Shitrit --- .../selfnoderemediationconfig_types.go | 5 + .../selfnoderemediationconfig_webhook.go | 21 ++-- .../selfnoderemediationconfig_webhook_test.go | 14 ++- ...medik8s.io_selfnoderemediationconfigs.yaml | 7 ++ ...medik8s.io_selfnoderemediationconfigs.yaml | 7 ++ e2e/self_node_remediation_test.go | 10 +- pkg/reboot/calculator.go | 99 ++++++------------- pkg/utils/annotations.go | 1 - 8 files changed, 63 insertions(+), 101 deletions(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index e9da275d0..3e2e018fd 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -127,6 +127,11 @@ type SelfNodeRemediationConfigSpec struct { type SelfNodeRemediationConfigStatus struct { // INSERT ADDITIONAL STATUS FIELD - define observed state of cluster // Important: Run "make" to regenerate code after modifying this file + + // MinSafeTimeToAssumeNodeRebootedSeconds is the minimum value that can be assigned to SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds, it is calculated and assigned dynamically. + // +optional + // +kubebuilder:validation:Minimum=0 + MinSafeTimeToAssumeNodeRebootedSeconds int `json:"minSafeTimeToAssumeNodeRebootedSeconds,omitempty"` } //+kubebuilder:object:root=true diff --git a/api/v1alpha1/selfnoderemediationconfig_webhook.go b/api/v1alpha1/selfnoderemediationconfig_webhook.go index 4d7a2bc5a..22f600587 100644 --- a/api/v1alpha1/selfnoderemediationconfig_webhook.go +++ b/api/v1alpha1/selfnoderemediationconfig_webhook.go @@ -18,19 +18,14 @@ package v1alpha1 import ( "fmt" - "strconv" "time" - pkgerrors "github.com/pkg/errors" - v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/util/errors" ctrl "sigs.k8s.io/controller-runtime" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook" - - "github.com/medik8s/self-node-remediation/pkg/utils" ) // fields names @@ -182,17 +177,13 @@ func validateToleration(toleration v1.Toleration) error { } func (r *SelfNodeRemediationConfig) validateMinRebootTime() error { - annotations := r.GetAnnotations() - if annotations == nil { - return fmt.Errorf("failed to verify min value of SafeRebootTimeSec, %s annotation should not be empty", utils.MinSafeTimeAnnotation) - + //TODO mshitrit remove status warning in case it's not relevant + if r.Status.MinSafeTimeToAssumeNodeRebootedSeconds == 0 { + return fmt.Errorf("failed to verify min value of SafeRebootTimeSec, Status.MinSafeTimeToAssumeNodeRebootedSeconds should not be empty") } - if minValString, isSet := annotations[utils.MinSafeTimeAnnotation]; isSet { - if minVal, err := strconv.Atoi(minValString); err != nil { - return pkgerrors.Wrapf(err, "failed to verify min value of SafeRebootTimeSec") - } else if r.Spec.SafeTimeToAssumeNodeRebootedSeconds < minVal { - return fmt.Errorf("can not set SafeTimeToAssumeNodeRebootedSeconds value below the calculated minimum value of: %s", minValString) - } + + if r.Status.MinSafeTimeToAssumeNodeRebootedSeconds > r.Spec.SafeTimeToAssumeNodeRebootedSeconds { + return fmt.Errorf("can not set SafeTimeToAssumeNodeRebootedSeconds value below the calculated minimum value of: %d", r.Status.MinSafeTimeToAssumeNodeRebootedSeconds) } return nil } diff --git a/api/v1alpha1/selfnoderemediationconfig_webhook_test.go b/api/v1alpha1/selfnoderemediationconfig_webhook_test.go index 814654eed..8004060cd 100644 --- a/api/v1alpha1/selfnoderemediationconfig_webhook_test.go +++ b/api/v1alpha1/selfnoderemediationconfig_webhook_test.go @@ -10,8 +10,6 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/pointer" - - "github.com/medik8s/self-node-remediation/pkg/utils" ) // default CR fields durations @@ -80,16 +78,16 @@ var _ = Describe("SelfNodeRemediationConfig Validation", func() { updatedSnrc = originalSnrc.DeepCopy() updatedSnrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = 200 }) - When("Annotation does not exist", func() { + When("MinSafeTimeToAssumeNodeRebootedSeconds does not exist or empty", func() { It("validation should fail", func() { err := updatedSnrc.ValidateUpdate(originalSnrc) Expect(err).To(HaveOccurred()) - Expect(err.Error()).To(MatchRegexp("annotation should not be empty")) + Expect(err.Error()).To(MatchRegexp("Status.MinSafeTimeToAssumeNodeRebootedSeconds should not be empty")) }) }) - When("Annotation value is higher than user assigned value", func() { + When("MinSafeTimeToAssumeNodeRebootedSeconds value is higher than user assigned value", func() { BeforeEach(func() { - updatedSnrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "220"} + updatedSnrc.Status.MinSafeTimeToAssumeNodeRebootedSeconds = 220 }) It("validation should fail", func() { err := updatedSnrc.ValidateUpdate(originalSnrc) @@ -99,7 +97,7 @@ var _ = Describe("SelfNodeRemediationConfig Validation", func() { }) When("Annotation value is lower than user assigned value", func() { BeforeEach(func() { - updatedSnrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "190"} + updatedSnrc.Status.MinSafeTimeToAssumeNodeRebootedSeconds = 190 }) It("validation should pass", func() { Expect(updatedSnrc.ValidateUpdate(originalSnrc)).To(Succeed()) @@ -216,7 +214,7 @@ func testValidCR(validationType string) { snrc.Spec.ApiCheckInterval = &metav1.Duration{Duration: 10*time.Second + 500*time.Millisecond} snrc.Spec.PeerUpdateInterval = &metav1.Duration{Duration: 10 * time.Second} snrc.Spec.CustomDsTolerations = []v1.Toleration{{Key: "validValue", Effect: v1.TaintEffectNoExecute}, {}, {Operator: v1.TolerationOpEqual, TolerationSeconds: pointer.Int64(-5)}, {Value: "SomeValidValue"}} - snrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "150"} + snrc.Status.MinSafeTimeToAssumeNodeRebootedSeconds = 150 if validationType == "update" { snrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = 160 } diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 311d9629c..cc53448b5 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -169,6 +169,13 @@ spec: status: description: SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig + properties: + minSafeTimeToAssumeNodeRebootedSeconds: + description: MinSafeTimeToAssumeNodeRebootedSeconds is the minimum + value that can be assigned to SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds, + it is calculated and assigned dynamically. + minimum: 0 + type: integer type: object type: object served: true diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 558b6e76a..e01d62877 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -167,6 +167,13 @@ spec: status: description: SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig + properties: + minSafeTimeToAssumeNodeRebootedSeconds: + description: MinSafeTimeToAssumeNodeRebootedSeconds is the minimum + value that can be assigned to SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds, + it is calculated and assigned dynamically. + minimum: 0 + type: integer type: object type: object served: true diff --git a/e2e/self_node_remediation_test.go b/e2e/self_node_remediation_test.go index 31b4722a7..a725c744e 100644 --- a/e2e/self_node_remediation_test.go +++ b/e2e/self_node_remediation_test.go @@ -39,18 +39,12 @@ const ( var _ = Describe("Self Node Remediation E2E", func() { Describe("Setup", func() { - It("Verify min reboot annotation", func() { + It("Verify min reboot status value", func() { configs := v1alpha1.SelfNodeRemediationConfigList{} Expect(k8sClient.List(context.Background(), &configs)).To(Succeed()) Expect(len(configs.Items)).To(Equal(1)) config := configs.Items[0] - annotations := config.GetAnnotations() - Expect(annotations).ToNot(BeNil()) - minSafeTimeAnnotation := "minimum-safe-reboot-sec.self-node-remediation.medik8s.io" - minSafeTimeValueString, isExist := annotations[minSafeTimeAnnotation] - Expect(isExist).To(BeTrue()) - minSafeTimeValue, err := strconv.Atoi(minSafeTimeValueString) - Expect(err).To(Succeed()) + minSafeTimeValue := config.Status.MinSafeTimeToAssumeNodeRebootedSeconds Expect(minSafeTimeValue).To(BeNumerically(">", 0)) safeTimeValue := config.Spec.SafeTimeToAssumeNodeRebootedSeconds Expect(safeTimeValue).To(BeNumerically(">=", minSafeTimeValue)) diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index b49bbc85c..8eaf5c358 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -3,14 +3,10 @@ package reboot import ( "context" "errors" - "fmt" - "strconv" "time" "github.com/go-logr/logr" - "github.com/medik8s/common/pkg/events" commonlabels "github.com/medik8s/common/pkg/labels" - pkgerrors "github.com/pkg/errors" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/labels" @@ -20,7 +16,6 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/medik8s/self-node-remediation/api/v1alpha1" - "github.com/medik8s/self-node-remediation/pkg/utils" "github.com/medik8s/self-node-remediation/pkg/watchdog" ) @@ -81,7 +76,7 @@ func (s *safeTimeCalculator) GetTimeToAssumeNodeRebooted() time.Duration { minTime := s.minTimeToAssumeNodeRebooted if !s.isAgent { //TODO mshitrit handle error - minTime, _ = s.getMinSafeTimeFromConfig() + minTime, _ = s.getMinSafeTimeSecFromConfig() } if s.timeToAssumeNodeRebooted < minTime { @@ -129,88 +124,54 @@ func (s *safeTimeCalculator) calcMinTimeAssumeRebooted() error { // 2. In case SafeTimeToAssumeNodeRebootedSeconds is too low (either because of the default configuration or because minvalue has changed due to an update of another field) it'll set it to a valid value and issue an event. func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Duration) error { minTimeSec := int(minTime.Seconds()) - //set it on configuration - confList := &v1alpha1.SelfNodeRemediationConfigList{} - if err := s.k8sClient.List(context.Background(), confList); err != nil { - s.log.Error(err, "failed to get snr configuration") - + config, err := s.getConfig() + if err != nil { return err } - - for _, snrConf := range confList.Items { - var isUpdateRequired bool - if snrConf.GetAnnotations() == nil { - snrConf.Annotations = make(map[string]string) - } - prevMinRebootTimeSec, isSet := snrConf.GetAnnotations()[utils.MinSafeTimeAnnotation] - - minTimeSecString := strconv.Itoa(minTimeSec) - if !isSet || prevMinRebootTimeSec != minTimeSecString { - if !isSet { - s.log.Info("snr agent about to modify config adding an annotation", "annotation key", utils.MinSafeTimeAnnotation, "annotation value", minTimeSecString) - } else { - s.log.Info("snr agent about to modify config updating annotation value", "annotation key", utils.MinSafeTimeAnnotation, "annotation previous value", prevMinRebootTimeSec, "annotation new value", minTimeSecString) - } - snrConf.GetAnnotations()[utils.MinSafeTimeAnnotation] = minTimeSecString - isUpdateRequired = true + orgConfig := config.DeepCopy() + prevMinRebootTimeSec := config.Status.MinSafeTimeToAssumeNodeRebootedSeconds + if prevMinRebootTimeSec != minTimeSec { + config.Status.MinSafeTimeToAssumeNodeRebootedSeconds = minTimeSec + if config.Spec.SafeTimeToAssumeNodeRebootedSeconds != 0 && minTimeSec > config.Spec.SafeTimeToAssumeNodeRebootedSeconds { + //TODO mshitrit add warning to status } - - if snrConf.Spec.SafeTimeToAssumeNodeRebootedSeconds < minTimeSec { - isUpdateRequired = true - //TODO mshitrit don't update the spec but create a warning in the status instead - snrConf.Spec.SafeTimeToAssumeNodeRebootedSeconds = minTimeSec - s.SetTimeToAssumeNodeRebooted(time.Duration(minTimeSec) * time.Second) - msg, reason := "Automatic update since value isn't valid anymore", "SafeTimeToAssumeNodeRebootedSecondsModified" - if !isSet { - msg = "Default value was lower than minimum value" - } else if prevMinRebootTimeSec != minTimeSecString { - msg = "Minimum value has changed and now is greater than configured value" - } - s.log.Info(fmt.Sprintf("%s:%s", reason, msg)) - events.NormalEvent(s.recorder, &snrConf, reason, msg) - } - - if isUpdateRequired { - if err := s.k8sClient.Update(context.Background(), &snrConf); err != nil { - s.log.Error(err, "failed to update SafeTimeToAssumeNodeRebootedSeconds with min value") - return err - } + if err := s.k8sClient.Status().Patch(context.Background(), config, client.MergeFrom(orgConfig)); err != nil { + return err } + //TODO mshitrit add event maybe logs } return nil } -func (s *safeTimeCalculator) getMinSafeTimeFromConfig() (time.Duration, error) { +func (s *safeTimeCalculator) getMinSafeTimeSecFromConfig() (time.Duration, error) { //TODO mshitrit add some logs + config, err := s.getConfig() + if err != nil { + return 0, err + } + + minTimeSec := config.Status.MinSafeTimeToAssumeNodeRebootedSeconds + if minTimeSec > 0 { + return time.Duration(minTimeSec) * time.Second, nil + } + + return 0, errors.New("failed getting MinSafeTimeToAssumeNodeRebootedSeconds, value isn't initialized") +} + +func (s *safeTimeCalculator) getConfig() (*v1alpha1.SelfNodeRemediationConfig, error) { confList := &v1alpha1.SelfNodeRemediationConfigList{} if err := s.k8sClient.List(context.Background(), confList); err != nil { s.log.Error(err, "failed to get snr configuration") - - return 0, err + return nil, err } var config v1alpha1.SelfNodeRemediationConfig if len(confList.Items) < 1 { - return 0, errors.New("SNR config not found") + return nil, errors.New("SNR config not found") } else { config = confList.Items[0] } - - if config.GetAnnotations() == nil { - return 0, errors.New("failed getting MinSafeTimeFromConfig config does not contain annotations") - } - - minTimeSecString, isFound := config.GetAnnotations()[utils.MinSafeTimeAnnotation] - if !isFound { - return 0, fmt.Errorf("failed getting MinSafeTimeFromConfig config does not contain %s annotation", utils.MinSafeTimeAnnotation) - } - - minTimeSec, err := strconv.Atoi(minTimeSecString) - if err != nil { - return 0, pkgerrors.Wrapf(err, "failed getting MinSafeTimeFromConfig") - } - return time.Duration(minTimeSec) * time.Second, nil - + return &config, nil } func (s *safeTimeCalculator) IsAgent() bool { diff --git a/pkg/utils/annotations.go b/pkg/utils/annotations.go index 8054959be..8e432a567 100644 --- a/pkg/utils/annotations.go +++ b/pkg/utils/annotations.go @@ -16,7 +16,6 @@ const ( // IsRebootCapableAnnotation value is the key name for the node's annotation that will determine if node is reboot capable IsRebootCapableAnnotation = "is-reboot-capable.self-node-remediation.medik8s.io" IsSoftwareRebootEnabledEnvVar = "IS_SOFTWARE_REBOOT_ENABLED" - MinSafeTimeAnnotation = "minimum-safe-reboot-sec.self-node-remediation.medik8s.io" ) // UpdateNodeWithIsRebootCapableAnnotation updates the is-reboot-capable node annotation to be true if any kind From df5d35aa05d817aaf4dd3d704265b7169bec4712 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Mon, 6 May 2024 14:48:20 +0300 Subject: [PATCH 09/21] Add/remove status warning in case min value is too low Signed-off-by: Michael Shitrit --- api/v1alpha1/selfnoderemediationconfig_types.go | 11 ++++++++--- api/v1alpha1/selfnoderemediationconfig_webhook.go | 1 - ...ation.medik8s.io_selfnoderemediationconfigs.yaml | 6 ++++++ ...ation.medik8s.io_selfnoderemediationconfigs.yaml | 6 ++++++ pkg/reboot/calculator.go | 13 +++++++++++-- 5 files changed, 31 insertions(+), 6 deletions(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index 3e2e018fd..7e5c15d2c 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -25,9 +25,10 @@ import ( // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. const ( - ConfigCRName = "self-node-remediation-config" - defaultWatchdogPath = "/dev/watchdog" - defaultIsSoftwareRebootEnabled = true + ConfigCRName = "self-node-remediation-config" + defaultWatchdogPath = "/dev/watchdog" + defaultIsSoftwareRebootEnabled = true + SafeTimeToAssumeNodeRebootedSecondsWarning = "using the calculated min value instead of SafeTimeToAssumeNodeRebootedSeconds Spec" ) // SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig @@ -132,6 +133,10 @@ type SelfNodeRemediationConfigStatus struct { // +optional // +kubebuilder:validation:Minimum=0 MinSafeTimeToAssumeNodeRebootedSeconds int `json:"minSafeTimeToAssumeNodeRebootedSeconds,omitempty"` + + // SpecNotUsedWarning field is used to indicate that SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds is overridden by SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds because SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds is greater than SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds. + // +optional + SpecNotUsedWarning string `json:"specNotUsedWarning,omitempty"` } //+kubebuilder:object:root=true diff --git a/api/v1alpha1/selfnoderemediationconfig_webhook.go b/api/v1alpha1/selfnoderemediationconfig_webhook.go index 22f600587..45a10079a 100644 --- a/api/v1alpha1/selfnoderemediationconfig_webhook.go +++ b/api/v1alpha1/selfnoderemediationconfig_webhook.go @@ -177,7 +177,6 @@ func validateToleration(toleration v1.Toleration) error { } func (r *SelfNodeRemediationConfig) validateMinRebootTime() error { - //TODO mshitrit remove status warning in case it's not relevant if r.Status.MinSafeTimeToAssumeNodeRebootedSeconds == 0 { return fmt.Errorf("failed to verify min value of SafeRebootTimeSec, Status.MinSafeTimeToAssumeNodeRebootedSeconds should not be empty") } diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index cc53448b5..df68dff79 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -176,6 +176,12 @@ spec: it is calculated and assigned dynamically. minimum: 0 type: integer + specNotUsedWarning: + description: SpecNotUsedWarning field is used to indicate that SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds + is overridden by SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds + because SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds + is greater than SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds. + type: string type: object type: object served: true diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index e01d62877..3652103bf 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -174,6 +174,12 @@ spec: it is calculated and assigned dynamically. minimum: 0 type: integer + specNotUsedWarning: + description: SpecNotUsedWarning field is used to indicate that SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds + is overridden by SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds + because SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds + is greater than SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds. + type: string type: object type: object served: true diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index 8eaf5c358..4feca96ea 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -3,6 +3,7 @@ package reboot import ( "context" "errors" + "reflect" "time" "github.com/go-logr/logr" @@ -133,13 +134,21 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Du if prevMinRebootTimeSec != minTimeSec { config.Status.MinSafeTimeToAssumeNodeRebootedSeconds = minTimeSec if config.Spec.SafeTimeToAssumeNodeRebootedSeconds != 0 && minTimeSec > config.Spec.SafeTimeToAssumeNodeRebootedSeconds { - //TODO mshitrit add warning to status + config.Status.SpecNotUsedWarning = v1alpha1.SafeTimeToAssumeNodeRebootedSecondsWarning } + + //TODO mshitrit add event maybe logs + } + //Spec is ok, we can remove the warning + if minTimeSec <= config.Spec.SafeTimeToAssumeNodeRebootedSeconds && len(config.Status.SpecNotUsedWarning) > 0 { + config.Status.SpecNotUsedWarning = "" + } + if !reflect.DeepEqual(config, orgConfig) { if err := s.k8sClient.Status().Patch(context.Background(), config, client.MergeFrom(orgConfig)); err != nil { return err } - //TODO mshitrit add event maybe logs } + return nil } From d200dce3a3e1ac3cc0f03848a4b31bb622833a69 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Mon, 6 May 2024 16:34:11 +0300 Subject: [PATCH 10/21] Some additional nits. - Make sure safe time can be deleted - Additional log and events - e2e fix Signed-off-by: Michael Shitrit --- .../selfnoderemediationconfig_types.go | 2 +- .../selfnoderemediationconfig_webhook.go | 5 +++ ...medik8s.io_selfnoderemediationconfigs.yaml | 2 +- ...medik8s.io_selfnoderemediationconfigs.yaml | 2 +- controllers/selfnoderemediation_controller.go | 13 ++++-- controllers/tests/config/suite_test.go | 4 +- controllers/tests/shared/shared.go | 4 +- e2e/self_node_remediation_test.go | 2 +- pkg/reboot/calculator.go | 43 +++++++++++-------- 9 files changed, 47 insertions(+), 30 deletions(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index 7e5c15d2c..4960dd3f4 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -46,7 +46,7 @@ type SelfNodeRemediationConfigSpec struct { // node will likely lead to data corruption and violation of run-once semantics. // In an effort to prevent this, the operator ignores values lower than a minimum calculated from the // ApiCheckInterval, ApiServerTimeout, MaxApiErrorThreshold, PeerDialTimeout, and PeerRequestTimeout fields. - // +kubebuilder:validation:Minimum=0 + // +kubebuilder:validation:Minimum=1 SafeTimeToAssumeNodeRebootedSeconds int `json:"safeTimeToAssumeNodeRebootedSeconds,omitempty"` // Valid time units are "ms", "s", "m", "h". diff --git a/api/v1alpha1/selfnoderemediationconfig_webhook.go b/api/v1alpha1/selfnoderemediationconfig_webhook.go index 45a10079a..b9a0a2c63 100644 --- a/api/v1alpha1/selfnoderemediationconfig_webhook.go +++ b/api/v1alpha1/selfnoderemediationconfig_webhook.go @@ -181,6 +181,11 @@ func (r *SelfNodeRemediationConfig) validateMinRebootTime() error { return fmt.Errorf("failed to verify min value of SafeRebootTimeSec, Status.MinSafeTimeToAssumeNodeRebootedSeconds should not be empty") } + //allow removing this optional field + if r.Spec.SafeTimeToAssumeNodeRebootedSeconds == 0 { + return nil + } + if r.Status.MinSafeTimeToAssumeNodeRebootedSeconds > r.Spec.SafeTimeToAssumeNodeRebootedSeconds { return fmt.Errorf("can not set SafeTimeToAssumeNodeRebootedSeconds value below the calculated minimum value of: %d", r.Status.MinSafeTimeToAssumeNodeRebootedSeconds) } diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index df68dff79..758a2c1dc 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -158,7 +158,7 @@ spec: node will likely lead to data corruption and violation of run-once semantics. In an effort to prevent this, the operator ignores values lower than a minimum calculated from the ApiCheckInterval, ApiServerTimeout, MaxApiErrorThreshold, PeerDialTimeout, and PeerRequestTimeout fields. - minimum: 0 + minimum: 1 type: integer watchdogFilePath: default: /dev/watchdog diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 3652103bf..b253bb839 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -156,7 +156,7 @@ spec: node will likely lead to data corruption and violation of run-once semantics. In an effort to prevent this, the operator ignores values lower than a minimum calculated from the ApiCheckInterval, ApiServerTimeout, MaxApiErrorThreshold, PeerDialTimeout, and PeerRequestTimeout fields. - minimum: 0 + minimum: 1 type: integer watchdogFilePath: default: /dev/watchdog diff --git a/controllers/selfnoderemediation_controller.go b/controllers/selfnoderemediation_controller.go index e65b8a7db..b441b1c18 100644 --- a/controllers/selfnoderemediation_controller.go +++ b/controllers/selfnoderemediation_controller.go @@ -401,7 +401,9 @@ func (r *SelfNodeRemediationReconciler) prepareReboot(node *v1.Node, snr *v1alph } if snr.Status.TimeAssumedRebooted.IsZero() { - r.updateTimeAssumedRebooted(node, snr) + if err := r.updateTimeAssumedRebooted(node, snr); err != nil { + return ctrl.Result{}, err + } } preRebootCompleted := string(preRebootCompletedPhase) @@ -625,12 +627,17 @@ func (r *SelfNodeRemediationReconciler) updateSnrStatus(ctx context.Context, snr return nil } -func (r *SelfNodeRemediationReconciler) updateTimeAssumedRebooted(node *v1.Node, snr *v1alpha1.SelfNodeRemediation) { +func (r *SelfNodeRemediationReconciler) updateTimeAssumedRebooted(node *v1.Node, snr *v1alpha1.SelfNodeRemediation) error { r.logger.Info("updating snr with node backup and updating time to assume node has been rebooted", "node name", node.Name) //we assume the unhealthy node will be rebooted by maxTimeNodeHasRebooted - maxTimeNodeHasRebooted := metav1.NewTime(metav1.Now().Add(r.SafeTimeCalculator.GetTimeToAssumeNodeRebooted())) + toAssumeNodeRebooted, err := r.SafeTimeCalculator.GetTimeToAssumeNodeRebooted() + if err != nil { + return err + } + maxTimeNodeHasRebooted := metav1.NewTime(metav1.Now().Add(toAssumeNodeRebooted)) snr.Status.TimeAssumedRebooted = &maxTimeNodeHasRebooted events.NormalEvent(r.Recorder, snr, eventReasonUpdateTimeAssumedRebooted, "Remediation process - about to update required fencing time on snr") + return nil } // getNodeFromSnr returns the unhealthy node reported in the given snr diff --git a/controllers/tests/config/suite_test.go b/controllers/tests/config/suite_test.go index 898f9a13b..00d1c229f 100644 --- a/controllers/tests/config/suite_test.go +++ b/controllers/tests/config/suite_test.go @@ -197,8 +197,8 @@ type mockCalculator struct { isAgent bool } -func (m *mockCalculator) GetTimeToAssumeNodeRebooted() time.Duration { - return m.mockTimeToAssumeNodeRebooted +func (m *mockCalculator) GetTimeToAssumeNodeRebooted() (time.Duration, error) { + return m.mockTimeToAssumeNodeRebooted, nil } func (m *mockCalculator) SetTimeToAssumeNodeRebooted(timeToAssumeNodeRebooted time.Duration) { diff --git a/controllers/tests/shared/shared.go b/controllers/tests/shared/shared.go index 282703874..471e6902f 100644 --- a/controllers/tests/shared/shared.go +++ b/controllers/tests/shared/shared.go @@ -42,8 +42,8 @@ func (kcw *K8sClientWrapper) List(ctx context.Context, list client.ObjectList, o return kcw.Client.List(ctx, list, opts...) } -func (m *MockCalculator) GetTimeToAssumeNodeRebooted() time.Duration { - return m.MockTimeToAssumeNodeRebooted +func (m *MockCalculator) GetTimeToAssumeNodeRebooted() (time.Duration, error) { + return m.MockTimeToAssumeNodeRebooted, nil } func (m *MockCalculator) SetTimeToAssumeNodeRebooted(timeToAssumeNodeRebooted time.Duration) { diff --git a/e2e/self_node_remediation_test.go b/e2e/self_node_remediation_test.go index a725c744e..ea30c8dfc 100644 --- a/e2e/self_node_remediation_test.go +++ b/e2e/self_node_remediation_test.go @@ -47,7 +47,7 @@ var _ = Describe("Self Node Remediation E2E", func() { minSafeTimeValue := config.Status.MinSafeTimeToAssumeNodeRebootedSeconds Expect(minSafeTimeValue).To(BeNumerically(">", 0)) safeTimeValue := config.Spec.SafeTimeToAssumeNodeRebootedSeconds - Expect(safeTimeValue).To(BeNumerically(">=", minSafeTimeValue)) + Expect(safeTimeValue).To(Equal(0)) }) }) diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index 4feca96ea..d6c724a94 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -7,6 +7,7 @@ import ( "time" "github.com/go-logr/logr" + "github.com/medik8s/common/pkg/events" commonlabels "github.com/medik8s/common/pkg/labels" v1 "k8s.io/api/core/v1" @@ -30,7 +31,7 @@ type SafeTimeCalculator interface { // GetTimeToAssumeNodeRebooted returns the safe time to assume node was already rebooted // note that this time must include the time for a unhealthy node without api-server access to reach the conclusion that it's unhealthy // this should be at least worst-case time to reach a conclusion from the other peers * request context timeout + watchdog interval + maxFailuresThreshold * reconcileInterval + padding - GetTimeToAssumeNodeRebooted() time.Duration + GetTimeToAssumeNodeRebooted() (time.Duration, error) SetTimeToAssumeNodeRebooted(time.Duration) Start(ctx context.Context) error //IsAgent return true in case running on an agent pod (responsible for reboot) or false in case running on a manager pod @@ -73,17 +74,19 @@ func NewManagerSafeTimeCalculator(k8sClient client.Client) SafeTimeCalculator { } } -func (s *safeTimeCalculator) GetTimeToAssumeNodeRebooted() time.Duration { +func (s *safeTimeCalculator) GetTimeToAssumeNodeRebooted() (time.Duration, error) { minTime := s.minTimeToAssumeNodeRebooted + var err error if !s.isAgent { - //TODO mshitrit handle error - minTime, _ = s.getMinSafeTimeSecFromConfig() + if minTime, err = s.getMinSafeTimeSecFromConfig(); err != nil { + return 0, err + } } if s.timeToAssumeNodeRebooted < minTime { - return minTime + return minTime, nil } - return s.timeToAssumeNodeRebooted + return s.timeToAssumeNodeRebooted, nil } func (s *safeTimeCalculator) SetTimeToAssumeNodeRebooted(timeToAssumeNodeRebooted time.Duration) { @@ -121,8 +124,8 @@ func (s *safeTimeCalculator) calcMinTimeAssumeRebooted() error { } // manageSafeRebootTimeInConfiguration does two things: -// 1. It sets an annotation on the configuration with the most updated value of minTimeToAssumeNodeRebooted in seconds (this value will be used by the webhook to verify user doesn't set an invalid value for SafeTimeToAssumeNodeRebootedSeconds). -// 2. In case SafeTimeToAssumeNodeRebootedSeconds is too low (either because of the default configuration or because minvalue has changed due to an update of another field) it'll set it to a valid value and issue an event. +// 1. It sets Status.MinSafeTimeToAssumeNodeRebootedSeconds in case it's changed by latest calculation. +// 2. It Adds/removes config.Status.SpecNotUsedWarning if necessary. func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Duration) error { minTimeSec := int(minTime.Seconds()) config, err := s.getConfig() @@ -133,16 +136,15 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Du prevMinRebootTimeSec := config.Status.MinSafeTimeToAssumeNodeRebootedSeconds if prevMinRebootTimeSec != minTimeSec { config.Status.MinSafeTimeToAssumeNodeRebootedSeconds = minTimeSec - if config.Spec.SafeTimeToAssumeNodeRebootedSeconds != 0 && minTimeSec > config.Spec.SafeTimeToAssumeNodeRebootedSeconds { - config.Status.SpecNotUsedWarning = v1alpha1.SafeTimeToAssumeNodeRebootedSecondsWarning - } - - //TODO mshitrit add event maybe logs } - //Spec is ok, we can remove the warning - if minTimeSec <= config.Spec.SafeTimeToAssumeNodeRebootedSeconds && len(config.Status.SpecNotUsedWarning) > 0 { - config.Status.SpecNotUsedWarning = "" + //Manage warning + config.Status.SpecNotUsedWarning = "" + if config.Spec.SafeTimeToAssumeNodeRebootedSeconds > 0 && minTimeSec > config.Spec.SafeTimeToAssumeNodeRebootedSeconds { + config.Status.SpecNotUsedWarning = v1alpha1.SafeTimeToAssumeNodeRebootedSecondsWarning + s.log.Info("SafeTimeToAssumeNodeRebootedSeconds is overridden by calculated value because it's invalid, calculated value stored in Status.MinSafeTimeToAssumeNodeRebootedSeconds would be used instead") + events.WarningEvent(s.recorder, config, "SafeTimeToAssumeNodeRebootedSecondsInvalid", "SafeTimeToAssumeNodeRebootedSeconds is overridden by calculated value") } + if !reflect.DeepEqual(config, orgConfig) { if err := s.k8sClient.Status().Patch(context.Background(), config, client.MergeFrom(orgConfig)); err != nil { return err @@ -153,7 +155,6 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Du } func (s *safeTimeCalculator) getMinSafeTimeSecFromConfig() (time.Duration, error) { - //TODO mshitrit add some logs config, err := s.getConfig() if err != nil { return 0, err @@ -164,7 +165,9 @@ func (s *safeTimeCalculator) getMinSafeTimeSecFromConfig() (time.Duration, error return time.Duration(minTimeSec) * time.Second, nil } - return 0, errors.New("failed getting MinSafeTimeToAssumeNodeRebootedSeconds, value isn't initialized") + err = errors.New("failed getting MinSafeTimeToAssumeNodeRebootedSeconds, value isn't initialized") + s.log.Error(err, "MinSafeTimeToAssumeNodeRebootedSeconds should not be empty") + return 0, err } func (s *safeTimeCalculator) getConfig() (*v1alpha1.SelfNodeRemediationConfig, error) { @@ -176,7 +179,9 @@ func (s *safeTimeCalculator) getConfig() (*v1alpha1.SelfNodeRemediationConfig, e var config v1alpha1.SelfNodeRemediationConfig if len(confList.Items) < 1 { - return nil, errors.New("SNR config not found") + err := errors.New("SNR config not found") + s.log.Error(err, "failed to get snr configuration") + return nil, err } else { config = confList.Items[0] } From d16195a1d218e20a5b30dfef09b6ba5bdf0d4b7b Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Wed, 8 May 2024 14:58:28 +0300 Subject: [PATCH 11/21] update status field warning name Signed-off-by: Michael Shitrit --- api/v1alpha1/selfnoderemediationconfig_types.go | 4 ++-- ...e-remediation.medik8s.io_selfnoderemediationconfigs.yaml | 5 +++-- ...e-remediation.medik8s.io_selfnoderemediationconfigs.yaml | 5 +++-- pkg/reboot/calculator.go | 6 +++--- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index 4960dd3f4..f6907ee35 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -134,9 +134,9 @@ type SelfNodeRemediationConfigStatus struct { // +kubebuilder:validation:Minimum=0 MinSafeTimeToAssumeNodeRebootedSeconds int `json:"minSafeTimeToAssumeNodeRebootedSeconds,omitempty"` - // SpecNotUsedWarning field is used to indicate that SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds is overridden by SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds because SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds is greater than SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds. + // SpecSafeTimeOverriddenWarning field is used to indicate that SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds is overridden by SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds because SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds is greater than SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds. // +optional - SpecNotUsedWarning string `json:"specNotUsedWarning,omitempty"` + SpecSafeTimeOverriddenWarning string `json:"specSafeTimeOverriddenWarning,omitempty"` } //+kubebuilder:object:root=true diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 758a2c1dc..5ef27680f 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -176,8 +176,9 @@ spec: it is calculated and assigned dynamically. minimum: 0 type: integer - specNotUsedWarning: - description: SpecNotUsedWarning field is used to indicate that SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds + specSafeTimeOverriddenWarning: + description: SpecSafeTimeOverriddenWarning field is used to indicate + that SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds is overridden by SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds because SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds is greater than SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds. diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index b253bb839..8191c8a0a 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -174,8 +174,9 @@ spec: it is calculated and assigned dynamically. minimum: 0 type: integer - specNotUsedWarning: - description: SpecNotUsedWarning field is used to indicate that SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds + specSafeTimeOverriddenWarning: + description: SpecSafeTimeOverriddenWarning field is used to indicate + that SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds is overridden by SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds because SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds is greater than SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds. diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index d6c724a94..ff798485b 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -125,7 +125,7 @@ func (s *safeTimeCalculator) calcMinTimeAssumeRebooted() error { // manageSafeRebootTimeInConfiguration does two things: // 1. It sets Status.MinSafeTimeToAssumeNodeRebootedSeconds in case it's changed by latest calculation. -// 2. It Adds/removes config.Status.SpecNotUsedWarning if necessary. +// 2. It Adds/removes config.Status.SpecSafeTimeOverriddenWarning if necessary. func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Duration) error { minTimeSec := int(minTime.Seconds()) config, err := s.getConfig() @@ -138,9 +138,9 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Du config.Status.MinSafeTimeToAssumeNodeRebootedSeconds = minTimeSec } //Manage warning - config.Status.SpecNotUsedWarning = "" + config.Status.SpecSafeTimeOverriddenWarning = "" if config.Spec.SafeTimeToAssumeNodeRebootedSeconds > 0 && minTimeSec > config.Spec.SafeTimeToAssumeNodeRebootedSeconds { - config.Status.SpecNotUsedWarning = v1alpha1.SafeTimeToAssumeNodeRebootedSecondsWarning + config.Status.SpecSafeTimeOverriddenWarning = v1alpha1.SafeTimeToAssumeNodeRebootedSecondsWarning s.log.Info("SafeTimeToAssumeNodeRebootedSeconds is overridden by calculated value because it's invalid, calculated value stored in Status.MinSafeTimeToAssumeNodeRebootedSeconds would be used instead") events.WarningEvent(s.recorder, config, "SafeTimeToAssumeNodeRebootedSecondsInvalid", "SafeTimeToAssumeNodeRebootedSeconds is overridden by calculated value") } From 91965756596ff037aa98cd12bada8f561d56ad03 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Sun, 19 May 2024 14:18:11 +0300 Subject: [PATCH 12/21] remove SetTimeToAssumeNodeRebooted Signed-off-by: Michael Shitrit --- .../selfnoderemediationconfig_controller.go | 14 +++---- ...lfnoderemediationconfig_controller_test.go | 30 ++++++++------ controllers/tests/config/suite_test.go | 39 ++++--------------- controllers/tests/controller/suite_test.go | 11 +++--- main.go | 13 +++---- pkg/reboot/calculator.go | 22 ++++------- 6 files changed, 50 insertions(+), 79 deletions(-) diff --git a/controllers/selfnoderemediationconfig_controller.go b/controllers/selfnoderemediationconfig_controller.go index e90aa72f4..9b6e1f840 100644 --- a/controllers/selfnoderemediationconfig_controller.go +++ b/controllers/selfnoderemediationconfig_controller.go @@ -38,7 +38,6 @@ import ( selfnoderemediationv1alpha1 "github.com/medik8s/self-node-remediation/api/v1alpha1" "github.com/medik8s/self-node-remediation/pkg/apply" "github.com/medik8s/self-node-remediation/pkg/certificates" - "github.com/medik8s/self-node-remediation/pkg/reboot" "github.com/medik8s/self-node-remediation/pkg/render" ) @@ -49,12 +48,11 @@ const ( // SelfNodeRemediationConfigReconciler reconciles a SelfNodeRemediationConfig object type SelfNodeRemediationConfigReconciler struct { client.Client - Log logr.Logger - Scheme *runtime.Scheme - InstallFileFolder string - DefaultPpcCreator func(c client.Client) error - Namespace string - ManagerSafeTimeCalculator reboot.SafeTimeCalculator + Log logr.Logger + Scheme *runtime.Scheme + InstallFileFolder string + DefaultPpcCreator func(c client.Client) error + Namespace string } //+kubebuilder:rbac:groups=self-node-remediation.medik8s.io,resources=selfnoderemediationconfigs,verbs=get;list;watch;create;update;patch;delete @@ -100,8 +98,6 @@ func (r *SelfNodeRemediationConfigReconciler) Reconcile(ctx context.Context, req return ctrl.Result{}, err } - //sync manager reconciler - r.ManagerSafeTimeCalculator.SetTimeToAssumeNodeRebooted(time.Duration(config.Spec.SafeTimeToAssumeNodeRebootedSeconds) * time.Second) return ctrl.Result{}, nil } diff --git a/controllers/tests/config/selfnoderemediationconfig_controller_test.go b/controllers/tests/config/selfnoderemediationconfig_controller_test.go index b883ac383..e559d459d 100644 --- a/controllers/tests/config/selfnoderemediationconfig_controller_test.go +++ b/controllers/tests/config/selfnoderemediationconfig_controller_test.go @@ -11,6 +11,7 @@ import ( appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" @@ -27,6 +28,9 @@ var _ = Describe("SNR Config Test", func() { BeforeEach(func() { ds = &appsv1.DaemonSet{} + ds.Name = dsName + ds.Namespace = shared.Namespace + _ = os.Setenv("SELF_NODE_REMEDIATION_IMAGE", dummySelfNodeRemediationImage) config = &selfnoderemediationv1alpha1.SelfNodeRemediationConfig{} config.Kind = "SelfNodeRemediationConfig" @@ -46,10 +50,19 @@ var _ = Describe("SNR Config Test", func() { } JustBeforeEach(func() { + minSafeTimeToAssumeNodeRebootedSeconds := config.Status.MinSafeTimeToAssumeNodeRebootedSeconds Expect(k8sClient.Create(context.Background(), config)).To(Succeed()) + if minSafeTimeToAssumeNodeRebootedSeconds > 0 { + config.Status.MinSafeTimeToAssumeNodeRebootedSeconds = minSafeTimeToAssumeNodeRebootedSeconds + time.Sleep(time.Second) + Expect(k8sClient.Status().Update(context.Background(), config)).To(Succeed()) + } + DeferCleanup(func() { Expect(k8sClient.Delete(context.Background(), config)).To(Succeed()) - k8sClient.Delete(context.Background(), ds) + if err := k8sClient.Delete(context.Background(), ds); err != nil { + Expect(apierrors.IsNotFound(err)).To(BeTrue()) + } }) }) @@ -148,19 +161,12 @@ var _ = Describe("SNR Config Test", func() { }) When("SafeTimeToAssumeNodeRebootedSeconds is modified in Configuration", func() { BeforeEach(func() { - Expect(managerReconciler.SafeTimeCalculator.GetTimeToAssumeNodeRebooted()).ToNot(Equal(time.Minute)) - config.Spec.SafeTimeToAssumeNodeRebootedSeconds = 60 + config.Spec.SafeTimeToAssumeNodeRebootedSeconds = 65 + config.Status.MinSafeTimeToAssumeNodeRebootedSeconds = 50 }) - It("The Manager Reconciler and the DS should be modified with the new value", func() { + It("SafeTimeCalculator of the manager gets the correct TimeToAssumeNodeRebooted value from the configuration", func() { Eventually(func(g Gomega) bool { - ds = &appsv1.DaemonSet{} - g.Expect(k8sClient.Get(context.Background(), key, ds)).To(Succeed()) - dsContainers := ds.Spec.Template.Spec.Containers - g.Expect(len(dsContainers)).To(BeNumerically("==", 1)) - container := dsContainers[0] - envVars := getEnvVarMap(container.Env) - g.Expect(envVars["TIME_TO_ASSUME_NODE_REBOOTED"].Value).To(Equal("60")) - g.Expect(managerReconciler.SafeTimeCalculator.GetTimeToAssumeNodeRebooted()).To(Equal(time.Minute)) + g.Expect(managerReconciler.SafeTimeCalculator.GetTimeToAssumeNodeRebooted()).To(Equal(time.Second * 65)) return true }, 10*time.Second, 250*time.Millisecond).Should(BeTrue()) }) diff --git a/controllers/tests/config/suite_test.go b/controllers/tests/config/suite_test.go index 00d1c229f..fe442837f 100644 --- a/controllers/tests/config/suite_test.go +++ b/controllers/tests/config/suite_test.go @@ -39,6 +39,7 @@ import ( "github.com/medik8s/self-node-remediation/pkg/apicheck" "github.com/medik8s/self-node-remediation/pkg/certificates" "github.com/medik8s/self-node-remediation/pkg/peers" + "github.com/medik8s/self-node-remediation/pkg/reboot" //+kubebuilder:scaffold:imports ) @@ -95,14 +96,12 @@ var _ = BeforeSuite(func() { Expect(k8sClient.Create(context.Background(), nsToCreate)).To(Succeed()) Expect(err).ToNot(HaveOccurred()) - mockManagerCalculator := &mockCalculator{isAgent: false} err = (&controllers.SelfNodeRemediationConfigReconciler{ - Client: k8sManager.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("self-node-remediation-config-controller"), - InstallFileFolder: "../../../install/", - Scheme: scheme.Scheme, - Namespace: shared.Namespace, - ManagerSafeTimeCalculator: mockManagerCalculator, + Client: k8sManager.GetClient(), + Log: ctrl.Log.WithName("controllers").WithName("self-node-remediation-config-controller"), + InstallFileFolder: "../../../install/", + Scheme: scheme.Scheme, + Namespace: shared.Namespace, }).SetupWithManager(k8sManager) // peers need their own node on start @@ -132,7 +131,7 @@ var _ = BeforeSuite(func() { Expect(err).ToNot(HaveOccurred()) restoreNodeAfter := 5 * time.Second - mockAgentCalculator := &mockCalculator{isAgent: true} + mockAgentCalculator := reboot.NewAgentSafeTimeCalculator(k8sClient, nil, nil, 0, 0, 0, 0, 0, 0) // reconciler for unhealthy node err = (&controllers.SelfNodeRemediationReconciler{ Client: k8sClient, @@ -159,7 +158,7 @@ var _ = BeforeSuite(func() { Log: ctrl.Log.WithName("controllers").WithName("self-node-remediation-controller").WithName("manager node"), MyNodeName: shared.PeerNodeName, RestoreNodeAfter: restoreNodeAfter, - SafeTimeCalculator: mockManagerCalculator, + SafeTimeCalculator: reboot.NewManagerSafeTimeCalculator(k8sClient), } err = managerReconciler.SetupWithManager(k8sManager) Expect(err).ToNot(HaveOccurred()) @@ -191,25 +190,3 @@ var _ = AfterSuite(func() { Expect(err).NotTo(HaveOccurred()) }) - -type mockCalculator struct { - mockTimeToAssumeNodeRebooted time.Duration - isAgent bool -} - -func (m *mockCalculator) GetTimeToAssumeNodeRebooted() (time.Duration, error) { - return m.mockTimeToAssumeNodeRebooted, nil -} - -func (m *mockCalculator) SetTimeToAssumeNodeRebooted(timeToAssumeNodeRebooted time.Duration) { - m.mockTimeToAssumeNodeRebooted = timeToAssumeNodeRebooted -} - -func (m *mockCalculator) IsAgent() bool { - return m.isAgent -} - -//goland:noinspection GoUnusedParameter -func (m *mockCalculator) Start(ctx context.Context) error { - return nil -} diff --git a/controllers/tests/controller/suite_test.go b/controllers/tests/controller/suite_test.go index ed8f122f2..6a7f14d52 100644 --- a/controllers/tests/controller/suite_test.go +++ b/controllers/tests/controller/suite_test.go @@ -128,12 +128,11 @@ var _ = BeforeSuite(func() { timeToAssumeNodeRebooted += 5 * time.Second mockManagerCalculator := &shared.MockCalculator{MockTimeToAssumeNodeRebooted: timeToAssumeNodeRebooted, IsAgentVar: false} err = (&controllers.SelfNodeRemediationConfigReconciler{ - Client: k8sManager.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("self-node-remediation-config-controller"), - InstallFileFolder: "../../../install/", - Scheme: testScheme, - Namespace: shared.Namespace, - ManagerSafeTimeCalculator: mockManagerCalculator, + Client: k8sManager.GetClient(), + Log: ctrl.Log.WithName("controllers").WithName("self-node-remediation-config-controller"), + InstallFileFolder: "../../../install/", + Scheme: testScheme, + Namespace: shared.Namespace, }).SetupWithManager(k8sManager) // peers need their own node on start diff --git a/main.go b/main.go index bd61c87ad..c9fa762ee 100644 --- a/main.go +++ b/main.go @@ -187,13 +187,12 @@ func initSelfNodeRemediationManager(mgr manager.Manager, enableHTTP2 bool) { } if err := (&controllers.SelfNodeRemediationConfigReconciler{ - Client: mgr.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("SelfNodeRemediationConfig"), - Scheme: mgr.GetScheme(), - InstallFileFolder: "./install", - DefaultPpcCreator: snrconfighelper.NewConfigIfNotExist, - Namespace: ns, - ManagerSafeTimeCalculator: safeRebootCalc, + Client: mgr.GetClient(), + Log: ctrl.Log.WithName("controllers").WithName("SelfNodeRemediationConfig"), + Scheme: mgr.GetScheme(), + InstallFileFolder: "./install", + DefaultPpcCreator: snrconfighelper.NewConfigIfNotExist, + Namespace: ns, }).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "SelfNodeRemediationConfig") os.Exit(1) diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index ff798485b..769c8c27f 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -32,7 +32,6 @@ type SafeTimeCalculator interface { // note that this time must include the time for a unhealthy node without api-server access to reach the conclusion that it's unhealthy // this should be at least worst-case time to reach a conclusion from the other peers * request context timeout + watchdog interval + maxFailuresThreshold * reconcileInterval + padding GetTimeToAssumeNodeRebooted() (time.Duration, error) - SetTimeToAssumeNodeRebooted(time.Duration) Start(ctx context.Context) error //IsAgent return true in case running on an agent pod (responsible for reboot) or false in case running on a manager pod IsAgent() bool @@ -76,11 +75,15 @@ func NewManagerSafeTimeCalculator(k8sClient client.Client) SafeTimeCalculator { func (s *safeTimeCalculator) GetTimeToAssumeNodeRebooted() (time.Duration, error) { minTime := s.minTimeToAssumeNodeRebooted - var err error if !s.isAgent { - if minTime, err = s.getMinSafeTimeSecFromConfig(); err != nil { + config, err := s.getConfig() + if err != nil { return 0, err } + if minTime, err = s.getMinSafeTimeSecFromConfig(config); err != nil { + return 0, err + } + s.timeToAssumeNodeRebooted = time.Duration(config.Spec.SafeTimeToAssumeNodeRebootedSeconds) * time.Second } if s.timeToAssumeNodeRebooted < minTime { @@ -89,10 +92,6 @@ func (s *safeTimeCalculator) GetTimeToAssumeNodeRebooted() (time.Duration, error return s.timeToAssumeNodeRebooted, nil } -func (s *safeTimeCalculator) SetTimeToAssumeNodeRebooted(timeToAssumeNodeRebooted time.Duration) { - s.timeToAssumeNodeRebooted = timeToAssumeNodeRebooted -} - func (s *safeTimeCalculator) Start(_ context.Context) error { return s.calcMinTimeAssumeRebooted() } @@ -154,18 +153,13 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Du return nil } -func (s *safeTimeCalculator) getMinSafeTimeSecFromConfig() (time.Duration, error) { - config, err := s.getConfig() - if err != nil { - return 0, err - } - +func (s *safeTimeCalculator) getMinSafeTimeSecFromConfig(config *v1alpha1.SelfNodeRemediationConfig) (time.Duration, error) { minTimeSec := config.Status.MinSafeTimeToAssumeNodeRebootedSeconds if minTimeSec > 0 { return time.Duration(minTimeSec) * time.Second, nil } - err = errors.New("failed getting MinSafeTimeToAssumeNodeRebootedSeconds, value isn't initialized") + err := errors.New("failed getting MinSafeTimeToAssumeNodeRebootedSeconds, value isn't initialized") s.log.Error(err, "MinSafeTimeToAssumeNodeRebootedSeconds should not be empty") return 0, err } From 0ca7523d0062327d4cb744e11c0959b37aad323e Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Mon, 20 May 2024 10:56:40 +0300 Subject: [PATCH 13/21] remove empty line - fixing generated crd description Signed-off-by: Michael Shitrit --- api/v1alpha1/selfnoderemediationconfig_types.go | 1 - ...f-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml | 1 + ...f-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml | 1 + 3 files changed, 2 insertions(+), 1 deletion(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index f6907ee35..c7628a671 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -69,7 +69,6 @@ type SelfNodeRemediationConfigSpec struct { // +optional // +kubebuilder:default:="15m" // +kubebuilder:validation:Pattern="^(0|([0-9]+(\\.[0-9]+)?(ms|s|m|h)))$|^([1-9][0-9]*m0s)$" - // +kubebuilder:validation:Type:=string PeerUpdateInterval *metav1.Duration `json:"peerUpdateInterval,omitempty"` diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 5ef27680f..9b2467c89 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -148,6 +148,7 @@ spec: type: string peerUpdateInterval: default: 15m + description: Valid time units are "ms", "s", "m", "h". pattern: ^(0|([0-9]+(\.[0-9]+)?(ms|s|m|h)))$|^([1-9][0-9]*m0s)$ type: string safeTimeToAssumeNodeRebootedSeconds: diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 8191c8a0a..b491093f5 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -146,6 +146,7 @@ spec: type: string peerUpdateInterval: default: 15m + description: Valid time units are "ms", "s", "m", "h". pattern: ^(0|([0-9]+(\.[0-9]+)?(ms|s|m|h)))$|^([1-9][0-9]*m0s)$ type: string safeTimeToAssumeNodeRebootedSeconds: From 8bdbfa92bb54d3fdd26cef28619b63500257cd7f Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Mon, 20 May 2024 11:23:03 +0300 Subject: [PATCH 14/21] replace status warning with condition Signed-off-by: Michael Shitrit --- .../selfnoderemediationconfig_types.go | 18 ++-- api/v1alpha1/zz_generated.deepcopy.go | 9 +- ...ode-remediation.clusterserviceversion.yaml | 7 ++ ...medik8s.io_selfnoderemediationconfigs.yaml | 82 +++++++++++++++++-- ...medik8s.io_selfnoderemediationconfigs.yaml | 82 +++++++++++++++++-- ...ode-remediation.clusterserviceversion.yaml | 7 ++ pkg/reboot/calculator.go | 17 +++- 7 files changed, 198 insertions(+), 24 deletions(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index c7628a671..032370418 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -25,10 +25,12 @@ import ( // NOTE: json tags are required. Any new fields you add must have json tags for the fields to be serialized. const ( - ConfigCRName = "self-node-remediation-config" - defaultWatchdogPath = "/dev/watchdog" - defaultIsSoftwareRebootEnabled = true - SafeTimeToAssumeNodeRebootedSecondsWarning = "using the calculated min value instead of SafeTimeToAssumeNodeRebootedSeconds Spec" + ConfigCRName = "self-node-remediation-config" + defaultWatchdogPath = "/dev/watchdog" + defaultIsSoftwareRebootEnabled = true + + // SafeTimeToAssumeNodeRebootedOverriddenConditionType is the condition type used to signal whether SNR is overriding SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds with SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds + SafeTimeToAssumeNodeRebootedOverriddenConditionType = "SafeTimeToAssumeNodeRebootedOverridden" ) // SelfNodeRemediationConfigSpec defines the desired state of SelfNodeRemediationConfig @@ -133,9 +135,13 @@ type SelfNodeRemediationConfigStatus struct { // +kubebuilder:validation:Minimum=0 MinSafeTimeToAssumeNodeRebootedSeconds int `json:"minSafeTimeToAssumeNodeRebootedSeconds,omitempty"` - // SpecSafeTimeOverriddenWarning field is used to indicate that SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds is overridden by SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds because SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds is greater than SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds. + // +operator-sdk:csv:customresourcedefinitions:type=status,displayName="conditions",xDescriptors="urn:alm:descriptor:com.tectonic.ui:conditions" + // Represents the observations of a SelfNodeRemediationConfig's current state. + // Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden" + // +listType=map + // +listMapKey=type // +optional - SpecSafeTimeOverriddenWarning string `json:"specSafeTimeOverriddenWarning,omitempty"` + Conditions []metav1.Condition `json:"conditions,omitempty"` } //+kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 4c18a2e7c..451c29128 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -59,7 +59,7 @@ func (in *SelfNodeRemediationConfig) DeepCopyInto(out *SelfNodeRemediationConfig out.TypeMeta = in.TypeMeta in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) in.Spec.DeepCopyInto(&out.Spec) - out.Status = in.Status + in.Status.DeepCopyInto(&out.Status) } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SelfNodeRemediationConfig. @@ -167,6 +167,13 @@ func (in *SelfNodeRemediationConfigSpec) DeepCopy() *SelfNodeRemediationConfigSp // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SelfNodeRemediationConfigStatus) DeepCopyInto(out *SelfNodeRemediationConfigStatus) { *out = *in + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SelfNodeRemediationConfigStatus. diff --git a/bundle/manifests/self-node-remediation.clusterserviceversion.yaml b/bundle/manifests/self-node-remediation.clusterserviceversion.yaml index 7e9d4f852..2b33d1491 100644 --- a/bundle/manifests/self-node-remediation.clusterserviceversion.yaml +++ b/bundle/manifests/self-node-remediation.clusterserviceversion.yaml @@ -62,6 +62,13 @@ spec: - kind: SelfNodeRemediationConfig name: selfnoderemediationconfigs version: v1alpha1 + statusDescriptors: + - description: 'Represents the observations of a SelfNodeRemediationConfig''s + current state. Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden"' + displayName: conditions + path: conditions + x-descriptors: + - urn:alm:descriptor:com.tectonic.ui:conditions version: v1alpha1 - description: SelfNodeRemediation is the Schema for the selfnoderemediations API diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 9b2467c89..a567c039e 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -171,19 +171,87 @@ spec: description: SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig properties: + conditions: + description: |- + Represents the observations of a SelfNodeRemediationConfig's current state. + Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden" + items: + description: "Condition contains details for one aspect of the current + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map minSafeTimeToAssumeNodeRebootedSeconds: description: MinSafeTimeToAssumeNodeRebootedSeconds is the minimum value that can be assigned to SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds, it is calculated and assigned dynamically. minimum: 0 type: integer - specSafeTimeOverriddenWarning: - description: SpecSafeTimeOverriddenWarning field is used to indicate - that SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds - is overridden by SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds - because SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds - is greater than SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds. - type: string type: object type: object served: true diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index b491093f5..c063d6c92 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -169,19 +169,87 @@ spec: description: SelfNodeRemediationConfigStatus defines the observed state of SelfNodeRemediationConfig properties: + conditions: + description: |- + Represents the observations of a SelfNodeRemediationConfig's current state. + Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden" + items: + description: "Condition contains details for one aspect of the current + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map minSafeTimeToAssumeNodeRebootedSeconds: description: MinSafeTimeToAssumeNodeRebootedSeconds is the minimum value that can be assigned to SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds, it is calculated and assigned dynamically. minimum: 0 type: integer - specSafeTimeOverriddenWarning: - description: SpecSafeTimeOverriddenWarning field is used to indicate - that SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds - is overridden by SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds - because SelfNodeRemediationConfigStatus.MinSafeTimeToAssumeNodeRebootedSeconds - is greater than SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds. - type: string type: object type: object served: true diff --git a/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml b/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml index d22559b9f..8b5f2ebf9 100644 --- a/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml +++ b/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml @@ -29,6 +29,13 @@ spec: - kind: SelfNodeRemediationConfig name: selfnoderemediationconfigs version: v1alpha1 + statusDescriptors: + - description: 'Represents the observations of a SelfNodeRemediationConfig''s + current state. Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden"' + displayName: conditions + path: conditions + x-descriptors: + - urn:alm:descriptor:com.tectonic.ui:conditions version: v1alpha1 - description: SelfNodeRemediation is the Schema for the selfnoderemediations API diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index 769c8c27f..6a4ec38b3 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -11,6 +11,8 @@ import ( commonlabels "github.com/medik8s/common/pkg/labels" v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" "k8s.io/apimachinery/pkg/selection" "k8s.io/client-go/tools/record" @@ -136,10 +138,19 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Du if prevMinRebootTimeSec != minTimeSec { config.Status.MinSafeTimeToAssumeNodeRebootedSeconds = minTimeSec } - //Manage warning - config.Status.SpecSafeTimeOverriddenWarning = "" + //Manage condition + meta.SetStatusCondition(&config.Status.Conditions, metav1.Condition{ + Type: v1alpha1.SafeTimeToAssumeNodeRebootedOverriddenConditionType, + Status: metav1.ConditionFalse, + Reason: "SafeTimeToAssumeNodeRebootedSeconds is valid or empty", + }) if config.Spec.SafeTimeToAssumeNodeRebootedSeconds > 0 && minTimeSec > config.Spec.SafeTimeToAssumeNodeRebootedSeconds { - config.Status.SpecSafeTimeOverriddenWarning = v1alpha1.SafeTimeToAssumeNodeRebootedSecondsWarning + meta.SetStatusCondition(&config.Status.Conditions, metav1.Condition{ + Type: v1alpha1.SafeTimeToAssumeNodeRebootedOverriddenConditionType, + Status: metav1.ConditionTrue, + Reason: "SafeTimeToAssumeNodeRebootedSeconds is overridden by calculated value because it's invalid", + }) + s.log.Info("SafeTimeToAssumeNodeRebootedSeconds is overridden by calculated value because it's invalid, calculated value stored in Status.MinSafeTimeToAssumeNodeRebootedSeconds would be used instead") events.WarningEvent(s.recorder, config, "SafeTimeToAssumeNodeRebootedSecondsInvalid", "SafeTimeToAssumeNodeRebootedSeconds is overridden by calculated value") } From 6d05fa36886141953b3f394ff750c557b60dce89 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Mon, 20 May 2024 14:36:23 +0300 Subject: [PATCH 15/21] remove webhook validation of SafeTime empty when config is created Signed-off-by: Michael Shitrit --- api/v1alpha1/selfnoderemediationconfig_webhook.go | 8 -------- api/v1alpha1/selfnoderemediationconfig_webhook_test.go | 5 ----- 2 files changed, 13 deletions(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_webhook.go b/api/v1alpha1/selfnoderemediationconfig_webhook.go index b9a0a2c63..f6afb5962 100644 --- a/api/v1alpha1/selfnoderemediationconfig_webhook.go +++ b/api/v1alpha1/selfnoderemediationconfig_webhook.go @@ -75,7 +75,6 @@ func (r *SelfNodeRemediationConfig) ValidateCreate() error { return errors.NewAggregate([]error{ r.validateTimes(), r.validateCustomTolerations(), - r.validateSafeTimeEmpty(), }) } @@ -191,10 +190,3 @@ func (r *SelfNodeRemediationConfig) validateMinRebootTime() error { } return nil } - -func (r *SelfNodeRemediationConfig) validateSafeTimeEmpty() error { - if r.Spec.SafeTimeToAssumeNodeRebootedSeconds > 0 { - return fmt.Errorf("SafeTimeToAssumeNodeRebootedSeconds can only be set after configuration is created") - } - return nil -} diff --git a/api/v1alpha1/selfnoderemediationconfig_webhook_test.go b/api/v1alpha1/selfnoderemediationconfig_webhook_test.go index 8004060cd..c85f2ef6a 100644 --- a/api/v1alpha1/selfnoderemediationconfig_webhook_test.go +++ b/api/v1alpha1/selfnoderemediationconfig_webhook_test.go @@ -180,11 +180,6 @@ func testMultipleInvalidFields(validationType string) { snrc.Spec.CustomDsTolerations = []v1.Toleration{{Key: "validValue", Operator: "dummyInvalidOperatorValue"}} errorMsg += ", invalid operator for toleration: dummyInvalidOperatorValue" - if validationType == "create" { - setFieldValue(snrc, safeTimeToAssumeNodeRebootedSeconds, 180) - errorMsg += ", SafeTimeToAssumeNodeRebootedSeconds can only be set after configuration is created" - } - Context("for CR multiple invalid fields", func() { It("should be rejected", func() { var err error From 11e21a7622c65ff0bf05d60f6ae68eb41e728a5a Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Mon, 20 May 2024 16:25:00 +0300 Subject: [PATCH 16/21] use original context instead of creating a new one Signed-off-by: Michael Shitrit --- pkg/reboot/calculator.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index 6a4ec38b3..f7e93db99 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -94,11 +94,11 @@ func (s *safeTimeCalculator) GetTimeToAssumeNodeRebooted() (time.Duration, error return s.timeToAssumeNodeRebooted, nil } -func (s *safeTimeCalculator) Start(_ context.Context) error { - return s.calcMinTimeAssumeRebooted() +func (s *safeTimeCalculator) Start(ctx context.Context) error { + return s.calcMinTimeAssumeRebooted(ctx) } -func (s *safeTimeCalculator) calcMinTimeAssumeRebooted() error { +func (s *safeTimeCalculator) calcMinTimeAssumeRebooted(ctx context.Context) error { if !s.isAgent { return nil } @@ -117,7 +117,7 @@ func (s *safeTimeCalculator) calcMinTimeAssumeRebooted() error { s.minTimeToAssumeNodeRebooted = minTime //update related logic of min time on configuration if necessary - if err := s.manageSafeRebootTimeInConfiguration(minTime); err != nil { + if err := s.manageSafeRebootTimeInConfiguration(ctx, minTime); err != nil { return err } @@ -127,7 +127,7 @@ func (s *safeTimeCalculator) calcMinTimeAssumeRebooted() error { // manageSafeRebootTimeInConfiguration does two things: // 1. It sets Status.MinSafeTimeToAssumeNodeRebootedSeconds in case it's changed by latest calculation. // 2. It Adds/removes config.Status.SpecSafeTimeOverriddenWarning if necessary. -func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Duration) error { +func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(ctx context.Context, minTime time.Duration) error { minTimeSec := int(minTime.Seconds()) config, err := s.getConfig() if err != nil { @@ -156,7 +156,7 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Du } if !reflect.DeepEqual(config, orgConfig) { - if err := s.k8sClient.Status().Patch(context.Background(), config, client.MergeFrom(orgConfig)); err != nil { + if err := s.k8sClient.Status().Patch(ctx, config, client.MergeFrom(orgConfig)); err != nil { return err } } From ac1b238a7756009c591346eda73b32b9a3858533 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Thu, 23 May 2024 12:49:40 +0300 Subject: [PATCH 17/21] adding context Signed-off-by: Michael Shitrit --- controllers/selfnoderemediation_controller.go | 28 +++++++++---------- ...lfnoderemediationconfig_controller_test.go | 2 +- controllers/tests/shared/shared.go | 2 +- pkg/reboot/calculator.go | 12 ++++---- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/controllers/selfnoderemediation_controller.go b/controllers/selfnoderemediation_controller.go index b441b1c18..08675d3cb 100644 --- a/controllers/selfnoderemediation_controller.go +++ b/controllers/selfnoderemediation_controller.go @@ -239,9 +239,9 @@ func (r *SelfNodeRemediationReconciler) Reconcile(ctx context.Context, req ctrl. strategy := r.getRuntimeStrategy(snr) switch strategy { case v1alpha1.ResourceDeletionRemediationStrategy: - result, err = r.remediateWithResourceDeletion(snr, node) + result, err = r.remediateWithResourceDeletion(ctx, snr, node) case v1alpha1.OutOfServiceTaintRemediationStrategy: - result, err = r.remediateWithOutOfServiceTaint(snr, node) + result, err = r.remediateWithOutOfServiceTaint(ctx, snr, node) default: //this should never happen since we enforce valid values with kubebuilder err := errors.New("unsupported remediation strategy") @@ -316,8 +316,8 @@ func (r *SelfNodeRemediationReconciler) getPhase(snr *v1alpha1.SelfNodeRemediati } } -func (r *SelfNodeRemediationReconciler) remediateWithResourceDeletion(snr *v1alpha1.SelfNodeRemediation, node *v1.Node) (ctrl.Result, error) { - return r.remediateWithResourceRemoval(snr, node, r.deleteResourcesWrapper) +func (r *SelfNodeRemediationReconciler) remediateWithResourceDeletion(ctx context.Context, snr *v1alpha1.SelfNodeRemediation, node *v1.Node) (ctrl.Result, error) { + return r.remediateWithResourceRemoval(ctx, snr, node, r.deleteResourcesWrapper) } // deleteResourcesWrapper returns a 'zero' time and nil if it completes to delete node resources successfully @@ -327,8 +327,8 @@ func (r *SelfNodeRemediationReconciler) deleteResourcesWrapper(node *v1.Node, _ return 0, resources.DeletePods(context.Background(), r.Client, node.Name) } -func (r *SelfNodeRemediationReconciler) remediateWithOutOfServiceTaint(snr *v1alpha1.SelfNodeRemediation, node *v1.Node) (ctrl.Result, error) { - return r.remediateWithResourceRemoval(snr, node, r.useOutOfServiceTaint) +func (r *SelfNodeRemediationReconciler) remediateWithOutOfServiceTaint(ctx context.Context, snr *v1alpha1.SelfNodeRemediation, node *v1.Node) (ctrl.Result, error) { + return r.remediateWithResourceRemoval(ctx, snr, node, r.useOutOfServiceTaint) } func (r *SelfNodeRemediationReconciler) useOutOfServiceTaint(node *v1.Node, snr *v1alpha1.SelfNodeRemediation) (time.Duration, error) { @@ -356,13 +356,13 @@ func (r *SelfNodeRemediationReconciler) useOutOfServiceTaint(node *v1.Node, snr type removeNodeResources func(*v1.Node, *v1alpha1.SelfNodeRemediation) (time.Duration, error) -func (r *SelfNodeRemediationReconciler) remediateWithResourceRemoval(snr *v1alpha1.SelfNodeRemediation, node *v1.Node, rmNodeResources removeNodeResources) (ctrl.Result, error) { +func (r *SelfNodeRemediationReconciler) remediateWithResourceRemoval(ctx context.Context, snr *v1alpha1.SelfNodeRemediation, node *v1.Node, rmNodeResources removeNodeResources) (ctrl.Result, error) { result := ctrl.Result{} phase := r.getPhase(snr) var err error switch phase { case fencingStartedPhase: - result, err = r.handleFencingStartedPhase(node, snr) + result, err = r.handleFencingStartedPhase(ctx, node, snr) case preRebootCompletedPhase: result, err = r.handlePreRebootCompletedPhase(node, snr) case rebootCompletedPhase: @@ -377,11 +377,11 @@ func (r *SelfNodeRemediationReconciler) remediateWithResourceRemoval(snr *v1alph return result, err } -func (r *SelfNodeRemediationReconciler) handleFencingStartedPhase(node *v1.Node, snr *v1alpha1.SelfNodeRemediation) (ctrl.Result, error) { - return r.prepareReboot(node, snr) +func (r *SelfNodeRemediationReconciler) handleFencingStartedPhase(ctx context.Context, node *v1.Node, snr *v1alpha1.SelfNodeRemediation) (ctrl.Result, error) { + return r.prepareReboot(ctx, node, snr) } -func (r *SelfNodeRemediationReconciler) prepareReboot(node *v1.Node, snr *v1alpha1.SelfNodeRemediation) (ctrl.Result, error) { +func (r *SelfNodeRemediationReconciler) prepareReboot(ctx context.Context, node *v1.Node, snr *v1alpha1.SelfNodeRemediation) (ctrl.Result, error) { r.logger.Info("pre-reboot not completed yet, prepare for rebooting") if !r.isNodeRebootCapable(node) { //use err to trigger exponential backoff @@ -401,7 +401,7 @@ func (r *SelfNodeRemediationReconciler) prepareReboot(node *v1.Node, snr *v1alph } if snr.Status.TimeAssumedRebooted.IsZero() { - if err := r.updateTimeAssumedRebooted(node, snr); err != nil { + if err := r.updateTimeAssumedRebooted(ctx, node, snr); err != nil { return ctrl.Result{}, err } } @@ -627,10 +627,10 @@ func (r *SelfNodeRemediationReconciler) updateSnrStatus(ctx context.Context, snr return nil } -func (r *SelfNodeRemediationReconciler) updateTimeAssumedRebooted(node *v1.Node, snr *v1alpha1.SelfNodeRemediation) error { +func (r *SelfNodeRemediationReconciler) updateTimeAssumedRebooted(ctx context.Context, node *v1.Node, snr *v1alpha1.SelfNodeRemediation) error { r.logger.Info("updating snr with node backup and updating time to assume node has been rebooted", "node name", node.Name) //we assume the unhealthy node will be rebooted by maxTimeNodeHasRebooted - toAssumeNodeRebooted, err := r.SafeTimeCalculator.GetTimeToAssumeNodeRebooted() + toAssumeNodeRebooted, err := r.SafeTimeCalculator.GetTimeToAssumeNodeRebooted(ctx) if err != nil { return err } diff --git a/controllers/tests/config/selfnoderemediationconfig_controller_test.go b/controllers/tests/config/selfnoderemediationconfig_controller_test.go index e559d459d..ca68b763c 100644 --- a/controllers/tests/config/selfnoderemediationconfig_controller_test.go +++ b/controllers/tests/config/selfnoderemediationconfig_controller_test.go @@ -166,7 +166,7 @@ var _ = Describe("SNR Config Test", func() { }) It("SafeTimeCalculator of the manager gets the correct TimeToAssumeNodeRebooted value from the configuration", func() { Eventually(func(g Gomega) bool { - g.Expect(managerReconciler.SafeTimeCalculator.GetTimeToAssumeNodeRebooted()).To(Equal(time.Second * 65)) + g.Expect(managerReconciler.SafeTimeCalculator.GetTimeToAssumeNodeRebooted(context.TODO())).To(Equal(time.Second * 65)) return true }, 10*time.Second, 250*time.Millisecond).Should(BeTrue()) }) diff --git a/controllers/tests/shared/shared.go b/controllers/tests/shared/shared.go index 471e6902f..5c072a5f8 100644 --- a/controllers/tests/shared/shared.go +++ b/controllers/tests/shared/shared.go @@ -42,7 +42,7 @@ func (kcw *K8sClientWrapper) List(ctx context.Context, list client.ObjectList, o return kcw.Client.List(ctx, list, opts...) } -func (m *MockCalculator) GetTimeToAssumeNodeRebooted() (time.Duration, error) { +func (m *MockCalculator) GetTimeToAssumeNodeRebooted(context.Context) (time.Duration, error) { return m.MockTimeToAssumeNodeRebooted, nil } diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index f7e93db99..9d9721ee9 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -33,7 +33,7 @@ type SafeTimeCalculator interface { // GetTimeToAssumeNodeRebooted returns the safe time to assume node was already rebooted // note that this time must include the time for a unhealthy node without api-server access to reach the conclusion that it's unhealthy // this should be at least worst-case time to reach a conclusion from the other peers * request context timeout + watchdog interval + maxFailuresThreshold * reconcileInterval + padding - GetTimeToAssumeNodeRebooted() (time.Duration, error) + GetTimeToAssumeNodeRebooted(ctx context.Context) (time.Duration, error) Start(ctx context.Context) error //IsAgent return true in case running on an agent pod (responsible for reboot) or false in case running on a manager pod IsAgent() bool @@ -75,10 +75,10 @@ func NewManagerSafeTimeCalculator(k8sClient client.Client) SafeTimeCalculator { } } -func (s *safeTimeCalculator) GetTimeToAssumeNodeRebooted() (time.Duration, error) { +func (s *safeTimeCalculator) GetTimeToAssumeNodeRebooted(ctx context.Context) (time.Duration, error) { minTime := s.minTimeToAssumeNodeRebooted if !s.isAgent { - config, err := s.getConfig() + config, err := s.getConfig(ctx) if err != nil { return 0, err } @@ -129,7 +129,7 @@ func (s *safeTimeCalculator) calcMinTimeAssumeRebooted(ctx context.Context) erro // 2. It Adds/removes config.Status.SpecSafeTimeOverriddenWarning if necessary. func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(ctx context.Context, minTime time.Duration) error { minTimeSec := int(minTime.Seconds()) - config, err := s.getConfig() + config, err := s.getConfig(ctx) if err != nil { return err } @@ -175,9 +175,9 @@ func (s *safeTimeCalculator) getMinSafeTimeSecFromConfig(config *v1alpha1.SelfNo return 0, err } -func (s *safeTimeCalculator) getConfig() (*v1alpha1.SelfNodeRemediationConfig, error) { +func (s *safeTimeCalculator) getConfig(ctx context.Context) (*v1alpha1.SelfNodeRemediationConfig, error) { confList := &v1alpha1.SelfNodeRemediationConfigList{} - if err := s.k8sClient.List(context.Background(), confList); err != nil { + if err := s.k8sClient.List(ctx, confList); err != nil { s.log.Error(err, "failed to get snr configuration") return nil, err } From 057f2e53b2ccdb6c954c93ddb9f37846e6200dcf Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Thu, 23 May 2024 13:08:11 +0300 Subject: [PATCH 18/21] improve conditions's field readabilty Signed-off-by: Michael Shitrit --- api/v1alpha1/selfnoderemediationconfig_types.go | 4 ++-- .../self-node-remediation.clusterserviceversion.yaml | 2 +- ...ode-remediation.medik8s.io_selfnoderemediationconfigs.yaml | 2 +- ...ode-remediation.medik8s.io_selfnoderemediationconfigs.yaml | 2 +- .../bases/self-node-remediation.clusterserviceversion.yaml | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index 032370418..b33c5cd0e 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -135,9 +135,9 @@ type SelfNodeRemediationConfigStatus struct { // +kubebuilder:validation:Minimum=0 MinSafeTimeToAssumeNodeRebootedSeconds int `json:"minSafeTimeToAssumeNodeRebootedSeconds,omitempty"` - // +operator-sdk:csv:customresourcedefinitions:type=status,displayName="conditions",xDescriptors="urn:alm:descriptor:com.tectonic.ui:conditions" - // Represents the observations of a SelfNodeRemediationConfig's current state. + // Conditions Represents the observations of a SelfNodeRemediationConfig's current state. // Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden" + // +operator-sdk:csv:customresourcedefinitions:type=status,displayName="conditions",xDescriptors="urn:alm:descriptor:com.tectonic.ui:conditions" // +listType=map // +listMapKey=type // +optional diff --git a/bundle/manifests/self-node-remediation.clusterserviceversion.yaml b/bundle/manifests/self-node-remediation.clusterserviceversion.yaml index 2b33d1491..68880b40b 100644 --- a/bundle/manifests/self-node-remediation.clusterserviceversion.yaml +++ b/bundle/manifests/self-node-remediation.clusterserviceversion.yaml @@ -63,7 +63,7 @@ spec: name: selfnoderemediationconfigs version: v1alpha1 statusDescriptors: - - description: 'Represents the observations of a SelfNodeRemediationConfig''s + - description: 'Conditions Represents the observations of a SelfNodeRemediationConfig''s current state. Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden"' displayName: conditions path: conditions diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index a567c039e..daade711d 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -173,7 +173,7 @@ spec: properties: conditions: description: |- - Represents the observations of a SelfNodeRemediationConfig's current state. + Conditions Represents the observations of a SelfNodeRemediationConfig's current state. Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden" items: description: "Condition contains details for one aspect of the current diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index c063d6c92..6267eaed0 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -171,7 +171,7 @@ spec: properties: conditions: description: |- - Represents the observations of a SelfNodeRemediationConfig's current state. + Conditions Represents the observations of a SelfNodeRemediationConfig's current state. Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden" items: description: "Condition contains details for one aspect of the current diff --git a/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml b/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml index 8b5f2ebf9..1f5f9c245 100644 --- a/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml +++ b/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml @@ -30,7 +30,7 @@ spec: name: selfnoderemediationconfigs version: v1alpha1 statusDescriptors: - - description: 'Represents the observations of a SelfNodeRemediationConfig''s + - description: 'Conditions Represents the observations of a SelfNodeRemediationConfig''s current state. Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden"' displayName: conditions path: conditions From db7144427fa807c1c3720c7f3331cd21e9963a14 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Thu, 23 May 2024 13:21:57 +0300 Subject: [PATCH 19/21] - update condition's Reason/Message - improve method readability Signed-off-by: Michael Shitrit --- pkg/reboot/calculator.go | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index 9d9721ee9..a2f995235 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -138,21 +138,25 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(ctx context.Con if prevMinRebootTimeSec != minTimeSec { config.Status.MinSafeTimeToAssumeNodeRebootedSeconds = minTimeSec } + //Manage condition - meta.SetStatusCondition(&config.Status.Conditions, metav1.Condition{ - Type: v1alpha1.SafeTimeToAssumeNodeRebootedOverriddenConditionType, - Status: metav1.ConditionFalse, - Reason: "SafeTimeToAssumeNodeRebootedSeconds is valid or empty", - }) if config.Spec.SafeTimeToAssumeNodeRebootedSeconds > 0 && minTimeSec > config.Spec.SafeTimeToAssumeNodeRebootedSeconds { meta.SetStatusCondition(&config.Status.Conditions, metav1.Condition{ - Type: v1alpha1.SafeTimeToAssumeNodeRebootedOverriddenConditionType, - Status: metav1.ConditionTrue, - Reason: "SafeTimeToAssumeNodeRebootedSeconds is overridden by calculated value because it's invalid", + Type: v1alpha1.SafeTimeToAssumeNodeRebootedOverriddenConditionType, + Status: metav1.ConditionTrue, + Reason: "SafeTimeIsInvalid", + Message: "Spec.SafeTimeToAssumeNodeRebootedSeconds is overridden because it's invalid, its value is lower than the automatically minimum calculated value at Status.MinSafeTimeToAssumeNodeRebootedSeconds", }) s.log.Info("SafeTimeToAssumeNodeRebootedSeconds is overridden by calculated value because it's invalid, calculated value stored in Status.MinSafeTimeToAssumeNodeRebootedSeconds would be used instead") events.WarningEvent(s.recorder, config, "SafeTimeToAssumeNodeRebootedSecondsInvalid", "SafeTimeToAssumeNodeRebootedSeconds is overridden by calculated value") + } else { + meta.SetStatusCondition(&config.Status.Conditions, metav1.Condition{ + Type: v1alpha1.SafeTimeToAssumeNodeRebootedOverriddenConditionType, + Status: metav1.ConditionFalse, + Reason: "SafeTimeIsValid", + Message: "Spec.SafeTimeToAssumeNodeRebootedSeconds is valid because it isn't lower than the automatically minimum calculated value at Status.MinSafeTimeToAssumeNodeRebootedSeconds", + }) } if !reflect.DeepEqual(config, orgConfig) { From bf88778b95d14c6f262bab49dc8cf77a732e6526 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Thu, 30 May 2024 13:31:35 +0300 Subject: [PATCH 20/21] - update descriptor - use midsentence lowercase - remove redundant display name Signed-off-by: Michael Shitrit --- api/v1alpha1/selfnoderemediationconfig_types.go | 4 ++-- .../self-node-remediation.clusterserviceversion.yaml | 6 +++--- ...e-remediation.medik8s.io_selfnoderemediationconfigs.yaml | 2 +- ...e-remediation.medik8s.io_selfnoderemediationconfigs.yaml | 2 +- .../bases/self-node-remediation.clusterserviceversion.yaml | 6 +++--- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index b33c5cd0e..2921ddf3f 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -135,9 +135,9 @@ type SelfNodeRemediationConfigStatus struct { // +kubebuilder:validation:Minimum=0 MinSafeTimeToAssumeNodeRebootedSeconds int `json:"minSafeTimeToAssumeNodeRebootedSeconds,omitempty"` - // Conditions Represents the observations of a SelfNodeRemediationConfig's current state. + // Conditions represents the observations of a SelfNodeRemediationConfig's current state. // Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden" - // +operator-sdk:csv:customresourcedefinitions:type=status,displayName="conditions",xDescriptors="urn:alm:descriptor:com.tectonic.ui:conditions" + // +operator-sdk:csv:customresourcedefinitions:type=status,xDescriptors="urn:alm:descriptor:io.kubernetes.conditions" // +listType=map // +listMapKey=type // +optional diff --git a/bundle/manifests/self-node-remediation.clusterserviceversion.yaml b/bundle/manifests/self-node-remediation.clusterserviceversion.yaml index 68880b40b..71cf0e0be 100644 --- a/bundle/manifests/self-node-remediation.clusterserviceversion.yaml +++ b/bundle/manifests/self-node-remediation.clusterserviceversion.yaml @@ -63,12 +63,12 @@ spec: name: selfnoderemediationconfigs version: v1alpha1 statusDescriptors: - - description: 'Conditions Represents the observations of a SelfNodeRemediationConfig''s + - description: 'Conditions represents the observations of a SelfNodeRemediationConfig''s current state. Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden"' - displayName: conditions + displayName: Conditions path: conditions x-descriptors: - - urn:alm:descriptor:com.tectonic.ui:conditions + - urn:alm:descriptor:io.kubernetes.conditions version: v1alpha1 - description: SelfNodeRemediation is the Schema for the selfnoderemediations API diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index daade711d..4c9313449 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -173,7 +173,7 @@ spec: properties: conditions: description: |- - Conditions Represents the observations of a SelfNodeRemediationConfig's current state. + Conditions represents the observations of a SelfNodeRemediationConfig's current state. Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden" items: description: "Condition contains details for one aspect of the current diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 6267eaed0..27bfd3469 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -171,7 +171,7 @@ spec: properties: conditions: description: |- - Conditions Represents the observations of a SelfNodeRemediationConfig's current state. + Conditions represents the observations of a SelfNodeRemediationConfig's current state. Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden" items: description: "Condition contains details for one aspect of the current diff --git a/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml b/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml index 1f5f9c245..11c21b273 100644 --- a/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml +++ b/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml @@ -30,12 +30,12 @@ spec: name: selfnoderemediationconfigs version: v1alpha1 statusDescriptors: - - description: 'Conditions Represents the observations of a SelfNodeRemediationConfig''s + - description: 'Conditions represents the observations of a SelfNodeRemediationConfig''s current state. Known .status.conditions.type are: "SafeTimeToAssumeNodeRebootedOverridden"' - displayName: conditions + displayName: Conditions path: conditions x-descriptors: - - urn:alm:descriptor:com.tectonic.ui:conditions + - urn:alm:descriptor:io.kubernetes.conditions version: v1alpha1 - description: SelfNodeRemediation is the Schema for the selfnoderemediations API From 04a6964998e806ac4fda07e921c9a29ea1e5e0b9 Mon Sep 17 00:00:00 2001 From: Michael Shitrit Date: Mon, 3 Jun 2024 15:48:28 +0300 Subject: [PATCH 21/21] using "safet" minimum calculated value Signed-off-by: Michael Shitrit --- .../selfnoderemediationconfig_types.go | 8 + api/v1alpha1/zz_generated.deepcopy.go | 4 + ...ode-remediation.clusterserviceversion.yaml | 3 + ...medik8s.io_selfnoderemediationconfigs.yaml | 4 + ...medik8s.io_selfnoderemediationconfigs.yaml | 4 + ...ode-remediation.clusterserviceversion.yaml | 3 + pkg/reboot/calculator.go | 13 +- pkg/reboot/calculator_test.go | 198 ++++++++++++++++++ pkg/reboot/suite_test.go | 65 +++++- 9 files changed, 300 insertions(+), 2 deletions(-) create mode 100644 pkg/reboot/calculator_test.go diff --git a/api/v1alpha1/selfnoderemediationconfig_types.go b/api/v1alpha1/selfnoderemediationconfig_types.go index 2921ddf3f..4ba3905fe 100644 --- a/api/v1alpha1/selfnoderemediationconfig_types.go +++ b/api/v1alpha1/selfnoderemediationconfig_types.go @@ -142,6 +142,14 @@ type SelfNodeRemediationConfigStatus struct { // +listMapKey=type // +optional Conditions []metav1.Condition `json:"conditions,omitempty"` + + // LastUpdateTime is the last time the status was updated. + // + //+optional + //+kubebuilder:validation:Type=string + //+kubebuilder:validation:Format=date-time + //+operator-sdk:csv:customresourcedefinitions:type=status + LastUpdateTime *metav1.Time `json:"lastUpdateTime,omitempty"` } //+kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 451c29128..cc3e04ab6 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -174,6 +174,10 @@ func (in *SelfNodeRemediationConfigStatus) DeepCopyInto(out *SelfNodeRemediation (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.LastUpdateTime != nil { + in, out := &in.LastUpdateTime, &out.LastUpdateTime + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new SelfNodeRemediationConfigStatus. diff --git a/bundle/manifests/self-node-remediation.clusterserviceversion.yaml b/bundle/manifests/self-node-remediation.clusterserviceversion.yaml index 71cf0e0be..6b5af047c 100644 --- a/bundle/manifests/self-node-remediation.clusterserviceversion.yaml +++ b/bundle/manifests/self-node-remediation.clusterserviceversion.yaml @@ -69,6 +69,9 @@ spec: path: conditions x-descriptors: - urn:alm:descriptor:io.kubernetes.conditions + - description: LastUpdateTime is the last time the status was updated. + displayName: Last Update Time + path: lastUpdateTime version: v1alpha1 - description: SelfNodeRemediation is the Schema for the selfnoderemediations API diff --git a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 4c9313449..88bfffead 100644 --- a/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/bundle/manifests/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -246,6 +246,10 @@ spec: x-kubernetes-list-map-keys: - type x-kubernetes-list-type: map + lastUpdateTime: + description: LastUpdateTime is the last time the status was updated. + format: date-time + type: string minSafeTimeToAssumeNodeRebootedSeconds: description: MinSafeTimeToAssumeNodeRebootedSeconds is the minimum value that can be assigned to SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds, diff --git a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml index 27bfd3469..7b7720f87 100644 --- a/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml +++ b/config/crd/bases/self-node-remediation.medik8s.io_selfnoderemediationconfigs.yaml @@ -244,6 +244,10 @@ spec: x-kubernetes-list-map-keys: - type x-kubernetes-list-type: map + lastUpdateTime: + description: LastUpdateTime is the last time the status was updated. + format: date-time + type: string minSafeTimeToAssumeNodeRebootedSeconds: description: MinSafeTimeToAssumeNodeRebootedSeconds is the minimum value that can be assigned to SelfNodeRemediationConfigSpec.SafeTimeToAssumeNodeRebootedSeconds, diff --git a/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml b/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml index 11c21b273..770e17a4d 100644 --- a/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml +++ b/config/manifests/bases/self-node-remediation.clusterserviceversion.yaml @@ -36,6 +36,9 @@ spec: path: conditions x-descriptors: - urn:alm:descriptor:io.kubernetes.conditions + - description: LastUpdateTime is the last time the status was updated. + displayName: Last Update Time + path: lastUpdateTime version: v1alpha1 - description: SelfNodeRemediation is the Schema for the selfnoderemediations API diff --git a/pkg/reboot/calculator.go b/pkg/reboot/calculator.go index a2f995235..b3df7f0e0 100644 --- a/pkg/reboot/calculator.go +++ b/pkg/reboot/calculator.go @@ -29,6 +29,11 @@ const ( MaxBatchesAfterFirst = 10 ) +var ( + //timeBeforeAssumingRecentUpdate is the time the timeframe in which we assume multiple updates belong to the same configuration change and are applied by different agents + timeBeforeAssumingRecentUpdate = time.Second * 15 +) + type SafeTimeCalculator interface { // GetTimeToAssumeNodeRebooted returns the safe time to assume node was already rebooted // note that this time must include the time for a unhealthy node without api-server access to reach the conclusion that it's unhealthy @@ -133,9 +138,14 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(ctx context.Con if err != nil { return err } + orgConfig := config.DeepCopy() prevMinRebootTimeSec := config.Status.MinSafeTimeToAssumeNodeRebootedSeconds - if prevMinRebootTimeSec != minTimeSec { + isUpdatedRecently := config.Status.LastUpdateTime != nil && (*config.Status.LastUpdateTime).Add(timeBeforeAssumingRecentUpdate).After(time.Now()) + //Use safer value + if minTimeSec > prevMinRebootTimeSec || + // we can update even though value may be less safe because the config has changed + (!isUpdatedRecently && prevMinRebootTimeSec != minTimeSec) { config.Status.MinSafeTimeToAssumeNodeRebootedSeconds = minTimeSec } @@ -160,6 +170,7 @@ func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(ctx context.Con } if !reflect.DeepEqual(config, orgConfig) { + config.Status.LastUpdateTime = &metav1.Time{Time: time.Now()} if err := s.k8sClient.Status().Patch(ctx, config, client.MergeFrom(orgConfig)); err != nil { return err } diff --git a/pkg/reboot/calculator_test.go b/pkg/reboot/calculator_test.go new file mode 100644 index 000000000..b121ce568 --- /dev/null +++ b/pkg/reboot/calculator_test.go @@ -0,0 +1,198 @@ +package reboot + +import ( + "context" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/medik8s/self-node-remediation/api/v1alpha1" +) + +var _ = Describe("Calculator Tests", func() { + var config *v1alpha1.SelfNodeRemediationConfig + var configKey client.ObjectKey + //105 is just the calculated value according to current specs fill free to change in case the Specs or calculation changes + var defaultCalculatedMinSafeTime = 105 + var increasedCalculatedMinSafeTime = 108 + var decreasedCalculatedMinSafeTime = 102 + Context("Agent calculator", func() { + config = createDefaultSelfNodeRemediationConfigCR() + configKey = client.ObjectKeyFromObject(config) + + AfterEach(func() { + Expect(k8sClient.Delete(context.TODO(), config)).To(Succeed()) + Eventually(func(g Gomega) bool { + tmpConfig := &v1alpha1.SelfNodeRemediationConfig{} + err := k8sClient.Get(context.TODO(), configKey, tmpConfig) + g.Expect(err).To(HaveOccurred()) + g.Expect(apierrors.IsNotFound(err)).To(BeTrue()) + return true + }, 5*time.Second, 250*time.Millisecond).Should(BeTrue()) + }) + + BeforeEach(func() { + //Initial configuration creation + createConfig(config) + + //Simulating agent starting which will set Status fields + simulateSNRAgentStart(configKey) + + }) + It("Status should be updated after agent is initialized", func() { + Eventually(func(g Gomega) bool { + tmpConfig := &v1alpha1.SelfNodeRemediationConfig{} + g.Expect(k8sClient.Get(context.TODO(), configKey, tmpConfig)).To(Succeed()) + //Configuration status fields are now filled by the agent + g.Expect(tmpConfig.Status.LastUpdateTime).ToNot(BeNil()) + g.Expect(tmpConfig.Status.MinSafeTimeToAssumeNodeRebootedSeconds).To(Equal(defaultCalculatedMinSafeTime)) + return true + }, 5*time.Second, 250*time.Millisecond).Should(BeTrue()) + }) + + Context("A recent configuration change", func() { + When("the change increase the min calculated value", func() { + BeforeEach(func() { + increaseApiCheckInConfig(configKey) + simulateSNRAgentStart(configKey) + }) + It("Status should be updated by the agent", func() { + verifyMinSafeTimeToAssumeNodeRebootedSecondsValue(configKey, increasedCalculatedMinSafeTime) + }) + + }) + When("the change decrease the min calculated value", func() { + BeforeEach(func() { + decreaseApiCheckInConfig(configKey) + simulateSNRAgentStart(configKey) + }) + It("Status should NOT be updated by the agent", func() { + verifyMinSafeTimeToAssumeNodeRebootedSecondsValue(configKey, defaultCalculatedMinSafeTime) + }) + + }) + }) + + Context("A Non recent configuration change", func() { + BeforeEach(func() { + originalTimeBeforeAssumingRecentUpdate := timeBeforeAssumingRecentUpdate + //mocking the time for faster tests + timeBeforeAssumingRecentUpdate = time.Second + //simulating non recent update + time.Sleep(time.Millisecond * 1100) + DeferCleanup(func() { + timeBeforeAssumingRecentUpdate = originalTimeBeforeAssumingRecentUpdate + }) + }) + When("the change increase the min calculated value", func() { + BeforeEach(func() { + increaseApiCheckInConfig(configKey) + simulateSNRAgentStart(configKey) + }) + It("Status should be updated by the agent", func() { + verifyMinSafeTimeToAssumeNodeRebootedSecondsValue(configKey, increasedCalculatedMinSafeTime) + }) + + }) + When("the change decrease the min calculated value", func() { + BeforeEach(func() { + decreaseApiCheckInConfig(configKey) + simulateSNRAgentStart(configKey) + }) + It("Status should be updated by the agent", func() { + verifyMinSafeTimeToAssumeNodeRebootedSecondsValue(configKey, decreasedCalculatedMinSafeTime) + }) + + }) + + }) + + }) +}) + +func verifyMinSafeTimeToAssumeNodeRebootedSecondsValue(configKey client.ObjectKey, expectedValue int) { + Eventually(func(g Gomega) bool { + tmpConfig := &v1alpha1.SelfNodeRemediationConfig{} + g.Expect(k8sClient.Get(context.TODO(), configKey, tmpConfig)).To(Succeed()) + //Configuration status fields are now filled by the agent + g.Expect(tmpConfig.Status.MinSafeTimeToAssumeNodeRebootedSeconds).To(Equal(expectedValue)) + return true + }, 5*time.Second, 250*time.Millisecond).Should(BeTrue()) +} + +func increaseApiCheckInConfig(configKey client.ObjectKey) { + modifyApiCheckInConfig(configKey, true) +} + +func decreaseApiCheckInConfig(configKey client.ObjectKey) { + modifyApiCheckInConfig(configKey, false) +} + +func modifyApiCheckInConfig(configKey client.ObjectKey, isIncrease bool) { + tmpConfig := &v1alpha1.SelfNodeRemediationConfig{} + Expect(k8sClient.Get(context.TODO(), configKey, tmpConfig)).To(Succeed()) + originalCheckInterval := tmpConfig.Spec.ApiCheckInterval + var newVal time.Duration + if isIncrease { + newVal = originalCheckInterval.Duration + time.Second + } else { + newVal = originalCheckInterval.Duration - time.Second + } + + tmpConfig.Spec.ApiCheckInterval = &metav1.Duration{Duration: newVal} + Expect(k8sClient.Update(context.TODO(), tmpConfig)).To(Succeed()) + + //Wait for value update in config + Eventually(func(g Gomega) bool { + tmpConfig := &v1alpha1.SelfNodeRemediationConfig{} + g.Expect(k8sClient.Get(context.TODO(), configKey, tmpConfig)).To(Succeed()) + g.Expect(tmpConfig.Spec.ApiCheckInterval.Duration).To(Equal(newVal)) + return true + }, 5*time.Second, 250*time.Millisecond).Should(BeTrue()) + +} + +func createDefaultSelfNodeRemediationConfigCR() *v1alpha1.SelfNodeRemediationConfig { + snrc := &v1alpha1.SelfNodeRemediationConfig{} + snrc.Name = "test" + snrc.Namespace = "default" + + //default values for time fields + snrc.Spec.PeerApiServerTimeout = &metav1.Duration{Duration: 5 * time.Second} + snrc.Spec.ApiServerTimeout = &metav1.Duration{Duration: 5 * time.Second} + snrc.Spec.PeerDialTimeout = &metav1.Duration{Duration: 5 * time.Second} + snrc.Spec.PeerRequestTimeout = &metav1.Duration{Duration: 5 * time.Second} + snrc.Spec.ApiCheckInterval = &metav1.Duration{Duration: 15 * time.Second} + snrc.Spec.PeerUpdateInterval = &metav1.Duration{Duration: 15 * time.Minute} + + return snrc +} + +func simulateSNRAgentStart(configKey client.ObjectKey) { + config := &v1alpha1.SelfNodeRemediationConfig{} + Expect(k8sClient.Get(context.TODO(), configKey, config)).To(Succeed()) + calc := NewAgentSafeTimeCalculator(k8sClient, nil, nil, config.Spec.MaxApiErrorThreshold, config.Spec.ApiCheckInterval.Duration, config.Spec.ApiServerTimeout.Duration, config.Spec.PeerDialTimeout.Duration, config.Spec.PeerRequestTimeout.Duration, 0) + + //Simulating agent starting according to config + Expect(calc.Start(context.TODO())).To(Succeed()) + //wait for agent to finish updating the config + time.Sleep(time.Second) + +} + +func createConfig(config *v1alpha1.SelfNodeRemediationConfig) { + //Initial configuration creation + Expect(k8sClient.Create(context.TODO(), config.DeepCopy())).To(Succeed()) + //Wait for config to be created + Eventually(func(g Gomega) bool { + tmpConfig := &v1alpha1.SelfNodeRemediationConfig{} + err := k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(config), tmpConfig) + g.Expect(err).To(Succeed()) + return true + }, 5*time.Second, 250*time.Millisecond).Should(BeTrue()) +} diff --git a/pkg/reboot/suite_test.go b/pkg/reboot/suite_test.go index 77a4f310c..127693c6f 100644 --- a/pkg/reboot/suite_test.go +++ b/pkg/reboot/suite_test.go @@ -1,21 +1,84 @@ package reboot import ( + "context" + "path/filepath" "testing" + "time" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" + "go.uber.org/zap/zapcore" + + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + selfnoderemediationv1alpha1 "github.com/medik8s/self-node-remediation/api/v1alpha1" ) +var testEnv *envtest.Environment +var k8sClient client.Client +var cfg *rest.Config +var cancelFunc context.CancelFunc + func TestWatchdog(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, "Rebooter Suite") } var _ = BeforeSuite(func() { + By("bootstrapping test environment") + opts := zap.Options{ + Development: true, + TimeEncoder: zapcore.RFC3339NanoTimeEncoder, + } + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseFlagOptions(&opts))) + + By("bootstrapping test environment") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{filepath.Join("..", "..", "config", "crd", "bases")}, + ErrorIfCRDPathMissing: true, + } + + var err error + cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + err = selfnoderemediationv1alpha1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + //+kubebuilder:scaffold:scheme + + gracefulShutdown := 0 * time.Second + Expect(err).ToNot(HaveOccurred()) + k8sManager, err := ctrl.NewManager(cfg, ctrl.Options{ + Scheme: scheme.Scheme, + LeaderElection: false, + MetricsBindAddress: "0", + GracefulShutdownTimeout: &gracefulShutdown, + }) + Expect(err).ToNot(HaveOccurred()) + + k8sClient = k8sManager.GetClient() + Expect(k8sClient).ToNot(BeNil()) + + var ctx context.Context + ctx, cancelFunc = context.WithCancel(ctrl.SetupSignalHandler()) + go func() { + defer GinkgoRecover() + err = k8sManager.Start(ctx) + Expect(err).ToNot(HaveOccurred()) + }() }) var _ = AfterSuite(func() { - + By("tearing down the test environment") + cancelFunc() + Expect(testEnv.Stop()).To(Succeed()) })