Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Dynamiclly set safe time to assume node rebooted seconds #197

Closed
Show file tree
Hide file tree
Changes from 4 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion api/v1alpha1/selfnoderemediationconfig_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ type SelfNodeRemediationConfigSpec struct {
// Valid time units are "ms", "s", "m", "h".
// +optional
// +kubebuilder:default:="15m"
// +kubebuilder:validation:Pattern="^(0|([0-9]+(\\.[0-9]+)?(ms|s|m|h)))$"
// +kubebuilder:validation:Pattern="^(0|([0-9]+(\\.[0-9]+)?(ms|s|m|h)))$|^([1-9][0-9]*m0s)$"
slintes marked this conversation as resolved.
Show resolved Hide resolved

mshitrit marked this conversation as resolved.
Show resolved Hide resolved
// +kubebuilder:validation:Type:=string
PeerUpdateInterval *metav1.Duration `json:"peerUpdateInterval,omitempty"`

Expand Down
22 changes: 22 additions & 0 deletions api/v1alpha1/selfnoderemediationconfig_webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,19 @@ package v1alpha1

import (
"fmt"
"strconv"
"time"

pkgerrors "github.com/pkg/errors"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/util/errors"
ctrl "sigs.k8s.io/controller-runtime"
logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/webhook"

"github.com/medik8s/self-node-remediation/pkg/utils"
)

// fields names
Expand Down Expand Up @@ -85,6 +90,7 @@ func (r *SelfNodeRemediationConfig) ValidateUpdate(_ runtime.Object) error {
return errors.NewAggregate([]error{
r.validateTimes(),
r.validateCustomTolerations(),
r.validateMinRebootTime(),
})
}

Expand Down Expand Up @@ -172,3 +178,19 @@ func validateToleration(toleration v1.Toleration) error {
}
return nil
}

func (r *SelfNodeRemediationConfig) validateMinRebootTime() error {
annotations := r.GetAnnotations()
if annotations == nil {
return fmt.Errorf("failed to verify min value of SafeRebootTimeSec, %s annotation should not be empty", utils.MinSafeTimeAnnotation)

}
if minValString, isSet := annotations[utils.MinSafeTimeAnnotation]; isSet {
if minVal, err := strconv.Atoi(minValString); err != nil {
return pkgerrors.Wrapf(err, "failed to verify min value of SafeRebootTimeSec")
} else if r.Spec.SafeTimeToAssumeNodeRebootedSeconds < minVal {
return fmt.Errorf("can not set SafeTimeToAssumeNodeRebootedSeconds value below the calculated minimum value of: %s", minValString)
}
}
return nil
}
39 changes: 39 additions & 0 deletions api/v1alpha1/selfnoderemediationconfig_webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ import (
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/utils/pointer"

"github.com/medik8s/self-node-remediation/pkg/utils"
)

// default CR fields durations
Expand Down Expand Up @@ -71,6 +73,41 @@ var _ = Describe("SelfNodeRemediationConfig Validation", func() {
// test update validation on a valid CR
testValidCR("update")

Context("SafeTimeToAssumeNodeRebootedSeconds Validation", func() {
var originalSnrc, updatedSnrc *SelfNodeRemediationConfig
BeforeEach(func() {
originalSnrc = createDefaultSelfNodeRemediationConfigCR()
updatedSnrc = originalSnrc.DeepCopy()
updatedSnrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = 200
})
When("Annotation does not exist", func() {
It("validation should fail", func() {
err := updatedSnrc.ValidateUpdate(originalSnrc)
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(MatchRegexp("annotation should not be empty"))
})
})
When("Annotation value is higher than user assigned value", func() {
BeforeEach(func() {
updatedSnrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "220"}
})
It("validation should fail", func() {
err := updatedSnrc.ValidateUpdate(originalSnrc)
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(MatchRegexp("value below the calculated minimum value of: 220"))
})
})
When("Annotation value is lower than user assigned value", func() {
BeforeEach(func() {
updatedSnrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "190"}
})
It("validation should pass", func() {
Expect(updatedSnrc.ValidateUpdate(originalSnrc)).To(Succeed())
})
})

})

})

})
Expand Down Expand Up @@ -174,6 +211,8 @@ func testValidCR(validationType string) {
snrc.Spec.ApiCheckInterval = &metav1.Duration{Duration: 10*time.Second + 500*time.Millisecond}
snrc.Spec.PeerUpdateInterval = &metav1.Duration{Duration: 10 * time.Second}
snrc.Spec.CustomDsTolerations = []v1.Toleration{{Key: "validValue", Effect: v1.TaintEffectNoExecute}, {}, {Operator: v1.TolerationOpEqual, TolerationSeconds: pointer.Int64(-5)}, {Value: "SomeValidValue"}}
snrc.Spec.SafeTimeToAssumeNodeRebootedSeconds = DefaultSafeToAssumeNodeRebootTimeout
snrc.Annotations = map[string]string{utils.MinSafeTimeAnnotation: "150"}

Context("for valid CR", func() {
It("should not be rejected", func() {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,7 @@ spec:
type: string
peerUpdateInterval:
default: 15m
description: Valid time units are "ms", "s", "m", "h".
pattern: ^(0|([0-9]+(\.[0-9]+)?(ms|s|m|h)))$
pattern: ^(0|([0-9]+(\.[0-9]+)?(ms|s|m|h)))$|^([1-9][0-9]*m0s)$
type: string
safeTimeToAssumeNodeRebootedSeconds:
default: 180
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,7 @@ spec:
type: string
peerUpdateInterval:
default: 15m
description: Valid time units are "ms", "s", "m", "h".
pattern: ^(0|([0-9]+(\.[0-9]+)?(ms|s|m|h)))$
pattern: ^(0|([0-9]+(\.[0-9]+)?(ms|s|m|h)))$|^([1-9][0-9]*m0s)$
type: string
safeTimeToAssumeNodeRebootedSeconds:
default: 180
Expand Down
19 changes: 19 additions & 0 deletions e2e/self_node_remediation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,25 @@ const (

var _ = Describe("Self Node Remediation E2E", func() {

Describe("Setup", func() {
It("Verify min reboot annotation", func() {
configs := v1alpha1.SelfNodeRemediationConfigList{}
Expect(k8sClient.List(context.Background(), &configs)).To(Succeed())
Expect(len(configs.Items)).To(Equal(1))
config := configs.Items[0]
annotations := config.GetAnnotations()
Expect(annotations).ToNot(BeNil())
minSafeTimeAnnotation := "minimum-safe-reboot-sec.self-node-remediation.medik8s.io"
minSafeTimeValueString, isExist := annotations[minSafeTimeAnnotation]
Expect(isExist).To(BeTrue())
minSafeTimeValue, err := strconv.Atoi(minSafeTimeValueString)
Expect(err).To(Succeed())
Expect(minSafeTimeValue).To(BeNumerically(">", 0))
safeTimeValue := config.Spec.SafeTimeToAssumeNodeRebootedSeconds
Expect(safeTimeValue).To(BeNumerically(">=", minSafeTimeValue))
})
})

Describe("Workers Remediation", func() {
var node *v1.Node
workers := &v1.NodeList{}
Expand Down
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ require (
github.com/Masterminds/sprig v2.22.0+incompatible
github.com/go-logr/logr v1.2.3
github.com/go-ping/ping v1.1.0
github.com/medik8s/common v1.15.1
github.com/medik8s/common v1.17.0
github.com/onsi/ginkgo/v2 v2.9.1
github.com/onsi/gomega v1.27.4
github.com/openshift/api v0.0.0-20230414143018-3367bc7e6ac7 // release-4.13
Expand Down
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -216,8 +216,8 @@ github.com/mailru/easyjson v0.7.6/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJ
github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0=
github.com/matttproud/golang_protobuf_extensions v1.0.2 h1:hAHbPm5IJGijwng3PWk09JkG9WeqChjprR5s9bBZ+OM=
github.com/matttproud/golang_protobuf_extensions v1.0.2/go.mod h1:BSXmuO+STAnVfrANrmjBb36TMTDstsz7MSK+HVaYKv4=
github.com/medik8s/common v1.15.1 h1:qo2FBZGSegf5q35AZlWzrmgwW1GfUPNmmWaPhb/Uurc=
github.com/medik8s/common v1.15.1/go.mod h1:A9jYldC6PZcAuBowNNm712FqWdASB2ey5Vjp8MYN/PY=
github.com/medik8s/common v1.17.0 h1:AmJKx0tzqGZF27Ot0A4ak85q0F0zqUkVyCvYmm67rtY=
github.com/medik8s/common v1.17.0/go.mod h1:A9jYldC6PZcAuBowNNm712FqWdASB2ey5Vjp8MYN/PY=
github.com/mitchellh/copystructure v1.1.2 h1:Th2TIvG1+6ma3e/0/bopBKohOTY7s4dA8V2q4EUcBJ0=
github.com/mitchellh/copystructure v1.1.2/go.mod h1:EBArHfARyrSWO/+Wyr9zwEkc6XMFB9XyNgFNmRkZZU4=
github.com/mitchellh/reflectwalk v1.0.1 h1:FVzMWA5RllMAKIdUSC8mdWo3XtwoecrH79BY70sEEpE=
Expand Down
5 changes: 3 additions & 2 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,8 @@ func initSelfNodeRemediationAgent(mgr manager.Manager) {
timeToAssumeNodeRebootedInSeconds := getIntEnvVarOrDie("TIME_TO_ASSUME_NODE_REBOOTED")
peerHealthDefaultPort := getIntEnvVarOrDie("HOST_PORT")

safeRebootCalc := reboot.NewAgentSafeTimeCalculator(mgr.GetClient(), wd, maxErrorThreshold, apiCheckInterval, apiServerTimeout, peerDialTimeout, peerRequestTimeout, time.Duration(timeToAssumeNodeRebootedInSeconds)*time.Second)
snrRecorder := mgr.GetEventRecorderFor("SelfNodeRemediation")
safeRebootCalc := reboot.NewAgentSafeTimeCalculator(mgr.GetClient(), snrRecorder, wd, maxErrorThreshold, apiCheckInterval, apiServerTimeout, peerDialTimeout, peerRequestTimeout, time.Duration(timeToAssumeNodeRebootedInSeconds)*time.Second)
if err = mgr.Add(safeRebootCalc); err != nil {
setupLog.Error(err, "failed to add safe reboot time calculator to the manager")
os.Exit(1)
Expand Down Expand Up @@ -358,7 +359,7 @@ func initSelfNodeRemediationAgent(mgr manager.Manager) {
Client: mgr.GetClient(),
Log: ctrl.Log.WithName("controllers").WithName("SelfNodeRemediation"),
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("SelfNodeRemediation"),
Recorder: snrRecorder,
Rebooter: rebooter,
MyNodeName: myNodeName,
RestoreNodeAfter: restoreNodeAfter,
Expand Down
70 changes: 66 additions & 4 deletions pkg/reboot/calculator.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,17 +3,22 @@ package reboot
import (
"context"
"fmt"
"strconv"
"time"

"github.com/go-logr/logr"
"github.com/medik8s/common/pkg/events"
commonlabels "github.com/medik8s/common/pkg/labels"

v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/selection"
"k8s.io/client-go/tools/record"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/medik8s/self-node-remediation/api/v1alpha1"
"github.com/medik8s/self-node-remediation/pkg/utils"
"github.com/medik8s/self-node-remediation/pkg/watchdog"
)

Expand Down Expand Up @@ -41,11 +46,12 @@ type safeTimeCalculator struct {
apiCheckInterval, apiServerTimeout, peerDialTimeout, peerRequestTimeout time.Duration
log logr.Logger
k8sClient client.Client
recorder record.EventRecorder
highestCalculatedBatchNumber int
isAgent bool
}

func NewAgentSafeTimeCalculator(k8sClient client.Client, wd watchdog.Watchdog, maxErrorThreshold int, apiCheckInterval, apiServerTimeout, peerDialTimeout, peerRequestTimeout, timeToAssumeNodeRebooted time.Duration) SafeTimeCalculator {
func NewAgentSafeTimeCalculator(k8sClient client.Client, recorder record.EventRecorder, wd watchdog.Watchdog, maxErrorThreshold int, apiCheckInterval, apiServerTimeout, peerDialTimeout, peerRequestTimeout, timeToAssumeNodeRebooted time.Duration) SafeTimeCalculator {
return &safeTimeCalculator{
wd: wd,
maxErrorThreshold: maxErrorThreshold,
Expand All @@ -55,6 +61,7 @@ func NewAgentSafeTimeCalculator(k8sClient client.Client, wd watchdog.Watchdog, m
peerRequestTimeout: peerRequestTimeout,
timeToAssumeNodeRebooted: timeToAssumeNodeRebooted,
k8sClient: k8sClient,
recorder: recorder,
isAgent: true,
log: ctrl.Log.WithName("safe-time-calculator"),
}
Expand Down Expand Up @@ -106,11 +113,66 @@ func (s *safeTimeCalculator) calcMinTimeAssumeRebooted() error {
s.log.Info("calculated minTimeToAssumeNodeRebooted is:", "minTimeToAssumeNodeRebooted", minTime)
s.minTimeToAssumeNodeRebooted = minTime

if s.timeToAssumeNodeRebooted < minTime {
err := fmt.Errorf("snr agent can't start: the requested value for SafeTimeToAssumeNodeRebootedSeconds is too low")
s.log.Error(err, err.Error(), "requested SafeTimeToAssumeNodeRebootedSeconds", s.timeToAssumeNodeRebooted, "minimal calculated value for SafeTimeToAssumeNodeRebootedSeconds", minTime)
//update related logic of min time on configuration if necessary
if err := s.manageSafeRebootTimeInConfiguration(minTime); err != nil {
return err
}

return nil
}

// manageSafeRebootTimeInConfiguration does two things:
// 1. It sets an annotation on the configuration with the most updated value of minTimeToAssumeNodeRebooted in seconds (this value will be used by the webhook to verify user doesn't set an invalid value for SafeTimeToAssumeNodeRebootedSeconds).
// 2. In case SafeTimeToAssumeNodeRebootedSeconds is too low (either because of the default configuration or because minvalue has changed due to an update of another field) it'll set it to a valid value and issue an event.
func (s *safeTimeCalculator) manageSafeRebootTimeInConfiguration(minTime time.Duration) error {
minTimeSec := int(minTime.Seconds())
//set it on configuration
confList := &v1alpha1.SelfNodeRemediationConfigList{}
if err := s.k8sClient.List(context.Background(), confList); err != nil {
s.log.Error(err, "failed to get snr configuration")

return err
}

for _, snrConf := range confList.Items {
var isUpdateRequired bool
if snrConf.GetAnnotations() == nil {
snrConf.Annotations = make(map[string]string)
}
prevMinRebootTimeSec, isSet := snrConf.GetAnnotations()[utils.MinSafeTimeAnnotation]

minTimeSecString := strconv.Itoa(minTimeSec)
if !isSet || prevMinRebootTimeSec != minTimeSecString {
if !isSet {
s.log.Info("snr agent about to modify config adding an annotation", "annotation key", utils.MinSafeTimeAnnotation, "annotation value", minTimeSecString)
} else {
s.log.Info("snr agent about to modify config updating annotation value", "annotation key", utils.MinSafeTimeAnnotation, "annotation previous value", prevMinRebootTimeSec, "annotation new value", minTimeSecString)
}
snrConf.GetAnnotations()[utils.MinSafeTimeAnnotation] = minTimeSecString
isUpdateRequired = true
}

if snrConf.Spec.SafeTimeToAssumeNodeRebootedSeconds < minTimeSec {
isUpdateRequired = true
snrConf.Spec.SafeTimeToAssumeNodeRebootedSeconds = minTimeSec
s.SetTimeToAssumeNodeRebooted(time.Duration(minTimeSec) * time.Second)
msg, reason := "Automatic update since value isn't valid anymore", "SafeTimeToAssumeNodeRebootedSecondsModified"
if !isSet {
msg = "Default value was lower than minimum value"
} else if prevMinRebootTimeSec != minTimeSecString {
msg = "Minimum value has changed and now is greater than configured value"
}
s.log.Info(fmt.Sprintf("%s:%s", reason, msg))
events.NormalEvent(s.recorder, &snrConf, reason, msg)
}

if isUpdateRequired {
if err := s.k8sClient.Update(context.Background(), &snrConf); err != nil {
s.log.Error(err, "failed to update SafeTimeToAssumeNodeRebootedSeconds with min value")
return err
}
}
}
return nil
}

Expand Down
1 change: 1 addition & 0 deletions pkg/utils/annotations.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ const (
// IsRebootCapableAnnotation value is the key name for the node's annotation that will determine if node is reboot capable
IsRebootCapableAnnotation = "is-reboot-capable.self-node-remediation.medik8s.io"
IsSoftwareRebootEnabledEnvVar = "IS_SOFTWARE_REBOOT_ENABLED"
MinSafeTimeAnnotation = "minimum-safe-reboot-sec.self-node-remediation.medik8s.io"
)

// UpdateNodeWithIsRebootCapableAnnotation updates the is-reboot-capable node annotation to be true if any kind
Expand Down
40 changes: 29 additions & 11 deletions vendor/github.com/medik8s/common/pkg/events/events.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading
Loading