Skip to content

Commit

Permalink
Add option for minimal reboot period
Browse files Browse the repository at this point in the history
The flag --min-reboot-period can be used to define the minimal duration
between reboots of a node (for example, 336h for 14 days).

Signed-off-by: Leon Löchner <[email protected]>
  • Loading branch information
leonnicolas committed Feb 29, 2024
1 parent ebb7ccf commit f398712
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 12 deletions.
59 changes: 47 additions & 12 deletions cmd/kured/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -84,11 +84,12 @@ var (
nodeID string
concurrency int

rebootDays []string
rebootStart string
rebootEnd string
timezone string
annotateNodes bool
rebootDays []string
rebootStart string
rebootEnd string
timezone string
minRebootPeriod time.Duration
annotateNodes bool

// Metrics
rebootRequiredGauge = prometheus.NewGaugeVec(prometheus.GaugeOpts{
Expand All @@ -105,6 +106,8 @@ const (
KuredRebootInProgressAnnotation string = "weave.works/kured-reboot-in-progress"
// KuredMostRecentRebootNeededAnnotation is the canonical string value for the kured most-recent-reboot-needed annotation
KuredMostRecentRebootNeededAnnotation string = "weave.works/kured-most-recent-reboot-needed"
// KuredLastSuccessfulRebootAnnotation is the canonical string value for the kured last-successful-reboot annotation
KuredLastSuccessfulRebootAnnotation string = "weave.works/kured-last-successful-reboot"
// EnvPrefix The environment variable prefix of all environment variables bound to our command line flags.
EnvPrefix = "KURED"

Expand Down Expand Up @@ -135,7 +138,8 @@ func NewRootCommand() *cobra.Command {
Short: "Kubernetes Reboot Daemon",
PersistentPreRunE: bindViper,
PreRun: flagCheck,
Run: root}
Run: root,
}

rootCmd.PersistentFlags().StringVar(&nodeID, "node-id", "",
"node name kured runs on, should be passed down from spec.nodeName via KURED_NODE_ID environment variable")
Expand Down Expand Up @@ -218,6 +222,8 @@ func NewRootCommand() *cobra.Command {
"schedule reboot only before this time of day")
rootCmd.PersistentFlags().StringVar(&timezone, "time-zone", "UTC",
"use this timezone for schedule inputs")
rootCmd.PersistentFlags().DurationVar(&minRebootPeriod, "min-reboot-period", 0,
"the minimal duration between reboots of a node. Requires --annotate-nodes")

rootCmd.PersistentFlags().BoolVar(&annotateNodes, "annotate-nodes", false,
"if set, the annotations 'weave.works/kured-reboot-in-progress' and 'weave.works/kured-most-recent-reboot-needed' will be given to nodes undergoing kured reboots")
Expand Down Expand Up @@ -265,6 +271,9 @@ func flagCheck(cmd *cobra.Command, args []string) {
if !reflect.DeepEqual(preRebootNodeLabelKeys, postRebootNodeLabelKeys) {
log.Warnf("pre-reboot-node-labels keys and post-reboot-node-labels keys do not match. This may result in unexpected behaviour.")
}
if !annotateNodes && minRebootPeriod != 0 {
log.Fatal("Cannot use --min-reboot-period without --annotate-nodes")
}
}

// stripQuotes removes any literal single or double quote chars that surround a string
Expand Down Expand Up @@ -317,7 +326,6 @@ func flagToEnvVar(flag string) string {
// buildHostCommand writes a new command to run in the host namespace
// Rancher based need different pid
func buildHostCommand(pid int, command []string) []string {

// From the container, we nsenter into the proper PID to run the hostCommand.
// For this, kured daemonset need to be configured with hostPID:true and privileged:true
cmd := []string{"/usr/bin/nsenter", fmt.Sprintf("-m/proc/%d/ns/mnt", pid), "--"}
Expand Down Expand Up @@ -400,7 +408,8 @@ func (kb KubernetesBlockingChecker) isBlocked() bool {
podList, err := kb.client.CoreV1().Pods("").List(context.TODO(), metav1.ListOptions{
LabelSelector: labelSelector,
FieldSelector: fieldSelector,
Limit: 10})
Limit: 10,
})
if err != nil {
log.Warnf("Reboot blocked: pod query error: %v", err)
return true
Expand Down Expand Up @@ -694,6 +703,11 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str
continue
}
}
if minRebootPeriod != 0 {
if err := addNodeAnnotations(client, nodeID, map[string]string{KuredLastSuccessfulRebootAnnotation: time.Now().Format(time.RFC3339)}); err != nil {
continue
}
}
}
throttle(releaseDelay)
release(lock, concurrency > 1)
Expand Down Expand Up @@ -725,16 +739,22 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str
continue
}

node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil {
log.Fatalf("Error retrieving node object via k8s API: %v", err)
}

if lastSuccessfulRebootWithinMinRebootPeriod(node) {
log.Infof("Last successful reboot within minimal reboot period")
continue
}

if !rebootRequired(sentinelCommand) {
log.Infof("Reboot not required")
preferNoScheduleTaint.Disable()
continue
}

node, err := client.CoreV1().Nodes().Get(context.TODO(), nodeID, metav1.GetOptions{})
if err != nil {
log.Fatalf("Error retrieving node object via k8s API: %v", err)
}
nodeMeta.Unschedulable = node.Spec.Unschedulable

var timeNowString string
Expand Down Expand Up @@ -804,6 +824,21 @@ func rebootAsRequired(nodeID string, booter reboot.Reboot, sentinelCommand []str
}
}

// lastSuccessfulRebootWithinMinRebootPeriod reports whether the time recorded
// in the node's KuredLastSuccessfulRebootAnnotation is less than
// minRebootPeriod in the past.
//
// It returns false when minRebootPeriod is zero, when the node carries no such
// annotation, or when the annotation value cannot be parsed as RFC 3339 (a
// warning is logged in that last case).
func lastSuccessfulRebootWithinMinRebootPeriod(node *v1.Node) bool {
	// A zero period means the check is switched off.
	if minRebootPeriod == 0 {
		return false
	}

	stamp, found := node.GetAnnotations()[KuredLastSuccessfulRebootAnnotation]
	if !found {
		return false
	}

	lastReboot, parseErr := time.Parse(time.RFC3339, stamp)
	if parseErr != nil {
		// An unreadable timestamp is treated as "no recent reboot".
		log.Warnf("failed to parse time %q in annotation %q: %s", stamp, KuredLastSuccessfulRebootAnnotation, parseErr.Error())
		return false
	}

	// The node is still inside the quiet period while "now" precedes this instant.
	earliestNextReboot := lastReboot.Add(minRebootPeriod)
	return time.Now().Before(earliestNextReboot)
}

// buildSentinelCommand creates the shell command line which will need wrapping to escape
// the container boundaries
func buildSentinelCommand(rebootSentinelFile string, rebootSentinelCommand string) []string {
Expand Down
1 change: 1 addition & 0 deletions kured-ds-signal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,4 @@ spec:
# - --annotate-nodes=false
# - --lock-release-delay=30m
# - --log-format=text
# - --min-reboot-period=336h
1 change: 1 addition & 0 deletions kured-ds.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -100,3 +100,4 @@ spec:
# - --metrics-host=""
# - --metrics-port=8080
# - --concurrency=1
# - --min-reboot-period=336h

0 comments on commit f398712

Please sign in to comment.