Skip to content

Commit

Permalink
Annotate node with machine state leading to termination
Browse files Browse the repository at this point in the history
  • Loading branch information
guydc committed Aug 2, 2020
1 parent 8f36522 commit 2a1a1f0
Show file tree
Hide file tree
Showing 5 changed files with 346 additions and 36 deletions.
55 changes: 51 additions & 4 deletions pkg/controller/controller_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ import (

"github.com/gardener/machine-controller-manager/pkg/apis/machine/v1alpha1"
machineapi "github.com/gardener/machine-controller-manager/pkg/client/clientset/versioned/typed/machine/v1alpha1"
annotationutils "github.com/gardener/machine-controller-manager/pkg/util/annotations"
hashutil "github.com/gardener/machine-controller-manager/pkg/util/hash"
taintutils "github.com/gardener/machine-controller-manager/pkg/util/taints"
v1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -88,8 +89,8 @@ const (
SlowStartInitialBatchSize = 1
)

// UpdateTaintBackoff is the backoff period used while updating taint
var UpdateTaintBackoff = wait.Backoff{
// UpdateNodeBackoff is the backoff period used while updating nodes
var UpdateNodeBackoff = wait.Backoff{
Steps: 5,
Duration: 100 * time.Millisecond,
Jitter: 1.0,
Expand Down Expand Up @@ -901,7 +902,7 @@ func AddOrUpdateTaintOnNode(c clientset.Interface, nodeName string, taints ...*v
return nil
}
firstTry := true
return clientretry.RetryOnConflict(UpdateTaintBackoff, func() error {
return clientretry.RetryOnConflict(UpdateNodeBackoff, func() error {
var err error
var oldNode *v1.Node
// First we try getting node from the API server cache, as it's cheaper. If it fails
Expand Down Expand Up @@ -958,7 +959,7 @@ func RemoveTaintOffNode(c clientset.Interface, nodeName string, node *v1.Node, t
}

firstTry := true
return clientretry.RetryOnConflict(UpdateTaintBackoff, func() error {
return clientretry.RetryOnConflict(UpdateNodeBackoff, func() error {
var err error
var oldNode *v1.Node
// First we try getting node from the API server cache, as it's cheaper. If it fails
Expand Down Expand Up @@ -1032,6 +1033,52 @@ func UpdateNodeTaints(c clientset.Interface, nodeName string, oldNode *v1.Node,
return nil
}

// AddOrUpdateAnnotationsOnNode applies the given annotations to the node named nodeName.
// When annotations is nil or empty it returns nil without issuing any API call. Otherwise
// the node is fetched, every key/value pair is applied through
// annotationutils.AddOrUpdateAnnotation, and the result is written back with
// UpdateNodeAnnotations. The whole sequence is wrapped in clientretry.RetryOnConflict
// using UpdateNodeBackoff. An error from the node Get is returned as-is, so callers can
// inspect it (for example with apierrors.IsNotFound).
func AddOrUpdateAnnotationsOnNode(c clientset.Interface, nodeName string, annotations map[string]string) error {
	// A nil *or empty* map means there is nothing to apply. Checking only for nil would
	// let an empty map through, leaving no updated node object to hand to
	// UpdateNodeAnnotations, which would then dereference a nil pointer.
	if len(annotations) == 0 {
		return nil
	}

	firstTry := true
	return clientretry.RetryOnConflict(UpdateNodeBackoff, func() error {
		var err error
		var oldNode *v1.Node
		// The first attempt asks for ResourceVersion "0"; the sibling taint helpers in this
		// file describe that as reading from the API server cache because it is cheaper.
		// Subsequent attempts use default get options.
		if firstTry {
			oldNode, err = c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{ResourceVersion: "0"})
			firstTry = false
		} else {
			oldNode, err = c.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
		}
		if err != nil {
			return err
		}

		// Thread the node through the helper once per annotation; each call receives the
		// result of the previous one so that all annotations accumulate on newNode.
		newNode := oldNode
		for key, value := range annotations {
			newNode = annotationutils.AddOrUpdateAnnotation(newNode, key, value)
		}
		return UpdateNodeAnnotations(c, nodeName, oldNode, newNode)
	})
}

// UpdateNodeAnnotations writes the annotations of newNode onto a deep copy of oldNode and
// submits that copy through the nodes Update() method. Everything other than the
// annotations is taken from oldNode. nodeName is used only to build error messages.
// It returns an error if either node is nil or if the Update call fails.
func UpdateNodeAnnotations(c clientset.Interface, nodeName string, oldNode *v1.Node, newNode *v1.Node) error {
	// Both objects are dereferenced below; fail with an error rather than panicking.
	if oldNode == nil || newNode == nil {
		return fmt.Errorf("failed to create update annotations for node %q: node object is nil", nodeName)
	}

	newNodeClone := oldNode.DeepCopy()
	newNodeClone.ObjectMeta.Annotations = newNode.ObjectMeta.Annotations

	if _, err := c.CoreV1().Nodes().Update(newNodeClone); err != nil {
		// Wrap with %w instead of %v so the underlying API status error stays reachable
		// via errors.Is / errors.As; formatting it with %v flattened it to text, hiding
		// conflict errors from retry logic in callers.
		// NOTE(review): whether apierrors.IsConflict sees through the wrapper depends on
		// the vendored apimachinery version — confirm, or return err unwrapped.
		return fmt.Errorf("failed to create update annotations for node %q: %w", nodeName, err)
	}

	return nil
}

// WaitForCacheSync is a wrapper around cache.WaitForCacheSync that generates log messages
// indicating that the controller identified by controllerName is waiting for syncs, followed by
// either a successful or failed sync.
Expand Down
72 changes: 40 additions & 32 deletions pkg/controller/machine.go
Original file line number Diff line number Diff line change
Expand Up @@ -565,10 +565,50 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv

if finalizers := sets.NewString(machine.Finalizers...); finalizers.Has(DeleteFinalizerName) {
klog.V(2).Infof("Deleting Machine %q", machine.Name)
var (
forceDeletePods = false
forceDeleteMachine = false
timeOutOccurred = false
maxEvictRetries = int32(math.Min(float64(*c.getEffectiveMaxEvictRetries(machine)), c.getEffectiveDrainTimeout(machine).Seconds()/PodEvictionRetryInterval.Seconds()))
pvDetachTimeOut = c.safetyOptions.PvDetachTimeout.Duration
timeOutDuration = c.getEffectiveDrainTimeout(machine).Duration
forceDeleteLabelPresent = machine.Labels["force-deletion"] == "True"
)

// Timeout value obtained by subtracting last operation with expected time out period
timeOut := metav1.Now().Add(-timeOutDuration).Sub(machine.Status.CurrentStatus.LastUpdateTime.Time)
timeOutOccurred = timeOut > 0

if forceDeleteLabelPresent || timeOutOccurred {
// To perform forceful machine drain/delete either one of the below conditions must be satisfied
// 1. force-deletion: "True" label must be present
// 2. Deletion operation is more than drain-timeout minutes old
forceDeleteMachine = true
forceDeletePods = true
timeOutDuration = 1 * time.Minute
maxEvictRetries = 1

klog.V(2).Infof(
"Force deletion has been triggerred for machine %q due to ForceDeletionLabel:%t, Timeout:%t",
machine.Name,
forceDeleteLabelPresent,
timeOutOccurred,
)
}

// If machine was created on the cloud provider
machineID, _ := driver.GetExisting()

// update node with the machine's state prior to termination
if err = c.AnnotateTerminatingMachineNode(machine); err != nil {
if forceDeleteMachine {
klog.Warningf("Annotation of node failed: %v. However, since it's a force deletion shall continue deletion of VM.", err)
} else {
klog.Error(err)
return err
}
}

if machine.Status.CurrentStatus.Phase != v1alpha1.MachineTerminating {
lastOperation := v1alpha1.LastOperation{
Description: "Deleting machine from cloud provider",
Expand Down Expand Up @@ -601,38 +641,6 @@ func (c *controller) machineDelete(machine *v1alpha1.Machine, driver driver.Driv

if machineID != "" && nodeName != "" {
// Begin drain logic only when the nodeName & providerID exist for the machine

var (
forceDeletePods = false
forceDeleteMachine = false
timeOutOccurred = false
maxEvictRetries = int32(math.Min(float64(*c.getEffectiveMaxEvictRetries(machine)), c.getEffectiveDrainTimeout(machine).Seconds()/PodEvictionRetryInterval.Seconds()))
pvDetachTimeOut = c.safetyOptions.PvDetachTimeout.Duration
timeOutDuration = c.getEffectiveDrainTimeout(machine).Duration
forceDeleteLabelPresent = machine.Labels["force-deletion"] == "True"
)

// Timeout value obtained by subtracting last operation with expected time out period
timeOut := metav1.Now().Add(-timeOutDuration).Sub(machine.Status.CurrentStatus.LastUpdateTime.Time)
timeOutOccurred = timeOut > 0

if forceDeleteLabelPresent || timeOutOccurred {
// To perform forceful machine drain/delete either one of the below conditions must be satisfied
// 1. force-deletion: "True" label must be present
// 2. Deletion operation is more than drain-timeout minutes old
forceDeleteMachine = true
forceDeletePods = true
timeOutDuration = 1 * time.Minute
maxEvictRetries = 1

klog.V(2).Infof(
"Force deletion has been triggerred for machine %q due to ForceDeletionLabel:%t, Timeout:%t",
machine.Name,
forceDeleteLabelPresent,
timeOutOccurred,
)
}

buf := bytes.NewBuffer([]byte{})
errBuf := bytes.NewBuffer([]byte{})

Expand Down
22 changes: 22 additions & 0 deletions pkg/controller/machine_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ package controller

import (
"encoding/json"
"fmt"

"github.com/gardener/machine-controller-manager/pkg/apis/machine/validation"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/klog"

machineapi "github.com/gardener/machine-controller-manager/pkg/apis/machine"
Expand All @@ -40,6 +42,8 @@ import (
const (
// LastAppliedALTAnnotation contains the last configuration of annotations, labels & taints applied on the node object
LastAppliedALTAnnotation = "node.machine.sapcloud.io/last-applied-anno-labels-taints"
// NodeTerminationAnnotation is the node annotation key whose value records the phase
// the machine was in before it moved to termination.
NodeTerminationAnnotation = "node.machine.sapcloud.io/termination-phase"
)

var (
Expand Down Expand Up @@ -496,3 +500,21 @@ func SyncMachineTaints(

return toBeUpdated
}

// AnnotateTerminatingMachineNode records the machine's current phase on its node under
// the NodeTerminationAnnotation key, so the state that preceded termination remains
// visible on the node object.
//
// It is a no-op (returns nil) when the machine has no phase set, is already in the
// MachineTerminating phase, has no node name in its status, or when the node is
// reported as not found. Any other error from updating the node is returned.
func (c *controller) AnnotateTerminatingMachineNode(machine *v1alpha1.Machine) error {
	phase := machine.Status.CurrentStatus.Phase
	if phase == "" || phase == v1alpha1.MachineTerminating {
		return nil
	}

	// A machine that has no node recorded in its status has nothing to annotate.
	// Without this guard the node lookup would run with an empty name and its error
	// would abort a non-forced deletion.
	nodeName := machine.Status.Node
	if nodeName == "" {
		return nil
	}

	annotations := map[string]string{NodeTerminationAnnotation: fmt.Sprintf("%v", phase)}
	err := AddOrUpdateAnnotationsOnNode(c.targetCoreClient, nodeName, annotations)
	// The node being gone already is not a failure for this purpose.
	if apierrors.IsNotFound(err) {
		return nil
	}
	return err
}
Loading

0 comments on commit 2a1a1f0

Please sign in to comment.