Merge pull request #1808 from clobrano/feature/out-of-service-taint-0
✨ Use Out-of-service taint in Node remediation in place of deletion
metal3-io-bot authored Jul 26, 2024
2 parents a32f1f5 + c6cd725 commit a599911
Showing 11 changed files with 441 additions and 80 deletions.
96 changes: 96 additions & 0 deletions baremetal/metal3remediation_manager.go
@@ -27,6 +27,7 @@ import (
infrav1 "github.com/metal3-io/cluster-api-provider-metal3/api/v1beta1"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1 "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -76,6 +77,16 @@ type RemediationManagerInterface interface {
SetNodeBackupAnnotations(annotations string, labels string) bool
GetNodeBackupAnnotations() (annotations, labels string)
RemoveNodeBackupAnnotations()
AddOutOfServiceTaint(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) error
RemoveOutOfServiceTaint(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) error
HasOutOfServiceTaint(node *corev1.Node) bool
IsNodeDrained(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) bool
}

var outOfServiceTaint = &corev1.Taint{
Key: "node.kubernetes.io/out-of-service",
Value: "nodeshutdown",
Effect: corev1.TaintEffectNoExecute,
}

// RemediationManager is responsible for performing remediation reconciliation.
@@ -476,3 +487,88 @@ func (r *RemediationManager) RemoveNodeBackupAnnotations() {
func (r *RemediationManager) getPowerOffAnnotationKey() string {
return fmt.Sprintf(powerOffAnnotation, r.Metal3Remediation.UID)
}

func (r *RemediationManager) HasOutOfServiceTaint(node *corev1.Node) bool {
for _, taint := range node.Spec.Taints {
if taint.MatchTaint(outOfServiceTaint) {
return true
}
}
return false
}

func (r *RemediationManager) AddOutOfServiceTaint(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) error {
taint := outOfServiceTaint
now := metav1.Now()
taint.TimeAdded = &now
node.Spec.Taints = append(node.Spec.Taints, *taint)
if err := r.UpdateNode(ctx, clusterClient, node); err != nil {
msg := fmt.Sprintf("failed to add out-of-service taint on node %s", node.Name)
r.Log.Error(err, msg)
return errors.Wrap(err, msg)
}
r.Log.Info("Out-of-service taint added", "node", node.Name)
return nil
}

func (r *RemediationManager) RemoveOutOfServiceTaint(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) error {
newTaints := []corev1.Taint{}

var isPopOutOfServiceTaint bool
for _, taint := range node.Spec.Taints {
if taint.MatchTaint(outOfServiceTaint) {
isPopOutOfServiceTaint = true
continue
}
newTaints = append(newTaints, taint)
}

if isPopOutOfServiceTaint {
r.Log.Info("Removing out-of-service taint from node taints", "node", node.Name)
node.Spec.Taints = newTaints
} else {
r.Log.Info("Out-of-service taint not found. Nothing to do", "node name", node.Name)
return nil
}

if err := r.UpdateNode(ctx, clusterClient, node); err != nil {
msg := fmt.Sprintf("failed to remove out-of-service taint on node %s", node.Name)
r.Log.Error(err, msg)
return errors.Wrap(err, msg)
}

r.Log.Info("Out-of-service taint removed", "node", node.Name)
return nil
}

func (r *RemediationManager) IsNodeDrained(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) bool {
pods, err := clusterClient.Pods("").List(ctx, metav1.ListOptions{})
if err != nil {
r.Log.Error(err, "failed to get pod list in the cluster")
return false
}

for _, pod := range pods.Items {
if pod.Spec.NodeName == node.Name {
if pod.ObjectMeta.DeletionTimestamp != nil {
r.Log.Info("Waiting for terminating pod", "node", node.Name, "pod name", pod.Name, "phase", pod.Status.Phase)
return false
}
}
}

volumeAttachments := &storagev1.VolumeAttachmentList{}
if err := r.Client.List(ctx, volumeAttachments); err != nil {
r.Log.Error(err, "failed to get volumeAttachments list")
return false
}
for _, va := range volumeAttachments.Items {
if va.Spec.NodeName == node.Name {
r.Log.Info("Waiting for volumeAttachment to be deleted", "node", node.Name, "name", va.Name)
return false
}
}

r.Log.Info("Node is drained", "node", node.Name)
return true
}
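
The methods above implement a taint-based drain: mark the node out of service, wait until no pods on it are still terminating and no VolumeAttachments remain, then clear the taint once the node is healthy again. The sketch below shows how they are meant to be sequenced and mirrors the controller changes further down; the standalone helper, its requeue convention, and the baremetal import path are illustrative assumptions, not code from this PR.

package example

import (
    "context"

    "github.com/metal3-io/cluster-api-provider-metal3/baremetal"
    corev1 "k8s.io/api/core/v1"
    v1 "k8s.io/client-go/kubernetes/typed/core/v1"
)

// remediateWithOutOfServiceTaint sketches the intended sequencing of the new
// RemediationManagerInterface methods; the reconciler below does the same work
// inside reconcileNormal and remediateRebootStrategy.
func remediateWithOutOfServiceTaint(ctx context.Context, mgr baremetal.RemediationManagerInterface,
    clusterClient v1.CoreV1Interface, node *corev1.Node) (requeue bool, err error) {
    if !mgr.HasOutOfServiceTaint(node) {
        // Taint the node instead of deleting it, then requeue so pods have time
        // to receive their deletion timestamps before the drain check runs.
        if err := mgr.AddOutOfServiceTaint(ctx, clusterClient, node); err != nil {
            return false, err
        }
        return true, nil
    }
    if !mgr.IsNodeDrained(ctx, clusterClient, node) {
        // Pods are still terminating or VolumeAttachments are still bound; keep waiting.
        return true, nil
    }
    // The node is drained: the reconciler can proceed with the power cycle and
    // later call mgr.RemoveOutOfServiceTaint once the node is healthy again.
    return false, nil
}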
56 changes: 56 additions & 0 deletions baremetal/mocks/zz_generated.metal3remediation_manager.go

Some generated files are not rendered by default.

13 changes: 13 additions & 0 deletions config/rbac/role.yaml
@@ -118,6 +118,12 @@ rules:
- patch
- update
- watch
- apiGroups:
- ""
resources:
- pods
verbs:
- list
- apiGroups:
- infrastructure.cluster.x-k8s.io
resources:
@@ -336,3 +342,10 @@ rules:
- get
- patch
- update
- apiGroups:
- storage.k8s.io
resources:
- volumeattachments
verbs:
- list
- watch
83 changes: 55 additions & 28 deletions controllers/metal3remediation_controller.go
Expand Up @@ -39,10 +39,13 @@ import (
// Metal3RemediationReconciler reconciles a Metal3Remediation object.
type Metal3RemediationReconciler struct {
client.Client
ManagerFactory baremetal.ManagerFactoryInterface
Log logr.Logger
ManagerFactory baremetal.ManagerFactoryInterface
Log logr.Logger
IsOutOfServiceTaintEnabled bool
}

// +kubebuilder:rbac:groups=core,resources=pods,verbs=list
// +kubebuilder:rbac:groups=storage.k8s.io,resources=volumeattachments,verbs=list;watch
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=metal3remediations,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=metal3remediations/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch;create;update;patch;delete
@@ -206,15 +209,25 @@ func (r *Metal3RemediationReconciler) reconcileNormal(ctx context.Context,
// Restore node if available and not done yet
if remediationMgr.HasFinalizer() {
if node != nil {
// Node was recreated, restore annotations and labels
r.Log.Info("Restoring the node")
if err := r.restoreNode(ctx, remediationMgr, clusterClient, node); err != nil {
return ctrl.Result{}, err
if r.IsOutOfServiceTaintEnabled {
if remediationMgr.HasOutOfServiceTaint(node) {
if err := remediationMgr.RemoveOutOfServiceTaint(ctx, clusterClient, node); err != nil {
return ctrl.Result{}, errors.Wrapf(err, "error removing out-of-service taint from node %s", node.Name)
}
}
} else {
// Node was recreated, restore annotations and labels
r.Log.Info("Restoring the node")
if err := r.restoreNode(ctx, remediationMgr, clusterClient, node); err != nil {
return ctrl.Result{}, err
}
}

// clean up
r.Log.Info("Remediation done, cleaning up remediation CR")
remediationMgr.RemoveNodeBackupAnnotations()
if !r.IsOutOfServiceTaintEnabled {
remediationMgr.RemoveNodeBackupAnnotations()
}
remediationMgr.UnsetFinalizer()
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
} else if isNodeForbidden {
@@ -319,29 +332,43 @@ func (r *Metal3RemediationReconciler) remediateRebootStrategy(ctx context.Context,
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}

// if we have a node, store annotations and labels, and delete it
if node != nil {
/*
Delete the node only after the host is powered off. Otherwise, if we would delete the node
when the host is powered on, the scheduler would assign the workload to other nodes, with the
possibility that two instances of the same application are running in parallel. This might result
in corruption or other issues for applications with singleton requirement. After the host is powered
off we know for sure that it is safe to re-assign that workload to other nodes.
*/
modified := r.backupNode(remediationMgr, node)
if modified {
r.Log.Info("Backing up node")
// save annotations before deleting node
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
}
r.Log.Info("Deleting node")
err := remediationMgr.DeleteNode(ctx, clusterClient, node)
if err != nil {
r.Log.Error(err, "error deleting node")
return ctrl.Result{}, errors.Wrap(err, "error deleting node")
if r.IsOutOfServiceTaintEnabled {
if !remediationMgr.HasOutOfServiceTaint(node) {
if err := remediationMgr.AddOutOfServiceTaint(ctx, clusterClient, node); err != nil {
return ctrl.Result{}, err
}
// If we immediately check if the node is drained, we might find no pods with
// Deletion timestamp set yet and assume the node is drained.
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
}

if !remediationMgr.IsNodeDrained(ctx, clusterClient, node) {
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
} else {
/*
Delete the node only after the host is powered off. Otherwise, if we would delete the node
when the host is powered on, the scheduler would assign the workload to other nodes, with the
possibility that two instances of the same application are running in parallel. This might result
in corruption or other issues for applications with singleton requirement. After the host is powered
off we know for sure that it is safe to re-assign that workload to other nodes.
*/
modified := r.backupNode(remediationMgr, node)
if modified {
r.Log.Info("Backing up node")
// save annotations before deleting node
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
}
r.Log.Info("Deleting node")
err := remediationMgr.DeleteNode(ctx, clusterClient, node)
if err != nil {
r.Log.Error(err, "error deleting node")
return ctrl.Result{}, errors.Wrap(err, "error deleting node")
}
// wait until node is gone
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
// wait until node is gone
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}

// we are done for this phase, switch to waiting for power on and the node restore
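
The new IsOutOfServiceTaintEnabled field has to be set when the reconciler is constructed; that wiring is not part of this excerpt. Below is a minimal sketch of how it could be plumbed from a command-line option in main.go. The flag name, its default, and the NewManagerFactory constructor are assumptions for illustration, not taken from this PR.

package main

import (
    "flag"
    "os"

    "github.com/metal3-io/cluster-api-provider-metal3/baremetal"
    "github.com/metal3-io/cluster-api-provider-metal3/controllers"
    ctrl "sigs.k8s.io/controller-runtime"
)

func main() {
    // Illustrative wiring only: the flag name and default are assumptions.
    var enableOutOfServiceTaint bool
    flag.BoolVar(&enableOutOfServiceTaint, "enable-out-of-service-taint", false,
        "Remediate unhealthy nodes by applying the node.kubernetes.io/out-of-service taint instead of deleting them.")
    flag.Parse()

    mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{})
    if err != nil {
        os.Exit(1)
    }

    reconciler := &controllers.Metal3RemediationReconciler{
        Client:                     mgr.GetClient(),
        ManagerFactory:             baremetal.NewManagerFactory(mgr.GetClient()), // constructor name assumed
        Log:                        ctrl.Log.WithName("controllers").WithName("Metal3Remediation"),
        IsOutOfServiceTaintEnabled: enableOutOfServiceTaint,
    }
    _ = reconciler // SetupWithManager and mgr.Start then follow as in the existing main.go.
}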
(The remaining changed files in this commit are not rendered here.)