✨ Use Out-of-service taint in Node remediation in place of deletion #1808

Merged
96 changes: 96 additions & 0 deletions baremetal/metal3remediation_manager.go
@@ -27,6 +27,7 @@ import (
infrav1 "github.com/metal3-io/cluster-api-provider-metal3/api/v1beta1"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1 "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -76,6 +77,16 @@ type RemediationManagerInterface interface {
SetNodeBackupAnnotations(annotations string, labels string) bool
GetNodeBackupAnnotations() (annotations, labels string)
RemoveNodeBackupAnnotations()
AddOutOfServiceTaint(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) error
RemoveOutOfServiceTaint(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) error
HasOutOfServiceTaint(node *corev1.Node) bool
IsNodeDrained(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) bool
}

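// outOfServiceTaint marks a Node for Kubernetes' non-graceful node shutdown
// handling: when a shut-down (NotReady) Node carries this NoExecute taint,
// kube-controller-manager force-deletes the pods running on it and detaches
// their volumes, so stateful workloads can fail over to healthy Nodes.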
var outOfServiceTaint = &corev1.Taint{
Key: "node.kubernetes.io/out-of-service",
Value: "nodeshutdown",
Effect: corev1.TaintEffectNoExecute,
}

// RemediationManager is responsible for performing remediation reconciliation.
@@ -476,3 +487,88 @@ func (r *RemediationManager) RemoveNodeBackupAnnotations() {
func (r *RemediationManager) getPowerOffAnnotationKey() string {
return fmt.Sprintf(powerOffAnnotation, r.Metal3Remediation.UID)
}

func (r *RemediationManager) HasOutOfServiceTaint(node *corev1.Node) bool {
for _, taint := range node.Spec.Taints {
if taint.MatchTaint(outOfServiceTaint) {
return true
}
}
return false
}

func (r *RemediationManager) AddOutOfServiceTaint(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) error {
// Work on a copy so that setting TimeAdded does not mutate the shared
// package-level outOfServiceTaint definition.
taint := *outOfServiceTaint
now := metav1.Now()
taint.TimeAdded = &now
node.Spec.Taints = append(node.Spec.Taints, taint)
if err := r.UpdateNode(ctx, clusterClient, node); err != nil {
msg := fmt.Sprintf("failed to add out-of-service taint on node %s", node.Name)
r.Log.Error(err, msg)
return errors.Wrap(err, msg)
}
r.Log.Info("Out-of-service taint added", "node", node.Name)
return nil
}

func (r *RemediationManager) RemoveOutOfServiceTaint(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) error {
newTaints := []corev1.Taint{}

var isPopOutOfServiceTaint bool
for _, taint := range node.Spec.Taints {
if taint.MatchTaint(outOfServiceTaint) {
isPopOutOfServiceTaint = true
continue
}
newTaints = append(newTaints, taint)
}

if isPopOutOfServiceTaint {
r.Log.Info("Removing out-of-service taint from node taints", "node", node.Name)
node.Spec.Taints = newTaints
} else {
r.Log.Info("Out-of-service taint not found. Nothing to do", "node name", node.Name)
return nil
}

if err := r.UpdateNode(ctx, clusterClient, node); err != nil {
msg := fmt.Sprintf("failed to remove out-of-service taint on node %s", node.Name)
r.Log.Error(err, msg)
return errors.Wrap(err, msg)
}

r.Log.Info("Out-of-service taint removed", "node", node.Name)
return nil
}

func (r *RemediationManager) IsNodeDrained(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) bool {
pods, err := clusterClient.Pods("").List(ctx, metav1.ListOptions{})
if err != nil {
r.Log.Error(err, "failed to get pod list in the cluster")
return false
}

for _, pod := range pods.Items {
if pod.Spec.NodeName == node.Name {
if pod.ObjectMeta.DeletionTimestamp != nil {
r.Log.Info("Waiting for terminating pod", "node", node.Name, "pod name", pod.Name, "phase", pod.Status.Phase)
return false
}
}
}

volumeAttachments := &storagev1.VolumeAttachmentList{}
if err := r.Client.List(ctx, volumeAttachments); err != nil {
r.Log.Error(err, "failed to get volumeAttachments list")
return false
}
for _, va := range volumeAttachments.Items {
if va.Spec.NodeName == node.Name {
r.Log.Info("Waiting for deleting volumeAttachement", "node", node.Name, "name", va.Name)
return false
}
}

r.Log.Info("Node is drained", "node", node.Name)
return true
}
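
For orientation, here is a minimal illustrative sketch (not part of this PR) of the call sequence the new interface methods support, condensed from the controller changes further below. mgr, ctx, clusterClient, and node are assumed to be in scope; error handling and requeue timing are omitted.

// Illustrative only: simplified remediation flow using the new helpers.
if !mgr.HasOutOfServiceTaint(node) {
	// Taint the unhealthy node; Kubernetes then force-deletes its pods and
	// detaches their volumes, instead of the controller deleting the Node object.
	return mgr.AddOutOfServiceTaint(ctx, clusterClient, node)
}
if !mgr.IsNodeDrained(ctx, clusterClient, node) {
	return nil // not drained yet: requeue and check again later
}
// Drained: remediation can power-cycle the host. RemoveOutOfServiceTaint is
// called later, once the Node reports healthy again (see reconcileNormal below).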
56 changes: 56 additions & 0 deletions baremetal/mocks/zz_generated.metal3remediation_manager.go

Some generated files are not rendered by default.

13 changes: 13 additions & 0 deletions config/rbac/role.yaml
@@ -118,6 +118,12 @@ rules:
- patch
- update
- watch
- apiGroups:
- ""
resources:
- pods
verbs:
- list
- apiGroups:
- infrastructure.cluster.x-k8s.io
resources:
@@ -336,3 +342,10 @@ rules:
- get
- patch
- update
- apiGroups:
- storage.k8s.io
resources:
- volumeattachments
verbs:
- list
- watch
83 changes: 55 additions & 28 deletions controllers/metal3remediation_controller.go
@@ -39,10 +39,13 @@ import (
// Metal3RemediationReconciler reconciles a Metal3Remediation object.
type Metal3RemediationReconciler struct {
client.Client
ManagerFactory baremetal.ManagerFactoryInterface
Log logr.Logger
ManagerFactory baremetal.ManagerFactoryInterface
Log logr.Logger
IsOutOfServiceTaintEnabled bool
}

// +kubebuilder:rbac:groups=core,resources=pods,verbs=list
// +kubebuilder:rbac:groups=storage.k8s.io,resources=volumeattachments,verbs=list;watch
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=metal3remediations,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=metal3remediations/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch;create;update;patch;delete
@@ -206,15 +209,25 @@ func (r *Metal3RemediationReconciler) reconcileNormal(ctx context.Context,
// Restore node if available and not done yet
if remediationMgr.HasFinalizer() {
if node != nil {
// Node was recreated, restore annotations and labels
r.Log.Info("Restoring the node")
if err := r.restoreNode(ctx, remediationMgr, clusterClient, node); err != nil {
return ctrl.Result{}, err
if r.IsOutOfServiceTaintEnabled {
if remediationMgr.HasOutOfServiceTaint(node) {
if err := remediationMgr.RemoveOutOfServiceTaint(ctx, clusterClient, node); err != nil {
return ctrl.Result{}, errors.Wrapf(err, "error removing out-of-service taint from node %s", node.Name)
}
}
} else {
// Node was recreated, restore annotations and labels
r.Log.Info("Restoring the node")
if err := r.restoreNode(ctx, remediationMgr, clusterClient, node); err != nil {
return ctrl.Result{}, err
}
}

// clean up
r.Log.Info("Remediation done, cleaning up remediation CR")
remediationMgr.RemoveNodeBackupAnnotations()
if !r.IsOutOfServiceTaintEnabled {
remediationMgr.RemoveNodeBackupAnnotations()
}
remediationMgr.UnsetFinalizer()
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
} else if isNodeForbidden {
@@ -319,29 +332,43 @@ func (r *Metal3RemediationReconciler) remediateRebootStrategy(ctx context.Context,
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}

// if we have a node, store annotations and labels, and delete it
if node != nil {
/*
Delete the node only after the host is powered off. Otherwise, if we would delete the node
when the host is powered on, the scheduler would assign the workload to other nodes, with the
possibility that two instances of the same application are running in parallel. This might result
in corruption or other issues for applications with singleton requirement. After the host is powered
off we know for sure that it is safe to re-assign that workload to other nodes.
*/
modified := r.backupNode(remediationMgr, node)
if modified {
r.Log.Info("Backing up node")
// save annotations before deleting node
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
}
r.Log.Info("Deleting node")
err := remediationMgr.DeleteNode(ctx, clusterClient, node)
if err != nil {
r.Log.Error(err, "error deleting node")
return ctrl.Result{}, errors.Wrap(err, "error deleting node")
if r.IsOutOfServiceTaintEnabled {
if !remediationMgr.HasOutOfServiceTaint(node) {
if err := remediationMgr.AddOutOfServiceTaint(ctx, clusterClient, node); err != nil {
return ctrl.Result{}, err
}
// If we immediately check if the node is drained, we might find no pods with
// Deletion timestamp set yet and assume the node is drained.
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
}

if !remediationMgr.IsNodeDrained(ctx, clusterClient, node) {
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
} else {
/*
Delete the node only after the host is powered off. Otherwise, if we would delete the node
when the host is powered on, the scheduler would assign the workload to other nodes, with the
possibility that two instances of the same application are running in parallel. This might result
in corruption or other issues for applications with singleton requirement. After the host is powered
off we know for sure that it is safe to re-assign that workload to other nodes.
*/
modified := r.backupNode(remediationMgr, node)
if modified {
r.Log.Info("Backing up node")
// save annotations before deleting node
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
}
r.Log.Info("Deleting node")
err := remediationMgr.DeleteNode(ctx, clusterClient, node)
if err != nil {
r.Log.Error(err, "error deleting node")
return ctrl.Result{}, errors.Wrap(err, "error deleting node")
}
// wait until node is gone
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
// wait until node is gone
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}

// we are done for this phase, switch to waiting for power on and the node restore
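
The diff above does not show how IsOutOfServiceTaintEnabled is populated. A hypothetical wiring, with the flag name and surrounding setup assumed purely for illustration, would be a fragment of main.go along these lines:

// Hypothetical wiring, not part of this PR; flag name and setup are assumptions.
var enableOutOfServiceTaint bool
flag.BoolVar(&enableOutOfServiceTaint, "enable-out-of-service-taint", false,
	"Use the out-of-service taint instead of Node deletion during remediation.")
flag.Parse()

reconciler := &controllers.Metal3RemediationReconciler{
	Client:                     mgr.GetClient(), // controller-runtime manager client
	ManagerFactory:             managerFactory,  // assumed constructed elsewhere
	Log:                        ctrl.Log.WithName("controllers").WithName("Metal3Remediation"),
	IsOutOfServiceTaintEnabled: enableOutOfServiceTaint,
}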