✨ Use Out-of-service taint in Node remediation in place of deletion #1808

Merged
96 changes: 96 additions & 0 deletions baremetal/metal3remediation_manager.go
@@ -27,6 +27,7 @@ import (
infrav1 "github.com/metal3-io/cluster-api-provider-metal3/api/v1beta1"
"github.com/pkg/errors"
corev1 "k8s.io/api/core/v1"
storagev1 "k8s.io/api/storage/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
v1 "k8s.io/client-go/kubernetes/typed/core/v1"
@@ -76,6 +77,16 @@ type RemediationManagerInterface interface {
SetNodeBackupAnnotations(annotations string, labels string) bool
GetNodeBackupAnnotations() (annotations, labels string)
RemoveNodeBackupAnnotations()
AddOutOfServiceTaint(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) error
RemoveOutOfServiceTaint(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) error
HasOutOfServiceTaint(node *corev1.Node) bool
IsNodeDrained(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) bool
}

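// outOfServiceTaint marks a Node for Kubernetes' non-graceful node shutdown
// handling: when a shut-down (NotReady) Node carries this NoExecute taint,
// kube-controller-manager force-deletes the pods running on it and detaches
// their volumes, so stateful workloads can fail over to healthy Nodes.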
var outOfServiceTaint = &corev1.Taint{
Key: "node.kubernetes.io/out-of-service",
Value: "nodeshutdown",
Effect: corev1.TaintEffectNoExecute,
}

// RemediationManager is responsible for performing remediation reconciliation.
@@ -476,3 +487,88 @@ func (r *RemediationManager) RemoveNodeBackupAnnotations() {
func (r *RemediationManager) getPowerOffAnnotationKey() string {
return fmt.Sprintf(powerOffAnnotation, r.Metal3Remediation.UID)
}

func (r *RemediationManager) HasOutOfServiceTaint(node *corev1.Node) bool {
for _, taint := range node.Spec.Taints {
if taint.MatchTaint(outOfServiceTaint) {
return true
}
}
return false
}

func (r *RemediationManager) AddOutOfServiceTaint(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) error {
// Work on a copy so that setting TimeAdded does not mutate the shared
// package-level outOfServiceTaint definition.
taint := *outOfServiceTaint
now := metav1.Now()
taint.TimeAdded = &now
node.Spec.Taints = append(node.Spec.Taints, taint)
if err := r.UpdateNode(ctx, clusterClient, node); err != nil {
msg := fmt.Sprintf("failed to add out-of-service taint on node %s", node.Name)
r.Log.Error(err, msg)
return errors.Wrap(err, msg)
}
r.Log.Info("Out-of-service taint added", "node", node.Name)
return nil
}

func (r *RemediationManager) RemoveOutOfServiceTaint(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) error {
newTaints := []corev1.Taint{}

var isPopOutOfServiceTaint bool
for _, taint := range node.Spec.Taints {
if taint.MatchTaint(outOfServiceTaint) {
isPopOutOfServiceTaint = true
continue
}
newTaints = append(newTaints, taint)
}

if isPopOutOfServiceTaint {
r.Log.Info("Removing out-of-service taint from node taints", "node", node.Name)
node.Spec.Taints = newTaints
} else {
r.Log.Info("Out-of-service taint not found. Nothing to do", "node name", node.Name)
return nil
}

if err := r.UpdateNode(ctx, clusterClient, node); err != nil {
msg := fmt.Sprintf("failed to remove out-of-service taint on node %s", node.Name)
r.Log.Error(err, msg)
return errors.Wrap(err, msg)
}

r.Log.Info("Out-of-service taint removed", "node", node.Name)
return nil
}

func (r *RemediationManager) IsNodeDrained(ctx context.Context, clusterClient v1.CoreV1Interface, node *corev1.Node) bool {
pods, err := clusterClient.Pods("").List(ctx, metav1.ListOptions{})
if err != nil {
r.Log.Error(err, "failed to get pod list in the cluster")
return false
}

for _, pod := range pods.Items {
if pod.Spec.NodeName == node.Name {
if pod.ObjectMeta.DeletionTimestamp != nil {
r.Log.Info("Waiting for terminating pod", "node", node.Name, "pod name", pod.Name, "phase", pod.Status.Phase)
return false
}
}
}

volumeAttachments := &storagev1.VolumeAttachmentList{}
if err := r.Client.List(ctx, volumeAttachments); err != nil {
r.Log.Error(err, "failed to get volumeAttachments list")
return false
}
for _, va := range volumeAttachments.Items {
if va.Spec.NodeName == node.Name {
r.Log.Info("Waiting for deleting volumeAttachement", "node", node.Name, "name", va.Name)
return false
}
}

r.Log.Info("Node is drained", "node", node.Name)
return true
}
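
For orientation, here is a minimal illustrative sketch (not part of this PR) of the call sequence the new interface methods support, condensed from the controller changes further below. mgr, ctx, clusterClient, and node are assumed to be in scope; error handling and requeue timing are omitted.

// Illustrative only: simplified remediation flow using the new helpers.
if !mgr.HasOutOfServiceTaint(node) {
	// Taint the unhealthy node; Kubernetes then force-deletes its pods and
	// detaches their volumes, instead of the controller deleting the Node object.
	return mgr.AddOutOfServiceTaint(ctx, clusterClient, node)
}
if !mgr.IsNodeDrained(ctx, clusterClient, node) {
	return nil // not drained yet: requeue and check again later
}
// Drained: remediation can power-cycle the host. RemoveOutOfServiceTaint is
// called later, once the Node reports healthy again (see reconcileNormal below).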
56 changes: 56 additions & 0 deletions baremetal/mocks/zz_generated.metal3remediation_manager.go

Some generated files are not rendered by default.

13 changes: 13 additions & 0 deletions config/rbac/role.yaml
@@ -118,6 +118,12 @@ rules:
- patch
- update
- watch
- apiGroups:
- ""
resources:
- pods
verbs:
- list
- apiGroups:
- infrastructure.cluster.x-k8s.io
resources:
@@ -336,3 +342,10 @@ rules:
- get
- patch
- update
- apiGroups:
- storage.k8s.io
resources:
- volumeattachments
verbs:
- list
- watch
83 changes: 55 additions & 28 deletions controllers/metal3remediation_controller.go
@@ -39,10 +39,13 @@ import (
// Metal3RemediationReconciler reconciles a Metal3Remediation object.
type Metal3RemediationReconciler struct {
client.Client
ManagerFactory baremetal.ManagerFactoryInterface
Log logr.Logger
ManagerFactory baremetal.ManagerFactoryInterface
Log logr.Logger
IsOutOfServiceTaintEnabled bool
}

// +kubebuilder:rbac:groups=core,resources=pods,verbs=list
// +kubebuilder:rbac:groups=storage.k8s.io,resources=volumeattachments,verbs=list;watch
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=metal3remediations,verbs=get;list;watch;create;update;patch;delete
// +kubebuilder:rbac:groups=infrastructure.cluster.x-k8s.io,resources=metal3remediations/status,verbs=get;update;patch
// +kubebuilder:rbac:groups=cluster.x-k8s.io,resources=machines;machines/status,verbs=get;list;watch;create;update;patch;delete
@@ -206,15 +209,25 @@ func (r *Metal3RemediationReconciler) reconcileNormal(ctx context.Context,
// Restore node if available and not done yet
if remediationMgr.HasFinalizer() {
if node != nil {
// Node was recreated, restore annotations and labels
r.Log.Info("Restoring the node")
if err := r.restoreNode(ctx, remediationMgr, clusterClient, node); err != nil {
return ctrl.Result{}, err
if r.IsOutOfServiceTaintEnabled {
if remediationMgr.HasOutOfServiceTaint(node) {
if err := remediationMgr.RemoveOutOfServiceTaint(ctx, clusterClient, node); err != nil {
return ctrl.Result{}, errors.Wrapf(err, "error removing out-of-service taint from node %s", node.Name)
}
}
} else {
// Node was recreated, restore annotations and labels
r.Log.Info("Restoring the node")
if err := r.restoreNode(ctx, remediationMgr, clusterClient, node); err != nil {
return ctrl.Result{}, err
}
}

// clean up
r.Log.Info("Remediation done, cleaning up remediation CR")
remediationMgr.RemoveNodeBackupAnnotations()
if !r.IsOutOfServiceTaintEnabled {
remediationMgr.RemoveNodeBackupAnnotations()
}
remediationMgr.UnsetFinalizer()
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
} else if isNodeForbidden {
@@ -319,29 +332,43 @@ func (r *Metal3RemediationReconciler) remediateRebootStrategy(ctx context.Context,
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}

// if we have a node, store annotations and labels, and delete it
if node != nil {
/*
Delete the node only after the host is powered off. Otherwise, if we would delete the node
when the host is powered on, the scheduler would assign the workload to other nodes, with the
possibility that two instances of the same application are running in parallel. This might result
in corruption or other issues for applications with singleton requirement. After the host is powered
off we know for sure that it is safe to re-assign that workload to other nodes.
*/
modified := r.backupNode(remediationMgr, node)
if modified {
r.Log.Info("Backing up node")
// save annotations before deleting node
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
}
r.Log.Info("Deleting node")
err := remediationMgr.DeleteNode(ctx, clusterClient, node)
if err != nil {
r.Log.Error(err, "error deleting node")
return ctrl.Result{}, errors.Wrap(err, "error deleting node")
if r.IsOutOfServiceTaintEnabled {
if !remediationMgr.HasOutOfServiceTaint(node) {
if err := remediationMgr.AddOutOfServiceTaint(ctx, clusterClient, node); err != nil {
return ctrl.Result{}, err
}
// If we immediately check if the node is drained, we might find no pods with
// Deletion timestamp set yet and assume the node is drained.
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
}

if !remediationMgr.IsNodeDrained(ctx, clusterClient, node) {
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
} else {
/*
Delete the node only after the host is powered off. Otherwise, if we would delete the node
when the host is powered on, the scheduler would assign the workload to other nodes, with the
possibility that two instances of the same application are running in parallel. This might result
in corruption or other issues for applications with singleton requirement. After the host is powered
off we know for sure that it is safe to re-assign that workload to other nodes.
*/
modified := r.backupNode(remediationMgr, node)
if modified {
r.Log.Info("Backing up node")
// save annotations before deleting node
return ctrl.Result{RequeueAfter: 1 * time.Second}, nil
}
r.Log.Info("Deleting node")
err := remediationMgr.DeleteNode(ctx, clusterClient, node)
if err != nil {
r.Log.Error(err, "error deleting node")
return ctrl.Result{}, errors.Wrap(err, "error deleting node")
}
// wait until node is gone
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}
// wait until node is gone
return ctrl.Result{RequeueAfter: 5 * time.Second}, nil
}

// we are done for this phase, switch to waiting for power on and the node restore
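
The diff above does not show how IsOutOfServiceTaintEnabled is populated. A hypothetical wiring, with the flag name and surrounding setup assumed purely for illustration, would be a fragment of main.go along these lines:

// Hypothetical wiring, not part of this PR; flag name and setup are assumptions.
var enableOutOfServiceTaint bool
flag.BoolVar(&enableOutOfServiceTaint, "enable-out-of-service-taint", false,
	"Use the out-of-service taint instead of Node deletion during remediation.")
flag.Parse()

reconciler := &controllers.Metal3RemediationReconciler{
	Client:                     mgr.GetClient(), // controller-runtime manager client
	ManagerFactory:             managerFactory,  // assumed constructed elsewhere
	Log:                        ctrl.Log.WithName("controllers").WithName("Metal3Remediation"),
	IsOutOfServiceTaintEnabled: enableOutOfServiceTaint,
}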