From 97888f7b3ea8c668f9a1ea8df537a4f5c5d6f306 Mon Sep 17 00:00:00 2001
From: Carlo Lobrano
Date: Thu, 27 Jun 2024 14:10:42 +0200
Subject: [PATCH] Update E2E tests for Metal3Remediation with out-of-service
 taint

Signed-off-by: Carlo Lobrano
---
 test/e2e/common.go                         |  3 +
 test/e2e/node_deletion_remediation.go      | 71 ++++++++++++++++++----
 test/e2e/remediation_based_feature_test.go | 16 +++--
 test/go.mod                                |  2 +-
 test/go.sum                                |  4 +-
 5 files changed, 75 insertions(+), 21 deletions(-)

diff --git a/test/e2e/common.go b/test/e2e/common.go
index ce7f583e26..865e8c290e 100644
--- a/test/e2e/common.go
+++ b/test/e2e/common.go
@@ -52,6 +52,9 @@ const (
 	osTypeCentos = "centos"
 	osTypeUbuntu = "ubuntu"
 	ironicSuffix = "-ironic"
+	// Out-of-service Taint test actions.
+	oostAdded    = "added"
+	oostRemoved  = "removed"
 )
 
 var releaseMarkerPrefix = "go://github.com/metal3-io/cluster-api-provider-metal3@v%s"
diff --git a/test/e2e/node_deletion_remediation.go b/test/e2e/node_deletion_remediation.go
index 3f0ba19f8f..091654f148 100644
--- a/test/e2e/node_deletion_remediation.go
+++ b/test/e2e/node_deletion_remediation.go
@@ -8,6 +8,7 @@ import (
 	infrav1 "github.com/metal3-io/cluster-api-provider-metal3/api/v1beta1"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
+	"golang.org/x/mod/semver"
 	corev1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -17,7 +18,9 @@ import (
 	"sigs.k8s.io/controller-runtime/pkg/client"
 )
 
-type NodeDeletionRemediation struct {
+const minK8sVersionOutOfServiceTaint = "v1.28"
+
+type NodeRemediation struct {
 	E2EConfig             *clusterctl.E2EConfig
 	BootstrapClusterProxy framework.ClusterProxy
 	TargetCluster         framework.ClusterProxy
@@ -27,22 +30,30 @@ type NodeDeletionRemediation struct {
 }
 
 /*
- * Node Deletion Remediation Test
+ * Node Remediation Test
  *
- * This test evaluates node deletion in reboot remediation feature added to CAPM3 Remediation Controller.
+ * This test evaluates node remediation (via node deletion or the out-of-service taint) in the reboot remediation feature added to the CAPM3 Remediation Controller.
  * issue #392: Reboot remediation is incomplete
  * PR #668: Fix reboot remediation by adding node deletion
- * This test evaluates the reboot remediation strategy with an enhancement of node deletion in the CAPM3 (Cluster API Provider for Metal3) Remediation Controller.
+ * This test evaluates the reboot remediation strategy with an enhancement in the CAPM3 (Cluster API Provider for Metal3) Remediation Controller,
+ * consisting of:
+ * - node deletion (kubernetes server version < 1.28)
+ * - out-of-service taint on node (kubernetes server version >= 1.28)
  *
  * Tested Feature:
- * - Delete Node in Reboot Remediation
+ * - Manage Node in Reboot Remediation (deletion or out-of-service taint)
  *
  * Workflow:
  * 1. Retrieve the Metal3Machines associated with the worker nodes in the cluster.
  * 2. Identify the target worker machine node its associated BMH object corresponding to the Metal3Machine.
  * 3. Create a Metal3Remediation resource, specifying the remediation strategy as "Reboot" with a retry limit and timeout.
  * 4. Wait for the VM (Virtual Machine) associated with target BMH to power off.
- * 5. Wait for the target worker node to be deleted from the cluster.
+ * 5. Identify the kubernetes server version:
+ *    5a. if version < 1.28:
+ *        - Wait for the target worker node to be deleted from the cluster.
+ *    5b. if version >= 1.28:
+ *        - Wait for the out-of-service taint to be set on target worker node.
+ *        - Wait for the out-of-service taint to be removed from target worker node.
  * 6. Wait for the VMs to power on.
  * 7. Verify that the target worker node becomes ready.
  * 8. Verify that the Metal3Remediation resource is successfully delete
@@ -52,8 +63,8 @@ type NodeDeletionRemediation struct {
  * resiliency of the cluster by allowing workloads to be seamlessly migrated from unhealthy nodes to healthy node
  */
 
-func nodeDeletionRemediation(ctx context.Context, inputGetter func() NodeDeletionRemediation) {
-	Logf("Starting node deletion remediation tests")
+func nodeRemediation(ctx context.Context, inputGetter func() NodeRemediation) {
+	Logf("Starting node remediation tests")
 	input := inputGetter()
 	bootstrapClient := input.BootstrapClusterProxy.GetClient()
 	targetClient := input.TargetCluster.GetClient()
@@ -106,9 +117,18 @@ func nodeDeletionRemediatio
 	By("Waiting for VM power off")
 	waitForVmsState([]string{vmName}, shutoff, input.SpecName, input.E2EConfig.GetIntervals(input.SpecName, "wait-vm-state")...)
 
-	By("Waiting for node deletion")
-	interval := input.E2EConfig.GetIntervals(input.SpecName, "wait-vm-state")
-	waitForNodeDeletion(ctx, targetClient, workerNodeName, interval...)
+	k8sVersion := input.E2EConfig.GetVariable("KUBERNETES_VERSION")
+	if isOutOfServiceTaintSupported(k8sVersion) {
+		Byf("Waiting for Out of service taint on node to be added (kubernetes version %s)", k8sVersion)
+		interval := input.E2EConfig.GetIntervals(input.SpecName, "wait-vm-state")
+		waitForOutOfServiceTaint(ctx, targetClient, workerNodeName, oostAdded, interval...)
+		Byf("Waiting for Out of service taint on node to be removed (kubernetes version %s)", k8sVersion)
+		waitForOutOfServiceTaint(ctx, targetClient, workerNodeName, oostRemoved, interval...)
+	} else {
+		By("Waiting for node deletion")
+		interval := input.E2EConfig.GetIntervals(input.SpecName, "wait-vm-state")
+		waitForNodeDeletion(ctx, targetClient, workerNodeName, interval...)
+	}
 
 	By("Waiting for VM power on")
 	waitForVmsState([]string{vmName}, running, input.SpecName, input.E2EConfig.GetIntervals(input.SpecName, "wait-vm-state")...)
@@ -125,7 +145,7 @@ func nodeDeletionRemediatio
 		return apierrors.IsNotFound(err)
 	}, 2*time.Minute, 10*time.Second).Should(BeTrue(), "Metal3Remediation should have been deleted")
 
-	By("NODE DELETION TESTS PASSED!")
+	By("NODE REMEDIATION TESTS PASSED!")
 }
 
 func waitForNodeDeletion(ctx context.Context, cl client.Client, name string, intervals ...interface{}) {
@@ -137,3 +157,30 @@ func waitForNodeDeletion(ctx context.Context, cl client.Client, name string, int
 		return apierrors.IsNotFound(err)
 	}, intervals...).Should(BeTrue())
 }
+
+func waitForOutOfServiceTaint(ctx context.Context, cl client.Client, name, action string, intervals ...interface{}) {
+	Byf("Waiting for Out of service taint on Node '%s' to be %s", name, action)
+	var oostExpectedToExist bool
+	if action == oostAdded {
+		oostExpectedToExist = true
+	}
+	Eventually(
+		func() bool {
+			node := &corev1.Node{}
+			err := cl.Get(ctx, client.ObjectKey{Name: name}, node)
+			Expect(err).ToNot(HaveOccurred())
+			for _, t := range node.Spec.Taints {
+				if t.Key == "node.kubernetes.io/out-of-service" &&
+					t.Value == "nodeshutdown" &&
+					t.Effect == corev1.TaintEffectNoExecute {
+					return oostExpectedToExist
+				}
+			}
+			return !oostExpectedToExist
+		}, intervals...).Should(BeTrue())
+}
+
+func isOutOfServiceTaintSupported(k8sVersion string) bool {
+	Byf("comparing current version %s with minimum supported version %s", k8sVersion, minK8sVersionOutOfServiceTaint)
+	return semver.Compare(semver.MajorMinor(k8sVersion), minK8sVersionOutOfServiceTaint) >= 0
+}
diff --git a/test/e2e/remediation_based_feature_test.go b/test/e2e/remediation_based_feature_test.go
index 774bbf95a4..c53c511361 100644
--- a/test/e2e/remediation_based_feature_test.go
+++ b/test/e2e/remediation_based_feature_test.go
@@ -17,15 +17,19 @@ import (
  * These tests involve simulating failure scenarios, triggering the remediation process, and then verifying that the remediation actions successfully restore the nodes to the desired state.
  *
  * Test Types:
- * 1. Metal3Remediation Test: This test specifically evaluates the Metal3 Remediation Controller's node deletion feature in the reboot remediation strategy.
+ * 1. Metal3Remediation Test: This test specifically evaluates the Metal3 Remediation Controller's node management feature in the reboot remediation strategy.
  * 2. Remediation Test: This test focuses on verifying various annotations and actions related to remediation in the CAPM3 (Cluster API Provider for Metal3).
  *
- * NodeDeletionRemediation Test:
+ * NodeRemediation Test:
  * - Retrieve the list of Metal3 machines associated with the worker nodes.
  * - Identify the target worker Metal3Machine and its corresponding BareMetalHost (BMH) object.
  * - Create a Metal3Remediation resource with a remediation strategy of type "Reboot" and a specified timeout.
  * - Wait for the associated virtual machine (VM) to power off.
- * - Wait for the node (VM) to be deleted.
+ * - If kubernetes server version < 1.28:
+ *   - Wait for the node (VM) to be deleted.
+ * - If kubernetes server version >= 1.28:
+ *   - Wait for the out-of-service taint to be set on the node.
+ *   - Wait for the out-of-service taint to be removed from the node.
  * - Wait for the VM to power on.
  * - Wait for the node to be in a ready state.
  * - Delete the Metal3Remediation resource.
@@ -71,9 +75,9 @@ var _ = Describe("Testing nodes remediation [remediation] [features]", Label("re
 		targetCluster, _ = createTargetCluster(e2eConfig.GetVariable("KUBERNETES_VERSION"))
 
 		// Run Metal3Remediation test first, doesn't work after remediation...
-		By("Running node deletion remediation tests")
-		nodeDeletionRemediation(ctx, func() NodeDeletionRemediation {
-			return NodeDeletionRemediation{
+		By("Running node remediation tests")
+		nodeRemediation(ctx, func() NodeRemediation {
+			return NodeRemediation{
 				E2EConfig:             e2eConfig,
 				BootstrapClusterProxy: bootstrapClusterProxy,
 				TargetCluster:         targetCluster,
diff --git a/test/go.mod b/test/go.mod
index 36430cbe8a..692b9e9d6f 100644
--- a/test/go.mod
+++ b/test/go.mod
@@ -12,6 +12,7 @@ require (
 	github.com/onsi/gomega v1.33.1
 	github.com/pkg/errors v0.9.1
 	golang.org/x/crypto v0.24.0
+	golang.org/x/mod v0.18.0
 	gopkg.in/yaml.v3 v3.0.1
 	k8s.io/api v0.29.5
 	k8s.io/apiextensions-apiserver v0.29.5
@@ -116,7 +117,6 @@ require (
 	go.opentelemetry.io/otel/trace v1.22.0 // indirect
 	go.uber.org/multierr v1.11.0 // indirect
 	golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
-	golang.org/x/mod v0.17.0 // indirect
 	golang.org/x/net v0.26.0 // indirect
 	golang.org/x/oauth2 v0.18.0 // indirect
 	golang.org/x/sync v0.7.0 // indirect
diff --git a/test/go.sum b/test/go.sum
index 78b20fee1b..b003b42bac 100644
--- a/test/go.sum
+++ b/test/go.sum
@@ -305,8 +305,8 @@ golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqR
 golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
 golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
 golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
-golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA=
-golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
+golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0=
+golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
 golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
 golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
 golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
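
Note: the taint polled by waitForOutOfServiceTaint is the standard Kubernetes
out-of-service taint (key "node.kubernetes.io/out-of-service", value
"nodeshutdown", effect NoExecute). Once a NotReady node carries it,
kube-controller-manager force-deletes the pods stuck on the shut-down node and
detaches their volumes, which is what makes deleting the Node object
unnecessary on clusters >= 1.28. For reference, a minimal standalone sketch of
the same check using client-go rather than the controller-runtime client used
in the suite; the helper name hasOutOfServiceTaint and the node name "worker-0"
are illustrative only, not part of this patch:

package main

import (
	"context"
	"fmt"

	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/clientcmd"
)

// hasOutOfServiceTaint reports whether the named node currently carries the
// out-of-service taint that remediation sets while the host is powered off.
func hasOutOfServiceTaint(ctx context.Context, cs kubernetes.Interface, nodeName string) (bool, error) {
	node, err := cs.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
	if err != nil {
		return false, err
	}
	for _, t := range node.Spec.Taints {
		if t.Key == "node.kubernetes.io/out-of-service" &&
			t.Value == "nodeshutdown" &&
			t.Effect == corev1.TaintEffectNoExecute {
			return true, nil
		}
	}
	return false, nil
}

func main() {
	// Load the local kubeconfig; in the e2e suite the client comes from the
	// target cluster proxy instead.
	cfg, err := clientcmd.BuildConfigFromFlags("", clientcmd.RecommendedHomeFile)
	if err != nil {
		panic(err)
	}
	cs, err := kubernetes.NewForConfig(cfg)
	if err != nil {
		panic(err)
	}
	tainted, err := hasOutOfServiceTaint(context.Background(), cs, "worker-0")
	if err != nil {
		panic(err)
	}
	fmt.Println("out-of-service taint present:", tainted)
}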
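
Note: the version gate relies on a golang.org/x/mod/semver subtlety: that
package only accepts versions carrying the leading "v" (any other string is
invalid and compares below every valid version), hence the constant "v1.28"
and the semver.MajorMinor truncation of the full KUBERNETES_VERSION (e.g.
"v1.30.0") before comparing. A small self-contained sketch of the gating
logic, with illustrative version values:

package main

import (
	"fmt"

	"golang.org/x/mod/semver"
)

// Minimum Kubernetes version in which out-of-service taint support is GA;
// x/mod/semver requires the leading "v".
const minK8sVersionOutOfServiceTaint = "v1.28"

// isOutOfServiceTaintSupported mirrors the e2e helper: it truncates the full
// version to major.minor before comparing it against the minimum.
func isOutOfServiceTaintSupported(k8sVersion string) bool {
	return semver.Compare(semver.MajorMinor(k8sVersion), minK8sVersionOutOfServiceTaint) >= 0
}

func main() {
	for _, v := range []string{"v1.27.3", "v1.28.0", "v1.30.2"} {
		// Prints false for v1.27.3 and true for the other two.
		fmt.Printf("%s supported: %v\n", v, isOutOfServiceTaintSupported(v))
	}
}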