Update E2E tests for Metal3Remediation with out-of-service taint
Signed-off-by: Carlo Lobrano <[email protected]>
clobrano committed Jun 27, 2024
1 parent 348d695 commit 60009dc
Showing 5 changed files with 75 additions and 21 deletions.
3 changes: 3 additions & 0 deletions test/e2e/common.go
@@ -52,6 +52,9 @@ const (
osTypeCentos = "centos"
osTypeUbuntu = "ubuntu"
ironicSuffix = "-ironic"
// Out-of-service taint test actions.
oostAdded = "added"
oostRemoved = "removed"
)

var releaseMarkerPrefix = "go://github.com/metal3-io/cluster-api-provider-metal3@v%s"
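
Aside (not part of this change set): a minimal, self-contained Go sketch of the out-of-service taint that the new oostAdded/oostRemoved constants track. The key, value, and effect below are the ones checked by waitForOutOfServiceTaint later in this diff; the snippet only prints the taint, it does not apply it to a node.

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

func main() {
	// The taint the remediation controller is expected to add and later remove;
	// key, value, and effect mirror the check in waitForOutOfServiceTaint.
	oost := corev1.Taint{
		Key:    "node.kubernetes.io/out-of-service",
		Value:  "nodeshutdown",
		Effect: corev1.TaintEffectNoExecute,
	}
	fmt.Printf("%s=%s:%s\n", oost.Key, oost.Value, oost.Effect)
}
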
71 changes: 59 additions & 12 deletions test/e2e/node_deletion_remediation.go
@@ -8,6 +8,7 @@ import (
infrav1 "github.com/metal3-io/cluster-api-provider-metal3/api/v1beta1"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
"golang.org/x/mod/semver"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -17,7 +18,9 @@ import (
"sigs.k8s.io/controller-runtime/pkg/client"
)

type NodeDeletionRemediation struct {
const minK8sVersionOutOfServiceTaint = "1.28"

type NodeRemediation struct {
E2EConfig *clusterctl.E2EConfig
BootstrapClusterProxy framework.ClusterProxy
TargetCluster framework.ClusterProxy
@@ -27,22 +30,30 @@ type NodeDeletionRemediation struct {
}

/*
* Node Deletion Remediation Test
* Node Remediation Test
*
* This test evaluates node deletion in reboot remediation feature added to CAPM3 Remediation Controller.
* This test evaluates node remediation (via deletion or use of the out-of-service taint) in the reboot remediation feature added to the CAPM3 Remediation Controller.
* issue #392: Reboot remediation is incomplete
* PR #668: Fix reboot remediation by adding node deletion
* This test evaluates the reboot remediation strategy with an enhancement of node deletion in the CAPM3 (Cluster API Provider for Metal3) Remediation Controller.
* This test evaluates the reboot remediation strategy with an enhancement in the CAPM3 (Cluster API Provider for Metal3) Remediation Controller
* consisting of:
* - node deletion (Kubernetes server version < 1.28)
* - out-of-service taint on the node (Kubernetes server version >= 1.28)
*
* Tested Feature:
* - Delete Node in Reboot Remediation
* - Manage Node in Reboot Remediation (deletion or out-of-service taint)
*
* Workflow:
* 1. Retrieve the Metal3Machines associated with the worker nodes in the cluster.
* 2. Identify the target worker machine, its node, and the associated BMH object corresponding to the Metal3Machine.
* 3. Create a Metal3Remediation resource, specifying the remediation strategy as "Reboot" with a retry limit and timeout.
* 4. Wait for the VM (Virtual Machine) associated with the target BMH to power off.
* 5. Wait for the target worker node to be deleted from the cluster.
* 5. Identify the Kubernetes server version:
* 5a. if version < 1.28:
* - Wait for the target worker node to be deleted from the cluster.
* 5b. if version >= 1.28:
* - Wait for the out-of-service taint to be set on the target worker node.
* - Wait for the out-of-service taint to be removed from the target worker node.
* 6. Wait for the VMs to power on.
* 7. Verify that the target worker node becomes ready.
* 8. Verify that the Metal3Remediation resource is successfully deleted.
@@ -52,8 +63,8 @@ type NodeDeletionRemediation struct {
* resiliency of the cluster by allowing workloads to be seamlessly migrated from unhealthy nodes to healthy node
*/

func nodeDeletionRemediation(ctx context.Context, inputGetter func() NodeDeletionRemediation) {
Logf("Starting node deletion remediation tests")
func nodeRemediation(ctx context.Context, inputGetter func() NodeRemediation) {
Logf("Starting node remediation tests")
input := inputGetter()
bootstrapClient := input.BootstrapClusterProxy.GetClient()
targetClient := input.TargetCluster.GetClient()
@@ -106,9 +117,18 @@ func nodeDeletionRemediation(ctx context.Context, inputGetter func() NodeDeletio
By("Waiting for VM power off")
waitForVmsState([]string{vmName}, shutoff, input.SpecName, input.E2EConfig.GetIntervals(input.SpecName, "wait-vm-state")...)

By("Waiting for node deletion")
interval := input.E2EConfig.GetIntervals(input.SpecName, "wait-vm-state")
waitForNodeDeletion(ctx, targetClient, workerNodeName, interval...)
k8sVersion := input.E2EConfig.GetVariable("KUBERNETES_VERSION")
if isOutOfServiceTaintSupported(k8sVersion) {
Byf("Waiting for the out-of-service taint to be added to the node (Kubernetes version %s)", k8sVersion)
interval := input.E2EConfig.GetIntervals(input.SpecName, "wait-vm-state")
waitForOutOfServiceTaint(ctx, targetClient, workerNodeName, oostAdded, interval...)
Byf("Waiting for the out-of-service taint to be removed from the node (Kubernetes version %s)", k8sVersion)
waitForOutOfServiceTaint(ctx, targetClient, workerNodeName, oostRemoved, interval...)
} else {
By("Waiting for node deletion")
interval := input.E2EConfig.GetIntervals(input.SpecName, "wait-vm-state")
waitForNodeDeletion(ctx, targetClient, workerNodeName, interval...)
}

By("Waiting for VM power on")
waitForVmsState([]string{vmName}, running, input.SpecName, input.E2EConfig.GetIntervals(input.SpecName, "wait-vm-state")...)
@@ -125,7 +145,7 @@ func nodeDeletionRemediation(ctx context.Context, inputGetter func() NodeDeletio
return apierrors.IsNotFound(err)
}, 2*time.Minute, 10*time.Second).Should(BeTrue(), "Metal3Remediation should have been deleted")

By("NODE DELETION TESTS PASSED!")
By("NODE REMEDIATION TESTS PASSED!")
}

func waitForNodeDeletion(ctx context.Context, cl client.Client, name string, intervals ...interface{}) {
@@ -137,3 +157,30 @@ func waitForNodeDeletion(ctx context.Context, cl client.Client, name string, int
return apierrors.IsNotFound(err)
}, intervals...).Should(BeTrue())
}

func waitForOutOfServiceTaint(ctx context.Context, cl client.Client, name, action string, intervals ...interface{}) {
Byf("Waiting for the out-of-service taint on Node '%s' to be %s", name, action)
var oostExpectedToExist bool
if action == oostAdded {
oostExpectedToExist = true
}
Eventually(
func() bool {
node := &corev1.Node{}
err := cl.Get(ctx, client.ObjectKey{Name: name}, node)
Expect(err).ToNot(HaveOccurred())
for _, t := range node.Spec.Taints {
if t.Key == "node.kubernetes.io/out-of-service" &&
t.Value == "nodeshutdown" &&
t.Effect == corev1.TaintEffectNoExecute {
return oostExpectedToExist
}
}
return !oostExpectedToExist
}, intervals...).Should(BeTrue())
}

func isOutOfServiceTaintSupported(k8sVersion string) bool {
Byf("comparing current version %s with supported version %s", k8sVersion, minK8sVersionOutOfServiceTaint)
return semver.Compare(k8sVersion, minK8sVersionOutOfServiceTaint) >= 0
}
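
Aside (not part of this change set): a small runnable sketch of how the golang.org/x/mod/semver comparison behind isOutOfServiceTaintSupported behaves. Note that x/mod/semver only treats strings with a leading "v" as valid semantic versions; the KUBERNETES_VERSION value used below is an assumption for illustration.

package main

import (
	"fmt"

	"golang.org/x/mod/semver"
)

const minK8sVersionOutOfServiceTaint = "1.28"

func main() {
	// KUBERNETES_VERSION is typically "v"-prefixed, e.g. "v1.29.0" (assumed value here).
	k8sVersion := "v1.29.0"

	// Strings without a leading "v" are invalid to x/mod/semver and sort below
	// any valid version, so any valid k8sVersion compares >= "1.28".
	fmt.Println(semver.IsValid(minK8sVersionOutOfServiceTaint))                  // false
	fmt.Println(semver.Compare(k8sVersion, minK8sVersionOutOfServiceTaint) >= 0) // true
	fmt.Println(semver.Compare("v1.27.3", "v1.28"))                              // -1 when both sides carry the "v" prefix
}
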
16 changes: 10 additions & 6 deletions test/e2e/remediation_based_feature_test.go
@@ -17,15 +17,19 @@ import (
* These tests involve simulating failure scenarios, triggering the remediation process, and then verifying that the remediation actions successfully restore the nodes to the desired state.
*
* Test Types:
* 1. Metal3Remediation Test: This test specifically evaluates the Metal3 Remediation Controller's node deletion feature in the reboot remediation strategy.
* 1. Metal3Remediation Test: This test specifically evaluates the Metal3 Remediation Controller's node management feature in the reboot remediation strategy.
* 2. Remediation Test: This test focuses on verifying various annotations and actions related to remediation in the CAPM3 (Cluster API Provider for Metal3).
*
* NodeDeletionRemediation Test:
* NodeRemediation Test:
* - Retrieve the list of Metal3 machines associated with the worker nodes.
* - Identify the target worker Metal3Machine and its corresponding BareMetalHost (BMH) object.
* - Create a Metal3Remediation resource with a remediation strategy of type "Reboot" and a specified timeout.
* - Wait for the associated virtual machine (VM) to power off.
* - Wait for the node (VM) to be deleted.
* - If Kubernetes server version < 1.28:
* - Wait for the node (VM) to be deleted.
* - If Kubernetes server version >= 1.28:
* - Wait for the out-of-service taint to be set on the node.
* - Wait for the out-of-service taint to be removed from the node.
* - Wait for the VM to power on.
* - Wait for the node to be in a ready state.
* - Delete the Metal3Remediation resource.
@@ -71,9 +75,9 @@ var _ = Describe("Testing nodes remediation [remediation] [features]", Label("re
targetCluster, _ = createTargetCluster(e2eConfig.GetVariable("KUBERNETES_VERSION"))

// Run the Metal3Remediation test first; it doesn't work after remediation...
By("Running node deletion remediation tests")
nodeDeletionRemediation(ctx, func() NodeDeletionRemediation {
return NodeDeletionRemediation{
By("Running node remediation tests")
nodeRemediation(ctx, func() NodeRemediation {
return NodeRemediation{
E2EConfig: e2eConfig,
BootstrapClusterProxy: bootstrapClusterProxy,
TargetCluster: targetCluster,
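
Aside (not part of this change set): a hedged sketch of the Metal3Remediation resource the test creates with the "Reboot" strategy, retry limit, and timeout described above. The field and constant names (Strategy, Type, RetryLimit, Timeout, RebootRemediationStrategy) are assumptions about the CAPM3 v1beta1 API and should be verified against the infrav1 package; the name, namespace, and values are illustrative only.

package main

import (
	"fmt"
	"time"

	infrav1 "github.com/metal3-io/cluster-api-provider-metal3/api/v1beta1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func main() {
	// Illustrative Metal3Remediation with the "Reboot" strategy (assumed field names).
	remediation := &infrav1.Metal3Remediation{
		ObjectMeta: metav1.ObjectMeta{Name: "worker-remediation", Namespace: "metal3"},
		Spec: infrav1.Metal3RemediationSpec{
			Strategy: &infrav1.RemediationStrategy{
				Type:       infrav1.RebootRemediationStrategy,
				RetryLimit: 1,
				Timeout:    &metav1.Duration{Duration: 5 * time.Minute},
			},
		},
	}
	fmt.Println(remediation.Name)
}
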
2 changes: 1 addition & 1 deletion test/go.mod
@@ -12,6 +12,7 @@ require (
github.com/onsi/gomega v1.33.1
github.com/pkg/errors v0.9.1
golang.org/x/crypto v0.24.0
golang.org/x/mod v0.18.0
gopkg.in/yaml.v3 v3.0.1
k8s.io/api v0.29.5
k8s.io/apiextensions-apiserver v0.29.5
@@ -116,7 +117,6 @@ require (
go.opentelemetry.io/otel/trace v1.22.0 // indirect
go.uber.org/multierr v1.11.0 // indirect
golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
golang.org/x/mod v0.17.0 // indirect
golang.org/x/net v0.26.0 // indirect
golang.org/x/oauth2 v0.18.0 // indirect
golang.org/x/sync v0.7.0 // indirect
4 changes: 2 additions & 2 deletions test/go.sum
@@ -305,8 +305,8 @@ golang.org/x/exp v0.0.0-20230905200255-921286631fa9/go.mod h1:S2oDrQGGwySpoQPVqR
golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4=
golang.org/x/mod v0.17.0 h1:zY54UmvipHiNd+pm+m0x9KhZ9hl1/7QNMyxXbc6ICqA=
golang.org/x/mod v0.17.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/mod v0.18.0 h1:5+9lSbEzPSdWkH32vYPBwEpX8KwDbM52Ud9xBUvNlb0=
golang.org/x/mod v0.18.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
