Skip to content

Commit

Permalink
Merge branch 'main' into fix/reliability-test
Browse files Browse the repository at this point in the history
  • Loading branch information
ty-dc authored Oct 18, 2024
2 parents 278739a + dcf524e commit f5f40e8
Show file tree
Hide file tree
Showing 9 changed files with 229 additions and 78 deletions.
2 changes: 1 addition & 1 deletion images/spiderpool-plugins/version.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
# this file is generated by updateLatestCniPluginVersion.sh , please do not edit

# https://github.com/containernetworking/plugins
export CNI_VERSION=${CNI_VERSION:-"v1.5.1"}
export CNI_VERSION=${CNI_VERSION:-"v1.6.0"}
# https://github.com/k8snetworkplumbingwg/ovs-cni
export OVS_VERSION=${OVS_VERSION:-"v0.34.2"}
# https://github.com/k8snetworkplumbingwg/rdma-cni
Expand Down
2 changes: 2 additions & 0 deletions test/e2e/common/constant.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ const (
BatchCreateTimeout = time.Minute * 5
KdoctorCheckTime = time.Minute * 10
SpiderSyncMultusTime = time.Minute * 2
InformerSyncStatusTime = time.Second * 30
KDoctorRunTimeout = time.Minute * 10
)

var ForcedWaitingTime = time.Second
Expand Down
34 changes: 34 additions & 0 deletions test/e2e/common/node.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"fmt"
"os/exec"

"github.com/hashicorp/go-multierror"
. "github.com/onsi/ginkgo/v2"
e2e "github.com/spidernet-io/e2eframework/framework"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -55,3 +56,36 @@ func RestartNodeUntilClusterReady(ctx context.Context, frame *e2e.Framework, nod
GinkgoWriter.Println("Check that the status of all Pods in the cluster is running")
return nil
}

// GetNodeNetworkInfo dumps the network state of every node in nodeList by
// executing a fixed set of diagnostic commands (interfaces, links, IPv4/IPv6
// neighbors, policy rules, and routing tables including tables 100/101/500)
// inside each node container via frame.DockerExecCommand.
//
// Command failures do not abort the dump: every failure is collected into a
// multierror so the remaining nodes/commands are still inspected. Returns nil
// when all commands on all nodes succeed.
func GetNodeNetworkInfo(ctx context.Context, frame *e2e.Framework, nodeList []string) error {
	// The command set is identical for every node, so build it once
	// instead of re-allocating the slice on each loop iteration.
	commands := []string{
		"ip a",
		"ip link show",
		"ip n",
		"ip -6 n",
		"ip rule",
		"ip -6 rule",
		"ip route",
		"ip route show table 100",
		"ip route show table 101",
		"ip route show table 500",
		"ip -6 route",
		"ip -6 route show table 100",
		"ip -6 route show table 101",
		"ip -6 route show table 500",
	}

	var jobResult *multierror.Error
	for _, node := range nodeList {
		GinkgoWriter.Printf("=============== Check the network information of the node %v ============== \n", node)
		for _, command := range commands {
			GinkgoWriter.Printf("--------------- execute %v in node: %v ------------ \n", command, node)
			out, err := frame.DockerExecCommand(ctx, node, command)
			if err != nil {
				// Record the failure (with command output for context) and keep going.
				jobResult = multierror.Append(jobResult, fmt.Errorf("node %v: command '%v' failed with error: %w, output: %s", node, command, err, out))
			}
		}
	}

	// ErrorOrNil is safe on a nil *multierror.Error and yields nil when
	// nothing was appended.
	return jobResult.ErrorOrNil()
}
42 changes: 41 additions & 1 deletion test/e2e/common/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,10 @@ import (
"time"

"github.com/spidernet-io/spiderpool/pkg/constant"
"github.com/spidernet-io/spiderpool/pkg/utils/retry"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/hashicorp/go-multierror"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
e2e "github.com/spidernet-io/e2eframework/framework"
Expand Down Expand Up @@ -49,7 +51,14 @@ func GenerateExamplePodYaml(podName, namespace string) *corev1.Pod {
func CreatePodUntilReady(frame *e2e.Framework, podYaml *corev1.Pod, podName, namespace string, waitPodStartTimeout time.Duration) (pod *corev1.Pod, podIPv4, podIPv6 string) {
// create pod
GinkgoWriter.Printf("create pod %v/%v \n", namespace, podName)
err := frame.CreatePod(podYaml)
err := retry.RetryOnConflictWithContext(context.Background(), retry.DefaultBackoff, func(ctx context.Context) error {
err := frame.CreatePod(podYaml)
if err != nil {
GinkgoLogr.Error(fmt.Errorf("failed to create pod %v/%v, error: %v", namespace, podName, err), "Failed")
return err
}
return nil
})
Expect(err).NotTo(HaveOccurred(), "failed to create pod")

// wait for pod ip
Expand Down Expand Up @@ -143,3 +152,34 @@ func ValidatePodIPConflict(podList *corev1.PodList) error {
}
return nil
}

// GetPodNetworkInfo dumps the network state of every pod in podList by
// executing a fixed set of diagnostic commands (interfaces, links, IPv4/IPv6
// neighbors, policy rules, and routing tables including tables 100/101)
// inside each pod via frame.ExecCommandInPod.
//
// Command failures do not abort the dump: every failure is collected into a
// multierror so the remaining pods/commands are still inspected. Returns nil
// when all commands in all pods succeed.
func GetPodNetworkInfo(ctx context.Context, frame *e2e.Framework, podList *corev1.PodList) error {
	// The command set is identical for every pod, so build it once
	// instead of re-allocating the slice on each loop iteration.
	commands := []string{
		"ip a",
		"ip link show",
		"ip n",
		"ip -6 n",
		"ip rule",
		"ip -6 rule",
		"ip route",
		"ip route show table 100",
		"ip route show table 101",
		"ip -6 route",
		"ip -6 route show table 100",
		"ip -6 route show table 101",
	}

	var jobResult *multierror.Error
	for _, pod := range podList.Items {
		GinkgoWriter.Printf("=============== Check the network information of the pod %v/%v ============== \n", pod.Namespace, pod.Name)
		for _, command := range commands {
			GinkgoWriter.Printf("--------------- execute %v in pod: %v/%v on node: %v ------------ \n", command, pod.Namespace, pod.Name, pod.Spec.NodeName)
			out, err := frame.ExecCommandInPod(pod.Name, pod.Namespace, command, ctx)
			if err != nil {
				// Record the failure (with command output for context) and keep going.
				jobResult = multierror.Append(jobResult, fmt.Errorf("pod %v/%v: command '%v' failed with error: %w, output: %s", pod.Namespace, pod.Name, command, err, out))
			}
		}
	}

	// ErrorOrNil is safe on a nil *multierror.Error and yields nil when
	// nothing was appended.
	return jobResult.ErrorOrNil()
}
64 changes: 45 additions & 19 deletions test/e2e/common/spiderpool.go
Original file line number Diff line number Diff line change
Expand Up @@ -516,7 +516,7 @@ func DeleteIPPoolUntilFinish(f *frame.Framework, poolName string, ctx context.Co
default:
_, err := GetIppoolByName(f, poolName)
if err != nil {
GinkgoWriter.Printf("IPPool '%s' has been removederror: %v", poolName, err)
GinkgoWriter.Printf("IPPool '%s' has been removed, error: %v", poolName, err)
return nil
}
time.Sleep(ForcedWaitingTime)
Expand Down Expand Up @@ -608,7 +608,7 @@ func WaitWorkloadDeleteUntilFinish(ctx context.Context, f *frame.Framework, name
_, err := GetWorkloadByName(f, namespace, name)
if err != nil {
if api_errors.IsNotFound(err) {
GinkgoWriter.Printf("workload '%s/%s' has been removederror: %v", namespace, name, err)
GinkgoWriter.Printf("workload '%s/%s' has been removed, error: %v", namespace, name, err)
return nil
}
return err
Expand Down Expand Up @@ -987,24 +987,50 @@ func CheckIppoolSanity(f *frame.Framework, poolName string) error {
}
}

if *ippool.Status.AllocatedIPCount > *ippool.Status.TotalIPCount {
GinkgoWriter.Printf(
"allocated IP count (%v) exceeds total IP count (%v) \n",
*ippool.Status.AllocatedIPCount, *ippool.Status.TotalIPCount,
)
isSanity = false
}
// The status of IPPool is automatically synchronized by the IPPool informer based on the events it receives.
// In the CI environment, the creation of IPPools happens very quickly, and their health checks are performed promptly.
// When checking the TotalIPCount status, if the spiderpool-controller undergoes a leader election or the informer has not yet completed synchronization,
// the IPPool status TotalIPCount may be nil. This can lead to a panic.
// In such cases, try waiting for the informer to complete status synchronization before checking the robustness of the IPPool.
ctx, cancel := context.WithTimeout(context.Background(), InformerSyncStatusTime)
defer cancel()
for {
select {
case <-ctx.Done():
return fmt.Errorf("waiting for informer to synchronize IPPool %s status timed out", poolName)
default:
if ippool.Status.AllocatedIPCount == nil || ippool.Status.TotalIPCount == nil {
GinkgoLogr.Error(fmt.Errorf("IPPool %s has nil status fields, retrying", poolName), "Failed")
ippool, err = GetIppoolByName(f, poolName)
if err != nil {
if api_errors.IsNotFound(err) {
return fmt.Errorf("ippool %s does not exist", poolName)
}
return fmt.Errorf("failed to get ippool %s, error %v", poolName, err)
}
time.Sleep(ForcedWaitingTime)
continue
}

// Ensure that the IP pool's reported usage matches the actual usage
if actualIPUsageCount != int(*ippool.Status.AllocatedIPCount) {
GinkgoWriter.Printf("IPPool %s usage count mismatch: expected %d, got %d \n", poolName, actualIPUsageCount, *ippool.Status.AllocatedIPCount)
isSanity = false
}
if *ippool.Status.AllocatedIPCount > *ippool.Status.TotalIPCount {
GinkgoWriter.Printf(
"allocated IP count (%v) exceeds total IP count (%v) \n",
*ippool.Status.AllocatedIPCount, *ippool.Status.TotalIPCount,
)
isSanity = false
}
// Ensure that the IP pool's reported usage matches the actual usage
if actualIPUsageCount != int(*ippool.Status.AllocatedIPCount) {
GinkgoWriter.Printf("IPPool %s usage count mismatch: expected %d, got %d \n", poolName, actualIPUsageCount, *ippool.Status.AllocatedIPCount)
isSanity = false
}

if !isSanity {
return fmt.Errorf("IPPool %s sanity check failed", poolName)
}
if !isSanity {
return fmt.Errorf("IPPool %s sanity check failed", poolName)
}

GinkgoWriter.Printf("Successfully checked IPPool %s sanity, IPPool record information is correct \n", poolName)
return nil
GinkgoWriter.Printf("Successfully checked IPPool %s sanity, IPPool record information is correct \n", poolName)
return nil
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ import (
apitypes "k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"

"github.com/spidernet-io/spiderpool/pkg/constant"
pkgconstant "github.com/spidernet-io/spiderpool/pkg/constant"
"github.com/spidernet-io/spiderpool/pkg/ip"
spiderpoolv2beta1 "github.com/spidernet-io/spiderpool/pkg/k8s/apis/spiderpool.spidernet.io/v2beta1"
Expand Down Expand Up @@ -68,7 +67,10 @@ var _ = Describe("MacvlanUnderlayOne", Serial, Label("underlay", "one-interface"
// Schedule
crontab := "1 1"
schedule.Schedule = &crontab
schedule.RoundNumber = 1
// The sporadic test failures in kdoctor were attempted to be reproduced, but couldn't be.
// By leveraging kdoctor's loop testing, if a failure occurs in the first test,
// check whether it also fails on the second attempt.
schedule.RoundNumber = 3
schedule.RoundTimeoutMinute = 1
task.Spec.Schedule = schedule

Expand All @@ -85,7 +87,7 @@ var _ = Describe("MacvlanUnderlayOne", Serial, Label("underlay", "one-interface"
task.Spec.Target = targetAgent

// request
request.DurationInSecond = 5
request.DurationInSecond = 10
request.QPS = 1
request.PerRequestTimeoutInMS = 7000
task.Spec.Request = request
Expand All @@ -94,15 +96,12 @@ var _ = Describe("MacvlanUnderlayOne", Serial, Label("underlay", "one-interface"
condition.SuccessRate = &successRate
condition.MeanAccessDelayInMs = &delayMs
task.Spec.SuccessCondition = condition
taskCopy := task

GinkgoWriter.Printf("kdoctor task: %+v \n", task)
err := frame.CreateResource(task)
Expect(err).NotTo(HaveOccurred(), " kdoctor nethttp crd create failed")

err = frame.GetResource(apitypes.NamespacedName{Name: name}, taskCopy)
Expect(err).NotTo(HaveOccurred(), " kdoctor nethttp crd get failed")
Expect(err).NotTo(HaveOccurred(), "failed to create kdoctor task")
GinkgoWriter.Printf("succeeded to create kdoctor task: %+v \n", task)

// update the kdoctor service to use corev1.ServiceExternalTrafficPolicyLocal
if frame.Info.IpV4Enabled {
kdoctorIPv4ServiceName := fmt.Sprintf("%s-%s-ipv4", "kdoctor-netreach", task.Name)
var kdoctorIPv4Service *corev1.Service
Expand Down Expand Up @@ -138,52 +137,50 @@ var _ = Describe("MacvlanUnderlayOne", Serial, Label("underlay", "one-interface"
Expect(frame.UpdateResource(kdoctorIPv6Service)).NotTo(HaveOccurred())
}

ctx, cancel := context.WithTimeout(context.Background(), time.Second*60*5)
// waiting for kdoctor task to finish
ctx, cancel := context.WithTimeout(context.Background(), common.KDoctorRunTimeout)
defer cancel()
var err1 = errors.New("error has occurred")
for run {
for {
select {
case <-ctx.Done():
run = false
Expect(errors.New("wait nethttp test timeout")).NotTo(HaveOccurred(), " running kdoctor task timeout")
Expect(errors.New("timeout waiting for kdoctor task to finish")).NotTo(HaveOccurred())
default:
taskCopy := task
err = frame.GetResource(apitypes.NamespacedName{Name: name}, taskCopy)
Expect(err).NotTo(HaveOccurred(), " kdoctor nethttp crd get failed")

if taskCopy.Status.Finish == true {
command := fmt.Sprintf("get netreaches.kdoctor.io %s -oyaml", taskCopy.Name)
netreachesLog, _ := frame.ExecKubectl(command, ctx)
GinkgoWriter.Printf("kdoctor's netreaches execution result %+v \n", string(netreachesLog))

for _, v := range taskCopy.Status.History {
if v.Status == "succeed" {
err1 = nil
Expect(err).NotTo(HaveOccurred(), "Failed to get kdoctor task")
if taskCopy.Status.Finish {
roundFailed := false
for _, t := range taskCopy.Status.History {
// No configuration has been changed, The first round of the test is not considered a failure
if t.RoundNumber != 1 && t.Status == "failed" {
roundFailed = true
break
}
}
run = false

ctx1, cancel1 := context.WithTimeout(context.Background(), time.Second*30)
defer cancel1()
for {
select {
case <-ctx1.Done():
Expect(errors.New("wait kdoctorreport timeout")).NotTo(HaveOccurred(), "failed to run kdoctor task and wait kdoctorreport timeout")
default:
command = fmt.Sprintf("get kdoctorreport %s -oyaml", taskCopy.Name)
kdoctorreportLog, err := frame.ExecKubectl(command, ctx)
if err != nil {
time.Sleep(common.ForcedWaitingTime)
continue
}
GinkgoWriter.Printf("kdoctor's kdoctorreport execution result %+v \n", string(kdoctorreportLog))
}
break
if roundFailed {
Fail("kdoctor task is not successful")
}
return
}
for _, t := range taskCopy.Status.History {
// If the check is successful, exit directly.
if t.RoundNumber == 1 && t.Status == "succeed" {
GinkgoWriter.Println("succeed to run kdoctor task")
return
}
// If the check fails, we should collect the failed Pod network information as soon as possible
// If the first attempt failed but the second attempt succeeded,
// we collected network logs and compared the two attempts to see if there were any differences.
if t.Status == "failed" || (t.RoundNumber != 1 && t.Status == "succeed") {
GinkgoLogr.Error(fmt.Errorf("Failed to run kdoctor task, round %d, at time %s", t.RoundNumber, time.Now()), "Failed")
podList, err := frame.GetPodListByLabel(map[string]string{"app.kubernetes.io/name": taskCopy.Name})
Expect(err).NotTo(HaveOccurred(), "Failed to get pod list by label")
Expect(common.GetPodNetworkInfo(ctx, frame, podList)).NotTo(HaveOccurred(), "Failed to get pod network info")
Expect(common.GetNodeNetworkInfo(ctx, frame, frame.Info.KindNodeList)).NotTo(HaveOccurred(), "Failed to get node network info")
}
}
time.Sleep(time.Second * 5)
}
}
Expect(err1).NotTo(HaveOccurred())
})
})

Expand Down Expand Up @@ -232,7 +229,7 @@ var _ = Describe("MacvlanUnderlayOne", Serial, Label("underlay", "one-interface"
Namespace: namespace,
},
Spec: spiderpoolv2beta1.MultusCNIConfigSpec{
CniType: ptr.To(constant.MacvlanCNI),
CniType: ptr.To(pkgconstant.MacvlanCNI),
MacvlanConfig: &spiderpoolv2beta1.SpiderMacvlanCniConfig{
Master: []string{common.NIC1},
VlanID: ptr.To(int32(100)),
Expand Down Expand Up @@ -283,7 +280,7 @@ var _ = Describe("MacvlanUnderlayOne", Serial, Label("underlay", "one-interface"
Expect(err).NotTo(HaveOccurred())
var annotations = make(map[string]string)
annotations[common.MultusNetworks] = fmt.Sprintf("%s/%s", namespace, multusNadName)
annotations[constant.AnnoPodIPPools] = string(podAnnoMarshal)
annotations[pkgconstant.AnnoPodIPPools] = string(podAnnoMarshal)
deployObject := common.GenerateExampleDeploymentYaml(depName, namespace, int32(1))
deployObject.Spec.Template.Annotations = annotations
Expect(frame.CreateDeployment(deployObject)).NotTo(HaveOccurred())
Expand Down
Loading

0 comments on commit f5f40e8

Please sign in to comment.