diff --git a/pkg/daemon/daemon.go b/pkg/daemon/daemon.go index 5ed31ff85..d82bd2604 100644 --- a/pkg/daemon/daemon.go +++ b/pkg/daemon/daemon.go @@ -120,6 +120,7 @@ func New( eventRecorder: er, featureGate: featureGates, disabledPlugins: disabledPlugins, + mu: &sync.Mutex{}, } } @@ -159,7 +160,6 @@ func (dn *Daemon) Run(stopCh <-chan struct{}, exitCh <-chan error) error { var timeout int64 = 5 var metadataKey = "metadata.name" - dn.mu = &sync.Mutex{} informerFactory := sninformer.NewFilteredSharedInformerFactory(dn.sriovClient, time.Second*15, vars.Namespace, @@ -683,7 +683,6 @@ func (dn *Daemon) restartDevicePluginPod() error { defer dn.mu.Unlock() log.Log.V(2).Info("restartDevicePluginPod(): try to restart device plugin pod") - var podToDelete string pods, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{ LabelSelector: "app=sriov-device-plugin", FieldSelector: "spec.nodeName=" + vars.NodeName, @@ -702,35 +701,37 @@ func (dn *Daemon) restartDevicePluginPod() error { log.Log.Info("restartDevicePluginPod(): device plugin pod exited") return nil } - podToDelete = pods.Items[0].Name - log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete) - err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{}) - if errors.IsNotFound(err) { - log.Log.Info("restartDevicePluginPod(): pod to delete not found") - return nil - } - if err != nil { - log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying") - return err - } - - if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) { - _, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{}) + for _, pod := range pods.Items { + podToDelete := pod.Name + log.Log.V(2).Info("restartDevicePluginPod(): Found device plugin pod, deleting it", "pod-name", podToDelete) + err = dn.kubeClient.CoreV1().Pods(vars.Namespace).Delete(context.Background(), podToDelete, metav1.DeleteOptions{}) if errors.IsNotFound(err) { - log.Log.Info("restartDevicePluginPod(): device plugin pod exited") - return true, nil + log.Log.Info("restartDevicePluginPod(): pod to delete not found") + return nil } - if err != nil { - log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying") - } else { - log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete) + log.Log.Error(err, "restartDevicePluginPod(): Failed to delete device plugin pod, retrying") + return err + } + + if err := wait.PollImmediateUntil(3*time.Second, func() (bool, error) { + _, err := dn.kubeClient.CoreV1().Pods(vars.Namespace).Get(context.Background(), podToDelete, metav1.GetOptions{}) + if errors.IsNotFound(err) { + log.Log.Info("restartDevicePluginPod(): device plugin pod exited") + return true, nil + } + + if err != nil { + log.Log.Error(err, "restartDevicePluginPod(): Failed to check for device plugin exit, retrying") + } else { + log.Log.Info("restartDevicePluginPod(): waiting for device plugin pod to exit", "pod-name", podToDelete) + } + return false, nil + }, dn.stopCh); err != nil { + log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion") + return err } - return false, nil - }, dn.stopCh); err != nil { - log.Log.Error(err, "restartDevicePluginPod(): failed to wait for checking pod deletion") - return err } return nil diff --git a/pkg/daemon/daemon_test.go b/pkg/daemon/daemon_test.go index f1111810a..f101cf31a 100644 --- a/pkg/daemon/daemon_test.go +++ b/pkg/daemon/daemon_test.go @@ -32,6 +32,19 @@ import ( "github.com/k8snetworkplumbingwg/sriov-network-operator/test/util/fakefilesystem" ) +var SriovDevicePluginPod corev1.Pod = corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sriov-device-plugin-xxxx", + Namespace: vars.Namespace, + Labels: map[string]string{ + "app": "sriov-device-plugin", + }, + }, + Spec: corev1.PodSpec{ + NodeName: "test-node", + }, +} + func TestConfigDaemon(t *testing.T) { RegisterFailHandler(Fail) RunSpecs(t, "Config Daemon Suite") @@ -107,19 +120,6 @@ var _ = Describe("Config Daemon", func() { }, } - SriovDevicePluginPod := corev1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: "sriov-device-plugin-xxxx", - Namespace: vars.Namespace, - Labels: map[string]string{ - "app": "sriov-device-plugin", - }, - }, - Spec: corev1.PodSpec{ - NodeName: "test-node", - }, - } - err = sriovnetworkv1.AddToScheme(scheme.Scheme) Expect(err).ToNot(HaveOccurred()) kClient := kclient.NewClientBuilder().WithScheme(scheme.Scheme).WithRuntimeObjects(&corev1.Node{ @@ -130,7 +130,7 @@ var _ = Describe("Config Daemon", func() { Namespace: vars.Namespace, }}).Build() - kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs, &SriovDevicePluginPod) + kubeClient := fakek8s.NewSimpleClientset(&FakeSupportedNicIDs) snclient := snclientset.NewSimpleClientset() err = sriovnetworkv1.InitNicIDMapFromConfigMap(kubeClient, vars.Namespace) Expect(err).ToNot(HaveOccurred()) @@ -175,6 +175,10 @@ var _ = Describe("Config Daemon", func() { err := sut.Run(stopCh, exitCh) Expect(err).ToNot(HaveOccurred()) }() + + _, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), &SriovDevicePluginPod, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + }) AfterEach(func() { @@ -286,6 +290,34 @@ var _ = Describe("Config Daemon", func() { Expect(sut.desiredNodeState.GetGeneration()).To(BeNumerically("==", 777)) }) + + It("restart all the sriov-device-plugin pods present on the node", func() { + otherPod1 := SriovDevicePluginPod.DeepCopy() + otherPod1.Name = "sriov-device-plugin-xxxa" + _, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod1, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + + otherPod2 := SriovDevicePluginPod.DeepCopy() + otherPod2.Name = "sriov-device-plugin-xxxz" + _, err = sut.kubeClient.CoreV1().Pods(vars.Namespace).Create(context.Background(), otherPod2, metav1.CreateOptions{}) + Expect(err).ToNot(HaveOccurred()) + + err = sut.restartDevicePluginPod() + Expect(err).ToNot(HaveOccurred()) + + Eventually(func() (int, error) { + podList, err := sut.kubeClient.CoreV1().Pods(vars.Namespace).List(context.Background(), metav1.ListOptions{ + LabelSelector: "app=sriov-device-plugin", + FieldSelector: "spec.nodeName=test-node", + }) + + if err != nil { + return 0, err + } + + return len(podList.Items), nil + }, "1s").Should(BeZero()) + }) }) })