diff --git a/README.md b/README.md index e01f3a6c96..079b15ebcd 100644 --- a/README.md +++ b/README.md @@ -66,15 +66,21 @@ Replication Controller (RC), Deployment, or Job running on the same node. If the those duplicate pods are evicted for better spreading of pods in a cluster. This issue could happen if some nodes went down due to whatever reasons, and pods on them were moved to other nodes leading to more than one pod associated with a RS or RC, for example, running on the same node. Once the failed nodes -are ready again, this strategy could be enabled to evict those duplicate pods. Currently, there are no -parameters associated with this strategy. To disable this strategy, the policy should look like: +are ready again, this strategy could be enabled to evict those duplicate pods. + +It provides one optional parameter, `ExcludeOwnerKinds`, which is a list of OwnerRef `Kind`s. If a pod +has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considered for eviction. ``` apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: "RemoveDuplicates": - enabled: false + enabled: true + params: + removeDuplicates: + excludeOwnerKinds: + - "ReplicaSet" ``` ### LowNodeUtilization diff --git a/pkg/api/types.go b/pkg/api/types.go index 34538dcfed..a4e7f17419 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -50,6 +50,7 @@ type StrategyParameters struct { NodeAffinityType []string PodsHavingTooManyRestarts *PodsHavingTooManyRestarts MaxPodLifeTimeSeconds *uint + RemoveDuplicates *RemoveDuplicates } type Percentage float64 @@ -65,3 +66,7 @@ type PodsHavingTooManyRestarts struct { PodRestartThreshold int32 IncludingInitContainers bool } + +type RemoveDuplicates struct { + ExcludeOwnerKinds []string +} diff --git a/pkg/api/v1alpha1/types.go b/pkg/api/v1alpha1/types.go index c03c2fe426..d2be207f1c 100644 --- a/pkg/api/v1alpha1/types.go +++ b/pkg/api/v1alpha1/types.go @@ -50,6 +50,7 @@ type StrategyParameters struct { NodeAffinityType []string `json:"nodeAffinityType,omitempty"` PodsHavingTooManyRestarts *PodsHavingTooManyRestarts `json:"podsHavingTooManyRestarts,omitempty"` MaxPodLifeTimeSeconds *uint `json:"maxPodLifeTimeSeconds,omitempty"` + RemoveDuplicates *RemoveDuplicates `json:"removeDuplicates,omitempty"` } type Percentage float64 @@ -65,3 +66,7 @@ type PodsHavingTooManyRestarts struct { PodRestartThreshold int32 `json:"podRestartThreshold,omitempty"` IncludingInitContainers bool `json:"includingInitContainers,omitempty"` } + +type RemoveDuplicates struct { + ExcludeOwnerKinds []string `json:"excludeOwnerKinds,omitempty"` +} diff --git a/pkg/api/v1alpha1/zz_generated.conversion.go b/pkg/api/v1alpha1/zz_generated.conversion.go index 22ea30d9c0..4317b18c1c 100644 --- a/pkg/api/v1alpha1/zz_generated.conversion.go +++ b/pkg/api/v1alpha1/zz_generated.conversion.go @@ -75,6 +75,16 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddGeneratedConversionFunc((*RemoveDuplicates)(nil), (*api.RemoveDuplicates)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1alpha1_RemoveDuplicates_To_api_RemoveDuplicates(a.(*RemoveDuplicates), b.(*api.RemoveDuplicates), scope) + }); err != nil { + return err + } + if err := s.AddGeneratedConversionFunc((*api.RemoveDuplicates)(nil), (*RemoveDuplicates)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_api_RemoveDuplicates_To_v1alpha1_RemoveDuplicates(a.(*api.RemoveDuplicates), b.(*RemoveDuplicates), scope) + }); err != nil { + return err + } if err := s.AddGeneratedConversionFunc((*StrategyParameters)(nil), (*api.StrategyParameters)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha1_StrategyParameters_To_api_StrategyParameters(a.(*StrategyParameters), b.(*api.StrategyParameters), scope) }); err != nil { @@ -182,11 +192,32 @@ func Convert_api_PodsHavingTooManyRestarts_To_v1alpha1_PodsHavingTooManyRestarts return autoConvert_api_PodsHavingTooManyRestarts_To_v1alpha1_PodsHavingTooManyRestarts(in, out, s) } +func autoConvert_v1alpha1_RemoveDuplicates_To_api_RemoveDuplicates(in *RemoveDuplicates, out *api.RemoveDuplicates, s conversion.Scope) error { + out.ExcludeOwnerKinds = *(*[]string)(unsafe.Pointer(&in.ExcludeOwnerKinds)) + return nil +} + +// Convert_v1alpha1_RemoveDuplicates_To_api_RemoveDuplicates is an autogenerated conversion function. +func Convert_v1alpha1_RemoveDuplicates_To_api_RemoveDuplicates(in *RemoveDuplicates, out *api.RemoveDuplicates, s conversion.Scope) error { + return autoConvert_v1alpha1_RemoveDuplicates_To_api_RemoveDuplicates(in, out, s) +} + +func autoConvert_api_RemoveDuplicates_To_v1alpha1_RemoveDuplicates(in *api.RemoveDuplicates, out *RemoveDuplicates, s conversion.Scope) error { + out.ExcludeOwnerKinds = *(*[]string)(unsafe.Pointer(&in.ExcludeOwnerKinds)) + return nil +} + +// Convert_api_RemoveDuplicates_To_v1alpha1_RemoveDuplicates is an autogenerated conversion function. +func Convert_api_RemoveDuplicates_To_v1alpha1_RemoveDuplicates(in *api.RemoveDuplicates, out *RemoveDuplicates, s conversion.Scope) error { + return autoConvert_api_RemoveDuplicates_To_v1alpha1_RemoveDuplicates(in, out, s) +} + func autoConvert_v1alpha1_StrategyParameters_To_api_StrategyParameters(in *StrategyParameters, out *api.StrategyParameters, s conversion.Scope) error { out.NodeResourceUtilizationThresholds = (*api.NodeResourceUtilizationThresholds)(unsafe.Pointer(in.NodeResourceUtilizationThresholds)) out.NodeAffinityType = *(*[]string)(unsafe.Pointer(&in.NodeAffinityType)) out.PodsHavingTooManyRestarts = (*api.PodsHavingTooManyRestarts)(unsafe.Pointer(in.PodsHavingTooManyRestarts)) out.MaxPodLifeTimeSeconds = (*uint)(unsafe.Pointer(in.MaxPodLifeTimeSeconds)) + out.RemoveDuplicates = (*api.RemoveDuplicates)(unsafe.Pointer(in.RemoveDuplicates)) return nil } @@ -200,6 +231,7 @@ func autoConvert_api_StrategyParameters_To_v1alpha1_StrategyParameters(in *api.S out.NodeAffinityType = *(*[]string)(unsafe.Pointer(&in.NodeAffinityType)) out.PodsHavingTooManyRestarts = (*PodsHavingTooManyRestarts)(unsafe.Pointer(in.PodsHavingTooManyRestarts)) out.MaxPodLifeTimeSeconds = (*uint)(unsafe.Pointer(in.MaxPodLifeTimeSeconds)) + out.RemoveDuplicates = (*RemoveDuplicates)(unsafe.Pointer(in.RemoveDuplicates)) return nil } diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go index 035395163e..d5782539ee 100644 --- a/pkg/api/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/api/v1alpha1/zz_generated.deepcopy.go @@ -119,6 +119,27 @@ func (in *PodsHavingTooManyRestarts) DeepCopy() *PodsHavingTooManyRestarts { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RemoveDuplicates) DeepCopyInto(out *RemoveDuplicates) { + *out = *in + if in.ExcludeOwnerKinds != nil { + in, out := &in.ExcludeOwnerKinds, &out.ExcludeOwnerKinds + *out = make([]string, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemoveDuplicates. +func (in *RemoveDuplicates) DeepCopy() *RemoveDuplicates { + if in == nil { + return nil + } + out := new(RemoveDuplicates) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in ResourceThresholds) DeepCopyInto(out *ResourceThresholds) { { @@ -186,6 +207,11 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) { *out = new(uint) **out = **in } + if in.RemoveDuplicates != nil { + in, out := &in.RemoveDuplicates, &out.RemoveDuplicates + *out = new(RemoveDuplicates) + (*in).DeepCopyInto(*out) + } return } diff --git a/pkg/api/zz_generated.deepcopy.go b/pkg/api/zz_generated.deepcopy.go index 857c32738f..1303360c2c 100644 --- a/pkg/api/zz_generated.deepcopy.go +++ b/pkg/api/zz_generated.deepcopy.go @@ -119,6 +119,27 @@ func (in *PodsHavingTooManyRestarts) DeepCopy() *PodsHavingTooManyRestarts { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RemoveDuplicates) DeepCopyInto(out *RemoveDuplicates) { + *out = *in + if in.ExcludeOwnerKinds != nil { + in, out := &in.ExcludeOwnerKinds, &out.ExcludeOwnerKinds + *out = make([]string, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemoveDuplicates. +func (in *RemoveDuplicates) DeepCopy() *RemoveDuplicates { + if in == nil { + return nil + } + out := new(RemoveDuplicates) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in ResourceThresholds) DeepCopyInto(out *ResourceThresholds) { { @@ -186,6 +207,11 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) { *out = new(uint) **out = **in } + if in.RemoveDuplicates != nil { + in, out := &in.RemoveDuplicates, &out.RemoveDuplicates + *out = new(RemoveDuplicates) + (*in).DeepCopyInto(*out) + } return } diff --git a/pkg/descheduler/strategies/duplicates.go b/pkg/descheduler/strategies/duplicates.go index 258521f3c0..471b076170 100644 --- a/pkg/descheduler/strategies/duplicates.go +++ b/pkg/descheduler/strategies/duplicates.go @@ -18,9 +18,13 @@ package strategies import ( "context" + "reflect" + "sort" "strings" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" clientset "k8s.io/client-go/kubernetes" "k8s.io/klog" @@ -31,7 +35,8 @@ import ( // RemoveDuplicatePods removes the duplicate pods on node. This strategy evicts all duplicate pods on node. // A pod is said to be a duplicate of other if both of them are from same creator, kind and are within the same -// namespace. As of now, this strategy won't evict daemonsets, mirror pods, critical pods and pods with local storages. +// namespace, and have at least one container with the same image. +// As of now, this strategy won't evict daemonsets, mirror pods, critical pods and pods with local storages. func RemoveDuplicatePods( ctx context.Context, client clientset.Interface, @@ -42,41 +47,83 @@ func RemoveDuplicatePods( ) { for _, node := range nodes { klog.V(1).Infof("Processing node: %#v", node.Name) - dpm := listDuplicatePodsOnANode(ctx, client, node, evictLocalStoragePods) - for creator, pods := range dpm { - if len(pods) > 1 { - klog.V(1).Infof("%#v", creator) - // i = 0 does not evict the first pod - for i := 1; i < len(pods); i++ { - if _, err := podEvictor.EvictPod(ctx, pods[i], node); err != nil { - klog.Errorf("Error evicting pod: (%#v)", err) - break - } - } + duplicatePods := listDuplicatePodsOnANode(ctx, client, node, strategy, evictLocalStoragePods) + for _, pod := range duplicatePods { + if _, err := podEvictor.EvictPod(ctx, pod, node); err != nil { + klog.Errorf("Error evicting pod: (%#v)", err) + break } } } } -//type creator string -type duplicatePodsMap map[string][]*v1.Pod - // listDuplicatePodsOnANode lists duplicate pods on a given node. -func listDuplicatePodsOnANode(ctx context.Context, client clientset.Interface, node *v1.Node, evictLocalStoragePods bool) duplicatePodsMap { +// It checks for pods which have the same owner and have at least 1 container with the same image spec +func listDuplicatePodsOnANode(ctx context.Context, client clientset.Interface, node *v1.Node, strategy api.DeschedulerStrategy, evictLocalStoragePods bool) []*v1.Pod { pods, err := podutil.ListEvictablePodsOnNode(ctx, client, node, evictLocalStoragePods) if err != nil { return nil } - dpm := duplicatePodsMap{} - // Ignoring the error here as in the ListDuplicatePodsOnNode function we call ListEvictablePodsOnNode which checks for error. + duplicatePods := make([]*v1.Pod, 0, len(pods)) + // Each pod has a list of owners and a list of containers, and each container has 1 image spec. + // For each pod, we go through all the OwnerRef/Image mappings and represent them as a "key" string. + // All of those mappings together makes a list of "key" strings that essentially represent that pod's uniqueness. + // This list of keys representing a single pod is then sorted alphabetically. + // If any other pod has a list that matches that pod's list, those pods are undeniably duplicates for the following reasons: + // - The 2 pods have the exact same ownerrefs + // - The 2 pods have the exact same container images + // + // duplicateKeysMap maps the first Namespace/Kind/Name/Image in a pod's list to a 2D-slice of all the other lists where that is the first key + // (Since we sort each pod's list, we only need to key the map on the first entry in each list. Any pod that doesn't have + // the same first entry is clearly not a duplicate. This makes lookup quick and minimizes storage needed). + // If any of the existing lists for that first key matches the current pod's list, the current pod is a duplicate. + // If not, then we add this pod's list to the list of lists for that key. + duplicateKeysMap := map[string][][]string{} for _, pod := range pods { ownerRefList := podutil.OwnerRef(pod) + if hasExcludedOwnerRefKind(ownerRefList, strategy) { + continue + } + podContainerKeys := make([]string, 0, len(ownerRefList)*len(pod.Spec.Containers)) for _, ownerRef := range ownerRefList { - // Namespace/Kind/Name should be unique for the cluster. - s := strings.Join([]string{pod.ObjectMeta.Namespace, ownerRef.Kind, ownerRef.Name}, "/") - dpm[s] = append(dpm[s], pod) + for _, container := range pod.Spec.Containers { + // Namespace/Kind/Name should be unique for the cluster. + // We also consider the image, as 2 pods could have the same owner but serve different purposes + // So any non-unique Namespace/Kind/Name/Image pattern is a duplicate pod. + s := strings.Join([]string{pod.ObjectMeta.Namespace, ownerRef.Kind, ownerRef.Name, container.Image}, "/") + podContainerKeys = append(podContainerKeys, s) + } + } + sort.Strings(podContainerKeys) + + // If there have been any other pods with the same first "key", look through all the lists to see if any match + if existing, ok := duplicateKeysMap[podContainerKeys[0]]; ok { + for _, keys := range existing { + if reflect.DeepEqual(keys, podContainerKeys) { + duplicatePods = append(duplicatePods, pod) + break + } + // Found no matches, add this list of keys to the list of lists that have the same first key + duplicateKeysMap[podContainerKeys[0]] = append(duplicateKeysMap[podContainerKeys[0]], podContainerKeys) + } + } else { + // This is the first pod we've seen that has this first "key" entry + duplicateKeysMap[podContainerKeys[0]] = [][]string{podContainerKeys} + } + } + return duplicatePods +} + +func hasExcludedOwnerRefKind(ownerRefs []metav1.OwnerReference, strategy api.DeschedulerStrategy) bool { + if strategy.Params.RemoveDuplicates == nil { + return false + } + exclude := sets.NewString(strategy.Params.RemoveDuplicates.ExcludeOwnerKinds...) + for _, owner := range ownerRefs { + if exclude.Has(owner.Kind) { + return true } } - return dpm + return false } diff --git a/pkg/descheduler/strategies/duplicates_test.go b/pkg/descheduler/strategies/duplicates_test.go index 2d781d0544..4208dc9581 100644 --- a/pkg/descheduler/strategies/duplicates_test.go +++ b/pkg/descheduler/strategies/duplicates_test.go @@ -52,6 +52,14 @@ func TestFindDuplicatePods(t *testing.T) { p9.Namespace = "test" p10 := test.BuildTestPod("p10", 100, 0, node.Name, nil) p10.Namespace = "test" + p11 := test.BuildTestPod("p11", 100, 0, node.Name, nil) + p11.Namespace = "different-images" + p12 := test.BuildTestPod("p12", 100, 0, node.Name, nil) + p12.Namespace = "different-images" + p13 := test.BuildTestPod("p13", 100, 0, node.Name, nil) + p13.Namespace = "different-images" + p14 := test.BuildTestPod("p14", 100, 0, node.Name, nil) + p14.Namespace = "different-images" // ### Evictable Pods ### @@ -92,41 +100,88 @@ func TestFindDuplicatePods(t *testing.T) { priority := utils.SystemCriticalPriority p7.Spec.Priority = &priority + // Same owners, but different images + p11.Spec.Containers[0].Image = "foo" + p11.ObjectMeta.OwnerReferences = ownerRef1 + p12.Spec.Containers[0].Image = "bar" + p12.ObjectMeta.OwnerReferences = ownerRef1 + + // Multiple containers + p13.ObjectMeta.OwnerReferences = ownerRef1 + p13.Spec.Containers = append(p13.Spec.Containers, v1.Container{ + Name: "foo", + Image: "foo", + }) + testCases := []struct { description string maxPodsToEvict int pods []v1.Pod expectedEvictedPodCount int + strategy api.DeschedulerStrategy }{ { description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. 2 should be evicted.", maxPodsToEvict: 5, pods: []v1.Pod{*p1, *p2, *p3}, expectedEvictedPodCount: 2, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Three pods in the `dev` Namespace, bound to same ReplicaSet, but ReplicaSet kind is excluded. 0 should be evicted.", + maxPodsToEvict: 5, + pods: []v1.Pod{*p1, *p2, *p3}, + expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{Params: api.StrategyParameters{RemoveDuplicates: &api.RemoveDuplicates{ExcludeOwnerKinds: []string{"ReplicaSet"}}}}, }, { description: "Three Pods in the `test` Namespace, bound to same ReplicaSet. 2 should be evicted.", maxPodsToEvict: 5, pods: []v1.Pod{*p8, *p9, *p10}, expectedEvictedPodCount: 2, + strategy: api.DeschedulerStrategy{}, }, { description: "Three Pods in the `dev` Namespace, three Pods in the `test` Namespace. Bound to ReplicaSet with same name. 4 should be evicted.", maxPodsToEvict: 5, pods: []v1.Pod{*p1, *p2, *p3, *p8, *p9, *p10}, expectedEvictedPodCount: 4, + strategy: api.DeschedulerStrategy{}, }, { description: "Pods are: part of DaemonSet, with local storage, mirror pod annotation, critical pod annotation - none should be evicted.", maxPodsToEvict: 2, pods: []v1.Pod{*p4, *p5, *p6, *p7}, expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{}, }, { description: "Test all Pods: 4 should be evicted.", maxPodsToEvict: 5, pods: []v1.Pod{*p1, *p2, *p3, *p4, *p5, *p6, *p7, *p8, *p9, *p10}, expectedEvictedPodCount: 4, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Pods with the same owner but different images should not be evicted", + maxPodsToEvict: 5, + pods: []v1.Pod{*p11, *p12}, + expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Pods with multiple containers should not match themselves", + maxPodsToEvict: 5, + pods: []v1.Pod{*p13}, + expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Pods with matching ownerrefs and at not all matching image should not trigger an eviction", + maxPodsToEvict: 5, + pods: []v1.Pod{*p11, *p13}, + expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{}, }, } @@ -146,7 +201,7 @@ func TestFindDuplicatePods(t *testing.T) { []*v1.Node{node}, ) - RemoveDuplicatePods(ctx, fakeClient, api.DeschedulerStrategy{}, []*v1.Node{node}, false, podEvictor) + RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, []*v1.Node{node}, false, podEvictor) podsEvicted := podEvictor.TotalEvicted() if podsEvicted != testCase.expectedEvictedPodCount { t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted)