From c6ff87dbd64e1e775622a864fd42532deb416a95 Mon Sep 17 00:00:00 2001 From: Mike Dame Date: Fri, 1 May 2020 14:28:44 -0400 Subject: [PATCH] Consider pod image in duplicates strategy This increases the specificity of the RemoveDuplicates strategy by removing pods which not only have the same owner, but who also must have the same list of container images. This also adds a parameter, `ExcludeOwnerKinds` to the RemoveDuplicates strategy which accepts a list of Kinds. If a pod has any of these Kinds as an owner, that pod is not considered for eviction. --- README.md | 12 ++- pkg/api/types.go | 5 + pkg/api/v1alpha1/types.go | 5 + pkg/api/v1alpha1/zz_generated.conversion.go | 32 +++++++ pkg/api/v1alpha1/zz_generated.deepcopy.go | 26 ++++++ pkg/api/zz_generated.deepcopy.go | 26 ++++++ pkg/descheduler/strategies/duplicates.go | 91 ++++++++++++++----- pkg/descheduler/strategies/duplicates_test.go | 57 +++++++++++- 8 files changed, 228 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index e01f3a6c96..079b15ebcd 100644 --- a/README.md +++ b/README.md @@ -66,15 +66,21 @@ Replication Controller (RC), Deployment, or Job running on the same node. If the those duplicate pods are evicted for better spreading of pods in a cluster. This issue could happen if some nodes went down due to whatever reasons, and pods on them were moved to other nodes leading to more than one pod associated with a RS or RC, for example, running on the same node. Once the failed nodes -are ready again, this strategy could be enabled to evict those duplicate pods. Currently, there are no -parameters associated with this strategy. To disable this strategy, the policy should look like: +are ready again, this strategy could be enabled to evict those duplicate pods. + +It provides one optional parameter, `ExcludeOwnerKinds`, which is a list of OwnerRef `Kind`s. If a pod +has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considered for eviction. ``` apiVersion: "descheduler/v1alpha1" kind: "DeschedulerPolicy" strategies: "RemoveDuplicates": - enabled: false + enabled: true + params: + removeDuplicates: + excludeOwnerKinds: + - "ReplicaSet" ``` ### LowNodeUtilization diff --git a/pkg/api/types.go b/pkg/api/types.go index 34538dcfed..a4e7f17419 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -50,6 +50,7 @@ type StrategyParameters struct { NodeAffinityType []string PodsHavingTooManyRestarts *PodsHavingTooManyRestarts MaxPodLifeTimeSeconds *uint + RemoveDuplicates *RemoveDuplicates } type Percentage float64 @@ -65,3 +66,7 @@ type PodsHavingTooManyRestarts struct { PodRestartThreshold int32 IncludingInitContainers bool } + +type RemoveDuplicates struct { + ExcludeOwnerKinds []string +} diff --git a/pkg/api/v1alpha1/types.go b/pkg/api/v1alpha1/types.go index c03c2fe426..d2be207f1c 100644 --- a/pkg/api/v1alpha1/types.go +++ b/pkg/api/v1alpha1/types.go @@ -50,6 +50,7 @@ type StrategyParameters struct { NodeAffinityType []string `json:"nodeAffinityType,omitempty"` PodsHavingTooManyRestarts *PodsHavingTooManyRestarts `json:"podsHavingTooManyRestarts,omitempty"` MaxPodLifeTimeSeconds *uint `json:"maxPodLifeTimeSeconds,omitempty"` + RemoveDuplicates *RemoveDuplicates `json:"removeDuplicates,omitempty"` } type Percentage float64 @@ -65,3 +66,7 @@ type PodsHavingTooManyRestarts struct { PodRestartThreshold int32 `json:"podRestartThreshold,omitempty"` IncludingInitContainers bool `json:"includingInitContainers,omitempty"` } + +type RemoveDuplicates struct { + ExcludeOwnerKinds []string `json:"excludeOwnerKinds,omitempty"` +} diff --git a/pkg/api/v1alpha1/zz_generated.conversion.go b/pkg/api/v1alpha1/zz_generated.conversion.go index 22ea30d9c0..4317b18c1c 100644 --- a/pkg/api/v1alpha1/zz_generated.conversion.go +++ b/pkg/api/v1alpha1/zz_generated.conversion.go @@ -75,6 +75,16 @@ func RegisterConversions(s *runtime.Scheme) error { }); err != nil { return err } + if err := s.AddGeneratedConversionFunc((*RemoveDuplicates)(nil), (*api.RemoveDuplicates)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_v1alpha1_RemoveDuplicates_To_api_RemoveDuplicates(a.(*RemoveDuplicates), b.(*api.RemoveDuplicates), scope) + }); err != nil { + return err + } + if err := s.AddGeneratedConversionFunc((*api.RemoveDuplicates)(nil), (*RemoveDuplicates)(nil), func(a, b interface{}, scope conversion.Scope) error { + return Convert_api_RemoveDuplicates_To_v1alpha1_RemoveDuplicates(a.(*api.RemoveDuplicates), b.(*RemoveDuplicates), scope) + }); err != nil { + return err + } if err := s.AddGeneratedConversionFunc((*StrategyParameters)(nil), (*api.StrategyParameters)(nil), func(a, b interface{}, scope conversion.Scope) error { return Convert_v1alpha1_StrategyParameters_To_api_StrategyParameters(a.(*StrategyParameters), b.(*api.StrategyParameters), scope) }); err != nil { @@ -182,11 +192,32 @@ func Convert_api_PodsHavingTooManyRestarts_To_v1alpha1_PodsHavingTooManyRestarts return autoConvert_api_PodsHavingTooManyRestarts_To_v1alpha1_PodsHavingTooManyRestarts(in, out, s) } +func autoConvert_v1alpha1_RemoveDuplicates_To_api_RemoveDuplicates(in *RemoveDuplicates, out *api.RemoveDuplicates, s conversion.Scope) error { + out.ExcludeOwnerKinds = *(*[]string)(unsafe.Pointer(&in.ExcludeOwnerKinds)) + return nil +} + +// Convert_v1alpha1_RemoveDuplicates_To_api_RemoveDuplicates is an autogenerated conversion function. +func Convert_v1alpha1_RemoveDuplicates_To_api_RemoveDuplicates(in *RemoveDuplicates, out *api.RemoveDuplicates, s conversion.Scope) error { + return autoConvert_v1alpha1_RemoveDuplicates_To_api_RemoveDuplicates(in, out, s) +} + +func autoConvert_api_RemoveDuplicates_To_v1alpha1_RemoveDuplicates(in *api.RemoveDuplicates, out *RemoveDuplicates, s conversion.Scope) error { + out.ExcludeOwnerKinds = *(*[]string)(unsafe.Pointer(&in.ExcludeOwnerKinds)) + return nil +} + +// Convert_api_RemoveDuplicates_To_v1alpha1_RemoveDuplicates is an autogenerated conversion function. +func Convert_api_RemoveDuplicates_To_v1alpha1_RemoveDuplicates(in *api.RemoveDuplicates, out *RemoveDuplicates, s conversion.Scope) error { + return autoConvert_api_RemoveDuplicates_To_v1alpha1_RemoveDuplicates(in, out, s) +} + func autoConvert_v1alpha1_StrategyParameters_To_api_StrategyParameters(in *StrategyParameters, out *api.StrategyParameters, s conversion.Scope) error { out.NodeResourceUtilizationThresholds = (*api.NodeResourceUtilizationThresholds)(unsafe.Pointer(in.NodeResourceUtilizationThresholds)) out.NodeAffinityType = *(*[]string)(unsafe.Pointer(&in.NodeAffinityType)) out.PodsHavingTooManyRestarts = (*api.PodsHavingTooManyRestarts)(unsafe.Pointer(in.PodsHavingTooManyRestarts)) out.MaxPodLifeTimeSeconds = (*uint)(unsafe.Pointer(in.MaxPodLifeTimeSeconds)) + out.RemoveDuplicates = (*api.RemoveDuplicates)(unsafe.Pointer(in.RemoveDuplicates)) return nil } @@ -200,6 +231,7 @@ func autoConvert_api_StrategyParameters_To_v1alpha1_StrategyParameters(in *api.S out.NodeAffinityType = *(*[]string)(unsafe.Pointer(&in.NodeAffinityType)) out.PodsHavingTooManyRestarts = (*PodsHavingTooManyRestarts)(unsafe.Pointer(in.PodsHavingTooManyRestarts)) out.MaxPodLifeTimeSeconds = (*uint)(unsafe.Pointer(in.MaxPodLifeTimeSeconds)) + out.RemoveDuplicates = (*RemoveDuplicates)(unsafe.Pointer(in.RemoveDuplicates)) return nil } diff --git a/pkg/api/v1alpha1/zz_generated.deepcopy.go b/pkg/api/v1alpha1/zz_generated.deepcopy.go index 035395163e..d5782539ee 100644 --- a/pkg/api/v1alpha1/zz_generated.deepcopy.go +++ b/pkg/api/v1alpha1/zz_generated.deepcopy.go @@ -119,6 +119,27 @@ func (in *PodsHavingTooManyRestarts) DeepCopy() *PodsHavingTooManyRestarts { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RemoveDuplicates) DeepCopyInto(out *RemoveDuplicates) { + *out = *in + if in.ExcludeOwnerKinds != nil { + in, out := &in.ExcludeOwnerKinds, &out.ExcludeOwnerKinds + *out = make([]string, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemoveDuplicates. +func (in *RemoveDuplicates) DeepCopy() *RemoveDuplicates { + if in == nil { + return nil + } + out := new(RemoveDuplicates) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in ResourceThresholds) DeepCopyInto(out *ResourceThresholds) { { @@ -186,6 +207,11 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) { *out = new(uint) **out = **in } + if in.RemoveDuplicates != nil { + in, out := &in.RemoveDuplicates, &out.RemoveDuplicates + *out = new(RemoveDuplicates) + (*in).DeepCopyInto(*out) + } return } diff --git a/pkg/api/zz_generated.deepcopy.go b/pkg/api/zz_generated.deepcopy.go index 857c32738f..1303360c2c 100644 --- a/pkg/api/zz_generated.deepcopy.go +++ b/pkg/api/zz_generated.deepcopy.go @@ -119,6 +119,27 @@ func (in *PodsHavingTooManyRestarts) DeepCopy() *PodsHavingTooManyRestarts { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RemoveDuplicates) DeepCopyInto(out *RemoveDuplicates) { + *out = *in + if in.ExcludeOwnerKinds != nil { + in, out := &in.ExcludeOwnerKinds, &out.ExcludeOwnerKinds + *out = make([]string, len(*in)) + copy(*out, *in) + } + return +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RemoveDuplicates. +func (in *RemoveDuplicates) DeepCopy() *RemoveDuplicates { + if in == nil { + return nil + } + out := new(RemoveDuplicates) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in ResourceThresholds) DeepCopyInto(out *ResourceThresholds) { { @@ -186,6 +207,11 @@ func (in *StrategyParameters) DeepCopyInto(out *StrategyParameters) { *out = new(uint) **out = **in } + if in.RemoveDuplicates != nil { + in, out := &in.RemoveDuplicates, &out.RemoveDuplicates + *out = new(RemoveDuplicates) + (*in).DeepCopyInto(*out) + } return } diff --git a/pkg/descheduler/strategies/duplicates.go b/pkg/descheduler/strategies/duplicates.go index 258521f3c0..471b076170 100644 --- a/pkg/descheduler/strategies/duplicates.go +++ b/pkg/descheduler/strategies/duplicates.go @@ -18,9 +18,13 @@ package strategies import ( "context" + "reflect" + "sort" "strings" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" clientset "k8s.io/client-go/kubernetes" "k8s.io/klog" @@ -31,7 +35,8 @@ import ( // RemoveDuplicatePods removes the duplicate pods on node. This strategy evicts all duplicate pods on node. // A pod is said to be a duplicate of other if both of them are from same creator, kind and are within the same -// namespace. As of now, this strategy won't evict daemonsets, mirror pods, critical pods and pods with local storages. +// namespace, and have at least one container with the same image. +// As of now, this strategy won't evict daemonsets, mirror pods, critical pods and pods with local storages. func RemoveDuplicatePods( ctx context.Context, client clientset.Interface, @@ -42,41 +47,83 @@ func RemoveDuplicatePods( ) { for _, node := range nodes { klog.V(1).Infof("Processing node: %#v", node.Name) - dpm := listDuplicatePodsOnANode(ctx, client, node, evictLocalStoragePods) - for creator, pods := range dpm { - if len(pods) > 1 { - klog.V(1).Infof("%#v", creator) - // i = 0 does not evict the first pod - for i := 1; i < len(pods); i++ { - if _, err := podEvictor.EvictPod(ctx, pods[i], node); err != nil { - klog.Errorf("Error evicting pod: (%#v)", err) - break - } - } + duplicatePods := listDuplicatePodsOnANode(ctx, client, node, strategy, evictLocalStoragePods) + for _, pod := range duplicatePods { + if _, err := podEvictor.EvictPod(ctx, pod, node); err != nil { + klog.Errorf("Error evicting pod: (%#v)", err) + break } } } } -//type creator string -type duplicatePodsMap map[string][]*v1.Pod - // listDuplicatePodsOnANode lists duplicate pods on a given node. -func listDuplicatePodsOnANode(ctx context.Context, client clientset.Interface, node *v1.Node, evictLocalStoragePods bool) duplicatePodsMap { +// It checks for pods which have the same owner and have at least 1 container with the same image spec +func listDuplicatePodsOnANode(ctx context.Context, client clientset.Interface, node *v1.Node, strategy api.DeschedulerStrategy, evictLocalStoragePods bool) []*v1.Pod { pods, err := podutil.ListEvictablePodsOnNode(ctx, client, node, evictLocalStoragePods) if err != nil { return nil } - dpm := duplicatePodsMap{} - // Ignoring the error here as in the ListDuplicatePodsOnNode function we call ListEvictablePodsOnNode which checks for error. + duplicatePods := make([]*v1.Pod, 0, len(pods)) + // Each pod has a list of owners and a list of containers, and each container has 1 image spec. + // For each pod, we go through all the OwnerRef/Image mappings and represent them as a "key" string. + // All of those mappings together makes a list of "key" strings that essentially represent that pod's uniqueness. + // This list of keys representing a single pod is then sorted alphabetically. + // If any other pod has a list that matches that pod's list, those pods are undeniably duplicates for the following reasons: + // - The 2 pods have the exact same ownerrefs + // - The 2 pods have the exact same container images + // + // duplicateKeysMap maps the first Namespace/Kind/Name/Image in a pod's list to a 2D-slice of all the other lists where that is the first key + // (Since we sort each pod's list, we only need to key the map on the first entry in each list. Any pod that doesn't have + // the same first entry is clearly not a duplicate. This makes lookup quick and minimizes storage needed). + // If any of the existing lists for that first key matches the current pod's list, the current pod is a duplicate. + // If not, then we add this pod's list to the list of lists for that key. + duplicateKeysMap := map[string][][]string{} for _, pod := range pods { ownerRefList := podutil.OwnerRef(pod) + if hasExcludedOwnerRefKind(ownerRefList, strategy) { + continue + } + podContainerKeys := make([]string, 0, len(ownerRefList)*len(pod.Spec.Containers)) for _, ownerRef := range ownerRefList { - // Namespace/Kind/Name should be unique for the cluster. - s := strings.Join([]string{pod.ObjectMeta.Namespace, ownerRef.Kind, ownerRef.Name}, "/") - dpm[s] = append(dpm[s], pod) + for _, container := range pod.Spec.Containers { + // Namespace/Kind/Name should be unique for the cluster. + // We also consider the image, as 2 pods could have the same owner but serve different purposes + // So any non-unique Namespace/Kind/Name/Image pattern is a duplicate pod. + s := strings.Join([]string{pod.ObjectMeta.Namespace, ownerRef.Kind, ownerRef.Name, container.Image}, "/") + podContainerKeys = append(podContainerKeys, s) + } + } + sort.Strings(podContainerKeys) + + // If there have been any other pods with the same first "key", look through all the lists to see if any match + if existing, ok := duplicateKeysMap[podContainerKeys[0]]; ok { + for _, keys := range existing { + if reflect.DeepEqual(keys, podContainerKeys) { + duplicatePods = append(duplicatePods, pod) + break + } + // Found no matches, add this list of keys to the list of lists that have the same first key + duplicateKeysMap[podContainerKeys[0]] = append(duplicateKeysMap[podContainerKeys[0]], podContainerKeys) + } + } else { + // This is the first pod we've seen that has this first "key" entry + duplicateKeysMap[podContainerKeys[0]] = [][]string{podContainerKeys} + } + } + return duplicatePods +} + +func hasExcludedOwnerRefKind(ownerRefs []metav1.OwnerReference, strategy api.DeschedulerStrategy) bool { + if strategy.Params.RemoveDuplicates == nil { + return false + } + exclude := sets.NewString(strategy.Params.RemoveDuplicates.ExcludeOwnerKinds...) + for _, owner := range ownerRefs { + if exclude.Has(owner.Kind) { + return true } } - return dpm + return false } diff --git a/pkg/descheduler/strategies/duplicates_test.go b/pkg/descheduler/strategies/duplicates_test.go index 2d781d0544..4208dc9581 100644 --- a/pkg/descheduler/strategies/duplicates_test.go +++ b/pkg/descheduler/strategies/duplicates_test.go @@ -52,6 +52,14 @@ func TestFindDuplicatePods(t *testing.T) { p9.Namespace = "test" p10 := test.BuildTestPod("p10", 100, 0, node.Name, nil) p10.Namespace = "test" + p11 := test.BuildTestPod("p11", 100, 0, node.Name, nil) + p11.Namespace = "different-images" + p12 := test.BuildTestPod("p12", 100, 0, node.Name, nil) + p12.Namespace = "different-images" + p13 := test.BuildTestPod("p13", 100, 0, node.Name, nil) + p13.Namespace = "different-images" + p14 := test.BuildTestPod("p14", 100, 0, node.Name, nil) + p14.Namespace = "different-images" // ### Evictable Pods ### @@ -92,41 +100,88 @@ func TestFindDuplicatePods(t *testing.T) { priority := utils.SystemCriticalPriority p7.Spec.Priority = &priority + // Same owners, but different images + p11.Spec.Containers[0].Image = "foo" + p11.ObjectMeta.OwnerReferences = ownerRef1 + p12.Spec.Containers[0].Image = "bar" + p12.ObjectMeta.OwnerReferences = ownerRef1 + + // Multiple containers + p13.ObjectMeta.OwnerReferences = ownerRef1 + p13.Spec.Containers = append(p13.Spec.Containers, v1.Container{ + Name: "foo", + Image: "foo", + }) + testCases := []struct { description string maxPodsToEvict int pods []v1.Pod expectedEvictedPodCount int + strategy api.DeschedulerStrategy }{ { description: "Three pods in the `dev` Namespace, bound to same ReplicaSet. 2 should be evicted.", maxPodsToEvict: 5, pods: []v1.Pod{*p1, *p2, *p3}, expectedEvictedPodCount: 2, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Three pods in the `dev` Namespace, bound to same ReplicaSet, but ReplicaSet kind is excluded. 0 should be evicted.", + maxPodsToEvict: 5, + pods: []v1.Pod{*p1, *p2, *p3}, + expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{Params: api.StrategyParameters{RemoveDuplicates: &api.RemoveDuplicates{ExcludeOwnerKinds: []string{"ReplicaSet"}}}}, }, { description: "Three Pods in the `test` Namespace, bound to same ReplicaSet. 2 should be evicted.", maxPodsToEvict: 5, pods: []v1.Pod{*p8, *p9, *p10}, expectedEvictedPodCount: 2, + strategy: api.DeschedulerStrategy{}, }, { description: "Three Pods in the `dev` Namespace, three Pods in the `test` Namespace. Bound to ReplicaSet with same name. 4 should be evicted.", maxPodsToEvict: 5, pods: []v1.Pod{*p1, *p2, *p3, *p8, *p9, *p10}, expectedEvictedPodCount: 4, + strategy: api.DeschedulerStrategy{}, }, { description: "Pods are: part of DaemonSet, with local storage, mirror pod annotation, critical pod annotation - none should be evicted.", maxPodsToEvict: 2, pods: []v1.Pod{*p4, *p5, *p6, *p7}, expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{}, }, { description: "Test all Pods: 4 should be evicted.", maxPodsToEvict: 5, pods: []v1.Pod{*p1, *p2, *p3, *p4, *p5, *p6, *p7, *p8, *p9, *p10}, expectedEvictedPodCount: 4, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Pods with the same owner but different images should not be evicted", + maxPodsToEvict: 5, + pods: []v1.Pod{*p11, *p12}, + expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Pods with multiple containers should not match themselves", + maxPodsToEvict: 5, + pods: []v1.Pod{*p13}, + expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{}, + }, + { + description: "Pods with matching ownerrefs and at not all matching image should not trigger an eviction", + maxPodsToEvict: 5, + pods: []v1.Pod{*p11, *p13}, + expectedEvictedPodCount: 0, + strategy: api.DeschedulerStrategy{}, }, } @@ -146,7 +201,7 @@ func TestFindDuplicatePods(t *testing.T) { []*v1.Node{node}, ) - RemoveDuplicatePods(ctx, fakeClient, api.DeschedulerStrategy{}, []*v1.Node{node}, false, podEvictor) + RemoveDuplicatePods(ctx, fakeClient, testCase.strategy, []*v1.Node{node}, false, podEvictor) podsEvicted := podEvictor.TotalEvicted() if podsEvicted != testCase.expectedEvictedPodCount { t.Errorf("Test error for description: %s. Expected evicted pods count %v, got %v", testCase.description, testCase.expectedEvictedPodCount, podsEvicted)