Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consider pod image in duplicates strategy #275

Merged
merged 1 commit into from
May 15, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 9 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -66,15 +66,21 @@ Replication Controller (RC), Deployment, or Job running on the same node. If the
those duplicate pods are evicted for better spreading of pods in a cluster. This issue could happen
if some nodes went down due to whatever reasons, and pods on them were moved to other nodes leading to
more than one pod associated with a RS or RC, for example, running on the same node. Once the failed nodes
are ready again, this strategy could be enabled to evict those duplicate pods. Currently, there are no
parameters associated with this strategy. To disable this strategy, the policy should look like:
are ready again, this strategy could be enabled to evict those duplicate pods.

It provides one optional parameter, `ExcludeOwnerKinds`, which is a list of OwnerRef `Kind`s. If a pod
has any of these `Kind`s listed as an `OwnerRef`, that pod will not be considered for eviction.

```
apiVersion: "descheduler/v1alpha1"
kind: "DeschedulerPolicy"
strategies:
"RemoveDuplicates":
enabled: false
enabled: true
params:
removeDuplicates:
excludeOwnerKinds:
- "ReplicaSet"
```

### LowNodeUtilization
Expand Down
5 changes: 5 additions & 0 deletions pkg/api/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type StrategyParameters struct {
NodeAffinityType []string
PodsHavingTooManyRestarts *PodsHavingTooManyRestarts
MaxPodLifeTimeSeconds *uint
RemoveDuplicates *RemoveDuplicates
}

type Percentage float64
Expand All @@ -65,3 +66,7 @@ type PodsHavingTooManyRestarts struct {
PodRestartThreshold int32
IncludingInitContainers bool
}

type RemoveDuplicates struct {
ExcludeOwnerKinds []string
}
5 changes: 5 additions & 0 deletions pkg/api/v1alpha1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ type StrategyParameters struct {
NodeAffinityType []string `json:"nodeAffinityType,omitempty"`
PodsHavingTooManyRestarts *PodsHavingTooManyRestarts `json:"podsHavingTooManyRestarts,omitempty"`
MaxPodLifeTimeSeconds *uint `json:"maxPodLifeTimeSeconds,omitempty"`
RemoveDuplicates *RemoveDuplicates `json:"removeDuplicates,omitempty"`
}

type Percentage float64
Expand All @@ -65,3 +66,7 @@ type PodsHavingTooManyRestarts struct {
PodRestartThreshold int32 `json:"podRestartThreshold,omitempty"`
IncludingInitContainers bool `json:"includingInitContainers,omitempty"`
}

type RemoveDuplicates struct {
ExcludeOwnerKinds []string `json:"excludeOwnerKinds,omitempty"`
}
32 changes: 32 additions & 0 deletions pkg/api/v1alpha1/zz_generated.conversion.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions pkg/api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions pkg/api/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

91 changes: 69 additions & 22 deletions pkg/descheduler/strategies/duplicates.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,13 @@ package strategies

import (
"context"
"reflect"
"sort"
"strings"

v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/sets"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/klog"

Expand All @@ -31,7 +35,8 @@ import (

// RemoveDuplicatePods removes the duplicate pods on node. This strategy evicts all duplicate pods on node.
// A pod is said to be a duplicate of other if both of them are from same creator, kind and are within the same
// namespace. As of now, this strategy won't evict daemonsets, mirror pods, critical pods and pods with local storages.
// namespace, and have at least one container with the same image.
// As of now, this strategy won't evict daemonsets, mirror pods, critical pods and pods with local storages.
func RemoveDuplicatePods(
ctx context.Context,
client clientset.Interface,
Expand All @@ -42,41 +47,83 @@ func RemoveDuplicatePods(
) {
for _, node := range nodes {
klog.V(1).Infof("Processing node: %#v", node.Name)
dpm := listDuplicatePodsOnANode(ctx, client, node, evictLocalStoragePods)
for creator, pods := range dpm {
if len(pods) > 1 {
klog.V(1).Infof("%#v", creator)
// i = 0 does not evict the first pod
for i := 1; i < len(pods); i++ {
if _, err := podEvictor.EvictPod(ctx, pods[i], node); err != nil {
klog.Errorf("Error evicting pod: (%#v)", err)
break
}
}
duplicatePods := listDuplicatePodsOnANode(ctx, client, node, strategy, evictLocalStoragePods)
for _, pod := range duplicatePods {
if _, err := podEvictor.EvictPod(ctx, pod, node); err != nil {
klog.Errorf("Error evicting pod: (%#v)", err)
break
}
}
}
}

//type creator string
type duplicatePodsMap map[string][]*v1.Pod

// listDuplicatePodsOnANode lists duplicate pods on a given node.
func listDuplicatePodsOnANode(ctx context.Context, client clientset.Interface, node *v1.Node, evictLocalStoragePods bool) duplicatePodsMap {
// It checks for pods which have the same owner and have at least 1 container with the same image spec
func listDuplicatePodsOnANode(ctx context.Context, client clientset.Interface, node *v1.Node, strategy api.DeschedulerStrategy, evictLocalStoragePods bool) []*v1.Pod {
pods, err := podutil.ListEvictablePodsOnNode(ctx, client, node, evictLocalStoragePods)
if err != nil {
return nil
}

dpm := duplicatePodsMap{}
// Ignoring the error here as in the ListDuplicatePodsOnNode function we call ListEvictablePodsOnNode which checks for error.
duplicatePods := make([]*v1.Pod, 0, len(pods))
// Each pod has a list of owners and a list of containers, and each container has 1 image spec.
// For each pod, we go through all the OwnerRef/Image mappings and represent them as a "key" string.
// All of those mappings together makes a list of "key" strings that essentially represent that pod's uniqueness.
// This list of keys representing a single pod is then sorted alphabetically.
// If any other pod has a list that matches that pod's list, those pods are undeniably duplicates for the following reasons:
// - The 2 pods have the exact same ownerrefs
// - The 2 pods have the exact same container images
//
// duplicateKeysMap maps the first Namespace/Kind/Name/Image in a pod's list to a 2D-slice of all the other lists where that is the first key
// (Since we sort each pod's list, we only need to key the map on the first entry in each list. Any pod that doesn't have
// the same first entry is clearly not a duplicate. This makes lookup quick and minimizes storage needed).
// If any of the existing lists for that first key matches the current pod's list, the current pod is a duplicate.
// If not, then we add this pod's list to the list of lists for that key.
duplicateKeysMap := map[string][][]string{}
for _, pod := range pods {
ownerRefList := podutil.OwnerRef(pod)
if hasExcludedOwnerRefKind(ownerRefList, strategy) {
continue
}
podContainerKeys := make([]string, 0, len(ownerRefList)*len(pod.Spec.Containers))
for _, ownerRef := range ownerRefList {
// Namespace/Kind/Name should be unique for the cluster.
s := strings.Join([]string{pod.ObjectMeta.Namespace, ownerRef.Kind, ownerRef.Name}, "/")
dpm[s] = append(dpm[s], pod)
for _, container := range pod.Spec.Containers {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There's a strong assumption the owner is either RC, RS, SS or alike. I.e. one template for all the instances. Also, there's no policy disallowing to create an owner that's not template based. I.e. all pods owned by the same owner kind/name but each pod created from a different template. As you now mention in the strategy description: A pod is said to be a duplicate of other if both of them are from same creator, kind and are within the same namespace, and have at least one container with the same image. It's perfectly valid for template based owners. However, for non-template based owners, one might face the following situation:
pod1:

  • container: image1
  • container: image2
    pod2:
  • container: image1
  • container: image3
    Which, based on the current definition of a duplicate, pod1 is a duplicate of pod2. What about to extend the definition to say:
A pod is a duplicate of another pod if both pods have:
- the same owner kind/name
- the same namespace
- the same list of containers (the same list of names, each two equal names have the same image).

?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ingvagabund This was actually a problem I considered, whether we want to consider duplicates when there is an exact matching list of images or if just one match is enough. I originally went with the latter, but can see that there is a case for matching exact lists.

So I have pushed 365e355, which updates the strategy to the strictest definition, where pods are only duplicates if they have the exact same list of owners and images.

But because now we're checking uniqueness against a list of strings (instead of just one string), the map we had before wouldn't work (unless we came up with a way to represent the entire list as a unique string, which could get long and messy).

So now, I'm building the list of all the ownerref/image keys for a pod, then sorting that list. I'm keying the duplicates map on the first entry of that list (because if another pod doesn't have the same first entry, it's not a duplicate). The value for that key in the map is a list of all the lists which contain the same first key. If we find a matching list, the pod is a duplicate.

I was going to just make a simple loop using the existing map, where we check if all the key entries have >1 existing match like before. But this could give a false positive for pods with shorter lists.

Please take a look at the new commit (I've kept it separate for now to review easier)

// Namespace/Kind/Name should be unique for the cluster.
// We also consider the image, as 2 pods could have the same owner but serve different purposes
// So any non-unique Namespace/Kind/Name/Image pattern is a duplicate pod.
s := strings.Join([]string{pod.ObjectMeta.Namespace, ownerRef.Kind, ownerRef.Name, container.Image}, "/")
podContainerKeys = append(podContainerKeys, s)
}
}
sort.Strings(podContainerKeys)

// If there have been any other pods with the same first "key", look through all the lists to see if any match
if existing, ok := duplicateKeysMap[podContainerKeys[0]]; ok {
for _, keys := range existing {
if reflect.DeepEqual(keys, podContainerKeys) {
duplicatePods = append(duplicatePods, pod)
break
}
// Found no matches, add this list of keys to the list of lists that have the same first key
duplicateKeysMap[podContainerKeys[0]] = append(duplicateKeysMap[podContainerKeys[0]], podContainerKeys)
}
} else {
// This is the first pod we've seen that has this first "key" entry
duplicateKeysMap[podContainerKeys[0]] = [][]string{podContainerKeys}
}
}
return duplicatePods
}

func hasExcludedOwnerRefKind(ownerRefs []metav1.OwnerReference, strategy api.DeschedulerStrategy) bool {
if strategy.Params.RemoveDuplicates == nil {
return false
}
exclude := sets.NewString(strategy.Params.RemoveDuplicates.ExcludeOwnerKinds...)
for _, owner := range ownerRefs {
if exclude.Has(owner.Kind) {
return true
}
}
return dpm
return false
}
Loading