diff --git a/pkg/descheduler/strategies/lownodeutilization.go b/pkg/descheduler/strategies/lownodeutilization.go
index ae530330c3..bc577723c4 100644
--- a/pkg/descheduler/strategies/lownodeutilization.go
+++ b/pkg/descheduler/strategies/lownodeutilization.go
@@ -19,10 +19,11 @@ package strategies
 import (
     "sort"
-    "k8s.io/api/core/v1"
+    v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/resource"
     clientset "k8s.io/client-go/kubernetes"
     "k8s.io/klog"
+
     "sigs.k8s.io/descheduler/cmd/descheduler/app/options"
     "sigs.k8s.io/descheduler/pkg/api"
     "sigs.k8s.io/descheduler/pkg/descheduler/evictions"
@@ -161,7 +162,9 @@ func evictPodsFromTargetNodes(client clientset.Interface, evictionPolicyGroupVer
     // upper bound on total number of pods/cpu/memory to be moved
     var totalPods, totalCPU, totalMem float64
+    var taintsOfLowNodes = make(map[string][]v1.Taint, len(lowNodes))
     for _, node := range lowNodes {
+        taintsOfLowNodes[node.node.Name] = node.node.Spec.Taints
         nodeCapacity := node.node.Status.Capacity
         if len(node.node.Status.Allocatable) > 0 {
             nodeCapacity = node.node.Status.Allocatable
@@ -202,18 +205,18 @@ func evictPodsFromTargetNodes(client clientset.Interface, evictionPolicyGroupVer
             // sort the evictable Pods based on priority. This also sorts them based on QoS. If there are multiple pods with same priority, they are sorted based on QoS tiers.
             sortPodsBasedOnPriority(evictablePods)
-            evictPods(evictablePods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, &currentPodsEvicted, dryRun, maxPodsToEvict)
+            evictPods(evictablePods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, &currentPodsEvicted, dryRun, maxPodsToEvict, taintsOfLowNodes)
         } else {
             // TODO: Remove this when we support only priority.
             // Falling back to evicting pods based on priority.
             klog.V(1).Infof("Evicting pods based on QoS")
             klog.V(1).Infof("There are %v non-evictable pods on the node", len(node.nonRemovablePods))
             // evict best effort pods
-            evictPods(node.bePods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, &currentPodsEvicted, dryRun, maxPodsToEvict)
+            evictPods(node.bePods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, &currentPodsEvicted, dryRun, maxPodsToEvict, taintsOfLowNodes)
             // evict burstable pods
-            evictPods(node.bPods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, &currentPodsEvicted, dryRun, maxPodsToEvict)
+            evictPods(node.bPods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, &currentPodsEvicted, dryRun, maxPodsToEvict, taintsOfLowNodes)
             // evict guaranteed pods
-            evictPods(node.gPods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, &currentPodsEvicted, dryRun, maxPodsToEvict)
+            evictPods(node.gPods, client, evictionPolicyGroupVersion, targetThresholds, nodeCapacity, node.usage, &totalPods, &totalCPU, &totalMem, &currentPodsEvicted, dryRun, maxPodsToEvict, taintsOfLowNodes)
         }
         nodepodCount[node.node] = currentPodsEvicted
         podsEvicted = podsEvicted + nodepodCount[node.node]
@@ -232,15 +235,24 @@ func evictPods(inputPods []*v1.Pod,
     totalCPU *float64,
     totalMem *float64,
     podsEvicted *int,
-    dryRun bool, maxPodsToEvict int) {
+    dryRun bool,
+    maxPodsToEvict int,
+    taintsOfLowNodes map[string][]v1.Taint) {
     if IsNodeAboveTargetUtilization(nodeUsage, targetThresholds) && (*totalPods > 0 || *totalCPU > 0 || *totalMem > 0) {
         onePodPercentage := api.Percentage((float64(1) * 100) / float64(nodeCapacity.Pods().Value()))
         for _, pod := range inputPods {
             if maxPodsToEvict > 0 && *podsEvicted+1 > maxPodsToEvict {
                 break
             }
+
+            if !utils.PodToleratesTaints(pod, taintsOfLowNodes) {
+                klog.V(3).Infof("Skipping eviction for Pod: %#v, doesn't tolerate node taint", pod.Name)
+                continue
+            }
+
             cUsage := utils.GetResourceRequest(pod, v1.ResourceCPU)
             mUsage := utils.GetResourceRequest(pod, v1.ResourceMemory)
+
             success, err := evictions.EvictPod(client, pod, evictionPolicyGroupVersion, dryRun)
             if !success {
                 klog.Warningf("Error when evicting pod: %#v (%#v)", pod.Name, err)
diff --git a/pkg/descheduler/strategies/lownodeutilization_test.go b/pkg/descheduler/strategies/lownodeutilization_test.go
index c3ec223336..eb90065abb 100644
--- a/pkg/descheduler/strategies/lownodeutilization_test.go
+++ b/pkg/descheduler/strategies/lownodeutilization_test.go
@@ -23,12 +23,18 @@
 import (
     "reflect"
-    "k8s.io/api/core/v1"
+    v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/resource"
+    "k8s.io/apimachinery/pkg/fields"
     "k8s.io/apimachinery/pkg/runtime"
+    "k8s.io/apimachinery/pkg/runtime/schema"
+    serializer "k8s.io/apimachinery/pkg/runtime/serializer"
+    "k8s.io/apimachinery/pkg/watch"
     "k8s.io/client-go/kubernetes/fake"
     core "k8s.io/client-go/testing"
+    "sigs.k8s.io/descheduler/cmd/descheduler/app/options"
     "sigs.k8s.io/descheduler/pkg/api"
+    "sigs.k8s.io/descheduler/pkg/apis/componentconfig"
     "sigs.k8s.io/descheduler/pkg/utils"
     "sigs.k8s.io/descheduler/test"
 )
@@ -327,3 +333,183 @@ func TestValidateThresholds(t *testing.T) {
         }
     }
 }
+
+func newFake(objects ...runtime.Object) *core.Fake {
+    scheme := runtime.NewScheme()
+    codecs := serializer.NewCodecFactory(scheme)
+    fake.AddToScheme(scheme)
+    o := core.NewObjectTracker(scheme, codecs.UniversalDecoder())
+    for _, obj := range objects {
+        if err := o.Add(obj); err != nil {
+            panic(err)
+        }
+    }
+
+    fakePtr := core.Fake{}
+    fakePtr.AddReactor("list", "pods", func(action core.Action) (bool, runtime.Object, error) {
+        objs, err := o.List(
+            schema.GroupVersionResource{Group: "", Version: "v1", Resource: "pods"},
+            schema.GroupVersionKind{Group: "", Version: "v1", Kind: "Pod"},
+            action.GetNamespace(),
+        )
+        if err != nil {
+            return true, nil, err
+        }
+
+        obj := &v1.PodList{
+            Items: []v1.Pod{},
+        }
+        for _, pod := range objs.(*v1.PodList).Items {
+            podFieldSet := fields.Set(map[string]string{
+                "spec.nodeName": pod.Spec.NodeName,
+                "status.phase":  string(pod.Status.Phase),
+            })
+            match := action.(core.ListAction).GetListRestrictions().Fields.Matches(podFieldSet)
+            if !match {
+                continue
+            }
+            obj.Items = append(obj.Items, *pod.DeepCopy())
+        }
+        return true, obj, nil
+    })
+    fakePtr.AddReactor("*", "*", core.ObjectReaction(o))
+    fakePtr.AddWatchReactor("*", core.DefaultWatchReactor(watch.NewFake(), nil))
+
+    return &fakePtr
+}
+
+func TestWithTaints(t *testing.T) {
+    strategy := api.DeschedulerStrategy{
+        Enabled: true,
+        Params: api.StrategyParameters{
+            NodeResourceUtilizationThresholds: api.NodeResourceUtilizationThresholds{
+                Thresholds: api.ResourceThresholds{
+                    v1.ResourcePods: 20,
+                },
+                TargetThresholds: api.ResourceThresholds{
+                    v1.ResourcePods: 70,
+                },
+            },
+        },
+    }
+
+    n1 := test.BuildTestNode("n1", 2000, 3000, 10)
+    n2 := test.BuildTestNode("n2", 1000, 3000, 10)
+    n3 := test.BuildTestNode("n3", 1000, 3000, 10)
+    n3withTaints := n3.DeepCopy()
+    n3withTaints.Spec.Taints = []v1.Taint{
+        {
+            Key:    "key",
+            Value:  "value",
+            Effect: v1.TaintEffectNoSchedule,
+        },
+    }
+
+    podThatToleratesTaint := test.BuildTestPod("tolerate_pod", 200, 0, n1.Name)
+    podThatToleratesTaint.Spec.Tolerations = []v1.Toleration{
+        {
+            Key:   "key",
+            Value: "value",
+        },
+    }
+
+    tests := []struct {
+        name              string
+        nodes             []*v1.Node
+        pods              []*v1.Pod
+        evictionsExpected int
+    }{
+        {
+            name:  "No taints",
+            nodes: []*v1.Node{n1, n2, n3},
+            pods: []*v1.Pod{
+                //Node 1 pods
+                test.BuildTestPod(fmt.Sprintf("pod_1_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_2_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_3_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_4_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_5_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_6_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_7_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_8_%s", n1.Name), 200, 0, n1.Name),
+                // Node 2 pods
+                test.BuildTestPod(fmt.Sprintf("pod_9_%s", n2.Name), 200, 0, n2.Name),
+            },
+            evictionsExpected: 1,
+        },
+        {
+            name:  "No pod tolerates node taint",
+            nodes: []*v1.Node{n1, n3withTaints},
+            pods: []*v1.Pod{
+                //Node 1 pods
+                test.BuildTestPod(fmt.Sprintf("pod_1_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_2_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_3_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_4_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_5_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_6_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_7_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_8_%s", n1.Name), 200, 0, n1.Name),
+                // Node 3 pods
+                test.BuildTestPod(fmt.Sprintf("pod_9_%s", n3withTaints.Name), 200, 0, n3withTaints.Name),
+            },
+            evictionsExpected: 0,
+        },
+        {
+            name:  "Pod which tolerates node taint",
+            nodes: []*v1.Node{n1, n3withTaints},
+            pods: []*v1.Pod{
+                //Node 1 pods
+                test.BuildTestPod(fmt.Sprintf("pod_1_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_2_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_3_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_4_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_5_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_6_%s", n1.Name), 200, 0, n1.Name),
+                test.BuildTestPod(fmt.Sprintf("pod_7_%s", n1.Name), 200, 0, n1.Name),
+                podThatToleratesTaint,
+                // Node 3 pods
+                test.BuildTestPod(fmt.Sprintf("pod_9_%s", n3withTaints.Name), 200, 0, n3withTaints.Name),
+            },
+            evictionsExpected: 1,
+        },
+    }
+
+    for _, item := range tests {
+        t.Run(item.name, func(t *testing.T) {
+            var objs []runtime.Object
+            for _, node := range item.nodes {
+                objs = append(objs, node)
+            }
+
+            for _, pod := range item.pods {
+                pod.ObjectMeta.OwnerReferences = test.GetReplicaSetOwnerRefList()
+                objs = append(objs, pod)
+            }
+
+            fakePtr := newFake(objs...)
+            var evictionCounter int
+            fakePtr.PrependReactor("create", "pods", func(action core.Action) (bool, runtime.Object, error) {
+                if action.GetSubresource() != "eviction" || action.GetResource().Resource != "pods" {
+                    return false, nil, nil
+                }
+                evictionCounter++
+                return true, nil, nil
+            })
+
+            ds := &options.DeschedulerServer{
+                Client: &fake.Clientset{Fake: *fakePtr},
+                DeschedulerConfiguration: componentconfig.DeschedulerConfiguration{
+                    EvictLocalStoragePods: false,
+                },
+            }
+
+            nodePodCount := utils.InitializeNodePodCount(item.nodes)
+            LowNodeUtilization(ds, strategy, "policy/v1", item.nodes, nodePodCount)
+
+            if item.evictionsExpected != evictionCounter {
+                t.Errorf("Expected %v evictions, got %v", item.evictionsExpected, evictionCounter)
+            }
+        })
+    }
+}
diff --git a/pkg/utils/pod.go b/pkg/utils/pod.go
index 4eafdcab2e..fa951d7a53 100644
--- a/pkg/utils/pod.go
+++ b/pkg/utils/pod.go
@@ -2,10 +2,12 @@ package utils
 import (
     "fmt"
+
     v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/resource"
     utilfeature "k8s.io/apiserver/pkg/util/feature"
     "k8s.io/component-base/featuregate"
+    "k8s.io/klog"
 )
 const (
@@ -179,3 +181,15 @@ func maxResourceList(list, new v1.ResourceList) {
         }
     }
 }
+
+// PodToleratesTaints returns true if the pod tolerates all taints of at least one of the given nodes.
+func PodToleratesTaints(pod *v1.Pod, taintsOfNodes map[string][]v1.Taint) bool {
+    for nodeName, taintsForNode := range taintsOfNodes {
+        if len(pod.Spec.Tolerations) >= len(taintsForNode) && TolerationsTolerateTaintsWithFilter(pod.Spec.Tolerations, taintsForNode, nil) {
+            return true
+        }
+        klog.V(5).Infof("pod: %#v doesn't tolerate node %s's taints", pod.Name, nodeName)
+    }
+
+    return false
+}
diff --git a/pkg/utils/predicates.go b/pkg/utils/predicates.go
index c8a7ad61bf..38bd8de89c 100644
--- a/pkg/utils/predicates.go
+++ b/pkg/utils/predicates.go
@@ -19,7 +19,7 @@ package utils
 import (
     "fmt"
-    "k8s.io/api/core/v1"
+    v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/labels"
     "k8s.io/apimachinery/pkg/selection"
     "k8s.io/klog"
@@ -126,3 +126,35 @@ func NodeSelectorRequirementsAsSelector(nsm []v1.NodeSelectorRequirement) (label
     }
     return selector, nil
 }
+
+// TolerationsTolerateTaint checks if taint is tolerated by any of the tolerations.
+func TolerationsTolerateTaint(tolerations []v1.Toleration, taint *v1.Taint) bool {
+    for i := range tolerations {
+        if tolerations[i].ToleratesTaint(taint) {
+            return true
+        }
+    }
+    return false
+}
+
+type taintsFilterFunc func(*v1.Taint) bool
+
+// TolerationsTolerateTaintsWithFilter checks whether the given tolerations tolerate
+// all taints in the given taint list that pass the filter.
+func TolerationsTolerateTaintsWithFilter(tolerations []v1.Toleration, taints []v1.Taint, applyFilter taintsFilterFunc) bool {
+    if len(taints) == 0 {
+        return true
+    }
+
+    for i := range taints {
+        if applyFilter != nil && !applyFilter(&taints[i]) {
+            continue
+        }
+
+        if !TolerationsTolerateTaint(tolerations, &taints[i]) {
+            return false
+        }
+    }
+
+    return true
+}
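For review purposes, a minimal sketch (not part of the patch) of how the new helpers behave, assuming the patched sigs.k8s.io/descheduler/pkg/utils package. The node name "low-node" and the pod names are made up; the taint/toleration pair mirrors the key/value used in TestWithTaints above.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"sigs.k8s.io/descheduler/pkg/utils"
)

func main() {
	// Taints of a hypothetical underutilized node, keyed by node name, in the
	// same shape as the taintsOfLowNodes map built in evictPodsFromTargetNodes.
	taintsOfLowNodes := map[string][]v1.Taint{
		"low-node": {{Key: "key", Value: "value", Effect: v1.TaintEffectNoSchedule}},
	}

	// A pod with no tolerations cannot land on the tainted low node, so
	// PodToleratesTaints reports false and the strategy skips evicting it.
	plain := &v1.Pod{ObjectMeta: metav1.ObjectMeta{Name: "plain"}}

	// A pod whose toleration matches the taint: an empty Operator is treated as
	// Equal and an empty Effect matches all effects (as in podThatToleratesTaint).
	tolerant := plain.DeepCopy()
	tolerant.Name = "tolerant"
	tolerant.Spec.Tolerations = []v1.Toleration{{Key: "key", Value: "value"}}

	fmt.Println(utils.PodToleratesTaints(plain, taintsOfLowNodes))    // false -> eviction skipped
	fmt.Println(utils.PodToleratesTaints(tolerant, taintsOfLowNodes)) // true  -> pod stays evictable

	// The filter argument (nil in the strategy) can restrict which taints count,
	// for example only NoSchedule taints.
	onlyNoSchedule := func(t *v1.Taint) bool { return t.Effect == v1.TaintEffectNoSchedule }
	fmt.Println(utils.TolerationsTolerateTaintsWithFilter(
		tolerant.Spec.Tolerations, taintsOfLowNodes["low-node"], onlyNoSchedule)) // true
}
```

In the strategy itself the filter is nil, so a pod is only considered for eviction when it tolerates every taint of at least one underutilized node.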