diff --git a/README.md b/README.md
index f730156a99..88c04b04a8 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ pod can or can not be scheduled, are guided by its configurable policy which com
 rules, called predicates and priorities. The scheduler's decisions are influenced by its view of
 a Kubernetes cluster at that point of time when a new pod appears for scheduling.
 As Kubernetes clusters are very dynamic and their state changes over time, there may be desire
-to move already running pods to some other nodes for various reasons:
+to move already running pods to some other nodes, or to allow the pods to be terminated outright:
 
 * Some nodes are under or over utilized.
 * The original scheduling decision does not hold true any more, as taints or labels are added to
@@ -20,9 +20,12 @@ or removed from nodes, pod/node affinity requirements are not satisfied any more
 * New nodes are added to clusters.
 
 Consequently, there might be several pods scheduled on less desired nodes in a cluster.
-Descheduler, based on its policy, finds pods that can be moved and evicts them. Please
-note, in current implementation, descheduler does not schedule replacement of evicted pods
-but relies on the default scheduler for that.
+Descheduler, based on its policy, finds pods that can be moved and evicts them. By default,
+Descheduler aims to ensure that there is no service degradation across the cluster. When a pod
+is no longer runnable on its node and no suitable movement candidate can be found, Descheduler
+can optionally terminate the problematic pod, provided service degradation is explicitly allowed.
+Please note that, in its current implementation, Descheduler does not schedule replacements of
+evicted or terminated pods but instead relies on the default scheduler for that.
 
 ## Quick Start
 
@@ -238,6 +241,17 @@ never evicted because these pods won't be recreated.
 Pods subject to a Pod Disruption Budget(PDB) are not evicted if descheduling violates its PDB.
 The pods are evicted by using the eviction subresource to handle PDB.
 
+### Degradation
+
+By default, pods marked for eviction are only evicted if a suitable node can be found for rescheduling.
+This ensures that, when no suitable rescheduling candidate is found, the pod continues to run on its
+current node.
+
+In certain cases, such as when a pod was scheduled based on labelling criteria that are no longer satisfied,
+it can be preferable (and, at times, essential) to terminate the running pod even if it has no
+rescheduling candidate. This behaviour can be enabled by running Descheduler in allowed-degradation mode,
+activated by the `--degradation-allowed` CLI argument.
+
 ## Compatibility Matrix
 
 The below compatibility matrix shows the k8s client package(client-go, apimachinery, etc) versions that descheduler is compiled with. At this time descheduler does not have a hard dependency to a specific k8s release. However a
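The decision rule described in the new `Degradation` section boils down to a single condition: a pod that no longer fits its current node is evicted only when another node can host it, unless degradation is allowed. Below is a minimal, illustrative Go sketch of that rule; the `shouldEvict`, `fitsCurrentNode` and `fitsAnyOtherNode` names are placeholders, not functions from this repository, and the real logic lives in the strategy changes further down in this patch.

```go
package main

import "fmt"

// shouldEvict sketches the rule from the Degradation section: a pod that no
// longer fits its current node is evicted only if another node can host it,
// unless degradation is allowed, in which case it may be terminated even
// without a rescheduling candidate.
func shouldEvict(fitsCurrentNode, fitsAnyOtherNode, degradationAllowed bool) bool {
	if fitsCurrentNode {
		return false // the pod can keep running where it is
	}
	return fitsAnyOtherNode || degradationAllowed
}

func main() {
	// No rescheduling candidate, degradation allowed: the pod is still evicted.
	fmt.Println(shouldEvict(false, false, true)) // true
	// No rescheduling candidate, degradation not allowed: the pod is left alone.
	fmt.Println(shouldEvict(false, false, false)) // false
}
```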
diff --git a/cmd/descheduler/app/options/options.go b/cmd/descheduler/app/options/options.go
index d398592e00..a421912c5a 100644
--- a/cmd/descheduler/app/options/options.go
+++ b/cmd/descheduler/app/options/options.go
@@ -52,6 +52,7 @@ func (rs *DeschedulerServer) AddFlags(fs *pflag.FlagSet) {
 	fs.StringVar(&rs.KubeconfigFile, "kubeconfig", rs.KubeconfigFile, "File with kube configuration.")
 	fs.StringVar(&rs.PolicyConfigFile, "policy-config-file", rs.PolicyConfigFile, "File with descheduler policy configuration.")
 	fs.BoolVar(&rs.DryRun, "dry-run", rs.DryRun, "execute descheduler in dry run mode.")
+	fs.BoolVar(&rs.DegradationAllowed, "degradation-allowed", rs.DegradationAllowed, "Allow descheduling of Pods that have no rescheduling candidates")
 	// node-selector query causes descheduler to run only on nodes that matches the node labels in the query
 	fs.StringVar(&rs.NodeSelector, "node-selector", rs.NodeSelector, "Selector (label query) to filter on, supports '=', '==', and '!='.(e.g. -l key1=value1,key2=value2)")
 	// max-no-pods-to-evict limits the maximum number of pods to be evicted per node by descheduler.
diff --git a/docs/user-guide.md b/docs/user-guide.md
index 335983eee6..6b30cc1206 100644
--- a/docs/user-guide.md
+++ b/docs/user-guide.md
@@ -25,6 +25,7 @@ Available Commands:
 Flags:
       --add-dir-header                     If true, adds the file directory to the header
       --alsologtostderr                    log to standard error as well as files
+      --degradation-allowed                Allow descheduling of Pods that have no rescheduling candidates
       --descheduling-interval duration     Time interval between two consecutive descheduler executions. Setting this value instructs the descheduler to run in a continuous loop at the interval specified.
       --dry-run                            execute descheduler in dry run mode.
       --evict-local-storage-pods           Enables evicting pods using local storage by descheduler
diff --git a/pkg/apis/componentconfig/types.go b/pkg/apis/componentconfig/types.go
index 1588d59f52..de83b35d9b 100644
--- a/pkg/apis/componentconfig/types.go
+++ b/pkg/apis/componentconfig/types.go
@@ -40,6 +40,9 @@ type DeschedulerConfiguration struct {
 	// Dry run
 	DryRun bool
 
+	// Degradation allowed
+	DegradationAllowed bool
+
 	// Node selectors
 	NodeSelector string
 
diff --git a/pkg/apis/componentconfig/v1alpha1/types.go b/pkg/apis/componentconfig/v1alpha1/types.go
index 69121ea636..1e6f3b4fca 100644
--- a/pkg/apis/componentconfig/v1alpha1/types.go
+++ b/pkg/apis/componentconfig/v1alpha1/types.go
@@ -40,6 +40,9 @@ type DeschedulerConfiguration struct {
 	// Dry run
 	DryRun bool `json:"dryRun,omitempty"`
 
+	// Degradation allowed
+	DegradationAllowed bool `json:"degradationAllowed,omitempty"`
+
 	// Node selectors
 	NodeSelector string `json:"nodeSelector,omitempty"`
 
diff --git a/pkg/descheduler/descheduler.go b/pkg/descheduler/descheduler.go
index e6c83f8623..8feedce272 100644
--- a/pkg/descheduler/descheduler.go
+++ b/pkg/descheduler/descheduler.go
@@ -87,7 +87,7 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
 		return
 	}
 
-	if len(nodes) <= 1 {
+	if len(nodes) <= 1 && !rs.DegradationAllowed {
 		klog.V(1).Infof("The cluster size is 0 or 1 meaning eviction causes service disruption or degradation. So aborting..")
 		close(stopChannel)
 		return
@@ -97,6 +97,7 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
 		rs.Client,
 		evictionPolicyGroupVersion,
 		rs.DryRun,
+		rs.DegradationAllowed,
 		rs.MaxNoOfPodsToEvictPerNode,
 		nodes,
 	)
diff --git a/pkg/descheduler/evictions/evictions.go b/pkg/descheduler/evictions/evictions.go
index e387590db1..154567cb56 100644
--- a/pkg/descheduler/evictions/evictions.go
+++ b/pkg/descheduler/evictions/evictions.go
@@ -40,6 +40,7 @@ type PodEvictor struct {
 	client             clientset.Interface
 	policyGroupVersion string
 	dryRun             bool
+	DegradationAllowed bool
 	maxPodsToEvict     int
 	nodepodCount       nodePodEvictedCount
 }
@@ -48,6 +49,7 @@ func NewPodEvictor(
 	client clientset.Interface,
 	policyGroupVersion string,
 	dryRun bool,
+	degradationAllowed bool,
 	maxPodsToEvict int,
 	nodes []*v1.Node,
 ) *PodEvictor {
@@ -61,6 +63,7 @@ func NewPodEvictor(
 		client:             client,
 		policyGroupVersion: policyGroupVersion,
 		dryRun:             dryRun,
+		DegradationAllowed: degradationAllowed,
 		maxPodsToEvict:     maxPodsToEvict,
 		nodepodCount:       nodePodCount,
 	}
diff --git a/pkg/descheduler/strategies/duplicates_test.go b/pkg/descheduler/strategies/duplicates_test.go
index 4208dc9581..93fd8d24dc 100644
--- a/pkg/descheduler/strategies/duplicates_test.go
+++ b/pkg/descheduler/strategies/duplicates_test.go
@@ -197,6 +197,7 @@ func TestFindDuplicatePods(t *testing.T) {
 			fakeClient,
 			"v1",
 			false,
+			false,
 			testCase.maxPodsToEvict,
 			[]*v1.Node{node},
 		)
diff --git a/pkg/descheduler/strategies/lownodeutilization_test.go b/pkg/descheduler/strategies/lownodeutilization_test.go
index 78441d7151..768699c878 100644
--- a/pkg/descheduler/strategies/lownodeutilization_test.go
+++ b/pkg/descheduler/strategies/lownodeutilization_test.go
@@ -355,6 +355,7 @@ func TestLowNodeUtilization(t *testing.T) {
 			fakeClient,
 			"v1",
 			false,
+			false,
 			test.expectedPodsEvicted,
 			nodes,
 		)
@@ -628,6 +629,7 @@ func TestWithTaints(t *testing.T) {
 			&fake.Clientset{Fake: *fakePtr},
 			"policy/v1",
 			false,
+			false,
 			item.evictionsExpected,
 			item.nodes,
 		)
diff --git a/pkg/descheduler/strategies/node_affinity.go b/pkg/descheduler/strategies/node_affinity.go
index 8a687f34cb..ba3ccafced 100644
--- a/pkg/descheduler/strategies/node_affinity.go
+++ b/pkg/descheduler/strategies/node_affinity.go
@@ -45,7 +45,7 @@ func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Inter
 
 				for _, pod := range pods {
 					if pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil && pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
-						if !nodeutil.PodFitsCurrentNode(pod, node) && nodeutil.PodFitsAnyNode(pod, nodes) {
+						if !nodeutil.PodFitsCurrentNode(pod, node) && (nodeutil.PodFitsAnyNode(pod, nodes) || podEvictor.DegradationAllowed) {
 							klog.V(1).Infof("Evicting pod: %v", pod.Name)
 							if _, err := podEvictor.EvictPod(ctx, pod, node); err != nil {
 								klog.Errorf("Error evicting pod: (%#v)", err)
diff --git a/pkg/descheduler/strategies/node_affinity_test.go b/pkg/descheduler/strategies/node_affinity_test.go
index 06e5f54305..0aacde4e7b 100644
--- a/pkg/descheduler/strategies/node_affinity_test.go
+++ b/pkg/descheduler/strategies/node_affinity_test.go
@@ -155,6 +155,7 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) {
 			fakeClient,
 			"v1",
 			false,
+			false,
 			tc.maxPodsToEvict,
 			tc.nodes,
 		)
diff --git a/pkg/descheduler/strategies/node_taint_test.go b/pkg/descheduler/strategies/node_taint_test.go
index 46c89c5fd6..38b5e3a470 100644
--- a/pkg/descheduler/strategies/node_taint_test.go
+++ b/pkg/descheduler/strategies/node_taint_test.go
@@ -168,6 +168,7 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
 			fakeClient,
 			"v1",
 			false,
+			false,
 			tc.maxPodsToEvict,
 			tc.nodes,
 		)
diff --git a/pkg/descheduler/strategies/pod_antiaffinity_test.go b/pkg/descheduler/strategies/pod_antiaffinity_test.go
index 4c4febda3e..9b55c05cb2 100644
--- a/pkg/descheduler/strategies/pod_antiaffinity_test.go
+++ b/pkg/descheduler/strategies/pod_antiaffinity_test.go
@@ -82,6 +82,7 @@ func TestPodAntiAffinity(t *testing.T) {
 		fakeClient,
 		"v1",
 		false,
+		false,
 		test.maxPodsToEvict,
 		[]*v1.Node{node},
 	)
diff --git a/pkg/descheduler/strategies/pod_lifetime_test.go b/pkg/descheduler/strategies/pod_lifetime_test.go
index b389551b82..0828d269ee 100644
--- a/pkg/descheduler/strategies/pod_lifetime_test.go
+++ b/pkg/descheduler/strategies/pod_lifetime_test.go
@@ -155,6 +155,7 @@ func TestPodLifeTime(t *testing.T) {
 			fakeClient,
 			"v1",
 			false,
+			false,
 			tc.maxPodsToEvict,
 			[]*v1.Node{node},
 		)
diff --git a/pkg/descheduler/strategies/toomanyrestarts_test.go b/pkg/descheduler/strategies/toomanyrestarts_test.go
index b68863a873..ebbbe76e1b 100644
--- a/pkg/descheduler/strategies/toomanyrestarts_test.go
+++ b/pkg/descheduler/strategies/toomanyrestarts_test.go
@@ -169,6 +169,7 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
 			fakeClient,
 			"v1",
 			false,
+			false,
 			tc.maxPodsToEvict,
 			[]*v1.Node{node},
 		)
diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go
index 5e0c906f6f..5209c14986 100644
--- a/test/e2e/e2e_test.go
+++ b/test/e2e/e2e_test.go
@@ -130,6 +130,7 @@ func startEndToEndForLowNodeUtilization(ctx context.Context, clientset clientset
 		clientset,
 		evictionPolicyGroupVersion,
 		false,
+		false,
 		0,
 		nodes,
 	)
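Putting the pieces together, every caller of `NewPodEvictor` now passes the new `degradationAllowed` argument in fourth position, as the updated call sites above show. The following is a hedged sketch of such a call with degradation enabled; it assumes the module path `sigs.k8s.io/descheduler`, uses a fake clientset as the unit tests above do, and the `buildEvictor` helper is illustrative only.

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	"k8s.io/client-go/kubernetes/fake"

	"sigs.k8s.io/descheduler/pkg/descheduler/evictions"
)

// buildEvictor demonstrates the NewPodEvictor signature after this patch:
// client, eviction policy group version, dryRun, degradationAllowed,
// maxPodsToEvict and the node list.
func buildEvictor(nodes []*v1.Node) *evictions.PodEvictor {
	return evictions.NewPodEvictor(
		fake.NewSimpleClientset(), // placeholder client, as in the unit tests above
		"v1",
		false, // dryRun
		true,  // degradationAllowed: pods may be evicted without a rescheduling candidate
		0,     // maxPodsToEvict; 0 appears to mean "no per-node limit"
		nodes,
	)
}

func main() {
	evictor := buildEvictor(nil)
	fmt.Printf("degradation allowed: %v\n", evictor.DegradationAllowed)
}
```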