Add support for degradation #301

Closed
22 changes: 18 additions & 4 deletions README.md
@@ -11,7 +11,7 @@ pod can or can not be scheduled, are guided by its configurable policy which com
rules, called predicates and priorities. The scheduler's decisions are influenced by its view of
a Kubernetes cluster at that point of time when a new pod appears for scheduling.
As Kubernetes clusters are very dynamic and their state changes over time, there may be a desire
to move already running pods to some other nodes for various reasons:
to move already running pods to some other nodes, or to allow the pods to be terminated outright:

* Some nodes are under- or over-utilized.
* The original scheduling decision does not hold true any more, as taints or labels are added to
@@ -20,9 +20,12 @@ or removed from nodes, pod/node affinity requirements are not satisfied any more
* New nodes are added to clusters.

Consequently, there might be several pods scheduled on less desirable nodes in a cluster.
Descheduler, based on its policy, finds pods that can be moved and evicts them. Please
note, in current implementation, descheduler does not schedule replacement of evicted pods
but relies on the default scheduler for that.
Descheduler, based on its policy, finds pods that can be moved and evicts them. By default,
Descheduler aims to ensure that there is no service degradation across the cluster. In the case
where a pod can no longer run on its node and no suitable rescheduling candidate can be found,
Descheduler can optionally terminate the problematic pod, provided service degradation is allowed.
Please note that, in its current implementation, descheduler does not schedule replacements for
evicted or terminated pods but instead relies on the default scheduler for that.

## Quick Start

@@ -238,6 +241,17 @@ never evicted because these pods won't be recreated.
Pods subject to a Pod Disruption Budget (PDB) are not evicted if descheduling violates the PDB. The pods
are evicted by using the eviction subresource to handle the PDB.

### Degradation

By default, pods marked for eviction are evicted only if a suitable node can be found to reschedule
them onto. This ensures that, when no rescheduling candidate exists, the pod keeps running on its
current node.

In certain cases, such as when a pod was scheduled based on labelling criteria that are no longer satisfied,
it can be preferable (and, at times, essential) to terminate the running pod even if it has no
rescheduling candidate. This behaviour can be enabled by running Descheduler with degradation allowed,
activated via the `--degradation-allowed` CLI flag (see the sketch below).
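
To make the behaviour concrete, here is a minimal sketch (illustrative only, not the project's actual code; `shouldEvict` and its boolean inputs stand in for the checks performed by helpers such as `nodeutil.PodFitsCurrentNode` and `nodeutil.PodFitsAnyNode`):

```go
package main

import "fmt"

// shouldEvict sketches the relaxed decision: a pod that no longer fits its
// current node is normally evicted only when some other node can take it;
// with degradation allowed it is evicted (and effectively terminated) anyway.
func shouldEvict(fitsCurrentNode, fitsAnyNode, degradationAllowed bool) bool {
	if fitsCurrentNode {
		return false // the pod can keep running where it is
	}
	return fitsAnyNode || degradationAllowed
}

func main() {
	// No node fits the pod any more: it is evicted only if degradation is allowed.
	fmt.Println(shouldEvict(false, false, false)) // false
	fmt.Println(shouldEvict(false, false, true))  // true
}
```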

## Compatibility Matrix
The below compatibility matrix shows the k8s client package (client-go, apimachinery, etc) versions that descheduler
is compiled with. At this time descheduler does not have a hard dependency on a specific k8s release. However a
1 change: 1 addition & 0 deletions cmd/descheduler/app/options/options.go
@@ -52,6 +52,7 @@ func (rs *DeschedulerServer) AddFlags(fs *pflag.FlagSet) {
fs.StringVar(&rs.KubeconfigFile, "kubeconfig", rs.KubeconfigFile, "File with kube configuration.")
fs.StringVar(&rs.PolicyConfigFile, "policy-config-file", rs.PolicyConfigFile, "File with descheduler policy configuration.")
fs.BoolVar(&rs.DryRun, "dry-run", rs.DryRun, "execute descheduler in dry run mode.")
fs.BoolVar(&rs.DegradationAllowed, "degradation-allowed", rs.DegradationAllowed, "Allow descheduling of Pods that have no rescheduling candidates")
// node-selector query causes descheduler to run only on nodes that matches the node labels in the query
fs.StringVar(&rs.NodeSelector, "node-selector", rs.NodeSelector, "Selector (label query) to filter on, supports '=', '==', and '!='.(e.g. -l key1=value1,key2=value2)")
// max-no-pods-to-evict limits the maximum number of pods to be evicted per node by descheduler.
1 change: 1 addition & 0 deletions docs/user-guide.md
@@ -25,6 +25,7 @@ Available Commands:
Flags:
--add-dir-header If true, adds the file directory to the header
--alsologtostderr log to standard error as well as files
--degradation-allowed Allow descheduling of Pods that have no rescheduling candidates
--descheduling-interval duration Time interval between two consecutive descheduler executions. Setting this value instructs the descheduler to run in a continuous loop at the interval specified.
--dry-run execute descheduler in dry run mode.
--evict-local-storage-pods Enables evicting pods using local storage by descheduler
3 changes: 3 additions & 0 deletions pkg/apis/componentconfig/types.go
@@ -40,6 +40,9 @@ type DeschedulerConfiguration struct {
// Dry run
DryRun bool

// Degradation allowed
DegradationAllowed bool

// Node selectors
NodeSelector string

3 changes: 3 additions & 0 deletions pkg/apis/componentconfig/v1alpha1/types.go
@@ -40,6 +40,9 @@ type DeschedulerConfiguration struct {
// Dry run
DryRun bool `json:"dryRun,omitempty"`

// Degradation allowed
DegradationAllowed bool `json:"degradationAllowed,omitempty"`

// Node selectors
NodeSelector string `json:"nodeSelector,omitempty"`

3 changes: 2 additions & 1 deletion pkg/descheduler/descheduler.go
@@ -87,7 +87,7 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
return
}

if len(nodes) <= 1 {
if len(nodes) <= 1 && !rs.DegradationAllowed {
klog.V(1).Infof("The cluster size is 0 or 1 meaning eviction causes service disruption or degradation. So aborting..")
close(stopChannel)
return
@@ -97,6 +97,7 @@ func RunDeschedulerStrategies(ctx context.Context, rs *options.DeschedulerServer
rs.Client,
evictionPolicyGroupVersion,
rs.DryRun,
rs.DegradationAllowed,
rs.MaxNoOfPodsToEvictPerNode,
nodes,
)
3 changes: 3 additions & 0 deletions pkg/descheduler/evictions/evictions.go
@@ -40,6 +40,7 @@ type PodEvictor struct {
client clientset.Interface
policyGroupVersion string
dryRun bool
DegradationAllowed bool
maxPodsToEvict int
nodepodCount nodePodEvictedCount
}
@@ -48,6 +49,7 @@ func NewPodEvictor(
client clientset.Interface,
policyGroupVersion string,
dryRun bool,
degradationAllowed bool,
maxPodsToEvict int,
nodes []*v1.Node,
) *PodEvictor {
@@ -61,6 +63,7 @@
client: client,
policyGroupVersion: policyGroupVersion,
dryRun: dryRun,
DegradationAllowed: degradationAllowed,
Contributor

The DegradationAllowed field is not used anywhere inside PodEvictor methods. Is this PR still a WIP?

Author

No, it's used by the eviction strategy, which is passed the pod evictor. I opted to make the field exported so we could access the flag there, rather than having to modify every single eviction strategy call site to add another argument.

Contributor

I see: podEvictor.DegradationAllowed. It's not a good practice.

rather than having to modify every single eviction strategy callsite to add in another argument

However, it's the right thing to do from a maintainability perspective. If you need to access DegradationAllowed through PodEvictor, introducing a new method is more practical than accessing the field directly. Also, DegradationAllowed is not related to evicting the pod itself, but rather to relaxing the constraints for selecting pods to be evicted.
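
For reference, a minimal sketch of the accessor-style alternative suggested here (the unexported field and method name are illustrative and not part of this PR):

```go
package evictions

// PodEvictor is trimmed to the field relevant to this discussion; the real
// struct also carries the client, policy group version, dry-run flag, etc.
type PodEvictor struct {
	degradationAllowed bool
}

// DegradationAllowed exposes the setting through a method rather than a
// public field, so strategies query the evictor without touching its state.
func (pe *PodEvictor) DegradationAllowed() bool {
	return pe.degradationAllowed
}
```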

Author

I would argue that changing every function every time we add a new flag is worse from a maintainability point of view, but it's not my code base, so I'm happy to follow whatever convention people like. In terms of the degradation, you are correct, we are presently doing this through a global flag and applying it across all pods that satisfy the criteria. We could also consider pushing this down to the pod level through an additional annotation?

Contributor

I would argue that changing every function every time we add a new flag

You are right. The strategyFunction data type is still evolving. I am fine with changing the signature [1] to:

```go
type strategyFunction func(
	ctx context.Context,
	client clientset.Interface,
	strategy api.DeschedulerStrategy,
	nodes []*v1.Node,
	opts Options,
	podEvictor *evictions.PodEvictor,
)
```

where Options can be defined as:

```go
type Options struct {
	EvictLocalStoragePods bool
	DegradationAllowed    bool
}
```

as long as Options contains fields generic for any strategy.

[1] https://github.com/kubernetes-sigs/descheduler/blob/master/pkg/descheduler/descheduler.go#L63
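
A rough sketch of how such options could flow from the parsed flags into a strategy call (toy types only; the real strategyFunction also receives the context, client, strategy, nodes and pod evictor):

```go
package main

import "fmt"

// Options carries strategy-agnostic settings, following the suggestion above.
type Options struct {
	EvictLocalStoragePods bool
	DegradationAllowed    bool
}

// strategyFunc is a toy stand-in for the descheduler's strategyFunction type.
type strategyFunc func(opts Options)

func main() {
	// In the descheduler these values would come from the CLI flags
	// (--evict-local-storage-pods, --degradation-allowed).
	opts := Options{DegradationAllowed: true}

	strategies := map[string]strategyFunc{
		"RemovePodsViolatingNodeAffinity": func(opts Options) {
			fmt.Println("degradation allowed:", opts.DegradationAllowed)
		},
	}
	for name, run := range strategies {
		fmt.Println("running", name)
		run(opts)
	}
}
```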

We could also consider pushing this down to the pod level through an additional annotation?

If I understand the intention correctly, DegradationAllowed is a strategy-level configuration. As such, the option needs to be passed into the strategy before it's run.

Contributor @damemi (Jun 1, 2020)

@pmundt if this is only used by the nodeAffinity strategy (and, in your new nodeSelector strategy), does it have to be a global setting? It could just be a StrategyParam field for those, right?

Author @pmundt (Jun 2, 2020)

As noted in the other PR (sorry for the run-around, I mention it here for posterity), we would also plan to evaluate it within IsEvictable in cases where we know the pod can be explicitly degraded. I'm therefore not sure that the StrategyParam would be sufficient, unless we're also able to access this from the pod package somehow. In terms of the global setting, I don't mind if we get rid of it and use a Pod annotation or similar; the important thing is simply that we have a mechanism to degrade specific node-local Pods - I'll defer to you on whichever option you find more palatable! If we leave it as an annotation, we could presumably also leave it as a StrategyParam, as the annotation could be tested independently within IsEvictable.

Contributor @damemi (Jun 2, 2020)

In terms of the global setting, I don't mind if we get rid of this and use a Pod annotation or similar, the important thing is simply that we have a mechanism to degrade specific node-local Pods

Is this something that the existing descheduler.alpha.kubernetes.io/evict annotation could solve? It is already checked in IsEvictable and effectively bypasses that check for specific pods.
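
For context, a small sketch of how that annotation is attached to a pod and looked up (the `hasEvictAnnotation` helper is illustrative; per the comment above, in the project the check lives inside IsEvictable):

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// The annotation discussed above: its presence asks the descheduler to treat
// the pod as evictable even where the usual safeguards would skip it.
const evictAnnotation = "descheduler.alpha.kubernetes.io/evict"

// hasEvictAnnotation reports whether the bypass has been requested for a pod.
func hasEvictAnnotation(pod *v1.Pod) bool {
	_, ok := pod.Annotations[evictAnnotation]
	return ok
}

func main() {
	pod := &v1.Pod{ObjectMeta: metav1.ObjectMeta{
		Name:        "node-local-worker",
		Annotations: map[string]string{evictAnnotation: "true"},
	}}
	fmt.Println(hasEvictAnnotation(pod)) // true
}
```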

If we leave it as an annotation, we could presumably also leave it as a StrategyParam, as the annotation could be tested independently within IsEvictable.

With the above annotation^ I think this is what you want. In the other PR thread you mentioned wanting to specifically bypass DaemonSets too, which that annotation supports.

Sorry if it seems like I'm being difficult, I just don't yet see the need to pass this information to every strategy.

Author

Is this something that the existing descheduler.alpha.kubernetes.io/evict annotation could solve? It is already checked in IsEvictable and effectively bypasses that check for specific pods.

You're right, for some reason when I first took a look at the evict annotation I missed this. I think this would do the job, yes.

With the above annotation^ I think this is what you want. In the other PR thread you mentioned wanting to specifically bypass DaemonSets too, which that annotation supports.

Sorry if it seems like I'm being difficult, I just don't yet see the need to pass this information to every strategy.

No problem, it's not always obvious what the preferred direction is when twiddling in someone else's code base, and as it turns out, I misread what the evict annotation actually does, so I'm happy for another set of eyes while I come to grips with things.

If you're happy with the StrategyParam direction, I'll give this a go with the evict annotation and see how it goes.

Contributor

Yes let's do that -- I agree that there are cases when these affinity strategies may want to evict even if the pod can't fit anywhere else, so adding another Param for these strategies makes sense to me.

I'm also assuming that if you enable that Param, the user must know what they are doing, because, as @ingvagabund pointed out, if you're not properly handling such an evict-at-all-costs case you will end up with stuck Pending pods. I don't think that's the descheduler's concern, though (as long as the possibility is clearly documented).

maxPodsToEvict: maxPodsToEvict,
nodepodCount: nodePodCount,
}
1 change: 1 addition & 0 deletions pkg/descheduler/strategies/duplicates_test.go
@@ -197,6 +197,7 @@ func TestFindDuplicatePods(t *testing.T) {
fakeClient,
"v1",
false,
false,
testCase.maxPodsToEvict,
[]*v1.Node{node},
)
2 changes: 2 additions & 0 deletions pkg/descheduler/strategies/lownodeutilization_test.go
@@ -355,6 +355,7 @@ func TestLowNodeUtilization(t *testing.T) {
fakeClient,
"v1",
false,
false,
test.expectedPodsEvicted,
nodes,
)
@@ -628,6 +629,7 @@ func TestWithTaints(t *testing.T) {
&fake.Clientset{Fake: *fakePtr},
"policy/v1",
false,
false,
item.evictionsExpected,
item.nodes,
)
2 changes: 1 addition & 1 deletion pkg/descheduler/strategies/node_affinity.go
@@ -45,7 +45,7 @@ func RemovePodsViolatingNodeAffinity(ctx context.Context, client clientset.Inter

for _, pod := range pods {
if pod.Spec.Affinity != nil && pod.Spec.Affinity.NodeAffinity != nil && pod.Spec.Affinity.NodeAffinity.RequiredDuringSchedulingIgnoredDuringExecution != nil {
if !nodeutil.PodFitsCurrentNode(pod, node) && nodeutil.PodFitsAnyNode(pod, nodes) {
if !nodeutil.PodFitsCurrentNode(pod, node) && (nodeutil.PodFitsAnyNode(pod, nodes) || podEvictor.DegradationAllowed) {
Contributor

You don't want to ignore nodeutil.PodFitsAnyNode, since it contains conditions that must hold for a pod to be descheduled. So even when you allow degradation, your pod will stay in the Pending state until a suitable node is found.

klog.V(1).Infof("Evicting pod: %v", pod.Name)
if _, err := podEvictor.EvictPod(ctx, pod, node); err != nil {
klog.Errorf("Error evicting pod: (%#v)", err)
1 change: 1 addition & 0 deletions pkg/descheduler/strategies/node_affinity_test.go
@@ -155,6 +155,7 @@ func TestRemovePodsViolatingNodeAffinity(t *testing.T) {
fakeClient,
"v1",
false,
false,
tc.maxPodsToEvict,
tc.nodes,
)
1 change: 1 addition & 0 deletions pkg/descheduler/strategies/node_taint_test.go
@@ -168,6 +168,7 @@ func TestDeletePodsViolatingNodeTaints(t *testing.T) {
fakeClient,
"v1",
false,
false,
tc.maxPodsToEvict,
tc.nodes,
)
1 change: 1 addition & 0 deletions pkg/descheduler/strategies/pod_antiaffinity_test.go
@@ -82,6 +82,7 @@ func TestPodAntiAffinity(t *testing.T) {
fakeClient,
"v1",
false,
false,
test.maxPodsToEvict,
[]*v1.Node{node},
)
1 change: 1 addition & 0 deletions pkg/descheduler/strategies/pod_lifetime_test.go
@@ -155,6 +155,7 @@ func TestPodLifeTime(t *testing.T) {
fakeClient,
"v1",
false,
false,
tc.maxPodsToEvict,
[]*v1.Node{node},
)
1 change: 1 addition & 0 deletions pkg/descheduler/strategies/toomanyrestarts_test.go
@@ -169,6 +169,7 @@ func TestRemovePodsHavingTooManyRestarts(t *testing.T) {
fakeClient,
"v1",
false,
false,
tc.maxPodsToEvict,
[]*v1.Node{node},
)
1 change: 1 addition & 0 deletions test/e2e/e2e_test.go
@@ -130,6 +130,7 @@ func startEndToEndForLowNodeUtilization(ctx context.Context, clientset clientset
clientset,
evictionPolicyGroupVersion,
false,
false,
0,
nodes,
)