5 changes: 4 additions & 1 deletion README.md
@@ -483,6 +483,7 @@ actual usage metrics. Implementing metrics-based descheduling is currently TODO
|---|---|
|`thresholds`|map(string:int)|
|`numberOfNodes`|int|
Contributor:

Worth deprecating numberOfNodes and renaming it to minNodesToProcess. Alternatively, introducing a new nodesToProcessLimits (or a better name) with min and max fields. Are min and max the only useful limits to introduce? And should the limits be absolute numbers only, or percentages as well?
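A rough sketch of what such a combined structure could look like (the type and field names below are illustrative only, not part of this PR):

type NodesToProcessLimits struct {
	// Min would take over the role of numberOfNodes: only act when at least
	// this many underutilized nodes exist.
	Min int `json:"min,omitempty"`
	// Max would cap how many underutilized nodes are processed per run.
	Max int `json:"max,omitempty"`
}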

Author:

For us, we're specifically looking to slow this down, so I'm unsure whether there are other use cases out there. The default settings are aggressive enough that I'm not sure a min would be useful for our company in any way.

Member:

Maybe just supporting max and percentage would be fine? We might care more about avoiding evicting all high utilization nodes. 🤔

type NodeProcessingLimits struct {
    Max int32  `json:"max,omitempty"`
    MaxPercentage int32 `json:"maxPercentage,omitempty"`
}

(The name is just an example, not a recommendation.)
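For illustration only, the two caps could later be resolved into a single effective limit along these lines (a sketch assuming the NodeProcessingLimits type above; the helper name is made up):

// effectiveNodeLimit resolves an absolute cap and a percentage cap into one
// limit over the underutilized nodes; whichever cap is stricter wins.
func effectiveNodeLimit(limits NodeProcessingLimits, totalNodes int) int {
	limit := totalNodes
	if limits.Max > 0 && int(limits.Max) < limit {
		limit = int(limits.Max)
	}
	if limits.MaxPercentage > 0 {
		if byPct := totalNodes * int(limits.MaxPercentage) / 100; byPct < limit {
			limit = byPct
		}
	}
	return limit
}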

Contributor:

I agree with @ingvagabund here; the numberOfNodes configuration relates closely to the newly introduced maxNodesToProcess. Deprecating the former and making both part of the same structure (with min and max properties) would be better here.

I don't think we should go as far as having "percentages" here unless there is a clear use case.

| `maxNodesToProcess` | int | 0 | Maximum number of nodes to process in each pass (0 means no limit). |
|`evictionModes`|list(string)|
|`evictableNamespaces`|(see [namespace filtering](#namespace-filtering))|

@@ -527,7 +528,9 @@ Policy should pass the following validation checks:
There is another parameter associated with the `HighNodeUtilization` strategy, called `numberOfNodes`.
This parameter can be configured to activate the strategy only when the number of under utilized nodes
is above the configured value. This could be helpful in large clusters where a few nodes could go
under utilized frequently or for a short period of time. By default, `numberOfNodes` is set to zero.
under utilized frequently or for a short period of time. By default, `numberOfNodes` is set to zero. The parameter `maxNodesToProcess` is used to limit how many nodes should be processed by the descheduler plugin on each execution.
Contributor:

The current implementation for MaxNodesToProcess limits the number of underutilized nodes, not all the nodes. s/is used to limit how many nodes should be processed/is used to limit how many underutilized nodes should be processed/ to make this explicitly clear.

Can you also put the new sentence on a separate line? To limit the number of characters per line.




### RemovePodsViolatingInterPodAntiAffinity

24 changes: 24 additions & 0 deletions pkg/descheduler/node/node.go
@@ -26,6 +26,7 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
"k8s.io/apimachinery/pkg/types"
clientset "k8s.io/client-go/kubernetes"
listersv1 "k8s.io/client-go/listers/core/v1"
"k8s.io/client-go/util/workqueue"
@@ -400,3 +401,26 @@ func podMatchesInterPodAntiAffinity(nodeIndexer podutil.GetPodsAssignedToNodeFun

return false, nil
}

// UncordonNode removes the Unschedulable flag from a node to allow new pods to be scheduled on it.
// This is useful when eviction has failed and we want to allow the node to receive new workloads again.
func UncordonNode(ctx context.Context, client clientset.Interface, node *v1.Node) error {
	if !node.Spec.Unschedulable {
		// Node is already uncordoned, nothing to do
		return nil
	}

	logger := klog.FromContext(ctx)
	logger.V(2).InfoS("Uncordoning node", "node", klog.KObj(node))

	// Create a JSON patch to set Unschedulable to false
	patch := []byte(`[{"op": "replace", "path": "/spec/unschedulable", "value": false}]`)
	_, err := client.CoreV1().Nodes().Patch(ctx, node.Name, types.JSONPatchType, patch, metav1.PatchOptions{})
	if err != nil {
		logger.Error(err, "Failed to uncordon node", "node", klog.KObj(node))
		return fmt.Errorf("failed to uncordon node %s: %w", node.Name, err)
	}

	logger.V(1).InfoS("Successfully uncordoned node", "node", klog.KObj(node))
	return nil
}
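One way to exercise this in a unit test (a sketch only, not part of the PR; it assumes the fake clientset from k8s.io/client-go/kubernetes/fake, whose in-memory tracker applies JSON patches):

func TestUncordonNode(t *testing.T) {
	node := &v1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "n1"},
		Spec:       v1.NodeSpec{Unschedulable: true},
	}
	client := fake.NewSimpleClientset(node)

	if err := UncordonNode(context.TODO(), client, node); err != nil {
		t.Fatalf("unexpected error: %v", err)
	}

	got, err := client.CoreV1().Nodes().Get(context.TODO(), "n1", metav1.GetOptions{})
	if err != nil {
		t.Fatalf("unexpected error: %v", err)
	}
	if got.Spec.Unschedulable {
		t.Fatal("expected node to be uncordoned")
	}
}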
68 changes: 68 additions & 0 deletions pkg/framework/plugins/defaultevictor/defaultevictor.go
@@ -126,6 +126,7 @@ func applyEffectivePodProtections(d *DefaultEvictor, podProtections []PodProtect
applyDaemonSetPodsProtection(d, protectionMap)
applyPVCPodsProtection(d, protectionMap)
applyPodsWithoutPDBProtection(d, protectionMap, handle)
applyPodsWithPDBBlockingSingleReplicaOwnerProtection(d, protectionMap, handle)
applyPodsWithResourceClaimsProtection(d, protectionMap)

return nil
@@ -324,6 +325,73 @@ func applyPodsWithoutPDBProtection(d *DefaultEvictor, protectionMap map[PodProte
}
}

func applyPodsWithPDBBlockingSingleReplicaOwnerProtection(d *DefaultEvictor, protectionMap map[PodProtection]bool, handle frameworktypes.Handle) {
	// This setting causes pods with PDBs to be evictable when their owner has only 1 replica.
	// When this protection is listed in ExtraEnabled, pods from single-replica deployments with PDBs
	// are treated as candidates for eviction (not protected).
	// When not enabled, the PDB still acts as a barrier at the API server level.
	isSettingEnabled := protectionMap[PodsWithPDBBlockingSingleReplicaOwner]
	if !isSettingEnabled {
		return
	}

	// Add a constraint that allows eviction of single-replica PDB-protected pods
	d.constraints = append(d.constraints, func(pod *v1.Pod) error {
		// Check if the pod is covered by a PDB
		hasPdb, err := utils.IsPodCoveredByPDB(pod, handle.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister())
		if err != nil {
			return fmt.Errorf("unable to check if pod is covered by PodDisruptionBudget: %w", err)
		}

		if !hasPdb {
			// No PDB, pass through
			return nil
		}

		// Pod has a PDB. Check if its owner has only 1 replica.
		ownerRefs := podutil.OwnerRef(pod)
		if len(ownerRefs) == 0 {
			// Pod has no owner, protect it from eviction due to PDB
			return fmt.Errorf("pod is covered by PodDisruptionBudget")
		}

		// For each owner, check if it's a single-replica workload
		for _, ownerRef := range ownerRefs {
			ownerObj, err := getOwnerObject(ownerRef, pod.Namespace, handle)
			if err != nil {
				// Unable to get owner, protect the pod
				return fmt.Errorf("unable to determine if pod's owner is single-replica, protecting due to PDB: %w", err)
			}

			// If the owner has a single replica, allow eviction (return nil)
			if ownerObj != nil && utils.OwnerHasSingleReplica(ownerRefs, ownerObj) {
				// Single-replica owner with PDB - we allow eviction
				return nil
			}
		}

		// Multi-replica owner with PDB - protect from eviction
		return fmt.Errorf("pod is covered by PodDisruptionBudget")
	})
}

// getOwnerObject retrieves the owner object from the appropriate informer based on the owner kind
func getOwnerObject(ownerRef metav1.OwnerReference, namespace string, handle frameworktypes.Handle) (interface{}, error) {
	switch ownerRef.Kind {
	case "Deployment":
		return handle.SharedInformerFactory().Apps().V1().Deployments().Lister().Deployments(namespace).Get(ownerRef.Name)
	case "StatefulSet":
		return handle.SharedInformerFactory().Apps().V1().StatefulSets().Lister().StatefulSets(namespace).Get(ownerRef.Name)
	case "ReplicaSet":
		return handle.SharedInformerFactory().Apps().V1().ReplicaSets().Lister().ReplicaSets(namespace).Get(ownerRef.Name)
	case "DaemonSet":
		// DaemonSets are never single-replica in the traditional sense
		return nil, nil
	default:
		return nil, fmt.Errorf("unsupported owner kind: %s", ownerRef.Kind)
	}
}

func applyPodsWithResourceClaimsProtection(d *DefaultEvictor, protectionMap map[PodProtection]bool) {
isProtectionEnabled := protectionMap[PodsWithResourceClaims]
if isProtectionEnabled {
17 changes: 10 additions & 7 deletions pkg/framework/plugins/defaultevictor/types.go
@@ -55,13 +55,14 @@ type DefaultEvictorArgs struct {
type PodProtection string

const (
PodsWithLocalStorage PodProtection = "PodsWithLocalStorage"
DaemonSetPods PodProtection = "DaemonSetPods"
SystemCriticalPods PodProtection = "SystemCriticalPods"
FailedBarePods PodProtection = "FailedBarePods"
PodsWithPVC PodProtection = "PodsWithPVC"
PodsWithoutPDB PodProtection = "PodsWithoutPDB"
PodsWithResourceClaims PodProtection = "PodsWithResourceClaims"
PodsWithLocalStorage PodProtection = "PodsWithLocalStorage"
DaemonSetPods PodProtection = "DaemonSetPods"
SystemCriticalPods PodProtection = "SystemCriticalPods"
FailedBarePods PodProtection = "FailedBarePods"
PodsWithPVC PodProtection = "PodsWithPVC"
PodsWithoutPDB PodProtection = "PodsWithoutPDB"
PodsWithResourceClaims PodProtection = "PodsWithResourceClaims"
PodsWithPDBBlockingSingleReplicaOwner PodProtection = "PodsWithPDBBlockingSingleReplicaOwner"
)

// PodProtections holds the list of enabled and disabled protection policies.
@@ -131,10 +132,12 @@ var defaultPodProtections = []PodProtection{
// - PodsWithPVC: Protects pods using PersistentVolumeClaims.
// - PodsWithoutPDB: Protects pods lacking a PodDisruptionBudget.
// - PodsWithResourceClaims: Protects pods using ResourceClaims.
// - PodsWithPDBBlockingSingleReplicaOwner: Allows eviction of pods with PDBs when their owner has only 1 replica (bypasses PDB protection).
var extraPodProtections = []PodProtection{
PodsWithPVC,
PodsWithoutPDB,
PodsWithResourceClaims,
PodsWithPDBBlockingSingleReplicaOwner,
}

// NoEvictionPolicy dictates whether a no-eviction policy is preferred or mandatory.
3 changes: 3 additions & 0 deletions pkg/framework/plugins/nodeutilization/defaults.go
@@ -49,4 +49,7 @@ func SetDefaults_HighNodeUtilizationArgs(obj runtime.Object) {
	if args.NumberOfNodes == 0 {
		args.NumberOfNodes = 0
	}
	if args.MaxNodesToProcess == 0 {
		args.MaxNodesToProcess = 0
	}
}
11 changes: 7 additions & 4 deletions pkg/framework/plugins/nodeutilization/defaults_test.go
@@ -89,8 +89,9 @@ func TestSetDefaults_HighNodeUtilizationArgs(t *testing.T) {
name: "HighNodeUtilizationArgs empty",
in: &HighNodeUtilizationArgs{},
want: &HighNodeUtilizationArgs{
Thresholds: nil,
NumberOfNodes: 0,
Thresholds: nil,
NumberOfNodes: 0,
MaxNodesToProcess: 0,
},
},
{
@@ -100,14 +101,16 @@ func TestSetDefaults_HighNodeUtilizationArgs(t *testing.T) {
v1.ResourceCPU: 20,
v1.ResourceMemory: 120,
},
NumberOfNodes: 10,
NumberOfNodes: 10,
MaxNodesToProcess: 10,
},
want: &HighNodeUtilizationArgs{
Thresholds: api.ResourceThresholds{
v1.ResourceCPU: 20,
v1.ResourceMemory: 120,
},
NumberOfNodes: 10,
NumberOfNodes: 10,
MaxNodesToProcess: 10,
},
},
}
7 changes: 7 additions & 0 deletions pkg/framework/plugins/nodeutilization/highnodeutilization.go
@@ -213,6 +213,12 @@ func (h *HighNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fr

	lowNodes, schedulableNodes := nodeInfos[0], nodeInfos[1]

	// limit the number of nodes processed each execution if `MaxNodesToProcess` is set
	if h.args.MaxNodesToProcess > 0 && len(lowNodes) > h.args.MaxNodesToProcess {
		lowNodes = lowNodes[:h.args.MaxNodesToProcess]
Contributor:

The index needs to rotate so all nodes are eventually processed.

Author:

👀

Author:

Looking at possible ways to do this, I see a couple of ways to solve it, but would love feedback.

  1. Add a new field to the struct to keep the state out of the function/file
  2. Add a new package-level variable within the file to track the index
  3. Open to any other opinions/suggestions!
type HighNodeUtilization struct {
	handle              frameworktypes.Handle
	args                *HighNodeUtilizationArgs
	podFilter           func(pod *v1.Pod) bool
	criteria            []any
	resourceNames       []v1.ResourceName
	highThresholds      api.ResourceThresholds
	usageClient         usageClient
	lastProcessedIndex  int
}

...

if h.args.MaxNodesToProcess > 0 && len(lowNodes) > h.args.MaxNodesToProcess {
	total := len(lowNodes)
	start := h.lastProcessedIndex % total
	rotated := append(lowNodes[start:], lowNodes[:start]...)

	lowNodes = rotated[:h.args.MaxNodesToProcess]
	// advance modulo the original length, not the truncated slice
	h.lastProcessedIndex = (h.lastProcessedIndex + h.args.MaxNodesToProcess) % total
}
... 
// rotateStartIdx is a variable to track the rotation index of MaxNodesToProcess
var rotateStartIdx int

...

if h.args.MaxNodesToProcess > 0 && len(lowNodes) > h.args.MaxNodesToProcess {
	total := len(lowNodes)
	start := rotateStartIdx % total
	end := start + h.args.MaxNodesToProcess

	var selected []NodeInfo
	if end <= total {
		selected = lowNodes[start:end]
	} else {
		selected = append(lowNodes[start:], lowNodes[:end%total]...)
	}
	lowNodes = selected

	// advance modulo the original length, not the truncated slice
	rotateStartIdx = (rotateStartIdx + h.args.MaxNodesToProcess) % total
}

Contributor:

An observation: it's not guaranteed the list of nodes will always be consistently ordered. I.e., even with the rotated index, the nodes may not be selected uniformly. If this leads to sub-optimal behavior, we can see whether sorting the nodes by name helps.
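If that turns out to be necessary, a stable order could be imposed before applying the rotation, along these lines (a sketch only, using the standard sort package and the NodeInfo.node field seen elsewhere in this diff):

// Sort the underutilized nodes by name so the rotated window walks them
// deterministically across descheduling runs.
sort.Slice(lowNodes, func(i, j int) bool {
	return lowNodes[i].node.Name < lowNodes[j].node.Name
})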

Contributor:

Option 1 looks good, i.e. the second version of the code changes in the comment. Worth moving the code into a function so it can be unit tested separately.

Also, MaxNodesToProcess needs to be at least as large as NumberOfNodes when both are set.
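One possible shape for that, purely as a sketch (function names and placement are illustrative, not part of this PR):

// limitNodesToProcess returns the next rotated window of at most max nodes and
// the updated rotation index, so the rotation logic can be unit tested on its own.
func limitNodesToProcess(nodes []NodeInfo, max, lastIndex int) ([]NodeInfo, int) {
	if max <= 0 || len(nodes) <= max {
		return nodes, lastIndex
	}
	total := len(nodes)
	start := lastIndex % total
	// copy before rotating so the caller's slice is left untouched
	rotated := append(append([]NodeInfo{}, nodes[start:]...), nodes[:start]...)
	return rotated[:max], (lastIndex + max) % total
}

// validateNodeLimits is a hypothetical validation hook: MaxNodesToProcess,
// when set, should be at least NumberOfNodes so the plugin can still activate.
func validateNodeLimits(args *HighNodeUtilizationArgs) error {
	if args.MaxNodesToProcess > 0 && args.MaxNodesToProcess < args.NumberOfNodes {
		return fmt.Errorf("maxNodesToProcess (%d) must be at least numberOfNodes (%d)",
			args.MaxNodesToProcess, args.NumberOfNodes)
	}
	return nil
}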

logger.V(1).Info("Limiting the number of underutilized nodes to process", "maxNodesToProcess", h.args.MaxNodesToProcess)
}

logger.V(1).Info("Criteria for a node below target utilization", h.criteria...)
logger.V(1).Info("Number of underutilized nodes", "totalNumber", len(lowNodes))

@@ -269,6 +275,7 @@ func (h *HighNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fr
continueEvictionCond,
h.usageClient,
nil,
h.handle,
)

return nil
24 changes: 24 additions & 0 deletions pkg/framework/plugins/nodeutilization/highnodeutilization_test.go
@@ -481,6 +481,29 @@ func TestHighNodeUtilization(t *testing.T) {
},
expectedPodsEvicted: 0,
},
{
name: "limits number of underutilized nodes processed per run with MaxNodesToProcess",
thresholds: api.ResourceThresholds{
v1.ResourceCPU: 30,
v1.ResourcePods: 30,
},
nodes: []*v1.Node{
test.BuildTestNode("n1", 4000, 3000, 10, nil),
test.BuildTestNode("n2", 4000, 3000, 10, nil),
test.BuildTestNode("n3", 4000, 3000, 10, nil),
test.BuildTestNode("n4", 4000, 3000, 10, nil),
},
pods: []*v1.Pod{
test.BuildTestPod("p1", 400, 0, "n1", test.SetRSOwnerRef),
test.BuildTestPod("p2", 400, 0, "n2", test.SetRSOwnerRef),
test.BuildTestPod("p3", 400, 0, "n3", test.SetRSOwnerRef),
test.BuildTestPod("p4", 400, 0, "n4", test.SetRSOwnerRef),
},
expectedPodsEvicted: 1, // Only one node's pod should be evicted per run
evictedPods: []string{"p1", "p2", "p3", "p4"}, // Any one of these is valid
evictionModes: nil,
// We'll set MaxNodesToProcess in the plugin args below
},
Contributor:

This new test is broken. Also, please make "maxNodesToProcess" part of the test definition so each test can set its own value.
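For example, roughly along these lines (a sketch only; it assumes the existing test table in this file and the field names shown above):

// New per-case field on the test table; 0 keeps today's unlimited behaviour.
maxNodesToProcess int

// ...plumbed into the plugin args for each case instead of a hard-coded value:
args := &HighNodeUtilizationArgs{
	Thresholds:        testCase.thresholds,
	MaxNodesToProcess: testCase.maxNodesToProcess,
}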

}

for _, testCase := range testCases {
Expand Down Expand Up @@ -639,6 +662,7 @@ func TestHighNodeUtilizationWithTaints(t *testing.T) {
}

plugin, err := NewHighNodeUtilization(ctx, &HighNodeUtilizationArgs{
MaxNodesToProcess: 1,
Contributor:

I would not expect this to be necessary. Have you introduced this by mistake?

Thresholds: api.ResourceThresholds{
v1.ResourceCPU: 40,
},
@@ -316,6 +316,7 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra
continueEvictionCond,
l.usageClient,
nodeLimit,
l.handle,
)

return nil
8 changes: 8 additions & 0 deletions pkg/framework/plugins/nodeutilization/nodeutilization.go
@@ -175,6 +175,7 @@ func evictPodsFromSourceNodes(
continueEviction continueEvictionCond,
usageClient usageClient,
maxNoOfPodsToEvictPerNode *uint,
handle frameworktypes.Handle,
) {
logger := klog.FromContext(ctx)
available, err := assessAvailableResourceInNodes(destinationNodes, resourceNames)
@@ -240,6 +241,13 @@
			case *evictions.EvictionTotalLimitError:
				return
			default:
				// Eviction failed, uncordon the node to allow new pods to be scheduled
				if node.node.Spec.Unschedulable {
					logger.V(1).Info("Eviction failed, uncordoning node", "node", klog.KObj(node.node))
					if uncordonErr := nodeutil.UncordonNode(ctx, handle.ClientSet(), node.node); uncordonErr != nil {
						logger.Error(uncordonErr, "Failed to uncordon node after eviction failure", "node", klog.KObj(node.node))
					}
				}
			}
		}
	}
5 changes: 5 additions & 0 deletions pkg/framework/plugins/nodeutilization/types.go
@@ -60,6 +60,11 @@ type HighNodeUtilizationArgs struct {
	Thresholds api.ResourceThresholds `json:"thresholds"`
	NumberOfNodes int `json:"numberOfNodes,omitempty"`

	// MaxNodesToProcess limits the number of nodes to process in each
	// descheduling execution. This is useful to limit how many nodes are descheduled per run
	// when turning this plugin on in a cluster with many underutilized nodes.
	MaxNodesToProcess int `json:"maxNodesToProcess,omitempty"`

// EvictionModes is a set of modes to be taken into account when the
// descheduler evicts pods. For example the mode
// `OnlyThresholdingResources` can be used to make sure the descheduler
38 changes: 38 additions & 0 deletions pkg/utils/pod.go
@@ -3,6 +3,7 @@ package utils
import (
"fmt"

appsv1 "k8s.io/api/apps/v1"
policy "k8s.io/api/policy/v1"
"k8s.io/apimachinery/pkg/labels"

@@ -125,6 +126,43 @@ func IsPodCoveredByPDB(pod *v1.Pod, lister policyv1.PodDisruptionBudgetLister) (
return len(pdbList) > 0, nil
}

// OwnerHasSingleReplica checks if the pod's owner (Deployment, StatefulSet, or ReplicaSet)
// has only 1 replica. Returns true if the owner is single-replica, false otherwise.
// For DaemonSets, always returns false since they manage one pod per node.
func OwnerHasSingleReplica(ownerRefs []metav1.OwnerReference, ownerObj interface{}) bool {
	if ownerObj == nil {
		return false
	}

	// Check Deployment
	if deployment, ok := ownerObj.(*appsv1.Deployment); ok {
		if deployment.Spec.Replicas != nil && *deployment.Spec.Replicas == 1 {
			return true
		}
		return false
	}

	// Check StatefulSet
	if statefulSet, ok := ownerObj.(*appsv1.StatefulSet); ok {
		if statefulSet.Spec.Replicas != nil && *statefulSet.Spec.Replicas == 1 {
			return true
		}
		return false
	}

	// Check ReplicaSet
	if replicaSet, ok := ownerObj.(*appsv1.ReplicaSet); ok {
		if replicaSet.Spec.Replicas != nil && *replicaSet.Spec.Replicas == 1 {
			return true
		}
		return false
	}

	// DaemonSet and other workload types are not considered single-replica
	// because DaemonSet manages one pod per node (by design)
	return false
}
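// Illustrative usage (an editorial sketch, not part of this file): the helper
// returns true only when the resolved owner object requests exactly one replica:
//
//	replicas := int32(1)
//	dep := &appsv1.Deployment{Spec: appsv1.DeploymentSpec{Replicas: &replicas}}
//	refs := []metav1.OwnerReference{{Kind: "Deployment", Name: "web"}}
//	OwnerHasSingleReplica(refs, dep)                 // true
//	OwnerHasSingleReplica(refs, &appsv1.DaemonSet{}) // false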

// GetPodSource returns the source of the pod based on the annotation.
func GetPodSource(pod *v1.Pod) (string, error) {
if pod.Annotations != nil {
Binary file added vendor/.DS_Store
Binary file not shown.