diff --git a/README.md b/README.md
index b9561d27a5..0f03a8b1bb 100644
--- a/README.md
+++ b/README.md
@@ -483,6 +483,7 @@ actual usage metrics. Implementing metrics-based descheduling is currently TODO
 |---|---|
 |`thresholds`|map(string:int)|
 |`numberOfNodes`|int|
+|`maxNodesToProcess`|int|
 |`evictionModes`|list(string)|
 |`evictableNamespaces`|(see [namespace filtering](#namespace-filtering))|
 
@@ -527,7 +528,7 @@ Policy should pass the following validation checks:
 There is another parameter associated with the `HighNodeUtilization` strategy, called `numberOfNodes`. This
 parameter can be configured to activate the strategy only when the number of under utilized nodes
 is above the configured value. This could be helpful in large clusters where a few nodes could go
-under utilized frequently or for a short period of time. By default, `numberOfNodes` is set to zero.
+under utilized frequently or for a short period of time. By default, `numberOfNodes` is set to zero. The `maxNodesToProcess` parameter limits how many underutilized nodes are processed in a single descheduling run; it defaults to zero, which means no limit.
 
 ### RemovePodsViolatingInterPodAntiAffinity
 
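For orientation (not part of this diff), the README parameters above map directly onto the plugin's Go args struct. A minimal sketch, assuming the usual `sigs.k8s.io/descheduler` module path; the threshold and limit values are illustrative only:

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"

	"sigs.k8s.io/descheduler/pkg/api"
	"sigs.k8s.io/descheduler/pkg/framework/plugins/nodeutilization"
)

func main() {
	// thresholds, numberOfNodes and maxNodesToProcess from the README table
	// correspond to these struct fields.
	args := &nodeutilization.HighNodeUtilizationArgs{
		Thresholds: api.ResourceThresholds{
			v1.ResourceCPU:  30,
			v1.ResourcePods: 30,
		},
		NumberOfNodes:     3, // act only when at least 3 nodes are underutilized
		MaxNodesToProcess: 2, // process at most 2 underutilized nodes per run (0 = no limit)
	}
	fmt.Printf("%+v\n", args)
}
```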
diff --git a/pkg/descheduler/node/node.go b/pkg/descheduler/node/node.go
index 1742f9717f..419f6afea0 100644
--- a/pkg/descheduler/node/node.go
+++ b/pkg/descheduler/node/node.go
@@ -26,6 +26,7 @@ import (
 	"k8s.io/apimachinery/pkg/api/resource"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/labels"
+	"k8s.io/apimachinery/pkg/types"
 	clientset "k8s.io/client-go/kubernetes"
 	listersv1 "k8s.io/client-go/listers/core/v1"
 	"k8s.io/client-go/util/workqueue"
@@ -400,3 +401,26 @@ func podMatchesInterPodAntiAffinity(nodeIndexer podutil.GetPodsAssignedToNodeFun
 
 	return false, nil
 }
+
+// UncordonNode removes the Unschedulable flag from a node to allow new pods to be scheduled on it.
+// This is useful when eviction has failed and the node should be allowed to receive new workloads again.
+func UncordonNode(ctx context.Context, client clientset.Interface, node *v1.Node) error {
+	if !node.Spec.Unschedulable {
+		// Node is already uncordoned, nothing to do
+		return nil
+	}
+
+	logger := klog.FromContext(ctx)
+	logger.V(2).Info("Uncordoning node", "node", klog.KObj(node))
+
+	// Create a JSON patch to set Unschedulable to false
+	patch := []byte(`[{"op": "replace", "path": "/spec/unschedulable", "value": false}]`)
+	_, err := client.CoreV1().Nodes().Patch(ctx, node.Name, types.JSONPatchType, patch, metav1.PatchOptions{})
+	if err != nil {
+		logger.Error(err, "Failed to uncordon node", "node", klog.KObj(node))
+		return fmt.Errorf("failed to uncordon node %s: %w", node.Name, err)
+	}
+
+	logger.V(1).Info("Successfully uncordoned node", "node", klog.KObj(node))
+	return nil
+}
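A minimal sketch (not part of this diff) of exercising the new helper against client-go's fake clientset, assuming `UncordonNode` lands as shown above:

```go
package main

import (
	"context"
	"fmt"

	v1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes/fake"

	nodeutil "sigs.k8s.io/descheduler/pkg/descheduler/node"
)

func main() {
	// Start with a cordoned node tracked by a fake clientset.
	cordoned := &v1.Node{
		ObjectMeta: metav1.ObjectMeta{Name: "worker-1"},
		Spec:       v1.NodeSpec{Unschedulable: true},
	}
	client := fake.NewSimpleClientset(cordoned)

	ctx := context.Background()
	if err := nodeutil.UncordonNode(ctx, client, cordoned); err != nil {
		panic(err)
	}

	// The JSON patch issued by UncordonNode should have cleared the flag.
	got, err := client.CoreV1().Nodes().Get(ctx, "worker-1", metav1.GetOptions{})
	if err != nil {
		panic(err)
	}
	fmt.Println("unschedulable after uncordon:", got.Spec.Unschedulable) // expected: false
}
```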
diff --git a/pkg/framework/plugins/defaultevictor/defaultevictor.go b/pkg/framework/plugins/defaultevictor/defaultevictor.go
index f5f91863b3..fed57de43b 100644
--- a/pkg/framework/plugins/defaultevictor/defaultevictor.go
+++ b/pkg/framework/plugins/defaultevictor/defaultevictor.go
@@ -126,6 +126,7 @@ func applyEffectivePodProtections(d *DefaultEvictor, podProtections []PodProtect
 	applyDaemonSetPodsProtection(d, protectionMap)
 	applyPVCPodsProtection(d, protectionMap)
 	applyPodsWithoutPDBProtection(d, protectionMap, handle)
+	applyPodsWithPDBBlockingSingleReplicaOwnerProtection(d, protectionMap, handle)
 	applyPodsWithResourceClaimsProtection(d, protectionMap)
 
 	return nil
@@ -324,6 +325,73 @@ func applyPodsWithoutPDBProtection(d *DefaultEvictor, protectionMap map[PodProte
 	}
 }
 
+func applyPodsWithPDBBlockingSingleReplicaOwnerProtection(d *DefaultEvictor, protectionMap map[PodProtection]bool, handle frameworktypes.Handle) {
+	// This setting causes pods with PDBs to be evictable when their owner has only 1 replica.
+	// When this protection is in ExtraEnabled, pods from single-replica deployments with PDBs
+	// are treated as candidates for eviction (not protected).
+	// When not enabled, the PDB still acts as a barrier at the API server level.
+	isSettingEnabled := protectionMap[PodsWithPDBBlockingSingleReplicaOwner]
+	if !isSettingEnabled {
+		return
+	}
+
+	// Add constraint that allows eviction of single-replica PDB-protected pods
+	d.constraints = append(d.constraints, func(pod *v1.Pod) error {
+		// Check if pod is covered by a PDB
+		hasPdb, err := utils.IsPodCoveredByPDB(pod, handle.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister())
+		if err != nil {
+			return fmt.Errorf("unable to check if pod is covered by PodDisruptionBudget: %w", err)
+		}
+
+		if !hasPdb {
+			// No PDB, pass through
+			return nil
+		}
+
+		// Pod has a PDB. Check if its owner has only 1 replica.
+		ownerRefs := podutil.OwnerRef(pod)
+		if len(ownerRefs) == 0 {
+			// Pod has no owner, protect it from eviction due to PDB
+			return fmt.Errorf("pod is covered by PodDisruptionBudget")
+		}
+
+		// For each owner, check if it's a single-replica workload
+		for _, ownerRef := range ownerRefs {
+			ownerObj, err := getOwnerObject(ownerRef, pod.Namespace, handle)
+			if err != nil {
+				// Unable to get owner, protect the pod
+				return fmt.Errorf("unable to determine if pod's owner is single-replica, protecting due to PDB: %w", err)
+			}
+
+			// If owner has single replica, allow eviction (return nil)
+			if ownerObj != nil && utils.OwnerHasSingleReplica(ownerRefs, ownerObj) {
+				// Single replica owner with PDB - we allow eviction
+				return nil
+			}
+		}
+
+		// Multi-replica owner with PDB - protect from eviction
+		return fmt.Errorf("pod is covered by PodDisruptionBudget")
+	})
+}
+
+// getOwnerObject retrieves the owner object from the appropriate informer based on the owner kind
+func getOwnerObject(ownerRef metav1.OwnerReference, namespace string, handle frameworktypes.Handle) (interface{}, error) {
+	switch ownerRef.Kind {
+	case "Deployment":
+		return handle.SharedInformerFactory().Apps().V1().Deployments().Lister().Deployments(namespace).Get(ownerRef.Name)
+	case "StatefulSet":
+		return handle.SharedInformerFactory().Apps().V1().StatefulSets().Lister().StatefulSets(namespace).Get(ownerRef.Name)
+	case "ReplicaSet":
+		return handle.SharedInformerFactory().Apps().V1().ReplicaSets().Lister().ReplicaSets(namespace).Get(ownerRef.Name)
+	case "DaemonSet":
+		// DaemonSets are never single-replica in the traditional sense
+		return nil, nil
+	default:
+		return nil, fmt.Errorf("unsupported owner kind: %s", ownerRef.Kind)
+	}
+}
+
 func applyPodsWithResourceClaimsProtection(d *DefaultEvictor, protectionMap map[PodProtection]bool) {
 	isProtectionEnabled := protectionMap[PodsWithResourceClaims]
 	if isProtectionEnabled {
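The constraint above hinges on what counts as a "single-replica owner". A small standalone sketch (not part of this diff) of the helper it relies on, `utils.OwnerHasSingleReplica`, which is added in `pkg/utils/pod.go` further down:

```go
package main

import (
	"fmt"

	appsv1 "k8s.io/api/apps/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	"sigs.k8s.io/descheduler/pkg/utils"
)

func main() {
	one, three := int32(1), int32(3)

	single := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{Name: "single", Namespace: "default"},
		Spec:       appsv1.DeploymentSpec{Replicas: &one},
	}
	multi := &appsv1.Deployment{
		ObjectMeta: metav1.ObjectMeta{Name: "multi", Namespace: "default"},
		Spec:       appsv1.DeploymentSpec{Replicas: &three},
	}

	// With the PodsWithPDBBlockingSingleReplicaOwner protection enabled, only the
	// first case lets a PDB-covered pod remain evictable; the second stays protected.
	fmt.Println(utils.OwnerHasSingleReplica(nil, single)) // true
	fmt.Println(utils.OwnerHasSingleReplica(nil, multi))  // false
}
```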
+// - PodsWithPDBBlockingSingleReplicaOwner: Allows eviction of pods with PDBs when their owner has only 1 replica (bypasses PDB protection).
 var extraPodProtections = []PodProtection{
 	PodsWithPVC,
 	PodsWithoutPDB,
 	PodsWithResourceClaims,
+	PodsWithPDBBlockingSingleReplicaOwner,
 }
 
 // NoEvictionPolicy dictates whether a no-eviction policy is preferred or mandatory.
diff --git a/pkg/framework/plugins/nodeutilization/defaults.go b/pkg/framework/plugins/nodeutilization/defaults.go
index a1c6dfe0bc..f971506e4f 100644
--- a/pkg/framework/plugins/nodeutilization/defaults.go
+++ b/pkg/framework/plugins/nodeutilization/defaults.go
@@ -49,4 +49,7 @@ func SetDefaults_HighNodeUtilizationArgs(obj runtime.Object) {
 	if args.NumberOfNodes == 0 {
 		args.NumberOfNodes = 0
 	}
+	if args.MaxNodesToProcess == 0 {
+		args.MaxNodesToProcess = 0
+	}
 }
diff --git a/pkg/framework/plugins/nodeutilization/defaults_test.go b/pkg/framework/plugins/nodeutilization/defaults_test.go
index ef8c188226..7078c0692e 100644
--- a/pkg/framework/plugins/nodeutilization/defaults_test.go
+++ b/pkg/framework/plugins/nodeutilization/defaults_test.go
@@ -89,8 +89,9 @@ func TestSetDefaults_HighNodeUtilizationArgs(t *testing.T) {
 			name: "HighNodeUtilizationArgs empty",
 			in:   &HighNodeUtilizationArgs{},
 			want: &HighNodeUtilizationArgs{
-				Thresholds:    nil,
-				NumberOfNodes: 0,
+				Thresholds:        nil,
+				NumberOfNodes:     0,
+				MaxNodesToProcess: 0,
 			},
 		},
 		{
@@ -100,14 +101,16 @@ func TestSetDefaults_HighNodeUtilizationArgs(t *testing.T) {
 				Thresholds: api.ResourceThresholds{
 					v1.ResourceCPU:    20,
 					v1.ResourceMemory: 120,
 				},
-				NumberOfNodes: 10,
+				NumberOfNodes:     10,
+				MaxNodesToProcess: 10,
 			},
 			want: &HighNodeUtilizationArgs{
 				Thresholds: api.ResourceThresholds{
 					v1.ResourceCPU:    20,
 					v1.ResourceMemory: 120,
 				},
-				NumberOfNodes: 10,
+				NumberOfNodes:     10,
+				MaxNodesToProcess: 10,
 			},
 		},
 	}
diff --git a/pkg/framework/plugins/nodeutilization/highnodeutilization.go b/pkg/framework/plugins/nodeutilization/highnodeutilization.go
index 73633a52da..ee41c0c6a5 100644
--- a/pkg/framework/plugins/nodeutilization/highnodeutilization.go
+++ b/pkg/framework/plugins/nodeutilization/highnodeutilization.go
@@ -213,6 +213,12 @@ func (h *HighNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fr
 
 	lowNodes, schedulableNodes := nodeInfos[0], nodeInfos[1]
 
+	// limit the number of nodes processed each execution if `MaxNodesToProcess` is set
+	if h.args.MaxNodesToProcess > 0 && len(lowNodes) > h.args.MaxNodesToProcess {
+		lowNodes = lowNodes[:h.args.MaxNodesToProcess]
+		logger.V(1).Info("Limiting the number of underutilized nodes to process", "maxNodesToProcess", h.args.MaxNodesToProcess)
+	}
+
 	logger.V(1).Info("Criteria for a node below target utilization", h.criteria...)
logger.V(1).Info("Number of underutilized nodes", "totalNumber", len(lowNodes)) @@ -269,6 +275,7 @@ func (h *HighNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fr continueEvictionCond, h.usageClient, nil, + h.handle, ) return nil diff --git a/pkg/framework/plugins/nodeutilization/highnodeutilization_test.go b/pkg/framework/plugins/nodeutilization/highnodeutilization_test.go index 1e7297f2e1..9befff7677 100644 --- a/pkg/framework/plugins/nodeutilization/highnodeutilization_test.go +++ b/pkg/framework/plugins/nodeutilization/highnodeutilization_test.go @@ -481,6 +481,29 @@ func TestHighNodeUtilization(t *testing.T) { }, expectedPodsEvicted: 0, }, + { + name: "limits number of underutilized nodes processed per run with MaxNodesToProcess", + thresholds: api.ResourceThresholds{ + v1.ResourceCPU: 30, + v1.ResourcePods: 30, + }, + nodes: []*v1.Node{ + test.BuildTestNode("n1", 4000, 3000, 10, nil), + test.BuildTestNode("n2", 4000, 3000, 10, nil), + test.BuildTestNode("n3", 4000, 3000, 10, nil), + test.BuildTestNode("n4", 4000, 3000, 10, nil), + }, + pods: []*v1.Pod{ + test.BuildTestPod("p1", 400, 0, "n1", test.SetRSOwnerRef), + test.BuildTestPod("p2", 400, 0, "n2", test.SetRSOwnerRef), + test.BuildTestPod("p3", 400, 0, "n3", test.SetRSOwnerRef), + test.BuildTestPod("p4", 400, 0, "n4", test.SetRSOwnerRef), + }, + expectedPodsEvicted: 1, // Only one node's pod should be evicted per run + evictedPods: []string{"p1", "p2", "p3", "p4"}, // Any one of these is valid + evictionModes: nil, + // We'll set MaxNodesToProcess in the plugin args below + }, } for _, testCase := range testCases { @@ -639,6 +662,7 @@ func TestHighNodeUtilizationWithTaints(t *testing.T) { } plugin, err := NewHighNodeUtilization(ctx, &HighNodeUtilizationArgs{ + MaxNodesToProcess: 1, Thresholds: api.ResourceThresholds{ v1.ResourceCPU: 40, }, diff --git a/pkg/framework/plugins/nodeutilization/lownodeutilization.go b/pkg/framework/plugins/nodeutilization/lownodeutilization.go index 5748e73778..36a28cef1d 100644 --- a/pkg/framework/plugins/nodeutilization/lownodeutilization.go +++ b/pkg/framework/plugins/nodeutilization/lownodeutilization.go @@ -316,6 +316,7 @@ func (l *LowNodeUtilization) Balance(ctx context.Context, nodes []*v1.Node) *fra continueEvictionCond, l.usageClient, nodeLimit, + l.handle, ) return nil diff --git a/pkg/framework/plugins/nodeutilization/nodeutilization.go b/pkg/framework/plugins/nodeutilization/nodeutilization.go index 2aa45e2f34..295315d9ed 100644 --- a/pkg/framework/plugins/nodeutilization/nodeutilization.go +++ b/pkg/framework/plugins/nodeutilization/nodeutilization.go @@ -175,6 +175,7 @@ func evictPodsFromSourceNodes( continueEviction continueEvictionCond, usageClient usageClient, maxNoOfPodsToEvictPerNode *uint, + handle frameworktypes.Handle, ) { logger := klog.FromContext(ctx) available, err := assessAvailableResourceInNodes(destinationNodes, resourceNames) @@ -240,6 +241,13 @@ func evictPodsFromSourceNodes( case *evictions.EvictionTotalLimitError: return default: + // Eviction failed, uncordon the node to allow new pods to be scheduled + if node.node.Spec.Unschedulable { + logger.V(1).Info("Eviction failed, uncordoning node", "node", klog.KObj(node.node)) + if uncordonErr := nodeutil.UncordonNode(ctx, handle.ClientSet(), node.node); uncordonErr != nil { + logger.Error(uncordonErr, "Failed to uncordon node after eviction failure", "node", klog.KObj(node.node)) + } + } } } } diff --git a/pkg/framework/plugins/nodeutilization/types.go b/pkg/framework/plugins/nodeutilization/types.go 
diff --git a/pkg/framework/plugins/nodeutilization/types.go b/pkg/framework/plugins/nodeutilization/types.go
index 5f6e947998..9b80f7625d 100644
--- a/pkg/framework/plugins/nodeutilization/types.go
+++ b/pkg/framework/plugins/nodeutilization/types.go
@@ -60,6 +60,11 @@ type HighNodeUtilizationArgs struct {
 	Thresholds    api.ResourceThresholds `json:"thresholds"`
 	NumberOfNodes int                    `json:"numberOfNodes,omitempty"`
 
+	// MaxNodesToProcess limits the number of underutilized nodes processed in each
+	// descheduling execution. This is useful for limiting how many nodes are descheduled
+	// per run when enabling this plugin in a cluster with many underutilized nodes.
+	MaxNodesToProcess int `json:"maxNodesToProcess,omitempty"`
+
 	// EvictionModes is a set of modes to be taken into account when the
 	// descheduler evicts pods. For example the mode
 	// `OnlyThresholdingResources` can be used to make sure the descheduler
diff --git a/pkg/utils/pod.go b/pkg/utils/pod.go
index 7cf12fd0ff..c3dae97882 100644
--- a/pkg/utils/pod.go
+++ b/pkg/utils/pod.go
@@ -3,6 +3,7 @@ package utils
 import (
 	"fmt"
 
+	appsv1 "k8s.io/api/apps/v1"
 	policy "k8s.io/api/policy/v1"
 	"k8s.io/apimachinery/pkg/labels"
@@ -125,6 +126,43 @@ func IsPodCoveredByPDB(pod *v1.Pod, lister policyv1.PodDisruptionBudgetLister) (
 	return len(pdbList) > 0, nil
 }
 
+// OwnerHasSingleReplica checks if the pod's owner (Deployment, StatefulSet, or ReplicaSet)
+// has only 1 replica. Returns true if the owner is single-replica, false otherwise.
+// For DaemonSets, always returns false since they manage one pod per node.
+func OwnerHasSingleReplica(ownerRefs []metav1.OwnerReference, ownerObj interface{}) bool {
+	if ownerObj == nil {
+		return false
+	}
+
+	// Check Deployment
+	if deployment, ok := ownerObj.(*appsv1.Deployment); ok {
+		if deployment.Spec.Replicas != nil && *deployment.Spec.Replicas == 1 {
+			return true
+		}
+		return false
+	}
+
+	// Check StatefulSet
+	if statefulSet, ok := ownerObj.(*appsv1.StatefulSet); ok {
+		if statefulSet.Spec.Replicas != nil && *statefulSet.Spec.Replicas == 1 {
+			return true
+		}
+		return false
+	}
+
+	// Check ReplicaSet
+	if replicaSet, ok := ownerObj.(*appsv1.ReplicaSet); ok {
+		if replicaSet.Spec.Replicas != nil && *replicaSet.Spec.Replicas == 1 {
+			return true
+		}
+		return false
+	}
+
+	// DaemonSet and other workload types are not considered single-replica
+	// because a DaemonSet manages one pod per node (by design)
+	return false
+}
+
 // GetPodSource returns the source of the pod based on the annotation.
 func GetPodSource(pod *v1.Pod) (string, error) {
 	if pod.Annotations != nil {
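Finally, a configuration sketch (not part of this diff) for opting into the new protection. The `PodProtections` field on `DefaultEvictorArgs` and its `ExtraEnabled` list are assumptions based on the in-code comment in `defaultevictor.go` above; the actual field names in the evictor API may differ:

```go
package main

import (
	"fmt"

	"sigs.k8s.io/descheduler/pkg/framework/plugins/defaultevictor"
)

func main() {
	// Assumed field names: PodProtections on DefaultEvictorArgs and its ExtraEnabled list.
	args := &defaultevictor.DefaultEvictorArgs{
		PodProtections: defaultevictor.PodProtections{
			ExtraEnabled: []defaultevictor.PodProtection{
				defaultevictor.PodsWithPDBBlockingSingleReplicaOwner,
			},
		},
	}
	fmt.Printf("%+v\n", args)
}
```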