diff --git a/pkg/estimator/client/general.go b/pkg/estimator/client/general.go
index 3810d1e7d57d..0916d5377f8f 100644
--- a/pkg/estimator/client/general.go
+++ b/pkg/estimator/client/general.go
@@ -19,7 +19,6 @@ package client
 import (
     "context"
     "fmt"
-    "maps"
     "math"
     "sort"
 
@@ -29,7 +28,11 @@ import (
 
     clusterv1alpha1 "github.com/karmada-io/karmada/pkg/apis/cluster/v1alpha1"
     workv1alpha2 "github.com/karmada-io/karmada/pkg/apis/work/v1alpha2"
+    "github.com/karmada-io/karmada/pkg/estimator"
+    "github.com/karmada-io/karmada/pkg/estimator/pb"
     "github.com/karmada-io/karmada/pkg/features"
+    "github.com/karmada-io/karmada/pkg/util"
+    schedulerframework "github.com/karmada-io/karmada/pkg/util/lifted/scheduler/framework"
 )
 
 // GeneralEstimator is the default replica estimator.
@@ -127,9 +130,9 @@ func (ge *GeneralEstimator) maxAvailableComponentSets(cluster *clusterv1alpha1.C
         return int32(allowedPods) // #nosec G115: integer overflow conversion int64 -> int32
     }
 
-    podBound := allowedPods / podsPerSet
+    podBound := int32(allowedPods / podsPerSet) // #nosec G115: integer overflow conversion int64 -> int32
     if len(perSet) == 0 || allZero(perSet) {
-        return int32(podBound) // #nosec G115: integer overflow conversion int64 -> int32
+        return podBound
     }
 
     // Find limiting resource requirement, which will bound maxSet calculation
@@ -144,116 +147,58 @@ func (ge *GeneralEstimator) maxAvailableComponentSets(cluster *clusterv1alpha1.C
             return 0 // no capacity for this resource
         }
 
-        resBound := resAvail / req
+        resBound := int32(resAvail / req) // #nosec G115: integer overflow conversion int64 -> int32
         if resBound < maxSets {
             maxSets = resBound
         }
     }
 
     if features.FeatureGate.Enabled(features.CustomizedClusterResourceModeling) && len(cluster.Status.ResourceSummary.AllocatableModelings) > 0 {
-        if num, err := getMaximumSetsBasedOnResourceModels(cluster, components, podBound); err != nil {
+        if num, err := getMaximumSetsBasedOnResourceModels(cluster, components, maxSets); err != nil {
             klog.Warningf("Failed to get maximum sets based on resource models, skipping: %v", err)
         } else if num < maxSets {
             maxSets = num
         }
     }
 
-    return int32(maxSets) // #nosec G115: integer overflow conversion int64 -> int32
+    return maxSets
 }
 
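Note on the hunk above: the estimator now keeps the whole bound computation in int32, taking the minimum of the pod-count bound (allowedPods / podsPerSet) and each per-resource bound (available / per-set request). A self-contained sketch of that arithmetic with toy numbers (hypothetical helper name, not the patched function itself):

```go
package main

import "fmt"

// maxSetsByBounds mirrors the bounding logic above: the set count is capped by
// the pod budget and, independently, by every requested resource.
// Hypothetical helper for illustration only.
func maxSetsByBounds(allowedPods, podsPerSet int64, avail, perSet map[string]int64) int32 {
	maxSets := int32(allowedPods / podsPerSet) // pod-count bound
	for res, req := range perSet {
		if req <= 0 {
			continue
		}
		if bound := int32(avail[res] / req); bound < maxSets {
			maxSets = bound // a scarcer resource tightens the bound
		}
	}
	return maxSets
}

func main() {
	// Pod budget allows 10 sets (50/5), memory allows 32, but CPU allows only 6.
	fmt.Println(maxSetsByBounds(50, 5,
		map[string]int64{"cpu": 12000, "memory": 64 << 30},
		map[string]int64{"cpu": 2000, "memory": 2 << 30})) // prints 6
}
```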
 // getMaximumSetsBasedOnResourceModels computes the maximum number of full sets that can be
 // placed on a cluster using the cluster's ResourceModels. It expands one set into
 // replica kinds (demand + count) and performs a first-fit-decreasing placement onto model-grade nodes.
 // `upperBound` caps the search. We can set this using the podBound (allowedPods / podsPerSet)
-func getMaximumSetsBasedOnResourceModels(
-    cluster *clusterv1alpha1.Cluster,
-    components []workv1alpha2.Component,
-    upperBound int64,
-) (int64, error) {
-    if upperBound <= 0 {
-        return 0, nil
-    }
-
-    // Compressed one-set: per-kind (identical replicas grouped)
-    oneSetKinds := expandKindsOneSet(components)
-    if len(oneSetKinds) == 0 {
-        // If there are no pods to schedule, just return upperBound
-        return upperBound, nil
-    }
-
-    // Use cluster "available" totals (allocatable - allocated - allocating) for normalized scoring
-    // This reflects what the cluster can actually accept now
-    totals := availableResourceMap(cluster.Status.ResourceSummary)
-
-    for i := range oneSetKinds {
-        oneSetKinds[i].score = demandScoreNormalized(oneSetKinds[i].dem, totals)
-    }
-    sort.Slice(oneSetKinds, func(i, j int) bool {
-        if oneSetKinds[i].score == oneSetKinds[j].score {
-            return demandSum(oneSetKinds[i].dem) > demandSum(oneSetKinds[j].dem)
-        }
-        return oneSetKinds[i].score > oneSetKinds[j].score
-    })
-
-    //Build model nodes from Spec.ResourceModels and Status.AllocatableModelings
-    nodes, err := buildModelNodes(cluster)
+func getMaximumSetsBasedOnResourceModels(cluster *clusterv1alpha1.Cluster, components []workv1alpha2.Component, upperSets int32) (int32, error) {
+    nodes, err := getNodesAvailableResources(cluster)
     if err != nil {
         return -1, err
     }
-    if len(nodes) == 0 {
-        return 0, nil
-    }
 
-    var sets int64
-    for sets < upperBound {
-        if !placeOneSet(oneSetKinds, nodes) {
-            break
+    pbComponents := make([]pb.Component, 0, len(components))
+    for _, comp := range components {
+        // Deep-copy so that pointer is not shared between goroutines
+        var cr *workv1alpha2.ComponentReplicaRequirements
+        if comp.ReplicaRequirements != nil {
+            cr = comp.ReplicaRequirements.DeepCopy()
         }
-        sets++
-    }
-    return sets, nil
-}
 
-// placeOneSet attempts to place exactly ONE full set (all kinds with their per-set replica counts)
-// onto the provided working node capacities (in-place)
-// Returns true if successful
-func placeOneSet(orderedKinds []replicaKind, work []modelNode) bool {
-    for _, k := range orderedKinds {
-        remaining := k.count
-        if remaining <= 0 {
-            continue
-        }
-        // first-fit across nodes
-        for n := range work {
-            if remaining <= 0 {
-                break
-            }
-            fit := maxFit(work[n].cap, k.dem)
-            if fit <= 0 {
-                continue
-            }
-            place := fit
-            if place > remaining {
-                place = remaining
-            }
-            consumeMul(work[n].cap, k.dem, place)
-            remaining -= place
-        }
-        if remaining > 0 {
-            return false
-        }
+        pbComponents = append(pbComponents, pb.Component{
+            Name:                comp.Name,
+            Replicas:            comp.Replicas,
+            ReplicaRequirements: toPBReplicaRequirements(cr),
+        })
     }
-    return true
-}
 
-// modelNode holds remaining capacity for a given node across all resource types
-type modelNode struct {
-    cap map[corev1.ResourceName]int64
+    matchNode := func(_ *pb.NodeClaim, _ *schedulerframework.NodeInfo) bool {
+        // Always match since resource models lack node affinity/toleration info, so we skip these checks.
+        return true
+    }
+    return estimator.NewSchedulingSimulator(nodes, matchNode).SimulateSchedulingFFD(pbComponents, upperSets), nil
 }
 
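The rewrite above funnels the resource-model path through the shared simulator added later in this patch. A minimal caller-side sketch of that API with toy nodes and components (values are made up; field shapes such as `ReplicaRequirements` being a pointer are assumed from how this patch uses them, so treat this as a sketch, not the definitive API):

```go
package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/resource"

	"github.com/karmada-io/karmada/pkg/estimator"
	"github.com/karmada-io/karmada/pkg/estimator/pb"
	"github.com/karmada-io/karmada/pkg/util"
	schedulerframework "github.com/karmada-io/karmada/pkg/util/lifted/scheduler/framework"
)

func main() {
	// Two identical toy nodes: 4 CPU, 8Gi memory, room for 110 pods each.
	nodeCapacity := corev1.ResourceList{
		corev1.ResourceCPU:    resource.MustParse("4"),
		corev1.ResourceMemory: resource.MustParse("8Gi"),
		corev1.ResourcePods:   resource.MustParse("110"),
	}
	nodes := []*schedulerframework.NodeInfo{
		{Allocatable: util.NewResource(nodeCapacity)},
		{Allocatable: util.NewResource(nodeCapacity)},
	}

	// One set = 2 "web" replicas (1 CPU each) + 1 "db" replica (2Gi each).
	components := []pb.Component{
		{Name: "web", Replicas: 2, ReplicaRequirements: &pb.ReplicaRequirements{
			ResourceRequest: corev1.ResourceList{corev1.ResourceCPU: resource.MustParse("1")},
		}},
		{Name: "db", Replicas: 1, ReplicaRequirements: &pb.ReplicaRequirements{
			ResourceRequest: corev1.ResourceList{corev1.ResourceMemory: resource.MustParse("2Gi")},
		}},
	}

	// Accept every node, exactly as the resource-model path above does.
	matchAll := func(_ *pb.NodeClaim, _ *schedulerframework.NodeInfo) bool { return true }

	sets := estimator.NewSchedulingSimulator(nodes, matchAll).SimulateSchedulingFFD(components, 100)
	fmt.Println(sets) // CPU should be the limiting resource here: 8 CPUs / 2 per set -> 4
}
```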
-func buildModelNodes(cluster *clusterv1alpha1.Cluster) ([]modelNode, error) {
+func getNodesAvailableResources(cluster *clusterv1alpha1.Cluster) ([]*schedulerframework.NodeInfo, error) {
     if cluster == nil {
         return nil, fmt.Errorf("nil cluster")
     }
@@ -267,12 +212,14 @@ func buildModelNodes(cluster *clusterv1alpha1.Cluster) ([]modelNode, error) {
     }
 
     // Build capacity template per grade
-    capsByGrade := make(map[uint]map[corev1.ResourceName]int64, len(spec))
+    capsByGrade := make(map[uint]corev1.ResourceList, len(spec))
     for _, m := range spec {
-        tmpl := make(map[corev1.ResourceName]int64, len(m.Ranges))
+        tmpl := make(corev1.ResourceList, len(m.Ranges))
         for _, r := range m.Ranges {
-            tmpl[r.Name] = quantityAsInt64(r.Min)
+            tmpl[r.Name] = r.Min
         }
+        // Pods resource is not modeled in ResourceModels, so we set it to MaxInt64 to avoid limiting scheduling.
+        tmpl[corev1.ResourcePods] = *resource.NewQuantity(math.MaxInt64, resource.DecimalSI)
         capsByGrade[m.Grade] = tmpl
     }
 
@@ -293,122 +240,21 @@ func buildModelNodes(cluster *clusterv1alpha1.Cluster) ([]modelNode, error) {
     sort.Ints(grades)
 
     // Emit nodes for grades present in both spec & status.
-    var nodes []modelNode
+    var nodes []*schedulerframework.NodeInfo
     for _, grade := range grades {
         tmpl, cnt := capsByGrade[uint(grade)], countByGrade[uint(grade)] // #nosec G115: integer overflow conversion int -> uint
         if tmpl == nil || cnt == 0 {
             continue
         }
-        for range cnt {
-            capCopy := maps.Clone(tmpl)
-            nodes = append(nodes, modelNode{cap: capCopy})
-        }
-    }
-    return nodes, nil
-}
-
-// replicaKind represents a single type of component, including replica demand and count
-type replicaKind struct {
-    dem   map[corev1.ResourceName]int64 // per-replica demand
-    count int64                         // how many replicas
-    score float64                       // ordering heuristic (higher first)
-}
-
-// expandKindsOneSet flattens components into a slice of unique replica kinds.
-// Each entry holds the per-replica demand and how many replicas of that kind a set needs.
-func expandKindsOneSet(components []workv1alpha2.Component) []replicaKind {
-    kinds := make([]replicaKind, 0, len(components))
-    for _, c := range components {
-        if c.ReplicaRequirements == nil || c.ReplicaRequirements.ResourceRequest == nil {
-            continue
-        }
-        // normalize per-replica demand
-        base := make(map[corev1.ResourceName]int64, len(c.ReplicaRequirements.ResourceRequest))
-        for name, qty := range c.ReplicaRequirements.ResourceRequest {
-            base[name] = quantityAsInt64(qty)
-        }
-        // skip zero-demand or non-positive replica count
-        if allZero(base) || c.Replicas <= 0 {
-            continue
-        }
-
-        k := replicaKind{
-            dem:   base,
-            count: int64(c.Replicas),
-            // score is filled later once we know cluster-wide totals
-        }
-        kinds = append(kinds, k)
-    }
-    return kinds
-}
-
-// demandScoreNormalized returns the "max utilization ratio" of a demand vector against total capacities
-// If a resource is missing/zero in total, treat it as maximally constrained
-func demandScoreNormalized(
-    demand map[corev1.ResourceName]int64,
-    total map[corev1.ResourceName]int64,
-) float64 {
-    var maxRatio float64
-    for res, req := range demand {
-        if req <= 0 {
-            continue
-        }
-        totalCap := float64(total[res])
-        if totalCap <= 0 {
-            return math.MaxFloat64
-        }
-        ratio := float64(req) / totalCap
-        if ratio > maxRatio {
-            maxRatio = ratio
-        }
-    }
-    return maxRatio
-}
-
-// demandSum is used as a tie-breaker when initial scores are equal
-func demandSum(m map[corev1.ResourceName]int64) int64 {
-    var s int64
-    for _, v := range m {
-        if v > 0 {
-            s += v
-        }
-    }
-    return s
-}
-
-// maxFit returns how many copies of `dem` fit in `cap` simultaneously
-func maxFit(capacity map[corev1.ResourceName]int64, dem map[corev1.ResourceName]int64) int64 {
-    var limit int64 = math.MaxInt64
-    for k, req := range dem {
-        if req <= 0 {
-            continue
-        }
-        avail := capacity[k]
-        if avail <= 0 {
-            return 0
-        }
-        bound := avail / req
-        if bound < limit {
-            limit = bound
-        }
-    }
-    if limit == math.MaxInt64 {
-        return 0
-    }
-    return limit
-}
-
-// consumeMul subtracts mult * dem from cap
-func consumeMul(capacity map[corev1.ResourceName]int64, dem map[corev1.ResourceName]int64, mult int64) {
-    if mult <= 0 {
-        return
-    }
-    for k, req := range dem {
-        if req <= 0 {
-            continue
+        for i := 0; i < cnt; i++ {
+            node := &schedulerframework.NodeInfo{
+                Allocatable: util.NewResource(tmpl),
+            }
+            nodes = append(nodes, node)
         }
-        capacity[k] -= req * mult
     }
+    return nodes, nil
 }
 
 // podsInSet computes the total number of pods in the CRD
diff --git a/pkg/estimator/client/general_test.go b/pkg/estimator/client/general_test.go
index 2a4d95e739cf..f35c03494c01 100644
--- a/pkg/estimator/client/general_test.go
+++ b/pkg/estimator/client/general_test.go
@@ -244,16 +244,16 @@ func comp(name string, replicas int32, rl corev1.ResourceList) workv1alpha2.Comp
 func TestGetMaximumSetsBasedOnResourceModels(t *testing.T) {
     const (
         GPU  corev1.ResourceName = "nvidia.com/gpu"
-        BIGU int64               = 100 // define a large upper bound so we can test model decision algo
+        BIGU int32               = 100 // define a large upper bound so we can test model decision algo
     )
 
     tests := []struct {
         name         string
         cluster      clusterv1alpha1.Cluster
         components   []workv1alpha2.Component
-        upperBound   int64
+        upperBound   int32
         expectError  bool
-        expectedSets int64
+        expectedSets int32
     }{
         {
             name: "No grades defined → error",
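Before the new file: the simulator introduced below orders components by how scarce their requested resources are cluster-wide (fewest possible placements first), falling back to total request size as a tie-breaker. A self-contained toy sketch of that ordering idea using plain integers (hypothetical names, not the simulator's API):

```go
package main

import (
	"fmt"
	"sort"
)

// demand is a per-replica resource request keyed by resource name (toy model).
type demand map[string]int64

// placements returns how many replicas of d fit into the cluster-wide totals,
// i.e. the scarcity signal the FFD ordering is based on.
func placements(total map[string]int64, d demand) int64 {
	best := int64(1 << 62)
	for res, req := range d {
		if req <= 0 {
			continue
		}
		if fit := total[res] / req; fit < best {
			best = fit
		}
	}
	return best
}

func main() {
	total := map[string]int64{"cpu": 8000, "memory": 32 << 30}
	comps := []struct {
		name string
		dem  demand
	}{
		{"web", demand{"cpu": 1000}},                   // fits 8 times
		{"db", demand{"cpu": 2000, "memory": 8 << 30}}, // fits 4 times -> scheduled first
	}
	// Harder-to-place components (fewer possible placements) come first.
	sort.Slice(comps, func(i, j int) bool {
		return placements(total, comps[i].dem) < placements(total, comps[j].dem)
	})
	fmt.Println(comps[0].name) // db
}
```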
diff --git a/pkg/estimator/scheduling_simulator_components.go b/pkg/estimator/scheduling_simulator_components.go
new file mode 100644
index 000000000000..38043b2b4bb5
--- /dev/null
+++ b/pkg/estimator/scheduling_simulator_components.go
@@ -0,0 +1,175 @@
+/*
+Copyright 2025 The Karmada Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package estimator
+
+import (
+    "sort"
+
+    "github.com/karmada-io/karmada/pkg/estimator/pb"
+    "github.com/karmada-io/karmada/pkg/util"
+    schedulerframework "github.com/karmada-io/karmada/pkg/util/lifted/scheduler/framework"
+)
+
+// SchedulingSimulator simulates a scheduling process to estimate workload capacity.
+// It uses the First Fit Decreasing (FFD) algorithm to pack components into available nodes efficiently.
+type SchedulingSimulator struct {
+    // nodes represent the available cluster nodes with their resource capacity
+    nodes []*schedulerframework.NodeInfo
+    // matchNode is a function to check if a component can be scheduled on a specific node
+    // based on node affinity, tolerations, and other scheduling constraints
+    matchNode func(nodeClaim *pb.NodeClaim, node *schedulerframework.NodeInfo) bool
+}
+
+// NewSchedulingSimulator creates a new scheduling simulator instance.
+func NewSchedulingSimulator(nodes []*schedulerframework.NodeInfo, matchNode func(nodeClaim *pb.NodeClaim, node *schedulerframework.NodeInfo) bool) *SchedulingSimulator {
+    return &SchedulingSimulator{
+        nodes:     nodes,
+        matchNode: matchNode,
+    }
+}
+
+// SimulateSchedulingFFD implements the First Fit Decreasing (FFD) algorithm to estimate
+// the maximum number of complete component sets that can be scheduled on the cluster.
+//
+// FFD Algorithm Steps:
+// 1. Sort components by their "difficulty to schedule" (decreasing order of resource constraints)
+// 2. For each complete set, try to schedule all components using first-fit strategy
+// 3. Continue until no more complete sets can be scheduled or upper limit is reached
+func (s *SchedulingSimulator) SimulateSchedulingFFD(components []pb.Component, upperSets int32) int32 {
+    // Calculate total cluster resources for component sorting
+    totalResource := util.EmptyResource()
+    for _, node := range s.nodes {
+        totalResource.Add(node.Allocatable.ResourceList())
+    }
+
+    componentsCopy := make([]pb.Component, len(components))
+    copy(componentsCopy, components)
+    // Sort components in decreasing order of scheduling difficulty
+    // Components that are harder to place (fewer possible placements) are scheduled first
+    sort.Slice(componentsCopy, func(i, j int) bool {
+        iMaxSets := totalResource.MaxDivided(componentsCopy[i].ReplicaRequirements.ResourceRequest)
+        jMaxSets := totalResource.MaxDivided(componentsCopy[j].ReplicaRequirements.ResourceRequest)
+        if iMaxSets == jMaxSets {
+            // Use tie-breaker when components have similar scheduling difficulty
+            return tieBreaker(componentsCopy[i], componentsCopy[j])
+        }
+        // Schedule components with fewer possible placements first (harder to place)
+        return iMaxSets < jMaxSets
+    })
+
+    var completeSets int32
+    // Try to schedule complete component sets until we can no longer do so or reach the upper limit.
+    for {
+        if completeSets < upperSets && s.scheduleComponentSet(componentsCopy) {
+            completeSets++
+        } else {
+            break
+        }
+    }
+
+    return completeSets
+}
+
+// tieBreaker resolves ties when two components have the same scheduling difficulty.
+// It compares components by their total resource consumption to prioritize larger components first.
+func tieBreaker(componentA, componentB pb.Component) bool {
+    resourceA := util.NewResource(componentA.ReplicaRequirements.ResourceRequest)
+    resourceB := util.NewResource(componentB.ReplicaRequirements.ResourceRequest)
+
+    // Calculate total resource score for a component
+    getScore := func(res *util.Resource) int64 {
+        score := res.MilliCPU + res.Memory + res.EphemeralStorage
+        for _, v := range res.ScalarResources {
+            score += v
+        }
+        return score
+    }
+
+    return getScore(resourceA) > getScore(resourceB)
+}
+
+// scheduleComponentSet attempts to schedule one complete set of all components.
+func (s *SchedulingSimulator) scheduleComponentSet(components []pb.Component) bool {
+    for _, component := range components {
+        if !s.scheduleComponent(component) {
+            return false
+        }
+    }
+
+    return true
+}
+
+func (s *SchedulingSimulator) scheduleComponent(component pb.Component) bool {
+    task := newSchedulingTask(component)
+    for _, node := range s.nodes {
+        if !s.matchNode(task.nodeClaim, node) {
+            continue
+        }
+
+        for node.Allocatable.Allocatable(task.requiredResourcePerReplica) {
+            // Assign one replica to this node.
+            task.scheduleOnePod(node)
+            if task.done() {
+                // short path
+                return true
+            }
+        }
+    }
+
+    return task.done()
+}
+
+// componentSchedulingTask represents a single component scheduling task with its requirements and state.
+type componentSchedulingTask struct {
+    // nodeClaim represents the NodeAffinity, NodeSelector and Tolerations required by this component.
+    nodeClaim *pb.NodeClaim
+    // requiredResourcePerReplica represents the resources required by a single replica of this component.
+    requiredResourcePerReplica *util.Resource
+    // toBeScheduled tracks how many replicas of this component still need to be scheduled
+    toBeScheduled int32
+}
+
+// done returns true if the task has been completely scheduled (no replicas remaining).
+// This indicates that a complete component has been successfully allocated.
+func (t *componentSchedulingTask) done() bool {
+    return t.toBeScheduled == 0
+}
+
+// scheduleOnePod schedules one replica of this component on the specified node.
+// It decrements the remaining replica count and subtracts the required resources from the node.
+// This should be called when a replica has been successfully scheduled on a node.
+func (t *componentSchedulingTask) scheduleOnePod(node *schedulerframework.NodeInfo) {
+    if t.toBeScheduled <= 0 {
+        // No more replicas to schedule
+        return
+    }
+
+    node.Allocatable.SubResource(t.requiredResourcePerReplica)
+    t.toBeScheduled--
+}
+
+// newSchedulingTask creates a new component scheduling task from the given component.
+// It initializes the task with the component's node claim, required resources per replica, and total replicas to be scheduled.
+func newSchedulingTask(component pb.Component) componentSchedulingTask {
+    needResource := util.NewResource(component.ReplicaRequirements.ResourceRequest)
+    needResource.AllowedPodNumber = 1
+    return componentSchedulingTask{
+        nodeClaim:                  component.ReplicaRequirements.NodeClaim,
+        requiredResourcePerReplica: needResource,
+        toBeScheduled:              component.Replicas,
+    }
+}
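The matchNode hook is the seam between the generic simulator and caller-specific constraints: the resource-model path above passes an always-true matcher, while the estimator-server plugin in the next file supplies its affinity/toleration-aware matchNode. A hedged sketch of a custom matcher (hypothetical GPU gate; it assumes util.Resource exposes ScalarResources keyed by resource name, as the tie-breaker above suggests):

```go
package example // hypothetical package, illustration only

import (
	corev1 "k8s.io/api/core/v1"

	"github.com/karmada-io/karmada/pkg/estimator/pb"
	schedulerframework "github.com/karmada-io/karmada/pkg/util/lifted/scheduler/framework"
)

// gpuOnlyMatch keeps only nodes whose allocatable resources include at least
// one "nvidia.com/gpu". Sketch only; the real plugin's matchNode (below) also
// evaluates node affinity, node selectors, and tolerations.
func gpuOnlyMatch(_ *pb.NodeClaim, node *schedulerframework.NodeInfo) bool {
	if node == nil || node.Allocatable == nil {
		return false
	}
	return node.Allocatable.ScalarResources[corev1.ResourceName("nvidia.com/gpu")] > 0
}
```

Such a matcher would be wired in the same way as the plugin's own: estimator.NewSchedulingSimulator(nodes, gpuOnlyMatch).SimulateSchedulingFFD(components, math.MaxInt32).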
diff --git a/pkg/estimator/server/framework/plugins/noderesource/noderesource.go b/pkg/estimator/server/framework/plugins/noderesource/noderesource.go
index a45d72c60e8d..5157668a8ed9 100644
--- a/pkg/estimator/server/framework/plugins/noderesource/noderesource.go
+++ b/pkg/estimator/server/framework/plugins/noderesource/noderesource.go
@@ -25,6 +25,7 @@ import (
     corev1 "k8s.io/api/core/v1"
     "k8s.io/klog/v2"
 
+    "github.com/karmada-io/karmada/pkg/estimator"
     "github.com/karmada-io/karmada/pkg/estimator/pb"
     "github.com/karmada-io/karmada/pkg/estimator/server/framework"
     nodeutil "github.com/karmada-io/karmada/pkg/estimator/server/nodes"
@@ -128,12 +129,7 @@ func (pl *nodeResourceEstimator) EstimateComponents(_ context.Context, snapshot
         return 0, framework.AsResult(err)
     }
 
-    var sets int32
-    // Keep scheduling full component sets until one fails to fit.
-    for scheduleComponentSet(components, nodes) {
-        sets++
-    }
-
+    sets := estimator.NewSchedulingSimulator(nodes, matchNode).SimulateSchedulingFFD(components, math.MaxInt32)
     if sets == 0 {
         return 0, framework.NewResult(framework.Unschedulable, "no enough resources")
     }
@@ -160,84 +156,6 @@ func getNodesAvailableResources(snapshot *schedcache.Snapshot) ([]*schedulerfram
     return rest, nil
 }
 
-// scheduleComponentSet attempts to schedule one complete set of components across the available nodes.
-// It returns true if all components in the set can be successfully scheduled, false otherwise.
-// The function modifies the node resources as it assigns replicas to simulate actual scheduling.
-func scheduleComponentSet(components []pb.Component, allNodes []*schedulerframework.NodeInfo) bool {
-    for _, component := range components {
-        if !scheduleComponent(component, allNodes) {
-            return false
-        }
-    }
-
-    return true
-}
-
-// scheduleComponent attempts to schedule all replicas of a single component across the available nodes.
-// It iterates through nodes to find suitable ones and schedules as many replicas as possible on each node.
-// Returns true if all replicas of the component can be successfully scheduled, false otherwise.
-func scheduleComponent(component pb.Component, allNodes []*schedulerframework.NodeInfo) bool {
-    t := newSchedulingTask(component)
-
-    for _, node := range allNodes {
-        if !matchNode(t.nodeClaim, node) {
-            continue
-        }
-
-        for node.Allocatable.Allocatable(t.requiredResourcePerReplica) {
-            // Assign one replica to this node.
-            t.scheduleOnePod(node)
-            if t.done() {
-                // short path
-                return true
-            }
-        }
-    }
-
-    return t.done()
-}
-
-// componentSchedulingTask represents a single component scheduling task with its requirements and state.
-type componentSchedulingTask struct {
-    // nodeClaim represents the NodeAffinity, NodeSelector and Tolerations required by this component.
-    nodeClaim *pb.NodeClaim
-    // requiredResourcePerReplica represents the resources required by a single replica of this component.
-    requiredResourcePerReplica *util.Resource
-    // toBeScheduled tracks how many replicas of this component still need to be scheduled
-    toBeScheduled int32
-}
-
-// newSchedulingTask creates a new component scheduling task from the given component.
-// It initializes the task with the component's node claim, required resources per replica, and total replicas to be scheduled.
-func newSchedulingTask(component pb.Component) componentSchedulingTask {
-    needResource := util.NewResource(component.ReplicaRequirements.ResourceRequest)
-    needResource.AllowedPodNumber = 1
-    return componentSchedulingTask{
-        nodeClaim:                  component.ReplicaRequirements.NodeClaim,
-        requiredResourcePerReplica: needResource,
-        toBeScheduled:              component.Replicas,
-    }
-}
-
-// done returns true if the task has been completely scheduled (no replicas remaining).
-// This indicates that a complete component has been successfully allocated.
-func (t *componentSchedulingTask) done() bool {
-    return t.toBeScheduled == 0
-}
-
-// scheduleOnePod schedules one replica of this component on the specified node.
-// It decrements the remaining replica count and subtracts the required resources from the node.
-// This should be called when a replica has been successfully scheduled on a node.
-func (t *componentSchedulingTask) scheduleOnePod(node *schedulerframework.NodeInfo) {
-    if t.toBeScheduled <= 0 {
-        // No more replicas to schedule
-        return
-    }
-
-    node.Allocatable.SubResource(t.requiredResourcePerReplica)
-    t.toBeScheduled--
-}
-
 // matchNode checks whether the node matches the scheduling constraints defined in the replica requirements.
 func matchNode(nodeClaim *pb.NodeClaim, node *schedulerframework.NodeInfo) bool {
     affinity := nodeutil.GetRequiredNodeAffinity(pb.ReplicaRequirements{NodeClaim: nodeClaim})