@@ -33,7 +33,7 @@ import (
 	disruptionevents "sigs.k8s.io/karpenter/pkg/controllers/disruption/events"
 	"sigs.k8s.io/karpenter/pkg/controllers/disruption/orchestration"
 	"sigs.k8s.io/karpenter/pkg/controllers/provisioning"
-	pscheduling "sigs.k8s.io/karpenter/pkg/controllers/provisioning/scheduling"
+	"sigs.k8s.io/karpenter/pkg/controllers/provisioning/scheduling"
 	"sigs.k8s.io/karpenter/pkg/controllers/state"
 	"sigs.k8s.io/karpenter/pkg/events"
 	"sigs.k8s.io/karpenter/pkg/metrics"
@@ -48,7 +48,7 @@ var errCandidateDeleting = fmt.Errorf("candidate is deleting")
 //nolint:gocyclo
 func SimulateScheduling(ctx context.Context, kubeClient client.Client, cluster *state.Cluster, provisioner *provisioning.Provisioner,
 	candidates ...*Candidate,
-) (pscheduling.Results, error) {
+) (scheduling.Results, error) {
 	candidateNames := sets.NewString(lo.Map(candidates, func(t *Candidate, i int) string { return t.Name() })...)
 	nodes := cluster.Nodes()
 	deletingNodes := nodes.Deleting()
@@ -62,33 +62,45 @@ func SimulateScheduling(ctx context.Context, kubeClient client.Client, cluster *
 	if _, ok := lo.Find(deletingNodes, func(n *state.StateNode) bool {
 		return candidateNames.Has(n.Name())
 	}); ok {
-		return pscheduling.Results{}, errCandidateDeleting
+		return scheduling.Results{}, errCandidateDeleting
 	}
 
 	// We get the pods that are on nodes that are deleting
 	deletingNodePods, err := deletingNodes.ReschedulablePods(ctx, kubeClient)
 	if err != nil {
-		return pscheduling.Results{}, fmt.Errorf("failed to get pods from deleting nodes, %w", err)
+		return scheduling.Results{}, fmt.Errorf("failed to get pods from deleting nodes, %w", err)
 	}
 	// start by getting all pending pods
 	pods, err := provisioner.GetPendingPods(ctx)
 	if err != nil {
-		return pscheduling.Results{}, fmt.Errorf("determining pending pods, %w", err)
+		return scheduling.Results{}, fmt.Errorf("determining pending pods, %w", err)
 	}
 	for _, n := range candidates {
 		pods = append(pods, n.reschedulablePods...)
 	}
 	pods = append(pods, deletingNodePods...)
-	scheduler, err := provisioner.NewScheduler(log.IntoContext(ctx, operatorlogging.NopLogger), pods, stateNodes)
+	scheduler, err := provisioner.NewScheduler(
+		log.IntoContext(ctx, operatorlogging.NopLogger),
+		pods,
+		stateNodes,
+		// ReservedOfferingModeFallback is used for the following reasons:
+		// - For consolidation, we only accept a decision if it lowers the cost of the cluster and requires at most a
+		//   single additional nodeclaim, so it doesn't matter in this scenario if we fall back.
+		// - For drift, fallback is required to ensure progress. Strict mode only ensures progress when multiple
+		//   scheduling loops are allowed to proceed, but we need all pods on the drifted node to be scheduled within a
+		//   single iteration. This may result in non-ideal instance choices, but the alternative is deadlock.
+		// See issue TODO for more details.
+		scheduling.ReservedOfferingModeFallback,
+	)
 	if err != nil {
-		return pscheduling.Results{}, fmt.Errorf("creating scheduler, %w", err)
+		return scheduling.Results{}, fmt.Errorf("creating scheduler, %w", err)
 	}
 
 	deletingNodePodKeys := lo.SliceToMap(deletingNodePods, func(p *corev1.Pod) (client.ObjectKey, interface{}) {
 		return client.ObjectKeyFromObject(p), nil
 	})
 
-	results := scheduler.Solve(log.IntoContext(ctx, operatorlogging.NopLogger), pods).TruncateInstanceTypes(pscheduling.MaxInstanceTypes)
+	results := scheduler.Solve(log.IntoContext(ctx, operatorlogging.NopLogger), pods).TruncateInstanceTypes(scheduling.MaxInstanceTypes)
 	for _, n := range results.ExistingNodes {
 		// We consider existing nodes for scheduling. When these nodes are unmanaged, their taint logic should
 		// tell us if we can schedule to them or not; however, if these nodes are managed, we will still schedule to them
@@ -100,6 +112,7 @@ func SimulateScheduling(ctx context.Context, kubeClient client.Client, cluster *
 			// If the pod is on a deleting node, we assume one of two things has already happened:
 			// 1. The node was manually terminated, at which the provisioning controller has scheduled or is scheduling a node
 			//    for the pod.
+			// TODO: clarify this point, not clear to me
 			// 2. The node was chosen for a previous disruption command, we assume that the uninitialized node will come up
 			//    for this command, and we assume it will be successful. If it is not successful, the node will become
 			//    not terminating, and we will no longer need to consider these pods.
@@ -115,10 +128,10 @@ func SimulateScheduling(ctx context.Context, kubeClient client.Client, cluster *
 // UninitializedNodeError tracks a special pod error for disruption where pods schedule to a node
 // that hasn't been initialized yet, meaning that we can't be confident to make a disruption decision based off of it
 type UninitializedNodeError struct {
-	*pscheduling.ExistingNode
+	*scheduling.ExistingNode
 }
 
-func NewUninitializedNodeError(node *pscheduling.ExistingNode) *UninitializedNodeError {
+func NewUninitializedNodeError(node *scheduling.ExistingNode) *UninitializedNodeError {
 	return &UninitializedNodeError{ExistingNode: node}
 }
 
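For context, a minimal sketch of how a caller inside the disruption package might consume SimulateScheduling after this change, with the return type now referenced through the unaliased scheduling package and errCandidateDeleting still surfaced as a sentinel error. The simulateForCandidates wrapper name and the AllNonPendingPodsScheduled check are illustrative assumptions based on how other disruption paths use scheduling.Results, not part of this diff.

package disruption

import (
	"context"
	"errors"
	"fmt"

	"sigs.k8s.io/controller-runtime/pkg/client"

	"sigs.k8s.io/karpenter/pkg/controllers/provisioning"
	"sigs.k8s.io/karpenter/pkg/controllers/provisioning/scheduling"
	"sigs.k8s.io/karpenter/pkg/controllers/state"
)

// simulateForCandidates is a hypothetical wrapper showing how a disruption method
// could use SimulateScheduling now that the fallback reserved-offering mode is
// baked into the scheduler it builds.
func simulateForCandidates(ctx context.Context, kubeClient client.Client, cluster *state.Cluster,
	provisioner *provisioning.Provisioner, candidates ...*Candidate,
) (scheduling.Results, error) {
	results, err := SimulateScheduling(ctx, kubeClient, cluster, provisioner, candidates...)
	if err != nil {
		// Propagate the errCandidateDeleting sentinel unchanged so callers can skip
		// candidates that are already draining; wrap everything else.
		if errors.Is(err, errCandidateDeleting) {
			return scheduling.Results{}, err
		}
		return scheduling.Results{}, fmt.Errorf("simulating scheduling, %w", err)
	}
	// Assumption: Results exposes AllNonPendingPodsScheduled, which other disruption
	// paths use to reject simulations that would leave running pods unschedulable.
	if !results.AllNonPendingPodsScheduled() {
		return scheduling.Results{}, fmt.Errorf("simulated scheduling left pods unschedulable")
	}
	return results, nil
}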