Commit b8db30c

Improve events when the max total nodes of the cluster is reached.

- Log a cluster-wide event. The previous event would never get fired, because the estimators already cap the options they generate; in addition it would fire only once, and events are only kept for some time.
- Log a per-pod event explaining why the scale-up is not triggered. Previously the pod would either get a "no scale-up because no matching group" event or no event at all.

This required adding the list of unschedulable pods to the scale-up status when the max total nodes limit is reached.
1 parent: cf115af
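To make the resulting per-pod message concrete, here is a minimal, self-contained Go sketch of the short-circuit this commit adds to ReasonsMessage. The ScaleUpResult names and the event wording are taken from the diff below; the types are simplified stand-ins, not the real definitions from cluster-autoscaler/processors/status.

package main

import "fmt"

// Simplified stand-ins for the cluster-autoscaler status types; the real
// ScaleUpResult and ReasonsMessage live in cluster-autoscaler/processors/status.
type ScaleUpResult int

const (
	ScaleUpNoOptionsAvailable ScaleUpResult = iota
	ScaleUpLimitedByMaxNodesTotal
)

// reasonsMessage mirrors the short-circuit added in this commit: when the
// scale-up was limited by MaxNodesTotal, every unschedulable pod gets the same
// explanation instead of the aggregated per-node-group rejection reasons.
func reasonsMessage(result ScaleUpResult) string {
	if result == ScaleUpLimitedByMaxNodesTotal {
		return "max total nodes in cluster reached"
	}
	return "(aggregated per-node-group reasons, unchanged code path)"
}

func main() {
	// The per-pod NotTriggerScaleUp event message now reads:
	fmt.Printf("pod didn't trigger scale-up: %s\n",
		reasonsMessage(ScaleUpLimitedByMaxNodesTotal))
}

Running the sketch prints the message now attached to each unschedulable pod's NotTriggerScaleUp event; the cluster-wide MaxNodesTotalReached warning is emitted separately in static_autoscaler.go.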

File tree: 3 files changed (+42, -3 lines)

cluster-autoscaler/core/static_autoscaler.go (+11)

@@ -528,7 +528,18 @@ func (a *StaticAutoscaler) RunOnce(currentTime time.Time) caerrors.AutoscalerErr
 	} else if a.MaxNodesTotal > 0 && len(readyNodes) >= a.MaxNodesTotal {
 		scaleUpStatus.Result = status.ScaleUpLimitedByMaxNodesTotal
 		klog.Warningf("Max total nodes in cluster reached: %v. Current number of ready nodes: %v", a.MaxNodesTotal, len(readyNodes))
+		autoscalingContext.LogRecorder.Eventf(apiv1.EventTypeWarning, "MaxNodesTotalReached",
+			"Max total nodes in cluster reached: %v", autoscalingContext.MaxNodesTotal)
 		shouldScaleUp = false
+
+		noScaleUpInfoForPods := []status.NoScaleUpInfo{}
+		for _, pod := range unschedulablePodsToHelp {
+			noScaleUpInfo := status.NoScaleUpInfo{
+				Pod: pod,
+			}
+			noScaleUpInfoForPods = append(noScaleUpInfoForPods, noScaleUpInfo)
+		}
+		scaleUpStatus.PodsRemainUnschedulable = noScaleUpInfoForPods
 	} else if len(a.BypassedSchedulers) == 0 && allPodsAreNew(unschedulablePodsToHelp, currentTime) {
 		// The assumption here is that these pods have been created very recently and probably there
 		// is more pods to come. In theory we could check the newest pod time but then if pod were created

cluster-autoscaler/processors/status/eventing_scale_up_processor.go (+6, -2)

@@ -41,7 +41,7 @@ func (p *EventingScaleUpStatusProcessor) Process(context *context.AutoscalingCon
 		for _, noScaleUpInfo := range status.PodsRemainUnschedulable {
 			context.Recorder.Event(noScaleUpInfo.Pod, apiv1.EventTypeNormal, "NotTriggerScaleUp",
 				fmt.Sprintf("pod didn't trigger scale-up: %s",
-					ReasonsMessage(noScaleUpInfo, consideredNodeGroupsMap)))
+					ReasonsMessage(status.Result, noScaleUpInfo, consideredNodeGroupsMap)))
 		}
 	} else {
 		klog.V(4).Infof("Skipping event processing for unschedulable pods since there is a" +
@@ -60,7 +60,11 @@ func (p *EventingScaleUpStatusProcessor) CleanUp() {
 }
 
 // ReasonsMessage aggregates reasons from NoScaleUpInfos.
-func ReasonsMessage(noScaleUpInfo NoScaleUpInfo, consideredNodeGroups map[string]cloudprovider.NodeGroup) string {
+func ReasonsMessage(scaleUpStatus ScaleUpResult, noScaleUpInfo NoScaleUpInfo, consideredNodeGroups map[string]cloudprovider.NodeGroup) string {
+	if scaleUpStatus == ScaleUpLimitedByMaxNodesTotal {
+		return "max total nodes in cluster reached"
+	}
+
 	messages := []string{}
 	aggregated := map[string]int{}
 	for nodeGroupId, reasons := range noScaleUpInfo.RejectedNodeGroups {

cluster-autoscaler/processors/status/eventing_scale_up_processor_test.go (+25, -1)

@@ -101,6 +101,21 @@ func TestEventingScaleUpStatusProcessor(t *testing.T) {
 			expectedTriggered:   0,
 			expectedNoTriggered: 0,
 		},
+		{
+			caseName: "No scale up; max total nodes in cluster reached",
+			state: &ScaleUpStatus{
+				Result:               ScaleUpLimitedByMaxNodesTotal,
+				ScaleUpInfos:         []nodegroupset.ScaleUpInfo{{}},
+				PodsTriggeredScaleUp: []*apiv1.Pod{},
+				PodsRemainUnschedulable: []NoScaleUpInfo{
+					{Pod: p1},
+					{Pod: p2},
+					{Pod: p3},
+				},
+			},
+			expectedTriggered:   0,
+			expectedNoTriggered: 3,
+		},
 	}
 
 	for _, tc := range testCases {
@@ -166,9 +181,18 @@ func TestReasonsMessage(t *testing.T) {
 		"2 max limit reached",
 		"1 not ready",
 	}
-	result := ReasonsMessage(NoScaleUpInfo{nil, rejected, skipped}, considered)
+	result := ReasonsMessage(ScaleUpNoOptionsAvailable, NoScaleUpInfo{nil, rejected, skipped}, considered)
 
 	for _, part := range expected {
 		assert.Contains(t, result, part)
 	}
 }
+
+func TestReasonsMessageWhenScaleUpLimitedByMaxNodesTotal(t *testing.T) {
+	considered := map[string]cloudprovider.NodeGroup{}
+	noScaleUpInfo := NoScaleUpInfo{
+		Pod: nil,
+	}
+	result := ReasonsMessage(ScaleUpLimitedByMaxNodesTotal, noScaleUpInfo, considered)
+	assert.Contains(t, result, "max total nodes in cluster reached")
+}
