Skip to content

Commit 95ca008

Browse files
mikecirioliapuig
andauthored
[JENKINS-76171] fix overprovisioning of ec2 agents when rapidly scheduling nodes (#1149)
* initial impl fixing overprovisioning * not needed * tests * cleanup * noop * Apply suggestion from @apuig Co-authored-by: Albert Puig <[email protected]> * better test readability * missed commit --------- Co-authored-by: Albert Puig <[email protected]>
1 parent 0a11fb7 commit 95ca008

File tree

4 files changed

+531
-14
lines changed

4 files changed

+531
-14
lines changed

src/main/java/hudson/plugins/ec2/EC2Cloud.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1189,10 +1189,11 @@ public Node call() throws Exception {
11891189

11901190
InstanceStateName state = instance.state().name();
11911191
if (state.equals(InstanceStateName.RUNNING)) {
1192-
// Spot instance are not reconnected automatically,
1193-
// but could be new orphans that has the option enable
1192+
// JENKINS-76171: Always initiate connection when instance is RUNNING
1193+
// This reduces the gap between PlannedNode completion and agent connection,
1194+
// preventing over-provisioning in NodeProvisioner
11941195
Computer c = slave.toComputer();
1195-
if (slave.getStopOnTerminate() && (c != null)) {
1196+
if (c != null) {
11961197
c.connect(false);
11971198
}
11981199

src/main/java/hudson/plugins/ec2/NoDelayProvisionerStrategy.java

Lines changed: 84 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,11 @@
11
package hudson.plugins.ec2;
22

3+
import com.google.common.annotations.VisibleForTesting;
34
import hudson.Extension;
5+
import hudson.model.Computer;
46
import hudson.model.Label;
57
import hudson.model.LoadStatistics;
8+
import hudson.model.Node;
69
import hudson.slaves.Cloud;
710
import hudson.slaves.NodeProvisioner;
811
import java.util.Collection;
@@ -27,12 +30,22 @@ public NodeProvisioner.StrategyDecision apply(NodeProvisioner.StrategyState stra
2730
final Label label = strategyState.getLabel();
2831

2932
LoadStatistics.LoadStatisticsSnapshot snapshot = strategyState.getSnapshot();
30-
int availableCapacity = snapshot.getAvailableExecutors() // live executors
33+
34+
// JENKINS-76171: Count provisioned EC2 nodes that exist but haven't started executing jobs yet.
35+
// This prevents over-provisioning by accounting for nodes in the gap between:
36+
// 1) PlannedNode future completing (instance RUNNING)
37+
// 2) Agent showing as "connecting" in the snapshot
38+
// 3) Agent executing jobs
39+
int provisionedButNotExecuting = countProvisionedButNotExecutingNodes(label);
40+
41+
int availableCapacity = snapshot.getAvailableExecutors() // live executors (idle)
3142
+ snapshot.getConnectingExecutors() // executors present but not yet connected
3243
+ strategyState
3344
.getPlannedCapacitySnapshot() // capacity added by previous strategies from previous rounds
34-
+ strategyState.getAdditionalPlannedCapacity(); // capacity added by previous strategies _this round_
45+
+ strategyState.getAdditionalPlannedCapacity() // capacity added by previous strategies _this round_
46+
+ provisionedButNotExecuting; // EC2 nodes that exist but aren't yet counted above
3547
int currentDemand = snapshot.getQueueLength();
48+
3649
LOGGER.log(
3750
Level.FINE, "Available capacity={0}, currentDemand={1}", new Object[] {availableCapacity, currentDemand
3851
});
@@ -49,8 +62,12 @@ public NodeProvisioner.StrategyDecision apply(NodeProvisioner.StrategyState stra
4962
continue;
5063
}
5164

65+
int numToProvision = currentDemand - availableCapacity;
66+
LOGGER.log(Level.FINE, "Planned {0} new nodes", numToProvision);
67+
5268
Collection<NodeProvisioner.PlannedNode> plannedNodes =
53-
cloud.provision(new Cloud.CloudState(label, 0), currentDemand - availableCapacity);
69+
cloud.provision(new Cloud.CloudState(label, 0), numToProvision);
70+
5471
LOGGER.log(Level.FINE, "Planned {0} new nodes", plannedNodes.size());
5572
strategyState.recordPendingLaunches(plannedNodes);
5673
availableCapacity += plannedNodes.size();
@@ -68,4 +85,68 @@ public NodeProvisioner.StrategyDecision apply(NodeProvisioner.StrategyState stra
6885
return NodeProvisioner.StrategyDecision.CONSULT_REMAINING_STRATEGIES;
6986
}
7087
}
88+
89+
/**
90+
* Counts executors in EC2 nodes that have been provisioned (exist in Jenkins) but are NOT yet counted in the
91+
* LoadStatistics snapshot. This specifically targets the gap where nodes exist but are:
92+
* - Offline (just added to Jenkins, before connecting starts)
93+
*
94+
* We explicitly DO NOT count:
95+
* - Connecting nodes (already in snapshot.getConnectingExecutors())
96+
* - Online nodes (already in snapshot.getAvailableExecutors() or busy executors)
97+
*
98+
* This prevents over-provisioning by accounting for nodes in the critical gap between:
99+
* 1) Node added to Jenkins (after PlannedNode future completes)
100+
* 2) Node starts connecting (shows up in snapshot.getConnectingExecutor())
101+
*
102+
* @param label the label to match, or null for unlabeled nodes
103+
* @return the number of executors from provisioned EC2 nodes in the offline->connecting gap
104+
*/
105+
@VisibleForTesting
106+
int countProvisionedButNotExecutingNodes(Label label) {
107+
Jenkins jenkins = Jenkins.get();
108+
// Use Label.getNodes() to leverage core's label matching and caching
109+
java.util.Set<Node> nodes = (label != null) ? label.getNodes() : java.util.Set.copyOf(jenkins.getNodes());
110+
111+
int count = 0;
112+
int totalEC2Nodes = 0;
113+
int offlineNodes = 0;
114+
int connectingNodes = 0;
115+
int onlineNodes = 0;
116+
117+
for (Node node : nodes) {
118+
// Only count EC2 nodes
119+
if (!(node instanceof EC2AbstractSlave)) {
120+
continue;
121+
}
122+
totalEC2Nodes++;
123+
124+
Computer computer = node.toComputer();
125+
if (computer == null) {
126+
continue;
127+
}
128+
129+
// Track node states for debugging
130+
if (computer.isOnline()) {
131+
onlineNodes++;
132+
} else if (computer.isConnecting()) {
133+
connectingNodes++;
134+
} else if (computer.isOffline()) {
135+
offlineNodes++;
136+
}
137+
138+
// Only count nodes that are OFFLINE (not connecting, not online)
139+
// These are in the gap between being added to Jenkins and starting to connect
140+
if (computer.isOffline() && !computer.isConnecting()) {
141+
count += node.getNumExecutors();
142+
}
143+
}
144+
145+
LOGGER.log(
146+
Level.FINER,
147+
"EC2 nodes for label {0}: total={1}, offline={2}, connecting={3}, online={4}",
148+
new Object[] {label, totalEC2Nodes, offlineNodes, connectingNodes, onlineNodes});
149+
150+
return count;
151+
}
71152
}

src/main/java/hudson/plugins/ec2/util/MinimumInstanceChecker.java

Lines changed: 41 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import java.util.Objects;
2020
import java.util.concurrent.Future;
2121
import java.util.concurrent.TimeUnit;
22+
import java.util.logging.Level;
2223
import java.util.logging.Logger;
2324
import java.util.stream.Stream;
2425
import jenkins.model.Jenkins;
@@ -74,8 +75,30 @@ public static int countQueueItemsForAgentTemplate(@NonNull SlaveTemplate agentTe
7475
.count();
7576
}
7677

77-
public static void checkForMinimumInstances() {
78-
Jenkins.get().clouds.stream()
78+
/**
79+
* Checks all EC2 cloud templates and provisions agents to meet minimum instance requirements.
80+
* Synchronized to prevent concurrent provisioning decisions that could lead to over-provisioning
81+
* when multiple agents accept tasks simultaneously.
82+
*
83+
* @see <a href="https://issues.jenkins.io/browse/JENKINS-76171">JENKINS-76171</a>
84+
*/
85+
public static synchronized void checkForMinimumInstances() {
86+
Jenkins jenkins = Jenkins.get();
87+
88+
// Early exit if no templates have minimum instance requirements
89+
boolean hasMinimumRequirements = jenkins.clouds.stream()
90+
.filter(EC2Cloud.class::isInstance)
91+
.map(EC2Cloud.class::cast)
92+
.flatMap(cloud -> cloud.getTemplates().stream())
93+
.anyMatch(template ->
94+
template.getMinimumNumberOfInstances() > 0 || template.getMinimumNumberOfSpareInstances() > 0);
95+
96+
if (!hasMinimumRequirements) {
97+
// No templates require minimum instances - exit immediately
98+
return;
99+
}
100+
101+
jenkins.clouds.stream()
79102
.filter(EC2Cloud.class::isInstance)
80103
.map(EC2Cloud.class::cast)
81104
.forEach(cloud -> cloud.getTemplates().forEach(agentTemplate -> {
@@ -104,15 +127,25 @@ public static void checkForMinimumInstances() {
104127
// Check if we need to provision any agents because we
105128
// don't have the minimum number of spare agents.
106129
// Don't double provision if minAgents and minSpareAgents are set.
107-
provisionForMinSpareAgents = (requiredMinSpareAgents + currentBuildsWaitingForTemplate)
108-
- (currentNumberOfSpareAgentsForTemplate
109-
+ provisionForMinAgents
110-
+ currentNumberOfProvisioningAgentsForTemplate);
111-
if (provisionForMinSpareAgents < 0) {
112-
provisionForMinSpareAgents = 0;
130+
if (requiredMinSpareAgents > 0) {
131+
provisionForMinSpareAgents = (requiredMinSpareAgents + currentBuildsWaitingForTemplate)
132+
- (currentNumberOfSpareAgentsForTemplate
133+
+ provisionForMinAgents
134+
+ currentNumberOfProvisioningAgentsForTemplate);
135+
if (provisionForMinSpareAgents < 0) {
136+
provisionForMinSpareAgents = 0;
137+
}
113138
}
114139

115140
int numberToProvision = provisionForMinAgents + provisionForMinSpareAgents;
141+
142+
if (numberToProvision > 0 || requiredMinAgents > 0 || requiredMinSpareAgents > 0) {
143+
LOGGER.log(
144+
Level.FINE,
145+
"MinimumInstanceChecker for template {0}: toProvision={1}",
146+
new Object[] {agentTemplate.description, numberToProvision});
147+
}
148+
116149
if (numberToProvision > 0) {
117150
cloud.provision(agentTemplate, numberToProvision);
118151
}

0 commit comments

Comments
 (0)