11package hudson .plugins .ec2 ;
22
3+ import com .google .common .annotations .VisibleForTesting ;
34import hudson .Extension ;
5+ import hudson .model .Computer ;
46import hudson .model .Label ;
57import hudson .model .LoadStatistics ;
8+ import hudson .model .Node ;
69import hudson .slaves .Cloud ;
710import hudson .slaves .NodeProvisioner ;
811import java .util .Collection ;
@@ -27,12 +30,22 @@ public NodeProvisioner.StrategyDecision apply(NodeProvisioner.StrategyState stra
2730 final Label label = strategyState .getLabel ();
2831
2932 LoadStatistics .LoadStatisticsSnapshot snapshot = strategyState .getSnapshot ();
30- int availableCapacity = snapshot .getAvailableExecutors () // live executors
33+
34+ // JENKINS-76171: Count provisioned EC2 nodes that exist but haven't started executing jobs yet.
35+ // This prevents over-provisioning by accounting for nodes in the gap between:
36+ // 1) PlannedNode future completing (instance RUNNING)
37+ // 2) Agent showing as "connecting" in the snapshot
38+ // 3) Agent executing jobs
39+ int provisionedButNotExecuting = countProvisionedButNotExecutingNodes (label );
40+
41+ int availableCapacity = snapshot .getAvailableExecutors () // live executors (idle)
3142 + snapshot .getConnectingExecutors () // executors present but not yet connected
3243 + strategyState
3344 .getPlannedCapacitySnapshot () // capacity added by previous strategies from previous rounds
34- + strategyState .getAdditionalPlannedCapacity (); // capacity added by previous strategies _this round_
45+ + strategyState .getAdditionalPlannedCapacity () // capacity added by previous strategies _this round_
46+ + provisionedButNotExecuting ; // EC2 nodes that exist but aren't yet counted above
3547 int currentDemand = snapshot .getQueueLength ();
48+
3649 LOGGER .log (
3750 Level .FINE , "Available capacity={0}, currentDemand={1}" , new Object [] {availableCapacity , currentDemand
3851 });
@@ -49,8 +62,12 @@ public NodeProvisioner.StrategyDecision apply(NodeProvisioner.StrategyState stra
4962 continue ;
5063 }
5164
65+ int numToProvision = currentDemand - availableCapacity ;
66+ LOGGER .log (Level .FINE , "Planned {0} new nodes" , numToProvision );
67+
5268 Collection <NodeProvisioner .PlannedNode > plannedNodes =
53- cloud .provision (new Cloud .CloudState (label , 0 ), currentDemand - availableCapacity );
69+ cloud .provision (new Cloud .CloudState (label , 0 ), numToProvision );
70+
5471 LOGGER .log (Level .FINE , "Planned {0} new nodes" , plannedNodes .size ());
5572 strategyState .recordPendingLaunches (plannedNodes );
5673 availableCapacity += plannedNodes .size ();
@@ -68,4 +85,68 @@ public NodeProvisioner.StrategyDecision apply(NodeProvisioner.StrategyState stra
6885 return NodeProvisioner .StrategyDecision .CONSULT_REMAINING_STRATEGIES ;
6986 }
7087 }
88+
89+ /**
90+ * Counts executors in EC2 nodes that have been provisioned (exist in Jenkins) but are NOT yet counted in the
91+ * LoadStatistics snapshot. This specifically targets the gap where nodes exist but are:
92+ * - Offline (just added to Jenkins, before connecting starts)
93+ *
94+ * We explicitly DO NOT count:
95+ * - Connecting nodes (already in snapshot.getConnectingExecutors())
96+ * - Online nodes (already in snapshot.getAvailableExecutors() or busy executors)
97+ *
98+ * This prevents over-provisioning by accounting for nodes in the critical gap between:
99+ * 1) Node added to Jenkins (after PlannedNode future completes)
100+ * 2) Node starts connecting (shows up in snapshot.getConnectingExecutor())
101+ *
102+ * @param label the label to match, or null for unlabeled nodes
103+ * @return the number of executors from provisioned EC2 nodes in the offline->connecting gap
104+ */
105+ @ VisibleForTesting
106+ int countProvisionedButNotExecutingNodes (Label label ) {
107+ Jenkins jenkins = Jenkins .get ();
108+ // Use Label.getNodes() to leverage core's label matching and caching
109+ java .util .Set <Node > nodes = (label != null ) ? label .getNodes () : java .util .Set .copyOf (jenkins .getNodes ());
110+
111+ int count = 0 ;
112+ int totalEC2Nodes = 0 ;
113+ int offlineNodes = 0 ;
114+ int connectingNodes = 0 ;
115+ int onlineNodes = 0 ;
116+
117+ for (Node node : nodes ) {
118+ // Only count EC2 nodes
119+ if (!(node instanceof EC2AbstractSlave )) {
120+ continue ;
121+ }
122+ totalEC2Nodes ++;
123+
124+ Computer computer = node .toComputer ();
125+ if (computer == null ) {
126+ continue ;
127+ }
128+
129+ // Track node states for debugging
130+ if (computer .isOnline ()) {
131+ onlineNodes ++;
132+ } else if (computer .isConnecting ()) {
133+ connectingNodes ++;
134+ } else if (computer .isOffline ()) {
135+ offlineNodes ++;
136+ }
137+
138+ // Only count nodes that are OFFLINE (not connecting, not online)
139+ // These are in the gap between being added to Jenkins and starting to connect
140+ if (computer .isOffline () && !computer .isConnecting ()) {
141+ count += node .getNumExecutors ();
142+ }
143+ }
144+
145+ LOGGER .log (
146+ Level .FINER ,
147+ "EC2 nodes for label {0}: total={1}, offline={2}, connecting={3}, online={4}" ,
148+ new Object [] {label , totalEC2Nodes , offlineNodes , connectingNodes , onlineNodes });
149+
150+ return count ;
151+ }
71152}
0 commit comments