diff --git a/src/main/java/hudson/plugins/ec2/EC2RetentionStrategy.java b/src/main/java/hudson/plugins/ec2/EC2RetentionStrategy.java index fd4bcdb59..1abc559fd 100644 --- a/src/main/java/hudson/plugins/ec2/EC2RetentionStrategy.java +++ b/src/main/java/hudson/plugins/ec2/EC2RetentionStrategy.java @@ -41,6 +41,8 @@ import jenkins.model.Jenkins; import org.kohsuke.stapler.DataBoundConstructor; import software.amazon.awssdk.core.exception.SdkException; +import software.amazon.awssdk.services.ec2.Ec2Client; +import software.amazon.awssdk.services.ec2.model.StartInstancesRequest; /** * {@link RetentionStrategy} for EC2. @@ -96,18 +98,24 @@ long getNextCheckAfter() { @Override public long check(EC2Computer c) { + LOGGER.fine("[JENKINS-76200] check() called for " + c.getName() + " (instance: " + c.getInstanceId() + ")"); + if (!checkLock.tryLock()) { + LOGGER.fine("[JENKINS-76200] check() could not acquire lock for " + c.getName()); return CHECK_INTERVAL_MINUTES; } else { try { long currentTime = this.clock.millis(); if (currentTime > nextCheckAfter) { + LOGGER.info("[JENKINS-76200] check() executing for " + c.getName() + + " - calling attemptReconnectIfOffline()"); attemptReconnectIfOffline(c); long intervalMins = internalCheck(c); nextCheckAfter = currentTime + TimeUnit.MINUTES.toMillis(intervalMins); return intervalMins; } else { + LOGGER.fine("[JENKINS-76200] check() skipping (not time yet) for " + c.getName()); return CHECK_INTERVAL_MINUTES; } } finally { @@ -253,28 +261,93 @@ private long internalCheck(EC2Computer computer) { * Try to reconnect the EC2 Instance if it's offline but the status is running. * This could mean unstable ssh connection, so instead of failing the build, * we try to reconnect as soon as the EC2 Instance is running again. + * JENKINS-76200: Start stopped instances before attempting connection. */ private void attemptReconnectIfOffline(EC2Computer computer) { + LOGGER.info("[JENKINS-76200] attemptReconnectIfOffline() called for " + computer.getName() + " (instance: " + + computer.getInstanceId() + ")"); + try { - if (computer.getState() == InstanceState.RUNNING && computer.isOffline()) { + InstanceState state = computer.getState(); + boolean isOffline = computer.isOffline(); + boolean isConnecting = computer.isConnecting(); + + LOGGER.info("[JENKINS-76200] Instance state: " + state + ", isOffline: " + isOffline + ", isConnecting: " + + isConnecting + " for " + computer.getName()); + + // JENKINS-76200: If instance is stopped, start it before attempting connection + // But only if there are jobs waiting for this node + if (InstanceState.STOPPED.equals(state) || InstanceState.STOPPING.equals(state)) { + if (isOffline) { + // Check if there are jobs in the queue waiting for this node + boolean hasQueuedJobs = itemsInQueueForThisSlave(computer); + LOGGER.info("[JENKINS-76200] Instance " + computer.getInstanceId() + " is " + state + + " and offline, jobs in queue: " + hasQueuedJobs); + + if (hasQueuedJobs) { + LOGGER.info("[JENKINS-76200] Jobs are waiting - attempting to start instance " + + computer.getInstanceId()); + EC2Cloud cloud = computer.getCloud(); + if (cloud != null) { + try { + Ec2Client ec2 = cloud.connect(); + StartInstancesRequest request = StartInstancesRequest.builder() + .instanceIds(computer.getInstanceId()) + .build(); + LOGGER.info( + "[JENKINS-76200] Calling AWS startInstances() for " + computer.getInstanceId()); + ec2.startInstances(request); + LOGGER.info("[JENKINS-76200] Successfully called startInstances() for " + + computer.getInstanceId() + " - instance should be starting now"); + } catch (Exception e) { + LOGGER.log( + Level.WARNING, + "[JENKINS-76200] Failed to start stopped instance " + computer.getInstanceId(), + e); + } + } else { + LOGGER.warning("[JENKINS-76200] Cannot start instance " + computer.getInstanceId() + + " - cloud not found for node " + computer.getName()); + } + } else { + LOGGER.info("[JENKINS-76200] No jobs waiting for stopped instance " + computer.getInstanceId() + + " - leaving it stopped"); + } + } else { + LOGGER.info("[JENKINS-76200] Instance " + computer.getInstanceId() + " is " + state + + " but not offline - skipping start attempt"); + } + // Don't attempt connection yet - instance needs time to start + // Will retry on next check() call when state should be PENDING or RUNNING + return; + } + + if (state == InstanceState.RUNNING && isOffline) { LOGGER.warning("EC2Computer " + computer.getName() + " is offline"); - if (!computer.isConnecting()) { + if (!isConnecting) { // Keep retrying connection to agent until the job times out LOGGER.warning("Attempting to reconnect EC2Computer " + computer.getName()); computer.connect(false); + } else { + LOGGER.info("[JENKINS-76200] Skipping reconnect - already connecting for " + computer.getName()); } + } else { + LOGGER.info("[JENKINS-76200] No reconnection needed - state: " + state + ", offline: " + isOffline + + " for " + computer.getName()); } } catch (SdkException | InterruptedException e) { - LOGGER.log(Level.FINE, "Error getting EC2 instance state for " + computer.getName(), e); + LOGGER.log( + Level.WARNING, "[JENKINS-76200] Error in attemptReconnectIfOffline for " + computer.getName(), e); } } /* - * Checks if there are any items in the queue that are waiting for this node explicitly. - * This prevents a node from being taken offline while there are Ivy/Maven Modules waiting to build. + * Checks if there are any items in the queue that can run on this node. + * This prevents a node from being taken offline while there are jobs waiting that could use it. * Need to check entire queue as some modules may be blocked by upstream dependencies. * Accessing the queue in this way can block other threads, so only perform this check just prior * to timing out the slave. + * JENKINS-76200: Check label matching, not just explicit node assignment. */ private boolean itemsInQueueForThisSlave(EC2Computer c) { final EC2AbstractSlave selfNode = c.getNode(); @@ -285,16 +358,19 @@ private boolean itemsInQueueForThisSlave(EC2Computer c) { if (selfNode == null) { return false; } - final Label selfLabel = selfNode.getSelfLabel(); Queue.Item[] items = Jenkins.get().getQueue().getItems(); for (Queue.Item item : items) { final Label assignedLabel = item.getAssignedLabel(); - if (assignedLabel == selfLabel) { - LOGGER.fine("Preventing idle timeout of " + c.getName() - + " as there is at least one item in the queue explicitly waiting for this slave"); + // JENKINS-76200: Check if this node can execute the job based on label matching + // Jobs with no label requirement (null) can run on any node + // Jobs with labels can run on nodes that match those labels + if (assignedLabel == null || assignedLabel.contains(selfNode)) { + LOGGER.fine("[JENKINS-76200] Found queued job that can run on " + c.getName() + + " (job label: " + assignedLabel + ")"); return true; } } + LOGGER.fine("[JENKINS-76200] No queued jobs found that can run on " + c.getName()); return false; } @@ -307,6 +383,8 @@ private boolean itemsInQueueForThisSlave(EC2Computer c) { */ @Override public void start(EC2Computer c) { + LOGGER.info("[JENKINS-76200] start() called for " + c.getName() + " (instance: " + c.getInstanceId() + ")"); + // Jenkins is in the process of starting up if (Jenkins.get().getInitLevel() != InitMilestone.COMPLETED) { InstanceState state = null; @@ -315,6 +393,7 @@ public void start(EC2Computer c) { } catch (SdkException | InterruptedException e) { LOGGER.log(Level.FINE, "Error getting EC2 instance state for " + c.getName(), e); } + LOGGER.info("[JENKINS-76200] During Jenkins startup - instance state: " + state); if (!(InstanceState.PENDING.equals(state) || InstanceState.RUNNING.equals(state))) { LOGGER.info("Ignoring start request for " + c.getName() + " during Jenkins startup due to EC2 instance state of " + state); diff --git a/src/main/java/hudson/plugins/ec2/NoDelayProvisionerStrategy.java b/src/main/java/hudson/plugins/ec2/NoDelayProvisionerStrategy.java index a461dadf8..8e3f6cb8a 100644 --- a/src/main/java/hudson/plugins/ec2/NoDelayProvisionerStrategy.java +++ b/src/main/java/hudson/plugins/ec2/NoDelayProvisionerStrategy.java @@ -90,15 +90,19 @@ public NodeProvisioner.StrategyDecision apply(NodeProvisioner.StrategyState stra * Counts executors in EC2 nodes that have been provisioned (exist in Jenkins) but are NOT yet counted in the * LoadStatistics snapshot. This specifically targets the gap where nodes exist but are: * - Offline (just added to Jenkins, before connecting starts) + * - Instance is PENDING or RUNNING in AWS (will come online soon) * * We explicitly DO NOT count: * - Connecting nodes (already in snapshot.getConnectingExecutors()) * - Online nodes (already in snapshot.getAvailableExecutors() or busy executors) + * - STOPPED instances (won't come online without explicit start action) * * This prevents over-provisioning by accounting for nodes in the critical gap between: * 1) Node added to Jenkins (after PlannedNode future completes) * 2) Node starts connecting (shows up in snapshot.getConnectingExecutor()) * + * JENKINS-76200: Exclude STOPPED instances - they won't come online on their own. + * * @param label the label to match, or null for unlabeled nodes * @return the number of executors from provisioned EC2 nodes in the offline->connecting gap */ @@ -113,6 +117,7 @@ int countProvisionedButNotExecutingNodes(Label label) { int offlineNodes = 0; int connectingNodes = 0; int onlineNodes = 0; + int stoppedNodes = 0; for (Node node : nodes) { // Only count EC2 nodes @@ -136,16 +141,36 @@ int countProvisionedButNotExecutingNodes(Label label) { } // Only count nodes that are OFFLINE (not connecting, not online) - // These are in the gap between being added to Jenkins and starting to connect + // and not STOPPED in AWS (won't come online without explicit start) if (computer.isOffline() && !computer.isConnecting()) { + // JENKINS-76200: Check if instance is STOPPED in AWS + if (computer instanceof EC2Computer ec2Computer) { + try { + InstanceState state = ec2Computer.getState(); + if (state == InstanceState.STOPPED || state == InstanceState.STOPPING) { + stoppedNodes++; + LOGGER.log( + Level.FINE, + "Excluding STOPPED instance {0} from available capacity", + ec2Computer.getInstanceId()); + continue; // Don't count stopped instances + } + } catch (Exception e) { + LOGGER.log( + Level.FINE, + "Could not get state for " + ec2Computer.getName() + ", counting as available", + e); + // If we can't determine state, count it to avoid over-provisioning + } + } count += node.getNumExecutors(); } } LOGGER.log( Level.FINER, - "EC2 nodes for label {0}: total={1}, offline={2}, connecting={3}, online={4}", - new Object[] {label, totalEC2Nodes, offlineNodes, connectingNodes, onlineNodes}); + "EC2 nodes for label {0}: total={1}, offline={2}, connecting={3}, online={4}, stopped={5}", + new Object[] {label, totalEC2Nodes, offlineNodes, connectingNodes, onlineNodes, stoppedNodes}); return count; }