Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
97 changes: 88 additions & 9 deletions src/main/java/hudson/plugins/ec2/EC2RetentionStrategy.java
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@
import jenkins.model.Jenkins;
import org.kohsuke.stapler.DataBoundConstructor;
import software.amazon.awssdk.core.exception.SdkException;
import software.amazon.awssdk.services.ec2.Ec2Client;
import software.amazon.awssdk.services.ec2.model.StartInstancesRequest;

/**
* {@link RetentionStrategy} for EC2.
Expand Down Expand Up @@ -96,18 +98,24 @@ long getNextCheckAfter() {

@Override
public long check(EC2Computer c) {
LOGGER.fine("[JENKINS-76200] check() called for " + c.getName() + " (instance: " + c.getInstanceId() + ")");

if (!checkLock.tryLock()) {
LOGGER.fine("[JENKINS-76200] check() could not acquire lock for " + c.getName());
return CHECK_INTERVAL_MINUTES;
} else {
try {
long currentTime = this.clock.millis();

if (currentTime > nextCheckAfter) {
LOGGER.info("[JENKINS-76200] check() executing for " + c.getName()
+ " - calling attemptReconnectIfOffline()");
attemptReconnectIfOffline(c);
long intervalMins = internalCheck(c);
nextCheckAfter = currentTime + TimeUnit.MINUTES.toMillis(intervalMins);
return intervalMins;
} else {
LOGGER.fine("[JENKINS-76200] check() skipping (not time yet) for " + c.getName());
return CHECK_INTERVAL_MINUTES;
}
} finally {
Expand Down Expand Up @@ -253,28 +261,93 @@ private long internalCheck(EC2Computer computer) {
* Try to reconnect the EC2 Instance if it's offline but the status is running.
* This could mean unstable ssh connection, so instead of failing the build,
* we try to reconnect as soon as the EC2 Instance is running again.
* JENKINS-76200: Start stopped instances before attempting connection.
*/
private void attemptReconnectIfOffline(EC2Computer computer) {
LOGGER.info("[JENKINS-76200] attemptReconnectIfOffline() called for " + computer.getName() + " (instance: "
+ computer.getInstanceId() + ")");

try {
if (computer.getState() == InstanceState.RUNNING && computer.isOffline()) {
InstanceState state = computer.getState();
boolean isOffline = computer.isOffline();
boolean isConnecting = computer.isConnecting();

LOGGER.info("[JENKINS-76200] Instance state: " + state + ", isOffline: " + isOffline + ", isConnecting: "
+ isConnecting + " for " + computer.getName());

// JENKINS-76200: If instance is stopped, start it before attempting connection
// But only if there are jobs waiting for this node
if (InstanceState.STOPPED.equals(state) || InstanceState.STOPPING.equals(state)) {
if (isOffline) {
// Check if there are jobs in the queue waiting for this node
boolean hasQueuedJobs = itemsInQueueForThisSlave(computer);
LOGGER.info("[JENKINS-76200] Instance " + computer.getInstanceId() + " is " + state
+ " and offline, jobs in queue: " + hasQueuedJobs);

if (hasQueuedJobs) {
LOGGER.info("[JENKINS-76200] Jobs are waiting - attempting to start instance "
+ computer.getInstanceId());
EC2Cloud cloud = computer.getCloud();
if (cloud != null) {
try {
Ec2Client ec2 = cloud.connect();
StartInstancesRequest request = StartInstancesRequest.builder()
.instanceIds(computer.getInstanceId())
.build();
LOGGER.info(
"[JENKINS-76200] Calling AWS startInstances() for " + computer.getInstanceId());
ec2.startInstances(request);
LOGGER.info("[JENKINS-76200] Successfully called startInstances() for "
+ computer.getInstanceId() + " - instance should be starting now");
} catch (Exception e) {
LOGGER.log(
Level.WARNING,
"[JENKINS-76200] Failed to start stopped instance " + computer.getInstanceId(),
e);
}
} else {
LOGGER.warning("[JENKINS-76200] Cannot start instance " + computer.getInstanceId()
+ " - cloud not found for node " + computer.getName());
}
} else {
LOGGER.info("[JENKINS-76200] No jobs waiting for stopped instance " + computer.getInstanceId()
+ " - leaving it stopped");
}
} else {
LOGGER.info("[JENKINS-76200] Instance " + computer.getInstanceId() + " is " + state
+ " but not offline - skipping start attempt");
}
// Don't attempt connection yet - instance needs time to start
// Will retry on next check() call when state should be PENDING or RUNNING
return;
}

if (state == InstanceState.RUNNING && isOffline) {
LOGGER.warning("EC2Computer " + computer.getName() + " is offline");
if (!computer.isConnecting()) {
if (!isConnecting) {
// Keep retrying connection to agent until the job times out
LOGGER.warning("Attempting to reconnect EC2Computer " + computer.getName());
computer.connect(false);
} else {
LOGGER.info("[JENKINS-76200] Skipping reconnect - already connecting for " + computer.getName());
}
} else {
LOGGER.info("[JENKINS-76200] No reconnection needed - state: " + state + ", offline: " + isOffline
+ " for " + computer.getName());
}
} catch (SdkException | InterruptedException e) {
LOGGER.log(Level.FINE, "Error getting EC2 instance state for " + computer.getName(), e);
LOGGER.log(
Level.WARNING, "[JENKINS-76200] Error in attemptReconnectIfOffline for " + computer.getName(), e);
}
}

/*
* Checks if there are any items in the queue that can run on this node, whether they are
* assigned to it explicitly or merely match its labels.
* This prevents a node from being taken offline while there are jobs (including Ivy/Maven
* modules) waiting that could use it.
* Need to check entire queue as some modules may be blocked by upstream dependencies.
* Accessing the queue in this way can block other threads, so only perform this check just prior
* to timing out the slave.
* JENKINS-76200: Check label matching, not just explicit node assignment.
*/
private boolean itemsInQueueForThisSlave(EC2Computer c) {
final EC2AbstractSlave selfNode = c.getNode();
Expand All @@ -285,16 +358,19 @@ private boolean itemsInQueueForThisSlave(EC2Computer c) {
if (selfNode == null) {
return false;
}
final Label selfLabel = selfNode.getSelfLabel();
Queue.Item[] items = Jenkins.get().getQueue().getItems();
for (Queue.Item item : items) {
final Label assignedLabel = item.getAssignedLabel();
if (assignedLabel == selfLabel) {
LOGGER.fine("Preventing idle timeout of " + c.getName()
+ " as there is at least one item in the queue explicitly waiting for this slave");
// JENKINS-76200: Check if this node can execute the job based on label matching
// Jobs with no label requirement (null) can run on any node
// Jobs with labels can run on nodes that match those labels
if (assignedLabel == null || assignedLabel.contains(selfNode)) {
LOGGER.fine("[JENKINS-76200] Found queued job that can run on " + c.getName()
+ " (job label: " + assignedLabel + ")");
return true;
}
}
LOGGER.fine("[JENKINS-76200] No queued jobs found that can run on " + c.getName());
return false;
}

Expand All @@ -307,6 +383,8 @@ private boolean itemsInQueueForThisSlave(EC2Computer c) {
*/
@Override
public void start(EC2Computer c) {
LOGGER.info("[JENKINS-76200] start() called for " + c.getName() + " (instance: " + c.getInstanceId() + ")");

// Jenkins is in the process of starting up
if (Jenkins.get().getInitLevel() != InitMilestone.COMPLETED) {
InstanceState state = null;
Expand All @@ -315,6 +393,7 @@ public void start(EC2Computer c) {
} catch (SdkException | InterruptedException e) {
LOGGER.log(Level.FINE, "Error getting EC2 instance state for " + c.getName(), e);
}
LOGGER.info("[JENKINS-76200] During Jenkins startup - instance state: " + state);
if (!(InstanceState.PENDING.equals(state) || InstanceState.RUNNING.equals(state))) {
LOGGER.info("Ignoring start request for " + c.getName()
+ " during Jenkins startup due to EC2 instance state of " + state);
Expand Down
31 changes: 28 additions & 3 deletions src/main/java/hudson/plugins/ec2/NoDelayProvisionerStrategy.java
Original file line number Diff line number Diff line change
Expand Up @@ -90,15 +90,19 @@ public NodeProvisioner.StrategyDecision apply(NodeProvisioner.StrategyState stra
* Counts executors in EC2 nodes that have been provisioned (exist in Jenkins) but are NOT yet counted in the
* LoadStatistics snapshot. This specifically targets the gap where nodes exist but are:
* - Offline (just added to Jenkins, before connecting starts)
* - Instance is PENDING or RUNNING in AWS (will come online soon)
*
* We explicitly DO NOT count:
* - Connecting nodes (already in snapshot.getConnectingExecutors())
* - Online nodes (already in snapshot.getAvailableExecutors() or busy executors)
* - STOPPED instances (won't come online without explicit start action)
*
* This prevents over-provisioning by accounting for nodes in the critical gap between:
* 1) Node added to Jenkins (after PlannedNode future completes)
* 2) Node starts connecting (shows up in snapshot.getConnectingExecutor())
*
* JENKINS-76200: Exclude STOPPED instances - they won't come online on their own.
*
* @param label the label to match, or null for unlabeled nodes
* @return the number of executors from provisioned EC2 nodes in the offline->connecting gap
*/
Expand All @@ -113,6 +117,7 @@ int countProvisionedButNotExecutingNodes(Label label) {
int offlineNodes = 0;
int connectingNodes = 0;
int onlineNodes = 0;
int stoppedNodes = 0;

for (Node node : nodes) {
// Only count EC2 nodes
Expand All @@ -136,16 +141,36 @@ int countProvisionedButNotExecutingNodes(Label label) {
}

// Only count nodes that are OFFLINE (not connecting, not online)
// These are in the gap between being added to Jenkins and starting to connect
// and not STOPPED in AWS (won't come online without explicit start)
if (computer.isOffline() && !computer.isConnecting()) {
// JENKINS-76200: Check if instance is STOPPED in AWS
if (computer instanceof EC2Computer ec2Computer) {
try {
InstanceState state = ec2Computer.getState();
if (state == InstanceState.STOPPED || state == InstanceState.STOPPING) {
stoppedNodes++;
LOGGER.log(
Level.FINE,
"Excluding STOPPED instance {0} from available capacity",
ec2Computer.getInstanceId());
continue; // Don't count stopped instances
}
} catch (Exception e) {
LOGGER.log(
Level.FINE,
"Could not get state for " + ec2Computer.getName() + ", counting as available",
e);
// If we can't determine state, count it to avoid over-provisioning
}
}
count += node.getNumExecutors();
}
}

LOGGER.log(
Level.FINER,
"EC2 nodes for label {0}: total={1}, offline={2}, connecting={3}, online={4}",
new Object[] {label, totalEC2Nodes, offlineNodes, connectingNodes, onlineNodes});
"EC2 nodes for label {0}: total={1}, offline={2}, connecting={3}, online={4}, stopped={5}",
new Object[] {label, totalEC2Nodes, offlineNodes, connectingNodes, onlineNodes, stoppedNodes});

return count;
}
Expand Down