Skip to content

Commit 18a9e14

Browse files
committed
Graceful scale-in: send SIGTERM only to healthy Linux instances
1 parent 95fc49c commit 18a9e14

File tree

3 files changed

+57
-77
lines changed

3 files changed

+57
-77
lines changed

scaler/asg.go

Lines changed: 50 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -25,12 +25,13 @@ const (
2525
)
2626

2727
type AutoscaleGroupDetails struct {
28-
Pending int64
29-
DesiredCount int64
30-
MinSize int64
31-
MaxSize int64
32-
InstanceIDs []string // Instance IDs in the ASG
33-
ActualCount int64 // Actual number of running instances
28+
Pending int64
29+
DesiredCount int64
30+
MinSize int64
31+
MaxSize int64
32+
InstanceIDs []string // All instance IDs in the ASG
33+
HealthyInstanceIDs []string // Instance IDs that are healthy (for graceful termination)
34+
ActualCount int64 // Actual number of running instances
3435
}
3536

3637
type ASGDriver struct {
@@ -84,27 +85,22 @@ func (a *ASGDriver) getASGPlatform(ctx context.Context, instances []ec2Types.Ins
8485
return "linux"
8586
}
8687

87-
// getCheckCommand returns the appropriate check command for the platform
88+
// getCheckCommand returns the appropriate check command for the platform.
89+
// It checks if the buildkite-agent service is running.
8890
func (a *ASGDriver) getCheckCommand(platform string) string {
8991
if platform == "windows" {
9092
return `
9193
$AgentStatus = nssm status buildkite-agent 2>&1
9294
if ($AgentStatus -match "SERVICE_RUNNING") {
9395
Write-Output "RUNNING"
9496
} else {
95-
Write-Output "NOT_RUNNING"
97+
Write-Output "NOT_RUNNING: $AgentStatus"
9698
}
9799
`
98100
}
99101

100102
// Default to Linux
101103
return `#!/bin/bash
102-
# Linux check command
103-
if [ -f /tmp/buildkite-agent-termination-marker ]; then
104-
echo "MARKER_EXISTS: Instance is already marked for termination"
105-
exit 0
106-
fi
107-
108104
ACTIVE_STATE=$(systemctl show buildkite-agent -p ActiveState | cut -d= -f2)
109105
case "$ACTIVE_STATE" in
110106
"active"|"activating") echo "RUNNING" ;;
@@ -271,19 +267,26 @@ func (a *ASGDriver) Describe(ctx context.Context) (AutoscaleGroupDetails, error)
271267
}
272268

273269
instanceIDs := make([]string, 0, len(asg.Instances))
270+
healthyInstanceIDs := make([]string, 0, len(asg.Instances))
274271
for _, instance := range asg.Instances {
275272
if instance.InstanceId != nil {
276273
instanceIDs = append(instanceIDs, *instance.InstanceId)
274+
// Only include instances that are healthy and in-service for graceful termination
275+
if instance.HealthStatus != nil && *instance.HealthStatus == "Healthy" &&
276+
instance.LifecycleState == types.LifecycleStateInService {
277+
healthyInstanceIDs = append(healthyInstanceIDs, *instance.InstanceId)
278+
}
277279
}
278280
}
279281

280282
details := AutoscaleGroupDetails{
281-
Pending: pending,
282-
DesiredCount: int64(*result.AutoScalingGroups[0].DesiredCapacity),
283-
MinSize: int64(*result.AutoScalingGroups[0].MinSize),
284-
MaxSize: int64(*result.AutoScalingGroups[0].MaxSize),
285-
InstanceIDs: instanceIDs,
286-
ActualCount: running,
283+
Pending: pending,
284+
DesiredCount: int64(*result.AutoScalingGroups[0].DesiredCapacity),
285+
MinSize: int64(*result.AutoScalingGroups[0].MinSize),
286+
MaxSize: int64(*result.AutoScalingGroups[0].MaxSize),
287+
InstanceIDs: instanceIDs,
288+
HealthyInstanceIDs: healthyInstanceIDs,
289+
ActualCount: running,
287290
}
288291

289292
log.Printf("↳ Got pending=%d, desired=%d, actual=%d, min=%d, max=%d (took %v)",
@@ -370,70 +373,47 @@ type dryRunASG struct {
370373

371374
func (a *ASGDriver) SendSIGTERMToAgents(ctx context.Context, instanceID string) error {
372375
ssmClient := ssm.NewFromConfig(a.Cfg)
376+
ec2Client := ec2.NewFromConfig(a.Cfg)
373377

374-
// Wait for SSM agent to be ready before sending command
375-
if err := a.waitForSSMReady(ctx, instanceID, 30*time.Second); err != nil {
376-
log.Printf("SSM agent not ready on instance %s, cannot send SIGTERM: %v", instanceID, err)
377-
return err
378-
}
378+
// Detect platform for EC2 instance
379+
platform := "linux"
380+
documentName := "AWS-RunShellScript"
381+
stopCommand := "sudo systemctl stop buildkite-agent.service"
379382

380-
// First check if this instance already has a termination marker.
381-
// This makes SIGTERM idempotent: even if we're called multiple times
382-
// across different Lambda invocations, we only send one SIGTERM.
383-
checkMarkerCommand := `#!/bin/bash
384-
if [ -f /tmp/buildkite-agent-termination-marker ]; then
385-
echo "MARKER_EXISTS"
386-
cat /tmp/buildkite-agent-termination-marker
387-
exit 0
388-
else
389-
echo "NO_MARKER"
390-
exit 1
391-
fi
392-
`
393-
checkOutput, err := ssmClient.SendCommand(ctx, &ssm.SendCommandInput{
394-
InstanceIds: []string{instanceID},
395-
DocumentName: aws.String("AWS-RunShellScript"),
396-
Parameters: map[string][]string{"commands": {checkMarkerCommand}},
397-
Comment: aws.String("Check if instance already has termination marker"),
383+
descResp, descErr := ec2Client.DescribeInstances(ctx, &ec2.DescribeInstancesInput{
384+
InstanceIds: []string{instanceID},
398385
})
399-
if err != nil {
400-
log.Printf("[Elastic CI Mode] Warning: Could not check termination marker on %s: %v", instanceID, err)
401-
} else {
402-
time.Sleep(2 * time.Second)
403-
404-
checkResult, getErr := ssmClient.GetCommandInvocation(ctx, &ssm.GetCommandInvocationInput{
405-
CommandId: checkOutput.Command.CommandId,
406-
InstanceId: aws.String(instanceID),
407-
})
408-
if getErr == nil && checkResult.Status == ssmTypes.CommandInvocationStatusSuccess &&
409-
checkResult.StandardOutputContent != nil && strings.Contains(*checkResult.StandardOutputContent, "MARKER_EXISTS") {
410-
log.Printf("[Elastic CI Mode] Instance %s already received SIGTERM (marker exists), skipping", instanceID)
411-
return nil
386+
if descErr != nil {
387+
log.Printf("[Elastic CI Mode] Warning: Failed to detect platform for %s, defaulting to Linux: %v", instanceID, descErr)
388+
} else if len(descResp.Reservations) > 0 && len(descResp.Reservations[0].Instances) > 0 {
389+
instance := descResp.Reservations[0].Instances[0]
390+
if strings.EqualFold(string(instance.Platform), "windows") {
391+
platform = "windows"
392+
documentName = "AWS-RunPowerShellScript"
393+
stopCommand = "nssm stop buildkite-agent"
412394
}
413395
}
414396

415-
// Command that creates a marker file (for idempotency) and then stops the agent gracefully.
416-
// The marker file is checked by CleanupDanglingInstances to avoid treating this instance
417-
// as "dangling" while it's still in the process of shutting down.
418-
command := `#!/bin/bash
419-
set -euo pipefail
420-
echo "Termination requested at $(date)" > /tmp/buildkite-agent-termination-marker
421-
sudo systemctl stop buildkite-agent.service || sudo /opt/buildkite-agent/bin/buildkite-agent stop --signal SIGTERM
422-
`
423-
log.Printf("[Elastic CI Mode] Sending SIGTERM to instance %s", instanceID)
397+
// Wait for SSM agent to be ready before sending command
398+
if err := a.waitForSSMReady(ctx, instanceID, 30*time.Second); err != nil {
399+
log.Printf("[Elastic CI Mode] SSM agent not ready on %s, cannot send stop command: %v", instanceID, err)
400+
return err
401+
}
402+
403+
log.Printf("[Elastic CI Mode] Sending stop command to %s instance %s", platform, instanceID)
424404

425-
_, err = ssmClient.SendCommand(ctx, &ssm.SendCommandInput{
405+
_, err := ssmClient.SendCommand(ctx, &ssm.SendCommandInput{
426406
InstanceIds: []string{instanceID},
427-
DocumentName: aws.String("AWS-RunShellScript"),
428-
Parameters: map[string][]string{"commands": {command}},
407+
DocumentName: aws.String(documentName),
408+
Parameters: map[string][]string{"commands": {stopCommand}},
429409
Comment: aws.String("Gracefully stop Buildkite agent"),
430410
})
431411

432412
if err != nil {
433-
log.Printf("[Elastic CI Mode] Error sending SIGTERM to instance %s: %v", instanceID, err)
413+
log.Printf("[Elastic CI Mode] Error sending stop command to %s: %v", instanceID, err)
434414
return err
435415
}
436-
log.Printf("[Elastic CI Mode] Successfully sent SIGTERM command to instance %s", instanceID)
416+
log.Printf("[Elastic CI Mode] Successfully sent stop command to %s", instanceID)
437417
return nil
438418
}
439419

scaler/asg_test.go

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -90,11 +90,9 @@ func TestGetCheckCommand(t *testing.T) {
9090
"ActiveState",
9191
"RUNNING",
9292
"NOT_RUNNING",
93-
"MARKER_EXISTS",
9493
},
9594
expectedNotContains: []string{
96-
"PowerShell",
97-
"Get-Service",
95+
"nssm",
9896
},
9997
},
10098
{

scaler/scaler.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -388,11 +388,13 @@ func (s *Scaler) scaleIn(ctx context.Context, desired int64, current AutoscaleGr
388388
log.Printf("[Elastic CI Mode] Using graceful termination for %d instances", instancesToTerminate)
389389

390390
// Determine instances to terminate by sorting by launch time (oldest first)
391+
// Only consider healthy instances for graceful termination - unhealthy ones are already
392+
// being terminated by the ASG (e.g., instances marked unhealthy by CleanupDanglingInstances)
391393
maxToTerminate := instancesToTerminate
392394

393395
instancesForTermination := make([]string, 0, maxToTerminate)
394396

395-
if len(current.InstanceIDs) > 0 {
397+
if len(current.HealthyInstanceIDs) > 0 {
396398
// Define a struct to hold instance info for sorting
397399
type instanceInfo struct {
398400
ID string
@@ -401,15 +403,15 @@ func (s *Scaler) scaleIn(ctx context.Context, desired int64, current AutoscaleGr
401403

402404
ec2Svc := ec2.NewFromConfig(s.cfg)
403405

404-
instances := make([]instanceInfo, 0, len(current.InstanceIDs))
406+
instances := make([]instanceInfo, 0, len(current.HealthyInstanceIDs))
405407
describeResult, err := ec2Svc.DescribeInstances(ctx, &ec2.DescribeInstancesInput{
406-
InstanceIds: current.InstanceIDs,
408+
InstanceIds: current.HealthyInstanceIDs,
407409
})
408410

409411
if err != nil {
410412
log.Printf("[Elastic CI Mode] Warning: Could not get instance launch times: %v", err)
411413
// Fall back to unsorted if we can't get launch times
412-
instancesForTermination = current.InstanceIDs
414+
instancesForTermination = current.HealthyInstanceIDs
413415
if int64(len(instancesForTermination)) > maxToTerminate {
414416
instancesForTermination = instancesForTermination[:maxToTerminate]
415417
}

0 commit comments

Comments
 (0)