@@ -25,12 +25,13 @@ const (
2525)
2626
2727type AutoscaleGroupDetails struct {
28- Pending int64
29- DesiredCount int64
30- MinSize int64
31- MaxSize int64
32- InstanceIDs []string // Instance IDs in the ASG
33- ActualCount int64 // Actual number of running instances
28+ Pending int64
29+ DesiredCount int64
30+ MinSize int64
31+ MaxSize int64
32+ InstanceIDs []string // All instance IDs in the ASG
33+ HealthyInstanceIDs []string // Instance IDs that are healthy (for graceful termination)
34+ ActualCount int64 // Actual number of running instances
3435}
3536
3637type ASGDriver struct {
@@ -84,27 +85,22 @@ func (a *ASGDriver) getASGPlatform(ctx context.Context, instances []ec2Types.Ins
8485 return "linux"
8586}
8687
87- // getCheckCommand returns the appropriate check command for the platform
88+ // getCheckCommand returns the appropriate check command for the platform.
89+ // It checks if the buildkite-agent service is running.
8890func (a * ASGDriver ) getCheckCommand (platform string ) string {
8991 if platform == "windows" {
9092 return `
9193$AgentStatus = nssm status buildkite-agent 2>&1
9294if ($AgentStatus -match "SERVICE_RUNNING") {
9395 Write-Output "RUNNING"
9496} else {
95- Write-Output "NOT_RUNNING"
97+ Write-Output "NOT_RUNNING: $AgentStatus "
9698}
9799`
98100 }
99101
100102 // Default to Linux
101103 return `#!/bin/bash
102- # Linux check command
103- if [ -f /tmp/buildkite-agent-termination-marker ]; then
104- echo "MARKER_EXISTS: Instance is already marked for termination"
105- exit 0
106- fi
107-
108104ACTIVE_STATE=$(systemctl show buildkite-agent -p ActiveState | cut -d= -f2)
109105case "$ACTIVE_STATE" in
110106 "active"|"activating") echo "RUNNING" ;;
@@ -271,19 +267,26 @@ func (a *ASGDriver) Describe(ctx context.Context) (AutoscaleGroupDetails, error)
271267 }
272268
273269 instanceIDs := make ([]string , 0 , len (asg .Instances ))
270+ healthyInstanceIDs := make ([]string , 0 , len (asg .Instances ))
274271 for _ , instance := range asg .Instances {
275272 if instance .InstanceId != nil {
276273 instanceIDs = append (instanceIDs , * instance .InstanceId )
274+ // Only include instances that are healthy and in-service for graceful termination
275+ if instance .HealthStatus != nil && * instance .HealthStatus == "Healthy" &&
276+ instance .LifecycleState == types .LifecycleStateInService {
277+ healthyInstanceIDs = append (healthyInstanceIDs , * instance .InstanceId )
278+ }
277279 }
278280 }
279281
280282 details := AutoscaleGroupDetails {
281- Pending : pending ,
282- DesiredCount : int64 (* result .AutoScalingGroups [0 ].DesiredCapacity ),
283- MinSize : int64 (* result .AutoScalingGroups [0 ].MinSize ),
284- MaxSize : int64 (* result .AutoScalingGroups [0 ].MaxSize ),
285- InstanceIDs : instanceIDs ,
286- ActualCount : running ,
283+ Pending : pending ,
284+ DesiredCount : int64 (* result .AutoScalingGroups [0 ].DesiredCapacity ),
285+ MinSize : int64 (* result .AutoScalingGroups [0 ].MinSize ),
286+ MaxSize : int64 (* result .AutoScalingGroups [0 ].MaxSize ),
287+ InstanceIDs : instanceIDs ,
288+ HealthyInstanceIDs : healthyInstanceIDs ,
289+ ActualCount : running ,
287290 }
288291
289292 log .Printf ("↳ Got pending=%d, desired=%d, actual=%d, min=%d, max=%d (took %v)" ,
@@ -370,70 +373,47 @@ type dryRunASG struct {
370373
371374func (a * ASGDriver ) SendSIGTERMToAgents (ctx context.Context , instanceID string ) error {
372375 ssmClient := ssm .NewFromConfig (a .Cfg )
376+ ec2Client := ec2 .NewFromConfig (a .Cfg )
373377
374- // Wait for SSM agent to be ready before sending command
375- if err := a .waitForSSMReady (ctx , instanceID , 30 * time .Second ); err != nil {
376- log .Printf ("SSM agent not ready on instance %s, cannot send SIGTERM: %v" , instanceID , err )
377- return err
378- }
378+ // Detect platform for EC2 instance
379+ platform := "linux"
380+ documentName := "AWS-RunShellScript"
381+ stopCommand := "sudo systemctl stop buildkite-agent.service"
379382
380- // First check if this instance already has a termination marker.
381- // This makes SIGTERM idempotent: even if we're called multiple times
382- // across different Lambda invocations, we only send one SIGTERM.
383- checkMarkerCommand := `#!/bin/bash
384- if [ -f /tmp/buildkite-agent-termination-marker ]; then
385- echo "MARKER_EXISTS"
386- cat /tmp/buildkite-agent-termination-marker
387- exit 0
388- else
389- echo "NO_MARKER"
390- exit 1
391- fi
392- `
393- checkOutput , err := ssmClient .SendCommand (ctx , & ssm.SendCommandInput {
394- InstanceIds : []string {instanceID },
395- DocumentName : aws .String ("AWS-RunShellScript" ),
396- Parameters : map [string ][]string {"commands" : {checkMarkerCommand }},
397- Comment : aws .String ("Check if instance already has termination marker" ),
383+ descResp , descErr := ec2Client .DescribeInstances (ctx , & ec2.DescribeInstancesInput {
384+ InstanceIds : []string {instanceID },
398385 })
399- if err != nil {
400- log .Printf ("[Elastic CI Mode] Warning: Could not check termination marker on %s: %v" , instanceID , err )
401- } else {
402- time .Sleep (2 * time .Second )
403-
404- checkResult , getErr := ssmClient .GetCommandInvocation (ctx , & ssm.GetCommandInvocationInput {
405- CommandId : checkOutput .Command .CommandId ,
406- InstanceId : aws .String (instanceID ),
407- })
408- if getErr == nil && checkResult .Status == ssmTypes .CommandInvocationStatusSuccess &&
409- checkResult .StandardOutputContent != nil && strings .Contains (* checkResult .StandardOutputContent , "MARKER_EXISTS" ) {
410- log .Printf ("[Elastic CI Mode] Instance %s already received SIGTERM (marker exists), skipping" , instanceID )
411- return nil
386+ if descErr != nil {
387+ log .Printf ("[Elastic CI Mode] Warning: Failed to detect platform for %s, defaulting to Linux: %v" , instanceID , descErr )
388+ } else if len (descResp .Reservations ) > 0 && len (descResp .Reservations [0 ].Instances ) > 0 {
389+ instance := descResp .Reservations [0 ].Instances [0 ]
390+ if strings .EqualFold (string (instance .Platform ), "windows" ) {
391+ platform = "windows"
392+ documentName = "AWS-RunPowerShellScript"
393+ stopCommand = "nssm stop buildkite-agent"
412394 }
413395 }
414396
415- // Command that creates a marker file (for idempotency) and then stops the agent gracefully.
416- // The marker file is checked by CleanupDanglingInstances to avoid treating this instance
417- // as "dangling" while it's still in the process of shutting down.
418- command := `#!/bin/bash
419- set -euo pipefail
420- echo "Termination requested at $(date)" > /tmp/buildkite-agent-termination-marker
421- sudo systemctl stop buildkite-agent.service || sudo /opt/buildkite-agent/bin/buildkite-agent stop --signal SIGTERM
422- `
423- log .Printf ("[Elastic CI Mode] Sending SIGTERM to instance %s" , instanceID )
397+ // Wait for SSM agent to be ready before sending command
398+ if err := a .waitForSSMReady (ctx , instanceID , 30 * time .Second ); err != nil {
399+ log .Printf ("[Elastic CI Mode] SSM agent not ready on %s, cannot send stop command: %v" , instanceID , err )
400+ return err
401+ }
402+
403+ log .Printf ("[Elastic CI Mode] Sending stop command to %s instance %s" , platform , instanceID )
424404
425- _ , err = ssmClient .SendCommand (ctx , & ssm.SendCommandInput {
405+ _ , err : = ssmClient .SendCommand (ctx , & ssm.SendCommandInput {
426406 InstanceIds : []string {instanceID },
427- DocumentName : aws .String ("AWS-RunShellScript" ),
428- Parameters : map [string ][]string {"commands" : {command }},
407+ DocumentName : aws .String (documentName ),
408+ Parameters : map [string ][]string {"commands" : {stopCommand }},
429409 Comment : aws .String ("Gracefully stop Buildkite agent" ),
430410 })
431411
432412 if err != nil {
433- log .Printf ("[Elastic CI Mode] Error sending SIGTERM to instance %s: %v" , instanceID , err )
413+ log .Printf ("[Elastic CI Mode] Error sending stop command to %s: %v" , instanceID , err )
434414 return err
435415 }
436- log .Printf ("[Elastic CI Mode] Successfully sent SIGTERM command to instance %s" , instanceID )
416+ log .Printf ("[Elastic CI Mode] Successfully sent stop command to %s" , instanceID )
437417 return nil
438418}
439419
0 commit comments