Skip to content

Commit 982136e

Browse files
authored
Merge pull request #300 from buildkite/fix/dangling-detection-zero-agents-v2
2 parents 1c1cfad + ad91093 commit 982136e

2 files changed

Lines changed: 160 additions & 1 deletion

File tree

scaler/scaler.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -274,6 +274,24 @@ func (s *Scaler) Run(ctx context.Context) (time.Duration, error) {
274274
return metrics.PollDuration, s.scaleIn(ctx, desired, asg)
275275
}
276276

277+
// In Elastic CI mode, detect dangling instances: we have running instances but zero agents reporting.
278+
// This can happen when the buildkite-agent process dies on the instance (e.g. OOM killed) while
279+
// the EC2 instance itself remains healthy. In this case, the normal CleanupDanglingInstances check
280+
// at the top of Run() may not fire due to the minimumInstanceUptime filter, and neither scaleIn()
281+
// nor scaleOut() is called because desired == instanceCount after MaxSize capping.
282+
// We use a 10-minute uptime threshold here to ensure we don't check instances that are still
283+
// booting (typical boot-to-agent-registration takes 3-5 minutes for Elastic CI Stack instances).
284+
// This is shorter than the default 1-hour minimumInstanceUptime but long enough to avoid
285+
// prematurely killing instances that haven't finished starting up.
286+
if s.elasticCIMode && instanceCount > 0 && metrics.TotalAgents == 0 {
287+
log.Printf("🔍 [Elastic CI Mode] Detected %d running instance(s) but 0 agents reporting — checking for dangling instances", instanceCount)
288+
danglingCheckUptime := 10 * time.Minute
289+
if err := s.autoscaling.CleanupDanglingInstances(ctx, danglingCheckUptime, s.maxDanglingInstancesToCheck); err != nil {
290+
log.Printf("⚠️ [Elastic CI Mode] Failed to cleanup dangling instances: %v", err)
291+
}
292+
return metrics.PollDuration, nil
293+
}
294+
277295
log.Printf("No scaling required, currently %d actual instances (desired set to %d)",
278296
instanceCount, asg.DesiredCount)
279297
return metrics.PollDuration, nil

scaler/scaler_test.go

Lines changed: 142 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -452,6 +452,7 @@ type asgTestDriver struct {
452452
err error
453453
desiredCapacity int64
454454
actualCapacity int64 // If 0, will default to desiredCapacity
455+
maxSize int64 // If 0, defaults to 100
455456
sigTermsSent []string
456457
elasticCIMode bool
457458
danglingInstancesFound int
@@ -469,11 +470,16 @@ func (d *asgTestDriver) Describe(ctx context.Context) (AutoscaleGroupDetails, er
469470
actualCount = d.desiredCapacity
470471
}
471472

473+
maxSize := d.maxSize
474+
if maxSize == 0 {
475+
maxSize = 100
476+
}
477+
472478
return AutoscaleGroupDetails{
473479
DesiredCount: d.desiredCapacity,
474480
ActualCount: actualCount,
475481
MinSize: 0,
476-
MaxSize: 100,
482+
MaxSize: maxSize,
477483
InstanceIDs: instanceIDs,
478484
}, d.err
479485
}
@@ -642,3 +648,138 @@ func TestAvailabilityBasedScaling(t *testing.T) {
642648
})
643649
}
644650
}
651+
652+
func TestDanglingInstanceDetection(t *testing.T) {
653+
testCases := []struct {
654+
name string
655+
metrics buildkite.AgentMetrics
656+
asgDesired int64
657+
asgActual int64
658+
maxSize int64
659+
agentsPerInstance int
660+
elasticCIMode bool
661+
expectedDesiredCapacity int64
662+
expectedDanglingChecks int
663+
}{
664+
// Core scenario from SUP-5153: agents die on running instance, MaxSize=1 caps
665+
// desired to 1, scaler sees desired==actual and says "No scaling required".
666+
// With the fix, it should detect total=0 with instances running and trigger cleanup.
667+
{
668+
name: "Elastic CI detects dangling instance when agents die and MaxSize caps desired",
669+
metrics: buildkite.AgentMetrics{
670+
ScheduledJobs: 4,
671+
RunningJobs: 0,
672+
WaitingJobs: 7,
673+
TotalAgents: 0,
674+
},
675+
asgDesired: 1,
676+
asgActual: 1,
677+
maxSize: 1,
678+
agentsPerInstance: 6,
679+
elasticCIMode: true,
680+
expectedDesiredCapacity: 1, // capacity unchanged — cleanup handles recovery
681+
expectedDanglingChecks: 1,
682+
},
683+
// Same scenario but without Elastic CI Mode — should NOT trigger dangling check.
684+
{
685+
name: "Non-Elastic CI mode does not trigger dangling check",
686+
metrics: buildkite.AgentMetrics{
687+
ScheduledJobs: 4,
688+
RunningJobs: 0,
689+
WaitingJobs: 7,
690+
TotalAgents: 0,
691+
},
692+
asgDesired: 1,
693+
asgActual: 1,
694+
maxSize: 1,
695+
agentsPerInstance: 6,
696+
elasticCIMode: false,
697+
expectedDesiredCapacity: 1,
698+
expectedDanglingChecks: 0,
699+
},
700+
// When there are zero instances AND zero agents (normal idle state),
701+
// should NOT trigger dangling check — there's nothing dangling.
702+
{
703+
name: "No dangling check when zero instances and zero agents",
704+
metrics: buildkite.AgentMetrics{
705+
ScheduledJobs: 0,
706+
RunningJobs: 0,
707+
TotalAgents: 0,
708+
},
709+
asgDesired: 0,
710+
asgActual: 0,
711+
maxSize: 1,
712+
agentsPerInstance: 6,
713+
elasticCIMode: true,
714+
expectedDesiredCapacity: 0,
715+
expectedDanglingChecks: 0,
716+
},
717+
// When agents are healthy (total > 0), no dangling check needed.
718+
{
719+
name: "No dangling check when agents are healthy",
720+
metrics: buildkite.AgentMetrics{
721+
ScheduledJobs: 0,
722+
RunningJobs: 0,
723+
TotalAgents: 6,
724+
},
725+
asgDesired: 1,
726+
asgActual: 1,
727+
maxSize: 1,
728+
agentsPerInstance: 6,
729+
elasticCIMode: true,
730+
expectedDesiredCapacity: 0, // scales in to 0 (idle agents, no jobs)
731+
expectedDanglingChecks: 0,
732+
},
733+
// Multiple instances with zero agents — should still trigger dangling check.
734+
{
735+
name: "Elastic CI detects dangling with multiple instances and zero agents",
736+
metrics: buildkite.AgentMetrics{
737+
ScheduledJobs: 10,
738+
RunningJobs: 0,
739+
TotalAgents: 0,
740+
},
741+
asgDesired: 3,
742+
asgActual: 3,
743+
maxSize: 3,
744+
agentsPerInstance: 4,
745+
elasticCIMode: true,
746+
expectedDesiredCapacity: 3, // capacity unchanged — cleanup handles recovery
747+
expectedDanglingChecks: 1,
748+
},
749+
}
750+
751+
for _, tc := range testCases {
752+
t.Run(tc.name, func(t *testing.T) {
753+
asg := &asgTestDriver{
754+
desiredCapacity: tc.asgDesired,
755+
actualCapacity: tc.asgActual,
756+
maxSize: tc.maxSize,
757+
}
758+
759+
s := Scaler{
760+
autoscaling: asg,
761+
bk: &buildkiteTestDriver{metrics: tc.metrics},
762+
elasticCIMode: tc.elasticCIMode,
763+
scaling: ScalingCalculator{
764+
includeWaiting: true,
765+
agentsPerInstance: tc.agentsPerInstance,
766+
},
767+
}
768+
769+
_, err := s.Run(context.Background())
770+
if err != nil {
771+
t.Fatalf("Unexpected error: %v", err)
772+
}
773+
774+
if asg.desiredCapacity != tc.expectedDesiredCapacity {
775+
t.Errorf("Expected desired capacity: %d, got: %d",
776+
tc.expectedDesiredCapacity, asg.desiredCapacity)
777+
}
778+
779+
if asg.danglingInstancesFound != tc.expectedDanglingChecks {
780+
t.Errorf("Expected %d dangling instance check(s), got: %d",
781+
tc.expectedDanglingChecks, asg.danglingInstancesFound)
782+
}
783+
})
784+
}
785+
}

0 commit comments

Comments
 (0)