@@ -452,6 +452,7 @@ type asgTestDriver struct {
452452 err error
453453 desiredCapacity int64
454454 actualCapacity int64 // If 0, will default to desiredCapacity
455+ maxSize int64 // If 0, defaults to 100
455456 sigTermsSent []string
456457 elasticCIMode bool
457458 danglingInstancesFound int
@@ -469,11 +470,16 @@ func (d *asgTestDriver) Describe(ctx context.Context) (AutoscaleGroupDetails, er
469470 actualCount = d .desiredCapacity
470471 }
471472
473+ maxSize := d .maxSize
474+ if maxSize == 0 {
475+ maxSize = 100
476+ }
477+
472478 return AutoscaleGroupDetails {
473479 DesiredCount : d .desiredCapacity ,
474480 ActualCount : actualCount ,
475481 MinSize : 0 ,
476- MaxSize : 100 ,
482+ MaxSize : maxSize ,
477483 InstanceIDs : instanceIDs ,
478484 }, d .err
479485}
@@ -642,3 +648,138 @@ func TestAvailabilityBasedScaling(t *testing.T) {
642648 })
643649 }
644650}
651+
652+ func TestDanglingInstanceDetection (t * testing.T ) {
653+ testCases := []struct {
654+ name string
655+ metrics buildkite.AgentMetrics
656+ asgDesired int64
657+ asgActual int64
658+ maxSize int64
659+ agentsPerInstance int
660+ elasticCIMode bool
661+ expectedDesiredCapacity int64
662+ expectedDanglingChecks int
663+ }{
664+ // Core scenario from SUP-5153: agents die on running instance, MaxSize=1 caps
665+ // desired to 1, scaler sees desired==actual and says "No scaling required".
666+ // With the fix, it should detect total=0 with instances running and trigger cleanup.
667+ {
668+ name : "Elastic CI detects dangling instance when agents die and MaxSize caps desired" ,
669+ metrics : buildkite.AgentMetrics {
670+ ScheduledJobs : 4 ,
671+ RunningJobs : 0 ,
672+ WaitingJobs : 7 ,
673+ TotalAgents : 0 ,
674+ },
675+ asgDesired : 1 ,
676+ asgActual : 1 ,
677+ maxSize : 1 ,
678+ agentsPerInstance : 6 ,
679+ elasticCIMode : true ,
680+ expectedDesiredCapacity : 1 , // capacity unchanged — cleanup handles recovery
681+ expectedDanglingChecks : 1 ,
682+ },
683+ // Same scenario but without Elastic CI Mode — should NOT trigger dangling check.
684+ {
685+ name : "Non-Elastic CI mode does not trigger dangling check" ,
686+ metrics : buildkite.AgentMetrics {
687+ ScheduledJobs : 4 ,
688+ RunningJobs : 0 ,
689+ WaitingJobs : 7 ,
690+ TotalAgents : 0 ,
691+ },
692+ asgDesired : 1 ,
693+ asgActual : 1 ,
694+ maxSize : 1 ,
695+ agentsPerInstance : 6 ,
696+ elasticCIMode : false ,
697+ expectedDesiredCapacity : 1 ,
698+ expectedDanglingChecks : 0 ,
699+ },
700+ // When there are zero instances AND zero agents (normal idle state),
701+ // should NOT trigger dangling check — there's nothing dangling.
702+ {
703+ name : "No dangling check when zero instances and zero agents" ,
704+ metrics : buildkite.AgentMetrics {
705+ ScheduledJobs : 0 ,
706+ RunningJobs : 0 ,
707+ TotalAgents : 0 ,
708+ },
709+ asgDesired : 0 ,
710+ asgActual : 0 ,
711+ maxSize : 1 ,
712+ agentsPerInstance : 6 ,
713+ elasticCIMode : true ,
714+ expectedDesiredCapacity : 0 ,
715+ expectedDanglingChecks : 0 ,
716+ },
717+ // When agents are healthy (total > 0), no dangling check needed.
718+ {
719+ name : "No dangling check when agents are healthy" ,
720+ metrics : buildkite.AgentMetrics {
721+ ScheduledJobs : 0 ,
722+ RunningJobs : 0 ,
723+ TotalAgents : 6 ,
724+ },
725+ asgDesired : 1 ,
726+ asgActual : 1 ,
727+ maxSize : 1 ,
728+ agentsPerInstance : 6 ,
729+ elasticCIMode : true ,
730+ expectedDesiredCapacity : 0 , // scales in to 0 (idle agents, no jobs)
731+ expectedDanglingChecks : 0 ,
732+ },
733+ // Multiple instances with zero agents — should still trigger dangling check.
734+ {
735+ name : "Elastic CI detects dangling with multiple instances and zero agents" ,
736+ metrics : buildkite.AgentMetrics {
737+ ScheduledJobs : 10 ,
738+ RunningJobs : 0 ,
739+ TotalAgents : 0 ,
740+ },
741+ asgDesired : 3 ,
742+ asgActual : 3 ,
743+ maxSize : 3 ,
744+ agentsPerInstance : 4 ,
745+ elasticCIMode : true ,
746+ expectedDesiredCapacity : 3 , // capacity unchanged — cleanup handles recovery
747+ expectedDanglingChecks : 1 ,
748+ },
749+ }
750+
751+ for _ , tc := range testCases {
752+ t .Run (tc .name , func (t * testing.T ) {
753+ asg := & asgTestDriver {
754+ desiredCapacity : tc .asgDesired ,
755+ actualCapacity : tc .asgActual ,
756+ maxSize : tc .maxSize ,
757+ }
758+
759+ s := Scaler {
760+ autoscaling : asg ,
761+ bk : & buildkiteTestDriver {metrics : tc .metrics },
762+ elasticCIMode : tc .elasticCIMode ,
763+ scaling : ScalingCalculator {
764+ includeWaiting : true ,
765+ agentsPerInstance : tc .agentsPerInstance ,
766+ },
767+ }
768+
769+ _ , err := s .Run (context .Background ())
770+ if err != nil {
771+ t .Fatalf ("Unexpected error: %v" , err )
772+ }
773+
774+ if asg .desiredCapacity != tc .expectedDesiredCapacity {
775+ t .Errorf ("Expected desired capacity: %d, got: %d" ,
776+ tc .expectedDesiredCapacity , asg .desiredCapacity )
777+ }
778+
779+ if asg .danglingInstancesFound != tc .expectedDanglingChecks {
780+ t .Errorf ("Expected %d dangling instance check(s), got: %d" ,
781+ tc .expectedDanglingChecks , asg .danglingInstancesFound )
782+ }
783+ })
784+ }
785+ }
0 commit comments