Fix availability-based scaling and simplify threshold logic (#257)

scadu · web-flow · commit 9a1d0237d665 · 2025-11-19T15:33:22.000+01:00
diff --git a/README.md b/README.md
@@ -23,7 +23,28 @@ wanted more granular control over:
 The lambda (or cli version) polls the Buildkite Metrics API every 10 seconds, and based on the
 results sets the `DesiredCount` to exactly what is needed. This allows much faster scale up.
 
+## Configuration
+
+### Availability-based scaling
+
+The scaler monitors agent availability to handle situations where EC2 instances are healthy but Buildkite agents aren't connecting. This can happen due to network issues, agent configuration problems, or instance startup delays.
+
+**`AVAILABILITY_THRESHOLD`** (default: `0.5`)
+
+When jobs are queued, the scaler checks whether the fraction of expected agents that are actually connected meets this threshold. For example, with 4 agents per instance and 2 instances running (8 expected agents), if only 3 agents are online, availability is 3/8 = 0.375 (37.5%), which is below the default threshold of `0.5`.
+
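+When availability drops below the threshold and the ASG has converged (actual instances match desired), the scaler raises the desired capacity to at least one instance above the current count to help recover availability. If job-based scaling already calls for more instances than that, the larger value is used.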
+
+Set `AVAILABILITY_THRESHOLD=0` to disable availability-based scaling. The scaler will then scale based only on job count.
+
+**Threshold tuning:**
+
+* **Lower threshold (e.g., 0.3)**: Tolerates slower agent connection times, reduces instance churn
+* **Higher threshold (e.g., 0.8)**: Aggressive scaling to maintain high availability when agents are expected to connect quickly
+* **Disabled (0)**: Job-based scaling only, suitable when agents connect reliably
+
 ## Gracefully scaling in
+
 :construction: For [Elastic CI Stack][], there's now available a dedicated and experimental mode configured with `ELASTIC_CI_MODE` variable. You can read more about it [in here](./docs/elastic_ci_mode.md). :construction:
 ___
 
@@ -55,17 +76,17 @@ of the metrics that the [buildkite-agent-metrics][] binary collects:
 An AWS Lambda bundle is created and published as part of the build process. The lambda will require
 the following IAM permissions:
 
-- `cloudwatch:PutMetricData`
-- `autoscaling:DescribeAutoScalingGroups`
-- `autoscaling:DescribeScalingActivities`
-- `autoscaling:SetDesiredCapacity`
+* `cloudwatch:PutMetricData`
+* `autoscaling:DescribeAutoScalingGroups`
+* `autoscaling:DescribeScalingActivities`
+* `autoscaling:SetDesiredCapacity`
 
 Its handler is `bootstrap`, it uses a `provided.al2` runtime and requires the following env vars:
 
-- `BUILDKITE_AGENT_TOKEN` or `BUILDKITE_AGENT_TOKEN_SSM_KEY`
-- `BUILDKITE_QUEUE`
-- `AGENTS_PER_INSTANCE`
-- `ASG_NAME`
+* `BUILDKITE_AGENT_TOKEN` or `BUILDKITE_AGENT_TOKEN_SSM_KEY`
+* `BUILDKITE_QUEUE`
+* `AGENTS_PER_INSTANCE`
+* `ASG_NAME`
 
 If `BUILDKITE_AGENT_TOKEN_SSM_KEY` is set, the token will be read from
 [AWS Systems Manager Parameter Store GetParameter](https://docs.aws.amazon.com/systems-manager/latest/APIReference/API_GetParameter.html)
@@ -82,8 +103,9 @@ aws lambda create-function \
 ```
 
 ## Development
+
 This project uses [mise](https://mise.jdx.dev/) to manage development tooling ensuring all the tooling needed is installed with one step, and in expected versions.
-To install mise, execute [./bin/mise](./bin/mise) bootstrap script or follow [mise documentation](https://mise.jdx.dev/installing-mise.html). 
+To install mise, execute [./bin/mise](./bin/mise) bootstrap script or follow [mise documentation](https://mise.jdx.dev/installing-mise.html).
 Run `mise install` to install all the required tooling defined in [mise.toml](./mise.toml).
 
 ### Running agent-scaler locally
@@ -103,7 +125,6 @@ The scaler is set up automatically by the [Elastic CI Stack][]'s CloudFormation
 reference the agent token and a queue name. A Lambda function running the scaler is then generated
 using these references (e.g., `BUILDKITE_AGENT_TOKEN_SSM_KEY` and `BUILDKITE_QUEUE`).
 
-
 ## Copyright
 
 Copyright (c) 2014-2019 Buildkite Pty Ltd. See [LICENSE](./LICENSE.txt) for details.
diff --git a/docs/elastic_ci_mode.md b/docs/elastic_ci_mode.md
@@ -91,9 +91,14 @@ Job dispatch delays, other issues related to job processing.
 ```
 
 ## Configuration Parameters
+
+### Availability Monitoring (applies to all modes)
+- `AVAILABILITY_THRESHOLD`: Minimum fraction of expected agents that must be online; availability below this value triggers a scale-out (default `0.5`, i.e. 50%: with 4 agents per instance and one instance running, we scale out when fewer than 2 agents are online). Set to `0` to disable availability-based scaling.
+
+### Elastic CI Mode Specific
+**Note:** Unless stated otherwise, the following settings apply **only when `ELASTIC_CI_MODE=true`**.
+
 - `ELASTIC_CI_MODE`: Enable enhanced safety features, only for [Elastic CI Stack](https://github.com/buildkite/elastic-ci-stack-for-aws)! (boolean)
-- `AVAILABILITY_THRESHOLD`: Minimum agent availability percentage (default 90%)
-- `MIN_AGENTS_PERCENTAGE`: Minimum acceptable percentage of expected agents — ratio of desired agents number to actual (default 50%, i.e. we tolerate 2 agent instances running on a single EC2 out of desired 4)
 - `DANGLING_CHECK_MINIMUM_INSTANCE_UPTIME`: Minimum instance uptime before checking for dangling instances (default 1h)
 - `MAX_DANGLING_INSTANCES_TO_CHECK`: Maximum number of instances to scan for dangling detection (default 5)
-- `SCALE_IN_COOLDOWN_PERIOD`: Time to wait between scale-in operations (default 1h)
+- `SCALE_IN_COOLDOWN_PERIOD`: Time to wait between scale-in operations (default 1h for Elastic CI Mode, 0 otherwise)
diff --git a/lambda/main.go b/lambda/main.go
@@ -74,12 +74,10 @@ func Handler(ctx context.Context, evt json.RawMessage) (string, error) {
 	includeWaiting := EnvBool("INCLUDE_WAITING")
 	instanceBuffer := EnvInt("INSTANCE_BUFFER", 0)
 	maxDescribeScalingActivitiesPages := EnvInt("MAX_DESCRIBE_SCALING_ACTIVITIES_PAGES", -1)
-	// Below settings only applicable when elasticCIMode is enabled!
-	availabilityThreshold := EnvFloat("AVAILABILITY_THRESHOLD")   // Default to 90% in scaling calculator
-	minAgentsPercentage := EnvFloat("MIN_AGENTS_PERCENTAGE", 0.5) // Default to 50% in scaling calculator
+	availabilityThreshold := EnvFloat("AVAILABILITY_THRESHOLD", 0.5) // Default to 50%
+	// Below settings only applicable when elasticCIMode is enabled
 	minimumInstanceUptime := EnvDuration("DANGLING_CHECK_MINIMUM_INSTANCE_UPTIME", 1*time.Hour)
 	maxDanglingInstancesToCheck := EnvInt("MAX_DANGLING_INSTANCES_TO_CHECK", 5) // Maximum number of instances to check for dangling instances (only used for dangling instance scanning, not for normal scale-in)
-	// Above settings only applicable when elasticCIMode is enabled!
 
 	publishCloudWatchMetrics := EnvBool("CLOUDWATCH_METRICS")
 	if publishCloudWatchMetrics {
@@ -189,7 +187,6 @@ func Handler(ctx context.Context, evt json.RawMessage) (string, error) {
 		ScaleOnlyAfterAllEvent:      scaleOnlyAfterAllEvent,
 		PublishCloudWatchMetrics:    publishCloudWatchMetrics,
 		AvailabilityThreshold:       availabilityThreshold,
-		MinAgentsPercentage:         minAgentsPercentage,
 		ElasticCIMode:               elasticCIMode,
 		MinimumInstanceUptime:       minimumInstanceUptime,
 		MaxDanglingInstancesToCheck: maxDanglingInstancesToCheck,
diff --git a/scaler/scaler.go b/scaler/scaler.go
@@ -33,8 +33,7 @@ type Params struct {
 	ScaleOutParams              ScaleParams
 	InstanceBuffer              int
 	ScaleOnlyAfterAllEvent      bool
-	AvailabilityThreshold       float64       // Threshold for agent availability
-	MinAgentsPercentage         float64       // Minimum acceptable percentage of expected agents
+	AvailabilityThreshold       float64       // Minimum fraction of expected agents online before scaling out (default 0.5, 0 disables; all modes)
 	ASGActivityCooldown         time.Duration // How long to wait after an ASG activity before scaling again
 	ElasticCIMode               bool          // Special mode for Elastic CI Stack with additional safety checks
 	MinimumInstanceUptime       time.Duration // How long instance should be online before being eligible for dangling instance check
@@ -88,7 +87,6 @@ func NewScaler(client *buildkite.Client, cfg aws.Config, params Params) (*Scaler
 		includeWaiting:        params.IncludeWaiting,
 		agentsPerInstance:     params.AgentsPerInstance,
 		availabilityThreshold: params.AvailabilityThreshold,
-		minAgentsPercentage:   params.MinAgentsPercentage,
 		elasticCIMode:         params.ElasticCIMode,
 	}
 
@@ -201,7 +199,9 @@ func (s *Scaler) Run(ctx context.Context) (time.Duration, error) {
 			proportionalBuffer = int64(s.instanceBuffer)
 		}
 
-		log.Printf("↳ 🧮 Adding proportional instance buffer: %d (based on %d total jobs)", proportionalBuffer, totalJobs)
+		if proportionalBuffer > 0 {
+			log.Printf("↳ 🧮 Adding proportional instance buffer: %d (based on %d total jobs)", proportionalBuffer, totalJobs)
+		}
 		desired += proportionalBuffer
 	}
 
diff --git a/scaler/scaler_test.go b/scaler/scaler_test.go
@@ -268,7 +268,7 @@ func TestScalingOutWithoutError(t *testing.T) {
 				scaling: ScalingCalculator{
 					includeWaiting:        tc.params.IncludeWaiting,
 					agentsPerInstance:     tc.params.AgentsPerInstance,
-					availabilityThreshold: 0.0, // Disable availability threshold for tests
+					availabilityThreshold: 0, // Disable availability threshold for tests
 				},
 			}
 
@@ -417,7 +417,7 @@ func TestScalingInWithoutError(t *testing.T) {
 				scaling: ScalingCalculator{
 					includeWaiting:        tc.params.IncludeWaiting,
 					agentsPerInstance:     tc.params.AgentsPerInstance,
-					availabilityThreshold: 0.0, // Disable availability threshold for tests
+					availabilityThreshold: 0, // Disable availability threshold for tests
 				},
 				scaleInParams:          tc.params.ScaleInParams,
 				scaleOutParams:         tc.params.ScaleOutParams,
@@ -451,6 +451,7 @@ func (d *buildkiteTestDriver) GetAgentMetrics(ctx context.Context) (buildkite.Ag
 type asgTestDriver struct {
 	err                    error
 	desiredCapacity        int64
+	actualCapacity         int64 // If 0, will default to desiredCapacity
 	sigTermsSent           []string
 	elasticCIMode          bool
 	danglingInstancesFound int
@@ -463,8 +464,14 @@ func (d *asgTestDriver) Describe(ctx context.Context) (AutoscaleGroupDetails, er
 		instanceIDs[i] = fmt.Sprintf("i-%012d", i)
 	}
 
+	actualCount := d.actualCapacity
+	if actualCount == 0 {
+		actualCount = d.desiredCapacity
+	}
+
 	return AutoscaleGroupDetails{
 		DesiredCount: d.desiredCapacity,
+		ActualCount:  actualCount,
 		MinSize:      0,
 		MaxSize:      100,
 		InstanceIDs:  instanceIDs,
@@ -488,3 +495,150 @@ func (d *asgTestDriver) CleanupDanglingInstances(ctx context.Context, minimumIns
 	d.danglingInstancesFound++
 	return d.err
 }
+
+func TestAvailabilityBasedScaling(t *testing.T) {
+	testCases := []struct {
+		name                    string
+		metrics                 buildkite.AgentMetrics
+		asgDesired              int64
+		asgActual               int64
+		agentsPerInstance       int
+		availabilityThreshold   float64
+		expectedDesiredCapacity int64
+	}{
+		// With 2 instances @ 4 agents each = 8 expected, but only 3 online (37.5%).
+		// Should scale from 2 to 3 instances when ASG has converged.
+		{
+			name: "Low availability triggers scale-out when ASG converged",
+			metrics: buildkite.AgentMetrics{
+				ScheduledJobs: 5,
+				RunningJobs:   2,
+				TotalAgents:   3,
+			},
+			asgDesired:              2,
+			asgActual:               2,
+			agentsPerInstance:       4,
+			availabilityThreshold:   0.5,
+			expectedDesiredCapacity: 3,
+		},
+		// ASG not converged (actual 1 != desired 2), should wait for convergence
+		// before applying availability-based scaling.
+		{
+			name: "Low availability does not trigger when ASG still converging",
+			metrics: buildkite.AgentMetrics{
+				ScheduledJobs: 5,
+				RunningJobs:   2,
+				TotalAgents:   3,
+			},
+			asgDesired:              2,
+			asgActual:               1,
+			agentsPerInstance:       4,
+			availabilityThreshold:   0.5,
+			expectedDesiredCapacity: 2,
+		},
+		// 7 out of 8 expected agents (87.5% availability) is above 50% threshold.
+		// No scale-out needed.
+		{
+			name: "Good availability does not trigger scale-out",
+			metrics: buildkite.AgentMetrics{
+				ScheduledJobs: 5,
+				RunningJobs:   2,
+				TotalAgents:   7,
+			},
+			asgDesired:              2,
+			asgActual:               2,
+			agentsPerInstance:       4,
+			availabilityThreshold:   0.5,
+			expectedDesiredCapacity: 2,
+		},
+		// Threshold set to 0 disables availability-based scaling.
+		// No scale-out despite only 2 out of 8 agents online (25%).
+		{
+			name: "Availability threshold disabled (0) does not trigger",
+			metrics: buildkite.AgentMetrics{
+				ScheduledJobs: 5,
+				RunningJobs:   2,
+				TotalAgents:   2,
+			},
+			asgDesired:              2,
+			asgActual:               2,
+			agentsPerInstance:       4,
+			availabilityThreshold:   0,
+			expectedDesiredCapacity: 2,
+		},
+		// With 0 instances, job-based scaling takes over.
+		// Need 2 instances for 5 jobs (at 4 agents per instance).
+		{
+			name: "Zero instances falls back to job-based scaling",
+			metrics: buildkite.AgentMetrics{
+				ScheduledJobs: 5,
+				RunningJobs:   0,
+				TotalAgents:   0,
+			},
+			asgDesired:              0,
+			asgActual:               0,
+			agentsPerInstance:       4,
+			availabilityThreshold:   0.5,
+			expectedDesiredCapacity: 2,
+		},
+		// Only 2 out of 12 expected agents online (16.7% availability).
+		// Availability-based boost from 3 to 4 overrides lower job-based need (1).
+		{
+			name: "Availability boost when job-based need is lower",
+			metrics: buildkite.AgentMetrics{
+				ScheduledJobs: 2,
+				RunningJobs:   0,
+				TotalAgents:   2,
+			},
+			asgDesired:              3,
+			asgActual:               3,
+			agentsPerInstance:       4,
+			availabilityThreshold:   0.5,
+			expectedDesiredCapacity: 4,
+		},
+		// Need 5 instances for 20 jobs. Job-based scaling (5) dominates
+		// over availability boost (3), despite low availability (25%).
+		{
+			name: "No boost when job-based need is higher",
+			metrics: buildkite.AgentMetrics{
+				ScheduledJobs: 20,
+				RunningJobs:   0,
+				TotalAgents:   2,
+			},
+			asgDesired:              2,
+			asgActual:               2,
+			agentsPerInstance:       4,
+			availabilityThreshold:   0.5,
+			expectedDesiredCapacity: 5,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			asg := &asgTestDriver{
+				desiredCapacity: tc.asgDesired,
+				actualCapacity:  tc.asgActual,
+			}
+
+			s := Scaler{
+				autoscaling: asg,
+				bk:          &buildkiteTestDriver{metrics: tc.metrics},
+				scaling: ScalingCalculator{
+					includeWaiting:        false,
+					agentsPerInstance:     tc.agentsPerInstance,
+					availabilityThreshold: tc.availabilityThreshold,
+				},
+			}
+
+			_, err := s.Run(context.Background())
+			if err != nil {
+				t.Fatalf("Unexpected error: %v", err)
+			}
+
+			if asg.desiredCapacity != tc.expectedDesiredCapacity {
+				t.Errorf("Expected desired capacity: %d, got: %d",
+					tc.expectedDesiredCapacity, asg.desiredCapacity)
+			}
+		})
+	}
+}
diff --git a/scaler/scaling_calculator.go b/scaler/scaling_calculator.go
diff --git a/template.yaml b/template.yaml