Skip to content

Commit fcb9e2d

Browse files
untracked instances cause groups to throttle (#119)
* untracked instances cause groups to throttle: flag on groups to turn throttle behavior on/off; audit adds function for querying untracked by group; instance launcher checks untracked counts * get untracked from metrics loop, not audit * only throttle if untracked count exceeds threshold * global max for scale up threshold
1 parent bccdd88 commit fcb9e2d

5 files changed

Lines changed: 62 additions & 8 deletions

File tree

src/app.ts

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -143,22 +143,24 @@ const autoscaleProcessor = new AutoscaleProcessor({
143143
audit: audit,
144144
});
145145

146+
const metricsLoop = new MetricsLoop({
147+
redisClient: redisClient,
148+
metricsTTL: config.ServiceLevelMetricsTTL,
149+
instanceGroupManager: instanceGroupManager,
150+
instanceTracker: instanceTracker,
151+
ctx: initCtx,
152+
});
153+
146154
const instanceLauncher = new InstanceLauncher({
155+
maxThrottleThreshold: config.MaxThrottleThreshold,
147156
instanceTracker: instanceTracker,
148157
cloudManager: cloudManager,
149158
instanceGroupManager: instanceGroupManager,
150159
lockManager: lockManager,
151160
redisClient,
152161
shutdownManager,
153162
audit: audit,
154-
});
155-
156-
const metricsLoop = new MetricsLoop({
157-
redisClient: redisClient,
158-
metricsTTL: config.ServiceLevelMetricsTTL,
159-
instanceGroupManager: instanceGroupManager,
160-
instanceTracker: instanceTracker,
161-
ctx: initCtx,
163+
metricsLoop,
162164
});
163165

164166
const groupReportGenerator = new GroupReportGenerator({

src/config.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ const env = envalid.cleanEnv(process.env, {
4444
SHUTDOWN_TTL_SEC: envalid.num({ default: 86400 }), // default 1 day
4545
SHUTDOWN_STATUS_TTL_SEC: envalid.num({ default: 600 }), // default 10 minutes
4646
AUDIT_TTL_SEC: envalid.num({ default: 172800 }), // default 2 day
47+
MAX_THROTTLE_THRESHOLD: envalid.num({ default: 40 }), // default max of 40 untracked per group to throttle scale up
4748
GROUP_RELATED_DATA_TTL_SEC: envalid.num({ default: 172800 }), // default 2 day; keep group related data max 2 days after the group is deleted or no action is performed on it
4849
GROUP_LOCK_TTL_MS: envalid.num({ default: 180000 }), // time in ms
4950
GROUP_JOBS_CREATION_INTERVAL_SEC: envalid.num({ default: 30 }), // with what interval this instance should try producing jobs for group processing
@@ -127,6 +128,8 @@ export default {
127128
GroupRelatedDataTTL: env.GROUP_RELATED_DATA_TTL_SEC,
128129
// group processing lock
129130
GroupLockTTLMs: env.GROUP_LOCK_TTL_MS,
131+
// group untracked threshold
132+
MaxThrottleThreshold: env.MAX_THROTTLE_THRESHOLD,
130133
// queue jobs producers
131134
GroupJobsCreationIntervalSec: env.GROUP_JOBS_CREATION_INTERVAL_SEC,
132135
SanityJobsCreationIntervalSec: env.SANITY_JOBS_CREATION_INTERVAL_SEC,

src/handlers.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ interface InstanceGroupScalingActivitiesRequest {
1717
enableAutoScale?: boolean;
1818
enableLaunch?: boolean;
1919
enableScheduler?: boolean;
20+
enableUntrackedThrottle?: boolean;
2021
}
2122

2223
export interface InstanceGroupDesiredValuesRequest {
@@ -189,6 +190,10 @@ class Handlers {
189190
if (scalingActivitiesRequest.enableScheduler != null) {
190191
instanceGroup.enableScheduler = scalingActivitiesRequest.enableScheduler;
191192
}
193+
if (scalingActivitiesRequest.enableUntrackedThrottle != null) {
194+
instanceGroup.enableUntrackedThrottle = scalingActivitiesRequest.enableUntrackedThrottle;
195+
}
196+
192197
await this.instanceGroupManager.upsertInstanceGroup(req.context, instanceGroup);
193198
res.status(200);
194199
res.send({ save: 'OK' });

src/instance_group.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ export interface InstanceGroup {
2525
enableAutoScale: boolean;
2626
enableLaunch: boolean;
2727
enableScheduler: boolean;
28+
enableUntrackedThrottle: boolean;
2829
gracePeriodTTLSec: number;
2930
protectedTTLSec: number;
3031
scalingOptions: ScalingOptions;

src/instance_launcher.ts

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ import { Context } from './context';
77
import * as promClient from 'prom-client';
88
import ShutdownManager from './shutdown_manager';
99
import Audit from './audit';
10+
import MetricsLoop from './metrics_loop';
1011

1112
const instancesLaunchedCounter = new promClient.Counter({
1213
name: 'autoscaling_instance_launched_total',
@@ -27,23 +28,27 @@ const instanceErrorsCounter = new promClient.Counter({
2728
});
2829

2930
export interface InstanceLauncherOptions {
31+
maxThrottleThreshold?: number;
3032
instanceTracker: InstanceTracker;
3133
cloudManager: CloudManager;
3234
instanceGroupManager: InstanceGroupManager;
3335
lockManager: LockManager;
3436
redisClient: Redis.Redis;
3537
shutdownManager: ShutdownManager;
3638
audit: Audit;
39+
metricsLoop: MetricsLoop;
3740
}
3841

3942
export default class InstanceLauncher {
43+
private maxThrottleThreshold = 40;
4044
private instanceTracker: InstanceTracker;
4145
private instanceGroupManager: InstanceGroupManager;
4246
private cloudManager: CloudManager;
4347
private redisClient: Redis.Redis;
4448
private lockManager: LockManager;
4549
private shutdownManager: ShutdownManager;
4650
private audit: Audit;
51+
private metricsLoop: MetricsLoop;
4752

4853
constructor(options: InstanceLauncherOptions) {
4954
this.instanceTracker = options.instanceTracker;
@@ -53,6 +58,11 @@ export default class InstanceLauncher {
5358
this.redisClient = options.redisClient;
5459
this.shutdownManager = options.shutdownManager;
5560
this.audit = options.audit;
61+
this.metricsLoop = options.metricsLoop;
62+
63+
if (options.maxThrottleThreshold) {
64+
this.maxThrottleThreshold = options.maxThrottleThreshold;
65+
}
5666

5767
this.launchOrShutdownInstancesByGroup = this.launchOrShutdownInstancesByGroup.bind(this);
5868
}
@@ -79,6 +89,39 @@ export default class InstanceLauncher {
7989

8090
const actualScaleUpQuantity =
8191
Math.min(group.scalingOptions.maxDesired, group.scalingOptions.desiredCount) - count;
92+
93+
// if untracked throttle enabled, only scale up if there aren't too many untracked instances
94+
if (group.enableUntrackedThrottle == null || group.enableUntrackedThrottle == true) {
95+
// use the group's maxDesired (plus one) as the threshold, so we only scale up this many until the previous batch is ready
96+
// ensure a maximum threshold from config (default of 40, much higher than ever seen except in cases in which throttling is desired)
97+
const untrackedThrottleThreshold = Math.min(
98+
group.scalingOptions.maxDesired + 1,
99+
this.maxThrottleThreshold,
100+
);
101+
const untrackedCount = await this.metricsLoop.getUnTrackedCount(group.name);
102+
// only allow scale up if untracked count is less than the threshold
103+
const allowedScaleUp = untrackedCount < untrackedThrottleThreshold;
104+
105+
ctx.logger.debug(
106+
`[Launcher] Scaling throttle check for group ${groupName} with ${count} instances.`,
107+
{ actualScaleUpQuantity, untrackedThrottleThreshold, untrackedCount, allowedScaleUp },
108+
);
109+
if (!allowedScaleUp) {
110+
// not allowed to scale at all, error out here
111+
ctx.logger.error(
112+
`[Launcher] Scaling throttle launch of ALL new instances for group ${groupName} with ${count} instances.`,
113+
{ untrackedCount, actualScaleUpQuantity, allowedScaleUp },
114+
);
115+
throw new Error(
116+
`[Launcher] Scaling throttled, failed to launch ALL new instances for group ${groupName}`,
117+
);
118+
} else {
119+
ctx.logger.debug(`[Launcher] Scaling throttle check passed for group ${groupName}.`);
120+
}
121+
} else {
122+
ctx.logger.debug(`[Launcher] Scaling throttle disabled for group ${groupName}.`);
123+
}
124+
82125
const scaleDownProtected = await this.instanceGroupManager.isScaleDownProtected(group.name);
83126
const scaleUpCount = await this.cloudManager.scaleUp(
84127
ctx,

0 commit comments

Comments
 (0)