Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds scaleUpHealing cron #6390

Closed
wants to merge 28 commits into from
Closed
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
85aed2f
more changes
ZainRizvi Dec 4, 2024
a908ce4
temp changes
ZainRizvi Dec 5, 2024
c3e3b7a
rename columns
ZainRizvi Dec 5, 2024
aabb321
add metrics and don't use sqs queue
Camyll Mar 5, 2025
18fe3a0
set timeout
Camyll Mar 5, 2025
f549fe0
add tests
Camyll Mar 6, 2025
fd8b270
fix tests
Camyll Mar 6, 2025
e0632a4
stash testing changes
Camyll Mar 6, 2025
621c708
working tests for scaleupchron
Camyll Mar 7, 2025
71285d9
Fixing linting errors
jeanschmidt Mar 11, 2025
a4c5895
fix build errorsclear
Camyll Mar 11, 2025
0419e58
complete scaleupchron and lambda tests and add scale-up-chron.json
Camyll Mar 11, 2025
4214cf8
add handling for no response data and move hud query to variable
Camyll Mar 11, 2025
5245d02
Typescript and other small improvements
jeanschmidt Mar 12, 2025
d1a7ce3
Copying permissions I believe are relevant from scaleUp to scaleUpCro…
jeanschmidt Mar 12, 2025
b43e52f
Fixes on terraform plan, permissions from scaleUp, not apply scaleUpC…
jeanschmidt Mar 12, 2025
e6e7e71
Works in production!
jeanschmidt Mar 13, 2025
03b220f
Removed test code
jeanschmidt Mar 13, 2025
79b89ae
[Mobile Benchmark Test] Add os, job_arn, and job_conclusion to artifa…
yangw-dev Mar 12, 2025
4635c2d
add regex filter for cost page (#6393)
wdvr Mar 12, 2025
e2e9454
monsterized failures in grouped view (#6394)
wdvr Mar 12, 2025
c75bc11
[ez][HUD] Fix build artifacts grouping in workflow box (#6395)
clee2000 Mar 12, 2025
c8a0ca7
[Benchmark] Prepare for execuTorch failure handling (#6391)
yangw-dev Mar 13, 2025
4e5c126
Update test matrix for release validations for 2.7 (#6401)
atalman Mar 13, 2025
0935d68
Adds additional tests to getRunnerTypes, simplifies code a bit, adds …
jeanschmidt Mar 13, 2025
084c572
Add ephemeral variants for all runner types, and updates validate_sca…
jeanschmidt Mar 13, 2025
d571353
[Queue Time Historgam] add schema for QueueTimeHistorgam (#6355)
yangw-dev Mar 13, 2025
b1c7c6e
fix test input and make naming consistent
Camyll Mar 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions terraform-aws-github-runner/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ module "runners" {
environment = var.environment
tags = local.tags

scale_config_org = var.scale_config_org
scale_config_repo = var.scale_config_repo
scale_config_repo_path = var.scale_config_repo_path

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,8 @@ module.exports = {
lines: 93,
statements: 94
}
}
},
moduleNameMapper: {
axios: 'axios/dist/node/axios.cjs', // Allow axios to work in tests
},
};
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
"@types/uuid": "^9.0.1",
"async-mutex": "^0.4.0",
"aws-sdk": "^2.863.0",
"axios": "^1.7.7",
"cron-parser": "^3.3.0",
"generic-pool": "^3.9.0",
"lru-cache": "^6.0.0",
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import { scaleDown as scaleDownL, scaleUp as scaleUpL } from './lambda';
import { scaleDown as scaleDownL, scaleUp as scaleUpL, scaleUpChron as scaleUpChronL } from './lambda';

import nock from 'nock';
import { Config } from './scale-runners/config';
import { Context, SQSEvent, ScheduledEvent } from 'aws-lambda';
import { mocked } from 'ts-jest/utils';
import { scaleDown } from './scale-runners/scale-down';
import { scaleUp, RetryableScalingError } from './scale-runners/scale-up';
import { scaleUpChron } from './scale-runners/scale-up-chron';
import { sqsSendMessages, sqsDeleteMessageBatch } from './scale-runners/sqs';
import * as MetricsModule from './scale-runners/metrics';

Expand All @@ -21,8 +22,10 @@ jest.mock('aws-sdk', () => ({
jest.mock('./scale-runners/scale-down');
jest.mock('./scale-runners/scale-up');
jest.mock('./scale-runners/sqs');
jest.mock('./scale-runners/scale-up-chron');

const metrics = new MetricsModule.ScaleUpMetrics();
const mockScaleUpMetrics = new MetricsModule.ScaleUpMetrics();
const mockScaleUpChronMetrics = new MetricsModule.ScaleUpChronMetrics();

beforeEach(() => {
jest.resetModules();
Expand All @@ -34,7 +37,7 @@ beforeEach(() => {
describe('scaleUp', () => {
beforeEach(() => {
jest.spyOn(global.Math, 'random').mockReturnValue(1.0);
jest.spyOn(MetricsModule, 'ScaleUpMetrics').mockReturnValue(metrics);
jest.spyOn(MetricsModule, 'ScaleUpMetrics').mockReturnValue(mockScaleUpMetrics);
});

afterEach(() => {
Expand All @@ -55,8 +58,8 @@ describe('scaleUp', () => {
callback,
);
expect(mockedScaleUp).toBeCalledTimes(2);
expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 1 }, metrics);
expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 2 }, metrics);
expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 1 }, mockScaleUpMetrics);
expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 2 }, mockScaleUpMetrics);
expect(callback).toBeCalledTimes(1);
expect(callback).toBeCalledWith(null);
});
Expand Down Expand Up @@ -88,12 +91,12 @@ describe('scaleUp', () => {
callback,
);
expect(mockedScaleUp).toBeCalledTimes(1);
expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 1 }, metrics);
expect(mockedScaleUp).toBeCalledWith('aws:sqs', { id: 1 }, mockScaleUpMetrics);
expect(callback).toBeCalledTimes(1);
expect(callback).toBeCalledWith('Failed handling SQS event');

expect(sqsDeleteMessageBatch).toBeCalledTimes(1);
expect(sqsDeleteMessageBatch).toBeCalledWith(metrics, evts);
expect(sqsDeleteMessageBatch).toBeCalledWith(mockScaleUpMetrics, evts);
});

it('stochasticOvershoot when retryCount > 5', async () => {
Expand Down Expand Up @@ -137,7 +140,7 @@ describe('scaleUp', () => {
},
];
expect(sqsSendMessages).toBeCalledTimes(1);
expect(sqsSendMessages).toBeCalledWith(metrics, expected, 'asdf');
expect(sqsSendMessages).toBeCalledWith(mockScaleUpMetrics, expected, 'asdf');

expect(sqsDeleteMessageBatch).toBeCalledTimes(0);
});
Expand Down Expand Up @@ -205,10 +208,10 @@ describe('scaleUp', () => {
},
];
expect(sqsSendMessages).toBeCalledTimes(1);
expect(sqsSendMessages).toBeCalledWith(metrics, expected, 'asdf');
expect(sqsSendMessages).toBeCalledWith(mockScaleUpMetrics, expected, 'asdf');

expect(sqsDeleteMessageBatch).toBeCalledTimes(1);
expect(sqsDeleteMessageBatch).toBeCalledWith(metrics, records);
expect(sqsDeleteMessageBatch).toBeCalledWith(mockScaleUpMetrics, records);
});
});

Expand All @@ -231,3 +234,28 @@ describe('scaleDown', () => {
expect(callback).toBeCalledWith('Failed');
});
});

describe('scaleUpChron', () => {
  // Invokes the lambda handler with placeholder event/context objects and
  // returns the callback spy so each test can inspect how it was called.
  const runHandler = async () => {
    const done = jest.fn();
    await scaleUpChronL({} as unknown as ScheduledEvent, {} as unknown as Context, done);
    return done;
  };

  beforeEach(() => {
    // Make the handler's `new ScaleUpChronMetrics()` yield the shared instance,
    // so the argument forwarded to scaleUpChron can be compared by identity.
    jest.spyOn(MetricsModule, 'ScaleUpChronMetrics').mockReturnValue(mockScaleUpChronMetrics);
  });

  it('succeeds', async () => {
    const scaleUpChronMock = mocked(scaleUpChron).mockResolvedValue(undefined);

    const done = await runHandler();

    expect(scaleUpChronMock).toBeCalledTimes(1);
    expect(scaleUpChronMock).toBeCalledWith(mockScaleUpChronMetrics);
    expect(done).toBeCalledTimes(1);
    expect(done).toBeCalledWith(null);
  });

  it('fails', async () => {
    const scaleUpChronMock = mocked(scaleUpChron).mockRejectedValue(Error('error'));

    const done = await runHandler();

    expect(scaleUpChronMock).toBeCalledTimes(1);
    expect(done).toBeCalledTimes(1);
    expect(done).toBeCalledWith('Failed');
  });
});
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,15 @@ import { ActionRequestMessage, RetryableScalingError, scaleUp as scaleUpR } from
import { Context, SQSEvent, SQSRecord, ScheduledEvent } from 'aws-lambda';

import { Config } from './scale-runners/config';
import { ScaleUpMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './scale-runners/metrics';
import {
ScaleUpMetrics,
ScaleUpChronMetrics,
sendMetricsAtTimeout,
sendMetricsTimeoutVars,
} from './scale-runners/metrics';
import { getDelayWithJitterRetryCount, stochaticRunOvershoot } from './scale-runners/utils';
import { scaleDown as scaleDownR } from './scale-runners/scale-down';
import { scaleUpChron as scaleUpChronR } from './scale-runners/scale-up-chron';
import { sqsSendMessages, sqsDeleteMessageBatch } from './scale-runners/sqs';

async function sendRetryEvents(evtFailed: Array<[SQSRecord, boolean, number]>, metrics: ScaleUpMetrics) {
Expand Down Expand Up @@ -155,3 +161,35 @@ export async function scaleDown(event: ScheduledEvent, context: Context, callbac
return callback('Failed');
}
}

/**
 * Lambda entry point for the scheduled scale-up-chron job.
 *
 * Builds a ScaleUpChronMetrics collector, schedules the sendMetricsAtTimeout
 * handler to run `(lambdaTimeout - 10) * 1000` ms from now (lambdaTimeout is
 * presumably expressed in seconds - confirm in Config), then runs the
 * scale-up-chron routine. Whatever the outcome, the timer is cancelled and
 * the collected metrics are sent; a failure while sending is only logged.
 *
 * @param event - Scheduled event that triggered the lambda (not read here).
 * @param context - Lambda context; only `callbackWaitsForEmptyEventLoop` is set.
 * @param callback - Called with `null` on success, or with `'Failed'` when the
 *   routine (or the success callback itself) throws.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export async function scaleUpChron(event: ScheduledEvent, context: Context, callback: any) {
  // We maintain open connections to redis, so the event loop is only cleaned
  // up when SIGTERM is sent; do not make the callback wait for it to drain.
  context.callbackWaitsForEmptyEventLoop = false;

  const metrics = new ScaleUpChronMetrics();
  const timeoutGuard: sendMetricsTimeoutVars = { metrics };
  const onTimeout = sendMetricsAtTimeout(timeoutGuard);
  const guardDelayMs = (Config.Instance.lambdaTimeout - 10) * 1000;
  timeoutGuard.setTimeout = setTimeout(onTimeout, guardDelayMs);

  try {
    await scaleUpChronR(metrics);
    return callback(null);
  } catch (err) {
    console.error(err);
    return callback('Failed');
  } finally {
    try {
      // Disarm the guard and detach its references before sending metrics here.
      clearTimeout(timeoutGuard.setTimeout);
      timeoutGuard.metrics = undefined;
      timeoutGuard.setTimeout = undefined;
      await metrics.sendMetrics();
    } catch (sendErr) {
      console.error(`Error sending metrics: ${sendErr}`);
    }
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,11 @@ export class Config {
readonly retryScaleUpRecordQueueUrl: string | undefined;
readonly runnerGroupName: string | undefined;
readonly runnersExtraLabels: undefined | string;
readonly scaleConfigOrg: string;
readonly scaleConfigRepo: string;
readonly scaleConfigRepoPath: string;
readonly scaleUpMinQueueTimeMinutes: number;
readonly scaleUpRecordQueueUrl: string | undefined;
readonly secretsManagerSecretsId: string | undefined;
readonly sSMParamCleanupAgeDays: number;
readonly sSMParamMaxCleanupAllowance: number;
Expand Down Expand Up @@ -94,8 +97,13 @@ export class Config {
/* istanbul ignore next */
this.retryScaleUpRecordJitterPct = Number(process.env.RETRY_SCALE_UP_RECORD_JITTER_PCT || '0');
this.retryScaleUpRecordQueueUrl = process.env.RETRY_SCALE_UP_RECORD_QUEUE_URL;
this.scaleUpRecordQueueUrl = process.env.SCALE_UP_RECORD_QUEUE_URL;
this.scaleUpMinQueueTimeMinutes = process.env.SCALE_UP_MIN_QUEUE_TIME_MINUTES
? Number(process.env.SCALE_UP_MIN_QUEUE_TIME_MINUTES)
: 30;
this.runnerGroupName = process.env.RUNNER_GROUP_NAME;
this.runnersExtraLabels = process.env.RUNNER_EXTRA_LABELS;
this.scaleConfigOrg = process.env.SCALE_CONFIG_ORG || '';
/* istanbul ignore next */
this.scaleConfigRepo = process.env.SCALE_CONFIG_REPO || '';
if (this.enableOrganizationRunners && !this.scaleConfigRepo) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1630,6 +1630,58 @@ export class ScaleDownMetrics extends Metrics {
}
}

/**
 * Metrics collector for the scale-up-chron lambda. Extends ScaleUpMetrics with
 * entries describing the queued-jobs lookup and the outcome of each scale-up
 * attempt made by the cron.
 */
export class ScaleUpChronMetrics extends ScaleUpMetrics {
  constructor() {
    super();
  }

  /**
   * Records how many jobs are currently queued for a runner type.
   *
   * The queue depth is stored as the metric VALUE. It is deliberately not a
   * dimension: every distinct dimension value creates a separate metric, so a
   * numeric dimension would fragment the series and make it impossible to
   * aggregate or alarm on.
   *
   * @param org - Organization that owns the queued jobs.
   * @param runnerType - Runner type the jobs are waiting for.
   * @param numQueuedJobs - Number of jobs queued for that runner type.
   */
  queuedRunnerStats(org: string, runnerType: string, numQueuedJobs: number) {
    const dimensions = new Map([
      ['Org', org],
      ['RunnerType', runnerType],
    ]);
    this.addEntry('gh.scaleupchron.queuedRunners', numQueuedJobs, dimensions);
  }

  /**
   * Counts a failure while processing queued runners.
   *
   * @param error - Error description, attached as the `error` dimension.
   */
  queuedRunnerFailure(error: string) {
    const dimensions = new Map([['error', error]]);
    this.countEntry('gh.scaleupchron.queuedRunners.failure', 1, dimensions);
  }

  /**
   * Records a successful call to the queued-jobs endpoint.
   *
   * @param ms - Wall-clock duration of the call, in milliseconds.
   */
  /* istanbul ignore next */
  getQueuedJobsEndpointSuccess(ms: number) {
    this.countEntry(`gh.calls.total`, 1);
    this.countEntry(`gh.calls.getQueuedJobsEndpoint.count`, 1);
    this.countEntry(`gh.calls.getQueuedJobsEndpoint.success`, 1);
    this.addEntry(`gh.calls.getQueuedJobsEndpoint.wallclock`, ms);
  }

  /**
   * Records a failed call to the queued-jobs endpoint.
   *
   * @param ms - Wall-clock duration of the call, in milliseconds.
   */
  /* istanbul ignore next */
  getQueuedJobsEndpointFailure(ms: number) {
    this.countEntry(`gh.calls.total`, 1);
    this.countEntry(`gh.calls.getQueuedJobsEndpoint.count`, 1);
    this.countEntry(`gh.calls.getQueuedJobsEndpoint.failure`, 1);
    this.addEntry(`gh.calls.getQueuedJobsEndpoint.wallclock`, ms);
  }

  /** Counts a successful scale-up, both in the inherited metric and the cron-specific one. */
  scaleUpInstanceSuccess() {
    this.scaleUpSuccess();
    this.countEntry('run.scaleupchron.success');
  }

  /**
   * Counts a scale-up attempt that failed with a non-retryable error.
   *
   * @param error - Error description, attached as the `error` dimension.
   */
  scaleUpInstanceFailureNonRetryable(error: string) {
    const dimensions = new Map([['error', error]]);
    // TODO(review): decide whether more context should be recorded here.
    this.countEntry('run.scaleupchron.failure.nonRetryable', 1, dimensions);
  }

  /**
   * Counts a scale-up attempt that failed with a retryable error.
   *
   * @param error - Error description, attached as the `error` dimension.
   */
  scaleUpInstanceFailureRetryable(error: string) {
    const dimensions = new Map([['error', error]]);
    // Open question: should we add more information about this, or do we not
    // care since it'll be requeued?
    this.countEntry('run.scaleupchron.failure.retryable', 1, dimensions);
  }

  /** Counts a cron pass that decided no scale-up was needed. */
  scaleUpInstanceNoOp() {
    this.countEntry('run.scaleupchron.noop');
  }
}

export interface sendMetricsTimeoutVars {
metrics?: Metrics;
setTimeout?: ReturnType<typeof setTimeout>;
Expand Down
Loading
Loading