Skip to content

Commit 676e79b

Browse files
Add experiment/stress test code (#6520)
As part of the infra stress test, we want to introduce the capability of running code experiments leveraging Redis. This also introduce the checks we are planning to perform for the infra stress test. --------- Co-authored-by: Zain Rizvi <[email protected]>
1 parent 7b36bd5 commit 676e79b

File tree

9 files changed

+477
-55
lines changed

9 files changed

+477
-55
lines changed

Diff for: terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/cache.test.ts

+174-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,12 @@
1-
import { locallyCached, redisCached, clearLocalCache, shutdownRedisPool, redisClearCacheKeyPattern } from './cache';
1+
import {
2+
locallyCached,
3+
redisCached,
4+
clearLocalCache,
5+
shutdownRedisPool,
6+
redisClearCacheKeyPattern,
7+
getExperimentValue,
8+
getJoinedStressTestExperiment,
9+
} from './cache';
210
import { mocked } from 'ts-jest/utils';
311
import { v4 as uuidv4 } from 'uuid';
412
import nock from 'nock';
@@ -131,6 +139,171 @@ describe('locallyCached', () => {
131139
});
132140
});
133141

142+
describe('experiment functions', () => {
143+
beforeEach(async () => {
144+
await shutdownRedisPool();
145+
clearLocalCache();
146+
jest.restoreAllMocks();
147+
jest.clearAllMocks();
148+
});
149+
150+
afterEach(async () => {
151+
await shutdownRedisPool();
152+
});
153+
154+
describe('getExperimentValue', () => {
155+
it('returns the value from redis when it exists', async () => {
156+
const experimentKey = 'test-experiment';
157+
const experimentValue = '42';
158+
const defaultValue = 10;
159+
160+
mockedRedisClient.get.mockResolvedValueOnce(experimentValue);
161+
162+
const result = await getExperimentValue(experimentKey, defaultValue);
163+
164+
expect(result).toBe(42);
165+
expect(mockedRedisClient.get).toBeCalledTimes(1);
166+
expect(mockedRedisClient.get).toBeCalledWith('gh-ci.EXPERIMENT.test-experiment');
167+
});
168+
169+
it('returns default value when key does not exist', async () => {
170+
const experimentKey = 'missing-experiment';
171+
const defaultValue = 10;
172+
173+
mockedRedisClient.get.mockResolvedValueOnce(null);
174+
175+
const result = await getExperimentValue(experimentKey, defaultValue);
176+
177+
expect(result).toBe(defaultValue);
178+
expect(mockedRedisClient.get).toBeCalledTimes(1);
179+
expect(mockedRedisClient.get).toBeCalledWith('gh-ci.EXPERIMENT.missing-experiment');
180+
});
181+
182+
it('returns default value when redis throws an error', async () => {
183+
const experimentKey = 'error-experiment';
184+
const defaultValue = 10;
185+
186+
mockedRedisClient.get.mockRejectedValueOnce(new Error('Redis error'));
187+
188+
const result = await getExperimentValue(experimentKey, defaultValue);
189+
190+
expect(result).toBe(defaultValue);
191+
expect(mockedRedisClient.get).toBeCalledTimes(1);
192+
});
193+
194+
it('returns default value when value is not a valid number', async () => {
195+
const experimentKey = 'invalid-experiment';
196+
const defaultValue = 10;
197+
198+
mockedRedisClient.get.mockResolvedValueOnce('not-a-number');
199+
200+
const result = await getExperimentValue(experimentKey, defaultValue);
201+
202+
expect(result).toBe(defaultValue);
203+
expect(mockedRedisClient.get).toBeCalledTimes(1);
204+
});
205+
});
206+
207+
describe('getJoinedStressTestExperiment', () => {
208+
it('returns false when RUNNER_NAME_SUFFIX is not set', async () => {
209+
mockedRedisClient.get.mockResolvedValueOnce(null);
210+
211+
const result = await getJoinedStressTestExperiment('TEST_EXPERIMENT', 'runner-name');
212+
213+
expect(result).toBe(false);
214+
expect(mockedRedisClient.get).toBeCalledTimes(1);
215+
expect(mockedRedisClient.get).toBeCalledWith('gh-ci.EXPERIMENT.RUNNER_NAME_SUFFIX');
216+
});
217+
218+
it('returns false when runner name does not match suffix', async () => {
219+
mockedRedisClient.get.mockResolvedValueOnce('-suffix');
220+
221+
const result = await getJoinedStressTestExperiment('TEST_EXPERIMENT', 'runner-name-without-match');
222+
223+
expect(result).toBe(false);
224+
expect(mockedRedisClient.get).toBeCalledTimes(1);
225+
expect(mockedRedisClient.get).toBeCalledWith('gh-ci.EXPERIMENT.RUNNER_NAME_SUFFIX');
226+
});
227+
228+
it('returns false when probability is less than random value', async () => {
229+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.6);
230+
231+
mockedRedisClient.get.mockResolvedValueOnce('-suffix');
232+
mockedRedisClient.get.mockResolvedValueOnce('50');
233+
234+
const result = await getJoinedStressTestExperiment('TEST_EXPERIMENT', 'runner-name-suffix');
235+
236+
expect(result).toBe(false);
237+
expect(mockedRedisClient.get).toBeCalledTimes(2);
238+
expect(mockedRedisClient.get).toHaveBeenNthCalledWith(1, 'gh-ci.EXPERIMENT.RUNNER_NAME_SUFFIX');
239+
expect(mockedRedisClient.get).toHaveBeenNthCalledWith(2, 'gh-ci.EXPERIMENT.TEST_EXPERIMENT');
240+
});
241+
242+
it('returns true when probability is greater than random value', async () => {
243+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.4);
244+
245+
mockedRedisClient.get.mockResolvedValueOnce('-suffix');
246+
mockedRedisClient.get.mockResolvedValueOnce('50');
247+
248+
const result = await getJoinedStressTestExperiment('TEST_EXPERIMENT', 'runner-name-suffix');
249+
250+
expect(result).toBe(true);
251+
expect(mockedRedisClient.get).toBeCalledTimes(2);
252+
expect(mockedRedisClient.get).toHaveBeenNthCalledWith(1, 'gh-ci.EXPERIMENT.RUNNER_NAME_SUFFIX');
253+
expect(mockedRedisClient.get).toHaveBeenNthCalledWith(2, 'gh-ci.EXPERIMENT.TEST_EXPERIMENT');
254+
});
255+
256+
it('returns false when experiment value is zero', async () => {
257+
mockedRedisClient.get.mockResolvedValueOnce('-suffix');
258+
mockedRedisClient.get.mockResolvedValueOnce('0');
259+
260+
const result = await getJoinedStressTestExperiment('TEST_EXPERIMENT', 'runner-name-suffix');
261+
262+
expect(result).toBe(false);
263+
expect(mockedRedisClient.get).toBeCalledTimes(2);
264+
expect(mockedRedisClient.get).toHaveBeenNthCalledWith(1, 'gh-ci.EXPERIMENT.RUNNER_NAME_SUFFIX');
265+
expect(mockedRedisClient.get).toHaveBeenNthCalledWith(2, 'gh-ci.EXPERIMENT.TEST_EXPERIMENT');
266+
});
267+
268+
it('returns true when experiment value is 100', async () => {
269+
jest.spyOn(global.Math, 'random').mockReturnValueOnce(0.99);
270+
271+
mockedRedisClient.get.mockResolvedValueOnce('-suffix');
272+
mockedRedisClient.get.mockResolvedValueOnce('100');
273+
274+
const result = await getJoinedStressTestExperiment('TEST_EXPERIMENT', 'runner-name-suffix');
275+
276+
expect(result).toBe(true);
277+
expect(mockedRedisClient.get).toBeCalledTimes(2);
278+
expect(mockedRedisClient.get).toHaveBeenNthCalledWith(1, 'gh-ci.EXPERIMENT.RUNNER_NAME_SUFFIX');
279+
expect(mockedRedisClient.get).toHaveBeenNthCalledWith(2, 'gh-ci.EXPERIMENT.TEST_EXPERIMENT');
280+
});
281+
282+
it('returns false when experiment value is not a valid number', async () => {
283+
mockedRedisClient.get.mockResolvedValueOnce('-suffix');
284+
mockedRedisClient.get.mockResolvedValueOnce('not-a-number');
285+
286+
const result = await getJoinedStressTestExperiment('TEST_EXPERIMENT', 'runner-name-suffix');
287+
288+
expect(result).toBe(false);
289+
expect(mockedRedisClient.get).toBeCalledTimes(2);
290+
expect(mockedRedisClient.get).toHaveBeenNthCalledWith(1, 'gh-ci.EXPERIMENT.RUNNER_NAME_SUFFIX');
291+
expect(mockedRedisClient.get).toHaveBeenNthCalledWith(2, 'gh-ci.EXPERIMENT.TEST_EXPERIMENT');
292+
});
293+
294+
it('returns false when experiment query throws an error', async () => {
295+
mockedRedisClient.get.mockResolvedValueOnce('-suffix');
296+
mockedRedisClient.get.mockRejectedValueOnce(new Error('Redis error'));
297+
298+
const result = await getJoinedStressTestExperiment('TEST_EXPERIMENT', 'runner-name-suffix');
299+
300+
expect(result).toBe(false);
301+
expect(mockedRedisClient.get).toBeCalledTimes(2);
302+
expect(mockedRedisClient.get).toHaveBeenNthCalledWith(1, 'gh-ci.EXPERIMENT.RUNNER_NAME_SUFFIX');
303+
});
304+
});
305+
});
306+
134307
describe('redisCached', () => {
135308
beforeEach(async () => {
136309
await shutdownRedisPool();

Diff for: terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/cache.ts

+79
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,85 @@ export async function redisLocked<T>(
278278
throw new Error(`Could not acquire lock for ${nameSpace}-${key}`);
279279
}
280280

281+
export async function getExperimentValue<T>(experimentKey: string, defaultValue: T): Promise<T> {
282+
return locallyCached('EXPERIMENT', experimentKey, 60, async (): Promise<T> => {
283+
await startupRedisPool();
284+
if (!redisPool) throw Error('Redis should be initialized!');
285+
286+
const queryKey = `${Config.Instance.environment}.EXPERIMENT.${experimentKey}`;
287+
288+
console.debug(`Checking redis entry for experiment ${experimentKey} (${queryKey})`);
289+
try {
290+
// eslint-disable-next-line @typescript-eslint/no-explicit-any
291+
const experimentValue: any = await redisPool.use(async (client: RedisClientType) => {
292+
return await client.get(queryKey);
293+
});
294+
295+
if (experimentValue === undefined || experimentValue === null) {
296+
console.debug(`Experiment ${queryKey} not found, returning default value ${defaultValue}`);
297+
return defaultValue;
298+
}
299+
300+
if (typeof defaultValue === 'number' && typeof experimentValue === 'string') {
301+
const numValue = Number(experimentValue);
302+
if (!isNaN(numValue)) {
303+
console.debug(`Experiment ${queryKey} found and converted to number, returning experiment value ${numValue}`);
304+
return numValue as T;
305+
} else {
306+
console.warn(
307+
`Experiment ${queryKey} found but value is not a valid number: ` +
308+
`${experimentValue}, returning default value ${defaultValue}`,
309+
);
310+
return defaultValue;
311+
}
312+
}
313+
314+
if (typeof defaultValue === typeof experimentValue) {
315+
console.debug(
316+
`Experiment ${queryKey} found and with the correct type (${typeof experimentValue}),` +
317+
` returning experiment value ${experimentValue}`,
318+
);
319+
return experimentValue;
320+
}
321+
} catch (e) {
322+
console.error(`Error retrieving experiment ${queryKey}, returning default value ${defaultValue}: ${e}`);
323+
}
324+
325+
return defaultValue;
326+
});
327+
}
328+
329+
export async function getJoinedStressTestExperiment(experimentKey: string, runnerName: string): Promise<boolean> {
330+
const runnerNameSuffix = await getExperimentValue('RUNNER_NAME_SUFFIX', '');
331+
if (runnerNameSuffix === undefined || runnerNameSuffix === null || runnerNameSuffix === '') {
332+
console.debug(`Experiment ${experimentKey} check ignored, as RUNNER_NAME_SUFFIX is not set`);
333+
return false;
334+
}
335+
336+
if (!runnerName.endsWith(runnerNameSuffix)) {
337+
console.debug(
338+
`Runner name ${runnerName} does not match suffix ${runnerNameSuffix} when checking experiment ${experimentKey}`,
339+
);
340+
return false;
341+
}
342+
343+
const experimentValue = await getExperimentValue(experimentKey, 0);
344+
345+
if (Math.random() * 100 < experimentValue) {
346+
console.debug(
347+
`Enabling experiment ${experimentKey} for runner ${runnerName}. ` +
348+
`Reached probability threshold of ${experimentValue}%`,
349+
);
350+
return true;
351+
}
352+
353+
console.debug(
354+
`Skipping experiment ${experimentKey} for runner ${runnerName}. ` +
355+
`Didn't reach probability threshold of ${experimentValue}%`,
356+
);
357+
return false;
358+
}
359+
281360
export async function redisCached<T>(
282361
nameSpace: string,
283362
key: string,

Diff for: terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.test.ts

+5-1
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,10 @@ jest.mock('./cache', () => ({
7171
.mockImplementation(async <T>(ns: string, k: string, t: number, j: number, fn: () => Promise<T>): Promise<T> => {
7272
return await locallyCached(ns, k, t, fn);
7373
}),
74+
/* eslint-disable-next-line @typescript-eslint/no-unused-vars */
75+
getJoinedStressTestExperiment: jest.fn().mockImplementation(async (experimentKey: string, defaultValue: string) => {
76+
return false;
77+
}),
7478
}));
7579
jest.mock('./gh-auth');
7680

@@ -143,7 +147,7 @@ function createExpectedRunInstancesLinux(
143147

144148
const metrics = new ScaleUpMetrics();
145149

146-
beforeEach(() => {
150+
beforeEach(async () => {
147151
jest.resetModules();
148152
jest.clearAllMocks();
149153
jest.restoreAllMocks();

Diff for: terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts

+25-1
Original file line numberDiff line numberDiff line change
@@ -5,8 +5,9 @@ import { RunnerInfo, expBackOff, getRepo, shuffleArrayInPlace } from './utils';
55
import { Config } from './config';
66
import LRU from 'lru-cache';
77
import { Metrics, ScaleUpMetrics } from './metrics';
8-
import { redisCached, redisLocked } from './cache';
8+
import { getJoinedStressTestExperiment, redisCached, redisLocked } from './cache';
99
import moment from 'moment';
10+
import { RetryableScalingError } from './scale-up';
1011

1112
export interface ListRunnerFilters {
1213
applicationDeployDatetime?: string;
@@ -472,6 +473,13 @@ export async function tryReuseRunner(
472473
repoName: runnerParameters.repoName,
473474
runnerType: runnerParameters.runnerType.runnerTypeName,
474475
};
476+
if (await getJoinedStressTestExperiment('stresstest_awsfail', runnerParameters.runnerType.runnerTypeName)) {
477+
console.warn(
478+
`Joining stress test stresstest_awsfail, failing AWS reuse for ${runnerParameters.runnerType.runnerTypeName}`,
479+
);
480+
throw new RetryableScalingError('Stress test stockout');
481+
}
482+
475483
const runners = shuffleArrayInPlace(await listRunners(metrics, filters));
476484

477485
/* istanbul ignore next */
@@ -666,6 +674,21 @@ export async function createRunner(runnerParameters: RunnerInputParameters, metr
666674
try {
667675
console.debug('Runner configuration: ' + JSON.stringify(runnerParameters));
668676

677+
if (await getJoinedStressTestExperiment('stresstest_awsfail', runnerParameters.runnerType.runnerTypeName)) {
678+
console.warn(
679+
`Joining stress test stresstest_awsfail, failing instance creation` +
680+
` for ${runnerParameters.runnerType.runnerTypeName}`,
681+
);
682+
throw new RetryableScalingError('Stress test stresstest_awsfail');
683+
}
684+
if (await getJoinedStressTestExperiment('stresstest_stockout', runnerParameters.runnerType.runnerTypeName)) {
685+
console.warn(
686+
`Joining stress test stresstest_stockout, failing instance ` +
687+
`creation for ${runnerParameters.runnerType.runnerTypeName}`,
688+
);
689+
throw new RetryableScalingError('Stress test stresstest_stockout');
690+
}
691+
669692
const storageDeviceName = runnerParameters.runnerType.os === 'linux' ? '/dev/xvda' : '/dev/sda1';
670693
const tags = [
671694
{ Key: 'Application', Value: 'github-action-runner' },
@@ -739,6 +762,7 @@ export async function createRunner(runnerParameters: RunnerInputParameters, metr
739762
`[${awsRegion}] [${vpcId}] [${subnet}] Attempting to create ` +
740763
`instance ${runnerParameters.runnerType.instance_type}${labelsStrLog}`,
741764
);
765+
742766
const runInstancesResponse = await expBackOff(() => {
743767
return metrics.trackRequestRegion(
744768
awsRegion,

Diff for: terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts

+1-2
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,9 @@ export async function scaleUpChron(metrics: ScaleUpChronMetrics): Promise<void>
2424
);
2525
throw new Error('scaleUpChronRecordQueueUrl is not set. Cannot send queued scale up requests');
2626
}
27-
const scaleUpChronRecordQueueUrl = Config.Instance.scaleUpChronRecordQueueUrl;
2827
// Only proactively scale up the jobs that have been queued for longer than normal
2928
// Filter out the queued jobs that are do not correspond to a valid runner type
30-
const queuedJobs = (await getQueuedJobs(metrics, scaleUpChronRecordQueueUrl))
29+
const queuedJobs = (await getQueuedJobs(metrics, Config.Instance.scaleUpChronRecordQueueUrl))
3130
.filter((runner) => {
3231
return (
3332
runner.min_queue_time_minutes >= minAutoScaleupDelayMinutes && runner.org === Config.Instance.scaleConfigOrg

0 commit comments

Comments
 (0)