diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.test.ts index a1d434c2f0..56a39d341f 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.test.ts @@ -15,6 +15,7 @@ import { removeGithubRunnerOrg, removeGithubRunnerRepo, resetGHRunnersCaches, + getGitHubRateLimit, } from './gh-runners'; import * as MetricsModule from './metrics'; import { @@ -51,6 +52,7 @@ jest.mock('./gh-runners', () => ({ removeGithubRunnerOrg: jest.fn(), removeGithubRunnerRepo: jest.fn(), resetGHRunnersCaches: jest.fn(), + getGitHubRateLimit: jest.fn(), })); jest.mock('./runners', () => ({ @@ -1433,9 +1435,9 @@ describe('scale-down', () => { describe('getGHRunnerRepo', () => { const ghRunners = [ - { name: 'instance-id-01', busy: true }, - { name: 'instance-id-02', busy: false }, - ] as GhRunners; + mockRunner({ id: 101, name: 'instance-id-01', busy: true, status: 'online', os: 'linux', labels: [] }), + mockRunner({ id: 102, name: 'instance-id-02', busy: false, status: 'online', os: 'linux', labels: [] }), + ]; const repo: Repo = { owner: 'the-org', repo: 'a-repo', @@ -1504,6 +1506,110 @@ describe('scale-down', () => { expect(mockedGetRunnerRepo).toBeCalledTimes(1); expect(mockedGetRunnerRepo).toBeCalledWith(repo, ec2runner.ghRunnerId, metrics); }); + + it('finds idle runner in listGithubRunnersRepo and double checks with enough API quota', async () => { + const mockedListGithubRunnersRepo = mocked(listGithubRunnersRepo); + const mockedGetRunnerRepo = mocked(getRunnerRepo); + const mockedGetGitHubRateLimit = mocked(getGitHubRateLimit); + const ec2runner: RunnerInfo = { + awsRegion: baseConfig.awsRegion, + repo: repoKey, + instanceId: 'instance-id-02', + runnerType: 'runnerType-01', + ghRunnerId: 'ghRunnerId-02', + }; + const directGhRunner = mockRunner({ + id: 202, + name: 'instance-id-02', + busy: true, + status: 'busy', + os: 'linux', + labels: [], + }); + + mockedListGithubRunnersRepo.mockResolvedValueOnce(ghRunners); + mockedGetGitHubRateLimit.mockResolvedValueOnce({ limit: 5000, remaining: 4000, used: 1000 }); + mockedGetRunnerRepo.mockResolvedValueOnce(directGhRunner); + + const result = await getGHRunnerRepo(ec2runner, metrics, true); + expect(result).toEqual(directGhRunner); + expect(result?.busy).toBe(true); + + expect(mockedListGithubRunnersRepo).toBeCalledTimes(1); + expect(mockedListGithubRunnersRepo).toBeCalledWith(repo, metrics); + expect(mockedGetGitHubRateLimit).toBeCalledTimes(1); + expect(mockedGetGitHubRateLimit).toBeCalledWith(repo, metrics); + expect(mockedGetRunnerRepo).toBeCalledTimes(1); + expect(mockedGetRunnerRepo).toBeCalledWith(repo, ec2runner.ghRunnerId, metrics); + }); + + it('finds idle runner in listGithubRunnersRepo but cannot double check due to low API quota', async () => { + const mockedListGithubRunnersRepo = mocked(listGithubRunnersRepo); + const mockedGetRunnerRepo = mocked(getRunnerRepo); + const mockedGetGitHubRateLimit = mocked(getGitHubRateLimit); + const ec2runner: RunnerInfo = { + awsRegion: baseConfig.awsRegion, + repo: repoKey, + instanceId: 'instance-id-02', + runnerType: 'runnerType-01', + ghRunnerId: 'ghRunnerId-02', + }; + + mockedListGithubRunnersRepo.mockResolvedValueOnce(ghRunners); + mockedGetGitHubRateLimit.mockResolvedValueOnce({ limit: 5000, remaining: 500, used: 4500 }); + + const result = await getGHRunnerRepo(ec2runner, metrics, true); + expect(result).toEqual({ ...ghRunners[1], busy: true }); + + expect(mockedListGithubRunnersRepo).toBeCalledTimes(1); + expect(mockedListGithubRunnersRepo).toBeCalledWith(repo, metrics); + expect(mockedGetGitHubRateLimit).toBeCalledTimes(1); + expect(mockedGetGitHubRateLimit).toBeCalledWith(repo, metrics); + expect(mockedGetRunnerRepo).not.toBeCalled(); + }); + + it('finds idle runner in listGithubRunnersRepo and no double check with dontTrustIdleFromList=false', async () => { + const mockedListGithubRunnersRepo = mocked(listGithubRunnersRepo); + const mockedGetRunnerRepo = mocked(getRunnerRepo); + const mockedGetGitHubRateLimit = mocked(getGitHubRateLimit); + const ec2runner: RunnerInfo = { + awsRegion: baseConfig.awsRegion, + repo: repoKey, + instanceId: 'instance-id-02', + runnerType: 'runnerType-01', + ghRunnerId: 'ghRunnerId-02', + }; + + // Create special test runner list with a runner we know is not busy + const specialTestRunner = mockRunner({ + id: 102, + name: 'instance-id-02', + busy: false, + status: 'online', + os: 'linux', + labels: [], + }); + const testGhRunners = [ + mockRunner({ id: 101, name: 'instance-id-01', busy: true, status: 'online', os: 'linux', labels: [] }), + specialTestRunner, + ]; + + mockedListGithubRunnersRepo.mockResolvedValueOnce(testGhRunners); + + // Override the mock implementation directly + mockedListGithubRunnersRepo.mockImplementationOnce(() => { + return Promise.resolve(testGhRunners); + }); + + const result = await getGHRunnerRepo(ec2runner, metrics, false); + // Make sure the runner stayed not busy + expect(result?.busy).toBe(false); + + expect(mockedListGithubRunnersRepo).toBeCalledTimes(1); + expect(mockedListGithubRunnersRepo).toBeCalledWith(repo, metrics); + expect(mockedGetGitHubRateLimit).not.toBeCalled(); + expect(mockedGetRunnerRepo).not.toBeCalled(); + }); }); describe('cleanupOldSSMParameters', () => { @@ -1624,9 +1730,9 @@ describe('scale-down', () => { describe('getGHRunnerOrg', () => { const ghRunners = [ - { name: 'instance-id-01', busy: true }, - { name: 'instance-id-02', busy: false }, - ] as GhRunners; + mockRunner({ id: 201, name: 'instance-id-01', busy: true, status: 'online', os: 'linux', labels: [] }), + mockRunner({ id: 202, name: 'instance-id-02', busy: false, status: 'online', os: 'linux', labels: [] }), + ]; const org = 'the-org'; it('finds on listGithubRunnersOrg, busy === true', async () => { @@ -1726,5 +1832,109 @@ describe('scale-down', () => { await expect(getGHRunnerOrg(ec2runner, metrics)).rejects.toThrow(RequestError); }); + + it('finds idle runner in listGithubRunnersOrg and double checks with enough API quota', async () => { + const mockedListGithubRunnersOrg = mocked(listGithubRunnersOrg); + const mockedGetRunnerOrg = mocked(getRunnerOrg); + const mockedGetGitHubRateLimit = mocked(getGitHubRateLimit); + const ec2runner: RunnerInfo = { + awsRegion: baseConfig.awsRegion, + org: org, + instanceId: 'instance-id-02', + runnerType: 'runnerType-01', + ghRunnerId: 'ghRunnerId-02', + }; + const directGhRunner = mockRunner({ + id: 202, + name: 'instance-id-02', + busy: true, + status: 'busy', + os: 'linux', + labels: [], + }); + + mockedListGithubRunnersOrg.mockResolvedValueOnce(ghRunners); + mockedGetGitHubRateLimit.mockResolvedValueOnce({ limit: 5000, remaining: 4000, used: 1000 }); + mockedGetRunnerOrg.mockResolvedValueOnce(directGhRunner); + + const result = await getGHRunnerOrg(ec2runner, metrics, true); + expect(result).toEqual(directGhRunner); + expect(result?.busy).toBe(true); + + expect(mockedListGithubRunnersOrg).toBeCalledTimes(1); + expect(mockedListGithubRunnersOrg).toBeCalledWith(org, metrics); + expect(mockedGetGitHubRateLimit).toBeCalledTimes(1); + expect(mockedGetGitHubRateLimit).toBeCalledWith({ owner: org, repo: '' }, metrics); + expect(mockedGetRunnerOrg).toBeCalledTimes(1); + expect(mockedGetRunnerOrg).toBeCalledWith(org, ec2runner.ghRunnerId, metrics); + }); + + it('finds idle runner in listGithubRunnersOrg but cannot double check due to low API quota', async () => { + const mockedListGithubRunnersOrg = mocked(listGithubRunnersOrg); + const mockedGetRunnerOrg = mocked(getRunnerOrg); + const mockedGetGitHubRateLimit = mocked(getGitHubRateLimit); + const ec2runner: RunnerInfo = { + awsRegion: baseConfig.awsRegion, + org: org, + instanceId: 'instance-id-02', + runnerType: 'runnerType-01', + ghRunnerId: 'ghRunnerId-02', + }; + + mockedListGithubRunnersOrg.mockResolvedValueOnce(ghRunners); + mockedGetGitHubRateLimit.mockResolvedValueOnce({ limit: 5000, remaining: 500, used: 4500 }); + + const result = await getGHRunnerOrg(ec2runner, metrics, true); + expect(result).toEqual({ ...ghRunners[1], busy: true }); + + expect(mockedListGithubRunnersOrg).toBeCalledTimes(1); + expect(mockedListGithubRunnersOrg).toBeCalledWith(org, metrics); + expect(mockedGetGitHubRateLimit).toBeCalledTimes(1); + expect(mockedGetGitHubRateLimit).toBeCalledWith({ owner: org, repo: '' }, metrics); + expect(mockedGetRunnerOrg).not.toBeCalled(); + }); + + it('finds idle runner in listGithubRunnersOrg and no double check with dontTrustIdleFromList=false', async () => { + const mockedListGithubRunnersOrg = mocked(listGithubRunnersOrg); + const mockedGetRunnerOrg = mocked(getRunnerOrg); + const mockedGetGitHubRateLimit = mocked(getGitHubRateLimit); + const ec2runner: RunnerInfo = { + awsRegion: baseConfig.awsRegion, + org: org, + instanceId: 'instance-id-02', + runnerType: 'runnerType-01', + ghRunnerId: 'ghRunnerId-02', + }; + + // Create special test runner list with a runner we know is not busy + const specialTestRunner = mockRunner({ + id: 202, + name: 'instance-id-02', + busy: false, + status: 'online', + os: 'linux', + labels: [], + }); + const testGhRunners = [ + mockRunner({ id: 201, name: 'instance-id-01', busy: true, status: 'online', os: 'linux', labels: [] }), + specialTestRunner, + ]; + + mockedListGithubRunnersOrg.mockResolvedValueOnce(testGhRunners); + + // Override the mock implementation directly + mockedListGithubRunnersOrg.mockImplementationOnce(() => { + return Promise.resolve(testGhRunners); + }); + + const result = await getGHRunnerOrg(ec2runner, metrics, false); + // Make sure the runner stayed not busy + expect(result?.busy).toBe(false); + + expect(mockedListGithubRunnersOrg).toBeCalledTimes(1); + expect(mockedListGithubRunnersOrg).toBeCalledWith(org, metrics); + expect(mockedGetGitHubRateLimit).not.toBeCalled(); + expect(mockedGetRunnerOrg).not.toBeCalled(); + }); }); }); diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.ts index 9fd1067b7e..f3f9592e63 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-down.ts @@ -2,6 +2,7 @@ import moment from 'moment'; import { Config } from './config'; import { resetSecretCache } from './gh-auth'; import { + getGitHubRateLimit, getRunnerOrg, getRunnerRepo, getRunnerTypes, @@ -66,7 +67,7 @@ export async function scaleDown(): Promise { // REPO assigned runners if (ec2runner.repo !== undefined) { foundRepos.add(ec2runner.repo); - const ghRunner = await getGHRunnerRepo(ec2runner, metrics); + const ghRunner = await getGHRunnerRepo(ec2runner, metrics, true); // if configured to repo, don't mess with organization runners if (!Config.Instance.enableOrganizationRunners) { metrics.runnerFound(ec2runner); @@ -81,7 +82,7 @@ export async function scaleDown(): Promise { // ORG assigned runners } else if (ec2runner.org !== undefined) { foundOrgs.add(ec2runner.org); - const ghRunner = await getGHRunnerOrg(ec2runner, metrics); + const ghRunner = await getGHRunnerOrg(ec2runner, metrics, true); // if configured to org, don't mess with repo runners if (Config.Instance.enableOrganizationRunners) { metrics.runnerFound(ec2runner); @@ -290,7 +291,11 @@ export async function cleanupOldSSMParameters(runnersRegions: Set, metri } } -export async function getGHRunnerOrg(ec2runner: RunnerInfo, metrics: ScaleDownMetrics): Promise { +export async function getGHRunnerOrg( + ec2runner: RunnerInfo, + metrics: ScaleDownMetrics, + dontTrustIdleFromList = true, +): Promise { const org = ec2runner.org as string; let ghRunner: GhRunner | undefined = undefined; @@ -304,25 +309,97 @@ export async function getGHRunnerOrg(ec2runner: RunnerInfo, metrics: ScaleDownMe } } - if (ghRunner === undefined && ec2runner.ghRunnerId !== undefined) { - console.warn( - `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) not found in ` + - `listGithubRunnersOrg call, attempting to grab directly`, + if (ghRunner === undefined) { + if (ec2runner.ghRunnerId === undefined) { + console.warn( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) was neither found in ` + + `the list of runners returned by the listGithubRunnersOrg api call, nor did it have the ` + + `GithubRunnerId EC2 tag set. This can happen if there's no runner running on the instance.`, + ); + } else { + console.warn( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) not found in ` + + `listGithubRunnersOrg call, attempting to grab directly`, + ); + try { + ghRunner = await getRunnerOrg(ec2runner.org as string, ec2runner.ghRunnerId, metrics); + } catch (e) { + console.warn( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) error when getRunnerOrg call: ${e}`, + ); + /* istanbul ignore next */ + if (isGHRateLimitError(e)) { + throw e; + } + } + } + } else if (dontTrustIdleFromList && !ghRunner.busy && ec2runner.ghRunnerId !== undefined) { + console.debug( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) was found in the list of runners,` + + ` it should not be busy, but the flag dontTrustIdleFromList is set to true, so we will not trust it ` + + `and try to grab it directly if we have GHA quotas for it.`, ); + + let safeToCallGHApi = false; try { - ghRunner = await getRunnerOrg(ec2runner.org as string, ec2runner.ghRunnerId, metrics); + const ghLimitInfo = await getGitHubRateLimit({ owner: org, repo: '' }, metrics); + metrics.gitHubRateLimitStats(ghLimitInfo.limit, ghLimitInfo.remaining, ghLimitInfo.used); + if (ghLimitInfo.remaining > ghLimitInfo.limit * 0.4) { + console.debug( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) - We have enough GHA API quotas` + + ` to call the API and double-check runner status by grabbing it directly. ` + + `Remaning: ${ghLimitInfo.remaining} / Limit: ${ghLimitInfo.limit} / Used: ${ghLimitInfo.used}`, + ); + safeToCallGHApi = true; + } else { + console.warn( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) - We DON'T have enough GHA API quotas` + + ` to call the API and double-check runner status by grabbing it directly. Assuming it is busy. Remaning: ` + + `${ghLimitInfo.remaining} / Limit: ${ghLimitInfo.limit} / Used: ${ghLimitInfo.used}`, + ); + } } catch (e) { - console.warn( - `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) error when ` + - `listGithubRunnersOrg call: ${e}`, - ); /* istanbul ignore next */ - if (isGHRateLimitError(e)) { - throw e; + console.error(`Error getting GitHub rate limit: ${e}`); + } + + if (safeToCallGHApi) { + try { + const ghRunnerDirect = await getRunnerOrg(org, ec2runner.ghRunnerId, metrics); + if (ghRunnerDirect !== undefined) { + ghRunner = ghRunnerDirect; + console.warn( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) - ` + + `Information grabbed directly. New status is: ${ghRunner.status} - ` + + `Busy: ${ghRunner.busy} - Labels: ${ghRunner.labels}`, + ); + } else { + console.error( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) - Information grabbed directly. ` + + `But it was not found in GHA API. This should NOT happen in a healthy GHA API status!. ` + + `Considering the runner busy, for now`, + ); + ghRunner.busy = true; + } + } catch (e) { + console.warn( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) error when getRunnerOrg call: ${e}`, + ); + /* istanbul ignore next */ + if (isGHRateLimitError(e)) { + throw e; + } } + } else { + console.error( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${org}) - Not safe to call GHA API due to lack ` + + `of quotas to retrieve runner information directly. We will consider the runner busy.`, + ); + ghRunner.busy = true; } } - if (ghRunner) { + + if (ghRunner !== undefined) { if (ghRunner.busy) { metrics.runnerGhFoundBusyOrg(org, ec2runner); } else { @@ -334,7 +411,11 @@ export async function getGHRunnerOrg(ec2runner: RunnerInfo, metrics: ScaleDownMe return ghRunner; } -export async function getGHRunnerRepo(ec2runner: RunnerInfo, metrics: ScaleDownMetrics): Promise { +export async function getGHRunnerRepo( + ec2runner: RunnerInfo, + metrics: ScaleDownMetrics, + dontTrustIdleFromList = true, +): Promise { const repo = getRepo(ec2runner.repo as string); let ghRunner: GhRunner | undefined = undefined; @@ -373,7 +454,72 @@ export async function getGHRunnerRepo(ec2runner: RunnerInfo, metrics: ScaleDownM } } } + } else if (dontTrustIdleFromList && !ghRunner.busy && ec2runner.ghRunnerId !== undefined) { + console.debug( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) was found in the list of runners,` + + ` it should not be busy, but the flag dontTrustIdleFromList is set to true, so we will not trust it ` + + `and try to grab it directly if we have GHA quotas for it.`, + ); + + let safeToCallGHApi = false; + try { + const ghLimitInfo = await getGitHubRateLimit(repo, metrics); + metrics.gitHubRateLimitStats(ghLimitInfo.limit, ghLimitInfo.remaining, ghLimitInfo.used); + if (ghLimitInfo.remaining > ghLimitInfo.limit * 0.4) { + console.debug( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) - We have enough GHA API quotas` + + ` to call the API and double-check runner status by grabbing it directly. ` + + `Remaning: ${ghLimitInfo.remaining} / Limit: ${ghLimitInfo.limit} / Used: ${ghLimitInfo.used}`, + ); + safeToCallGHApi = true; + } else { + console.warn( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) - We DON'T have enough GHA API quotas` + + ` to call the API and double-check runner status by grabbing it directly. Assuming it is busy. Remaning: ` + + `${ghLimitInfo.remaining} / Limit: ${ghLimitInfo.limit} / Used: ${ghLimitInfo.used}`, + ); + } + } catch (e) { + /* istanbul ignore next */ + console.error(`Error getting GitHub rate limit: ${e}`); + } + + if (safeToCallGHApi) { + try { + const ghRunnerDirect = await getRunnerRepo(repo, ec2runner.ghRunnerId, metrics); + if (ghRunnerDirect !== undefined) { + ghRunner = ghRunnerDirect; + console.warn( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) - ` + + `Information grabbed directly. New status is: ${ghRunner.status} - ` + + `Busy: ${ghRunner.busy} - Labels: ${ghRunner.labels}`, + ); + } else { + console.error( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) - Information grabbed directly. ` + + `But it was not found in GHA API. This should NOT happen in a healthy GHA API status!. ` + + `Considering the runner busy, for now`, + ); + ghRunner.busy = true; + } + } catch (e) { + console.warn( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) error when getRunnerRepo call: ${e}`, + ); + /* istanbul ignore next */ + if (isGHRateLimitError(e)) { + throw e; + } + } + } else { + console.error( + `Runner '${ec2runner.instanceId}' [${ec2runner.runnerType}](${repo}) - Not safe to call GHA API due to lack ` + + `of quotas to retrieve runner information directly. We will consider the runner busy.`, + ); + ghRunner.busy = true; + } } + if (ghRunner !== undefined) { if (ghRunner.busy) { metrics.runnerGhFoundBusyRepo(repo, ec2runner);