Skip to content

Commit ef7ee1e

Browse files
committed
add metrics and don't use sqs queue
1 parent 897c1cf commit ef7ee1e

File tree

3 files changed

+40
-9
lines changed

3 files changed

+40
-9
lines changed

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts

+22-2
Original file line numberDiff line numberDiff line change
@@ -1424,10 +1424,30 @@ export class ScaleDownMetrics extends Metrics {
14241424
}
14251425
}
14261426

1427-
export class ScaleUpChronMetrics extends Metrics {
1427+
export class ScaleUpChronMetrics extends ScaleUpMetrics {
14281428
constructor() {
1429-
super('scaleUpChron');
1429+
super();
14301430
}
1431+
queuedRunnerStats(org: string, runnerType: string, numQueuedJobs: number) {
1432+
const dimensions = new Map([['Org', org], ['RunnerType', runnerType], ['numQueuedJobs', numQueuedJobs.toString()]]);
1433+
this.addEntry('run.scaleupchron.queuedRunners', 3, dimensions);
1434+
}
1435+
queuedRunnerFailure(error: string) {
1436+
const dimensions = new Map([['error', error]]);
1437+
this.countEntry('run.scaleupchron.queuedRunners.failure', 1, dimensions);
1438+
}
1439+
1440+
scaleUpChronSuccess() {
1441+
this.scaleUpSuccess();
1442+
this.countEntry('run.scaleupchron.success');
1443+
}
1444+
scaleUpChronFailure(error:string) {
1445+
const dimensions = new Map([['error', error]]);
1446+
1447+
// TODO: decide whether to record more detail about this failure, or whether that is unnecessary because the request will be requeued.
1448+
this.countEntry('run.scaleupchron.failure', 1, dimensions);
1449+
}
1450+
14311451
}
14321452

14331453
export interface sendMetricsTimeoutVars {

terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts

+17-6
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,11 @@
11
import axios from 'axios';
22

33
import { Config } from './config';
4-
import { getRepo } from './utils';
4+
import { getRepo, shuffleArrayInPlace } from './utils';
55
import { ScaleUpChronMetrics } from './metrics';
66
import { getRunnerTypes } from './gh-runners';
77
import { sqsSendMessages } from './sqs';
8-
import { ActionRequestMessage } from './scale-up';
8+
import { ActionRequestMessage, scaleUp} from './scale-up';
99
import { randomUUID } from 'crypto';
1010

1111
export async function scaleUpChron(): Promise<void> {
@@ -14,12 +14,13 @@ export async function scaleUpChron(): Promise<void> {
1414
// 2. Polls scale-config to filter the list to ones that are self-hosted by this fleet and
1515
// are ephemeral
1616
// 3. Calls scaleUp directly for each request (in shuffled order) to provision more of those instances
17-
let queuedJobs = await getQueuedJobs();
17+
const metrics = new ScaleUpChronMetrics();
18+
19+
let queuedJobs = await getQueuedJobs(metrics);
1820

1921
const scaleConfigRepo = getRepo(Config.Instance.scaleConfigOrg, Config.Instance.scaleConfigRepo);
2022

2123

22-
const metrics = new ScaleUpChronMetrics();
2324
const validRunnerTypes = await getRunnerTypes(scaleConfigRepo, metrics);
2425

2526
const minAutoScaleupDelayMinutes = 30;
@@ -51,7 +52,15 @@ export async function scaleUpChron(): Promise<void> {
5152
throw new Error('scaleUpRecordQueueUrl is not set. Cannot send scale up requests');
5253
}
5354

54-
await sqsSendMessages(metrics, scaleUpRequests, Config.Instance.scaleUpRecordQueueUrl);
55+
for (const request of shuffleArrayInPlace(scaleUpRequests)) {
56+
try{
57+
await scaleUp("aws:sqs", request, metrics);
58+
metrics.scaleUpChronSuccess();
59+
60+
} catch (error) {
61+
metrics.scaleUpChronFailure((error as Error).message);
62+
}
63+
5564
}
5665

5766
class QueuedJobsForRunner {
@@ -72,7 +81,7 @@ class QueuedJobsForRunner {
7281
}
7382
}
7483

75-
export async function getQueuedJobs(): Promise<QueuedJobsForRunner[]> {
84+
export async function getQueuedJobs(metrics: ScaleUpChronMetrics): Promise<QueuedJobsForRunner[]> {
7685
// This function queries the HUD for queued runners
7786
// and returns a list of them
7887

@@ -83,6 +92,7 @@ export async function getQueuedJobs(): Promise<QueuedJobsForRunner[]> {
8392

8493
// Map the response to the class
8594
const queued_runners = response.data.map((runner: any) => {
95+
metrics.queuedRunnerStats(runner.org, runner.runner_label, runner.num_queued_jobs,);
8696
return new QueuedJobsForRunner(
8797
runner.runner_label,
8898
runner.org,
@@ -93,6 +103,7 @@ export async function getQueuedJobs(): Promise<QueuedJobsForRunner[]> {
93103
});
94104
return queued_runners;
95105
} catch (error) {
106+
metrics.queuedRunnerFailure((error as Error).message);
96107
console.error('Error fetching queued runners:', error);
97108
return [];
98109
}

terraform-aws-github-runner/modules/runners/variables.tf

+1-1
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ variable "scale_down_schedule_expression" {
9797
variable "scale_up_chron_schedule_expression" {
9898
description = "Scheduler expression for how often the scale-up chron check runs."
9999
type = string
100-
default = "cron(*/15 * * * ? *)" # every 15 minutes
100+
default = "cron(*/30 * * * ? *)" # every 30 minutes
101101
}
102102

103103
variable "minimum_running_time_in_minutes" {

0 commit comments

Comments
 (0)