From e68beca8a5a8cc6cc658e3c355ce4ea7a3d3d821 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Wed, 4 Dec 2024 12:22:47 -0600 Subject: [PATCH 01/16] more changes --- .../runners/lambdas/runners/jest.config.js | 5 +- .../runners/lambdas/runners/package.json | 1 + .../runners/lambdas/runners/src/lambda.ts | 15 ++ .../src/scale-runners/run-test.test.ts | 9 + .../src/scale-runners/scale-up-chron.test.ts | 18 ++ .../src/scale-runners/scale-up-chron.ts | 57 +++++++ .../runners/lambdas/runners/src/template.yml | 13 ++ .../modules/runners/lambdas/runners/yarn.lock | 28 +++ .../modules/runners/scale-up-chron.tf | 161 ++++++++++++++++++ .../modules/runners/variables.tf | 12 ++ 10 files changed, 318 insertions(+), 1 deletion(-) create mode 100644 terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/run-test.test.ts create mode 100644 terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts create mode 100644 terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts create mode 100644 terraform-aws-github-runner/modules/runners/lambdas/runners/src/template.yml create mode 100644 terraform-aws-github-runner/modules/runners/scale-up-chron.tf diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/jest.config.js b/terraform-aws-github-runner/modules/runners/lambdas/runners/jest.config.js index c474887a66..6d1916820c 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/jest.config.js +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/jest.config.js @@ -11,5 +11,8 @@ module.exports = { lines: 80, statements: 80 } - } + }, + moduleNameMapper: { + axios: 'axios/dist/node/axios.cjs', // Allow axios to work in tests + }, }; diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json b/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json index 6e5217a9ef..27976ad2c7 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json @@ -42,6 +42,7 @@ "@types/uuid": "^9.0.1", "async-mutex": "^0.4.0", "aws-sdk": "^2.863.0", + "axios": "^1.7.7", "cron-parser": "^3.3.0", "generic-pool": "^3.9.0", "lru-cache": "^6.0.0", diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts index acbb6e52b9..ceaf889056 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts @@ -5,6 +5,7 @@ import { Config } from './scale-runners/config'; import { ScaleUpMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './scale-runners/metrics'; import { getDelayWithJitterRetryCount, stochaticRunOvershoot } from './scale-runners/utils'; import { scaleDown as scaleDownR } from './scale-runners/scale-down'; +import { scaleUpChron as scaleUpChronR } from './scale-runners/scale-up-chron'; import { sqsSendMessages, sqsDeleteMessageBatch } from './scale-runners/sqs'; async function sendRetryEvents(evtFailed: Array<[SQSRecord, boolean, number]>, metrics: ScaleUpMetrics) { @@ -155,3 +156,17 @@ export async function scaleDown(event: ScheduledEvent, context: Context, callbac return callback('Failed'); } } + +// eslint-disable-next-line @typescript-eslint/no-explicit-any +export async function scaleUpChron(event: ScheduledEvent, context: 
Context, callback: any) { + // we maintain open connections to redis, so the event pool is only cleaned when the SIGTERM is sent + context.callbackWaitsForEmptyEventLoop = false; + + try { + await scaleUpChronR(); + return callback(null); + } catch (e) { + console.error(e); + return callback('Failed'); + } +} \ No newline at end of file diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/run-test.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/run-test.test.ts new file mode 100644 index 0000000000..d11b0dec8f --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/run-test.test.ts @@ -0,0 +1,9 @@ +// Import the required modules +import { getQueuedJobs } from './scale-up-chron'; + + +test('getQueuedRunners should fetch queued runners', async () => { + const runners = await getQueuedJobs(); + console.log('Queued Runners:', runners); + expect(runners).toBeDefined(); +}); \ No newline at end of file diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts new file mode 100644 index 0000000000..ac4157bbdb --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts @@ -0,0 +1,18 @@ +import moment from 'moment'; +import nock from 'nock'; +import { mocked } from 'ts-jest/utils'; +import { Config } from './config'; + +// Import the required modules +import { getQueuedJobs } from './scale-up-chron'; + + +describe('scaleUp', () => { + + it('getQueuedRunners should fetch queued runners', async () => { + const runners = await getQueuedJobs(); + console.log('Queued Runners:', runners); + expect(runners).toBeDefined(); + }); +}); + diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts new file mode 100644 index 0000000000..0733503e2f --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts @@ -0,0 +1,57 @@ +import axios from 'axios'; + +import { Config } from './config'; + +export async function scaleUpChron(): Promise { + // This function does the following: + // 1. Queries for queued runners via HUD + // 2. Polls scale-config to filter the list to ones that are self-hosted by this fleet and + // are ephemeral + // 3. 
Sends a SQS request to the scale-up lambda to provision more of those instances + + +} + +class QueuedJobsForRunner { + runner_label: string; + org: string; + full_repo: string; + num_queued_jobs: number; + min_queue_time_min: number; + max_queue_time_min: number; + + constructor(runner_label: string, org: string, full_repo: string, num_queued_jobs: number, min_queue_time_min: number, max_queue_time_min: number) { + this.runner_label = runner_label; + this.org = org; + this.full_repo = full_repo; + this.num_queued_jobs = num_queued_jobs; + this.min_queue_time_min = min_queue_time_min; + this.max_queue_time_min = max_queue_time_min; + } +} + +export async function getQueuedJobs(): Promise { + // This function queries the HUD for queued runners + // and returns a list of them + + const url = 'https://hud.pytorch.org/api/clickhouse/queued_jobs_aggregate?parameters=%5B%5D'; + + try { + const response = await axios.get(url); + + // Map the response to the class + const queued_runners = response.data.map((runner: any) => { + return new QueuedJobsForRunner( + runner.runner_label, + runner.org, + runner.full_repo, + runner.num_queued_jobs, + runner.min_queue_time_min, + runner.max_queue_time_min); + }); + return queued_runners; + } catch (error) { + console.error('Error fetching queued runners:', error); + return []; + } +} diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/template.yml b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/template.yml new file mode 100644 index 0000000000..fda647cd9e --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/template.yml @@ -0,0 +1,13 @@ +AWSTemplateFormatVersion: '2010-09-09' +Transform: 'AWS::Serverless-2016-10-31' +Resources: + ScaleUpChronFunction: + Type: 'AWS::Serverless::Function' + Properties: + Handler: index.scaleUpChron + Runtime: nodejs20.x + Events: + ScheduledEvent: + Type: Schedule + Properties: + Schedule: 'rate(1 minute)' \ No newline at end of file diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock b/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock index 7930c0018f..6ed2cc9b2c 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock @@ -1381,6 +1381,15 @@ aws-sdk@^2.863.0: uuid "8.0.0" xml2js "0.6.2" +axios@^1.7.7: + version "1.7.7" + resolved "https://registry.yarnpkg.com/axios/-/axios-1.7.7.tgz#2f554296f9892a72ac8d8e4c5b79c14a91d0a47f" + integrity sha512-S4kL7XrjgBmvdGut0sN3yJxqYzrDOnivkBiN0OFs6hLiUam3UPvswUo0kqGyhqUZGEOytHyumEdXsAkgCOUf3Q== + dependencies: + follow-redirects "^1.15.6" + form-data "^4.0.0" + proxy-from-env "^1.1.0" + babel-jest@^26.6.3: version "26.6.3" resolved "https://registry.yarnpkg.com/babel-jest/-/babel-jest-26.6.3.tgz#d87d25cb0037577a0c89f82e5755c5d293c01056" @@ -2326,6 +2335,11 @@ flatted@^3.2.9: resolved "https://registry.yarnpkg.com/flatted/-/flatted-3.3.1.tgz#21db470729a6734d4997002f439cb308987f567a" integrity sha512-X8cqMLLie7KsNUDSdzeN8FYK9rEt4Dt67OsG/DNGnYTSDBG4uFAJFBnUeiV+zCVAvwFy56IjM9sH51jVaEhNxw== +follow-redirects@^1.15.6: + version "1.15.9" + resolved "https://registry.yarnpkg.com/follow-redirects/-/follow-redirects-1.15.9.tgz#a604fa10e443bf98ca94228d9eebcc2e8a2c8ee1" + integrity sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ== + for-each@^0.3.3: version "0.3.3" resolved 
"https://registry.yarnpkg.com/for-each/-/for-each-0.3.3.tgz#69b447e88a0a5d32c3e7084f3f1710034b21376e" @@ -2347,6 +2361,15 @@ form-data@^3.0.0: combined-stream "^1.0.8" mime-types "^2.1.12" +form-data@^4.0.0: + version "4.0.1" + resolved "https://registry.yarnpkg.com/form-data/-/form-data-4.0.1.tgz#ba1076daaaa5bfd7e99c1a6cb02aa0a5cff90d48" + integrity sha512-tzN8e4TX8+kkxGPK8D5u0FNmjPUjw3lwC9lSLxxoB/+GtsJG91CO8bSWy73APlgAZzZbXEYZJuxjkHH2w+Ezhw== + dependencies: + asynckit "^0.4.0" + combined-stream "^1.0.8" + mime-types "^2.1.12" + fragment-cache@^0.2.1: version "0.2.1" resolved "https://registry.yarnpkg.com/fragment-cache/-/fragment-cache-0.2.1.tgz#4290fad27f13e89be7f33799c6bc5a0abfff0d19" @@ -4028,6 +4051,11 @@ propagate@^2.0.0: resolved "https://registry.yarnpkg.com/propagate/-/propagate-2.0.1.tgz#40cdedab18085c792334e64f0ac17256d38f9a45" integrity sha512-vGrhOavPSTz4QVNuBNdcNXePNdNMaO1xj9yBeH1ScQPjk/rhg9sSlCXPhMkFuaNNW/syTvYqsnbIJxMBfRbbag== +proxy-from-env@^1.1.0: + version "1.1.0" + resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.1.0.tgz#e102f16ca355424865755d2c9e8ea4f24d58c3e2" + integrity sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg== + psl@^1.1.33: version "1.9.0" resolved "https://registry.yarnpkg.com/psl/-/psl-1.9.0.tgz#d0df2a137f00794565fcaf3b2c00cd09f8d5a5a7" diff --git a/terraform-aws-github-runner/modules/runners/scale-up-chron.tf b/terraform-aws-github-runner/modules/runners/scale-up-chron.tf new file mode 100644 index 0000000000..a7222ce9b6 --- /dev/null +++ b/terraform-aws-github-runner/modules/runners/scale-up-chron.tf @@ -0,0 +1,161 @@ +resource "aws_kms_grant" "scale_up_chron" { + count = var.encryption.encrypt ? 1 : 0 + name = "${var.environment}-scale-up-chron" + key_id = var.encryption.kms_key_id + grantee_principal = aws_iam_role.scale_up_chron.arn + operations = ["Decrypt"] + + constraints { + encryption_context_equals = { + Environment = var.environment + } + } +} + +resource "aws_lambda_function" "scale_up_chron" { + s3_bucket = var.lambda_s3_bucket != null ? var.lambda_s3_bucket : null + s3_key = var.runners_lambda_s3_key != null ? var.runners_lambda_s3_key : null + s3_object_version = var.runners_lambda_s3_object_version != null ? var.runners_lambda_s3_object_version : null + filename = var.lambda_s3_bucket == null ? local.lambda_zip : null + source_code_hash = var.lambda_s3_bucket == null ? 
filebase64sha256(local.lambda_zip) : null + function_name = "${var.environment}-scale-up-chron" + role = aws_iam_role.scale_up_chron.arn + handler = "index.scaleUpChron" + runtime = "nodejs20.x" + timeout = var.lambda_timeout_scale_up_chron + tags = local.tags + memory_size = 2048 + + environment { + variables = { + AWS_REGION_INSTANCES = join(",", var.aws_region_instances) + DATETIME_DEPLOY = local.datetime_deploy + ENABLE_ORGANIZATION_RUNNERS = var.enable_organization_runners + ENVIRONMENT = var.environment + GHES_URL = var.ghes_url + GITHUB_APP_CLIENT_ID = var.github_app.client_id + GITHUB_APP_CLIENT_SECRET = var.github_app_client_secret + GITHUB_APP_ID = var.github_app.id + GITHUB_APP_KEY_BASE64 = var.github_app_key_base64 + KMS_KEY_ID = var.encryption.kms_key_id + LAMBDA_TIMEOUT = var.lambda_timeout_scale_up_chron + MIN_AVAILABLE_RUNNERS = var.min_available_runners + MINIMUM_RUNNING_TIME_IN_MINUTES = var.minimum_running_time_in_minutes + REDIS_ENDPOINT = var.redis_endpoint + REDIS_LOGIN = var.redis_login + SCALE_CONFIG_REPO = var.scale_config_repo + SCALE_CONFIG_REPO_PATH = var.scale_config_repo_path + scale_up_chron_CONFIG = jsonencode(var.idle_config) + SECRETSMANAGER_SECRETS_ID = var.secretsmanager_secrets_id + AWS_REGIONS_TO_VPC_IDS = join( + ",", + sort(distinct([ + for region_vpc in var.vpc_ids : + format("%s|%s", region_vpc.region, region_vpc.vpc) + ])) + ) + VPC_ID_TO_SECURITY_GROUP_IDS = join( + ",", + sort(distinct(concat( + [ + for vpc in var.vpc_ids : + format( + "%s|%s", + vpc.vpc, + var.runners_security_group_ids[local.vpc_id_to_idx[vpc.vpc]] + ) + ], + [ + for vpc_subnet in var.vpc_sgs : + format("%s|%s", vpc_subnet.vpc, vpc_subnet.sg) + ] + ))) + ) + VPC_ID_TO_SUBNET_IDS = join( + ",", + sort(distinct([ + for vpc_subnet in var.subnet_vpc_ids : + format("%s|%s", vpc_subnet.vpc, vpc_subnet.subnet) + ])) + ) + SUBNET_ID_TO_AZ = join( + ",", + sort(distinct([ + for subnet_az in var.subnet_azs : + format("%s|%s", subnet_az.subnet, subnet_az.az) + ])) + ) + } + } + + vpc_config { + security_group_ids = concat( + var.lambda_security_group_ids, + [var.runners_security_group_ids[0]] + ) + subnet_ids = var.lambda_subnet_ids + } +} + +resource "aws_cloudwatch_log_group" "scale_up_chron" { + name = "/aws/lambda/${aws_lambda_function.scale_up_chron.function_name}" + retention_in_days = var.logging_retention_in_days + tags = var.tags +} + +resource "aws_cloudwatch_event_rule" "scale_up_chron" { + name = "${var.environment}-scale-up-chron-rule" + schedule_expression = var.scale_up_chron_schedule_expression + tags = var.tags +} + +resource "aws_cloudwatch_event_target" "scale_up_chron" { + rule = aws_cloudwatch_event_rule.scale_up_chron.name + arn = aws_lambda_function.scale_up_chron.arn +} + +resource "aws_lambda_permission" "scale_up_chron" { + statement_id = "AllowExecutionFromCloudWatch" + action = "lambda:InvokeFunction" + function_name = aws_lambda_function.scale_up_chron.function_name + principal = "events.amazonaws.com" + source_arn = aws_cloudwatch_event_rule.scale_up_chron.arn +} + +resource "aws_iam_role" "scale_up_chron" { + name = "${var.environment}-action-scale-up-chron-lambda-role" + assume_role_policy = data.aws_iam_policy_document.lambda_assume_role_policy.json + path = local.role_path + permissions_boundary = var.role_permissions_boundary + tags = local.tags +} + +resource "aws_iam_role_policy" "scale_up_chron" { + name = "${var.environment}-lambda-scale-up-chron-policy" + role = aws_iam_role.scale_up_chron.name + policy = 
templatefile("${path.module}/policies/lambda-scale-up-chron.json", { + arn_ssm_parameters = "arn:aws:ssm:${var.aws_region}:${data.aws_caller_identity.current.account_id}:parameter/${var.environment}-*" + }) +} + +resource "aws_iam_role_policy" "scale_up_chron_logging" { + name = "${var.environment}-lambda-logging" + role = aws_iam_role.scale_up_chron.name + policy = templatefile("${path.module}/policies/lambda-cloudwatch.json", { + log_group_arn = aws_cloudwatch_log_group.scale_up_chron.arn + }) +} + +resource "aws_iam_role_policy_attachment" "scale_up_chron_vpc_execution_role" { + count = length(var.lambda_subnet_ids) > 0 ? 1 : 0 + role = aws_iam_role.scale_up_chron.name + policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole" +} + +resource "aws_iam_role_policy" "scale_up_chron_secretsmanager_access" { + count = var.secretsmanager_secrets_id != null ? 1 : 0 + role = aws_iam_role.scale_up_chron.name + policy = templatefile("${path.module}/policies/lambda-secretsmanager.json", { + secretsmanager_arn = data.aws_secretsmanager_secret_version.app_creds.arn + }) +} diff --git a/terraform-aws-github-runner/modules/runners/variables.tf b/terraform-aws-github-runner/modules/runners/variables.tf index 1387af2a47..d9a665e345 100644 --- a/terraform-aws-github-runner/modules/runners/variables.tf +++ b/terraform-aws-github-runner/modules/runners/variables.tf @@ -94,6 +94,12 @@ variable "scale_down_schedule_expression" { default = "cron(*/5 * * * ? *)" } +variable "scale_up_chron_schedule_expression" { + description = "Scheduler expression to check every x for scale down." + type = string + default = "cron(*/15 * * * ? *)" # every 15 minutes +} + variable "minimum_running_time_in_minutes" { description = "The time an ec2 action runner should be running at minimum before terminated if non busy." type = number @@ -112,6 +118,12 @@ variable "lambda_timeout_scale_up" { default = 60 } +variable "lambda_timeout_scale_up_chron" { + description = "Time out for the scale up chron lambda in seconds." + type = number + default = 60 +} + variable "role_permissions_boundary" { description = "Permissions boundary that will be added to the created role for the lambda." 
type = string From 6bf68968d83094be5c29e7b06fc54eb0e37fdb11 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 5 Dec 2024 16:01:18 -0600 Subject: [PATCH 02/16] temp changes --- terraform-aws-github-runner/main.tf | 1 + .../runners/src/scale-runners/config.ts | 4 ++ .../runners/src/scale-runners/metrics.ts | 6 +++ .../src/scale-runners/scale-up-chron.ts | 37 +++++++++++++++++++ .../modules/runners/scale-down.tf | 1 + .../modules/runners/scale-up-chron.tf | 2 + .../modules/runners/scale-up.tf | 1 + .../modules/runners/variables.tf | 5 +++ terraform-aws-github-runner/variables.tf | 5 +++ 9 files changed, 62 insertions(+) diff --git a/terraform-aws-github-runner/main.tf b/terraform-aws-github-runner/main.tf index 094389bb06..b32cc3ffb7 100644 --- a/terraform-aws-github-runner/main.tf +++ b/terraform-aws-github-runner/main.tf @@ -104,6 +104,7 @@ module "runners" { environment = var.environment tags = local.tags + scale_config_org = var.scale_config_org scale_config_repo = var.scale_config_repo scale_config_repo_path = var.scale_config_repo_path diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts index 0755e86970..4707914e91 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts @@ -36,8 +36,10 @@ export class Config { readonly retryScaleUpRecordQueueUrl: string | undefined; readonly runnerGroupName: string | undefined; readonly runnersExtraLabels: undefined | string; + readonly scaleConfigOrg: string; readonly scaleConfigRepo: string; readonly scaleConfigRepoPath: string; + readonly scaleUpRecordQueueUrl: string | undefined; readonly secretsManagerSecretsId: string | undefined; readonly sSMParamCleanupAgeDays: number; readonly sSMParamMaxCleanupAllowance: number; @@ -94,8 +96,10 @@ export class Config { /* istanbul ignore next */ this.retryScaleUpRecordJitterPct = Number(process.env.RETRY_SCALE_UP_RECORD_JITTER_PCT || '0'); this.retryScaleUpRecordQueueUrl = process.env.RETRY_SCALE_UP_RECORD_QUEUE_URL; + this.scaleUpRecordQueueUrl = process.env.SCALE_UP_RECORD_QUEUE_URL; this.runnerGroupName = process.env.RUNNER_GROUP_NAME; this.runnersExtraLabels = process.env.RUNNER_EXTRA_LABELS; + this.scaleConfigOrg = process.env.SCALE_CONFIG_ORG || ''; /* istanbul ignore next */ this.scaleConfigRepo = process.env.SCALE_CONFIG_REPO || ''; if (this.enableOrganizationRunners && !this.scaleConfigRepo) { diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts index 64c5998919..4c6ea88ea8 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts @@ -1424,6 +1424,12 @@ export class ScaleDownMetrics extends Metrics { } } +export class ScaleUpChronMetrics extends Metrics { + constructor() { + super('scaleUpChron'); + } +} + export interface sendMetricsTimeoutVars { metrics?: Metrics; setTimeout?: ReturnType; diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts index 0733503e2f..9a5a834a5e 100644 --- 
a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts @@ -1,6 +1,12 @@ import axios from 'axios'; import { Config } from './config'; +import { getRepo } from './utils'; +import { ScaleUpChronMetrics } from './metrics'; +import { getRunnerTypes } from './gh-runners'; +import { sqsSendMessages } from './sqs'; +import { ActionRequestMessage } from './scale-up'; +import { randomUUID } from 'crypto'; export async function scaleUpChron(): Promise { // This function does the following: @@ -8,8 +14,39 @@ export async function scaleUpChron(): Promise { // 2. Polls scale-config to filter the list to ones that are self-hosted by this fleet and // are ephemeral // 3. Sends a SQS request to the scale-up lambda to provision more of those instances + let queuedJobs = await getQueuedJobs(); + const scaleConfigRepo = getRepo(Config.Instance.scaleConfigOrg, Config.Instance.scaleConfigRepo); + + const metrics = new ScaleUpChronMetrics(); + const validRunnerTypes = await getRunnerTypes(scaleConfigRepo, metrics); + + const minAutoScaleupDelayMinutes = 30; + // Only proactively scale up the jobs that have been queued for longer than normal + queuedJobs = queuedJobs.filter((runner) => { + return runner.min_queue_time_min >= minAutoScaleupDelayMinutes && + runner.org === Config.Instance.scaleConfigOrg; + }); + + // Filter out the queued jobs that are do not correspond to a valid runner type + queuedJobs = queuedJobs.filter((requested_runner) => { + return Array.from(validRunnerTypes.keys()).some((available_runner_label) => { + return available_runner_label === requested_runner.runner_label; + }); + }); + + // Send a message to the SQS queue to scale up the runners + let scaleUpRequests : Array = queuedJobs.map((runner) => { + return { + "id": Math.floor(Math.random() * 100000000000000), + "eventType": "workflow_job", + "repositoryName": runner.full_repo.split('/')[1], + "repositoryOwner": runner.org, + } + } + + sqsSendMessages(metrics, queuedJobs, Config.Instance.scaleUpRecordQueueUrl); } class QueuedJobsForRunner { diff --git a/terraform-aws-github-runner/modules/runners/scale-down.tf b/terraform-aws-github-runner/modules/runners/scale-down.tf index 9daa764b2e..6398555085 100644 --- a/terraform-aws-github-runner/modules/runners/scale-down.tf +++ b/terraform-aws-github-runner/modules/runners/scale-down.tf @@ -53,6 +53,7 @@ resource "aws_lambda_function" "scale_down" { MINIMUM_RUNNING_TIME_IN_MINUTES = var.minimum_running_time_in_minutes REDIS_ENDPOINT = var.redis_endpoint REDIS_LOGIN = var.redis_login + SCALE_CONFIG_ORG = var.scale_config_org SCALE_CONFIG_REPO = var.scale_config_repo SCALE_CONFIG_REPO_PATH = var.scale_config_repo_path SCALE_DOWN_CONFIG = jsonencode(var.idle_config) diff --git a/terraform-aws-github-runner/modules/runners/scale-up-chron.tf b/terraform-aws-github-runner/modules/runners/scale-up-chron.tf index a7222ce9b6..4a21499145 100644 --- a/terraform-aws-github-runner/modules/runners/scale-up-chron.tf +++ b/terraform-aws-github-runner/modules/runners/scale-up-chron.tf @@ -43,8 +43,10 @@ resource "aws_lambda_function" "scale_up_chron" { MINIMUM_RUNNING_TIME_IN_MINUTES = var.minimum_running_time_in_minutes REDIS_ENDPOINT = var.redis_endpoint REDIS_LOGIN = var.redis_login + SCALE_CONFIG_ORG = var.scale_config_org SCALE_CONFIG_REPO = var.scale_config_repo SCALE_CONFIG_REPO_PATH = var.scale_config_repo_path + SCALE_UP_RECORD_QUEUE_URL = 
var.sqs_build_queue.url scale_up_chron_CONFIG = jsonencode(var.idle_config) SECRETSMANAGER_SECRETS_ID = var.secretsmanager_secrets_id AWS_REGIONS_TO_VPC_IDS = join( diff --git a/terraform-aws-github-runner/modules/runners/scale-up.tf b/terraform-aws-github-runner/modules/runners/scale-up.tf index 8a47122534..36d9cd821a 100644 --- a/terraform-aws-github-runner/modules/runners/scale-up.tf +++ b/terraform-aws-github-runner/modules/runners/scale-up.tf @@ -67,6 +67,7 @@ resource "aws_lambda_function" "scale_up" { RETRY_SCALE_UP_RECORD_JITTER_PCT = "0.5" RETRY_SCALE_UP_RECORD_QUEUE_URL = var.sqs_build_queue_retry.url RUNNER_EXTRA_LABELS = var.runner_extra_labels + SCALE_CONFIG_ORG = var.scale_config_org SCALE_CONFIG_REPO = var.scale_config_repo SCALE_CONFIG_REPO_PATH = var.scale_config_repo_path SECRETSMANAGER_SECRETS_ID = var.secretsmanager_secrets_id diff --git a/terraform-aws-github-runner/modules/runners/variables.tf b/terraform-aws-github-runner/modules/runners/variables.tf index d9a665e345..dfeaeab168 100644 --- a/terraform-aws-github-runner/modules/runners/variables.tf +++ b/terraform-aws-github-runner/modules/runners/variables.tf @@ -297,6 +297,11 @@ variable "role_runner_arn" { type = string } +variable "scale_config_org" { + description = "Organization to fetch scale config from." + type = string +} + variable "scale_config_repo" { description = "Repository to fetch scale config from." default = "" diff --git a/terraform-aws-github-runner/variables.tf b/terraform-aws-github-runner/variables.tf index b99af920cf..603a02e9ca 100644 --- a/terraform-aws-github-runner/variables.tf +++ b/terraform-aws-github-runner/variables.tf @@ -345,6 +345,11 @@ variable "cant_have_issues_labels" { default = [] } +variable "scale_config_org" { + description = "Organization to fetch scale config from." + type = string +} + variable "scale_config_repo" { description = "Repository to fetch scale config from. 
Optional if `enable_organization_runners` is set to false, in which case the job's repo will be used" default = "" From 897c1cfbd16b18a421699d3841c2c8fea7f754bb Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 5 Dec 2024 17:05:15 -0600 Subject: [PATCH 03/16] rename columns --- .../src/scale-runners/scale-up-chron.ts | 33 +++++++++++-------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts index 9a5a834a5e..f075cd5866 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts @@ -25,7 +25,7 @@ export async function scaleUpChron(): Promise { const minAutoScaleupDelayMinutes = 30; // Only proactively scale up the jobs that have been queued for longer than normal queuedJobs = queuedJobs.filter((runner) => { - return runner.min_queue_time_min >= minAutoScaleupDelayMinutes && + return runner.min_queue_time_minutes >= minAutoScaleupDelayMinutes && runner.org === Config.Instance.scaleConfigOrg; }); @@ -41,29 +41,34 @@ export async function scaleUpChron(): Promise { return { "id": Math.floor(Math.random() * 100000000000000), "eventType": "workflow_job", - "repositoryName": runner.full_repo.split('/')[1], + "repositoryName": runner.repo, "repositoryOwner": runner.org, - } + "runnerLabels": [runner.runner_label], + }; + }); + + if (!Config.Instance.scaleUpRecordQueueUrl) { + throw new Error('scaleUpRecordQueueUrl is not set. Cannot send scale up requests'); } - sqsSendMessages(metrics, queuedJobs, Config.Instance.scaleUpRecordQueueUrl); + await sqsSendMessages(metrics, scaleUpRequests, Config.Instance.scaleUpRecordQueueUrl); } class QueuedJobsForRunner { runner_label: string; org: string; - full_repo: string; + repo: string; num_queued_jobs: number; - min_queue_time_min: number; - max_queue_time_min: number; + min_queue_time_minutes: number; + max_queue_time_minutes: number; - constructor(runner_label: string, org: string, full_repo: string, num_queued_jobs: number, min_queue_time_min: number, max_queue_time_min: number) { + constructor(runner_label: string, org: string, repo: string, num_queued_jobs: number, min_queue_time_minutes: number, max_queue_time_minutes: number) { this.runner_label = runner_label; this.org = org; - this.full_repo = full_repo; + this.repo = repo; this.num_queued_jobs = num_queued_jobs; - this.min_queue_time_min = min_queue_time_min; - this.max_queue_time_min = max_queue_time_min; + this.min_queue_time_minutes = min_queue_time_minutes; + this.max_queue_time_minutes = max_queue_time_minutes; } } @@ -81,10 +86,10 @@ export async function getQueuedJobs(): Promise { return new QueuedJobsForRunner( runner.runner_label, runner.org, - runner.full_repo, + runner.repo, runner.num_queued_jobs, - runner.min_queue_time_min, - runner.max_queue_time_min); + runner.min_queue_time_minutes, + runner.max_queue_time_minutes); }); return queued_runners; } catch (error) { From ef7ee1ebfa92d8f0bccdda8ec23026702d662869 Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Tue, 4 Mar 2025 16:10:07 -0800 Subject: [PATCH 04/16] add metrics and don't use sqs queue --- .../runners/src/scale-runners/metrics.ts | 24 +++++++++++++++++-- .../src/scale-runners/scale-up-chron.ts | 23 +++++++++++++----- .../modules/runners/variables.tf | 2 +- 3 files 
changed, 40 insertions(+), 9 deletions(-) diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts index 4c6ea88ea8..afcd3b849e 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts @@ -1424,10 +1424,30 @@ export class ScaleDownMetrics extends Metrics { } } -export class ScaleUpChronMetrics extends Metrics { +export class ScaleUpChronMetrics extends ScaleUpMetrics { constructor() { - super('scaleUpChron'); + super(); } + queuedRunnerStats(org: string, runnerType: string, numQueuedJobs: number) { + const dimensions = new Map([['Org', org], ['RunnerType', runnerType], ['numQueuedJobs', numQueuedJobs.toString()]]); + this.addEntry('run.scaleupchron.queuedRunners', 3, dimensions); + } + queuedRunnerFailure(error: string) { + const dimensions = new Map([['error', error]]); + this.countEntry('run.scaleupchron.queuedRunners.failure', 1, dimensions); + } + + scaleUpChronSuccess() { + this.scaleUpSuccess(); + this.countEntry('run.scaleupchron.success'); + } + scaleUpChronFailure(error:string) { + const dimensions = new Map([['error', error]]); + + // should we add more information about this or do we not care since it'll be requeued? + this.countEntry('run.scaleupchron.failure', 1, dimensions); + } + } export interface sendMetricsTimeoutVars { diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts index f075cd5866..9d0f5d8fe0 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts @@ -1,11 +1,11 @@ import axios from 'axios'; import { Config } from './config'; -import { getRepo } from './utils'; +import { getRepo, shuffleArrayInPlace } from './utils'; import { ScaleUpChronMetrics } from './metrics'; import { getRunnerTypes } from './gh-runners'; import { sqsSendMessages } from './sqs'; -import { ActionRequestMessage } from './scale-up'; +import { ActionRequestMessage, scaleUp} from './scale-up'; import { randomUUID } from 'crypto'; export async function scaleUpChron(): Promise { @@ -14,12 +14,13 @@ export async function scaleUpChron(): Promise { // 2. Polls scale-config to filter the list to ones that are self-hosted by this fleet and // are ephemeral // 3. Sends a SQS request to the scale-up lambda to provision more of those instances - let queuedJobs = await getQueuedJobs(); + const metrics = new ScaleUpChronMetrics(); + + let queuedJobs = await getQueuedJobs(metrics); const scaleConfigRepo = getRepo(Config.Instance.scaleConfigOrg, Config.Instance.scaleConfigRepo); - const metrics = new ScaleUpChronMetrics(); const validRunnerTypes = await getRunnerTypes(scaleConfigRepo, metrics); const minAutoScaleupDelayMinutes = 30; @@ -51,7 +52,15 @@ export async function scaleUpChron(): Promise { throw new Error('scaleUpRecordQueueUrl is not set. 
Cannot send scale up requests'); } - await sqsSendMessages(metrics, scaleUpRequests, Config.Instance.scaleUpRecordQueueUrl); + for (const request of shuffleArrayInPlace(scaleUpRequests)) { + try{ + await scaleUp("aws:sqs", request, metrics); + metrics.scaleUpChronSuccess(); + + } catch (error) { + metrics.scaleUpChronFailure((error as Error).message); + } + } class QueuedJobsForRunner { diff --git a/terraform-aws-github-runner/modules/runners/variables.tf b/terraform-aws-github-runner/modules/runners/variables.tf index dfeaeab168..67d1739b8f 100644 --- a/terraform-aws-github-runner/modules/runners/variables.tf +++ b/terraform-aws-github-runner/modules/runners/variables.tf @@ -97,7 +97,7 @@ variable "scale_down_schedule_expression" { variable "scale_up_chron_schedule_expression" { description = "Scheduler expression for how often to run the scale-up chron lambda." type = string - default = "cron(*/15 * * * ? *)" # every 15 minutes + default = "cron(*/30 * * * ? *)" # every 30 minutes } variable "minimum_running_time_in_minutes" { From bd845249c2572677839c68af9f4bf2b4a3ded2d5 Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Tue, 4 Mar 2025 16:26:41 -0800 Subject: [PATCH 05/16] set timeout --- .../runners/lambdas/runners/src/lambda.ts | 24 ++++++++++++++++--- .../src/scale-runners/scale-up-chron.test.ts | 1 - .../src/scale-runners/scale-up-chron.ts | 3 +-- 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts index ceaf889056..61beb786bd 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts @@ -2,7 +2,7 @@ import { ActionRequestMessage, RetryableScalingError, scaleUp as scaleUpR } from import { Context, SQSEvent, SQSRecord, ScheduledEvent } from 'aws-lambda'; import { Config } from './scale-runners/config'; -import { ScaleUpMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './scale-runners/metrics'; +import { ScaleUpMetrics, ScaleUpChronMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './scale-runners/metrics'; import { getDelayWithJitterRetryCount, stochaticRunOvershoot } from './scale-runners/utils'; import { scaleDown as scaleDownR } from './scale-runners/scale-down'; import { scaleUpChron as scaleUpChronR } from './scale-runners/scale-up-chron'; @@ -162,11 +162,29 @@ export async function scaleUpChron(event: ScheduledEvent, context: Context, call // we maintain open connections to redis, so the event pool is only cleaned when the SIGTERM is sent context.callbackWaitsForEmptyEventLoop = false; + const metrics = new 
ScaleUpChronMetrics(); + const sndMetricsTimout: sendMetricsTimeoutVars = { + metrics: metrics, + }; + sndMetricsTimout.setTimeout = setTimeout( + sendMetricsAtTimeout(sndMetricsTimout), + (Config.Instance.lambdaTimeout - 10) * 1000, + ); + try { - await scaleUpChronR(); + await scaleUpChronR(metrics); return callback(null); } catch (e) { console.error(e); return callback('Failed'); + } finally { + try { + clearTimeout(sndMetricsTimout.setTimeout); + sndMetricsTimout.metrics = undefined; + sndMetricsTimout.setTimeout = undefined; + await metrics.sendMetrics(); + } catch (e) { + console.error(`Error sending metrics: ${e}`); + } } -} \ No newline at end of file +} diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts index ac4157bbdb..5746d70978 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts @@ -15,4 +15,3 @@ describe('scaleUp', () => { expect(runners).toBeDefined(); }); }); - diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts index 9d0f5d8fe0..8ade39cefe 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts @@ -8,13 +8,12 @@ import { sqsSendMessages } from './sqs'; import { ActionRequestMessage, scaleUp} from './scale-up'; import { randomUUID } from 'crypto'; -export async function scaleUpChron(): Promise { +export async function scaleUpChron(metrics: ScaleUpChronMetrics): Promise { // This function does the following: // 1. Queries for queued runners via HUD // 2. Polls scale-config to filter the list to ones that are self-hosted by this fleet and // are ephemeral // 3. 
Sends a SQS request to the scale-up lambda to provision more of those instances - const metrics = new ScaleUpChronMetrics(); let queuedJobs = await getQueuedJobs(metrics); From 818bf441ad36213c3b638de2dbe984ef94882993 Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Wed, 5 Mar 2025 17:43:58 -0800 Subject: [PATCH 06/16] add tests --- .../runners/src/scale-runners/config.ts | 2 + .../runners/src/scale-runners/metrics.ts | 33 +++- .../src/scale-runners/run-test.test.ts | 9 -- .../src/scale-runners/scale-up-chron.test.ts | 149 +++++++++++++++++- .../src/scale-runners/scale-up-chron.ts | 81 +++++----- .../modules/runners/scale-up-chron.tf | 3 +- 6 files changed, 215 insertions(+), 62 deletions(-) delete mode 100644 terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/run-test.test.ts diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts index 4707914e91..59d9c4c2b3 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts @@ -39,6 +39,7 @@ export class Config { readonly scaleConfigOrg: string; readonly scaleConfigRepo: string; readonly scaleConfigRepoPath: string; + readonly scaleUpMinQueueTimeMinutes: number; readonly scaleUpRecordQueueUrl: string | undefined; readonly secretsManagerSecretsId: string | undefined; readonly sSMParamCleanupAgeDays: number; @@ -97,6 +98,7 @@ export class Config { this.retryScaleUpRecordJitterPct = Number(process.env.RETRY_SCALE_UP_RECORD_JITTER_PCT || '0'); this.retryScaleUpRecordQueueUrl = process.env.RETRY_SCALE_UP_RECORD_QUEUE_URL; this.scaleUpRecordQueueUrl = process.env.SCALE_UP_RECORD_QUEUE_URL; + this.scaleUpMinQueueTimeMinutes = process.env.SCALE_UP_MIN_QUEUE_TIME_MINUTES ? 
Number(process.env.SCALE_UP_MIN_QUEUE_TIME_MINUTES) : 30 this.runnerGroupName = process.env.RUNNER_GROUP_NAME; this.runnersExtraLabels = process.env.RUNNER_EXTRA_LABELS; this.scaleConfigOrg = process.env.SCALE_CONFIG_ORG || ''; diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts index afcd3b849e..4981874431 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts @@ -1430,22 +1430,45 @@ export class ScaleUpChronMetrics extends ScaleUpMetrics { } queuedRunnerStats(org: string, runnerType: string, numQueuedJobs: number) { const dimensions = new Map([['Org', org], ['RunnerType', runnerType], ['numQueuedJobs', numQueuedJobs.toString()]]); - this.addEntry('run.scaleupchron.queuedRunners', 3, dimensions); + this.addEntry('gh.scaleupchron.queuedRunners', 3, dimensions); } queuedRunnerFailure(error: string) { const dimensions = new Map([['error', error]]); - this.countEntry('run.scaleupchron.queuedRunners.failure', 1, dimensions); + this.countEntry('gh.scaleupchron.queuedRunners.failure', 1, dimensions); + } + /* istanbul ignore next */ + getQueuedJobsEndpointSuccess(ms: number) { + this.countEntry(`gh.calls.total`, 1); + this.countEntry(`gh.calls.getQueuedJobsEndpoint.count`, 1); + this.countEntry(`gh.calls.getQueuedJobsEndpoint.success`, 1); + this.addEntry(`gh.calls.getQueuedJobsEndpoint.wallclock`, ms); } - scaleUpChronSuccess() { + /* istanbul ignore next */ + getQueuedJobsEndpointFailure(ms: number) { + this.countEntry(`gh.calls.total`, 1); + this.countEntry(`gh.calls.getQueuedJobsEndpoint.count`, 1); + this.countEntry(`gh.calls.getQueuedJobsEndpoint.failure`, 1); + this.addEntry(`gh.calls.getQueuedJobsEndpoint.wallclock`, ms); + } + + scaleUpInstanceSuccess() { this.scaleUpSuccess(); this.countEntry('run.scaleupchron.success'); } - scaleUpChronFailure(error:string) { + scaleUpInstanceFailureNonRetryable(error:string) { + const dimensions = new Map([['error', error]]); + // should we add more information about this or do we not care since it'll be requeued? + this.countEntry('run.scaleupchron.failure.nonRetryable', 1, dimensions); + } + scaleUpInstanceFailureRetryable(error:string) { const dimensions = new Map([['error', error]]); // should we add more information about this or do we not care since it'll be requeued? 
- this.countEntry('run.scaleupchron.failure', 1, dimensions); + this.countEntry('run.scaleupchron.failure.retryable', 1, dimensions); + } + scaleUpInstanceNoOp() { + this.countEntry('run.scaleupchron.noop'); } } diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/run-test.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/run-test.test.ts deleted file mode 100644 index d11b0dec8f..0000000000 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/run-test.test.ts +++ /dev/null @@ -1,9 +0,0 @@ -// Import the required modules -import { getQueuedJobs } from './scale-up-chron'; - - -test('getQueuedRunners should fetch queued runners', async () => { - const runners = await getQueuedJobs(); - console.log('Queued Runners:', runners); - expect(runners).toBeDefined(); -}); \ No newline at end of file diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts index 5746d70978..432a1cea35 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts @@ -1,13 +1,102 @@ -import moment from 'moment'; -import nock from 'nock'; -import { mocked } from 'ts-jest/utils'; +import { createRunner } from './runners'; +import { + createRegistrationTokenOrg, + createRegistrationTokenRepo, + getGitHubRateLimit, + getRunnerTypes, + listGithubRunnersOrg, + listGithubRunnersRepo, +} from './gh-runners'; + import { Config } from './config'; +import { getRepoIssuesWithLabel, GhIssues } from './gh-issues'; +import { mocked } from 'ts-jest/utils'; +import nock from 'nock'; +import { scaleUp, _calculateScaleUpAmount } from './scale-up'; +import { scaleUpChron, getQueuedJobs } from './scale-up-chron'; + +import * as MetricsModule from './metrics'; + +jest.mock('./runners'); +jest.mock('./gh-runners'); +jest.mock('./gh-issues'); + // Import the required modules import { getQueuedJobs } from './scale-up-chron'; +const metrics = new MetricsModule.ScaleUpChronMetrics(); + +describe('scaleUpChron', () => { + beforeEach(() => { + const mockedGetRepo = mocked(getRepo).mockReturnValue('repo'); + const mockedvalidRunnerTypes = mocked(getRunnerTypes).mockResolvedValue( + new Map([ + [ + 'label1', + { + instance_type: 'instance_type', + os: 'os', + max_available: 33, + disk_size: 113, + runnerTypeName: 'runnerTypeName', + is_ephemeral: false, + }, + ], + ]), + ); + }); + + + const minAutoScaleupDelayMinutes = Config.Instance.scaleUpMinQueueTimeMinutes; + if (!Config.Instance.scaleUpRecordQueueUrl) { + metrics.scaleUpInstanceFailureNonRetryable('scaleUpRecordQueueUrl is not set. Cannot send queued scale up requests'); + throw new Error('scaleUpRecordQueueUrl is not set. Cannot send queued scale up requests'); + } + + it('invalid scaleUpRecordQueueUrl', async () => { + jest.spyOn(Config, 'Instance', 'get').mockReturnValue(null) + expect(await scaleUpChron(metrics)).rejects.toThrow('scaleUpRecordQueueUrl is not set. 
Cannot send queued scale up requests'); + }); + + it('queued jobs do not match available runners', async () => { + jest.spyOn(Config, 'Instance', 'scaleUpMinQueueTimeMinutes').mockReturnValue('url') + jest.spyOn(Config, 'Instance', 'scaleConfigOrg').mockReturnValue('test_org1') + + const mockedGetQueuedJobs = mocked(getQueuedJobs).mockResolvedValue([ + { + runner_label: 'label1-nomatch', + org: 'test_org1', + repo: 'test_repo1', + num_queued_jobs: 1, + min_queue_time_minutes: 1, + max_queue_time_minutes: 1 + } + ]) + const scaleUpInstanceNoOp = jest.spyOn(metrics, 'scaleUpInstanceNoOp'); + await scaleUpChron(metrics) + expect(scaleUpInstanceNoOp).toBeCalledTimes(1); + }); + + it('queued jobs do not match scale config org', async () => { + jest.spyOn(Config, 'Instance', 'scaleUpMinQueueTimeMinutes').mockReturnValue('url') + jest.spyOn(Config, 'Instance', 'scaleConfigOrg').mockReturnValue('test_org1') + const mockedGetQueuedJobs = mocked(getQueuedJobs).mockResolvedValue([ + { + runner_label: 'label1', + org: 'test_org1-nomatch', + repo: 'test_repo1', + num_queued_jobs: 1, + min_queue_time_minutes: 1, + max_queue_time_minutes: 1 + } + ]) + const scaleUpInstanceNoOp = jest.spyOn(metrics, 'scaleUpInstanceNoOp'); + await scaleUpChron(metrics) + expect(scaleUpInstanceNoOp).toBeCalledTimes(1); + }); + -describe('scaleUp', () => { it('getQueuedRunners should fetch queued runners', async () => { const runners = await getQueuedJobs(); @@ -15,3 +104,55 @@ describe('scaleUp', () => { expect(runners).toBeDefined(); }); }); + +describe('getQueuedJobs', () => { + it('get queue data from url request with valid response', async () => { + const dataMap1 = new Map([ + ['runner_type', 'test_runner_type1'], + ['org', 'test_org1'], + ['repo', 'test_repo1'], + ['num_queued_jobs', '1'], + ['min_queue_time_minutes', '1'], + ['max_queue_time_minutes', '1'] + ]) + const dataMap2 = new Map([ + ['runner_type', 'test_runner_type2'], + ['org', 'test_org2'], + ['repo', 'test_repo2'], + ['num_queued_jobs', '2'], + ['min_queue_time_minutes', '2'], + ['max_queue_time_minutes', '2'] + ]) + jest.spyOn(axios, 'get').mockReturnValue(new Map([['data', [dataMap1, dataMap2]]])); + expect(await getQueuedJobs(metrics, 'url')).toEqual([ + { + runner_label: 'test_runner_type1', + org: 'test_org1', + repo: 'test_repo1', + num_queued_jobs: 1, + min_queue_time_minutes: 1, + max_queue_time_minutes: 1 + }, { + runner_label: 'test_runner_type2', + org: 'test_org2', + repo: 'test_repo2', + num_queued_jobs: 2, + min_queue_time_minutes: 2, + } + ]); + + }); + + it('get queue data from url request with invalid response', async () => { + jest.spyOn(axios, 'get').mockReturnValue(new Map([['noDataHere', 'whoops']])); + const runners = await getQueuedJobs(); + await expect(getQueuedJobs(metrics, 'url')).rejects.toThrow('Error fetching queued runners: {TODO:camyllh test and add error message}'); + }); + + it('get queue data from url request with invalid response', async () => { + jest.spyOn(axios, 'get').mockReturnValue(new Map([['noDataHere', 'whoops']])); + const runners = await getQueuedJobs(); + await expect(getQueuedJobs(metrics, 'url')).rejects.toThrow('Error fetching queued runners: {TODO:camyllh test and add error message}'); + }); + +}); diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts index 8ade39cefe..f52b4271e2 100644 --- 
a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts @@ -1,7 +1,7 @@ import axios from 'axios'; import { Config } from './config'; -import { getRepo, shuffleArrayInPlace } from './utils'; +import { getRepo, shuffleArrayInPlace, expBackOff } from './utils'; import { ScaleUpChronMetrics } from './metrics'; import { getRunnerTypes } from './gh-runners'; import { sqsSendMessages } from './sqs'; @@ -12,32 +12,37 @@ export async function scaleUpChron(metrics: ScaleUpChronMetrics): Promise // This function does the following: // 1. Queries for queued runners via HUD // 2. Polls scale-config to filter the list to ones that are self-hosted by this fleet and - // are ephemeral + // are ephemeral and nonephemeral // 3. Sends a SQS request to the scale-up lambda to provision more of those instances - let queuedJobs = await getQueuedJobs(metrics); - const scaleConfigRepo = getRepo(Config.Instance.scaleConfigOrg, Config.Instance.scaleConfigRepo); - const validRunnerTypes = await getRunnerTypes(scaleConfigRepo, metrics); - const minAutoScaleupDelayMinutes = 30; + const minAutoScaleupDelayMinutes = Config.Instance.scaleUpMinQueueTimeMinutes; + if (!Config.Instance.scaleUpRecordQueueUrl) { + metrics.scaleUpInstanceFailureNonRetryable('scaleUpRecordQueueUrl is not set. Cannot send queued scale up requests'); + throw new Error('scaleUpRecordQueueUrl is not set. Cannot send queued scale up requests'); + } + const scaleUpRecordQueueUrl = Config.Instance.scaleUpRecordQueueUrl; // Only proactively scale up the jobs that have been queued for longer than normal - queuedJobs = queuedJobs.filter((runner) => { + // Filter out the queued jobs that are do not correspond to a valid runner type + const queuedJobs = (await getQueuedJobs(metrics, scaleUpRecordQueueUrl)).filter((runner) => { return runner.min_queue_time_minutes >= minAutoScaleupDelayMinutes && runner.org === Config.Instance.scaleConfigOrg; - }); - - // Filter out the queued jobs that are do not correspond to a valid runner type - queuedJobs = queuedJobs.filter((requested_runner) => { + }).filter((requested_runner) => { return Array.from(validRunnerTypes.keys()).some((available_runner_label) => { return available_runner_label === requested_runner.runner_label; }); - }); + });; + + if (queuedJobs.length === 0) { + metrics.scaleUpInstanceNoOp(); + return + } // Send a message to the SQS queue to scale up the runners - let scaleUpRequests : Array = queuedJobs.map((runner) => { + const scaleUpRequests : Array = queuedJobs.map((runner) => { return { "id": Math.floor(Math.random() * 100000000000000), "eventType": "workflow_job", @@ -47,60 +52,50 @@ export async function scaleUpChron(metrics: ScaleUpChronMetrics): Promise }; }); - if (!Config.Instance.scaleUpRecordQueueUrl) { - throw new Error('scaleUpRecordQueueUrl is not set. 
Cannot send scale up requests'); - } - for (const request of shuffleArrayInPlace(scaleUpRequests)) { try{ await scaleUp("aws:sqs", request, metrics); - metrics.scaleUpChronSuccess(); - + metrics.scaleUpInstanceSuccess(); } catch (error) { - metrics.scaleUpChronFailure((error as Error).message); + metrics.scaleUpInstanceFailureRetryable((error as Error).message); } - + } } -class QueuedJobsForRunner { +interface QueuedJobsForRunner { runner_label: string; org: string; repo: string; num_queued_jobs: number; min_queue_time_minutes: number; max_queue_time_minutes: number; - - constructor(runner_label: string, org: string, repo: string, num_queued_jobs: number, min_queue_time_minutes: number, max_queue_time_minutes: number) { - this.runner_label = runner_label; - this.org = org; - this.repo = repo; - this.num_queued_jobs = num_queued_jobs; - this.min_queue_time_minutes = min_queue_time_minutes; - this.max_queue_time_minutes = max_queue_time_minutes; - } } -export async function getQueuedJobs(metrics: ScaleUpChronMetrics): Promise { +export async function getQueuedJobs(metrics: ScaleUpChronMetrics, scaleUpRecordQueueUrl: string): Promise { // This function queries the HUD for queued runners // and returns a list of them - const url = 'https://hud.pytorch.org/api/clickhouse/queued_jobs_aggregate?parameters=%5B%5D'; + const url = scaleUpRecordQueueUrl; try { - const response = await axios.get(url); + const response = await expBackOff(() => { + return metrics.trackRequest(metrics.getQueuedJobsEndpointSuccess, metrics.getQueuedJobsEndpointFailure, () => { + return axios.get(url); + }); + }); // Map the response to the class - const queued_runners = response.data.map((runner: any) => { + return response?.data.map((runner: any) => { metrics.queuedRunnerStats(runner.org, runner.runner_label, runner.num_queued_jobs,); - return new QueuedJobsForRunner( - runner.runner_label, - runner.org, - runner.repo, - runner.num_queued_jobs, - runner.min_queue_time_minutes, - runner.max_queue_time_minutes); + return { + runner_label: runner.runner_label, + org: runner.org, + repo: runner.repo, + num_queued_jobs: Number(runner.num_queued_jobs), + min_queue_time_minutes: Number(runner.min_queue_time_minutes), + max_queue_time_minutes: Number(runner.max_queue_time_minutes) + }; }); - return queued_runners; } catch (error) { metrics.queuedRunnerFailure((error as Error).message); console.error('Error fetching queued runners:', error); diff --git a/terraform-aws-github-runner/modules/runners/scale-up-chron.tf b/terraform-aws-github-runner/modules/runners/scale-up-chron.tf index 4a21499145..3224b5684e 100644 --- a/terraform-aws-github-runner/modules/runners/scale-up-chron.tf +++ b/terraform-aws-github-runner/modules/runners/scale-up-chron.tf @@ -46,7 +46,8 @@ resource "aws_lambda_function" "scale_up_chron" { SCALE_CONFIG_ORG = var.scale_config_org SCALE_CONFIG_REPO = var.scale_config_repo SCALE_CONFIG_REPO_PATH = var.scale_config_repo_path - SCALE_UP_RECORD_QUEUE_URL = var.sqs_build_queue.url + SCALE_UP_MIN_QUEUE_TIME_MINUTES = 30 + SCALE_UP_RECORD_QUEUE_URL = 'https://hud.pytorch.org/api/clickhouse/queued_jobs_aggregate?parameters=%5B%5D' scale_up_chron_CONFIG = jsonencode(var.idle_config) SECRETSMANAGER_SECRETS_ID = var.secretsmanager_secrets_id AWS_REGIONS_TO_VPC_IDS = join( From a4c6f8c1bfb1d3abbbf4823c171336596573808c Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Wed, 5 Mar 2025 17:58:04 -0800 Subject: [PATCH 07/16] fix tests --- .../src/scale-runners/scale-up-chron.test.ts | 37 +++++++++---------- 1 file 
changed, 18 insertions(+), 19 deletions(-) diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts index 432a1cea35..f16c4b1116 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts @@ -14,6 +14,7 @@ import { mocked } from 'ts-jest/utils'; import nock from 'nock'; import { scaleUp, _calculateScaleUpAmount } from './scale-up'; import { scaleUpChron, getQueuedJobs } from './scale-up-chron'; +import axios from 'axios'; import * as MetricsModule from './metrics'; @@ -23,7 +24,13 @@ jest.mock('./gh-issues'); // Import the required modules -import { getQueuedJobs } from './scale-up-chron'; +import { getRepo } from './utils'; + +const baseCfg = { + scaleConfigOrg: 'test_org1', + scaleUpMinQueueTimeMinutes: 30, + scaleUpRecordQueueUrl: 'url', +} as unknown as Config; const metrics = new MetricsModule.ScaleUpChronMetrics(); @@ -46,7 +53,7 @@ describe('scaleUpChron', () => { ]), ); }); - + jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg); const minAutoScaleupDelayMinutes = Config.Instance.scaleUpMinQueueTimeMinutes; if (!Config.Instance.scaleUpRecordQueueUrl) { @@ -55,14 +62,16 @@ describe('scaleUpChron', () => { } it('invalid scaleUpRecordQueueUrl', async () => { - jest.spyOn(Config, 'Instance', 'get').mockReturnValue(null) - expect(await scaleUpChron(metrics)).rejects.toThrow('scaleUpRecordQueueUrl is not set. Cannot send queued scale up requests'); + jest.spyOn(Config, 'Instance', 'get').mockImplementation( + () => + ({ + ...baseCfg, + scaleUpRecordQueueUrl: null, + } as unknown as Config), + ); expect(await scaleUpChron(metrics)).rejects.toThrow('scaleUpRecordQueueUrl is not set. 
Cannot send queued scale up requests'); }); it('queued jobs do not match available runners', async () => { - jest.spyOn(Config, 'Instance', 'scaleUpMinQueueTimeMinutes').mockReturnValue('url') - jest.spyOn(Config, 'Instance', 'scaleConfigOrg').mockReturnValue('test_org1') - const mockedGetQueuedJobs = mocked(getQueuedJobs).mockResolvedValue([ { runner_label: 'label1-nomatch', @@ -79,8 +88,6 @@ describe('scaleUpChron', () => { }); it('queued jobs do not match scale config org', async () => { - jest.spyOn(Config, 'Instance', 'scaleUpMinQueueTimeMinutes').mockReturnValue('url') - jest.spyOn(Config, 'Instance', 'scaleConfigOrg').mockReturnValue('test_org1') const mockedGetQueuedJobs = mocked(getQueuedJobs).mockResolvedValue([ { runner_label: 'label1', @@ -95,14 +102,6 @@ describe('scaleUpChron', () => { await scaleUpChron(metrics) expect(scaleUpInstanceNoOp).toBeCalledTimes(1); }); - - - - it('getQueuedRunners should fetch queued runners', async () => { - const runners = await getQueuedJobs(); - console.log('Queued Runners:', runners); - expect(runners).toBeDefined(); - }); }); describe('getQueuedJobs', () => { @@ -145,13 +144,13 @@ describe('getQueuedJobs', () => { it('get queue data from url request with invalid response', async () => { jest.spyOn(axios, 'get').mockReturnValue(new Map([['noDataHere', 'whoops']])); - const runners = await getQueuedJobs(); + const runners = await getQueuedJobs(metrics, 'url'); await expect(getQueuedJobs(metrics, 'url')).rejects.toThrow('Error fetching queued runners: {TODO:camyllh test and add error message}'); }); it('get queue data from url request with invalid response', async () => { jest.spyOn(axios, 'get').mockReturnValue(new Map([['noDataHere', 'whoops']])); - const runners = await getQueuedJobs(); + const runners = await getQueuedJobs(metrics, 'url'); await expect(getQueuedJobs(metrics, 'url')).rejects.toThrow('Error fetching queued runners: {TODO:camyllh test and add error message}'); }); From f27e0cf185a3438ce51734f42ec9f4e6de21ab60 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 5 Mar 2025 16:25:26 +0000 Subject: [PATCH 08/16] Update of release scripts and pypi release validations (#6349) Release preparation and validation scripts update. 1. Display python versions when dumping METADATA contents of pypi package 2. 
Add torchao and torchdata changes --- release/promote.sh | 1 + release/pypi/promote_pypi_to_staging.sh | 7 ++++--- release/pypi/upload_pypi_to_staging.sh | 5 ++++- release/release_versions.sh | 3 ++- tools/analytics/validate_pypi_staging.py | 22 ++++++++++++++-------- 5 files changed, 25 insertions(+), 13 deletions(-) diff --git a/release/promote.sh b/release/promote.sh index 8b79397833..3d792f454e 100644 --- a/release/promote.sh +++ b/release/promote.sh @@ -107,6 +107,7 @@ promote_pypi() { # promote_s3 fbgemm-gpu whl "${FBGEMMGPU_VERSION}" # promote_s3 "libtorch-*" libtorch "${PYTORCH_VERSION}" # promote_s3 "torch_tensorrt" whl "${TENSORRT_VERSION}" +# promote_s3 "torchao" whl "${TORCHAO_VERSION}" # promote_conda torchtriton conda "2.1.0" # promote_conda pytorch-cuda conda "11.8" diff --git a/release/pypi/promote_pypi_to_staging.sh b/release/pypi/promote_pypi_to_staging.sh index bfda4b6f8f..6c8befc09b 100644 --- a/release/pypi/promote_pypi_to_staging.sh +++ b/release/pypi/promote_pypi_to_staging.sh @@ -47,10 +47,11 @@ PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" #PLATFORM="linux_x86" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging executorch "${EXECUTORCH_VERSION}" #PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging executorch "${EXECUTORCH_VERSION}" +# PLATFORM="manylinux" VERSION_SUFFIX="${LINUX_VERSION_SUFFIX}" ARCH="cu124" upload_pypi_to_staging torchao "${TORCHAO_VERSION}" +# PLATFORM="none-any" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchao "${TORCHAO_VERSION}" + #PLATFORM="linux_x86" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}" #PLATFORM="win_amd64" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}" #PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}" -#PLATFORM="linux_x86_64" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" -#PLATFORM="win_amd64" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" -#PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" +#PLATFORM="none-any" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}" diff --git a/release/pypi/upload_pypi_to_staging.sh b/release/pypi/upload_pypi_to_staging.sh index 1f32984b40..38455cea74 100644 --- a/release/pypi/upload_pypi_to_staging.sh +++ b/release/pypi/upload_pypi_to_staging.sh @@ -21,11 +21,14 @@ PLATFORM=${PLATFORM:-} # i.e. 
cpu, cu121, cu124 ARCH=${ARCH:-cpu} +# This extract links to packages from the index.html +# We strip all extra characters including final sha256 char pkgs_to_promote=$(\ curl -fsSL "https://download.pytorch.org/whl/test/${ARCH}/${PACKAGE_NAME}/index.html" \ | grep "${PACKAGE_NAME}-${PACKAGE_VERSION}${VERSION_SUFFIX}-" \ | grep "${PLATFORM}" \ - | cut -d '"' -f2 + | cut -d '"' -f2 \ + | cut -d "#" -f1 ) tmp_dir="$(mktemp -d)" diff --git a/release/release_versions.sh b/release/release_versions.sh index 32e8c47d8b..71b9c75d0f 100644 --- a/release/release_versions.sh +++ b/release/release_versions.sh @@ -8,6 +8,7 @@ TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.18.0} TORCHREC_VERSION=${TORCHREC_VERSION:-0.8.0} TENSORRT_VERSION=${TENSORRT_VERSION:-2.4.0} EXECUTORCH_VERSION=${EXECUTORCH_VERSION:-0.3.0} -TORCHAO_VERSION=${TORCHAO_VERSION:-0.4.0} +TORCHAO_VERSION=${TORCHAO_VERSION:-0.9.0} +TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.11.0} TORCHTUNE_VERSION=${TORCHTUNE_VERSION:-0.2.1} FBGEMMGPU_VERSION=${FBGEMMGPU_VERSION:-1.0.0} diff --git a/tools/analytics/validate_pypi_staging.py b/tools/analytics/validate_pypi_staging.py index 8e8801ff8c..0c06ca2185 100644 --- a/tools/analytics/validate_pypi_staging.py +++ b/tools/analytics/validate_pypi_staging.py @@ -12,22 +12,24 @@ PLATFORMS = [ "manylinux1_x86_64", - "manylinux2014_aarch64", + "manylinux_2_28_aarch64", "win_amd64", "macosx_11_0_arm64", ] -PYTHON_VERSIONS = ["cp38", "cp39", "cp310", "cp311", "cp312"] +PYTHON_VERSIONS = ["cp39", "cp310", "cp311", "cp312", "cp313"] S3_PYPI_STAGING = "pytorch-backup" PACKAGE_RELEASES = { - "torch": "2.3.1", - "torchvision": "0.18.1", - "torchaudio": "2.3.1", - "torchtext": "0.18.0", - "executorch": "0.2.1", + "torch": "2.6.0", + "torchvision": "0.21.0", + "torchaudio": "2.6.0", + # "torchtext": "0.18.0", + # "executorch": "0.2.1", } PATTERN_V = "Version:" PATTERN_RD = "Requires-Dist:" +PATTERN_PYTHON = "Requires-Python:" +PATTERN_PROGRAMMING = "Classifier: Programming Language :: Python ::" s3 = boto3.client("s3") @@ -104,7 +106,11 @@ def validate_file_metadata(build: str, package: str, version: str): f"FAILURE VERSION DOES NOT MATCH expected {version} got {exttracted_version}" ) - elif line.startswith(PATTERN_RD): + elif ( + line.startswith(PATTERN_RD) + or line.startswith(PATTERN_PYTHON) + or line.startswith(PATTERN_PROGRAMMING) + ): print(f"{line}", end="") shutil.rmtree(temp_dir) From fee54b5fe03b37747b057f45c5cecd99114d4ab9 Mon Sep 17 00:00:00 2001 From: Jean Schmidt <4520845+jeanschmidt@users.noreply.github.com> Date: Wed, 5 Mar 2025 18:10:48 +0100 Subject: [PATCH 09/16] [Bugfix] wait for ssm parameter to be created (#6359) Sometimes SSM parameter is not properly created. After investigation I identified that the promise is not being properly awaited. What could cause some operations to be canceled. 
--- .../runners/lambdas/runners/src/scale-runners/runners.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts index 3c0e6885e9..4201946513 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts @@ -572,7 +572,7 @@ export async function createRunner(runnerParameters: RunnerInputParameters, metr ` [${runnerParameters.runnerType.runnerTypeName}] [AMI?:${customAmi}] ${labelsStrLog}: `, runInstancesResponse.Instances.map((i) => i.InstanceId).join(','), ); - addSSMParameterRunnerConfig( + await addSSMParameterRunnerConfig( runInstancesResponse.Instances, runnerParameters, customAmiExperiment, From 04b74b3fb9fc6da60ba7499f5aee2149bd2266f0 Mon Sep 17 00:00:00 2001 From: Andrey Talman Date: Wed, 5 Mar 2025 18:28:24 +0000 Subject: [PATCH 10/16] [py 3.13t] Fix for vision nightly failure on MacOS M1 machines (#6358) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow up after: https://github.com/pytorch/test-infra/pull/6242 Newer conda is having trouble installing py3.13t artifacts, please see: https://github.com/pytorch/vision/actions/runs/13659383307/job/38186887882#step:10:98 ``` Could not solve for environment specs The following packages are incompatible ├─ libpng =* * is installable with the potential options │ ├─ libpng 1.6.37 would require │ │ └─ zlib >=1.2.11,<1.3.0a0 *, which can be installed; │ └─ libpng 1.6.39 would require │ └─ zlib >=1.2.13,<1.3.0a0 *, which can be installed; └─ python-freethreading =* * is not installable because it requires ├─ cpython =3.13.2 *, which requires │ └─ python =3.13.2 * with the potential options │ ├─ python 3.13.2 would require │ │ └─ libzlib >=1.3.1,<2.0a0 *, which requires │ │ └─ zlib ==1.3.1 *_2, which conflicts with any installable versions previously reported; │ └─ python 3.13.2, which can be installed; └─ python_abi =* *_cp313t, which requires └─ python =3.13 *_cp313t, which conflicts with any installable versions previously reported. ``` As a consequence vision is compiled without png support and failing smoke tests Test PR: https://github.com/pytorch/test-infra/pull/6356 --- .github/actions/setup-binary-builds/action.yml | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/.github/actions/setup-binary-builds/action.yml b/.github/actions/setup-binary-builds/action.yml index 7cbadbc9e8..fbcb49a315 100644 --- a/.github/actions/setup-binary-builds/action.yml +++ b/.github/actions/setup-binary-builds/action.yml @@ -144,12 +144,16 @@ runs: if [[ "${PYTHON_VERSION:-}" == "3.13t" ]]; then export PYTHON_VERSION=3.13 export CONDA_EXTRA_PARAM=" python-freethreading -c conda-forge" - if [[ "$(uname)" != Darwin ]]; then - # Pin conda and conda-libmamba-solver for 3.13t linux build - # this solver allows us to install anaconda dependencies on - # python-freethreading on conda-forge environment - conda install conda==24.7.1 conda-libmamba-solver=24.1.0 + + # downgrade conda version for python 3.13t install. 
+ # TODO: remove this once python 3.13t is fully suported on conda + # Please see : https://github.com/conda/conda/issues/14554 + if [[ "$(uname)" == Darwin ]]; then + # required to be able to downgrade on MacOS m1 side + conda install -y python=3.9 + conda uninstall -y conda-anaconda-telemetry conda-anaconda-tos fi + conda install -y conda=24.7.1 conda-libmamba-solver=24.1.0 fi conda create \ From 31bd2cd37a6c23543a33fec21b0a7853542de69c Mon Sep 17 00:00:00 2001 From: clee2000 <44682903+clee2000@users.noreply.github.com> Date: Wed, 5 Mar 2025 12:49:24 -0800 Subject: [PATCH 11/16] [viable/strict] Ignore failing jobs with unstable issues (#6362) Allows the viable strict promotion script to use unstable issues. Jobs that have unstable issues open will be ignored in viable strict promotion. Tested with 0ef2e938d0a9a4b90434f98b5e128d0ffacaae26 (passed, only thing failing is libtorch debug build which has an issue right now) 96afa8a2bb78e5410a83038fd1e3f83911601700 (failed since there's something pending) c5d92edd5acfa56bae4f0c1057d667c6356fd6c1 (failed since lint failed) --- tools/scripts/fetch_latest_green_commit.py | 22 +++++++++++-- tools/tests/test_fetch_latest_green_commit.py | 32 +++++++++++++++---- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/tools/scripts/fetch_latest_green_commit.py b/tools/scripts/fetch_latest_green_commit.py index 9b346b3e79..b3f0e43097 100644 --- a/tools/scripts/fetch_latest_green_commit.py +++ b/tools/scripts/fetch_latest_green_commit.py @@ -1,6 +1,7 @@ import json import re import sys +from functools import lru_cache from pathlib import Path from typing import Any, cast, Dict, List, NamedTuple, Optional, Tuple @@ -80,6 +81,23 @@ def get_commit_results( return workflow_checks +@lru_cache +def fetch_unstable_issues() -> List[str]: + issues = query_clickhouse_saved("issue_query", {"label": "unstable"}) + return [ + issue["title"][len("UNSTABLE") :].strip() + for issue in issues + if issue["title"].startswith("UNSTABLE") and issue["state"] == "open" + ] + + +def is_unstable(job: dict[str, Any]) -> bool: + # Check if the job is an unstable job, either by name or by issue + if "unstable" in job["jobName"]: + return True + return job["name"] in fetch_unstable_issues() + + def is_green( commit: str, requires: List[str], results: List[Dict[str, Any]] ) -> Tuple[bool, str]: @@ -88,9 +106,9 @@ def is_green( regex = {check: False for check in requires} for check in workflow_checks: - jobName = check["jobName"] + jobName = check["name"] # Ignore result from unstable job, be it success or failure - if "unstable" in jobName: + if "unstable" in jobName or jobName in fetch_unstable_issues(): continue workflow_name = check["workflowName"] diff --git a/tools/tests/test_fetch_latest_green_commit.py b/tools/tests/test_fetch_latest_green_commit.py index e4f11de938..1b580072be 100644 --- a/tools/tests/test_fetch_latest_green_commit.py +++ b/tools/tests/test_fetch_latest_green_commit.py @@ -47,12 +47,18 @@ def make_test_checks(self) -> List[Dict[str, Any]]: return workflow_checks +@mock.patch( + "tools.scripts.fetch_latest_green_commit.fetch_unstable_issues", + return_value=[], +) class TestPrintCommits(TestCase): @mock.patch( "tools.scripts.fetch_latest_green_commit.get_commit_results", return_value=TestChecks().make_test_checks(), ) - def test_all_successful(self, mock_get_commit_results: Any) -> None: + def test_all_successful( + self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any + ) -> None: """Test with workflows are successful""" workflow_checks 
= mock_get_commit_results() self.assertTrue(is_green("sha", requires, workflow_checks)[0]) @@ -61,7 +67,9 @@ def test_all_successful(self, mock_get_commit_results: Any) -> None: "tools.scripts.fetch_latest_green_commit.get_commit_results", return_value=TestChecks().make_test_checks(), ) - def test_necessary_successful(self, mock_get_commit_results: Any) -> None: + def test_necessary_successful( + self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any + ) -> None: """Test with necessary workflows are successful""" workflow_checks = mock_get_commit_results() workflow_checks = set_workflow_job_status( @@ -85,7 +93,9 @@ def test_necessary_successful(self, mock_get_commit_results: Any) -> None: "tools.scripts.fetch_latest_green_commit.get_commit_results", return_value=TestChecks().make_test_checks(), ) - def test_necessary_skipped(self, mock_get_commit_results: Any) -> None: + def test_necessary_skipped( + self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any + ) -> None: """Test with necessary job (ex: pull) skipped""" workflow_checks = mock_get_commit_results() workflow_checks = set_workflow_job_status(workflow_checks, "pull", "skipped") @@ -96,7 +106,9 @@ def test_necessary_skipped(self, mock_get_commit_results: Any) -> None: "tools.scripts.fetch_latest_green_commit.get_commit_results", return_value=TestChecks().make_test_checks(), ) - def test_skippable_skipped(self, mock_get_commit_results: Any) -> None: + def test_skippable_skipped( + self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any + ) -> None: """Test with skippable jobs (periodic and docker-release-builds skipped""" workflow_checks = mock_get_commit_results() workflow_checks = set_workflow_job_status( @@ -111,7 +123,9 @@ def test_skippable_skipped(self, mock_get_commit_results: Any) -> None: "tools.scripts.fetch_latest_green_commit.get_commit_results", return_value=TestChecks().make_test_checks(), ) - def test_necessary_failed(self, mock_get_commit_results: Any) -> None: + def test_necessary_failed( + self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any + ) -> None: """Test with necessary job (ex: Lint) failed""" workflow_checks = mock_get_commit_results() workflow_checks = set_workflow_job_status(workflow_checks, "Lint", "failed") @@ -123,7 +137,9 @@ def test_necessary_failed(self, mock_get_commit_results: Any) -> None: "tools.scripts.fetch_latest_green_commit.get_commit_results", return_value=TestChecks().make_test_checks(), ) - def test_skippable_failed(self, mock_get_commit_results: Any) -> None: + def test_skippable_failed( + self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any + ) -> None: """Test with failing skippable jobs (ex: docker-release-builds) should pass""" workflow_checks = mock_get_commit_results() workflow_checks = set_workflow_job_status( @@ -138,7 +154,9 @@ def test_skippable_failed(self, mock_get_commit_results: Any) -> None: @mock.patch( "tools.scripts.fetch_latest_green_commit.get_commit_results", return_value={} ) - def test_no_workflows(self, mock_get_commit_results: Any) -> None: + def test_no_workflows( + self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any + ) -> None: """Test with missing workflows""" workflow_checks = mock_get_commit_results() result = is_green("sha", requires, workflow_checks) From 27793d044c6b3c9d9adec3330caf8e631bc19cb5 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Thu, 6 Mar 2025 06:04:46 +0900 Subject: [PATCH 12/16] [Fix Bug] remove bug to not parse the rcommit (#6364) 
https://github.com/pytorch/test-infra/issues/6363 this is a typo when refactor the benchmark code, simply remove the [], I think visual studio auto-gen triggered it --- .../llms/components/dashboardPicker/LLMsDashboardPicker.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchci/components/benchmark/llms/components/dashboardPicker/LLMsDashboardPicker.tsx b/torchci/components/benchmark/llms/components/dashboardPicker/LLMsDashboardPicker.tsx index 2f666c02f9..ff84ab3671 100644 --- a/torchci/components/benchmark/llms/components/dashboardPicker/LLMsDashboardPicker.tsx +++ b/torchci/components/benchmark/llms/components/dashboardPicker/LLMsDashboardPicker.tsx @@ -69,7 +69,7 @@ export const LLMsDashboardPicker = ({ }} titlePrefix={"New"} fallbackIndex={0} // Default to the latest commit - timeRange={[props.timeRange]} + timeRange={props.timeRange} /> From d5497ac34056bd6b3d7bf54ebecb4e2104d0112c Mon Sep 17 00:00:00 2001 From: "Wang, Chuanqi" Date: Thu, 6 Mar 2025 07:07:30 +0800 Subject: [PATCH 13/16] [XPU] Upgrade XPU support packages version on Windows (#6361) Align with https://github.com/pytorch/pytorch/pull/148313 --- .github/scripts/install_xpu.bat | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/scripts/install_xpu.bat b/.github/scripts/install_xpu.bat index c31276c0b5..afdfe1da87 100644 --- a/.github/scripts/install_xpu.bat +++ b/.github/scripts/install_xpu.bat @@ -39,9 +39,9 @@ set XPU_EXTRA_INSTALLED=0 set XPU_EXTRA_UNINSTALL=0 if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.0] ( - set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/efc86abd-cb77-452e-a03f-a741895b8ece/intel-deep-learning-essentials-2025.0.0.336_offline.exe + set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product - set XPU_BUNDLE_VERSION=2025.0.0+335 + set XPU_BUNDLE_VERSION=2025.0.1+20 set XPU_BUNDLE_INSTALLED=0 set XPU_BUNDLE_UNINSTALL=0 set XPU_EXTRA_URL=NULL From 0d532e9b4f3ac3dcde90dc8de57da2140b4c8b53 Mon Sep 17 00:00:00 2001 From: Jean Schmidt <4520845+jeanschmidt@users.noreply.github.com> Date: Thu, 6 Mar 2025 18:30:26 +0100 Subject: [PATCH 14/16] Adding tooling and documentation for locally run tflint (#6370) created a Makefile on `./terraform-aws-github-runner` to perform tflint actions, and replaced the tflint calls on CI (`tflint.yml`) with this makefile. This makes much easier to test locally and make sure to get green signals on CI. Reducing the loop time to fix small syntax bugs. 
--- .github/workflows/tflint.yml | 104 +++----------------------- terraform-aws-github-runner/Makefile | 62 +++++++++++++++ terraform-aws-github-runner/README.md | 12 +++ 3 files changed, 85 insertions(+), 93 deletions(-) create mode 100644 terraform-aws-github-runner/Makefile diff --git a/.github/workflows/tflint.yml b/.github/workflows/tflint.yml index 6324045d29..f41e5889ca 100644 --- a/.github/workflows/tflint.yml +++ b/.github/workflows/tflint.yml @@ -24,102 +24,20 @@ jobs: github_token: ${{ secrets.GITHUB_TOKEN }} tflint_version: v0.54.0 - - name: Install Terraform - uses: hashicorp/setup-terraform@v2 + - name: Install Tofu + uses: opentofu/setup-opentofu@v1 with: - terraform_version: 1.5.1 + terraform_version: 1.5.7 terraform_wrapper: false - - name: Show version - run: tflint --version + - name: Show tflint version + run: + tflint --version - - name: "Init TFLint download-lambda" - working-directory: terraform-aws-github-runner/modules/download-lambda - run: tflint --init - - name: "Init terraform download-lambda" - working-directory: terraform-aws-github-runner/modules/download-lambda - run: terraform init - - name: "Run TFLint download-lambda" - working-directory: terraform-aws-github-runner/modules/download-lambda - run: tflint --call-module-type=all - - name: "Run terraform validate download-lambda" - working-directory: terraform-aws-github-runner/modules/download-lambda - run: terraform validate + - name: Show tofu version + run: + tofu --version - - name: "Init TFLint runner-binaries-syncer" - working-directory: terraform-aws-github-runner/modules/runner-binaries-syncer - run: tflint --init - - name: "Init terraform runner-binaries-syncer" - working-directory: terraform-aws-github-runner/modules/runner-binaries-syncer - run: terraform init - - name: "Run TFLint runner-binaries-syncer" - working-directory: terraform-aws-github-runner/modules/runner-binaries-syncer - run: tflint --call-module-type=all - - name: "Run terraform validate runner-binaries-syncer" - working-directory: terraform-aws-github-runner/modules/runner-binaries-syncer - run: terraform validate - - - name: "Init TFLint runners-instances" - working-directory: terraform-aws-github-runner/modules/runners-instances - run: tflint --init - - name: "Init terraform runners-instances" - working-directory: terraform-aws-github-runner/modules/runners-instances - run: terraform init - - name: "Run TFLint runners-instances" - working-directory: terraform-aws-github-runner/modules/runners-instances - run: tflint --call-module-type=all - - name: "Run terraform validate runners-instances" - working-directory: terraform-aws-github-runner/modules/runners-instances - run: terraform validate - - - name: "Init TFLint runners" - working-directory: terraform-aws-github-runner/modules/runners - run: tflint --init - - name: "Init terraform runners" - working-directory: terraform-aws-github-runner/modules/runners - run: terraform init - - name: "Run TFLint runners" - working-directory: terraform-aws-github-runner/modules/runners - run: tflint --call-module-type=all - - name: "Run terraform validate runners" - working-directory: terraform-aws-github-runner/modules/runners - run: terraform validate - - - name: "Init TFLint setup-iam-permissions" - working-directory: terraform-aws-github-runner/modules/setup-iam-permissions - run: tflint --init - - name: "Init terraform setup-iam-permissions" - working-directory: terraform-aws-github-runner/modules/setup-iam-permissions - run: terraform init - - name: "Run TFLint setup-iam-permissions" - 
working-directory: terraform-aws-github-runner/modules/setup-iam-permissions - run: tflint --call-module-type=all - - name: "Run terraform validate setup-iam-permissions" - working-directory: terraform-aws-github-runner/modules/setup-iam-permissions - run: terraform validate - - - name: "Init TFLint webhook" - working-directory: terraform-aws-github-runner/modules/webhook - run: tflint --init - - name: "Init terraform webhook" - working-directory: terraform-aws-github-runner/modules/webhook - run: terraform init - - name: "Run TFLint webhook" - working-directory: terraform-aws-github-runner/modules/webhook - run: tflint --call-module-type=all - - name: "Run terraform validate webhook" - working-directory: terraform-aws-github-runner/modules/webhook - run: terraform validate - - - name: "Init TFLint main" - working-directory: terraform-aws-github-runner - run: tflint --init - - name: "Init terraform main" - working-directory: terraform-aws-github-runner - run: terraform init - - name: "Run TFLint main" - working-directory: terraform-aws-github-runner - run: tflint --call-module-type=all - - name: "Run terraform validate terraform-aws-github-runner" + - name: "tflint" working-directory: terraform-aws-github-runner - run: terraform validate + run: make tflint diff --git a/terraform-aws-github-runner/Makefile b/terraform-aws-github-runner/Makefile new file mode 100644 index 0000000000..072c7d5568 --- /dev/null +++ b/terraform-aws-github-runner/Makefile @@ -0,0 +1,62 @@ +all: tflint + +@PHONY: tflint +tflint: tflint-download-lambda tflint-runner-binaries-syncer tflint-runners-instances tflint-runners tflint-setup-iam-permissions tflint-webhook tflint-main + +@PHONY: tflint-download-lambda +tflint-download-lambda: + cd modules/download-lambda && \ + tofu init && \ + tflint --init && \ + tflint --call-module-type=all && \ + tofu validate + +@PHONY: tflint-runner-binaries-syncer +tflint-runner-binaries-syncer: + cd modules/runner-binaries-syncer && \ + tofu init && \ + tflint --init && \ + tflint --call-module-type=all && \ + tofu validate + +@PHONY: tflint-runners-instances +tflint-runners-instances: + cd modules/runners-instances && \ + tofu init && \ + tflint --init && \ + tflint --call-module-type=all && \ + tofu validate + +@PHONY: tflint-runners +tflint-runners: + cd modules/runners && \ + tofu init && \ + tflint --init && \ + tflint --call-module-type=all && \ + tofu validate + +@PHONY: tflint-setup-iam-permissions +tflint-setup-iam-permissions: + cd modules/setup-iam-permissions && \ + tofu init && \ + tflint --init && \ + tflint --call-module-type=all && \ + tofu validate + +@PHONY: tflint-webhook +tflint-webhook: + cd modules/webhook && \ + tofu init && \ + tflint --init && \ + tflint --call-module-type=all && \ + tofu validate + +@PHONY: tflint-main +tflint-main: + tofu init + tflint --init + tflint --call-module-type=all --recursive + tofu validate + +clean: + rm -rf .terraform terraform.lock.hcl diff --git a/terraform-aws-github-runner/README.md b/terraform-aws-github-runner/README.md index f872da3ca0..cb464cda8a 100644 --- a/terraform-aws-github-runner/README.md +++ b/terraform-aws-github-runner/README.md @@ -2,6 +2,18 @@ This is a terraform module that sets up self hosted github runners on AWS along with the infra needed to autoscale them +# Testing your changes +In order to verify if your changes will pass CI testing, you can simply run from this directory: + +``` +$ make tflint +``` + +This depends on Tofu, CMake and TFLint being installed. 
+ +# Checking plan changes of your changes +This module is not stand alone. It is a reusable module designed to be imported, configured, and used in your project. + # Release Terraform code that uses this module specify the tag (version of test-infra) that they use via a file called `Terrafile`. We need to create a new tag for any changes here that we want to deploy and update the `Terrafile` to refer to that tag: From e4220531700777ea3231f063cfb1580f6fe58ebc Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Thu, 6 Mar 2025 13:59:05 -0800 Subject: [PATCH 15/16] stash testing changes --- .../src/scale-runners/scale-up-chron.test.ts | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts index f16c4b1116..0eb33712bf 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts @@ -1,18 +1,12 @@ import { createRunner } from './runners'; import { - createRegistrationTokenOrg, - createRegistrationTokenRepo, - getGitHubRateLimit, getRunnerTypes, - listGithubRunnersOrg, - listGithubRunnersRepo, + } from './gh-runners'; import { Config } from './config'; -import { getRepoIssuesWithLabel, GhIssues } from './gh-issues'; -import { mocked } from 'ts-jest/utils'; -import nock from 'nock'; -import { scaleUp, _calculateScaleUpAmount } from './scale-up'; +import { mocked } from "jest-mock"; + import { scaleUpChron, getQueuedJobs } from './scale-up-chron'; import axios from 'axios'; @@ -36,7 +30,7 @@ const metrics = new MetricsModule.ScaleUpChronMetrics(); describe('scaleUpChron', () => { beforeEach(() => { - const mockedGetRepo = mocked(getRepo).mockReturnValue('repo'); + const mockedGetRepo = mocked(getRepo).mockReturnValue({ owner: 'owner', repo: 'repo' }); const mockedvalidRunnerTypes = mocked(getRunnerTypes).mockResolvedValue( new Map([ [ From b437c423614e5f08b055b69525ef720eaaa87588 Mon Sep 17 00:00:00 2001 From: Camyll Harajli Date: Fri, 7 Mar 2025 15:58:01 -0800 Subject: [PATCH 16/16] working tests for scaleupchron --- .../src/scale-runners/scale-up-chron.test.ts | 144 ++++++++---------- .../src/scale-runners/scale-up-chron.ts | 3 +- .../modules/runners/scale-up-chron.tf | 2 +- 3 files changed, 65 insertions(+), 84 deletions(-) diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts index 0eb33712bf..3a64896821 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts @@ -1,24 +1,23 @@ -import { createRunner } from './runners'; -import { - getRunnerTypes, -} from './gh-runners'; import { Config } from './config'; -import { mocked } from "jest-mock"; +import { mocked } from 'ts-jest/utils'; +import { getRepo, expBackOff } from './utils'; +// import * as ScaleUpChronModule from './scale-up-chron'; import { scaleUpChron, getQueuedJobs } from './scale-up-chron'; -import axios from 'axios'; import * as MetricsModule from './metrics'; jest.mock('./runners'); jest.mock('./gh-runners'); 
jest.mock('./gh-issues'); +jest.mock('./utils'); +jest.mock('axios'); - -// Import the required modules -import { getRepo } from './utils'; +const responseString1 = '[{"runner_label":"test_runner_type1","org":"test_org1","repo":"test_repo1","num_queued_jobs":1,"min_queue_time_minutes":1,"max_queue_time_minutes":1},{"runner_label":"test_runner_type2","org":"test_org2","repo":"test_repo2","num_queued_jobs":2,"min_queue_time_minutes":2,"max_queue_time_minutes":2}]'; +const responseString2 = '[{"runner_label":"label1-nomatch","org":"test_org1","repo":"test_repo1","num_queued_jobs":1,"min_queue_time_minutes":1,"max_queue_time_minutes":1},{"runner_label":"test_runner_type2","org":"test_org2","repo":"test_repo2","num_queued_jobs":2,"min_queue_time_minutes":2,"max_queue_time_minutes":2}]'; +const responseString3 = '[{"runner_label":"label1","org":"test_org1-nomatch","repo":"test_repo1","num_queued_jobs":1,"min_queue_time_minutes":1,"max_queue_time_minutes":1},{"runner_label":"test_runner_type2","org":"test_org2","repo":"test_repo2","num_queued_jobs":2,"min_queue_time_minutes":2,"max_queue_time_minutes":2}]'; const baseCfg = { scaleConfigOrg: 'test_org1', @@ -27,71 +26,63 @@ const baseCfg = { } as unknown as Config; const metrics = new MetricsModule.ScaleUpChronMetrics(); - +// beforeEach(() => { +// jest.resetModules(); +// jest.clearAllMocks(); +// jest.restoreAllMocks(); + + // mocked(getRepo).mockReturnValue ({ owner: 'owner', repo: 'repo' }); + + // mocked(getRunnerTypes).mockResolvedValue( + // new Map([ + // [ + // 'label1', + // { + // instance_type: 'instance_type', + // os: 'os', + // max_available: 33, + // disk_size: 113, + // runnerTypeName: 'runnerTypeName', + // is_ephemeral: false, + // }, + // ], + // ]), + // ); +// }); describe('scaleUpChron', () => { - beforeEach(() => { - const mockedGetRepo = mocked(getRepo).mockReturnValue({ owner: 'owner', repo: 'repo' }); - const mockedvalidRunnerTypes = mocked(getRunnerTypes).mockResolvedValue( - new Map([ - [ - 'label1', - { - instance_type: 'instance_type', - os: 'os', - max_available: 33, - disk_size: 113, - runnerTypeName: 'runnerTypeName', - is_ephemeral: false, - }, - ], - ]), - ); - }); - jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg); - - const minAutoScaleupDelayMinutes = Config.Instance.scaleUpMinQueueTimeMinutes; - if (!Config.Instance.scaleUpRecordQueueUrl) { - metrics.scaleUpInstanceFailureNonRetryable('scaleUpRecordQueueUrl is not set. Cannot send queued scale up requests'); - throw new Error('scaleUpRecordQueueUrl is not set. Cannot send queued scale up requests'); - } it('invalid scaleUpRecordQueueUrl', async () => { + jest.clearAllMocks(); jest.spyOn(Config, 'Instance', 'get').mockImplementation( () => ({ ...baseCfg, scaleUpRecordQueueUrl: null, } as unknown as Config), - ); expect(await scaleUpChron(metrics)).rejects.toThrow('scaleUpRecordQueueUrl is not set. Cannot send queued scale up requests'); + ); + mocked(getRepo).mockReturnValue ({ owner: 'owner', repo: 'repo' }); + const scaleUpChron = jest.requireActual('./scale-up-chron').scaleUpChron; + await expect(scaleUpChron(metrics)).rejects.toThrow(new Error('scaleUpRecordQueueUrl is not set. 
Cannot send queued scale up requests')); }); it('queued jobs do not match available runners', async () => { - const mockedGetQueuedJobs = mocked(getQueuedJobs).mockResolvedValue([ - { - runner_label: 'label1-nomatch', - org: 'test_org1', - repo: 'test_repo1', - num_queued_jobs: 1, - min_queue_time_minutes: 1, - max_queue_time_minutes: 1 - } - ]) - const scaleUpInstanceNoOp = jest.spyOn(metrics, 'scaleUpInstanceNoOp'); + jest.clearAllMocks(); + jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg); + mocked(getRepo).mockReturnValue ({ owner: 'test_org1', repo: 'test_repo1' }); + mocked(expBackOff).mockResolvedValue({ data: responseString2 }); + + const scaleUpInstanceNoOpSpy = jest.spyOn(metrics, 'scaleUpInstanceNoOp'); + await scaleUpChron(metrics) - expect(scaleUpInstanceNoOp).toBeCalledTimes(1); + expect(scaleUpInstanceNoOpSpy).toBeCalledTimes(1); }); it('queued jobs do not match scale config org', async () => { - const mockedGetQueuedJobs = mocked(getQueuedJobs).mockResolvedValue([ - { - runner_label: 'label1', - org: 'test_org1-nomatch', - repo: 'test_repo1', - num_queued_jobs: 1, - min_queue_time_minutes: 1, - max_queue_time_minutes: 1 - } - ]) + jest.clearAllMocks(); + jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg); + mocked(getRepo).mockReturnValue ({ owner: 'test_org1', repo: 'test_repo1' }); + mocked(expBackOff).mockResolvedValue({ data: responseString3 }); + const scaleUpInstanceNoOp = jest.spyOn(metrics, 'scaleUpInstanceNoOp'); await scaleUpChron(metrics) expect(scaleUpInstanceNoOp).toBeCalledTimes(1); @@ -100,23 +91,8 @@ describe('scaleUpChron', () => { describe('getQueuedJobs', () => { it('get queue data from url request with valid response', async () => { - const dataMap1 = new Map([ - ['runner_type', 'test_runner_type1'], - ['org', 'test_org1'], - ['repo', 'test_repo1'], - ['num_queued_jobs', '1'], - ['min_queue_time_minutes', '1'], - ['max_queue_time_minutes', '1'] - ]) - const dataMap2 = new Map([ - ['runner_type', 'test_runner_type2'], - ['org', 'test_org2'], - ['repo', 'test_repo2'], - ['num_queued_jobs', '2'], - ['min_queue_time_minutes', '2'], - ['max_queue_time_minutes', '2'] - ]) - jest.spyOn(axios, 'get').mockReturnValue(new Map([['data', [dataMap1, dataMap2]]])); + mocked(expBackOff).mockResolvedValue({ data: responseString1 }); + expect(await getQueuedJobs(metrics, 'url')).toEqual([ { runner_label: 'test_runner_type1', @@ -131,21 +107,25 @@ describe('getQueuedJobs', () => { repo: 'test_repo2', num_queued_jobs: 2, min_queue_time_minutes: 2, + max_queue_time_minutes:2 } ]); - }); it('get queue data from url request with invalid response', async () => { - jest.spyOn(axios, 'get').mockReturnValue(new Map([['noDataHere', 'whoops']])); + const errorResponse = ''; + mocked(expBackOff).mockImplementation( + () => {throw new Error('Throwing a fake error!')}); + const runners = await getQueuedJobs(metrics, 'url'); - await expect(getQueuedJobs(metrics, 'url')).rejects.toThrow('Error fetching queued runners: {TODO:camyllh test and add error message}'); + expect(await getQueuedJobs(metrics, 'url')).toEqual([]); }); - it('get queue data from url request with invalid response', async () => { - jest.spyOn(axios, 'get').mockReturnValue(new Map([['noDataHere', 'whoops']])); + it('get queue data from url request with empty response', async () => { + const errorResponse = ''; + mocked(expBackOff).mockResolvedValue({ data: errorResponse }); + const runners = await getQueuedJobs(metrics, 'url'); - await expect(getQueuedJobs(metrics, 
'url')).rejects.toThrow('Error fetching queued runners: {TODO:camyllh test and add error message}'); + expect(await getQueuedJobs(metrics, 'url')).toEqual([]); }); - }); diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts index f52b4271e2..bad2816b70 100644 --- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts +++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts @@ -85,7 +85,8 @@ export async function getQueuedJobs(metrics: ScaleUpChronMetrics, scaleUpRecordQ }); // Map the response to the class - return response?.data.map((runner: any) => { + const responseData = JSON.parse(response.data); + return responseData.map((runner: any) => { metrics.queuedRunnerStats(runner.org, runner.runner_label, runner.num_queued_jobs,); return { runner_label: runner.runner_label, diff --git a/terraform-aws-github-runner/modules/runners/scale-up-chron.tf b/terraform-aws-github-runner/modules/runners/scale-up-chron.tf index 3224b5684e..c38382ea33 100644 --- a/terraform-aws-github-runner/modules/runners/scale-up-chron.tf +++ b/terraform-aws-github-runner/modules/runners/scale-up-chron.tf @@ -47,7 +47,7 @@ resource "aws_lambda_function" "scale_up_chron" { SCALE_CONFIG_REPO = var.scale_config_repo SCALE_CONFIG_REPO_PATH = var.scale_config_repo_path SCALE_UP_MIN_QUEUE_TIME_MINUTES = 30 - SCALE_UP_RECORD_QUEUE_URL = 'https://hud.pytorch.org/api/clickhouse/queued_jobs_aggregate?parameters=%5B%5D' + SCALE_UP_RECORD_QUEUE_URL = "https://hud.pytorch.org/api/clickhouse/queued_jobs_aggregate?parameters=%5B%5D" scale_up_chron_CONFIG = jsonencode(var.idle_config) SECRETSMANAGER_SECRETS_ID = var.secretsmanager_secrets_id AWS_REGIONS_TO_VPC_IDS = join(