diff --git a/.github/actions/setup-binary-builds/action.yml b/.github/actions/setup-binary-builds/action.yml
index 7cbadbc9e8..fbcb49a315 100644
--- a/.github/actions/setup-binary-builds/action.yml
+++ b/.github/actions/setup-binary-builds/action.yml
@@ -144,12 +144,16 @@ runs:
         if [[ "${PYTHON_VERSION:-}" == "3.13t" ]]; then
           export PYTHON_VERSION=3.13
           export CONDA_EXTRA_PARAM=" python-freethreading -c conda-forge"
-          if [[ "$(uname)" != Darwin ]]; then
-            # Pin conda and conda-libmamba-solver for 3.13t linux build
-            # this solver allows us to install anaconda dependencies on
-            # python-freethreading on conda-forge environment
-            conda install conda==24.7.1 conda-libmamba-solver=24.1.0
+
+          # Downgrade the conda version for the python 3.13t install.
+          # TODO: remove this once python 3.13t is fully supported on conda.
+          # Please see: https://github.com/conda/conda/issues/14554
+          if [[ "$(uname)" == Darwin ]]; then
+            # Required to be able to downgrade conda on macOS M1
+            conda install -y python=3.9
+            conda uninstall -y conda-anaconda-telemetry conda-anaconda-tos
           fi
+          conda install -y conda=24.7.1 conda-libmamba-solver=24.1.0
         fi

         conda create \
diff --git a/.github/scripts/install_xpu.bat b/.github/scripts/install_xpu.bat
index c31276c0b5..afdfe1da87 100644
--- a/.github/scripts/install_xpu.bat
+++ b/.github/scripts/install_xpu.bat
@@ -39,9 +39,9 @@ set XPU_EXTRA_INSTALLED=0
 set XPU_EXTRA_UNINSTALL=0

 if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.0] (
-    set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/efc86abd-cb77-452e-a03f-a741895b8ece/intel-deep-learning-essentials-2025.0.0.336_offline.exe
+    set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe
     set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
-    set XPU_BUNDLE_VERSION=2025.0.0+335
+    set XPU_BUNDLE_VERSION=2025.0.1+20
     set XPU_BUNDLE_INSTALLED=0
     set XPU_BUNDLE_UNINSTALL=0
    set XPU_EXTRA_URL=NULL
diff --git a/.github/workflows/tflint.yml b/.github/workflows/tflint.yml
index 6324045d29..f41e5889ca 100644
--- a/.github/workflows/tflint.yml
+++ b/.github/workflows/tflint.yml
@@ -24,102 +24,20 @@ jobs:
           github_token: ${{ secrets.GITHUB_TOKEN }}
           tflint_version: v0.54.0

-      - name: Install Terraform
-        uses: hashicorp/setup-terraform@v2
+      - name: Install Tofu
+        uses: opentofu/setup-opentofu@v1
         with:
-          terraform_version: 1.5.1
-          terraform_wrapper: false
+          tofu_version: 1.6.0
+          tofu_wrapper: false

-      - name: Show version
-        run: tflint --version
+      - name: Show tflint version
+        run: tflint --version

-      - name: "Init TFLint download-lambda"
-        working-directory: terraform-aws-github-runner/modules/download-lambda
-        run: tflint --init
-      - name: "Init terraform download-lambda"
-        working-directory: terraform-aws-github-runner/modules/download-lambda
-        run: terraform init
-      - name: "Run TFLint download-lambda"
-        working-directory: terraform-aws-github-runner/modules/download-lambda
-        run: tflint --call-module-type=all
-      - name: "Run terraform validate download-lambda"
-        working-directory: terraform-aws-github-runner/modules/download-lambda
-        run: terraform validate
+      - name: Show tofu version
+        run: tofu --version

-      - name: "Init TFLint runner-binaries-syncer"
-        working-directory: terraform-aws-github-runner/modules/runner-binaries-syncer
-        run: tflint --init
-      - name: "Init terraform runner-binaries-syncer"
-        working-directory: terraform-aws-github-runner/modules/runner-binaries-syncer
-        run: terraform init
-      - name: "Run TFLint runner-binaries-syncer"
-        working-directory: terraform-aws-github-runner/modules/runner-binaries-syncer
-        run: tflint --call-module-type=all
-      - name: "Run terraform validate runner-binaries-syncer"
-        working-directory: terraform-aws-github-runner/modules/runner-binaries-syncer
-        run: terraform validate
-
-      - name: "Init TFLint runners-instances"
-        working-directory: terraform-aws-github-runner/modules/runners-instances
-        run: tflint --init
-      - name: "Init terraform runners-instances"
-        working-directory: terraform-aws-github-runner/modules/runners-instances
-        run: terraform init
-      - name: "Run TFLint runners-instances"
-        working-directory: terraform-aws-github-runner/modules/runners-instances
-        run: tflint --call-module-type=all
-      - name: "Run terraform validate runners-instances"
-        working-directory: terraform-aws-github-runner/modules/runners-instances
-        run: terraform validate
-
-      - name: "Init TFLint runners"
-        working-directory: terraform-aws-github-runner/modules/runners
-        run: tflint --init
-      - name: "Init terraform runners"
-        working-directory: terraform-aws-github-runner/modules/runners
-        run: terraform init
-      - name: "Run TFLint runners"
-        working-directory: terraform-aws-github-runner/modules/runners
-        run: tflint --call-module-type=all
-      - name: "Run terraform validate runners"
-        working-directory: terraform-aws-github-runner/modules/runners
-        run: terraform validate
-
-      - name: "Init TFLint setup-iam-permissions"
-        working-directory: terraform-aws-github-runner/modules/setup-iam-permissions
-        run: tflint --init
-      - name: "Init terraform setup-iam-permissions"
-        working-directory: terraform-aws-github-runner/modules/setup-iam-permissions
-        run: terraform init
-      - name: "Run TFLint setup-iam-permissions"
-        working-directory: terraform-aws-github-runner/modules/setup-iam-permissions
-        run: tflint --call-module-type=all
-      - name: "Run terraform validate setup-iam-permissions"
-        working-directory: terraform-aws-github-runner/modules/setup-iam-permissions
-        run: terraform validate
-
-      - name: "Init TFLint webhook"
-        working-directory: terraform-aws-github-runner/modules/webhook
-        run: tflint --init
-      - name: "Init terraform webhook"
-        working-directory: terraform-aws-github-runner/modules/webhook
-        run: terraform init
-      - name: "Run TFLint webhook"
-        working-directory: terraform-aws-github-runner/modules/webhook
-        run: tflint --call-module-type=all
-      - name: "Run terraform validate webhook"
-        working-directory: terraform-aws-github-runner/modules/webhook
-        run: terraform validate
-
-      - name: "Init TFLint main"
-        working-directory: terraform-aws-github-runner
-        run: tflint --init
-      - name: "Init terraform main"
-        working-directory: terraform-aws-github-runner
-        run: terraform init
-      - name: "Run TFLint main"
-        working-directory: terraform-aws-github-runner
-        run: tflint --call-module-type=all
-      - name: "Run terraform validate terraform-aws-github-runner"
+      - name: "tflint"
         working-directory: terraform-aws-github-runner
-        run: terraform validate
+        run: make tflint
diff --git a/release/promote.sh b/release/promote.sh
index 8b79397833..3d792f454e 100644
--- a/release/promote.sh
+++ b/release/promote.sh
@@ -107,6 +107,7 @@ promote_pypi() {
 # promote_s3 fbgemm-gpu whl "${FBGEMMGPU_VERSION}"
 # promote_s3 "libtorch-*" libtorch "${PYTORCH_VERSION}"
 # promote_s3 "torch_tensorrt" whl "${TENSORRT_VERSION}"
+# promote_s3 "torchao" whl "${TORCHAO_VERSION}"

 # promote_conda torchtriton conda "2.1.0"
 # promote_conda pytorch-cuda conda "11.8"
diff --git a/release/pypi/promote_pypi_to_staging.sh b/release/pypi/promote_pypi_to_staging.sh
index bfda4b6f8f..6c8befc09b 100644
--- a/release/pypi/promote_pypi_to_staging.sh
+++ b/release/pypi/promote_pypi_to_staging.sh
@@ -47,10 +47,11 @@ PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX=""
 #PLATFORM="linux_x86" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging executorch "${EXECUTORCH_VERSION}"
 #PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging executorch "${EXECUTORCH_VERSION}"

+# PLATFORM="manylinux" VERSION_SUFFIX="${LINUX_VERSION_SUFFIX}" ARCH="cu124" upload_pypi_to_staging torchao "${TORCHAO_VERSION}"
+# PLATFORM="none-any" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchao "${TORCHAO_VERSION}"
+
 #PLATFORM="linux_x86" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}"
 #PLATFORM="win_amd64" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}"
 #PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}"
-#PLATFORM="linux_x86_64" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}"
-#PLATFORM="win_amd64" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}"
-#PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}"
+#PLATFORM="none-any" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}"
diff --git a/release/pypi/upload_pypi_to_staging.sh b/release/pypi/upload_pypi_to_staging.sh
index 1f32984b40..38455cea74 100644
--- a/release/pypi/upload_pypi_to_staging.sh
+++ b/release/pypi/upload_pypi_to_staging.sh
@@ -21,11 +21,14 @@ PLATFORM=${PLATFORM:-}
 # i.e. cpu, cu121, cu124
 ARCH=${ARCH:-cpu}

+# This extracts the package links from the index.html.
+# We strip everything after the '#', including the trailing sha256 fragment.
 pkgs_to_promote=$(\
   curl -fsSL "https://download.pytorch.org/whl/test/${ARCH}/${PACKAGE_NAME}/index.html" \
     | grep "${PACKAGE_NAME}-${PACKAGE_VERSION}${VERSION_SUFFIX}-" \
     | grep "${PLATFORM}" \
-    | cut -d '"' -f2
+    | cut -d '"' -f2 \
+    | cut -d "#" -f1
 )

 tmp_dir="$(mktemp -d)"
diff --git a/release/release_versions.sh b/release/release_versions.sh
index 32e8c47d8b..71b9c75d0f 100644
--- a/release/release_versions.sh
+++ b/release/release_versions.sh
@@ -8,6 +8,7 @@ TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.18.0}
 TORCHREC_VERSION=${TORCHREC_VERSION:-0.8.0}
 TENSORRT_VERSION=${TENSORRT_VERSION:-2.4.0}
 EXECUTORCH_VERSION=${EXECUTORCH_VERSION:-0.3.0}
-TORCHAO_VERSION=${TORCHAO_VERSION:-0.4.0}
+TORCHAO_VERSION=${TORCHAO_VERSION:-0.9.0}
+TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.11.0}
 TORCHTUNE_VERSION=${TORCHTUNE_VERSION:-0.2.1}
 FBGEMMGPU_VERSION=${FBGEMMGPU_VERSION:-1.0.0}
diff --git a/terraform-aws-github-runner/Makefile b/terraform-aws-github-runner/Makefile
new file mode 100644
index 0000000000..072c7d5568
--- /dev/null
+++ b/terraform-aws-github-runner/Makefile
@@ -0,0 +1,62 @@
+all: tflint
+
+.PHONY: tflint
+tflint: tflint-download-lambda tflint-runner-binaries-syncer tflint-runners-instances tflint-runners tflint-setup-iam-permissions tflint-webhook tflint-main
+
+.PHONY: tflint-download-lambda
+tflint-download-lambda:
+	cd modules/download-lambda && \
+	tofu init && \
+	tflint --init && \
+	tflint --call-module-type=all && \
+	tofu validate
+
+.PHONY: tflint-runner-binaries-syncer
+tflint-runner-binaries-syncer:
+	cd modules/runner-binaries-syncer && \
+	tofu init && \
+	tflint --init && \
+	tflint --call-module-type=all && \
+	tofu validate
+
+.PHONY: tflint-runners-instances
+tflint-runners-instances:
+	cd modules/runners-instances && \
+	tofu init && \
+	tflint --init && \
+	tflint --call-module-type=all && \
+	tofu validate
+
+.PHONY: tflint-runners
+tflint-runners:
+	cd modules/runners && \
+	tofu init && \
+	tflint --init && \
+	tflint --call-module-type=all && \
+	tofu validate
+
+.PHONY: tflint-setup-iam-permissions
+tflint-setup-iam-permissions:
+	cd modules/setup-iam-permissions && \
+	tofu init && \
+	tflint --init && \
+	tflint --call-module-type=all && \
+	tofu validate
+
+.PHONY: tflint-webhook
+tflint-webhook:
+	cd modules/webhook && \
+	tofu init && \
+	tflint --init && \
+	tflint --call-module-type=all && \
+	tofu validate
+
+.PHONY: tflint-main
+tflint-main:
+	tofu init
+	tflint --init
+	tflint --call-module-type=all --recursive
+	tofu validate
+
+clean:
+	rm -rf .terraform .terraform.lock.hcl
diff --git a/terraform-aws-github-runner/README.md b/terraform-aws-github-runner/README.md
index f872da3ca0..cb464cda8a 100644
--- a/terraform-aws-github-runner/README.md
+++ b/terraform-aws-github-runner/README.md
@@ -2,6 +2,18 @@

 This is a terraform module that sets up self hosted github runners on AWS along with the infra needed to autoscale them

+# Testing your changes
+To verify that your changes will pass CI, run the following from this directory:
+
+```
+$ make tflint
+```
+
+This depends on Tofu, Make, and TFLint being installed.
+
+# Checking plan changes
+This module is not standalone. It is a reusable module designed to be imported, configured, and used in your project.
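+
+As a rough sketch of what that looks like (the module source ref and every value
+below are illustrative, not prescriptive, and the module has other required inputs
+as well), a consuming project instantiates the module and then runs `tofu plan`
+from that project to see the resulting plan changes:
+
+```
+module "github_runners" {
+  source = "git::https://github.com/pytorch/test-infra.git//terraform-aws-github-runner?ref=<your-tag>"
+
+  environment       = "my-ci"
+  scale_config_org  = "my-org"
+  scale_config_repo = "my-scale-config-repo"
+}
+```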
+
 # Release
 Terraform code that uses this module specify the tag (version of test-infra) that they use via a file called `Terrafile`. We need to create a new tag for any changes here that we want to deploy and update the `Terrafile` to refer to that tag:
diff --git a/terraform-aws-github-runner/main.tf b/terraform-aws-github-runner/main.tf
index 094389bb06..b32cc3ffb7 100644
--- a/terraform-aws-github-runner/main.tf
+++ b/terraform-aws-github-runner/main.tf
@@ -104,6 +104,7 @@ module "runners" {
   environment = var.environment
   tags        = local.tags

+  scale_config_org       = var.scale_config_org
   scale_config_repo      = var.scale_config_repo
   scale_config_repo_path = var.scale_config_repo_path
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/jest.config.js b/terraform-aws-github-runner/modules/runners/lambdas/runners/jest.config.js
index c474887a66..6d1916820c 100644
--- a/terraform-aws-github-runner/modules/runners/lambdas/runners/jest.config.js
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/jest.config.js
@@ -11,5 +11,8 @@ module.exports = {
       lines: 80,
       statements: 80
     }
-  }
+  },
+  moduleNameMapper: {
+    axios: 'axios/dist/node/axios.cjs', // Allow axios to work in tests
+  },
 };
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json b/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json
index 6e5217a9ef..27976ad2c7 100644
--- a/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/package.json
@@ -42,6 +42,7 @@
     "@types/uuid": "^9.0.1",
     "async-mutex": "^0.4.0",
     "aws-sdk": "^2.863.0",
+    "axios": "^1.7.7",
     "cron-parser": "^3.3.0",
     "generic-pool": "^3.9.0",
     "lru-cache": "^6.0.0",
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts
index acbb6e52b9..61beb786bd 100644
--- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/lambda.ts
@@ -2,9 +2,10 @@ import { ActionRequestMessage, RetryableScalingError, scaleUp as scaleUpR } from
 import { Context, SQSEvent, SQSRecord, ScheduledEvent } from 'aws-lambda';

 import { Config } from './scale-runners/config';
-import { ScaleUpMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './scale-runners/metrics';
+import { ScaleUpMetrics, ScaleUpChronMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './scale-runners/metrics';
 import { getDelayWithJitterRetryCount, stochaticRunOvershoot } from './scale-runners/utils';
 import { scaleDown as scaleDownR } from './scale-runners/scale-down';
+import { scaleUpChron as scaleUpChronR } from './scale-runners/scale-up-chron';
 import { sqsSendMessages, sqsDeleteMessageBatch } from './scale-runners/sqs';

 async function sendRetryEvents(evtFailed: Array<[SQSRecord, boolean, number]>, metrics: ScaleUpMetrics) {
@@ -155,3 +156,35 @@ export async function scaleDown(event: ScheduledEvent, context: Context, callbac
     return callback('Failed');
   }
 }
+
+// eslint-disable-next-line @typescript-eslint/no-explicit-any
+export async function scaleUpChron(event: ScheduledEvent, context: Context, callback: any) {
+  // we maintain open connections to redis, so the event pool is only cleaned when the SIGTERM is sent
+  context.callbackWaitsForEmptyEventLoop = false;
+
+  const metrics = new ScaleUpChronMetrics();
+  const sndMetricsTimout: sendMetricsTimeoutVars = {
+    metrics: metrics,
+  };
+  sndMetricsTimout.setTimeout = setTimeout(
+    sendMetricsAtTimeout(sndMetricsTimout),
+    (Config.Instance.lambdaTimeout - 10) * 1000,
+  );
+
+  try {
+    await scaleUpChronR(metrics);
+    return callback(null);
+  } catch (e) {
+    console.error(e);
+    return callback('Failed');
+  } finally {
+    try {
+      clearTimeout(sndMetricsTimout.setTimeout);
+      sndMetricsTimout.metrics = undefined;
+      sndMetricsTimout.setTimeout = undefined;
+      await metrics.sendMetrics();
+    } catch (e) {
+      console.error(`Error sending metrics: ${e}`);
+    }
+  }
+}
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts
index 0755e86970..59d9c4c2b3 100644
--- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/config.ts
@@ -36,8 +36,11 @@ export class Config {
   readonly retryScaleUpRecordQueueUrl: string | undefined;
   readonly runnerGroupName: string | undefined;
   readonly runnersExtraLabels: undefined | string;
+  readonly scaleConfigOrg: string;
   readonly scaleConfigRepo: string;
   readonly scaleConfigRepoPath: string;
+  readonly scaleUpMinQueueTimeMinutes: number;
+  readonly scaleUpRecordQueueUrl: string | undefined;
   readonly secretsManagerSecretsId: string | undefined;
   readonly sSMParamCleanupAgeDays: number;
   readonly sSMParamMaxCleanupAllowance: number;
@@ -94,8 +97,11 @@ export class Config {
     /* istanbul ignore next */
     this.retryScaleUpRecordJitterPct = Number(process.env.RETRY_SCALE_UP_RECORD_JITTER_PCT || '0');
     this.retryScaleUpRecordQueueUrl = process.env.RETRY_SCALE_UP_RECORD_QUEUE_URL;
+    this.scaleUpRecordQueueUrl = process.env.SCALE_UP_RECORD_QUEUE_URL;
+    this.scaleUpMinQueueTimeMinutes = process.env.SCALE_UP_MIN_QUEUE_TIME_MINUTES ? Number(process.env.SCALE_UP_MIN_QUEUE_TIME_MINUTES) : 30;
     this.runnerGroupName = process.env.RUNNER_GROUP_NAME;
     this.runnersExtraLabels = process.env.RUNNER_EXTRA_LABELS;
+    this.scaleConfigOrg = process.env.SCALE_CONFIG_ORG || '';
     /* istanbul ignore next */
     this.scaleConfigRepo = process.env.SCALE_CONFIG_REPO || '';
     if (this.enableOrganizationRunners && !this.scaleConfigRepo) {
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts
index 64c5998919..4981874431 100644
--- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/metrics.ts
@@ -1424,6 +1424,55 @@ export class ScaleDownMetrics extends Metrics {
   }
 }

+export class ScaleUpChronMetrics extends ScaleUpMetrics {
+  constructor() {
+    super();
+  }
+
+  queuedRunnerStats(org: string, runnerType: string, numQueuedJobs: number) {
+    const dimensions = new Map([['Org', org], ['RunnerType', runnerType]]);
+    this.addEntry('gh.scaleupchron.queuedRunners', numQueuedJobs, dimensions);
+  }
+
+  queuedRunnerFailure(error: string) {
+    const dimensions = new Map([['error', error]]);
+    this.countEntry('gh.scaleupchron.queuedRunners.failure', 1, dimensions);
+  }
+
+  /* istanbul ignore next */
+  getQueuedJobsEndpointSuccess(ms: number) {
+    this.countEntry(`gh.calls.total`, 1);
+    this.countEntry(`gh.calls.getQueuedJobsEndpoint.count`, 1);
+    this.countEntry(`gh.calls.getQueuedJobsEndpoint.success`, 1);
+    this.addEntry(`gh.calls.getQueuedJobsEndpoint.wallclock`, ms);
+  }
+
+  /* istanbul ignore next */
+  getQueuedJobsEndpointFailure(ms: number) {
+    this.countEntry(`gh.calls.total`, 1);
+    this.countEntry(`gh.calls.getQueuedJobsEndpoint.count`, 1);
+    this.countEntry(`gh.calls.getQueuedJobsEndpoint.failure`, 1);
+    this.addEntry(`gh.calls.getQueuedJobsEndpoint.wallclock`, ms);
+  }
+
+  scaleUpInstanceSuccess() {
+    this.scaleUpSuccess();
+    this.countEntry('run.scaleupchron.success');
+  }
+
+  scaleUpInstanceFailureNonRetryable(error: string) {
+    const dimensions = new Map([['error', error]]);
+    // should we add more information about this or do we not care since it'll be requeued?
+    this.countEntry('run.scaleupchron.failure.nonRetryable', 1, dimensions);
+  }
+
+  scaleUpInstanceFailureRetryable(error: string) {
+    const dimensions = new Map([['error', error]]);
+    // should we add more information about this or do we not care since it'll be requeued?
+    this.countEntry('run.scaleupchron.failure.retryable', 1, dimensions);
+  }
+
+  scaleUpInstanceNoOp() {
+    this.countEntry('run.scaleupchron.noop');
+  }
+}
+
 export interface sendMetricsTimeoutVars {
   metrics?: Metrics;
   setTimeout?: ReturnType<typeof setTimeout>;
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts
index 3c0e6885e9..4201946513 100644
--- a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/runners.ts
@@ -572,7 +572,7 @@ export async function createRunner(runnerParameters: RunnerInputParameters, metr
       ` [${runnerParameters.runnerType.runnerTypeName}] [AMI?:${customAmi}] ${labelsStrLog}: `,
       runInstancesResponse.Instances.map((i) => i.InstanceId).join(','),
     );
-    addSSMParameterRunnerConfig(
+    await addSSMParameterRunnerConfig(
       runInstancesResponse.Instances,
       runnerParameters,
       customAmiExperiment,
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts
new file mode 100644
index 0000000000..3a64896821
--- /dev/null
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.test.ts
@@ -0,0 +1,131 @@
+import { Config } from './config';
+import { mocked } from 'ts-jest/utils';
+import { getRepo, expBackOff } from './utils';
+
+import { scaleUpChron, getQueuedJobs } from './scale-up-chron';
+
+import * as MetricsModule from './metrics';
+
+jest.mock('./runners');
+jest.mock('./gh-runners');
+jest.mock('./gh-issues');
+jest.mock('./utils');
+jest.mock('axios');
+
+const responseString1 = '[{"runner_label":"test_runner_type1","org":"test_org1","repo":"test_repo1","num_queued_jobs":1,"min_queue_time_minutes":1,"max_queue_time_minutes":1},{"runner_label":"test_runner_type2","org":"test_org2","repo":"test_repo2","num_queued_jobs":2,"min_queue_time_minutes":2,"max_queue_time_minutes":2}]';
+const responseString2 = '[{"runner_label":"label1-nomatch","org":"test_org1","repo":"test_repo1","num_queued_jobs":1,"min_queue_time_minutes":1,"max_queue_time_minutes":1},{"runner_label":"test_runner_type2","org":"test_org2","repo":"test_repo2","num_queued_jobs":2,"min_queue_time_minutes":2,"max_queue_time_minutes":2}]';
+const responseString3 = '[{"runner_label":"label1","org":"test_org1-nomatch","repo":"test_repo1","num_queued_jobs":1,"min_queue_time_minutes":1,"max_queue_time_minutes":1},{"runner_label":"test_runner_type2","org":"test_org2","repo":"test_repo2","num_queued_jobs":2,"min_queue_time_minutes":2,"max_queue_time_minutes":2}]';
+
+const baseCfg = {
+  scaleConfigOrg: 'test_org1',
+  scaleUpMinQueueTimeMinutes: 30,
+  scaleUpRecordQueueUrl: 'url',
+} as unknown as Config;
+
+const metrics = new MetricsModule.ScaleUpChronMetrics();
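+
+// Note on fixtures: the responseString constants above mirror the JSON payload that
+// getQueuedJobs parses from the queued-jobs endpoint: an array of objects with
+// runner_label, org, repo, num_queued_jobs, min_queue_time_minutes and
+// max_queue_time_minutes. expBackOff (and axios underneath it) is mocked, so these
+// tests never hit the network.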
+
+describe('scaleUpChron', () => {
+  it('invalid scaleUpRecordQueueUrl', async () => {
+    jest.clearAllMocks();
+    jest.spyOn(Config, 'Instance', 'get').mockImplementation(
+      () =>
+        ({
+          ...baseCfg,
+          scaleUpRecordQueueUrl: null,
+        } as unknown as Config),
+    );
+    mocked(getRepo).mockReturnValue({ owner: 'owner', repo: 'repo' });
+    const scaleUpChron = jest.requireActual('./scale-up-chron').scaleUpChron;
+    await expect(scaleUpChron(metrics)).rejects.toThrow(
+      new Error('scaleUpRecordQueueUrl is not set. Cannot send queued scale up requests'),
+    );
+  });
+
+  it('queued jobs do not match available runners', async () => {
+    jest.clearAllMocks();
+    jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg);
+    mocked(getRepo).mockReturnValue({ owner: 'test_org1', repo: 'test_repo1' });
+    mocked(expBackOff).mockResolvedValue({ data: responseString2 });
+
+    const scaleUpInstanceNoOpSpy = jest.spyOn(metrics, 'scaleUpInstanceNoOp');
+
+    await scaleUpChron(metrics);
+    expect(scaleUpInstanceNoOpSpy).toBeCalledTimes(1);
+  });
+
+  it('queued jobs do not match scale config org', async () => {
+    jest.clearAllMocks();
+    jest.spyOn(Config, 'Instance', 'get').mockImplementation(() => baseCfg);
+    mocked(getRepo).mockReturnValue({ owner: 'test_org1', repo: 'test_repo1' });
+    mocked(expBackOff).mockResolvedValue({ data: responseString3 });
+
+    const scaleUpInstanceNoOp = jest.spyOn(metrics, 'scaleUpInstanceNoOp');
+    await scaleUpChron(metrics);
+    expect(scaleUpInstanceNoOp).toBeCalledTimes(1);
+  });
+});
+
+describe('getQueuedJobs', () => {
+  it('get queue data from url request with valid response', async () => {
+    mocked(expBackOff).mockResolvedValue({ data: responseString1 });
+
+    expect(await getQueuedJobs(metrics, 'url')).toEqual([
+      {
+        runner_label: 'test_runner_type1',
+        org: 'test_org1',
+        repo: 'test_repo1',
+        num_queued_jobs: 1,
+        min_queue_time_minutes: 1,
+        max_queue_time_minutes: 1,
+      },
+      {
+        runner_label: 'test_runner_type2',
+        org: 'test_org2',
+        repo: 'test_repo2',
+        num_queued_jobs: 2,
+        min_queue_time_minutes: 2,
+        max_queue_time_minutes: 2,
+      },
+    ]);
+  });
+
+  it('get queue data from url request with invalid response', async () => {
+    mocked(expBackOff).mockImplementation(() => {
+      throw new Error('Throwing a fake error!');
+    });
+
+    expect(await getQueuedJobs(metrics, 'url')).toEqual([]);
+  });
+
+  it('get queue data from url request with empty response', async () => {
+    mocked(expBackOff).mockResolvedValue({ data: '' });
+
+    expect(await getQueuedJobs(metrics, 'url')).toEqual([]);
+  });
+});
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts
new file mode 100644
index 0000000000..bad2816b70
--- /dev/null
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/scale-runners/scale-up-chron.ts
@@ -0,0 +1,105 @@
+import axios from 'axios';
+
+import { Config } from './config';
+import { getRepo, shuffleArrayInPlace, expBackOff } from './utils';
+import { ScaleUpChronMetrics } from './metrics';
+import { getRunnerTypes } from './gh-runners';
+import { ActionRequestMessage, scaleUp } from './scale-up';
+
+export async function scaleUpChron(metrics: ScaleUpChronMetrics): Promise<void> {
+  // This function does the following:
+  // 1. Queries for queued runners via HUD
+  // 2. Polls scale-config to filter the list down to runner types that are self-hosted
+  //    by this fleet (both ephemeral and nonephemeral)
+  // 3. Sends a SQS request to the scale-up lambda to provision more of those instances
+  const scaleConfigRepo = getRepo(Config.Instance.scaleConfigOrg, Config.Instance.scaleConfigRepo);
+
+  const validRunnerTypes = await getRunnerTypes(scaleConfigRepo, metrics);
+
+  const minAutoScaleupDelayMinutes = Config.Instance.scaleUpMinQueueTimeMinutes;
+  if (!Config.Instance.scaleUpRecordQueueUrl) {
+    metrics.scaleUpInstanceFailureNonRetryable(
+      'scaleUpRecordQueueUrl is not set. Cannot send queued scale up requests',
+    );
+    throw new Error('scaleUpRecordQueueUrl is not set. Cannot send queued scale up requests');
+  }
+  const scaleUpRecordQueueUrl = Config.Instance.scaleUpRecordQueueUrl;
+  // Only proactively scale up the jobs that have been queued for longer than normal,
+  // and filter out any queued jobs that do not correspond to a valid runner type
+  const queuedJobs = (await getQueuedJobs(metrics, scaleUpRecordQueueUrl))
+    .filter((runner) => {
+      return (
+        runner.min_queue_time_minutes >= minAutoScaleupDelayMinutes &&
+        runner.org === Config.Instance.scaleConfigOrg
+      );
+    })
+    .filter((requested_runner) => {
+      return Array.from(validRunnerTypes.keys()).some((available_runner_label) => {
+        return available_runner_label === requested_runner.runner_label;
+      });
+    });
+
+  if (queuedJobs.length === 0) {
+    metrics.scaleUpInstanceNoOp();
+    return;
+  }
+
+  // Send a message to the SQS queue to scale up the runners
+  const scaleUpRequests: Array<ActionRequestMessage> = queuedJobs.map((runner) => {
+    return {
+      id: Math.floor(Math.random() * 100000000000000),
+      eventType: 'workflow_job',
+      repositoryName: runner.repo,
+      repositoryOwner: runner.org,
+      runnerLabels: [runner.runner_label],
+    };
+  });
+
+  for (const request of shuffleArrayInPlace(scaleUpRequests)) {
+    try {
+      await scaleUp('aws:sqs', request, metrics);
+      metrics.scaleUpInstanceSuccess();
+    } catch (error) {
+      metrics.scaleUpInstanceFailureRetryable((error as Error).message);
+    }
+  }
+}
+
+interface QueuedJobsForRunner {
+  runner_label: string;
+  org: string;
+  repo: string;
+  num_queued_jobs: number;
+  min_queue_time_minutes: number;
+  max_queue_time_minutes: number;
+}
+
+export async function getQueuedJobs(
+  metrics: ScaleUpChronMetrics,
+  scaleUpRecordQueueUrl: string,
+): Promise<QueuedJobsForRunner[]> {
+  // This function queries the HUD for queued runners and returns them as a list
+  const url = scaleUpRecordQueueUrl;
+
+  try {
+    const response = await expBackOff(() => {
+      return metrics.trackRequest(metrics.getQueuedJobsEndpointSuccess, metrics.getQueuedJobsEndpointFailure, () => {
+        return axios.get(url);
+      });
+    });
+
+    // Map the response to QueuedJobsForRunner records
+    const responseData = JSON.parse(response.data);
+    // eslint-disable-next-line @typescript-eslint/no-explicit-any
+    return responseData.map((runner: any) => {
+      metrics.queuedRunnerStats(runner.org, runner.runner_label, runner.num_queued_jobs);
+      return {
+        runner_label: runner.runner_label,
+        org: runner.org,
+        repo: runner.repo,
+        num_queued_jobs: Number(runner.num_queued_jobs),
+        min_queue_time_minutes: Number(runner.min_queue_time_minutes),
+        max_queue_time_minutes: Number(runner.max_queue_time_minutes),
+      };
+    });
+  } catch (error) {
+    metrics.queuedRunnerFailure((error as Error).message);
+    console.error('Error fetching queued runners:', error);
+    return [];
+  }
+}
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/src/template.yml b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/template.yml
new file mode 100644
index 0000000000..fda647cd9e
--- /dev/null
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/src/template.yml
@@ -0,0 +1,13 @@
+AWSTemplateFormatVersion: '2010-09-09'
+Transform: 'AWS::Serverless-2016-10-31'
+Resources:
+  ScaleUpChronFunction:
+    Type: 'AWS::Serverless::Function'
+    Properties:
+      Handler: index.scaleUpChron
+      Runtime: nodejs20.x
+      Events:
+        ScheduledEvent:
+          Type: Schedule
+          Properties:
+            Schedule: 'rate(1 minute)'
\ No newline at end of file
diff --git a/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock b/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock
index 7930c0018f..6ed2cc9b2c 100644
--- a/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock
+++ b/terraform-aws-github-runner/modules/runners/lambdas/runners/yarn.lock
@@ -1381,6 +1381,15 @@ aws-sdk@^2.863.0:
     uuid "8.0.0"
     xml2js "0.6.2"

+axios@^1.7.7:
+  version "1.7.7"
+  resolved "https://registry.yarnpkg.com/axios/-/axios-1.7.7.tgz#2f554296f9892a72ac8d8e4c5b79c14a91d0a47f"
+  integrity sha512-S4kL7XrjgBmvdGut0sN3yJxqYzrDOnivkBiN0OFs6hLiUam3UPvswUo0kqGyhqUZGEOytHyumEdXsAkgCOUf3Q==
+  dependencies:
+    follow-redirects "^1.15.6"
+    form-data "^4.0.0"
+    proxy-from-env "^1.1.0"
+
 babel-jest@^26.6.3:
   version "26.6.3"
   resolved "https://registry.yarnpkg.com/babel-jest/-/babel-jest-26.6.3.tgz#d87d25cb0037577a0c89f82e5755c5d293c01056"
@@ -2326,6 +2335,11 @@ flatted@^3.2.9:
   resolved "https://registry.yarnpkg.com/flatted/-/flatted-3.3.1.tgz#21db470729a6734d4997002f439cb308987f567a"
   integrity sha512-X8cqMLLie7KsNUDSdzeN8FYK9rEt4Dt67OsG/DNGnYTSDBG4uFAJFBnUeiV+zCVAvwFy56IjM9sH51jVaEhNxw==

+follow-redirects@^1.15.6:
+  version "1.15.9"
+  resolved "https://registry.yarnpkg.com/follow-redirects/-/follow-redirects-1.15.9.tgz#a604fa10e443bf98ca94228d9eebcc2e8a2c8ee1"
+  integrity sha512-gew4GsXizNgdoRyqmyfMHyAmXsZDk6mHkSxZFCzW9gwlbtOW44CDtYavM+y+72qD/Vq2l550kMF52DT8fOLJqQ==
+
 for-each@^0.3.3:
   version "0.3.3"
   resolved "https://registry.yarnpkg.com/for-each/-/for-each-0.3.3.tgz#69b447e88a0a5d32c3e7084f3f1710034b21376e"
@@ -2347,6 +2361,15 @@ form-data@^3.0.0:
     combined-stream "^1.0.8"
     mime-types "^2.1.12"

+form-data@^4.0.0:
+  version "4.0.1"
+  resolved "https://registry.yarnpkg.com/form-data/-/form-data-4.0.1.tgz#ba1076daaaa5bfd7e99c1a6cb02aa0a5cff90d48"
+  integrity sha512-tzN8e4TX8+kkxGPK8D5u0FNmjPUjw3lwC9lSLxxoB/+GtsJG91CO8bSWy73APlgAZzZbXEYZJuxjkHH2w+Ezhw==
+  dependencies:
+    asynckit "^0.4.0"
+    combined-stream "^1.0.8"
+    mime-types "^2.1.12"
+
 fragment-cache@^0.2.1:
   version "0.2.1"
   resolved "https://registry.yarnpkg.com/fragment-cache/-/fragment-cache-0.2.1.tgz#4290fad27f13e89be7f33799c6bc5a0abfff0d19"
@@ -4028,6 +4051,11 @@ propagate@^2.0.0:
   resolved "https://registry.yarnpkg.com/propagate/-/propagate-2.0.1.tgz#40cdedab18085c792334e64f0ac17256d38f9a45"
   integrity sha512-vGrhOavPSTz4QVNuBNdcNXePNdNMaO1xj9yBeH1ScQPjk/rhg9sSlCXPhMkFuaNNW/syTvYqsnbIJxMBfRbbag==

+proxy-from-env@^1.1.0:
+  version "1.1.0"
+  resolved "https://registry.yarnpkg.com/proxy-from-env/-/proxy-from-env-1.1.0.tgz#e102f16ca355424865755d2c9e8ea4f24d58c3e2"
+  integrity sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==
+
 psl@^1.1.33:
   version "1.9.0"
   resolved "https://registry.yarnpkg.com/psl/-/psl-1.9.0.tgz#d0df2a137f00794565fcaf3b2c00cd09f8d5a5a7"
diff --git a/terraform-aws-github-runner/modules/runners/scale-down.tf b/terraform-aws-github-runner/modules/runners/scale-down.tf
index 9daa764b2e..6398555085 100644
--- a/terraform-aws-github-runner/modules/runners/scale-down.tf
+++ b/terraform-aws-github-runner/modules/runners/scale-down.tf
@@ -53,6 +53,7 @@ resource "aws_lambda_function" "scale_down" {
       MINIMUM_RUNNING_TIME_IN_MINUTES = var.minimum_running_time_in_minutes
       REDIS_ENDPOINT                  = var.redis_endpoint
       REDIS_LOGIN                     = var.redis_login
+      SCALE_CONFIG_ORG                = var.scale_config_org
      SCALE_CONFIG_REPO               = var.scale_config_repo
       SCALE_CONFIG_REPO_PATH          = var.scale_config_repo_path
       SCALE_DOWN_CONFIG               = jsonencode(var.idle_config)
diff --git a/terraform-aws-github-runner/modules/runners/scale-up-chron.tf b/terraform-aws-github-runner/modules/runners/scale-up-chron.tf
new file mode 100644
index 0000000000..c38382ea33
--- /dev/null
+++ b/terraform-aws-github-runner/modules/runners/scale-up-chron.tf
@@ -0,0 +1,164 @@
+resource "aws_kms_grant" "scale_up_chron" {
+  count             = var.encryption.encrypt ? 1 : 0
+  name              = "${var.environment}-scale-up-chron"
+  key_id            = var.encryption.kms_key_id
+  grantee_principal = aws_iam_role.scale_up_chron.arn
+  operations        = ["Decrypt"]
+
+  constraints {
+    encryption_context_equals = {
+      Environment = var.environment
+    }
+  }
+}
+
+resource "aws_lambda_function" "scale_up_chron" {
+  s3_bucket         = var.lambda_s3_bucket != null ? var.lambda_s3_bucket : null
+  s3_key            = var.runners_lambda_s3_key != null ? var.runners_lambda_s3_key : null
+  s3_object_version = var.runners_lambda_s3_object_version != null ? var.runners_lambda_s3_object_version : null
+  filename          = var.lambda_s3_bucket == null ? local.lambda_zip : null
+  source_code_hash  = var.lambda_s3_bucket == null ? filebase64sha256(local.lambda_zip) : null
+  function_name     = "${var.environment}-scale-up-chron"
+  role              = aws_iam_role.scale_up_chron.arn
+  handler           = "index.scaleUpChron"
+  runtime           = "nodejs20.x"
+  timeout           = var.lambda_timeout_scale_up_chron
+  tags              = local.tags
+  memory_size       = 2048
+
+  environment {
+    variables = {
+      AWS_REGION_INSTANCES            = join(",", var.aws_region_instances)
+      DATETIME_DEPLOY                 = local.datetime_deploy
+      ENABLE_ORGANIZATION_RUNNERS     = var.enable_organization_runners
+      ENVIRONMENT                     = var.environment
+      GHES_URL                        = var.ghes_url
+      GITHUB_APP_CLIENT_ID            = var.github_app.client_id
+      GITHUB_APP_CLIENT_SECRET        = var.github_app_client_secret
+      GITHUB_APP_ID                   = var.github_app.id
+      GITHUB_APP_KEY_BASE64           = var.github_app_key_base64
+      KMS_KEY_ID                      = var.encryption.kms_key_id
+      LAMBDA_TIMEOUT                  = var.lambda_timeout_scale_up_chron
+      MIN_AVAILABLE_RUNNERS           = var.min_available_runners
+      MINIMUM_RUNNING_TIME_IN_MINUTES = var.minimum_running_time_in_minutes
+      REDIS_ENDPOINT                  = var.redis_endpoint
+      REDIS_LOGIN                     = var.redis_login
+      SCALE_CONFIG_ORG                = var.scale_config_org
+      SCALE_CONFIG_REPO               = var.scale_config_repo
+      SCALE_CONFIG_REPO_PATH          = var.scale_config_repo_path
+      SCALE_UP_MIN_QUEUE_TIME_MINUTES = 30
+      SCALE_UP_RECORD_QUEUE_URL       = "https://hud.pytorch.org/api/clickhouse/queued_jobs_aggregate?parameters=%5B%5D"
+      scale_up_chron_CONFIG           = jsonencode(var.idle_config)
+      SECRETSMANAGER_SECRETS_ID       = var.secretsmanager_secrets_id
+      AWS_REGIONS_TO_VPC_IDS = join(
+        ",",
+        sort(distinct([
+          for region_vpc in var.vpc_ids :
+          format("%s|%s", region_vpc.region, region_vpc.vpc)
+        ]))
+      )
+      VPC_ID_TO_SECURITY_GROUP_IDS = join(
+        ",",
+        sort(distinct(concat(
+          [
+            for vpc in var.vpc_ids :
+            format(
+              "%s|%s",
+              vpc.vpc,
+              var.runners_security_group_ids[local.vpc_id_to_idx[vpc.vpc]]
+            )
+          ],
+          [
+            for vpc_subnet in var.vpc_sgs :
+            format("%s|%s", vpc_subnet.vpc, vpc_subnet.sg)
+          ]
+        )))
+      )
+      VPC_ID_TO_SUBNET_IDS = join(
+        ",",
+        sort(distinct([
+          for vpc_subnet in var.subnet_vpc_ids :
+          format("%s|%s", vpc_subnet.vpc, vpc_subnet.subnet)
+        ]))
+      )
+      SUBNET_ID_TO_AZ = join(
+        ",",
+        sort(distinct([
+          for subnet_az in var.subnet_azs :
+          format("%s|%s", subnet_az.subnet, subnet_az.az)
+        ]))
+      )
+    }
+  }
+
+  vpc_config {
+    security_group_ids = concat(
+      var.lambda_security_group_ids,
+      [var.runners_security_group_ids[0]]
+    )
+    subnet_ids = var.lambda_subnet_ids
+  }
+}
+
+resource "aws_cloudwatch_log_group" "scale_up_chron" {
+  name              = "/aws/lambda/${aws_lambda_function.scale_up_chron.function_name}"
+  retention_in_days = var.logging_retention_in_days
+  tags              = var.tags
+}
+
+resource "aws_cloudwatch_event_rule" "scale_up_chron" {
+  name                = "${var.environment}-scale-up-chron-rule"
+  schedule_expression = var.scale_up_chron_schedule_expression
+  tags                = var.tags
+}
+
+resource "aws_cloudwatch_event_target" "scale_up_chron" {
+  rule = aws_cloudwatch_event_rule.scale_up_chron.name
+  arn  = aws_lambda_function.scale_up_chron.arn
+}
+
+resource "aws_lambda_permission" "scale_up_chron" {
+  statement_id  = "AllowExecutionFromCloudWatch"
+  action        = "lambda:InvokeFunction"
+  function_name = aws_lambda_function.scale_up_chron.function_name
+  principal     = "events.amazonaws.com"
+  source_arn    = aws_cloudwatch_event_rule.scale_up_chron.arn
+}
+
+resource "aws_iam_role" "scale_up_chron" {
+  name                 = "${var.environment}-action-scale-up-chron-lambda-role"
+  assume_role_policy   = data.aws_iam_policy_document.lambda_assume_role_policy.json
+  path                 = local.role_path
+  permissions_boundary = var.role_permissions_boundary
+  tags                 = local.tags
+}
+
+resource "aws_iam_role_policy" "scale_up_chron" {
+  name = "${var.environment}-lambda-scale-up-chron-policy"
+  role = aws_iam_role.scale_up_chron.name
+  policy = templatefile("${path.module}/policies/lambda-scale-up-chron.json", {
+    arn_ssm_parameters = "arn:aws:ssm:${var.aws_region}:${data.aws_caller_identity.current.account_id}:parameter/${var.environment}-*"
+  })
+}
+
+resource "aws_iam_role_policy" "scale_up_chron_logging" {
+  name = "${var.environment}-lambda-logging"
+  role = aws_iam_role.scale_up_chron.name
+  policy = templatefile("${path.module}/policies/lambda-cloudwatch.json", {
+    log_group_arn = aws_cloudwatch_log_group.scale_up_chron.arn
+  })
+}
+
+resource "aws_iam_role_policy_attachment" "scale_up_chron_vpc_execution_role" {
+  count      = length(var.lambda_subnet_ids) > 0 ? 1 : 0
+  role       = aws_iam_role.scale_up_chron.name
+  policy_arn = "arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole"
+}
+
+resource "aws_iam_role_policy" "scale_up_chron_secretsmanager_access" {
+  count = var.secretsmanager_secrets_id != null ? 1 : 0
+  role  = aws_iam_role.scale_up_chron.name
+  policy = templatefile("${path.module}/policies/lambda-secretsmanager.json", {
+    secretsmanager_arn = data.aws_secretsmanager_secret_version.app_creds.arn
+  })
+}
diff --git a/terraform-aws-github-runner/modules/runners/scale-up.tf b/terraform-aws-github-runner/modules/runners/scale-up.tf
index 8a47122534..36d9cd821a 100644
--- a/terraform-aws-github-runner/modules/runners/scale-up.tf
+++ b/terraform-aws-github-runner/modules/runners/scale-up.tf
@@ -67,6 +67,7 @@ resource "aws_lambda_function" "scale_up" {
       RETRY_SCALE_UP_RECORD_JITTER_PCT = "0.5"
       RETRY_SCALE_UP_RECORD_QUEUE_URL  = var.sqs_build_queue_retry.url
       RUNNER_EXTRA_LABELS              = var.runner_extra_labels
+      SCALE_CONFIG_ORG                 = var.scale_config_org
       SCALE_CONFIG_REPO                = var.scale_config_repo
       SCALE_CONFIG_REPO_PATH           = var.scale_config_repo_path
       SECRETSMANAGER_SECRETS_ID        = var.secretsmanager_secrets_id
diff --git a/terraform-aws-github-runner/modules/runners/variables.tf b/terraform-aws-github-runner/modules/runners/variables.tf
index 1387af2a47..67d1739b8f 100644
--- a/terraform-aws-github-runner/modules/runners/variables.tf
+++ b/terraform-aws-github-runner/modules/runners/variables.tf
@@ -94,6 +94,12 @@ variable "scale_down_schedule_expression" {
   default     = "cron(*/5 * * * ? *)"
 }

+variable "scale_up_chron_schedule_expression" {
+  description = "Scheduler expression for how often to run the scale-up-chron lambda."
+  type        = string
+  default     = "cron(*/30 * * * ? *)" # every 30 minutes
+}
+
 variable "minimum_running_time_in_minutes" {
   description = "The time an ec2 action runner should be running at minimum before terminated if non busy."
   type        = number
@@ -112,6 +118,12 @@ variable "lambda_timeout_scale_up" {
   default = 60
 }

+variable "lambda_timeout_scale_up_chron" {
+  description = "Timeout for the scale-up-chron lambda in seconds."
+  type        = number
+  default     = 60
+}
+
 variable "role_permissions_boundary" {
   description = "Permissions boundary that will be added to the created role for the lambda."
   type        = string
@@ -285,6 +297,11 @@ variable "role_runner_arn" {
   type = string
 }

+variable "scale_config_org" {
+  description = "Organization to fetch scale config from."
+  type        = string
+}
+
 variable "scale_config_repo" {
   description = "Repository to fetch scale config from."
   default     = ""
diff --git a/terraform-aws-github-runner/variables.tf b/terraform-aws-github-runner/variables.tf
index b99af920cf..603a02e9ca 100644
--- a/terraform-aws-github-runner/variables.tf
+++ b/terraform-aws-github-runner/variables.tf
@@ -345,6 +345,11 @@ variable "cant_have_issues_labels" {
   default = []
 }

+variable "scale_config_org" {
+  description = "Organization to fetch scale config from."
+  type        = string
+}
+
 variable "scale_config_repo" {
   description = "Repository to fetch scale config from. Optional if `enable_organization_runners` is set to false, in which case the job's repo will be used"
   default     = ""
diff --git a/tools/analytics/validate_pypi_staging.py b/tools/analytics/validate_pypi_staging.py
index 8e8801ff8c..0c06ca2185 100644
--- a/tools/analytics/validate_pypi_staging.py
+++ b/tools/analytics/validate_pypi_staging.py
@@ -12,22 +12,24 @@

 PLATFORMS = [
     "manylinux1_x86_64",
-    "manylinux2014_aarch64",
+    "manylinux_2_28_aarch64",
     "win_amd64",
     "macosx_11_0_arm64",
 ]
-PYTHON_VERSIONS = ["cp38", "cp39", "cp310", "cp311", "cp312"]
+PYTHON_VERSIONS = ["cp39", "cp310", "cp311", "cp312", "cp313"]
 S3_PYPI_STAGING = "pytorch-backup"
 PACKAGE_RELEASES = {
-    "torch": "2.3.1",
-    "torchvision": "0.18.1",
-    "torchaudio": "2.3.1",
-    "torchtext": "0.18.0",
-    "executorch": "0.2.1",
+    "torch": "2.6.0",
+    "torchvision": "0.21.0",
+    "torchaudio": "2.6.0",
+    # "torchtext": "0.18.0",
+    # "executorch": "0.2.1",
 }

 PATTERN_V = "Version:"
 PATTERN_RD = "Requires-Dist:"
+PATTERN_PYTHON = "Requires-Python:"
+PATTERN_PROGRAMMING = "Classifier: Programming Language :: Python ::"

 s3 = boto3.client("s3")
@@ -104,7 +106,11 @@ def validate_file_metadata(build: str, package: str, version: str):
                     f"FAILURE VERSION DOES NOT MATCH expected {version} got {exttracted_version}"
                 )

-            elif line.startswith(PATTERN_RD):
+            elif (
+                line.startswith(PATTERN_RD)
+                or line.startswith(PATTERN_PYTHON)
+                or line.startswith(PATTERN_PROGRAMMING)
+            ):
                 print(f"{line}", end="")

     shutil.rmtree(temp_dir)
diff --git a/tools/scripts/fetch_latest_green_commit.py b/tools/scripts/fetch_latest_green_commit.py
index 9b346b3e79..b3f0e43097 100644
--- a/tools/scripts/fetch_latest_green_commit.py
+++ b/tools/scripts/fetch_latest_green_commit.py
@@ -1,6 +1,7 @@
 import json
 import re
 import sys
+from functools import lru_cache
 from pathlib import Path
 from typing import Any, cast, Dict, List, NamedTuple, Optional, Tuple

@@ -80,6 +81,23 @@ def get_commit_results(
     return workflow_checks


+@lru_cache
+def fetch_unstable_issues() -> List[str]:
+    issues = query_clickhouse_saved("issue_query", {"label": "unstable"})
+    return [
+        issue["title"][len("UNSTABLE") :].strip()
+        for issue in issues
+        if issue["title"].startswith("UNSTABLE") and issue["state"] == "open"
+    ]
+
+
+def is_unstable(job: Dict[str, Any]) -> bool:
+    # Check if the job is an unstable job, either by name or by issue
+    if "unstable" in job["jobName"]:
+        return True
+    return job["name"] in fetch_unstable_issues()
+
+
 def is_green(
     commit: str, requires: List[str], results: List[Dict[str, Any]]
 ) -> Tuple[bool, str]:
@@ -88,9 +106,9 @@ def is_green(
     regex = {check: False for check in requires}

     for check in workflow_checks:
-        jobName = check["jobName"]
+        jobName = check["name"]
         # Ignore result from unstable job, be it success or failure
-        if "unstable" in jobName:
+        if "unstable" in jobName or jobName in fetch_unstable_issues():
             continue

         workflow_name = check["workflowName"]
diff --git a/tools/tests/test_fetch_latest_green_commit.py b/tools/tests/test_fetch_latest_green_commit.py
index e4f11de938..1b580072be 100644
--- a/tools/tests/test_fetch_latest_green_commit.py
+++ b/tools/tests/test_fetch_latest_green_commit.py
@@ -47,12 +47,18 @@ def make_test_checks(self) -> List[Dict[str, Any]]:
         return workflow_checks


+@mock.patch(
+    "tools.scripts.fetch_latest_green_commit.fetch_unstable_issues",
+    return_value=[],
+)
 class TestPrintCommits(TestCase):
     @mock.patch(
         "tools.scripts.fetch_latest_green_commit.get_commit_results",
         return_value=TestChecks().make_test_checks(),
     )
-    def test_all_successful(self, mock_get_commit_results: Any) -> None:
+    def test_all_successful(
+        self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any
+    ) -> None:
         """Test with workflows are successful"""
         workflow_checks = mock_get_commit_results()
         self.assertTrue(is_green("sha", requires, workflow_checks)[0])
@@ -61,7 +67,9 @@ def test_all_successful(self, mock_get_commit_results: Any) -> None:
         "tools.scripts.fetch_latest_green_commit.get_commit_results",
         return_value=TestChecks().make_test_checks(),
     )
-    def test_necessary_successful(self, mock_get_commit_results: Any) -> None:
+    def test_necessary_successful(
+        self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any
+    ) -> None:
         """Test with necessary workflows are successful"""
         workflow_checks = mock_get_commit_results()
         workflow_checks = set_workflow_job_status(
@@ -85,7 +93,9 @@ def test_necessary_successful(self, mock_get_commit_results: Any) -> None:
         "tools.scripts.fetch_latest_green_commit.get_commit_results",
         return_value=TestChecks().make_test_checks(),
     )
-    def test_necessary_skipped(self, mock_get_commit_results: Any) -> None:
+    def test_necessary_skipped(
+        self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any
+    ) -> None:
         """Test with necessary job (ex: pull) skipped"""
         workflow_checks = mock_get_commit_results()
         workflow_checks = set_workflow_job_status(workflow_checks, "pull", "skipped")
@@ -96,7 +106,9 @@ def test_necessary_skipped(self, mock_get_commit_results: Any) -> None:
         "tools.scripts.fetch_latest_green_commit.get_commit_results",
         return_value=TestChecks().make_test_checks(),
     )
-    def test_skippable_skipped(self, mock_get_commit_results: Any) -> None:
+    def test_skippable_skipped(
+        self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any
+    ) -> None:
         """Test with skippable jobs (periodic and docker-release-builds skipped"""
         workflow_checks = mock_get_commit_results()
         workflow_checks = set_workflow_job_status(
@@ -111,7 +123,9 @@ def test_skippable_skipped(self, mock_get_commit_results: Any) -> None:
         "tools.scripts.fetch_latest_green_commit.get_commit_results",
         return_value=TestChecks().make_test_checks(),
     )
-    def test_necessary_failed(self, mock_get_commit_results: Any) -> None:
+    def test_necessary_failed(
+        self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any
+    ) -> None:
         """Test with necessary job (ex: Lint) failed"""
         workflow_checks = mock_get_commit_results()
         workflow_checks = set_workflow_job_status(workflow_checks, "Lint", "failed")
@@ -123,7 +137,9 @@ def test_necessary_failed(self, mock_get_commit_results: Any) -> None:
         "tools.scripts.fetch_latest_green_commit.get_commit_results",
         return_value=TestChecks().make_test_checks(),
     )
-    def test_skippable_failed(self, mock_get_commit_results: Any) -> None:
+    def test_skippable_failed(
+        self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any
+    ) -> None:
         """Test with failing skippable jobs (ex: docker-release-builds) should pass"""
         workflow_checks = mock_get_commit_results()
         workflow_checks = set_workflow_job_status(
@@ -138,7 +154,9 @@ def test_skippable_failed(self, mock_get_commit_results: Any) -> None:
     @mock.patch(
         "tools.scripts.fetch_latest_green_commit.get_commit_results", return_value={}
     )
-    def test_no_workflows(self, mock_get_commit_results: Any) -> None:
+    def test_no_workflows(
+        self, mock_get_commit_results: Any, mock_fetch_unstable_issues: Any
+    ) -> None:
         """Test with missing workflows"""
         workflow_checks = mock_get_commit_results()
         result = is_green("sha", requires, workflow_checks)
diff --git a/torchci/components/benchmark/llms/components/dashboardPicker/LLMsDashboardPicker.tsx b/torchci/components/benchmark/llms/components/dashboardPicker/LLMsDashboardPicker.tsx
index 2f666c02f9..ff84ab3671 100644
--- a/torchci/components/benchmark/llms/components/dashboardPicker/LLMsDashboardPicker.tsx
+++ b/torchci/components/benchmark/llms/components/dashboardPicker/LLMsDashboardPicker.tsx
@@ -69,7 +69,7 @@ export const LLMsDashboardPicker = ({
         }}
         titlePrefix={"New"}
         fallbackIndex={0} // Default to the latest commit
-        timeRange={[props.timeRange]}
+        timeRange={props.timeRange}
       />