Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scaleup healing #6018

Draft
wants to merge 16 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions .github/actions/setup-binary-builds/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -144,12 +144,16 @@ runs:
if [[ "${PYTHON_VERSION:-}" == "3.13t" ]]; then
export PYTHON_VERSION=3.13
export CONDA_EXTRA_PARAM=" python-freethreading -c conda-forge"
if [[ "$(uname)" != Darwin ]]; then
# Pin conda and conda-libmamba-solver for 3.13t linux build
# this solver allows us to install anaconda dependencies on
# python-freethreading on conda-forge environment
conda install conda==24.7.1 conda-libmamba-solver=24.1.0

# downgrade conda version for python 3.13t install.
# TODO: remove this once python 3.13t is fully supported on conda
# Please see : https://github.com/conda/conda/issues/14554
if [[ "$(uname)" == Darwin ]]; then
# required to be able to downgrade on MacOS m1 side
conda install -y python=3.9
conda uninstall -y conda-anaconda-telemetry conda-anaconda-tos
fi
conda install -y conda=24.7.1 conda-libmamba-solver=24.1.0
fi

conda create \
Expand Down
4 changes: 2 additions & 2 deletions .github/scripts/install_xpu.bat
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ set XPU_EXTRA_INSTALLED=0
set XPU_EXTRA_UNINSTALL=0

if not [%XPU_VERSION%]==[] if [%XPU_VERSION%]==[2025.0] (
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/efc86abd-cb77-452e-a03f-a741895b8ece/intel-deep-learning-essentials-2025.0.0.336_offline.exe
set XPU_BUNDLE_URL=https://registrationcenter-download.intel.com/akdlm/IRC_NAS/9d6d6c17-ca2d-4735-9331-99447e4a1280/intel-deep-learning-essentials-2025.0.1.28_offline.exe
set XPU_BUNDLE_PRODUCT_NAME=intel.oneapi.win.deep-learning-essentials.product
set XPU_BUNDLE_VERSION=2025.0.0+335
set XPU_BUNDLE_VERSION=2025.0.1+20
set XPU_BUNDLE_INSTALLED=0
set XPU_BUNDLE_UNINSTALL=0
set XPU_EXTRA_URL=NULL
Expand Down
104 changes: 11 additions & 93 deletions .github/workflows/tflint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,102 +24,20 @@ jobs:
github_token: ${{ secrets.GITHUB_TOKEN }}
tflint_version: v0.54.0

- name: Install Terraform
uses: hashicorp/setup-terraform@v2
- name: Install Tofu
uses: opentofu/setup-opentofu@v1
with:
terraform_version: 1.5.1
terraform_version: 1.5.7
terraform_wrapper: false

- name: Show version
run: tflint --version
- name: Show tflint version
run:
tflint --version

- name: "Init TFLint download-lambda"
working-directory: terraform-aws-github-runner/modules/download-lambda
run: tflint --init
- name: "Init terraform download-lambda"
working-directory: terraform-aws-github-runner/modules/download-lambda
run: terraform init
- name: "Run TFLint download-lambda"
working-directory: terraform-aws-github-runner/modules/download-lambda
run: tflint --call-module-type=all
- name: "Run terraform validate download-lambda"
working-directory: terraform-aws-github-runner/modules/download-lambda
run: terraform validate
- name: Show tofu version
run:
tofu --version

- name: "Init TFLint runner-binaries-syncer"
working-directory: terraform-aws-github-runner/modules/runner-binaries-syncer
run: tflint --init
- name: "Init terraform runner-binaries-syncer"
working-directory: terraform-aws-github-runner/modules/runner-binaries-syncer
run: terraform init
- name: "Run TFLint runner-binaries-syncer"
working-directory: terraform-aws-github-runner/modules/runner-binaries-syncer
run: tflint --call-module-type=all
- name: "Run terraform validate runner-binaries-syncer"
working-directory: terraform-aws-github-runner/modules/runner-binaries-syncer
run: terraform validate

- name: "Init TFLint runners-instances"
working-directory: terraform-aws-github-runner/modules/runners-instances
run: tflint --init
- name: "Init terraform runners-instances"
working-directory: terraform-aws-github-runner/modules/runners-instances
run: terraform init
- name: "Run TFLint runners-instances"
working-directory: terraform-aws-github-runner/modules/runners-instances
run: tflint --call-module-type=all
- name: "Run terraform validate runners-instances"
working-directory: terraform-aws-github-runner/modules/runners-instances
run: terraform validate

- name: "Init TFLint runners"
working-directory: terraform-aws-github-runner/modules/runners
run: tflint --init
- name: "Init terraform runners"
working-directory: terraform-aws-github-runner/modules/runners
run: terraform init
- name: "Run TFLint runners"
working-directory: terraform-aws-github-runner/modules/runners
run: tflint --call-module-type=all
- name: "Run terraform validate runners"
working-directory: terraform-aws-github-runner/modules/runners
run: terraform validate

- name: "Init TFLint setup-iam-permissions"
working-directory: terraform-aws-github-runner/modules/setup-iam-permissions
run: tflint --init
- name: "Init terraform setup-iam-permissions"
working-directory: terraform-aws-github-runner/modules/setup-iam-permissions
run: terraform init
- name: "Run TFLint setup-iam-permissions"
working-directory: terraform-aws-github-runner/modules/setup-iam-permissions
run: tflint --call-module-type=all
- name: "Run terraform validate setup-iam-permissions"
working-directory: terraform-aws-github-runner/modules/setup-iam-permissions
run: terraform validate

- name: "Init TFLint webhook"
working-directory: terraform-aws-github-runner/modules/webhook
run: tflint --init
- name: "Init terraform webhook"
working-directory: terraform-aws-github-runner/modules/webhook
run: terraform init
- name: "Run TFLint webhook"
working-directory: terraform-aws-github-runner/modules/webhook
run: tflint --call-module-type=all
- name: "Run terraform validate webhook"
working-directory: terraform-aws-github-runner/modules/webhook
run: terraform validate

- name: "Init TFLint main"
working-directory: terraform-aws-github-runner
run: tflint --init
- name: "Init terraform main"
working-directory: terraform-aws-github-runner
run: terraform init
- name: "Run TFLint main"
working-directory: terraform-aws-github-runner
run: tflint --call-module-type=all
- name: "Run terraform validate terraform-aws-github-runner"
- name: "tflint"
working-directory: terraform-aws-github-runner
run: terraform validate
run: make tflint
1 change: 1 addition & 0 deletions release/promote.sh
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ promote_pypi() {
# promote_s3 fbgemm-gpu whl "${FBGEMMGPU_VERSION}"
# promote_s3 "libtorch-*" libtorch "${PYTORCH_VERSION}"
# promote_s3 "torch_tensorrt" whl "${TENSORRT_VERSION}"
# promote_s3 "torchao" whl "${TORCHAO_VERSION}"

# promote_conda torchtriton conda "2.1.0"
# promote_conda pytorch-cuda conda "11.8"
Expand Down
7 changes: 4 additions & 3 deletions release/pypi/promote_pypi_to_staging.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,11 @@ PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX=""
#PLATFORM="linux_x86" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging executorch "${EXECUTORCH_VERSION}"
#PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging executorch "${EXECUTORCH_VERSION}"

# PLATFORM="manylinux" VERSION_SUFFIX="${LINUX_VERSION_SUFFIX}" ARCH="cu124" upload_pypi_to_staging torchao "${TORCHAO_VERSION}"
# PLATFORM="none-any" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchao "${TORCHAO_VERSION}"

#PLATFORM="linux_x86" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}"
#PLATFORM="win_amd64" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}"
#PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchtext "${TORCHTEXT_VERSION}"

#PLATFORM="linux_x86_64" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}"
#PLATFORM="win_amd64" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}"
#PLATFORM="${MACOS_ARM64}" VERSION_SUFFIX="" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}"
#PLATFORM="none-any" VERSION_SUFFIX="${CPU_VERSION_SUFFIX}" upload_pypi_to_staging torchdata "${TORCHDATA_VERSION}"
5 changes: 4 additions & 1 deletion release/pypi/upload_pypi_to_staging.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,14 @@ PLATFORM=${PLATFORM:-}
# i.e. cpu, cu121, cu124
ARCH=${ARCH:-cpu}

# This extracts the links to packages from the index.html.
# We keep only the URL: the surrounding HTML is cut away, as is everything from "#" onwards (the trailing sha256 fragment).
pkgs_to_promote=$(\
curl -fsSL "https://download.pytorch.org/whl/test/${ARCH}/${PACKAGE_NAME}/index.html" \
| grep "${PACKAGE_NAME}-${PACKAGE_VERSION}${VERSION_SUFFIX}-" \
| grep "${PLATFORM}" \
| cut -d '"' -f2
| cut -d '"' -f2 \
| cut -d "#" -f1
)

tmp_dir="$(mktemp -d)"
Expand Down
3 changes: 2 additions & 1 deletion release/release_versions.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ TORCHTEXT_VERSION=${TORCHTEXT_VERSION:-0.18.0}
TORCHREC_VERSION=${TORCHREC_VERSION:-0.8.0}
TENSORRT_VERSION=${TENSORRT_VERSION:-2.4.0}
EXECUTORCH_VERSION=${EXECUTORCH_VERSION:-0.3.0}
TORCHAO_VERSION=${TORCHAO_VERSION:-0.4.0}
TORCHAO_VERSION=${TORCHAO_VERSION:-0.9.0}
TORCHDATA_VERSION=${TORCHDATA_VERSION:-0.11.0}
TORCHTUNE_VERSION=${TORCHTUNE_VERSION:-0.2.1}
FBGEMMGPU_VERSION=${FBGEMMGPU_VERSION:-1.0.0}
62 changes: 62 additions & 0 deletions terraform-aws-github-runner/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
# Lint and validate every Terraform/OpenTofu module in this directory.
#
# Usage:
#   make tflint                 # lint + validate all modules, then the root module
#   make tflint-<module-name>   # lint + validate a single module under modules/
#   make clean                  # remove local init artifacts
#
# Requires `tofu` and `tflint` on PATH.

# Sub-modules under modules/ that are checked individually.
MODULES := \
	download-lambda \
	runner-binaries-syncer \
	runners-instances \
	runners \
	setup-iam-permissions \
	webhook

# One tflint-<module> target per entry in MODULES.
MODULE_TARGETS := $(addprefix tflint-,$(MODULES))

# None of these targets produce a file with the same name, so declare them
# phony; otherwise a stray file called e.g. `tflint` would make the target
# look up to date and silently skip the checks.
.PHONY: all tflint tflint-main clean $(MODULE_TARGETS)

all: tflint

# Sub-modules are checked first, then the root module.
tflint: $(MODULE_TARGETS) tflint-main

# Static pattern rule: for target tflint-<name>, $* expands to <name>.
# The steps are chained with && inside a single shell so that the `cd`
# applies to every command and the first failure aborts the recipe.
$(MODULE_TARGETS): tflint-%:
	cd modules/$* && \
	tofu init && \
	tflint --init && \
	tflint --call-module-type=all && \
	tofu validate

# Root module: runs from this directory; make stops at the first failing line.
tflint-main:
	tofu init
	tflint --init
	tflint --call-module-type=all --recursive
	tofu validate

# NOTE(review): the dependency lock file is normally named `.terraform.lock.hcl`
# (leading dot), so `terraform.lock.hcl` below likely never matches anything.
# Left unchanged on purpose — confirm whether the lock file should be removed.
clean:
	rm -rf .terraform terraform.lock.hcl
12 changes: 12 additions & 0 deletions terraform-aws-github-runner/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,18 @@

This is a terraform module that sets up self hosted github runners on AWS along with the infra needed to autoscale them

# Testing your changes
In order to verify if your changes will pass CI testing, you can simply run from this directory:

```
$ make tflint
```

This depends on Tofu, Make and TFLint being installed.

# Checking plan changes of your changes
This module is not stand alone. It is a reusable module designed to be imported, configured, and used in your project.

# Release
Terraform code that uses this module specify the tag (version of test-infra) that they use via a file called `Terrafile`. We need to create a new tag for any changes here that we want to deploy and update the `Terrafile` to refer to that tag:

Expand Down
1 change: 1 addition & 0 deletions terraform-aws-github-runner/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ module "runners" {
environment = var.environment
tags = local.tags

scale_config_org = var.scale_config_org
scale_config_repo = var.scale_config_repo
scale_config_repo_path = var.scale_config_repo_path

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,8 @@ module.exports = {
lines: 80,
statements: 80
}
}
},
moduleNameMapper: {
axios: 'axios/dist/node/axios.cjs', // Allow axios to work in tests
},
};
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
"@types/uuid": "^9.0.1",
"async-mutex": "^0.4.0",
"aws-sdk": "^2.863.0",
"axios": "^1.7.7",
"cron-parser": "^3.3.0",
"generic-pool": "^3.9.0",
"lru-cache": "^6.0.0",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@ import { ActionRequestMessage, RetryableScalingError, scaleUp as scaleUpR } from
import { Context, SQSEvent, SQSRecord, ScheduledEvent } from 'aws-lambda';

import { Config } from './scale-runners/config';
import { ScaleUpMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './scale-runners/metrics';
import { ScaleUpMetrics, ScaleUpChronMetrics, sendMetricsAtTimeout, sendMetricsTimeoutVars } from './scale-runners/metrics';
import { getDelayWithJitterRetryCount, stochaticRunOvershoot } from './scale-runners/utils';
import { scaleDown as scaleDownR } from './scale-runners/scale-down';
import { scaleUpChron as scaleUpChronR } from './scale-runners/scale-up-chron';
import { sqsSendMessages, sqsDeleteMessageBatch } from './scale-runners/sqs';

async function sendRetryEvents(evtFailed: Array<[SQSRecord, boolean, number]>, metrics: ScaleUpMetrics) {
Expand Down Expand Up @@ -155,3 +156,35 @@ export async function scaleDown(event: ScheduledEvent, context: Context, callbac
return callback('Failed');
}
}

/**
 * Scheduled-event handler that runs the scale-up-chron routine (`scaleUpChronR`)
 * and reports the outcome through `callback`.
 *
 * @param event - The scheduled event that triggered the run; it is not read by this function.
 * @param context - Invocation context; `callbackWaitsForEmptyEventLoop` is set to `false` on it.
 * @param callback - Called with `null` when the run succeeds and with `'Failed'` when it throws.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
export async function scaleUpChron(event: ScheduledEvent, context: Context, callback: any) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we want to test this as well, but here tests can be more relaxed

// we maintain open connections to redis, so the event pool is only cleaned when the SIGTERM is sent
context.callbackWaitsForEmptyEventLoop = false;

const metrics = new ScaleUpChronMetrics();
const sndMetricsTimout: sendMetricsTimeoutVars = {
metrics: metrics,
};
// Safety net: schedule sendMetricsAtTimeout to fire 10 units before the configured
// lambdaTimeout (presumably seconds, given the * 1000 conversion to milliseconds).
sndMetricsTimout.setTimeout = setTimeout(
sendMetricsAtTimeout(sndMetricsTimout),
(Config.Instance.lambdaTimeout - 10) * 1000,
);

try {
await scaleUpChronR(metrics);
return callback(null);
} catch (e) {
// Any error from the run is logged and reported to the caller as a generic failure.
console.error(e);
return callback('Failed');
} finally {
// Runs on both success and failure: cancel the pending timeout, detach the
// metrics from the timeout vars, then send the metrics from here.
try {
clearTimeout(sndMetricsTimout.setTimeout);
sndMetricsTimout.metrics = undefined;
sndMetricsTimout.setTimeout = undefined;
await metrics.sendMetrics();
} catch (e) {
// A failure to send metrics is only logged; it is not rethrown.
console.error(`Error sending metrics: ${e}`);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,11 @@ export class Config {
readonly retryScaleUpRecordQueueUrl: string | undefined;
readonly runnerGroupName: string | undefined;
readonly runnersExtraLabels: undefined | string;
readonly scaleConfigOrg: string;
readonly scaleConfigRepo: string;
readonly scaleConfigRepoPath: string;
readonly scaleUpMinQueueTimeMinutes: number;
readonly scaleUpRecordQueueUrl: string | undefined;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what is this variable doing? I could not find any use in your code except fail if it is not set....

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ZainRizvi could you answer this? I'm also not sure

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I altered it. to be the constant for the URL which is what I thought it was meant to be? But lmk if not

readonly secretsManagerSecretsId: string | undefined;
readonly sSMParamCleanupAgeDays: number;
readonly sSMParamMaxCleanupAllowance: number;
Expand Down Expand Up @@ -94,8 +97,11 @@ export class Config {
/* istanbul ignore next */
this.retryScaleUpRecordJitterPct = Number(process.env.RETRY_SCALE_UP_RECORD_JITTER_PCT || '0');
this.retryScaleUpRecordQueueUrl = process.env.RETRY_SCALE_UP_RECORD_QUEUE_URL;
this.scaleUpRecordQueueUrl = process.env.SCALE_UP_RECORD_QUEUE_URL;
this.scaleUpMinQueueTimeMinutes = process.env.SCALE_UP_MIN_QUEUE_TIME_MINUTES ? Number(process.env.SCALE_UP_MIN_QUEUE_TIME_MINUTES) : 30
this.runnerGroupName = process.env.RUNNER_GROUP_NAME;
this.runnersExtraLabels = process.env.RUNNER_EXTRA_LABELS;
this.scaleConfigOrg = process.env.SCALE_CONFIG_ORG || '';
/* istanbul ignore next */
this.scaleConfigRepo = process.env.SCALE_CONFIG_REPO || '';
if (this.enableOrganizationRunners && !this.scaleConfigRepo) {
Expand Down
Loading
Loading